Reference: TensorFlow Recommenders tutorial, "Turning categorical features into embeddings" — https://www.tensorflow.org/recommenders/examples/featurization#turning_categorical_features_into_embeddings

In [6]:
import pprint

import tensorflow_datasets as tfds

import numpy as np
from tensorflow import keras

import tensorflow as tf

# Fetch the training split of the MovieLens 100k ratings dataset.
ratings = tfds.load("movielens/100k-ratings", split="train")

# Pretty-print one record to see which feature keys are available.
for example in ratings.take(1).as_numpy_iterator():
    pprint.pprint(example)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}


2023-05-22 12:45:13.471508: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## StringLookup

In [9]:
# Layer that turns raw title strings into integer indices. It starts without a
# vocabulary; the vocabulary is learned from the data in a later adapt() call.
movie_title_lookup = keras.layers.StringLookup()

In [10]:
# Learn the vocabulary from the title of every rating in the dataset.
movie_titles = ratings.map(lambda features: features["movie_title"])
movie_title_lookup.adapt(movie_titles)

In [11]:
# Preview the first three vocabulary entries; entry 0 is the [UNK] placeholder
# used for titles that are not in the vocabulary.
first_tokens = movie_title_lookup.get_vocabulary()[:3]
print(f"Vocabulary: {first_tokens}")

Vocabulary: ['[UNK]', 'Star Wars (1977)', 'Contact (1997)']


In [12]:
# Titles present in the vocabulary resolve to their own integer indices.
known_titles = ["Star Wars (1977)", "One Flew Over the Cuckoo's Nest (1975)"]
movie_title_lookup(known_titles)

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([ 1, 58])>

In [13]:
# A title that was never seen during adapt() falls back to index 0 ([UNK]).
titles_with_unseen = ["Star Wars (1977)", "อาตมาฟ้าผ่า (2023)"]
movie_title_lookup(titles_with_unseen)

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([1, 0])>

## Hashing

In [16]:
# A generous number of bins keeps collisions between distinct titles unlikely.
num_hashing_bins = 200_000

# Hash each title straight to a bin id; unlike the lookup layer, this layer is
# used below without any adapt() step.
movie_title_hashing = keras.layers.Hashing(num_bins=num_hashing_bins)

In [17]:
# Each title is mapped to a bin id in [0, num_hashing_bins).
titles_to_hash = ["Star Wars (1977)", "One Flew Over the Cuckoo's Nest (1975)"]
movie_title_hashing(titles_to_hash)

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([101016,  96565])>

In [19]:
# Hashing needs no vocabulary, so a brand-new title still gets its own bin id
# instead of collapsing to a shared unknown index.
titles_to_hash_with_unseen = ["Star Wars (1977)", "อาตมาฟ้าผ่า (2023)"]
movie_title_hashing(titles_to_hash_with_unseen)

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([101016,  11243])>

## Embedding

In [20]:
# Trainable table with one 32-dimensional vector per index the lookup layer can emit.
movie_title_embedding = keras.layers.Embedding(
    # Size the table from the explicit vocabulary lookup (includes the [UNK] slot).
    # vocabulary_size() is the supported accessor; vocab_size() is deprecated.
    input_dim=movie_title_lookup.vocabulary_size(),
    output_dim=32,
)



In [21]:
# Chain lookup -> embedding so raw title strings map directly to dense vectors.
movie_title_model = keras.Sequential(
    layers=[
        movie_title_lookup,
        movie_title_embedding,
    ]
)

In [22]:
# Embed a title that is in the vocabulary: yields one 32-dimensional vector.
known_title_batch = ["Star Wars (1977)"]
movie_title_model(known_title_batch)



<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
array([[-0.03976804, -0.0415496 , -0.01106708,  0.01324985, -0.00261353,
        -0.02245682,  0.04873098,  0.02778157,  0.03387476, -0.04363858,
        -0.03595002,  0.01188491, -0.00124532, -0.0250195 , -0.04118379,
        -0.02669857, -0.04516503,  0.02814424,  0.04141983, -0.00908929,
         0.01978109,  0.01217794,  0.00402678,  0.0276508 , -0.03961364,
         0.01118432, -0.0254912 , -0.01327118,  0.03487224,  0.0377438 ,
         0.00740225,  0.04585507]], dtype=float32)>

In [23]:
# An out-of-vocabulary title is looked up as index 0, so it receives the
# embedding row that belongs to [UNK].
unseen_title_batch = ["อาตมาฟ้าผ่า (2023)"]
movie_title_model(unseen_title_batch)



<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
array([[ 0.00870454,  0.01667029, -0.03952014,  0.03339828,  0.01844809,
        -0.01724365,  0.04233545, -0.02298212,  0.01944694,  0.02790696,
         0.00716276,  0.00797569,  0.02400592,  0.02485884,  0.01714841,
        -0.02012392, -0.01661407, -0.0033948 , -0.01215196, -0.01838628,
        -0.02694302,  0.04167242, -0.03338395,  0.0347913 , -0.0431049 ,
         0.03890902,  0.03594854,  0.01720348, -0.03736428,  0.03122853,
         0.0263013 ,  0.00762074]], dtype=float32)>

In [24]:
# A different out-of-vocabulary title also maps to index 0, so it shares the
# same [UNK] embedding as any other unseen title.
another_unseen_title_batch = ["Transformer (2023)"]
movie_title_model(another_unseen_title_batch)



<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
array([[ 0.00870454,  0.01667029, -0.03952014,  0.03339828,  0.01844809,
        -0.01724365,  0.04233545, -0.02298212,  0.01944694,  0.02790696,
         0.00716276,  0.00797569,  0.02400592,  0.02485884,  0.01714841,
        -0.02012392, -0.01661407, -0.0033948 , -0.01215196, -0.01838628,
        -0.02694302,  0.04167242, -0.03338395,  0.0347913 , -0.0431049 ,
         0.03890902,  0.03594854,  0.01720348, -0.03736428,  0.03122853,
         0.0263013 ,  0.00762074]], dtype=float32)>

## Text Vectorization

In [16]:
# Word-level tokenizer for titles: splits text into tokens and maps them to ids.
title_text = keras.layers.TextVectorization(
    # Cap the token vocabulary at the number of entries in the title lookup.
    # vocabulary_size() is the supported accessor; vocab_size() is deprecated.
    max_tokens=movie_title_lookup.vocabulary_size(),
    output_mode='int',
    # Every title is padded or truncated to exactly four token ids.
    output_sequence_length=4,
)





In [18]:
# Build the token vocabulary from the distinct titles held by the lookup layer.
# The list also contains the '[UNK]' placeholder entry, which is tokenized like
# any other string.
title_vocabulary = movie_title_lookup.get_vocabulary()
title_text.adapt(title_vocabulary)

In [19]:
# Tokenize the first title (as a batch of one) to preview its padded id sequence.
single_title_batches = ratings.batch(1).map(lambda features: features["movie_title"])
for title_batch in single_title_batches.take(1):
    print(title_text(title_batch))

tf.Tensor([[ 67   1 267   2]], shape=(1, 4), dtype=int64)


In [26]:
class MovieModel(tf.keras.Model):
    """Represents a movie title as two concatenated embeddings.

    The first branch embeds the whole title as a single categorical id (via the
    notebook-level ``movie_title_lookup`` layer). The second branch tokenizes
    the title into words, embeds each token and averages the token embeddings.

    Args:
        max_tokens: Upper bound on the token vocabulary of the text branch.
            The same value caps the TextVectorization vocabulary and sizes the
            token Embedding table, so the two can never disagree.
        embedding_dim: Width of each of the two embeddings; the model output
            has ``2 * embedding_dim`` features per title.
    """

    def __init__(self, max_tokens=10_000, embedding_dim=32):
        super().__init__()

        # Whole-title branch: title string -> vocabulary index -> vector.
        # vocabulary_size() is the supported accessor; vocab_size() is deprecated.
        self.title_embedding = tf.keras.Sequential([
            movie_title_lookup,
            tf.keras.layers.Embedding(
                movie_title_lookup.vocabulary_size(), embedding_dim),
        ])

        # Token branch: title string -> token ids -> per-token vectors -> mean.
        # The vectorization layer has no vocabulary yet; the caller must adapt
        # it (it is layers[0] of this Sequential) before running the model.
        self.title_text_embedding = tf.keras.Sequential([
            tf.keras.layers.TextVectorization(max_tokens=max_tokens),
            # mask_zero=True so padding ids (0) are ignored by the pooling layer.
            tf.keras.layers.Embedding(max_tokens, embedding_dim, mask_zero=True),
            # We average the embedding of individual words to get one embedding
            # vector per title.
            tf.keras.layers.GlobalAveragePooling1D(),
        ])

    def call(self, inputs):
        """Return the concatenated title embeddings for a batch.

        Args:
            inputs: Mapping with a "movie_title" entry holding a batch of
                title strings.

        Returns:
            Tensor whose second axis is the whole-title embedding followed by
            the averaged token embedding.
        """
        movie_titles = inputs["movie_title"]
        return tf.concat([
            self.title_embedding(movie_titles),
            self.title_text_embedding(movie_titles),
        ], axis=1)

In [27]:
movie_model = MovieModel()

# The tokenizer is the first layer of the text branch; give it a vocabulary
# built from the titles known to the lookup layer before calling the model.
text_vectorizer = movie_model.title_text_embedding.layers[0]
text_vectorizer.adapt(movie_title_lookup.get_vocabulary())

# Run the model on a single-example batch to inspect the combined representation.
for rating_batch in ratings.batch(1).take(1):
    print(f"Computed representations: {movie_model(rating_batch)}")









Computed representations: [[ 0.01668649 -0.02470998  0.00087453 -0.01957372 -0.03918434  0.03484757
   0.04127902  0.04971499  0.00717026 -0.03783311  0.02099004 -0.0333066
   0.00397264 -0.00721947  0.01684313  0.00868932  0.04040403  0.02759239
   0.02041711 -0.04278275 -0.03107814  0.03146942 -0.04349905 -0.04913602
  -0.03564036  0.01462188  0.03932277 -0.02871143  0.00535514 -0.03882653
   0.03556228 -0.0487872   0.00037591  0.0005873  -0.00334165  0.00749038
  -0.02618823 -0.01187599 -0.00856781 -0.00213595 -0.00520481  0.01038452
  -0.01337023 -0.00917381 -0.00410407 -0.00226621  0.00111128 -0.00036775
  -0.00890375  0.00025452  0.00016249  0.00018749 -0.01879689  0.00883781
   0.00048551 -0.01632429 -0.00713462  0.02292983 -0.02531117 -0.01108099
   0.00531709 -0.00898669  0.01554871 -0.00992816]]
