In [27]:
import os
import math
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import matplotlib
import numpy

In [28]:
# Read the three pipe-delimited source tables. Whatever header row each file
# carries is replaced below by the column names used throughout the notebook.
users_df = pd.read_csv('Data/u.csv', sep='|')
movies_df = pd.read_csv('Data/items.csv', sep='|')
ratings_df = pd.read_csv('Data/votes.csv', sep='|')

users_df.columns = ['user_id', 'age', 'gender', 'job', 'zip']
ratings_df.columns = ['user_id', 'movie_id', 'rating', 'timestamp']

# Movie table: descriptive fields first, then one column per genre.
movies_df.columns = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url',
    'unknown', "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

In [29]:
# Tag categorical values with their column name ("user_id_1", "age_24", ...)
# so they become unambiguous string tokens for the vocabulary lookups.
users_df["user_id"] = "user_id_" + users_df["user_id"].astype(str)
users_df["age"] = "age_" + users_df["age"].astype(str)
users_df["job"] = "job_" + users_df["job"].astype(str)

movies_df["movie_id"] = "movie_id_" + movies_df["movie_id"].astype(str)

ratings_df["movie_id"] = "movie_id_" + ratings_df["movie_id"].astype(str)
ratings_df["user_id"] = "user_id_" + ratings_df["user_id"].astype(str)
# Ratings are used as numeric model inputs/targets, so store them as floats.
ratings_df["rating"] = ratings_df["rating"].astype(float)

In [30]:
# Order the ratings chronologically, then collect each user's history into
# one list per column (one output row per user).
ratings_group = ratings_df.sort_values(by=["timestamp"]).groupby("user_id")

per_user_columns = {"user_id": list(ratings_group.groups.keys())}
per_user_columns["movie_id"] = list(ratings_group["movie_id"].apply(list))
per_user_columns["rating"] = list(ratings_group["rating"].apply(list))
per_user_columns["timestamp"] = list(ratings_group["timestamp"].apply(list))

ratings_data = pd.DataFrame(data=per_user_columns)

In [31]:
# Number of consecutive ratings in one window (passed to create_sequences as
# window_size); the model inputs use sequence_length - 1 items as history.
sequence_length = 2
# Offset between the starts of two consecutive windows.
step_size = 1

def create_sequences(values, window_size, step_size):
    """Cut ``values`` into fixed-length sliding windows.

    Windows start at 0, step_size, 2*step_size, ... while a full window still
    fits. If the stride does not land exactly on the final window, that final
    window (the last ``window_size`` items) is appended so the tail of the
    history is never lost. It is appended only when it is not already present,
    so no window is emitted twice.

    Args:
        values: Sliceable sequence (e.g. a list) of items in chronological order.
        window_size: Number of items per window; must be positive.
        step_size: Distance between consecutive window starts; must be positive.

    Returns:
        A list of slices of ``values``, each of length ``window_size``. Empty
        when ``values`` holds fewer than ``window_size`` items.

    Raises:
        ValueError: If ``window_size`` or ``step_size`` is not positive
            (either would otherwise make the windowing loop never terminate).
    """
    if window_size <= 0 or step_size <= 0:
        raise ValueError("window_size and step_size must be positive integers")

    last_start = len(values) - window_size
    if last_start < 0:
        # Not enough items for even one full window.
        return []

    starts = list(range(0, last_start + 1, step_size))
    # Cover the tail only when the stride skipped past it.
    if starts[-1] != last_start:
        starts.append(last_start)

    return [values[start:start + window_size] for start in starts]


# Replace each user's full history with its list of fixed-length windows.
# Movies and ratings are windowed with the same parameters so that window i
# of one column corresponds to window i of the other.
ratings_data["movie_id"] = ratings_data["movie_id"].map(
    lambda movie_ids: create_sequences(movie_ids, sequence_length, step_size)
)
ratings_data["rating"] = ratings_data["rating"].map(
    lambda ratings: create_sequences(ratings, sequence_length, step_size)
)

# Timestamps were only needed for ordering.
ratings_data = ratings_data.drop(columns=["timestamp"])

In [32]:
# Go from one row per user to one row per (user, window). Both columns hold
# the same number of windows per user, so after ignore_index=True the two
# exploded frames line up row by row.
ratings_data_movies = ratings_data[["user_id", "movie_id"]].explode(
    "movie_id", ignore_index=True
)
ratings_data_rating = ratings_data[["rating"]].explode("rating", ignore_index=True)

ratings_data_transformed = pd.concat([ratings_data_movies, ratings_data_rating], axis=1)

# A user with fewer ratings than the window size has an empty window list,
# which explode() turns into a NaN placeholder row. Drop those rows here;
# otherwise the ",".join calls below fail with a TypeError.
ratings_data_transformed = ratings_data_transformed.dropna(
    subset=["movie_id", "rating"]
).reset_index(drop=True)

# Attach the user attributes (age, gender, job, zip) to every window.
ratings_data_transformed = ratings_data_transformed.join(
    users_df.set_index("user_id"), on="user_id"
)

# Serialise each window as a comma-separated string so it fits in one CSV field.
ratings_data_transformed["movie_id"] = ratings_data_transformed["movie_id"].apply(
    lambda movie_ids: ",".join(movie_ids)
)
ratings_data_transformed["rating"] = ratings_data_transformed["rating"].apply(
    lambda ratings: ",".join(str(value) for value in ratings)
)

# zip is dropped here and is not used as a model feature.
ratings_data_transformed = ratings_data_transformed.drop(columns=["zip"]).rename(
    columns={"movie_id": "sequence_movie_ids", "rating": "sequence_ratings"}
)

print(ratings_data_transformed.head())

     user_id         sequence_movie_ids sequence_ratings     age gender  \
0  user_id_1  movie_id_168,movie_id_172          5.0,5.0  age_24      M   
1  user_id_1  movie_id_172,movie_id_165          5.0,5.0  age_24      M   
2  user_id_1  movie_id_165,movie_id_156          5.0,4.0  age_24      M   
3  user_id_1  movie_id_156,movie_id_166          4.0,5.0  age_24      M   
4  user_id_1  movie_id_166,movie_id_196          5.0,5.0  age_24      M   

              job  
0  job_technician  
1  job_technician  
2  job_technician  
3  job_technician  
4  job_technician  


In [33]:
# These paths must match the files read by get_dataset_from_csv in the
# training cell; previously the (disabled) export wrote to "*-v2.csv" files
# that nothing in the notebook reads.
TRAIN_CSV_PATH = "train_data_100K.csv"
TEST_CSV_PATH = "test_data_100K.csv"

# Create the 80/20 split only when it is missing: existing files are never
# silently replaced, yet a fresh checkout can still Run All. Both files are
# written together so the pair always comes from the same split.
if not (os.path.exists(TRAIN_CSV_PATH) and os.path.exists(TEST_CSV_PATH)):
    # Fixed seed so the split is reproducible across runs.
    split_rng = np.random.default_rng(42)
    random_selection = split_rng.random(len(ratings_data_transformed.index)) <= 0.80
    train_data = ratings_data_transformed[random_selection]
    test_data = ratings_data_transformed[~random_selection]

    # No header row: the reader supplies the column names via CSV_HEADER.
    train_data.to_csv(TRAIN_CSV_PATH, index=False, sep="|", header=False)
    test_data.to_csv(TEST_CSV_PATH, index=False, sep="|", header=False)

In [34]:
# Column names, in file order, of the header-less train/test CSV files.
CSV_HEADER = ratings_data_transformed.columns.tolist()

# Distinct values (in order of first appearance) of every categorical feature;
# these lists are the vocabularies handed to the StringLookup layers.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "user_id": users_df["user_id"].unique().tolist(),
    "movie_id": movies_df["movie_id"].unique().tolist(),
    "gender": users_df["gender"].unique().tolist(),
    "age": users_df["age"].unique().tolist(),
    "job": users_df["job"].unique().tolist(),
}

# User-side categorical features embedded next to the user id.
USER_FEATURES = ["gender", "age", "job"]

MOVIE_FEATURES = ["genre"]

In [35]:
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    """Build a batched ``tf.data`` pipeline of (features, target) pairs from a CSV.

    The file is read as a header-less, "|"-delimited CSV whose columns are
    named by the module-level ``CSV_HEADER``. For every batch, the
    comma-separated "sequence_movie_ids" and "sequence_ratings" strings are
    split; the last movie becomes "target_movie_id", the last rating becomes
    the target, and the remaining items stay in the sequence features.

    Args:
        csv_file_path: Path (or file pattern) handed to ``make_csv_dataset``.
        shuffle: Whether ``make_csv_dataset`` should shuffle the records.
        batch_size: Number of records per batch.

    Returns:
        A dataset yielding ``(features_dict, target_ratings)`` for one epoch.
    """

    def split_off_target(batch):
        # "id_a,id_b,..." -> dense string tensor with one column per item.
        movie_ids = tf.strings.split(batch["sequence_movie_ids"], ",").to_tensor()
        # The last movie of each window is the one whose rating is predicted.
        batch["target_movie_id"] = movie_ids[:, -1]
        batch["sequence_movie_ids"] = movie_ids[:, :-1]

        rating_tokens = tf.strings.split(batch["sequence_ratings"], ",")
        ratings = tf.strings.to_number(rating_tokens, tf.dtypes.float32).to_tensor()
        # Everything but the last rating is input; the last one is the label.
        batch["sequence_ratings"] = ratings[:, :-1]

        return batch, ratings[:, -1]

    csv_batches = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        num_epochs=1,
        header=False,
        field_delim="|",
        shuffle=shuffle,
    )

    return csv_batches.map(split_off_target)

In [36]:
def create_model_inputs():
    """Create the Keras ``Input`` placeholders of the model, keyed by feature name.

    The two sequence inputs hold ``sequence_length - 1`` items (the history
    without the target); every other input holds a single value per example.
    """
    history_length = sequence_length - 1
    # (feature name, values per example, dtype)
    input_specs = [
        ("user_id", 1, tf.string),
        ("sequence_movie_ids", history_length, tf.string),
        ("target_movie_id", 1, tf.string),
        ("sequence_ratings", history_length, tf.float32),
        ("gender", 1, tf.string),
        ("age", 1, tf.string),
        ("job", 1, tf.string),
    ]
    return {
        feature_name: tf.keras.layers.Input(
            name=feature_name, shape=(width,), dtype=dtype
        )
        for feature_name, width, dtype in input_specs
    }

In [37]:
import math

def encode_input_features(
    inputs,
    include_user_id=True,
    include_user_features=True,
    include_movie_features=True,
):
    """Turn the raw Keras inputs into embedded tensors for the model.

    Relies on the module-level names ``CATEGORICAL_FEATURES_WITH_VOCABULARY``,
    ``USER_FEATURES``, ``movies_df`` and ``sequence_length``.

    Args:
        inputs: Mapping from feature name to Keras input. The keys read here
            are "target_movie_id", "sequence_movie_ids" and
            "sequence_ratings", plus "user_id" and every name in
            ``USER_FEATURES`` when the corresponding flag is set.
        include_user_id: Embed "user_id" as one of the non-sequence features.
        include_user_features: Embed every feature listed in ``USER_FEATURES``.
        include_movie_features: Combine each movie embedding with its genre
            vector through a shared Dense layer.

    Returns:
        A tuple ``(encoded_transformer_features, encoded_other_features)``.
        The first is the encoded history items followed by the encoded target
        movie, concatenated along axis 1. The second is the concatenation of
        the user-side embeddings, or ``None`` when both user flags are False.
    """

    encoded_transformer_features = []
    encoded_other_features = []

    # Names of the non-sequence features to embed, depending on the flags.
    other_feature_names = []
    if include_user_id:
        other_feature_names.append("user_id")
    if include_user_features:
        other_feature_names.extend(USER_FEATURES)

    ## Encode user features
    for feature_name in other_feature_names:
        # Convert the string input values into integer indices.
        # mask_token=None and num_oov_indices=0 reserve no extra slots, which
        # is why the Embedding below uses input_dim == len(vocabulary).
        # NOTE(review): a value missing from the vocabulary then has no
        # fallback index - confirm every CSV value is covered.
        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
        idx = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0)(
            inputs[feature_name]
        )
        # Compute embedding dimensions: floor of the square root of the vocabulary size.
        embedding_dims = int(math.sqrt(len(vocabulary)))
        # Create an embedding layer with the specified dimensions.
        embedding_encoder = tf.keras.layers.Embedding(
            input_dim=len(vocabulary),
            output_dim=embedding_dims,
            name=f"{feature_name}_embedding",
        )
        # Convert the index values to embedding representations.
        encoded_other_features.append(embedding_encoder(idx))

    ## Create a single embedding vector for the user features
    # Several embeddings are concatenated, a single one is used as is, and
    # None signals to the caller that no user-side features were requested.
    if len(encoded_other_features) > 1:
        encoded_other_features = tf.keras.layers.concatenate(encoded_other_features)
    elif len(encoded_other_features) == 1:
        encoded_other_features = encoded_other_features[0]
    else:
        encoded_other_features = None

    ## Create a movie embedding encoder
    movie_vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY["movie_id"]
    movie_embedding_dims = int(math.sqrt(len(movie_vocabulary)))
    # Create a lookup to convert string values to integer indices.
    movie_index_lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=movie_vocabulary,
        mask_token=None,
        num_oov_indices=0,
        name="movie_index_lookup",
    )
    # Create an embedding layer with the specified dimensions.
    movie_embedding_encoder = tf.keras.layers.Embedding(
        input_dim=len(movie_vocabulary),
        output_dim=movie_embedding_dims,
        name=f"movie_embedding",
    )

    ## Create a vector lookup for movie genres.
    # Row i of genre_vectors is row i of movies_df (the "unknown" column is
    # not included). NOTE(review): the lookup below is indexed with movie_idx,
    # so this presumably relies on the movie vocabulary following movies_df
    # row order with no duplicate ids - confirm.
    movie_genres = movies_df[["Action","Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
    "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]]
    genre_vectors = movie_genres.to_numpy()

    # A frozen Embedding initialised with the genre matrix: it only looks up
    # the genre row for an index and is never trained.
    movie_genres_lookup = tf.keras.layers.Embedding(
        input_dim=genre_vectors.shape[0],
        output_dim=genre_vectors.shape[1],
        embeddings_initializer=tf.keras.initializers.Constant(genre_vectors),
        trainable=False,
        name="genres_vector",
    )
    # Create a processing layer for genres; it projects the concatenated
    # [embedding, genres] vector back to movie_embedding_dims units.
    movie_embedding_processor = tf.keras.layers.Dense(
        units=movie_embedding_dims,
        activation="relu",
        name="process_movie_embedding_with_genres",
    )

    ## Define a function to encode a given movie id.
    # The same layer instances are reused on every call, so target and
    # sequence movies share their weights.
    def encode_movie(movie_id):
        # Convert the string input values into integer indices.
        movie_idx = movie_index_lookup(movie_id)
        movie_embedding = movie_embedding_encoder(movie_idx)
        encoded_movie = movie_embedding
        if include_movie_features:
            movie_genres_vector = movie_genres_lookup(movie_idx)
            encoded_movie = movie_embedding_processor(
                tf.keras.layers.concatenate([movie_embedding, movie_genres_vector])
            )
        return encoded_movie

    ## Encoding target_movie_id
    target_movie_id = inputs["target_movie_id"]
    encoded_target_movie = encode_movie(target_movie_id)

    ## Encoding sequence movie_ids.
    sequence_movies_ids = inputs["sequence_movie_ids"]
    encoded_sequence_movies = encode_movie(sequence_movies_ids)
    # Create positional embedding.
    # The table has sequence_length rows, but only positions
    # 0 .. sequence_length - 2 (one per history item) are looked up below.
    position_embedding_encoder = tf.keras.layers.Embedding(
        input_dim=sequence_length,
        output_dim=movie_embedding_dims,
        name="position_embedding",
    )
    positions = tf.range(start=0, limit=sequence_length - 1, delta=1)
    encodded_positions = position_embedding_encoder(positions)
    # Retrieve sequence ratings to incorporate them into the encoding of the movie.
    # The trailing axis added here lets each rating scale a whole embedding vector.
    sequence_ratings = tf.expand_dims(inputs["sequence_ratings"], -1)
    # Add the positional encoding to the movie encodings and multiply them by rating.
    encoded_sequence_movies_with_poistion_and_rating = tf.keras.layers.Multiply()(
        [(encoded_sequence_movies + encodded_positions), sequence_ratings]
    )

    # Construct the transformer inputs.
    # Split the sequence along axis 1 and give every item its axis back, so
    # each piece can be concatenated with the target movie encoding.
    # (Assumes the encodings are (batch, items, dims) - TODO confirm.)
    for encoded_movie in tf.unstack(
        encoded_sequence_movies_with_poistion_and_rating, axis=1
    ):
        encoded_transformer_features.append(tf.expand_dims(encoded_movie, 1))
    # The target movie goes last; it carries no position or rating term.
    encoded_transformer_features.append(encoded_target_movie)

    encoded_transformer_features = tf.keras.layers.concatenate(
        encoded_transformer_features, axis=1
    )

    return encoded_transformer_features, encoded_other_features

In [46]:
# Feature switches that create_model forwards to encode_input_features.
include_user_id = True
include_user_features = True
include_movie_features = True

# Units of each fully-connected layer stacked after the transformer block.
hidden_units = [64, 64, 64]

# Dropout rate shared by the attention layer, the transformer block and the dense stack.
dropout_rate = 0.1
# Number of heads of the MultiHeadAttention layer.
num_heads = 2

def create_model():
    """Assemble the rating-prediction network.

    The encoded movie sequence goes through one transformer block
    (self-attention and a dense projection, each wrapped in a residual
    connection and layer normalisation). The result is flattened, joined with
    the user-side features when present, and fed to a stack of dense layers
    ending in a single output.

    Reads the module-level settings ``include_user_id``,
    ``include_user_features``, ``include_movie_features``, ``hidden_units``,
    ``dropout_rate`` and ``num_heads``.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping the input dict to one value
        per example.
    """
    model_inputs = create_model_inputs()
    sequence_features, user_features = encode_input_features(
        model_inputs, include_user_id, include_user_features, include_movie_features
    )

    # Self-attention: the sequence serves as both query and value.
    attention_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=sequence_features.shape[2],
        dropout=dropout_rate,
    )
    attended = attention_layer(sequence_features, sequence_features)

    # Residual connection + normalisation around the attention output.
    attended = tf.keras.layers.Dropout(dropout_rate)(attended)
    residual = tf.keras.layers.Add()([sequence_features, attended])
    residual = tf.keras.layers.LayerNormalization()(residual)

    # Position-wise projection, again with residual connection + normalisation.
    projected = tf.keras.layers.LeakyReLU()(residual)
    projected = tf.keras.layers.Dense(units=projected.shape[-1])(projected)
    projected = tf.keras.layers.Dropout(dropout_rate)(projected)
    sequence_features = tf.keras.layers.Add()([residual, projected])
    sequence_features = tf.keras.layers.LayerNormalization()(sequence_features)
    features = tf.keras.layers.Flatten()(sequence_features)

    # Append the user-side embeddings, flattened to one vector per example.
    if user_features is not None:
        flat_user_features = tf.keras.layers.Reshape([user_features.shape[-1]])(
            user_features
        )
        features = tf.keras.layers.concatenate([features, flat_user_features])

    # Fully-connected head: Dense -> BatchNorm -> LeakyReLU -> Dropout per entry.
    for num_units in hidden_units:
        for layer in (
            tf.keras.layers.Dense(num_units),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.LeakyReLU(),
            tf.keras.layers.Dropout(dropout_rate),
        ):
            features = layer(features)

    raw_prediction = tf.keras.layers.Dense(units=1)(features)

    # Affine map of the raw output onto the rating scale. The Dense layer
    # above has no activation, so the result is not bounded to [min, max].
    max_rating = 5
    min_rating = 1
    rating_span = max_rating - min_rating
    scaled_prediction = tf.keras.layers.Lambda(
        lambda raw: raw * rating_span + min_rating
    )(raw_prediction)

    return tf.keras.Model(inputs=model_inputs, outputs=scaled_prediction)

# Build the network. Every run of this line rebinds `model` to a new network
# with freshly initialised weights.
model = create_model()

  return bool(asarray(a1 == a2).all())


In [39]:
# Compile the model.
model.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1), ## Adagrad, Adadelta work better
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

# Read the training data.
train_dataset = get_dataset_from_csv("train_data_100K.csv", shuffle=True, batch_size=265)

# Read the test data.
test_dataset = get_dataset_from_csv("test_data_100K.csv", batch_size=265)

# Callbacks
my_callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=3),
    tf.keras.callbacks.TensorBoard(log_dir='./logs')
]

# Fit the model with the training data.
model.fit(train_dataset, epochs=50, batch_size = 8, verbose = 1, validation_data=test_dataset, callbacks=my_callbacks)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50


<keras.callbacks.History at 0x20d15b73070>

In [40]:
## Saves model
model.save('my_bst_model')

## Loads model
#new_model = tf.keras.models.load_model('my_bst_model')



INFO:tensorflow:Assets written to: my_bst_model\assets


INFO:tensorflow:Assets written to: my_bst_model\assets
