In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Multiply, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2



In [2]:
user_movie_ratings = pd.io.parsers.read_csv('ratings.dat', 
    names=['user_id', 'movie_id', 'rating', 'time'],
    engine='python', delimiter='::')


In [3]:
user_movie_ratings.drop(0, inplace=True)


In [4]:
# user_movie_ratings

user_movie_ratings['movie_id'] = user_movie_ratings['movie_id'].astype(int)
user_movie_ratings['user_id'] = user_movie_ratings['user_id'].astype(int)
user_movie_ratings = user_movie_ratings[['user_id', 'movie_id', 'rating']]
user_movie_ratings = user_movie_ratings.sort_values(by='movie_id')

user_movie_ratings


Unnamed: 0,user_id,movie_id,rating
502765,3087,1,5
746175,4452,1,5
438720,2681,1,5
637509,3840,1,5
637598,3841,1,2
...,...,...,...
779010,4653,3952,3
2898,23,3952,4
992932,5998,3952,4
246126,1489,3952,2


In [5]:

try:
    movie_data = pd.io.parsers.read_csv('movies.dat',
                                        names=['movie_id', 'title', 'genre'],
                                        engine='python', delimiter='::', encoding='utf-8')
except UnicodeDecodeError:
    print("UTF-8 decoding failed. Trying with 'latin1' encoding.")
    movie_data = pd.io.parsers.read_csv('movies.dat',
                                        names=['movie_id', 'title', 'genre'],
                                        engine='python', delimiter='::', encoding='latin1')

movie_data['movie_id'] = movie_data['movie_id'].astype(int)
movie_data['title'] = movie_data['title'].astype(str)  # Ensure title column is string type
movie_data = movie_data.sort_values(by='movie_id')

movie_data['serial_number'] = range(1, len(movie_data) + 1)
movie_data['serial_number'] = movie_data['serial_number'].astype(int)
movie_data = movie_data[['movie_id', 'title', 'serial_number']]
movie_data


movie_data

UTF-8 decoding failed. Trying with 'latin1' encoding.


Unnamed: 0,movie_id,title,serial_number
0,1,Toy Story (1995),1
1,2,Jumanji (1995),2
2,3,Grumpier Old Men (1995),3
3,4,Waiting to Exhale (1995),4
4,5,Father of the Bride Part II (1995),5
...,...,...,...
3878,3948,Meet the Parents (2000),3879
3879,3949,Requiem for a Dream (2000),3880
3880,3950,Tigerland (2000),3881
3881,3951,Two Family House (2000),3882


In [6]:
merged_df = pd.merge(user_movie_ratings, movie_data, on='movie_id', how='left')
merged_df = merged_df.dropna(subset=['title'])
merged_df = merged_df.dropna(subset=['movie_id'])
merged_df['serial_number'] = merged_df['serial_number'].astype(int)

merged_df

Unnamed: 0,user_id,movie_id,rating,title,serial_number
0,3087,1,5,Toy Story (1995),1
1,4452,1,5,Toy Story (1995),1
2,2681,1,5,Toy Story (1995),1
3,3840,1,5,Toy Story (1995),1
4,3841,1,2,Toy Story (1995),1
...,...,...,...,...,...
1000203,4653,3952,3,"Contender, The (2000)",3883
1000204,23,3952,4,"Contender, The (2000)",3883
1000205,5998,3952,4,"Contender, The (2000)",3883
1000206,1489,3952,2,"Contender, The (2000)",3883


In [7]:
user_movie_ratings = merged_df[['user_id', 'serial_number', 'rating']]
user_movie_ratings = user_movie_ratings.sort_values(by='serial_number')
# movie_data = merged_df[[ 'serial_number', 'title']]
# user_movie_ratings['movie_id'] = range(1, len(movie_data) + 1)

user_movie_ratings


Unnamed: 0,user_id,serial_number,rating
0,3087,1,5
1394,808,1,4
1393,4041,1,5
1392,5268,1,5
1391,692,1,5
...,...,...,...
999943,3452,3883,4
999942,1487,3883,4
999941,531,3883,4
999963,151,3883,5


In [8]:
movie_data = merged_df[[ 'serial_number', 'title']]
# duplicated_rows
movie_data = movie_data.drop_duplicates(subset='title', keep='first')
movie_data = movie_data.dropna(subset=['title'])

movie_data

Unnamed: 0,serial_number,title
0,1,Toy Story (1995)
2077,2,Jumanji (1995)
2778,3,Grumpier Old Men (1995)
3256,4,Waiting to Exhale (1995)
3426,5,Father of the Bride Part II (1995)
...,...,...
998560,3879,Meet the Parents (2000)
999422,3880,Requiem for a Dream (2000)
999726,3881,Tigerland (2000)
999780,3882,Two Family House (2000)


In [9]:
movie_data['movie_id'] = range(1, len(movie_data) + 1)
merged_df = pd.merge(user_movie_ratings, movie_data, on='serial_number', how='left')
merged_df = merged_df.dropna(subset=['title'])



merged_df

Unnamed: 0,user_id,serial_number,rating,title,movie_id
0,3087,1,5,Toy Story (1995),1
1,808,1,4,Toy Story (1995),1
2,4041,1,5,Toy Story (1995),1
3,5268,1,5,Toy Story (1995),1
4,692,1,5,Toy Story (1995),1
...,...,...,...,...,...
1000203,3452,3883,4,"Contender, The (2000)",3706
1000204,1487,3883,4,"Contender, The (2000)",3706
1000205,531,3883,4,"Contender, The (2000)",3706
1000206,151,3883,5,"Contender, The (2000)",3706


In [10]:

merged_df['movie_id'] = merged_df['movie_id'].astype(int)
merged_df

Unnamed: 0,user_id,serial_number,rating,title,movie_id
0,3087,1,5,Toy Story (1995),1
1,808,1,4,Toy Story (1995),1
2,4041,1,5,Toy Story (1995),1
3,5268,1,5,Toy Story (1995),1
4,692,1,5,Toy Story (1995),1
...,...,...,...,...,...
1000203,3452,3883,4,"Contender, The (2000)",3706
1000204,1487,3883,4,"Contender, The (2000)",3706
1000205,531,3883,4,"Contender, The (2000)",3706
1000206,151,3883,5,"Contender, The (2000)",3706


In [11]:
user_movie_ratings = merged_df[['user_id', 'movie_id', 'rating']]   
user_movie_ratings = user_movie_ratings.sort_values(by='movie_id')
# movie_data = merged_df[[ 'serial_number', 'title']]
unique_serial_numbers = user_movie_ratings['movie_id'].nunique()
print("Number of unique serial number values:", unique_serial_numbers)
user_movie_ratings['rating'] = user_movie_ratings['rating'].astype(float)
user_movie_ratings['user_id'] = user_movie_ratings['user_id'].astype(int)

num_users = np.max(user_movie_ratings.user_id.values)
num_movies = unique_serial_numbers
non_zero_count = (user_movie_ratings['rating'] != 0).sum()
print(non_zero_count)
user_movie_ratings

Number of unique serial number values: 3706
1000208


Unnamed: 0,user_id,movie_id,rating
0,3087,1,5.0
1394,1998,1,5.0
1393,660,1,3.0
1392,4452,1,5.0
1391,2681,1,5.0
...,...,...,...
999943,4653,3706,3.0
999942,23,3706,4.0
999941,4543,3706,3.0
999963,2122,3706,3.0


In [12]:
movie_data = merged_df[['movie_id','title']]
movie_data = movie_data.drop_duplicates(subset='title', keep='first')
movie_data.to_csv('output.csv', index=False)  # Change 'output.csv' to the desired file path and nam
movie_data

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
2077,2,Jumanji (1995)
2778,3,Grumpier Old Men (1995)
3256,4,Waiting to Exhale (1995)
3426,5,Father of the Bride Part II (1995)
...,...,...
998560,3702,Meet the Parents (2000)
999422,3703,Requiem for a Dream (2000)
999726,3704,Tigerland (2000)
999780,3705,Two Family House (2000)


In [13]:
import tensorflow.keras as keras
from tensorflow.keras.layers import (
    Concatenate,
    Dense,
    Embedding,
    Flatten,
    Input,
    Multiply,
)
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from typing import List


def create_ncf(
    number_of_users: int,
    number_of_items: int,
    latent_dim_mf: int = 4,
    latent_dim_mlp: int = 32,
    reg_mf: int = 0,
    reg_mlp: int = 0.01,
    dense_layers: List[int] = [8, 4],
    reg_layers: List[int] = [0.01, 0.01],
    activation_dense: str = "relu",
) -> keras.Model:

    # input layer
    user = Input(shape=(1,), dtype="int32", name="user_id")
    item = Input(shape=(1,), dtype="int32", name="movie_id")

    # embedding layers
    mf_user_embedding = Embedding(
        input_dim=number_of_users,
        output_dim=latent_dim_mf,
        name="mf_user_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mf),
    )

    mf_item_embedding = Embedding(
        input_dim=number_of_items,
        output_dim=latent_dim_mf,
        name="mf_item_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mf),
    )

    mlp_user_embedding = Embedding(
        input_dim=number_of_users,
        output_dim=latent_dim_mlp,
        name="mlp_user_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mlp),
    )
    mlp_item_embedding = Embedding(
        input_dim=number_of_items,
        output_dim=latent_dim_mlp,
        name="mlp_item_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mlp),
    )

    # MF vector
    mf_user_latent = Flatten()(mf_user_embedding(user))
    mf_item_latent = Flatten()(mf_item_embedding(item))
    mf_cat_latent = Multiply()([mf_user_latent, mf_item_latent])

    # MLP vector
    mlp_user_latent = Flatten()(mlp_user_embedding(user))
    mlp_item_latent = Flatten()(mlp_item_embedding(item))
    mlp_cat_latent = Concatenate()([mlp_user_latent, mlp_item_latent])

    mlp_vector = mlp_cat_latent

    # build dense layers for model
    for i in range(len(dense_layers)):
        layer = Dense(
            dense_layers[i],
            activity_regularizer=l2(reg_layers[i]),
            activation=activation_dense,
            name="layer%d" % i,
        )
        mlp_vector = layer(mlp_vector)

    predict_layer = Concatenate()([mf_cat_latent, mlp_vector])

    result = Dense(
        1, activation="sigmoid", kernel_initializer="lecun_uniform", name="rating"
    )

    output = result(predict_layer)

    model = Model(
        inputs=[user, item],
        outputs=[output],
    )

    return model


In [14]:
# collapse
from tensorflow.keras.optimizers import Adam
ratings_mat = np.ndarray(
    shape = (np.max(user_movie_ratings.user_id.values),np.max(user_movie_ratings.movie_id.values))
)
ratings_mat[user_movie_ratings.user_id.values-1, user_movie_ratings.movie_id.values-1] = user_movie_ratings.rating.values


n_users, n_items = ratings_mat.shape
ncf_model = create_ncf(n_users, n_items)

ncf_model.compile(
    optimizer=Adam(), 
    loss="binary_crossentropy",
    metrics=[
        tf.keras.metrics.TruePositives(name="tp"),
        tf.keras.metrics.FalsePositives(name="fp"),
        tf.keras.metrics.TrueNegatives(name="tn"),
        tf.keras.metrics.FalseNegatives(name="fn"),
        tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall"),
        tf.keras.metrics.AUC(name="auc"),
    ],
)
ncf_model._name = "neural_collaborative_filtering"
ncf_model.summary()

In [15]:
# def make_tf_dataset(
#     df: pd.DataFrame,
#     targets: List[str],
#     val_split: float = 0.1,
#     batch_size: int = 512,
#     seed=42,
# ):
#     """Make TensorFlow dataset from Pandas DataFrame.
#     :param df: input DataFrame - only contains features and target(s)
#     :param targets: list of columns names corresponding to targets
#     :param val_split: fraction of the data that should be used for validation
#     :param batch_size: batch size for training
#     :param seed: random seed for shuffling data - `None` won't shuffle the data"""

#     n_val = round(df.shape[0] * val_split)
#     if seed:
#         # shuffle all the rows
#         x = df.sample(frac=1, random_state=seed).to_dict("series")
#     else:
#         x = df.to_dict("series")
#     y = dict()
#     for t in targets:
#         y[t] = x.pop(t)
#     ds = tf.data.Dataset.from_tensor_slices((x, y))

#     ds_val = ds.take(n_val).batch(batch_size)
#     ds_train = ds.skip(n_val).batch(batch_size)
#     return ds_train, ds_val

In [16]:
# ds_train, ds_val = make_tf_dataset(user_movie_ratings, ["rating"])

# # for features, targets in ds_val.take(1):  # Take one batch as an example
# #     print("Features:")
# #     print(features)
# #     print("Targets:")
# #     print(targets)


Features:
{'user_id': <tf.Tensor: shape=(512,), dtype=int32, numpy=
array([3017, 1181,  934,   48, 2554, 1146,  352, 1837, 5788, 5897, 2826,
       1667, 1791, 1982, 2942, 2901, 3130, 1448, 2131, 3539, 3876, 2899,
       2575,  560, 5313, 5357, 5246, 5941, 1924, 3259, 5699, 4310, 1600,
       5752,  534,  889, 4596, 1758, 2194, 4470, 3503, 1545, 2989, 1194,
       3591, 5910,  949, 4602, 4531, 2203, 1958,  891, 3490, 5367, 2744,
       5590, 5652,  543, 1749, 5747, 4359,  240, 4869, 2223,  352, 3916,
        550, 3087,  587, 2122, 4739, 1088,  125, 2566, 4523, 1680, 1392,
       4042, 3768, 2427, 3762, 2185, 5779,  798, 4736,  949, 1974,  386,
       4227, 1968, 1243, 4344, 4887, 1680, 1593, 1926, 2860, 2251,  176,
       3420, 1579, 1897, 5232, 2638, 1850, 3719, 3905,  520, 5608, 4491,
       2015, 4808,  352, 5000, 2665, 2888, 3154,  949, 1038, 4344, 1882,
        129,  996, 5812, 1676, 3519, 5480, 5406, 1962, 1955, 5793, 3996,
       5211, 4351, 1854, 2861, 6010, 3512, 1910, 2099, 1

In [25]:
# # Define a function to get the shape of each element in the dataset
# def get_element_shapes(dataset):
#     element_shapes = []
#     for element in dataset:
#         if isinstance(element, dict):
#             shapes = {key: item.shape if hasattr(item, 'shape') else None for key, item in element.items()}
#         elif isinstance(element, tuple):
#             shapes = [item.shape if hasattr(item, 'shape') else None for item in element]
#         else:
#             shapes = element.shape if hasattr(element, 'shape') else None
#         element_shapes.append(shapes)
#     return element_shapes

# # Get the shapes of elements in the training dataset
# train_shapes = get_element_shapes(ds_train)
# val_shapes = get_element_shapes(ds_val)

# # Print the shapes
# total_samples_train = len(ds_train)

# print("Training dataset shapes:", train_shapes)
# print("Validation dataset shapes:", val_shapes)
# number_of_batches = total_samples_train / 512
# print(number_of_batches)
# if len(user_movie_ratings) == len(ds_train) + len(ds_val):
#     print("All data from user_movie_ratings is included in training and validation datasets.")
# else:
#     print("Some data from user_movie_ratings is missing in training or validation datasets.")


Training dataset shapes: [[None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, None], [None, N

In [32]:
# import numpy as np

# # Assuming user_movie_ratings is your DataFrame
# # If not, replace it with your actual DataFrame name

# # Define the percentage of data to be allocated to the validation set
# val_split_percent = 0.1

# # Get unique user IDs
# unique_users = user_movie_ratings['user_id'].unique()

# # Initialize lists to store data for training and validation sets
# train_data = []
# val_data = []

# # Iterate over unique users
# for user_id in unique_users:
#     # Filter interactions for the current user
#     user_interactions = user_movie_ratings[user_movie_ratings['user_id'] == user_id]
    
#     # Determine the number of interactions to allocate to the validation set
#     num_val_interactions = int(np.ceil(len(user_interactions) * val_split_percent))
    
#     # Randomly shuffle interactions for the current user
#     user_interactions = user_interactions.sample(frac=1, random_state=42)
    
#     # Split interactions into training and validation sets
#     val_data.extend(user_interactions[:num_val_interactions].values.tolist())
#     train_data.extend(user_interactions[num_val_interactions:].values.tolist())

# # Convert lists to DataFrames
# train_df = pd.DataFrame(train_data, columns=user_movie_ratings.columns)
# val_df = pd.DataFrame(val_data, columns=user_movie_ratings.columns)

# # Now, you have properly split your dataset into training and validation sets
# # You can proceed to create TensorFlow datasets from train_df and val_df

# # Now, you have properly split your dataset into training and validation sets using stratified sampling
# if len(user_movie_ratings) == len(train_df) + len(val_df):
#     print("All data from user_movie_ratings is included in training and validation datasets.")
# else:
#     print("Some data from user_movie_ratings is missing in training or validation datasets.")

# batch_size = 512

# def create_tf_dataset(df):
#     x_dict = df.drop(columns=['rating']).to_dict("series")
#     y = df['rating']
#     ds = tf.data.Dataset.from_tensor_slices((x_dict, y))
#     return ds.batch(batch_size)

# ds_train = create_tf_dataset(train_df)
# ds_val = create_tf_dataset(val_df)
# if len(user_movie_ratings) == len(ds_train) + len(ds_val):
#     print("All data from user_movie_ratings is included in training and validation datasets.")
# else:
#     print("Some data from user_movie_ratings is missing in training or validation datasets.")

All data from user_movie_ratings is included in training and validation datasets.
Some data from user_movie_ratings is missing in training or validation datasets.


In [40]:
import numpy as np
import pandas as pd
import tensorflow as tf

def df_to_tfrecord(df, filename):
    with tf.io.TFRecordWriter(filename) as writer:
        for _, row in df.iterrows():
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'user_id': tf.train.Feature(float_list=tf.train.FloatList(value=[row['user_id']])),
                    'movie_id': tf.train.Feature(int64_list=tf.train.Int64List(value=[int(row['movie_id'])])),
                    'rating': tf.train.Feature(float_list=tf.train.FloatList(value=[row['rating']])),
                    # Add more features as needed
                }
            ))
            writer.write(example.SerializeToString())

def load_tfrecord_dataset(filename, batch_size):
    def parse_tfrecord_fn(example):
        feature_description = {
            'user_id': tf.io.FixedLenFeature([], tf.float32),
            'movie_id': tf.io.FixedLenFeature([], tf.int64),
            'rating': tf.io.FixedLenFeature([], tf.float32),
            # Add more features as needed
        }
        example = tf.io.parse_single_example(example, feature_description)
        return example['user_id'], example['movie_id'], example['rating']

    dataset = tf.data.TFRecordDataset(filename)
    dataset = dataset.map(parse_tfrecord_fn)
    dataset = dataset.batch(batch_size)
    return dataset

# Split the data into train and validation
train_size = int(len(user_movie_ratings) * 0.9)
train_data = user_movie_ratings[:train_size]
val_data = user_movie_ratings[train_size:]

# Convert and save to TFRecord format
df_to_tfrecord(train_data, 'train_data.tfrecord')
df_to_tfrecord(val_data, 'val_data.tfrecord')

# Load TFRecord datasets
batch_size = 32
train_dataset = load_tfrecord_dataset('train_data.tfrecord', batch_size)
val_dataset = load_tfrecord_dataset('val_data.tfrecord', batch_size)


In [44]:
import os
import tensorflow as tf
import pandas as pd

def df_to_tfrecord(df, filename):
    with tf.io.TFRecordWriter(filename) as writer:
        for _, row in df.iterrows():
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'user_id': tf.train.Feature(float_list=tf.train.FloatList(value=[row['user_id']])),
                    'movie_id': tf.train.Feature(int64_list=tf.train.Int64List(value=[int(row['movie_id'])])),
                    # Add more features as needed
                }
            ))
            writer.write(example.SerializeToString())

def parse_tfrecord_fn(example):
    feature_description = {
        'user_id': tf.io.FixedLenFeature([], tf.float32),
        'movie_id': tf.io.FixedLenFeature([], tf.int64),
        # Add more features as needed
    }
    example = tf.io.parse_single_example(example, feature_description)
    return example['user_id'], example['movie_id']  # Return features and labels

def load_tfrecord_dataset(filename, batch_size):
    dataset = tf.data.TFRecordDataset(filename)
    dataset = dataset.map(parse_tfrecord_fn)
    dataset = dataset.batch(batch_size)
    return dataset

# Split the data into training and validation sets
train_size = int(len(user_movie_ratings) * 0.9)
train_data = user_movie_ratings[:train_size]
val_data = user_movie_ratings[train_size:]

# Convert data to TFRecord format
df_to_tfrecord(train_data, 'train_data.tfrecord')
df_to_tfrecord(val_data, 'val_data.tfrecord')

# Create datasets
batch_size = 32
train_dataset = load_tfrecord_dataset('train_data.tfrecord', batch_size)
val_dataset = load_tfrecord_dataset('val_data.tfrecord', batch_size)

# Model training
train_hist = ncf_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    verbose=1,
)


Epoch 1/10


ValueError: Layer 'functional_1' expected 2 input(s). Received 1 instead.

In [33]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Assuming ncf_model is your trained NCF model
# Extract the embeddings for movies
movie_embeddings = ncf_model.get_layer('mlp_item_embedding').get_weights()[0]

# Calculate cosine similarity between all pairs of movie embeddings
cosine_similarities = cosine_similarity(movie_embeddings)

# Define a function to get top 10 similar movies for a given movie_id
def get_top_similar_movies(movie_id, movie_data):
    # Get the index of the movie_id in the embeddings
    movie_index = movie_id - 1  # Assuming movie_id starts from 1
    # Calculate similarity scores for all movies
    similarity_scores = cosine_similarities[movie_index]
    # Sort the movies based on similarity scores
    similar_movies_indices = np.argsort(similarity_scores)[::-1][1:11]  # Exclude the movie itself
    # Get the titles of the top 10 similar movies
    similar_movie_titles = movie_data.loc[movie_data['movie_id'].isin(similar_movies_indices + 1), 'title'].values
    return similar_movie_titles

# Example usage:
movie_id = 41 # Example movie_id
top_similar_movie_titles = get_top_similar_movies(movie_id, movie_data)
print("Top 10 similar movies for movie_id", movie_id, ":")
for title in top_similar_movie_titles:
    print(title)


Top 10 similar movies for movie_id 41 :
Better Off Dead... (1985)
Back to the Future Part II (1989)
Saboteur (1942)
Sid and Nancy (1986)
High Plains Drifter (1972)
Time Bandits (1981)
Honeymoon in Vegas (1992)
Man with the Golden Arm, The (1955)
Boys and Girls (2000)
Creature From the Black Lagoon, The (1954)
