In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset CSVs are read from under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout

In [3]:
# Folder on the mounted Drive that holds the four dataset CSVs.
# Kept in one place so the notebook can be pointed at another location with a single edit.
DATA_DIR = '/content/drive/MyDrive/KU Module/Project Dissertation/Project_Model'

# Load each table into its own DataFrame.
movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
links = pd.read_csv(f'{DATA_DIR}/links.csv')
ratings = pd.read_csv(f'{DATA_DIR}/ratings.csv')
tags = pd.read_csv(f'{DATA_DIR}/tags.csv')

In [4]:
# Preview the movies table (movieId, title, pipe-delimited genres).
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
# Schema check for movies: column dtypes, non-null counts and memory footprint.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [6]:
# Number of missing values in each movies column.
movies.isna().sum()

Unnamed: 0,0
movieId,0
title,0
genres,0


In [7]:
# Each 'genres' value is a pipe-delimited combination of genres, so these are
# distinct combinations rather than individual genre labels.
genre_combos = movies['genres']
print(genre_combos.nunique())
print(genre_combos.unique())

951
['Adventure|Animation|Children|Comedy|Fantasy'
 'Adventure|Children|Fantasy' 'Comedy|Romance' 'Comedy|Drama|Romance'
 'Comedy' 'Action|Crime|Thriller' 'Adventure|Children' 'Action'
 'Action|Adventure|Thriller' 'Comedy|Horror'
 'Adventure|Animation|Children' 'Drama' 'Action|Adventure|Romance'
 'Crime|Drama' 'Drama|Romance' 'Action|Comedy|Crime|Drama|Thriller'
 'Comedy|Crime|Thriller' 'Crime|Drama|Horror|Mystery|Thriller'
 'Drama|Sci-Fi' 'Children|Drama' 'Adventure|Drama|Fantasy|Mystery|Sci-Fi'
 'Mystery|Sci-Fi|Thriller' 'Children|Comedy' 'Drama|War'
 'Action|Crime|Drama' 'Action|Adventure|Fantasy' 'Comedy|Drama|Thriller'
 'Mystery|Thriller' 'Animation|Children|Drama|Musical|Romance'
 'Crime|Mystery|Thriller' 'Adventure|Drama' 'Drama|Thriller'
 'Comedy|Crime' 'Action|Sci-Fi|Thriller' 'Action|Comedy|Horror|Thriller'
 'Comedy|Drama' 'Documentary' 'Action|Crime|Drama|Thriller'
 'Crime|Drama|Romance' 'Action|Adventure|Drama' 'Action|Thriller'
 'Drama|Horror|Thriller' 'Comedy|Horror|Roman

In [8]:
# Preview the links table, which maps each movieId to its imdbId and tmdbId.
links

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [9]:
# Schema check for links: column dtypes, non-null counts and memory footprint.
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [10]:
# Number of missing values in each links column; tmdbId is the only column with gaps.
links.isna().sum()

Unnamed: 0,0
movieId,0
imdbId,0
tmdbId,8


In [11]:
# Preview the ratings table (userId, movieId, rating, timestamp).
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [12]:
# Schema check for ratings: column dtypes, non-null counts and memory footprint.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [13]:
# Number of missing values in each ratings column.
ratings.isna().sum()

Unnamed: 0,0
userId,0
movieId,0
rating,0
timestamp,0


In [14]:
# Number of distinct users who have rated at least one movie.
ratings['userId'].nunique()

610

In [15]:
# Preview the tags table (userId, movieId, free-text tag, timestamp).
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [16]:
# Schema check for tags: column dtypes, non-null counts and memory footprint.
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [17]:
# Number of missing values in each tags column.
tags.isna().sum()

Unnamed: 0,0
userId,0
movieId,0
tag,0
timestamp,0


In [18]:
# Second look at the ratings table (raw ids), just before the modelling section re-encodes userId/movieId.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [19]:
# Distinct rating values present in the data (the regression target used by the model below).
ratings['rating'].unique()

array([4. , 5. , 3. , 2. , 1. , 4.5, 3.5, 2.5, 0.5, 1.5])

# Initial Model Setup and Baseline Performance

## Neural Collaborative Filtering (NCF) Model

## Generalized Matrix Factorization (GMF)

In [20]:
# One LabelEncoder per id column: each maps the raw ids to consecutive integers
# starting at 0, which is the form the embedding layers index by.
user_encoder, movie_encoder = LabelEncoder(), LabelEncoder()

In [21]:
# Fit each encoder on its id column and overwrite that column with the encoded values.
# The encoders keep the raw-id mapping, so predictions can be translated back later.
for id_column, id_encoder in (('userId', user_encoder), ('movieId', movie_encoder)):
    ratings[id_column] = id_encoder.fit_transform(ratings[id_column])

In [22]:
# Hold out 20% of the ratings for testing; the fixed random_state keeps the split repeatable.
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

# Model inputs are the two encoded id columns; the regression target is the rating.
feature_columns = ['userId', 'movieId']
X_train, y_train = train[feature_columns], train['rating']
X_test, y_test = test[feature_columns], test['rating']

In [23]:
# Define constants: every tunable hyperparameter lives here and is referenced below,
# so changing a value in this block actually changes the model.
embedding_dim = 32
dense_units_1 = 128
dense_units_2 = 64
dropout_rate = 0.2
learning_rate = 0.001
random_seed = 42
# The ids were label-encoded to consecutive integers starting at 0, so the number of
# distinct ids equals (max id + 1), which is the vocabulary size each Embedding needs.
num_users = ratings['userId'].nunique()
num_movies = ratings['movieId'].nunique()

# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation, dropout masks and batch shuffling are repeatable across re-runs.
# NOTE(review): some GPU kernels may still be non-deterministic; confirm if exact
# reproducibility matters.
tf.keras.utils.set_random_seed(random_seed)

# Define input layers: one integer id per example for each side.
user_input = Input(shape=(1,), name='user_input')
movie_input = Input(shape=(1,), name='movie_input')

# Define embedding layers: a learned embedding_dim-wide vector per user and per movie.
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_embedding')(user_input)
movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_dim, name='movie_embedding')(movie_input)

# Flatten embeddings from (batch, 1, embedding_dim) to (batch, embedding_dim).
user_vec = Flatten(name='user_flatten')(user_embedding)
movie_vec = Flatten(name='movie_flatten')(movie_embedding)

# Concatenate embeddings.
# NOTE(review): the section heading says GMF, but concatenation followed by dense
# layers is the MLP-style NCF formulation; GMF would combine the two vectors with an
# element-wise product. Confirm which variant is intended.
concat = Concatenate(name='concat')([user_vec, movie_vec])

# Add dense layers (sizes and dropout rate come from the constants above).
x = Dense(dense_units_1, activation='relu', name='dense_1')(concat)
x = Dropout(dropout_rate, name='dropout')(x)
x = Dense(dense_units_2, activation='relu', name='dense_2')(x)
output = Dense(1, activation='linear', name='output')(x)  # Linear activation for regression

# Create and compile model. Adam is constructed explicitly so that learning_rate is
# honoured (the 'adam' string shortcut always uses the library default).
model = Model(inputs=[user_input, movie_input], outputs=output)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    loss='mean_squared_error',
    metrics=['mean_absolute_error'],
)

# Model summary
model.summary()

In [24]:
# Training configuration.
epochs = 10
batch_size = 64
validation_split = 0.2  # Keras holds out the last fraction of the samples, taken before shuffling

# Start timer for training time. perf_counter() is a monotonic clock intended for
# measuring intervals, so the result is unaffected by system clock adjustments.
start_time = time.perf_counter()

# Train the model.
# NOTE(review): in the logged run val_loss stops improving after about epoch 2 while
# the training loss keeps falling; consider early stopping if this is not meant to be
# an untuned baseline.
history = model.fit(
    [X_train['userId'], X_train['movieId']],
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=validation_split
)

# End timer for training time (includes the per-epoch validation passes).
training_time = time.perf_counter() - start_time

Epoch 1/10
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - loss: 2.4452 - mean_absolute_error: 1.1261 - val_loss: 0.8025 - val_mean_absolute_error: 0.6924
Epoch 2/10
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.7258 - mean_absolute_error: 0.6575 - val_loss: 0.7716 - val_mean_absolute_error: 0.6764
Epoch 3/10
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.6643 - mean_absolute_error: 0.6252 - val_loss: 0.7744 - val_mean_absolute_error: 0.6800
Epoch 4/10
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.6227 - mean_absolute_error: 0.6040 - val_loss: 0.7769 - val_mean_absolute_error: 0.6786
Epoch 5/10
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.5845 - mean_absolute_error: 0.5822 - val_loss: 0.7863 - val_mean_absolute_error: 0.6877
Epoch 6/10
[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

In [25]:
# Both Keras calls below take the same pair of id columns from the held-out split.
test_inputs = [X_test['userId'], X_test['movieId']]

# Keras' own evaluation: returns the compiled loss (MSE) followed by the MAE metric.
test_loss, test_mae = model.evaluate(test_inputs, y_test)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")

# Predict a rating for every test pair and collapse the (n, 1) output to a 1-D array.
predictions = model.predict(test_inputs)
predicted_ratings = predictions.flatten()

# Recompute the error metrics with scikit-learn; RMSE is the square root of MSE.
mse = mean_squared_error(y_test, predicted_ratings)
mae = mean_absolute_error(y_test, predicted_ratings)
rmse = np.sqrt(mse)

# Report the training time alongside the error metrics.
print(f"Training Time: {training_time:.2f} seconds")
print(f"MSE: {mse}, RMSE: {rmse}, MAE: {mae}")

[1m631/631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.8570 - mean_absolute_error: 0.7035
Test Loss: 0.8406106233596802, Test MAE: 0.6975073218345642
[1m631/631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Training Time: 32.01 seconds
MSE: 0.8406107713400364, RMSE: 0.916848281527558, MAE: 0.6975074909680635
