<a href="https://colab.research.google.com/github/vishnuaswanth/Movie_recommender/blob/main/My_model_sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only: mount Google Drive at /content/drive so later cells can reach files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
from time import time
from scipy.sparse import csc_matrix
import pandas as pd
import numpy as np
import h5py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.nn.parameter import Parameter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import itertools



# Fix the PyTorch RNG seed so weight initialisation and dropout masks are repeatable.
torch.manual_seed(1284)
# NOTE(review): `device` is defined here, but none of the cells visible in this notebook move the
# model or tensors onto it - confirm whether GPU execution was intended.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Work from the dataset folder so the relative file names used below (CSV, .pth, .pkl) resolve there.
os.chdir('/content/drive/MyDrive/Main project - Movie recommender/ml-25m/')


Create the model-metrics DataFrame only when `model_metrics.csv` is not found.

In [None]:
# ==================================================================
# Load model_metrics.csv when it already exists; otherwise start an
# empty metrics table with the expected columns.
# ==================================================================

if os.path.exists('model_metrics.csv'):
    # Re-use previously saved results instead of silently replacing them with an empty table.
    model_metrics_df = pd.read_csv('model_metrics.csv')
else:
    model_metrics_df = pd.DataFrame(columns=['model_name','number_of_users', 'number_of_movies','RMSE', 'MAE', 'NDCG', 'NDCG_10'])

## Genome matrix

In [None]:
def create_train_test_data_for_genome(path='./', train_movies = 500, test_movies = 500):
    """Build movie x tag relevance matrices from ``genome-scores.csv``.

    Movies are ordered by ``movieId``. The first ``train_movies`` ids form the
    training set and the first ``test_movies`` ids form the evaluation set, so
    the two sets overlap: the smaller one is a prefix of the larger one.

    Args:
        path: Directory containing ``genome-scores.csv``. A trailing separator
            is optional; ``''`` means the current working directory.
        train_movies: Number of leading movie ids used for the training matrix.
        test_movies: Number of leading movie ids used for the evaluation matrix.

    Returns:
        Tuple ``(train_matrix, test_matrix, test_movie_set, train_movie_set)``.
        Each matrix has one row per movie, a ``movieId`` column and one column
        per ``tagId``; the two sets are the sorted movie-id lists that were used.
    """
    # os.path.join accepts the directory with or without a trailing separator,
    # whereas plain string concatenation produced a wrong file name without one.
    genome_base_data = pd.read_csv(os.path.join(path, 'genome-scores.csv'))
    genome_base_data['relevance'] = genome_base_data['relevance'].astype(float)

    sorted_unique_movies = sorted(genome_base_data['movieId'].unique())

    train_movie_set = sorted_unique_movies[:train_movies]
    test_movie_set = sorted_unique_movies[:test_movies]

    def _relevance_matrix(movie_ids):
        # Long -> wide (movieId rows, tagId columns); gaps get a tiny non-zero filler.
        subset = genome_base_data[genome_base_data['movieId'].isin(movie_ids)]
        wide = subset.pivot(index='movieId', columns='tagId', values='relevance')
        return wide.fillna(1e-12).reset_index()

    train_matrix = _relevance_matrix(train_movie_set)
    test_matrix = _relevance_matrix(test_movie_set)

    print(f'Number of unique movies in genome data: {len(sorted_unique_movies)}')
    print(f'Number of movies to train: {len(train_movie_set)}')
    print(f'Number of movies to test: {len(test_movie_set)}')
    print(f'train matrix shape {train_matrix.shape}')
    print(f'test matrix shape {test_matrix.shape}')
    return train_matrix, test_matrix, test_movie_set, train_movie_set

In [None]:
# Empty path: genome-scores.csv is read from the current working directory.
# Positional arguments are train_movies=1000 and test_movies=3000.
train_matrix, test_matrix, movie_list, train_movie_list = create_train_test_data_for_genome('',1000, 3000)

Number of unique movies in genome data: 13816
Number of movies to train: 500
Number of movies to test: 1000
train matrix shape (500, 1129)
test matrix shape (1000, 1129)


# Review Data

# Load Ratings Data

In [None]:
def create_train_test_data_for_reviews(total_movie_list, train_movie_list, n_u=2000, path='./'):
    """Load ``ratings.csv`` and cut it down to the selected movies and users.

    ``userId`` is re-encoded to zero-based integer codes over the whole ratings
    file. The user subset is the ``n_u`` smallest codes among users who rated
    at least one movie in ``total_movie_list``.

    Args:
        total_movie_list: Movie ids used for evaluation (train + test movies).
        train_movie_list: Movie ids used for training.
        n_u: Maximum number of users to keep.
        path: Directory containing ``ratings.csv``. A trailing separator is
            optional; ``''`` means the current working directory.

    Returns:
        Tuple ``(test_rating_data, train_rating_data, un_users)``: the two
        filtered rating frames and the sorted list of kept user codes.
    """
    # os.path.join accepts the directory with or without a trailing separator.
    ratings_data = pd.read_csv(os.path.join(path, 'ratings.csv'))
    ratings_data['rating'] = ratings_data['rating'].astype(float)

    # Encode userId as zero-indexed category codes (usable as embedding indices)
    ratings_data['userId'] = pd.Categorical(ratings_data['userId']).codes

    test_rating_data = ratings_data[ratings_data['movieId'].isin(total_movie_list)] # ratings for all movies (train+test movies)
    train_rating_data = ratings_data[ratings_data['movieId'].isin(train_movie_list)] # ratings for train movies only

    # Keep the first n_u user codes that appear in the evaluation ratings
    un_users = sorted(test_rating_data['userId'].unique())[:n_u]

    test_rating_data = test_rating_data[test_rating_data['userId'].isin(un_users)]
    train_rating_data = train_rating_data[train_rating_data['userId'].isin(un_users)]

    print(f'test data shape {test_rating_data.shape}')
    print(f'train data shape {train_rating_data.shape}')
    print(f'Number of reviewers considered: {len(un_users)}')

    return test_rating_data, train_rating_data, un_users


In [None]:
# n_u and path are left at their defaults (2000 users, './').
test_rating_data, train_rating_data, un_users = create_train_test_data_for_reviews(movie_list, train_movie_list)

test data shape (36678, 4)
train data shape (21741, 4)
Number of reviewers considered: 1000


# Merge Rating data with Genome Matrix

In [None]:
def merge_rating_data_with_genome_matrix(df, matrix):
    """Left-join rating rows with the per-movie genome matrix on ``movieId``.

    Every row of ``df`` is kept; rows whose ``movieId`` is absent from
    ``matrix`` receive NaN in the genome columns. The resulting shape is
    printed before the frame is returned.
    """
    joined = df.merge(matrix, how='left', on='movieId')
    print(f'merged data shape {joined.shape}')
    return joined

In [None]:
# Attach the training movies' genome features to every training rating row.
merged_training_data = merge_rating_data_with_genome_matrix(train_rating_data, train_matrix)

merged data shape (21741, 1132)


In [None]:
# Attach the evaluation movies' genome features to every evaluation rating row.
merged_test_data = merge_rating_data_with_genome_matrix(test_rating_data, test_matrix)

merged data shape (36678, 1132)


### Split the merged training data into training and validation sets

In [None]:
# Hold out 15% of the merged training rows for validation; the fixed random_state keeps the split repeatable.
train_df, val_df = train_test_split(merged_training_data, test_size=0.15, random_state=42)

# Tensors

In [None]:
def convert_data_to_tensors(df):
    """Turn a merged rating/genome frame into model-ready tensors.

    Returns:
        Tuple ``(user_ids, genome_features, y_tensor)``:
        ``user_ids`` is a 1-D long tensor built from ``userId``;
        ``genome_features`` is a float32 tensor of every column other than
        ``userId``, ``movieId``, ``rating`` and ``timestamp``;
        ``y_tensor`` is the ``rating`` column as a float32 column vector.
    """
    non_feature_columns = ['userId', 'movieId', 'rating', 'timestamp']

    ratings = torch.tensor(df['rating'].values, dtype=torch.float32)
    feature_block = df.drop(columns=non_feature_columns).values

    return (
        torch.tensor(df['userId'].values, dtype=torch.long),
        torch.tensor(feature_block, dtype=torch.float32),
        ratings.view(-1, 1),  # (n_rows, 1) to line up with the model output
    )



In [None]:
# Tensors for the training, validation and evaluation splits.
user_ids_train, genome_features_train, y_train_tensor = convert_data_to_tensors(train_df)
user_ids_val, genome_features_val, y_val_tensor = convert_data_to_tensors(val_df)
test_user_ids, test_genome_features, test_y_tensor = convert_data_to_tensors(merged_test_data)
# The former `test_NDCG_*` conversion was removed: `test_NDCG_data` is never defined in this
# notebook, so that line raised NameError on a fresh kernel, and its results were not used.

In [None]:
# Largest encoded user id across all three splits; used below to size the embedding table.
max_user_id = max(torch.max(user_ids_train).item(), torch.max(user_ids_val).item(), torch.max(test_user_ids).item())


## Neural network model and Embedding creation

In [None]:
### Model with Embeddings ###

# Define a neural network with user embeddings and dropout layers
class PerceptronWithEmbedding(nn.Module):
    """Rating regressor over a learned user embedding concatenated with genome features.

    The concatenated vector passes through three ReLU hidden layers
    (128, 64 and 32 units), each followed by dropout with p=0.3, and a final
    linear layer that emits one value per input row.
    """

    def __init__(self, num_users, embedding_dim, genome_dim):
        super().__init__()

        # One trainable vector of size embedding_dim per user index
        self.user_embedding = nn.Embedding(num_users, embedding_dim)

        # Hidden stack; attribute names are kept stable because they are the state_dict keys
        self.fc1 = nn.Linear(embedding_dim + genome_dim, 128)
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(128, 64)
        self.dropout2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(64, 32)
        self.dropout3 = nn.Dropout(0.3)
        self.fc4 = nn.Linear(32, 1)  # regression head

    def forward(self, user_id, genome_features):
        """Return a ``(batch, 1)`` tensor of predictions for the given rows."""
        hidden = torch.cat((self.user_embedding(user_id), genome_features), dim=1)

        hidden_stages = (
            (self.fc1, self.dropout1),
            (self.fc2, self.dropout2),
            (self.fc3, self.dropout3),
        )
        for linear, dropout in hidden_stages:
            hidden = dropout(torch.relu(linear(hidden)))

        return self.fc4(hidden)


### Initialize the Neural Network and the embeddings

In [None]:
# Initialize model with embedding
dim_u = max_user_id + 1 # Embedding table size: largest encoded userId + 1 (not a count of distinct users)
embedding_dim = 500  # Embedding space dimension for users
genome_dim = genome_features_train.shape[1]  # Dimension of genome features

model = PerceptronWithEmbedding(num_users=dim_u, embedding_dim=embedding_dim, genome_dim=genome_dim)

# Loss function and optimizer: mean squared error on the rating, Adam with lr=0.01
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

### Train Neural Network

In [None]:
# Full-batch training: each epoch performs a single optimiser step over the whole training set
epochs = 1500
model.train()  # dropout stays active for the entire training phase
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(user_ids_train, genome_features_train)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    if not epoch % 50:
        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

# Validation: switch off dropout and gradient tracking, then score the held-out split
model.eval()
with torch.no_grad():
    val_preds = model(user_ids_val, genome_features_val).squeeze().numpy()
y_val = y_val_tensor.squeeze().numpy()
mse_nn = mean_squared_error(y_val, val_preds)



Epoch 1, Loss: 14.926592826843262
Epoch 51, Loss: 1.4952545166015625
Epoch 101, Loss: 1.2745264768600464
Epoch 151, Loss: 1.0872846841812134
Epoch 201, Loss: 0.9718148708343506
Epoch 251, Loss: 0.8781037926673889
Epoch 301, Loss: 0.8189628720283508
Epoch 351, Loss: 0.7752320766448975
Epoch 401, Loss: 0.7217406630516052
Epoch 451, Loss: 0.6794877052307129
Epoch 501, Loss: 0.6411100625991821
Epoch 551, Loss: 0.6130377054214478
Epoch 601, Loss: 0.5808885097503662
Epoch 651, Loss: 0.5553146004676819
Epoch 701, Loss: 0.5288379192352295
Epoch 751, Loss: 0.506364107131958
Epoch 801, Loss: 0.48895135521888733
Epoch 851, Loss: 0.4739910662174225
Epoch 901, Loss: 0.4621239900588989
Epoch 951, Loss: 0.4525378346443176
Epoch 1001, Loss: 0.4307727813720703
Epoch 1051, Loss: 0.42035460472106934
Epoch 1101, Loss: 0.4126986861228943
Epoch 1151, Loss: 0.39761289954185486
Epoch 1201, Loss: 0.3922157287597656
Epoch 1251, Loss: 0.38237544894218445
Epoch 1301, Loss: 0.3733557462692261
Epoch 1351, Loss: 0.3

In [None]:
# Print validation results
print(f'Neural Network - Validation score - MSE - {mse_nn}')

Neural Network - Validation score - MSE - 0.7903012037277222


### Save the Neural Network parameter weights

In [None]:
# Save only the model's parameters (state_dict); reloading requires constructing the same architecture first
torch.save(model.state_dict(), 'pytorch_my_model.pth')

## Get Neural Network Model metrics




In [None]:
# Test-set evaluation of the neural network (eval mode, no gradient tracking)
model.eval()
with torch.no_grad():
    test_preds_nn = model(test_user_ids, test_genome_features).squeeze().numpy()
y_test = test_y_tensor.squeeze().numpy()
# mse_test_nn = mean_squared_error(y_test, test_preds_nn)
rmse_test_nn = root_mean_squared_error(y_test, test_preds_nn)
mae_test_nn = mean_absolute_error(y_test, test_preds_nn)


In [None]:
# Report the neural network's error on the evaluation ratings
print(f'Neural network test score - MAE - {mae_test_nn}')
print(f'Neural network test score - RMSE - {rmse_test_nn}')

Neural network test score - MAE - 0.5637496709823608
Neural network test score - RMSE - 0.7770323157310486


In [None]:
# NOTE(review): later cells reassign best_preds (SVR, then random forest), so in a top-to-bottom
# run the last assignment is what the NDCG cells actually see.
best_preds = test_preds_nn # for NDCG calculation

# Load the saved Neural Network weights (if the model was trained previously)

In [None]:
# Restore the saved parameters into the already-constructed model.
# weights_only=True limits unpickling to tensors and plain containers, which is all a state_dict
# needs; map_location='cpu' keeps the checkpoint loadable on a machine without a GPU.
model.load_state_dict(torch.load('pytorch_my_model.pth', map_location='cpu', weights_only=True))
model.eval()

  model.load_state_dict(torch.load('pytorch_my_model.pth'))


PerceptronWithEmbedding(
  (user_embedding): Embedding(2006, 500)
  (fc1): Linear(in_features=1628, out_features=128, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (dropout2): Dropout(p=0.3, inplace=False)
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (dropout3): Dropout(p=0.3, inplace=False)
  (fc4): Linear(in_features=32, out_features=1, bias=True)
)

# Feature Space preparation for training and predicting linear model

In [None]:
class MachineLearningModel():
    """Builds feature matrices for classical models from user embeddings and genome features.

    ``embeddings`` is indexed by the integer ``userId`` of each row. Genome
    features are every column of the input frame other than ``userId``,
    ``movieId``, ``rating`` and ``timestamp``; they can be used as-is or
    reduced with a PCA fitted via :meth:`initialize_PCA`.
    """

    # Columns of the merged frame that are identifiers or targets, not genome features.
    _NON_FEATURE_COLUMNS = ['userId', 'movieId', 'rating', 'timestamp']

    def __init__(self, embeddings):
        # Array-like with one row per user index (e.g. the trained embedding weight matrix)
        self.embeddings = embeddings

    def _genome_features(self, df):
        """Return the genome-feature columns of ``df`` as a NumPy array."""
        return df.drop(columns=self._NON_FEATURE_COLUMNS).values

    def get_input_embeddings(self, df):
        """Return the embedding row of every ``userId`` in ``df``, in row order."""
        # A single vectorised gather replaces the per-row Python lookup.
        user_ids = np.asarray(df['userId'])
        input_embeddings = np.asarray(self.embeddings)[user_ids]
        return input_embeddings

    def hstack_data(self, input_embeddings, other_features):
        """Concatenate embeddings and other features column-wise."""
        return np.hstack((input_embeddings, other_features))

    def prepare_data(self, df):
        """Return ``[user embedding | raw genome features]`` for every row of ``df``."""
        input_embeddings = self.get_input_embeddings(df)
        input_linear_model = self.hstack_data(input_embeddings, self._genome_features(df))
        return input_linear_model

    def initialize_PCA(self, X, n_components=100):
        """Fit a PCA with ``n_components`` components on ``X`` and store it on ``self.PCA``."""
        pca = PCA(n_components=n_components)
        pca.fit(X)
        self.PCA = pca
        return

    def transform_data(self, X):
        """Project ``X`` with the stored PCA; :meth:`initialize_PCA` must have been called first."""
        return self.PCA.transform(X)

    def prepare_data_PCA(self, df):
        """Return ``[user embedding | PCA-reduced genome features]`` for every row of ``df``."""
        input_embeddings = self.get_input_embeddings(df)
        PCA_feature = self.transform_data(self._genome_features(df))
        input_linear_model = self.hstack_data(input_embeddings, PCA_feature)
        return input_linear_model

In [None]:
# Hand the trained user-embedding weight matrix (detached, as a NumPy array) to the feature builder.
input_mlm = MachineLearningModel(model.user_embedding.weight.detach().numpy())

In [None]:
# Feature matrices with the raw (non-PCA) genome features for each split.
X_train_lm = input_mlm.prepare_data(train_df)
X_val_lm = input_mlm.prepare_data(val_df)
X_test_lm = input_mlm.prepare_data(merged_test_data)

In [None]:
# Fit PCA (default n_components=100) on the genome-feature columns of the training split only.
input_mlm.initialize_PCA(train_df.drop(columns=['userId', 'movieId', 'rating', 'timestamp']).values)


In [None]:
# Feature matrices with PCA-reduced genome features for each split (requires the PCA fitted above).
X_PCA_train_lm = input_mlm.prepare_data_PCA(train_df)
X_PCA_val_lm = input_mlm.prepare_data_PCA(val_df)
X_PCA_test_lm = input_mlm.prepare_data_PCA(merged_test_data)

# SVR Model

In [None]:
# Support Vector Regression on the raw (non-PCA) features; max_iter=1000 caps the solver iterations
svm = SVR(max_iter=1000)
svm.fit(X_train_lm, train_df['rating'].values)
svm_preds = svm.predict(X_val_lm)
mse_svm = mean_squared_error(val_df['rating'].values, svm_preds)



In [None]:
# Evaluation-set predictions and MSE for the non-PCA SVR.
svm_preds_test = svm.predict(X_test_lm)
mse_svm_test = mean_squared_error(merged_test_data['rating'].values, svm_preds_test)

In [None]:
# Support Vector Regression on the PCA-reduced features; max_iter=5000 caps the solver iterations
svm_PCA = SVR(max_iter=5000)
svm_PCA.fit(X_PCA_train_lm, train_df['rating'].values)
svm_PCA_preds = svm_PCA.predict(X_PCA_val_lm)
# NOTE(review): this reuses the name mse_svm, overwriting the validation MSE of the non-PCA `svm`.
mse_svm = mean_squared_error(val_df['rating'].values, svm_PCA_preds)



In [None]:
# View validation score (label corrected from "SMR" to "SVR")
print(f'SVR - RBF kernel with PCA validation score - MSE - {mse_svm}')

### Save SVR weights

In [None]:
# Persist the fitted PCA-based SVR so it can be reloaded without retraining.
joblib.dump(svm_PCA, 'svm_model_PCA.pkl')

['svm_model_PCA.pkl']

# SVR with PCA and RBF kernel metrics calculation

In [None]:
# Evaluation-set predictions of the PCA-based SVR.
svm_PCA_preds_test = svm_PCA.predict(X_PCA_test_lm)


In [None]:
# Evaluation-set error of the PCA-based SVR. The RMSE must come from root_mean_squared_error:
# mean_squared_error returns the squared error, which was previously reported under the RMSE name.
rmse_svm_PCA_test = root_mean_squared_error(merged_test_data['rating'].values, svm_PCA_preds_test)
mae_svm_PCA_test = mean_absolute_error(merged_test_data['rating'].values, svm_PCA_preds_test)

In [None]:
# Report the PCA-based SVR's error on the evaluation ratings
print(f'SVR - RBF kernel with PCA test score - MAE - {mae_svm_PCA_test}')
print(f'SVR - RBF kernel with PCA test score - RMSE - {rmse_svm_PCA_test}')

SVR - RBF kernel with PCA test score - MAE - 0.9956080144487983
SVR - RBF kernel with PCA test score - RMSE - 1.3982190170590407


In [None]:
# Replaces any earlier best_preds assignment; a later cell assigns it again (random forest).
best_preds = svm_PCA_preds_test # for NDCG calculation


## Load Saved SVR Model

In [None]:
# Reload the PCA-based SVR saved earlier. joblib files are pickles - only load files you created yourself.
svm_PCA = joblib.load('svm_model_PCA.pkl')


# Random Forest Regression Model

In [None]:
# Random forest (20 trees, fixed seed) on the raw (non-PCA) features, scored on the validation split
rf = RandomForestRegressor(n_estimators=20, random_state=42)
rf.fit(X_train_lm, train_df['rating'].values)
rf_preds = rf.predict(X_val_lm)
mse_rf = mean_squared_error(val_df['rating'].values, rf_preds)

In [None]:
# Evaluation-set predictions and MSE for the non-PCA random forest. Stored under its own name
# (matching mse_svm_test) so the validation score in mse_rf is not overwritten.
rf_preds_test = rf.predict(X_test_lm)
mse_rf_test = mean_squared_error(merged_test_data['rating'].values, rf_preds_test)

In [None]:
# Random forest (20 trees, fixed seed) on the PCA-reduced features, scored on the validation split
rf_PCA = RandomForestRegressor(n_estimators=20, random_state=42)
rf_PCA.fit(X_PCA_train_lm, train_df['rating'].values)
rf_PCA_preds = rf_PCA.predict(X_PCA_val_lm)
mse_rf_PCA = mean_squared_error(val_df['rating'].values, rf_PCA_preds)

In [None]:
# Report the PCA-based random forest's validation MSE
print(f'Random forest with PCA validation score - MSE - {mse_rf_PCA}')

### Save Random forest model with PCA data

In [None]:
# Persist the fitted PCA-based random forest so it can be reloaded without retraining.
joblib.dump(rf_PCA, 'rf_model_PCA.pkl')

['rf_model_PCA.pkl']

# Random Forest regression with 20 estimators and PCA data metrics calculation

In [None]:
# Evaluation-set predictions, RMSE and MAE for the PCA-based random forest.
rf_PCA_preds_test = rf_PCA.predict(X_PCA_test_lm)
rmse_rf_PCA_test = root_mean_squared_error(merged_test_data['rating'].values, rf_PCA_preds_test)
mae_rf_PCA_test = mean_absolute_error(merged_test_data['rating'].values, rf_PCA_preds_test)

In [None]:
# Report the PCA-based random forest's error on the evaluation ratings
print(f'Random forest with PCA test score - MAE - {mae_rf_PCA_test}')
print(f'Random forest with PCA test score - RMSE - {rmse_rf_PCA_test}')

Random forest with PCA test score - MAE - 0.5297155219003721
Random forest with PCA test score - RMSE - 0.7412247380143584


In [None]:
# Last best_preds assignment in the notebook: in a top-to-bottom run the NDCG cells score these predictions.
best_preds = rf_PCA_preds_test # for NDCG calculation

## Load Random Forest regression with 20 estimators and PCA data Model

In [None]:
# Reload the PCA-based random forest saved earlier. joblib files are pickles - only load files you created yourself.
rf_PCA = joblib.load('rf_model_PCA.pkl')

Saving and Loading ML models

In [None]:
# Save the trained non-PCA models (SVR and random forest)
joblib.dump(svm, 'svm_model.pkl')
joblib.dump(rf, 'rf_model.pkl')

['rf_model.pkl']

In [None]:
# Load the non-PCA models back under separate names, leaving the in-memory `svm` / `rf` untouched
svm_model_loaded = joblib.load('svm_model.pkl')
rf_model_loaded = joblib.load('rf_model.pkl')

### NDCG and NDCG_10 functions

In [None]:
def dcg_k(score_label, k):
    """Discounted cumulative gain of the first ``k`` entries of ``score_label``.

    Args:
        score_label: Iterable of ``[score, label]`` pairs, already in ranking order.
        k: Number of leading pairs to include.

    Returns:
        Sum over rank ``i`` (0-based) of ``(2**label - 1) / log2(2 + i)``.
    """
    dcg = 0.
    # zip with range(k) stops after k pairs, or earlier when the ranking is shorter
    for rank, pair in zip(range(k), score_label):
        dcg += (2**pair[1] - 1) / np.log2(2 + rank)
    return dcg

def ndcg_k(y_hat, y, k):
    """Normalised DCG@k of the predictions ``y_hat`` against the true labels ``y``.

    Args:
        y_hat: 1-D array of predicted scores.
        y: 1-D array of true labels, aligned with ``y_hat``.
        k: Cut-off rank.

    Returns:
        DCG of the prediction-ordered list divided by the DCG of the
        label-ordered (ideal) list, or ``0.0`` when the ideal DCG is zero.
    """
    score_label = np.stack([y_hat, y], axis=1).tolist()
    by_prediction = sorted(score_label, key=lambda d: d[0], reverse=True)
    by_label = sorted(by_prediction, key=lambda d: d[1], reverse=True)

    # The ideal ordering is scored by the same DCG routine instead of a duplicated loop
    norm = dcg_k(by_label, k)
    if norm == 0:
        # No gain is achievable (all labels zero, or k == 0): avoid 0/0
        return 0.0
    return dcg_k(by_prediction, k) / norm

def _mean_user_ndcg(y_hat, y, k=None):
    """Average per-user NDCG; ``k=None`` uses each user's own number of rated items as cut-off."""
    y_hat, y = y_hat.T, y.T  # after the transpose each row holds one user
    n_users = y.shape[0]
    ndcg_sum, num = 0, 0

    for i in range(n_users):
        rated = np.where(y[i])  # non-zero entries of y mark the items this user rated
        y_i = y[i][rated]
        y_hat_i = y_hat[i][rated]

        if y_i.shape[0] < 2:
            # A ranking of fewer than two items carries no ordering information
            continue

        cutoff = y_i.shape[0] if k is None else k
        ndcg_sum += ndcg_k(y_hat_i, y_i, cutoff)  # user-wise calculation
        num += 1

    if num == 0:
        # No user had two or more rated items: the average is undefined
        return float('nan')
    return ndcg_sum / num

def call_ndcg(y_hat, y):
    """Mean per-user NDCG over each user's full list of rated items.

    Args:
        y_hat: 2-D array of predictions; columns are users (it is transposed internally).
        y: 2-D array of true ratings with the same layout; zero marks "not rated".

    Returns:
        The average NDCG over users with at least two rated items, or NaN when
        there is no such user.
    """
    return _mean_user_ndcg(y_hat, y)

def call_ndcg_10(y_hat, y):
    """Mean per-user NDCG@10; same inputs and conventions as :func:`call_ndcg`."""
    return _mean_user_ndcg(y_hat, y, 10)

Matrix preparation for NDCG

In [None]:
# Movies x users matrix of true ratings for NDCG.
# Missing (movie, user) pairs are filled with exactly 0 because call_ndcg / call_ndcg_10 select a
# user's rated movies with np.where(y[i]): a non-zero filler such as 1e-16 marks every movie as
# rated, so the "fewer than 2 ratings" user filter never triggers.
# Assumes real ratings are never 0 - confirm against the ratings data.
y_matrix = merged_test_data.pivot(index='movieId', columns='userId', values='rating')
y_matrix = y_matrix.fillna(0)
y = y_matrix.to_numpy()

In [None]:
# Same movies x users layout as the true-rating matrix, but holding the predictions in best_preds.
# best_preds must be aligned row-for-row with merged_test_data.
merged_test_data_hat = merged_test_data.copy()
merged_test_data_hat['rating'] = best_preds
y_hat_matrix = merged_test_data_hat.pivot(index='movieId', columns='userId', values='rating')
y_hat_matrix = y_hat_matrix.fillna(1e-16)
y_hat = y_hat_matrix.to_numpy()


In [None]:
# NDCG over each user's full rated list, and NDCG cut off at rank 10, for the predictions in y_hat.
ndcg_Val = call_ndcg(y_hat, y)
ndcg_10_Val = call_ndcg_10(y_hat, y)


In [None]:
# The NDCG values are computed from `best_preds` on merged_test_data, so label them as test scores
# of whichever model was last assigned to best_preds rather than hard-coding one model name.
print(f'Selected model (best_preds) - Test score - NDCG - {ndcg_Val}')
print(f'Selected model (best_preds) - Test score - NDCG_10 - {ndcg_10_Val}')

Neural Network - Validation score - NDCG - 0.9352666203065995
Neural Network - Validation score - NDCG_10 - 0.8392399849971608


# Add model metrics to the DataFrame and save as CSV

In [None]:
# Display the current NDCG value (it depends on which model best_preds held when it was computed)
ndcg_Val

0.9314661007898867

In [None]:
# Display the current NDCG@10 value (it depends on which model best_preds held when it was computed)
ndcg_10_Val

0.8285462084394231

In [None]:
# Re-print the SVR test metrics that go into the metrics row built in the next cell
print(f'SVR - RBF kernel with PCA test score - MAE - {mae_svm_PCA_test}')
print(f'SVR - RBF kernel with PCA test score - RMSE - {rmse_svm_PCA_test}')

In [None]:
# Append one result row to the metrics table.
# NOTE(review): the label says "2000 iterations" but svm_PCA above is built with max_iter=5000.
# NOTE(review): RMSE/MAE come from the SVR, while ndcg_Val / ndcg_10_Val come from whatever
# best_preds held when the NDCG cells ran (the random forest in a top-to-bottom run) - confirm
# that best_preds was set to svm_PCA_preds_test before recording this row.
new_row = {'model_name': 'SVM with PCA - 2000 iterations', 'number_of_users': 2000, 'number_of_movies': 3000 ,'RMSE':rmse_svm_PCA_test, 'MAE': mae_svm_PCA_test, 'NDCG': ndcg_Val, 'NDCG_10': ndcg_10_Val}
model_metrics_df = pd.concat([model_metrics_df, pd.DataFrame([new_row])], ignore_index=True)

In [None]:
# Preview the first rows of the accumulated metrics table
model_metrics_df.head()

Unnamed: 0,model_name,number_of_users,number_of_movies,RMSE,MAE,NDCG,NDCG_10
0,Neural Network,2000,3000,0.78398,0.575923,0.931466,0.828546
1,Random forest with PCA - 20 estimators,2000,3000,0.741225,0.529716,0.948769,0.88387
2,SVM with PCA - 1000 iterations,2000,3000,1.761355,1.136178,0.892603,0.728701
4,SVM with PCA - 2000 iterations,2000,3000,1.398219,0.995608,0.896163,0.740958


In [None]:
# Persist model_metrics_df as model_metrics.csv; index=False keeps the row index out of the file

model_metrics_df.to_csv('model_metrics.csv', index=False)

If the runtime restarts, reload the saved metrics

In [None]:
# Restore the metrics table from disk after a kernel/runtime restart
model_metrics_df = pd.read_csv('model_metrics.csv')