In [121]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

import time
import numpy as np

In [122]:
# Read the ratings export into a DataFrame (source for every later cell)
data = pd.read_csv(filepath_or_buffer='finalpopular.csv')

In [123]:
# Wide user x title rating matrix; a title the user never rated is stored as 0.
# It is reused later to find which movies a given user has already seen.
user_to_movie_df = (
    data
    .pivot_table(values='rating', index='userId', columns='title')
    .fillna(0)
)
movies_list = user_to_movie_df.columns
user_to_movie_df.head()

title,10 Things I Hate About You (1999),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",A.I. Artificial Intelligence (2001),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),...,Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)","Wolf of Wall Street, The (2013)",X-Men (2000),X-Men: The Last Stand (2006),X2: X-Men United (2003),You've Got Mail (1998),Young Frankenstein (1974),Zombieland (2009),Zoolander (2001)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,5.0,5.0,0.0,5.0,0.0,0.0,0.0,5.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0
4,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [124]:
# LabelEncoder maps every distinct value of a categorical column to a unique
# integer code; users and movies each get their own encoder.
user_encoder, movie_encoder = LabelEncoder(), LabelEncoder()

In [125]:
# fit_transform learns the distinct ids and returns, for every row, the integer
# code of that row's id; the original ids remain available via <encoder>.classes_
data['userId_encoded'] = user_encoder.fit_transform(data.userId)
data['movieId_encoded'] = movie_encoder.fit_transform(data.movieId)
data.head()

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,title,totalRatingCount,userId_encoded,movieId_encoded
0,0,1,1,4.0,Toy Story (1995),215,0,0
1,1,5,1,4.0,Toy Story (1995),215,4,0
2,2,7,1,4.5,Toy Story (1995),215,6,0
3,3,15,1,2.5,Toy Story (1995),215,14,0
4,4,17,1,4.5,Toy Story (1995),215,16,0


In [126]:
# Min-max scale the ratings so the regression target lies in [0, 1],
# which makes the model easier to train
min_rating, max_rating = data['rating'].min(), data['rating'].max()
data['rating_normalized'] = data['rating'].sub(min_rating).div(max_rating - min_rating)

data.head()

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,title,totalRatingCount,userId_encoded,movieId_encoded,rating_normalized
0,0,1,1,4.0,Toy Story (1995),215,0,0,0.777778
1,1,5,1,4.0,Toy Story (1995),215,4,0,0.777778
2,2,7,1,4.5,Toy Story (1995),215,6,0,0.888889
3,3,15,1,2.5,Toy Story (1995),215,14,0,0.444444
4,4,17,1,4.5,Toy Story (1995),215,16,0,0.888889


In [127]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
X = data.loc[:, ['userId_encoded', 'movieId_encoded']]
y = data.loc[:, 'rating_normalized']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [128]:
# Dataset wrapper that serves (user, movie, rating) triples by position
class RatingsDataset(Dataset):
    """Map-style dataset over three parallel sequences.

    Args:
        users: indexable sequence of encoded user ids (the notebook passes a 1-D tensor).
        movies: indexable sequence of encoded movie ids, aligned with ``users``.
        ratings: indexable sequence of target ratings, aligned with ``users``.

    Raises:
        ValueError: if the three sequences do not all have the same length.
    """

    def __init__(self, users, movies, ratings):
        # Fail fast: a length mismatch would otherwise only show up as an
        # IndexError part-way through an epoch, or silently drop rows.
        if not (len(users) == len(movies) == len(ratings)):
            raise ValueError(
                "users, movies and ratings must have the same length, got "
                f"{len(users)}, {len(movies)} and {len(ratings)}"
            )
        self.users = users
        self.movies = movies
        self.ratings = ratings

    def __len__(self):
        """Number of (user, movie, rating) rows."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the ``(user, movie, rating)`` triple stored at position ``idx``."""
        return self.users[idx], self.movies[idx], self.ratings[idx]


In [129]:
# Convert the pandas splits into tensor-backed PyTorch datasets
def _to_ratings_dataset(features, targets):
    """Build a RatingsDataset from an encoded-id frame and its rating series."""
    return RatingsDataset(
        torch.tensor(features['userId_encoded'].values, dtype=torch.long),
        torch.tensor(features['movieId_encoded'].values, dtype=torch.long),
        torch.tensor(targets.values, dtype=torch.float),
    )


train_dataset = _to_ratings_dataset(X_train, y_train)

test_dataset = _to_ratings_dataset(X_test, y_test)

In [130]:
# DataLoader serves the rows in mini-batches of 100; only the training
# batches are reshuffled each epoch, evaluation order stays fixed
train_loader = DataLoader(dataset=train_dataset, batch_size=100, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=100, shuffle=False)

In [131]:
# PyTorch model: user/movie embeddings followed by a small feed-forward regressor
class RecommenderNet(nn.Module):
    """Predict a rating from an (encoded user, encoded movie) pair.

    Each user and each movie gets a learned ``embedding_size``-dimensional
    vector; the two vectors are concatenated and passed through
    Linear(2*embedding_size, 128) -> ReLU -> Linear(128, 64) -> ReLU -> Linear(64, 1),
    with dropout applied before every linear layer.

    Args:
        num_users: number of distinct encoded user ids (rows of the user embedding).
        num_movies: number of distinct encoded movie ids (rows of the movie embedding).
        embedding_size: dimensionality of each embedding vector.
        dropout_rate: dropout probability used throughout the network.
    """

    def __init__(self, num_users, num_movies, embedding_size=50, dropout_rate=0.2):
        super().__init__()
        # nn.Embedding(A, B) is an A x B lookup table: one B-dimensional vector per id
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.movie_embedding = nn.Embedding(num_movies, embedding_size)
        # nn.Linear(A, B) maps A input features to B output features;
        # weights get PyTorch's default initialisation since none is specified
        self.fc1 = nn.Linear(embedding_size * 2, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        # A single dropout module is reused at every stage (it holds no parameters)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, user, movie):
        """Forward pass.

        Args:
            user: 1-D long tensor of encoded user ids, one per row of the batch.
            movie: 1-D long tensor of encoded movie ids, same length as ``user``.

        Returns:
            Tensor of shape ``(batch, 1)`` with the raw (unbounded) rating prediction.
        """
        # Look up one embedding_size-dimensional vector per user and per movie
        user_embed = self.user_embedding(user)
        movie_embed = self.movie_embedding(movie)
        # Concatenate along the feature axis -> (batch, 2 * embedding_size)
        x = torch.cat([user_embed, movie_embed], dim=1)
        x = self.dropout(x)
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        # No output activation: the network emits an unconstrained scalar per row
        x = self.fc3(x)
        return x

In [132]:
# Seed torch's global RNG so the weight initialisation below is repeatable,
# matching the seeded train/test split; DataLoader shuffling also draws from
# this RNG unless it is given its own generator.
torch.manual_seed(42)

# Size the embedding tables from the number of distinct ids each encoder saw
num_users = len(user_encoder.classes_)
num_movies = len(movie_encoder.classes_)
embedding_size = 50
model = RecommenderNet(num_users, num_movies, embedding_size)

In [47]:
# Mean-squared error between the predicted and the normalised ratings
criterion = nn.MSELoss()
# Adam optimiser over every trainable parameter of the model
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [10]:
# Train with mini-batches, measure the held-out loss after every epoch,
# checkpoint the best weights and stop early once the held-out loss stalls.
start_time = time.time()

epochs = 50
# Best held-out loss seen so far; starts at infinity so the first epoch always wins
best_loss = float('inf')
# Number of consecutive epochs without improvement tolerated before stopping
patience = 10
epochs_no_improve = 0

for epoch in range(epochs):
    # Training mode: dropout is active
    model.train()
    running_loss = 0.0
    for users, movies, ratings in train_loader:
        # Call the module itself (not .forward) so that any registered hooks run
        output = model(users, movies)
        # ratings is 1-D (batch,); the model emits (batch, 1), so add the trailing axis
        loss = criterion(output, ratings.unsqueeze(1))

        # Clear gradients left over from the previous batch
        optimizer.zero_grad()
        # Back-propagate to compute fresh gradients
        loss.backward()
        # Apply the Adam update
        optimizer.step()

        # MSELoss returns the batch mean; multiply by the batch size (an int) to
        # accumulate a sum, because the last batch may be smaller than the rest
        running_loss += loss.item() * users.size(0)

    # Average training loss per row for this epoch
    train_loss = running_loss / len(train_loader.dataset)

    # Evaluation mode: dropout is disabled
    model.eval()
    val_loss = 0.0
    # No gradients are needed while evaluating
    with torch.no_grad():
        for users, movies, ratings in test_loader:
            output = model(users, movies)
            loss = criterion(output, ratings.unsqueeze(1))
            val_loss += loss.item() * users.size(0)

    total_val_loss = val_loss / len(test_loader.dataset)

    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {total_val_loss:.4f}")

    # NOTE(review): the test split doubles as the validation set here, so the
    # checkpointed model is selected on the same data it is later judged on.
    if total_val_loss < best_loss:
        best_loss = total_val_loss
        # Keep the weights with the lowest held-out loss seen so far
        torch.save(model.state_dict(), 'best_model.pth')
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    # Early stopping: give up after `patience` consecutive epochs without improvement
    if epochs_no_improve >= patience:
        print("Early stopping triggered")
        break

print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed

Epoch 1/50, Train Loss: 0.0601, Val Loss: 0.0506
Epoch 2/50, Train Loss: 0.0480, Val Loss: 0.0430
Epoch 3/50, Train Loss: 0.0445, Val Loss: 0.0421
Epoch 4/50, Train Loss: 0.0424, Val Loss: 0.0428
Epoch 5/50, Train Loss: 0.0410, Val Loss: 0.0415
Epoch 6/50, Train Loss: 0.0397, Val Loss: 0.0402
Epoch 7/50, Train Loss: 0.0389, Val Loss: 0.0387
Epoch 8/50, Train Loss: 0.0382, Val Loss: 0.0388
Epoch 9/50, Train Loss: 0.0376, Val Loss: 0.0394
Epoch 10/50, Train Loss: 0.0373, Val Loss: 0.0381
Epoch 11/50, Train Loss: 0.0366, Val Loss: 0.0373
Epoch 12/50, Train Loss: 0.0362, Val Loss: 0.0373
Epoch 13/50, Train Loss: 0.0358, Val Loss: 0.0367
Epoch 14/50, Train Loss: 0.0352, Val Loss: 0.0362
Epoch 15/50, Train Loss: 0.0350, Val Loss: 0.0360
Epoch 16/50, Train Loss: 0.0348, Val Loss: 0.0367
Epoch 17/50, Train Loss: 0.0343, Val Loss: 0.0365
Epoch 18/50, Train Loss: 0.0342, Val Loss: 0.0377
Epoch 19/50, Train Loss: 0.0339, Val Loss: 0.0375
Epoch 20/50, Train Loss: 0.0336, Val Loss: 0.0371
Epoch 21/

In [118]:
# Lookup from each original movie id to its title
movie_id_dict = pd.Series(data['title'].values, index=data['movieId']).to_dict()

# Every distinct movie id, its LabelEncoder index, and those indices as a long tensor
all_movie_ids = data['movieId'].unique()
movie_indices = movie_encoder.transform(all_movie_ids)
movie_idx = torch.tensor(movie_indices, dtype=torch.long)

# Restore the checkpointed weights and switch the model to inference mode
model.load_state_dict(torch.load('best_model.pth'))
model.eval()

def recommend_movies(user_id, num_recommendations):
    """Return up to ``num_recommendations`` titles the user has not rated, best first.

    The model scores every movie in ``movie_idx`` for the given user; titles are
    walked in descending predicted rating and those the user already rated
    (a positive entry in ``user_to_movie_df``) are skipped.

    Args:
        user_id: original (un-encoded) user id; it is passed to
            ``user_encoder.transform`` and used as a row label of ``user_to_movie_df``.
        num_recommendations: maximum number of titles to return; zero or a
            negative value yields an empty list.

    Returns:
        list of movie titles, highest predicted rating first.
    """
    # Encode the user id and repeat it once per candidate movie
    encoded_user = int(user_encoder.transform([user_id])[0])
    user_idx = torch.full((len(movie_idx),), encoded_user, dtype=torch.long)

    # Make sure dropout is off so the ranking is deterministic
    model.eval()
    with torch.no_grad():
        # Score all candidate movies in one batch; the model returns (n_movies, 1).
        # squeeze(1) removes only that trailing axis, so the result stays 1-D
        # even when there is a single candidate movie.
        predicted_ratings = model(user_idx, movie_idx).squeeze(1)

    # Positions ordered by descending predicted rating
    sorted_indices = np.argsort(predicted_ratings.numpy())[::-1]
    sorted_movie_idx = movie_idx.numpy()[sorted_indices]
    top_movie_ids = movie_encoder.inverse_transform(sorted_movie_idx)

    # Unrated titles were filled with 0 in user_to_movie_df, so any positive
    # entry marks a movie this user has already rated
    user_ratings = user_to_movie_df.loc[user_id].to_numpy()
    already_seen = set(movies_list[user_ratings > 0])

    finallist = []
    for movie_id in top_movie_ids:
        if len(finallist) >= num_recommendations:
            break
        title = movie_id_dict[movie_id]
        if title not in already_seen:
            finallist.append(title)

    return finallist

In [119]:
recommend_movies(user_id=203, num_recommendations=10)

['Fight Club (1999)',
 'City of God (Cidade de Deus) (2002)',
 'Departed, The (2006)',
 'Brazil (1985)',
 'American History X (1998)',
 'Eternal Sunshine of the Spotless Mind (2004)',
 'Snatch (2000)',
 'To Kill a Mockingbird (1962)',
 'Goodfellas (1990)',
 "Schindler's List (1993)"]