In [None]:

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from surprise import SVDpp
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from collections import defaultdict
from surprise.model_selection import train_test_split 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import roc_auc_score


In [None]:
#reading files
df_ratings = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1', usecols=['user_id', 'movie_id', 'rating'])
df_movies = pd.read_csv('movies.csv', sep='\t', encoding='latin-1', usecols=['movie_id', 'title', 'genres'])

print(df_movies.head(5))
print(df_ratings.head(5))


In [3]:
#splitting data into train and test sets
train_split, test_split = train_test_split(df_ratings, test_size = 0.25, random_state = 20)
print("Training data size:", train_split.shape)
print("Test data size:", test_split.shape)


Training data size: (750156, 3)
Test data size: (250053, 3)


In [4]:
#reader for parsing the ratings file
reader = Reader(rating_scale=(1, 5))
#building the train and test set, loading the data from dataframe 
train_build = Dataset.load_from_df(train_split, reader)
test_build = Dataset.load_from_df(test_split, reader)
trainset = train_build.build_full_trainset()
testset = test_build.build_full_trainset().build_testset() 
print("Test set size:", len(testset))


Test set size: 250053


In [5]:
#model building
#takes in factors, epochs, learning rate and regularization parameter 
model = SVDpp(n_factors=20,n_epochs=5,lr_all=0.09,reg_all=0.5) 
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x2aaada1f9810>

In [6]:
#making predictions
predictions = model.test(testset)
#calculating rmse
accuracy.rmse(predictions, verbose = True)
#Save all the predicted ratings and convert it to a dataframe 



RMSE: 0.9641


0.9640903306319306

In [7]:
all_recommendations_list = defaultdict(list)
all_recommendations_df = pd.DataFrame([])
# Create empty lists to collect data for each column
user_list = []
movie_id_list = []
predicted_rating_list = []

for uid, iid, true_r, est, _ in predictions:
    all_recommendations_list[uid].append((iid, est))
    user_list.append(uid)
    movie_id_list.append(iid)
    predicted_rating_list.append(est)


#for uid, iid, true_r, est, _ in predictions:
    #all_recommendations_list[uid].append((iid, est))
    
# Create a DataFrame using the collected data
all_recommendations_df = pd.DataFrame({'user': user_list, 'movie_id': movie_id_list, 'predicted_rating': predicted_rating_list})
    #all_recommendations_df = all_recommendations_df.append(pd.DataFrame({'user': uid, 'movieId': iid, 'predicted_rating' : est}, index=[0]), ignore_index=True); 
print(all_recommendations_df.head(5)) 
print(all_recommendations_df.shape)



   user  movie_id  predicted_rating
0  1490       648          4.001559
1  1490      1527          4.451440
2  1490      2278          3.934697
3  1490      3404          3.882309
4  1490      1752          3.695100
(250053, 3)


In [8]:
print(df_movies.head(5)) 

   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy


In [9]:
#Merging with movies file to get genre, title information for predictions
all_recommendations_df_details = pd.merge(all_recommendations_df,df_movies, on='movie_id', how='inner') 
print(all_recommendations_df_details.head(5))


   user  movie_id  predicted_rating                       title  \
0  1490       648          4.001559  Mission: Impossible (1996)   
1  4478       648          3.390776  Mission: Impossible (1996)   
2  5643       648          3.108085  Mission: Impossible (1996)   
3  4054       648          3.457194  Mission: Impossible (1996)   
4   134       648          2.998958  Mission: Impossible (1996)   

                     genres  
0  Action|Adventure|Mystery  
1  Action|Adventure|Mystery  
2  Action|Adventure|Mystery  
3  Action|Adventure|Mystery  
4  Action|Adventure|Mystery  


In [10]:
#List of top n recommendations list as per SVD++
def get_top_n_recommendation_list_df(all_recommendations_df_details, n=10):
    top_n_recommendations_df = all_recommendations_df_details.sort_values(['user','predicted_rating'] ,ascending=[True, False])
    return top_n_recommendations_df


In [11]:
top_n_recommendations_df = get_top_n_recommendation_list_df(all_recommendations_df_details, 10) 
print(top_n_recommendations_df.head())


        user  movie_id  predicted_rating  \
55722      1       720          4.219996   
238962     1      1545          4.202383   
186764     1       914          4.047105   
15601      1      2692          4.015032   
128703     1       150          3.991828   

                                                    title  \
55722   Wallace & Gromit: The Best of Aardman Animatio...   
238962                                     Ponette (1996)   
186764                                My Fair Lady (1964)   
15601                    Run Lola Run (Lola rennt) (1998)   
128703                                   Apollo 13 (1995)   

                      genres  
55722              Animation  
238962                 Drama  
186764       Musical|Romance  
15601   Action|Crime|Romance  
128703                 Drama  


In [13]:
metrics=[]
true_positives_array = []
est_array = []
for rating_threshold in np.arange(0,5.5,0.5):
    truePositives = 0
    trueNegatives = 0
    falseNegatives = 0
    falsePositives = 0
    accuracy =0
    precision =0
    recall =0
    f1_score = 0
    for uid,_, true_r, est, _ in predictions:
        if(true_r >= rating_threshold and est >= rating_threshold):
            truePositives = truePositives + 1 
            true_positives_array.append(true_r) 
            est_array.append(est)
#here
        elif(true_r >= rating_threshold and est<= rating_threshold):
            falseNegatives = falseNegatives + 1 
        elif(true_r <= rating_threshold and est >=rating_threshold):
             falsePositives = falsePositives + 1 
        elif(true_r <= rating_threshold and est<=rating_threshold):
            trueNegatives = trueNegatives + 1 
        if(truePositives > 0):
            accuracy = (truePositives + trueNegatives ) / (truePositives + trueNegatives + falsePositives + falseNegatives)
            precision = truePositives / (truePositives + falsePositives)
            recall = truePositives / (truePositives + falseNegatives) 
            f1_score = 2 * (precision * recall) / (precision + recall)



In [16]:
metrics.append([rating_threshold,truePositives,trueNegatives,falsePositives,falseNegatives,accuracy,precision,recall,f1_score])
metrics_df = pd.DataFrame(metrics)
metrics_df.rename(columns={0:'rating_threshold', 1:'truePositives', 2: 'trueNegatives', 3: 'falsePositives', 4:'falseNegatives', 5: 'Accuracy', 6: 'Precision', 7:'Recall', 8:'F1 Score'},inplace=True)
true_bin_array =[]
for x in true_positives_array:
    if x >= rating_threshold: 
        x=1
    else: 
        x=0
    true_bin_array.append(x)
auc_score =roc_auc_score(true_bin_array,est_array,multi_class='raise',average='macro')
print('AUC Score: ',auc_score)


AUC Score:  0.73209397893631


In [18]:
#Calculate precision and recall at n
def get_precision_recall_at_n(predictions,topn,rating_threshold): 
    all_actual_predicted_list = defaultdict(list)
    precision = dict()
    recall= dict()
    no_of_relevant_items = 0 
    no_of_recommended_items_at_top_n = 0 
    no_of_relevant_recommended_items_at_top_n = 0  
    for uid, iid, true_r, est, _ in predictions:
        all_actual_predicted_list[uid].append((est, true_r)) 
    for uid, user_ratings in all_actual_predicted_list.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        no_of_relevant_items = sum((true_r >= rating_threshold) for (_, true_r) in user_ratings)
        no_of_recommended_items_at_top_n = sum((est >= rating_threshold) for (est, _) in user_ratings[:topn])
        no_of_relevant_recommended_items_at_top_n = sum(((true_r >= rating_threshold) and (est >= rating_threshold)) for (est, true_r) in user_ratings[:topn])
        precision[uid] = no_of_relevant_recommended_items_at_top_n / no_of_recommended_items_at_top_n if no_of_recommended_items_at_top_n != 0 else 1
        recall[uid] = no_of_relevant_recommended_items_at_top_n / no_of_relevant_items if no_of_relevant_items != 0 else 1
    return precision, recall 

rating_threshold=3 
precision_recall_at_n = [] 
for topn in range(2,20):
    precision, recall = get_precision_recall_at_n(predictions,topn,rating_threshold)
    precision_at_n = sum(prec for prec in precision.values()) / len(precision)

    recall_at_n = sum(rec for rec in recall.values()) / len(recall)
    precision_recall_at_n.append({'topN' : topn, 'Precision' : precision_at_n, 'Recall': recall_at_n})
for n in range(3,9):
    print(precision_recall_at_n[n])
#get user high rated and liked movies
all_movie_df_details = pd.merge(df_ratings,df_movies, on='movie_id', how='inner')
all_movie_df_details = all_movie_df_details.sort_values(['user_id','rating'],ascending=[True, False])
print(all_movie_df_details.loc[all_movie_df_details['user_id'] == 10].head(10)) #user 10 top 10 rated movies
#user 10 top 10 movie recommendations list 
print(top_n_recommendations_df.loc[top_n_recommendations_df['user'] == 10].head(10))

{'topN': 5, 'Precision': 0.9358360927152541, 'Recall': 0.325613324942311}
{'topN': 6, 'Precision': 0.932784216335538, 'Recall': 0.37363168470378766}
{'topN': 7, 'Precision': 0.9304347997477102, 'Recall': 0.4155271779874058}
{'topN': 8, 'Precision': 0.9274310154525397, 'Recall': 0.4520480343468461}
{'topN': 9, 'Precision': 0.9251108351728972, 'Recall': 0.48474759417241287}
{'topN': 10, 'Precision': 0.9226531456953639, 'Recall': 0.5134083550618816}
       user_id  movie_id  rating                                   title  \
2252        10       914       5                     My Fair Lady (1964)   
5906        10      1197       5              Princess Bride, The (1987)   
8927        10      2804       5               Christmas Story, A (1983)   
10279       10       594       5  Snow White and the Seven Dwarfs (1937)   
11043       10       919       5                Wizard of Oz, The (1939)   
12761       10       595       5             Beauty and the Beast (1991)   
14007       10   