In [6]:

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from surprise import SVDpp
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from collections import defaultdict
from surprise.model_selection import train_test_split 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import roc_auc_score


In [7]:
#reading files
df_ratings = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1', usecols=['user_id', 'movie_id', 'rating'])
df_movies = pd.read_csv('movies.csv', sep='\t', encoding='latin-1', usecols=['movie_id', 'title', 'genres'])

print(df_movies.head(5))
print(df_ratings.head(5))


   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy
   user_id  movie_id  rating
0        1      1193       5
1        1       661       3
2        1       914       3
3        1      3408       4
4        1      2355       5


In [8]:
#splitting data into train and test sets
train_split, test_split = train_test_split(df_ratings, test_size = 0.25, random_state = 20)
print("Training data size:", train_split.shape)
print("Test data size:", test_split.shape)


Training data size: (750156, 3)
Test data size: (250053, 3)


In [9]:
#reader for parsing the ratings file
reader = Reader(rating_scale=(1, 5))
#building the train and test set, loading the data from dataframe 
train_build = Dataset.load_from_df(train_split, reader)
test_build = Dataset.load_from_df(test_split, reader)
trainset = train_build.build_full_trainset()
testset = test_build.build_full_trainset().build_testset() 
print("Test set size:", len(testset))


Test set size: 250053


In [10]:
#model building
#takes in factors, epochs, learning rate and regularization parameter 
model = SVDpp(n_factors=30,n_epochs=20,lr_all=0.01,reg_all=0.05) 
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7fa76875b340>

In [11]:
#making predictions
predictions = model.test(testset)
#calculating rmse
accuracy.rmse(predictions, verbose = True)
#Save all the predicted ratings and convert it to a dataframe 



RMSE: 0.8564


0.8564282060778694

In [13]:
all_recommendations_list = defaultdict(list)
all_recommendations_df = pd.DataFrame([])
# Create empty lists to collect data for each column
user_list = []
movie_id_list = []
predicted_rating_list = []

for uid, iid, true_r, est, _ in predictions:
    all_recommendations_list[uid].append((iid, est))
    user_list.append(uid)
    movie_id_list.append(iid)
    predicted_rating_list.append(est)


#for uid, iid, true_r, est, _ in predictions:
    #all_recommendations_list[uid].append((iid, est))
    
# Create a DataFrame using the collected data
all_recommendations_df = pd.DataFrame({'user': user_list, 'movie_id': movie_id_list, 'predicted_rating': predicted_rating_list})
    #all_recommendations_df = all_recommendations_df.append(pd.DataFrame({'user': uid, 'movieId': iid, 'predicted_rating' : est}, index=[0]), ignore_index=True); 
print(all_recommendations_df.head(5)) 
print(all_recommendations_df.shape)



   user  movie_id  predicted_rating
0  1490       648          4.052456
1  1490      1527          4.122910
2  1490      2278          3.985968
3  1490      3404          4.168386
4  1490      1752          3.406558
(250053, 3)


In [14]:
print(df_movies.head(5)) 

   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy


In [15]:
#Merging with movies file to get genre, title information for predictions
all_recommendations_df_details = pd.merge(all_recommendations_df,df_movies, on='movie_id', how='inner') 
print(all_recommendations_df_details.head(5))


   user  movie_id  predicted_rating                       title  \
0  1490       648          4.052456  Mission: Impossible (1996)   
1  4478       648          3.149444  Mission: Impossible (1996)   
2  5643       648          3.027120  Mission: Impossible (1996)   
3  4054       648          3.603353  Mission: Impossible (1996)   
4   134       648          3.044482  Mission: Impossible (1996)   

                     genres  
0  Action|Adventure|Mystery  
1  Action|Adventure|Mystery  
2  Action|Adventure|Mystery  
3  Action|Adventure|Mystery  
4  Action|Adventure|Mystery  


In [16]:
#List of top n recommendations list as per SVD++
def get_top_n_recommendation_list_df(all_recommendations_df_details, n=10):
    top_n_recommendations_df = all_recommendations_df_details.sort_values(['user','predicted_rating'] ,ascending=[True, False])
    return top_n_recommendations_df


In [17]:
top_n_recommendations_df = get_top_n_recommendation_list_df(all_recommendations_df_details, 10) 
print(top_n_recommendations_df.head())


        user  movie_id  predicted_rating                              title  \
186764     1       914          4.436481                My Fair Lady (1964)   
87763      1      1097          4.392588  E.T. the Extra-Terrestrial (1982)   
128703     1       150          4.363103                   Apollo 13 (1995)   
81325      1       588          4.303915                     Aladdin (1992)   
169456     1      2398          4.268643      Miracle on 34th Street (1947)   

                                     genres  
186764                      Musical|Romance  
87763       Children's|Drama|Fantasy|Sci-Fi  
128703                                Drama  
81325   Animation|Children's|Comedy|Musical  
169456                                Drama  


In [18]:
metrics=[]
true_positives_array = []
est_array = []
for rating_threshold in np.arange(0,5.5,0.5):
    truePositives = 0
    trueNegatives = 0
    falseNegatives = 0
    falsePositives = 0
    accuracy =0
    precision =0
    recall =0
    f1_score = 0
    for uid,_, true_r, est, _ in predictions:
        if(true_r >= rating_threshold and est >= rating_threshold):
            truePositives = truePositives + 1 
            true_positives_array.append(true_r) 
            est_array.append(est)
#here
        elif(true_r >= rating_threshold and est<= rating_threshold):
            falseNegatives = falseNegatives + 1 
        elif(true_r <= rating_threshold and est >=rating_threshold):
             falsePositives = falsePositives + 1 
        elif(true_r <= rating_threshold and est<=rating_threshold):
            trueNegatives = trueNegatives + 1 
        if(truePositives > 0):
            accuracy = (truePositives + trueNegatives ) / (truePositives + trueNegatives + falsePositives + falseNegatives)
            precision = truePositives / (truePositives + falsePositives)
            recall = truePositives / (truePositives + falseNegatives) 
            f1_score = 2 * (precision * recall) / (precision + recall)



In [19]:
metrics.append([rating_threshold,truePositives,trueNegatives,falsePositives,falseNegatives,accuracy,precision,recall,f1_score])
metrics_df = pd.DataFrame(metrics)
metrics_df.rename(columns={0:'rating_threshold', 1:'truePositives', 2: 'trueNegatives', 3: 'falsePositives', 4:'falseNegatives', 5: 'Accuracy', 6: 'Precision', 7:'Recall', 8:'F1 Score'},inplace=True)
true_bin_array =[]
for x in true_positives_array:
    if x >= rating_threshold: 
        x=1
    else: 
        x=0
    true_bin_array.append(x)
auc_score =roc_auc_score(true_bin_array,est_array,multi_class='raise',average='macro')
print('AUC Score: ',auc_score)


AUC Score:  0.8024315736376826


In [20]:
#Calculate precision and recall at n
def get_precision_recall_at_n(predictions,topn,rating_threshold): 
    all_actual_predicted_list = defaultdict(list)
    precision = dict()
    recall= dict()
    no_of_relevant_items = 0 
    no_of_recommended_items_at_top_n = 0 
    no_of_relevant_recommended_items_at_top_n = 0  
    for uid, iid, true_r, est, _ in predictions:
        all_actual_predicted_list[uid].append((est, true_r)) 
    for uid, user_ratings in all_actual_predicted_list.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        no_of_relevant_items = sum((true_r >= rating_threshold) for (_, true_r) in user_ratings)
        no_of_recommended_items_at_top_n = sum((est >= rating_threshold) for (est, _) in user_ratings[:topn])
        no_of_relevant_recommended_items_at_top_n = sum(((true_r >= rating_threshold) and (est >= rating_threshold)) for (est, true_r) in user_ratings[:topn])
        precision[uid] = no_of_relevant_recommended_items_at_top_n / no_of_recommended_items_at_top_n if no_of_recommended_items_at_top_n != 0 else 1
        recall[uid] = no_of_relevant_recommended_items_at_top_n / no_of_relevant_items if no_of_relevant_items != 0 else 1
    return precision, recall 

rating_threshold=3 
precision_recall_at_n = [] 
for topn in range(2,20):
    precision, recall = get_precision_recall_at_n(predictions,topn,rating_threshold)
    precision_at_n = sum(prec for prec in precision.values()) / len(precision)

    recall_at_n = sum(rec for rec in recall.values()) / len(recall)
    precision_recall_at_n.append({'topN' : topn, 'Precision' : precision_at_n, 'Recall': recall_at_n})
for n in range(3,9):
    print(precision_recall_at_n[n])
#get user high rated and liked movies
all_movie_df_details = pd.merge(df_ratings,df_movies, on='movie_id', how='inner')
all_movie_df_details = all_movie_df_details.sort_values(['user_id','rating'],ascending=[True, False])
print(all_movie_df_details.loc[all_movie_df_details['user_id'] == 5256].head(10)) #user 10 top 10 rated movies
#user 10 top 10 movie recommendations list 
print(top_n_recommendations_df.loc[top_n_recommendations_df['user'] == 5256].head(10))

{'topN': 5, 'Precision': 0.9623096026490223, 'Recall': 0.3287980458145225}
{'topN': 6, 'Precision': 0.9588493377483411, 'Recall': 0.3760652138599872}
{'topN': 7, 'Precision': 0.9557391201513703, 'Recall': 0.4173926925989891}
{'topN': 8, 'Precision': 0.953149243140966, 'Recall': 0.45350193126483435}
{'topN': 9, 'Precision': 0.9508152659518367, 'Recall': 0.48525873870281866}
{'topN': 10, 'Precision': 0.9483391805949732, 'Recall': 0.5132430667011895}
       user_id  movie_id  rating                                      title  \
8826      5256      1287       5                             Ben-Hur (1959)   
16646     5256      1035       5                 Sound of Music, The (1965)   
25502     5256      1270       5                  Back to the Future (1985)   
27849     5256       527       5                    Schindler's List (1993)   
32487     5256      1721       5                             Titanic (1997)   
41463     5256       150       5                           Apollo 13 (1995

In [21]:
top_movies=(all_movie_df_details.loc[all_movie_df_details['user_id'] == 5256].head(10))

In [22]:
top_movies.head(10)

Unnamed: 0,user_id,movie_id,rating,title,genres
8826,5256,1287,5,Ben-Hur (1959),Action|Adventure|Drama
16646,5256,1035,5,"Sound of Music, The (1965)",Musical
25502,5256,1270,5,Back to the Future (1985),Comedy|Sci-Fi
27849,5256,527,5,Schindler's List (1993),Drama|War
32487,5256,1721,5,Titanic (1997),Drama|Romance
41463,5256,150,5,Apollo 13 (1995),Drama
43416,5256,1,5,Toy Story (1995),Animation|Children's|Comedy
49326,5256,260,5,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
52116,5256,1207,5,To Kill a Mockingbird (1962),Drama
54569,5256,2028,5,Saving Private Ryan (1998),Action|Drama|War


In [23]:
filtered_df=(top_n_recommendations_df.loc[top_n_recommendations_df['user'] == 5256].head(10))
filtered_df.head(10)

Unnamed: 0,user,movie_id,predicted_rating,title,genres
158799,5256,953,4.642577,It's a Wonderful Life (1946),Drama
64229,5256,1234,4.622672,"Sting, The (1973)",Comedy|Crime
15679,5256,1704,4.610209,Good Will Hunting (1997),Drama
162769,5256,590,4.548422,Dances with Wolves (1990),Adventure|Drama|Western
60623,5256,1307,4.507815,When Harry Met Sally... (1989),Comedy|Romance
139860,5256,954,4.44533,Mr. Smith Goes to Washington (1939),Drama
157433,5256,858,4.420507,"Godfather, The (1972)",Action|Crime|Drama
2565,5256,1101,4.416268,Top Gun (1986),Action|Romance
127111,5256,2067,4.400063,Doctor Zhivago (1965),Drama|Romance|War
44158,5256,1204,4.393091,Lawrence of Arabia (1962),Adventure|War


In [24]:
filtered_df['title'].values

array(["It's a Wonderful Life (1946)", 'Sting, The (1973)',
       'Good Will Hunting (1997)', 'Dances with Wolves (1990)',
       'When Harry Met Sally... (1989)',
       'Mr. Smith Goes to Washington (1939)', 'Godfather, The (1972)',
       'Top Gun (1986)', 'Doctor Zhivago (1965)',
       'Lawrence of Arabia (1962)'], dtype=object)

In [25]:
top_movies['title'].values

array(['Ben-Hur (1959)', 'Sound of Music, The (1965)',
       'Back to the Future (1985)', "Schindler's List (1993)",
       'Titanic (1997)', 'Apollo 13 (1995)', 'Toy Story (1995)',
       'Star Wars: Episode IV - A New Hope (1977)',
       'To Kill a Mockingbird (1962)', 'Saving Private Ryan (1998)'],
      dtype=object)