## MovieLens Recommender Systems

## Load the dataset

In [58]:
import os
import pandas as pd
from surprise import Dataset, Reader, KNNBasic, evaluate, accuracy

In [150]:
ratings_file = './ml-100k/u.data'

In [151]:
ratings_df = pd.read_csv( ratings_file, delimiter='\t', header = None)
ratings_df.columns = ['userid', 'movieid', 'rating', 'timestamp']

In [152]:
ratings_df.head(10)

Unnamed: 0,userid,movieid,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


### Convert to required dataset

In [28]:
reader = Reader(rating_scale=(1, 5))

In [29]:
data = Dataset.load_from_df(ratings_df[['userid', 
                                        'movieid', 
                                        'rating']], reader=reader)

In [31]:
data.df[0:10]

Unnamed: 0,userid,movieid,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
5,298,474,4
6,115,265,2
7,253,465,5
8,305,451,3
9,6,86,3


## User Based Cosine Similarity Algorithm

In [69]:
data.split(2)

In [70]:
user_based_cosine_sim = {'name': 'cosine', 
                         'user_based': True}

knn = KNNBasic(sim_options=user_based_cosine_sim)

In [71]:
for trainset, testset in data.folds():

    # train and test algorithm.
    knn.train(trainset)
    predictions = knn.test(testset)

    # Compute and print Root Mean Squared Error
    rmse = accuracy.rmse(predictions, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0266
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0299


## Item Based Cosine Similarity Algorithm

In [73]:
item_based_consine_sim = sim_options = {'name': 'cosine', 
                                        'user_based': False}

knn = KNNBasic(sim_options=item_based_consine_sim)

In [74]:
for trainset, testset in data.folds():

    # train and test algorithm.
    knn.train(trainset)
    predictions = knn.test(testset)

    # Compute and print Root Mean Squared Error
    rmse = accuracy.rmse(predictions, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0494
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0506


### Using Evaluate

In [66]:
data.split(5)

In [76]:
knn_basic = KNNBasic(sim_options=item_based_consine_sim)

In [77]:
evaluate(knn_basic, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm KNNBasic.

------------
Fold 1
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0494
MAE:  0.8361
------------
Fold 2
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0506
MAE:  0.8378
------------
------------
Mean RMSE: 1.0500
Mean MAE : 0.8370
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.83614763769849121, 0.83780877256996711],
                            'rmse': [1.0493943771260412, 1.0505824890850979]})

## Using SVD

In [79]:
from surprise import SVD

In [81]:
data.split(5)

In [82]:
svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.9339
MAE:  0.7348
------------
Fold 2
RMSE: 0.9341
MAE:  0.7362
------------
Fold 3
RMSE: 0.9389
MAE:  0.7392
------------
Fold 4
RMSE: 0.9384
MAE:  0.7419
------------
Fold 5
RMSE: 0.9315
MAE:  0.7354
------------
------------
Mean RMSE: 0.9353
Mean MAE : 0.7375
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.73484914539801016,
                             0.73621647782410726,
                             0.73920333350691902,
                             0.74191748525867474,
                             0.73535012901913277],
                            'rmse': [0.93388608642318149,
                             0.934074712912775,
                             0.93893310352521386,
                             0.93835124327064889,
                             0.93146011802383866]})

## Build the final model

In [83]:
# Retrieve the trainset.
trainset = data.build_full_trainset()
svd.train(trainset)

## Make Prediction

In [87]:
userid = str(196)
itemid = str(302)
print( svd.predict(userid, 302) )

user: 196        item: 302        r_ui = None   est = 4.26   {'was_impossible': False}


## Load the movies data

In [153]:
movies_file = './ml-100k/u.item'

In [154]:
movies_df = pd.read_csv( movies_file, delimiter = '\|', header = None )
movies_df = movies_df.iloc[:,:2]
movies_df.columns = ['movieid', 'title']

  if __name__ == '__main__':


In [155]:
movies_df.head(10)

Unnamed: 0,movieid,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...
6,7,Twelve Monkeys (1995)
7,8,Babe (1995)
8,9,Dead Man Walking (1995)
9,10,Richard III (1995)


In [156]:
movies_dict = dict(zip(movies_df.movieid, movies_df.title))

### Finding which user has not watched which movie

In [157]:
testset = trainset.build_anti_testset()

In [158]:
predictions = svd.test(testset)

In [121]:
predictions[0:10]

[Prediction(uid=416, iid=679, r_ui=3.5298600000000002, est=2.9417780922715462, details={'was_impossible': False}),
 Prediction(uid=416, iid=435, r_ui=3.5298600000000002, est=4.4026283585092099, details={'was_impossible': False}),
 Prediction(uid=416, iid=508, r_ui=3.5298600000000002, est=3.4650718988401277, details={'was_impossible': False}),
 Prediction(uid=416, iid=208, r_ui=3.5298600000000002, est=3.9074376341060644, details={'was_impossible': False}),
 Prediction(uid=416, iid=1124, r_ui=3.5298600000000002, est=4.0093064005301793, details={'was_impossible': False}),
 Prediction(uid=416, iid=129, r_ui=3.5298600000000002, est=4.3106905047122908, details={'was_impossible': False}),
 Prediction(uid=416, iid=270, r_ui=3.5298600000000002, est=4.1414382141776036, details={'was_impossible': False}),
 Prediction(uid=416, iid=408, r_ui=3.5298600000000002, est=4.2879553219756241, details={'was_impossible': False}),
 Prediction(uid=416, iid=523, r_ui=3.5298600000000002, est=4.4971254226464792, 

In [140]:
prediction_df = pd.DataFrame(predictions, columns=['userid', 'movieid', 'r_ui', 'p_rating', 'details'])

In [148]:
def get_user_prediction( userid, predictions, n ):
    top_movies_watched = ratings_df[ratings_df.userid == userid].sort_values('rating', ascending = False)[0:10]
    print('TOP MOVIES WATCHED BY USER')
    print('--------------------------')    
    top_movies_watched.apply( lambda rec: print( movies_dict[rec.movieid], " : ",rec.rating), axis = 1)
    top_movies_recommended = predictions[predictions.userid == userid].sort_values('p_rating', ascending = False)[0:n]
    print('TOP MOVIES RECOMMENDED FOR USER')    
    print('-------------------------------')        
    top_movies_recommended.apply( lambda rec: print( movies_dict[rec.movieid], " : ",round(rec.p_rating, 2)), axis = 1)

In [149]:
get_user_prediction( 113, prediction_df, 10)

TOP MOVIES WATCHED BY USER
--------------------------
Fan, The (1996)  :  5
Chasing Amy (1997)  :  5
Trigger Effect, The (1996)  :  5
Hoodlum (1997)  :  5
Career Girls (1997)  :  5
Men in Black (1997)  :  5
Contact (1997)  :  5
A Chef in Love (1996)  :  5
My Best Friend's Wedding (1997)  :  5
Star Wars (1977)  :  5
TOP MOVIES RECOMMENDED FOR USER
-------------------------------
Princess Bride, The (1987)  :  4.98
Raiders of the Lost Ark (1981)  :  4.96
Close Shave, A (1995)  :  4.95
Great Escape, The (1963)  :  4.92
Rear Window (1954)  :  4.89
Empire Strikes Back, The (1980)  :  4.88
Secrets & Lies (1996)  :  4.88
Wrong Trousers, The (1993)  :  4.87
Bridge on the River Kwai, The (1957)  :  4.85
North by Northwest (1959)  :  4.84
