# Loading dataset



In [None]:
import pandas as pd

In [None]:
# Read the two CSV files from the working directory:
#   df1 <- movies.csv  (movieId, title, genres)
#   df2 <- ratings.csv (userId, movieId, rating, timestamp)
df1, df2 = (pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv'))

# Preview the movie table.
df1

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [None]:
# Preview the ratings table read from ratings.csv (one row per user/movie rating).
df2

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [None]:
# Attach title/genres to every rating. An inner join on movieId keeps only
# movies that have at least one rating (inner is pd.merge's default).
df = df1.merge(df2, how='inner', on='movieId')
df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545
100833,193585,Flint (2017),Drama,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021


In [None]:
# Keep only the three columns the rating models need, in this order.
columns = ['userId', 'movieId', 'rating']
df_model = df.reindex(columns=columns)
df_model

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,5,1,4.0
2,7,1,4.5
3,15,1,2.5
4,17,1,4.5
...,...,...,...
100831,184,193581,4.0
100832,184,193583,3.5
100833,184,193585,3.5
100834,184,193587,3.5


# Data slicing/cleaning
* Remove movies with too few reviews


* Remove users who have given too few reviews

In [None]:
# Aggregations computed per movie and per user: number of ratings and mean rating.
f = ['count', 'mean']

# Movies/users whose rating count falls below this quantile of all rating
# counts are marked for removal.
REVIEW_COUNT_QUANTILE = 0.7

# Both summaries are computed from the untrimmed merged frame `df`.
# `df_model` holds the same rows at this point, but it is overwritten by the
# trimming cell further down, so reading it here would make the thresholds
# change whenever this cell is re-run after trimming.
df_movie_summary = df.groupby('movieId')['rating'].agg(f)
df_movie_summary.index = df_movie_summary.index.map(int)
movie_benchmark = round(df_movie_summary['count'].quantile(REVIEW_COUNT_QUANTILE), 0)
# movieIds with fewer ratings than the benchmark.
drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index

print('Movie minimum times of review: {}'.format(movie_benchmark))

df_user_summary = df.groupby('userId')['rating'].agg(f)
df_user_summary.index = df_user_summary.index.map(int)
user_benchmark = round(df_user_summary['count'].quantile(REVIEW_COUNT_QUANTILE), 0)
# userIds who gave fewer ratings than the benchmark.
drop_user_list = df_user_summary[df_user_summary['count'] < user_benchmark].index

print('User minimum times of review: {}'.format(user_benchmark))

Movie minimum times of review: 7.0
User minimum times of review: 140.0


In [None]:
# Start from the untrimmed ratings in `df` rather than from `df_model` itself:
# this cell assigns `df_model`, so reading it as input would make a re-run
# report the already-trimmed shape as the "Original Shape".
ratings_all = df[['userId', 'movieId', 'rating']]
print('Original Shape: {}'.format(ratings_all.shape))

# Keep a rating only if neither its movie nor its user is on a drop list.
keep_movie = ~ratings_all['movieId'].isin(drop_movie_list)
keep_user = ~ratings_all['userId'].isin(drop_user_list)
df_model = ratings_all[keep_movie & keep_user]

print('After Trim Shape: {}'.format(df_model.shape))
print('-Data Examples-')
df_model.head(5)

Original Shape: (100836, 3)
After Trim Shape: (63950, 3)
-Data Examples-


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
2,7,1,4.5
5,18,1,3.5
6,19,1,4.0
7,21,1,3.5


# Collaborative filtering

In [None]:
!pip install surprise



In [None]:
from surprise import Reader
from surprise import Dataset
from surprise import SVD
from surprise.model_selection import cross_validate

# The rating data uses half-star steps and goes down to 0.5 (a 0.5 rating is
# visible in the user x movie pivot table), so the scale must start at 0.5.
# Surprise clips predicted ratings to this range; a lower bound of 1 would make
# predictions below 1 impossible.
reader = Reader(rating_scale=(0.5, 5))
# Surprise expects exactly these three columns, in this order: user, item, rating.
data = Dataset.load_from_df(df_model[['userId', 'movieId', 'rating']], reader)

# Matrix factorization CF using scikit-surprise SVD

In [None]:
# Seed both sources of randomness so the reported RMSE/MAE are reproducible
# after a kernel restart: the SVD factor initialisation and the fold shuffle.
from surprise.model_selection import KFold

SEED = 42

# random_state is stored on the estimator, so later svd.fit(...) calls on this
# same object are deterministic as well.
svd = SVD(random_state=SEED)

# Five shuffled folds: the same split count cross_validate uses by default.
cross_validate(
    svd,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=SEED, shuffle=True),
)

{'fit_time': (3.25606369972229,
  3.254474639892578,
  3.2918949127197266,
  3.264202356338501,
  3.2724130153656006),
 'test_mae': array([0.6389986 , 0.63943468, 0.63987654, 0.63991795, 0.63999041]),
 'test_rmse': array([0.83462749, 0.83048015, 0.83289854, 0.83458534, 0.83292532]),
 'test_time': (0.10285472869873047,
  0.15715289115905762,
  0.10260510444641113,
  0.10139822959899902,
  0.15891695022583008)}

In [None]:
# Index the movie table by movieId so titles can be joined onto ratings by id.
# The guard makes the cell safe to re-run: once movieId is the index it is no
# longer a column, and calling set_index('movieId') again would raise KeyError.
if df1.index.name != 'movieId':
    df1 = df1.set_index('movieId')
df1

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [None]:
# Titles of the movies that user 596 rated 5 in the trimmed rating frame.
# df1 is indexed by movieId, so the join looks each title up by id.
is_user_596 = df_model['userId'] == 596
is_five_star = df_model['rating'] == 5
data_596 = (
    df_model.loc[is_user_596 & is_five_star]
    .set_index('movieId')
    .join(df1)['title']
)
print(data_596)

movieId
2288                                      Thing, The (1982)
3000               Princess Mononoke (Mononoke-hime) (1997)
4878                                    Donnie Darko (2001)
5971           My Neighbor Totoro (Tonari no Totoro) (1988)
31658     Howl's Moving Castle (Hauru no ugoku shiro) (2...
57669                                      In Bruges (2008)
110102           Captain America: The Winter Soldier (2014)
122882                            Mad Max: Fury Road (2015)
122906                                 Black Panther (2017)
122916                                Thor: Ragnarok (2017)
166528                  Rogue One: A Star Wars Story (2016)
167746                         The Lego Batman Movie (2017)
168252                                         Logan (2017)
Name: title, dtype: object


In [None]:
# Refit the SVD model on every rating in the trimmed frame (no hold-out split).
data = Dataset.load_from_df(df_model[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()
svd.fit(trainset)

# Candidate movies: the whole movie table minus the movies on the drop list.
# NOTE(review): movies user 596 has already rated are not excluded here.
data_596 = df1.copy().reset_index()
data_596 = data_596.loc[~data_596['movieId'].isin(drop_movie_list)]

# Predicted rating of user 596 for each candidate, then rank best-first.
data_596['score prediction'] = data_596['movieId'].apply(
    lambda movie_id: svd.predict(596, movie_id).est
)
data_596 = (
    data_596
    .drop(columns='movieId')
    .sort_values('score prediction', ascending=False)
)
print(data_596.head(10))

                                                  title  ... score prediction
704       Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)  ...         4.264245
971                                       Patton (1970)  ...         4.259882
224           Star Wars: Episode IV - A New Hope (1977)  ...         4.249884
277                    Shawshank Redemption, The (1994)  ...         4.241721
6710                            Dark Knight, The (2008)  ...         4.236139
898   Star Wars: Episode V - The Empire Strikes Back...  ...         4.225674
4176                City of God (Cidade de Deus) (2002)  ...         4.209287
733                        It's a Wonderful Life (1946)  ...         4.190095
863              Monty Python and the Holy Grail (1975)  ...         4.168855
1494        Seven Samurai (Shichinin no samurai) (1954)  ...         4.160940

[10 rows x 3 columns]


# Recommendation using Pearson correlation

In [None]:
# User x movie rating matrix: one row per userId, one column per movieId,
# NaN where the user did not rate the movie. 'mean' is pivot_table's default
# aggregation and only matters if a (user, movie) pair occurs more than once.
df_p = df.pivot_table(index='userId', columns='movieId', values='rating', aggfunc='mean')
print(df_p.shape)
df_p

(610, 9724)


movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,36,38,39,40,41,42,43,...,185135,185435,185473,185585,186587,187031,187541,187593,187595,187717,188189,188301,188675,188751,188797,188833,189043,189111,189333,189381,189547,189713,190183,190207,190209,190213,190215,190219,190221,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,4.0,,4.0,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.5,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,,,,,,2.0,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,4.0,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,4.0,4.0,,3.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,2.5,,,,3.5,,4.0,4.0,2.0,,,,,,,,,3.5,4.5,,,4.0,,3.5,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
607,4.0,,,,,,,,,,3.0,,,,,,,,,,,,,,3.0,,,,,,,,3.0,4.0,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,,,,,,4.5,,,2.0,,3.5,,,2.0,,,,,,,3.0,3.5,3.5,,,3.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
609,3.0,,,,,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
def recommend(movie_title, min_count, min_overlap=0):
    """Print up to ten movies whose ratings correlate best with a given movie.

    Reads three notebook-level frames: ``df1`` (movie table indexed by movieId,
    with a ``title`` column), ``df_p`` (user x movie rating pivot) and
    ``df_movie_summary`` (per-movie ``count`` and ``mean`` of ratings).

    Parameters
    ----------
    movie_title : str
        Exact title as stored in ``df1['title']``.
    min_count : int or float
        A movie is listed only if its total rating count is strictly greater
        than this value.
    min_overlap : int, optional
        Minimum number of users who rated both the target movie and the
        candidate. The default of 0 applies no overlap filter. Correlations
        computed from only two or three shared raters are often exactly
        +/-1.0, so a larger value gives more meaningful rankings.

    Returns
    -------
    None
        The ranking (PearsonR, title, count, mean) is printed.

    Raises
    ------
    IndexError
        If no movie in ``df1`` has this exact title.
    KeyError
        If the movie has no column in ``df_p``.
    """
    matches = df1.index[df1['title'] == movie_title]
    if len(matches) == 0:
        raise IndexError("No movie titled {!r} found in df1".format(movie_title))
    i = int(matches[0])
    if i not in df_p.columns:
        raise KeyError("Movie {!r} (movieId {}) has no ratings in df_p".format(movie_title, i))

    print("For movie ({})".format(movie_title))
    print("- Top 10 movies recommended based on Pearsons'R correlation - ")

    target = df_p[i]
    # Pairwise Pearson correlation of every movie column with the target column;
    # movies with no defined correlation come back as NaN and are dropped.
    corr_target = df_p.corrwith(target).to_frame('PearsonR').dropna()
    # A movie correlates perfectly with itself; never recommend the target.
    corr_target = corr_target.drop(index=i, errors='ignore')

    if min_overlap > 0:
        # Number of users who rated both the target and each other movie.
        overlap = df_p.loc[target.notna()].notna().sum()
        corr_target = corr_target[overlap.reindex(corr_target.index) >= min_overlap]

    corr_target = corr_target.sort_values('PearsonR', ascending=False)
    corr_target.index = corr_target.index.map(int)
    corr_target = corr_target.join(df1).join(df_movie_summary)[['PearsonR', 'title', 'count', 'mean']]
    print(corr_target[corr_target['count'] > min_count][:10].to_string(index=False))

In [None]:
# min_count=0: list every correlated movie that has at least one rating.
recommend("Black Panther (2017)", 0)

For movie (Black Panther (2017))
- Top 10 movies recommended based on Pearsons'R correlation - 


  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


 PearsonR                                                      title  count      mean
      1.0                                            Phantasm (1979)     14  2.892857
      1.0                         Doctor Who: The Next Doctor (2008)      4  2.875000
      1.0                                            Oblivion (2013)     20  3.300000
      1.0                         Ponyo (Gake no ue no Ponyo) (2008)     11  4.000000
      1.0                                Paul Blart: Mall Cop (2009)      7  2.714286
      1.0                               Jack the Giant Slayer (2013)      5  2.200000
      1.0                                     Misérables, Les (2012)      7  3.500000
      1.0                 I Now Pronounce You Chuck and Larry (2007)     11  3.454545
      1.0                          Doctor Who: Last Christmas (2014)      2  3.750000
      1.0  Doctor Who: The Doctor, the Widow and the Wardrobe (2011)      3  4.166667


In [None]:
# min_count=0: list every correlated movie that has at least one rating.
recommend("Thor: Ragnarok (2017)", 0)

For movie (Thor: Ragnarok (2017))
- Top 10 movies recommended based on Pearsons'R correlation - 


  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


 PearsonR                      title  count      mean
      1.0      In the Bedroom (2001)     10  3.000000
      1.0             Solaris (2002)     20  3.075000
      1.0      Good Girl, The (2002)     12  3.208333
      1.0      Principal, The (1987)      2  3.250000
      1.0      One Hour Photo (2002)     19  3.473684
      1.0  Johnny Dangerously (1984)      7  2.857143
      1.0    Punch-Drunk Love (2002)     33  3.621212
      1.0               Tully (2000)      2  2.750000
      1.0    Two Weeks Notice (2002)     19  3.263158
      1.0    Right Stuff, The (1983)     22  3.863636


In [None]:
# min_count=0: list every correlated movie that has at least one rating.
recommend("WALL·E (2008)", 0)

For movie (WALL·E (2008))
- Top 10 movies recommended based on Pearsons'R correlation - 


  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


 PearsonR                                                                                   title  count      mean
      1.0                                                                         Waitress (2007)      6  3.833333
      1.0                                                         Marvel One-Shot: Item 47 (2012)      2  2.750000
      1.0                                 A Pigeon Sat on a Branch Reflecting on Existence (2014)      2  4.000000
      1.0  Wes Craven's New Nightmare (Nightmare on Elm Street Part 7: Freddy's Finale, A) (1994)     12  2.916667
      1.0                                                                    The Overnight (2015)      2  3.750000
      1.0                                                                             Dope (2015)      6  3.583333
      1.0                                                                          Lincoln (2012)      4  4.500000
      1.0                                       Twilight Saga: Breaking Dawn - P