# Collaborative Movie Recommendation

[Code Heroku](https://www.youtube.com/watch?v=3ecNC-So0r4&t=528s)

## Toy Dataset

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import pandas as pd

In [2]:
# Read the toy ratings CSV with default options. No index column is requested,
# so the file's first column (the user labels) is loaded as ordinary data.
ratings = pd.read_csv('data/toy_dataset.csv')
ratings

Unnamed: 0.1,Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
0,user 1,4.0,5.0,3.0,,2.0,1.0
1,user 2,5.0,3.0,3.0,2.0,2.0,
2,user 3,1.0,,,4.0,5.0,4.0
3,user 4,,2.0,1.0,4.0,,3.0
4,user 5,1.0,,2.0,3.0,3.0,4.0


Use the first column as the index (`index_col=0`).

In [3]:
# index_col=0 makes the first CSV column (the user labels) the row index
# instead of a data column.
ratings = pd.read_csv('data/toy_dataset.csv', index_col=0)
ratings

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,4.0,5.0,3.0,,2.0,1.0
user 2,5.0,3.0,3.0,2.0,2.0,
user 3,1.0,,,4.0,5.0,4.0
user 4,,2.0,1.0,4.0,,3.0
user 5,1.0,,2.0,3.0,3.0,4.0


Replace all null values with 0.

In [4]:
# Reload the toy ratings with the user labels as index and substitute 0 for
# every missing rating, in a single chained expression.
ratings = pd.read_csv('data/toy_dataset.csv', index_col=0).fillna(0)
ratings

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,4.0,5.0,3.0,0.0,2.0,1.0
user 2,5.0,3.0,3.0,2.0,2.0,0.0
user 3,1.0,0.0,0.0,4.0,5.0,4.0
user 4,0.0,2.0,1.0,4.0,0.0,3.0
user 5,1.0,0.0,2.0,3.0,3.0,4.0


In [5]:
def standardize(row):
    """Mean-center a vector of ratings and scale it by its value range.

    Computes ``(row - row.mean()) / (row.max() - row.min())``.

    Note: despite the parameter name, ``DataFrame.apply(standardize)`` with the
    default ``axis=0`` passes one *column* at a time.

    Parameters
    ----------
    row : pandas.Series
        Numeric ratings to normalize.

    Returns
    -------
    pandas.Series
        The centered and range-scaled values. If every value in ``row`` is
        identical (range of 0), the centered values (all zeros) are returned
        as-is, so no 0/0 = NaN entries are produced.
    """
    centered = row - row.mean()
    value_range = row.max() - row.min()
    if value_range == 0:
        # Constant vector: dividing would give 0/0 = NaN in every position.
        return centered
    return centered / value_range

# axis=0 is DataFrame.apply's default: standardize receives one movie column
# at a time.
ratings_std = ratings.apply(standardize, axis=0)
ratings_std

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,0.36,0.6,0.4,-0.65,-0.08,-0.35
user 2,0.56,0.2,0.4,-0.15,-0.08,-0.6
user 3,-0.24,-0.4,-0.6,0.35,0.52,0.4
user 4,-0.44,0.0,-0.266667,0.35,-0.48,0.15
user 5,-0.24,-0.4,0.066667,0.1,0.12,0.4


Item-to-item similarity, measured with cosine similarity (angular distance).

In [7]:
# Transpose so that each item (movie) becomes a row: we want item-to-item
# similarity, and the similarity is computed between rows.
item_similarity = cosine_similarity(ratings_std.transpose())
print(item_similarity)

[[ 1.          0.70668875  0.81368151 -0.79941088 -0.02539184 -0.91410609]
 [ 0.70668875  1.          0.72310153 -0.84515425 -0.5189993  -0.84337386]
 [ 0.81368151  0.72310153  1.         -0.84794611 -0.3799803  -0.80218063]
 [-0.79941088 -0.84515425 -0.84794611  1.          0.14803913  0.72374686]
 [-0.02539184 -0.5189993  -0.3799803   0.14803913  1.          0.39393939]
 [-0.91410609 -0.84337386 -0.80218063  0.72374686  0.39393939  1.        ]]


In [8]:
# Label both axes of the raw similarity array with the movie names.
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=ratings.columns,
    columns=ratings.columns,
)
item_similarity_df

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
action1,1.0,0.706689,0.813682,-0.799411,-0.025392,-0.914106
action2,0.706689,1.0,0.723102,-0.845154,-0.518999,-0.843374
action3,0.813682,0.723102,1.0,-0.847946,-0.37998,-0.802181
romantic1,-0.799411,-0.845154,-0.847946,1.0,0.148039,0.723747
romantic2,-0.025392,-0.518999,-0.37998,0.148039,1.0,0.393939
romantic3,-0.914106,-0.843374,-0.802181,0.723747,0.393939,1.0


In [14]:
# Build recommendations

def get_similar_movies(movie_name, user_rating, similarity_df=None, neutral_rating=2.5):
    """Score every movie by its similarity to ``movie_name``, weighted by a rating.

    The similarity column for ``movie_name`` is multiplied by
    ``user_rating - neutral_rating``: a rating above the neutral point keeps the
    sign of each similarity, a rating below it flips the sign.

    Parameters
    ----------
    movie_name : str
        Column label to look up in the similarity matrix.
    user_rating : float
        The rating the user gave to ``movie_name``.
    similarity_df : pandas.DataFrame, optional
        Item-by-item similarity matrix. Defaults to the notebook-level
        ``item_similarity_df`` as it exists at call time.
    neutral_rating : float, optional
        Rating treated as neither like nor dislike. Defaults to 2.5.

    Returns
    -------
    pandas.Series
        Weighted scores indexed by movie, sorted from highest to lowest.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of the similarity matrix.
    """
    if similarity_df is None:
        # Resolved at call time so the function follows whichever matrix the
        # notebook most recently assigned to item_similarity_df.
        similarity_df = item_similarity_df
    weight = user_rating - neutral_rating
    scores = similarity_df[movie_name] * weight
    return scores.sort_values(ascending=False)

# A top rating (5) for action1.
print(get_similar_movies(movie_name='action1', user_rating=5))

action1      2.500000
action3      2.034204
action2      1.766722
romantic2   -0.063480
romantic1   -1.998527
romantic3   -2.285265
Name: action1, dtype: float64


In [15]:
# A low rating (1) for romantic3: the rating is below 2.5, so the weight is negative.
print(get_similar_movies(movie_name='romantic3', user_rating=1))

action1      1.371159
action2      1.265061
action3      1.203271
romantic2   -0.590909
romantic1   -1.085620
romantic3   -1.500000
Name: romantic3, dtype: float64


In [16]:
# A top rating (5) for romantic3.
print(get_similar_movies(movie_name='romantic3', user_rating=5))

romantic3    2.500000
romantic1    1.809367
romantic2    0.984848
action3     -2.005452
action2     -2.108435
action1     -2.285265
Name: romantic3, dtype: float64


In [23]:
# (movie, rating) pairs describing one user's taste.
action_lover = [('action1',5), ('romantic2',5), ('romantic3',1)]

# Build the frame once from a list of score Series (one row per rated movie).
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies it on every iteration. reset_index(drop=True) reproduces the 0..n-1
# row labels that append(..., ignore_index=True) used to give.
similar_movies = pd.DataFrame(
    [get_similar_movies(movie, rating) for movie, rating in action_lover]
).reset_index(drop=True)

similar_movies.head()

Unnamed: 0,action1,action3,action2,romantic2,romantic1,romantic3
0,2.5,2.034204,1.766722,-0.06348,-1.998527,-2.285265
1,-0.06348,-0.949951,-1.297498,2.5,0.370098,0.984848
2,1.371159,1.203271,1.265061,-0.590909,-1.08562,-1.5


In [24]:
# Add up each movie's scores over all rated movies (column-wise sum), best first.
similar_movies.sum(axis=0).sort_values(ascending=False)

action1      3.807680
action3      2.287524
romantic2    1.845611
action2      1.734284
romantic1   -2.714050
romantic3   -2.800417
dtype: float64

## Movie Lens Dataset

In [25]:
import pandas as pd

In [27]:
ratings = pd.read_csv('data/ratings.csv')
movies = pd.read_csv('data/movies.csv')
# Join movie metadata to the ratings on their shared column(s), then discard
# the columns that the recommender does not use.
ratings = movies.merge(ratings).drop(columns=['genres', 'timestamp'])
ratings.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [29]:
# Reshape to a user x movie-title matrix of ratings; combinations with no
# rating come out as NaN.
user_ratings = ratings.pivot_table(
    values='rating',
    index=['userId'],
    columns=['title'],
)
user_ratings.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [32]:
# Keep only the movies (columns) that have at least 10 ratings, then fill the
# remaining NaN entries with 0.
user_ratings = user_ratings.dropna(axis='columns', thresh=10).fillna(0)

In [33]:
# Display the filtered, zero-filled user x movie matrix.
user_ratings

title,"'burbs, The (1989)",(500) Days of Summer (2009),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),...,Zack and Miri Make a Porno (2008),Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.5,3.5,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Build the item-to-item similarity matrix: pairwise Pearson correlation
# between the movie columns of user_ratings.
# NOTE(review): unrated entries were filled with 0 earlier, so they take part in
# the correlation as if they were ratings of 0 — confirm this is intended.
item_similarity_df = user_ratings.corr(method='pearson')
item_similarity_df.head(20)

title,"'burbs, The (1989)",(500) Days of Summer (2009),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),...,Zack and Miri Make a Porno (2008),Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",1.0,0.063117,-0.023768,0.143482,0.011998,0.087931,0.224052,0.034223,0.009277,0.008331,...,0.017477,0.03247,0.134701,0.153158,0.101301,0.049897,0.003233,0.187953,0.062174,0.353194
(500) Days of Summer (2009),0.063117,1.0,0.142471,0.273989,0.19396,0.148903,0.142141,0.159756,0.135486,0.200135,...,0.374515,0.178655,0.068407,0.414585,0.355723,0.252226,0.216007,0.053614,0.241092,0.125905
10 Cloverfield Lane (2016),-0.023768,0.142471,1.0,-0.005799,0.112396,0.006139,-0.016835,0.031704,-0.024275,0.272943,...,0.242663,0.099059,-0.023477,0.272347,0.241751,0.195054,0.319371,0.177846,0.096638,0.002733
10 Things I Hate About You (1999),0.143482,0.273989,-0.005799,1.0,0.24467,0.223481,0.211473,0.011784,0.091964,0.043383,...,0.243118,0.104858,0.13246,0.091853,0.158637,0.281934,0.050031,0.121029,0.130813,0.110612
"10,000 BC (2008)",0.011998,0.19396,0.112396,0.24467,1.0,0.234459,0.119132,0.059187,-0.025882,0.089328,...,0.260261,0.087592,0.094913,0.184521,0.242299,0.240231,0.094773,0.088045,0.203002,0.083518
101 Dalmatians (1996),0.087931,0.148903,0.006139,0.223481,0.234459,1.0,0.285112,0.119843,0.072399,0.029967,...,0.114968,0.077232,0.096294,0.067134,0.113224,0.184324,0.054024,0.047804,0.156932,0.078734
101 Dalmatians (One Hundred and One Dalmatians) (1961),0.224052,0.142141,-0.016835,0.211473,0.119132,0.285112,1.0,0.134037,0.017264,-0.046277,...,0.120302,0.125816,0.049818,0.08365,0.171654,0.27426,0.077594,0.085606,0.24882,0.171118
12 Angry Men (1957),0.034223,0.159756,0.031704,0.011784,0.059187,0.119843,0.134037,1.0,0.132979,0.058862,...,0.104518,0.028415,0.079905,0.241435,0.144652,0.122107,0.056742,-0.001708,0.074306,0.102744
12 Years a Slave (2013),0.009277,0.135486,-0.024275,0.091964,-0.025882,0.072399,0.017264,0.132979,1.0,0.249931,...,0.024045,0.038127,0.013786,0.190366,0.10415,0.017351,0.063325,0.002528,0.037469,0.004213
127 Hours (2010),0.008331,0.200135,0.272943,0.043383,0.089328,0.029967,-0.046277,0.058862,0.249931,1.0,...,0.223135,0.154299,0.012907,0.364841,0.198926,0.091416,0.225747,0.128638,0.153335,0.002912


In [35]:
def get_similar_movies(movie_name, user_rating, similarity_df=None, neutral_rating=2.5):
    """Score every movie by its similarity to ``movie_name``, weighted by a rating.

    The similarity column for ``movie_name`` is multiplied by
    ``user_rating - neutral_rating``: a rating above the neutral point keeps the
    sign of each similarity, a rating below it flips the sign.

    Parameters
    ----------
    movie_name : str
        Column label to look up in the similarity matrix.
    user_rating : float
        The rating the user gave to ``movie_name``.
    similarity_df : pandas.DataFrame, optional
        Item-by-item similarity matrix. Defaults to the notebook-level
        ``item_similarity_df`` as it exists at call time.
    neutral_rating : float, optional
        Rating treated as neither like nor dislike. Defaults to 2.5.

    Returns
    -------
    pandas.Series
        Weighted scores indexed by movie, sorted from highest to lowest.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of the similarity matrix.
    """
    if similarity_df is None:
        # Resolved at call time so the function follows whichever matrix the
        # notebook most recently assigned to item_similarity_df.
        similarity_df = item_similarity_df
    weight = user_rating - neutral_rating
    scores = similarity_df[movie_name] * weight
    return scores.sort_values(ascending=False)

In [38]:
# (movie title, rating) pairs describing one user's taste.
action_lover = [('2001: A Space Odyssey (1968)',3), ('(500) Days of Summer (2009)',5), ('17 Again (2009)',5),
                ('10 Things I Hate About You (1999)',3), ('127 Hours (2010)',2)]

# Build the frame once from a list of score Series (one row per rated movie).
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies it on every iteration. reset_index(drop=True) reproduces the 0..n-1
# row labels that append(..., ignore_index=True) used to give.
similar_movies = pd.DataFrame(
    [get_similar_movies(movie, rating) for movie, rating in action_lover]
).reset_index(drop=True)

# Total score per movie over all rated titles, best recommendations first.
similar_movies.sum().sort_values(ascending=False)

17 Again (2009)                       3.475111
(500) Days of Summer (2009)           3.451106
Definitely, Maybe (2008)              2.292629
Friends with Benefits (2011)          2.261864
Adventureland (2009)                  2.210003
                                        ...   
Forget Paris (1995)                  -0.171266
Drop Zone (1994)                     -0.172499
Madness of King George, The (1994)   -0.201419
Disclosure (1994)                    -0.213279
Clear and Present Danger (1994)      -0.269910
Length: 2269, dtype: float64