In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.impute import KNNImputer

# Default size (width, height in inches) for every figure drawn below
plt.rcParams.update({'figure.figsize': (12, 6)})

In [2]:
# Load the raw ratings and index them by movie id
R = pd.read_csv('data/ratings.csv').set_index('movieId')

In [3]:
# Keep only the user id and rating columns; movieId remains the index
R = R.loc[:, ['userId', 'rating']]
R.head()

Unnamed: 0_level_0,userId,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,4.0
3,1,4.0
6,1,4.0
47,1,5.0
50,1,5.0


In [4]:
# Load the movie metadata (title, genres), indexed by movie id
movies = pd.read_csv('data/movies.csv').set_index('movieId')
movies

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
# Attach the movie title to every rating by matching on movieId;
# the inner join keeps only ratings whose movie id exists in `movies`.
R = R.join(
    movies.loc[:, ['title']],
    on='movieId',
    how='inner',
)
R

Unnamed: 0_level_0,userId,rating,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,4.0,Toy Story (1995)
1,5,4.0,Toy Story (1995)
1,7,4.5,Toy Story (1995)
1,15,2.5,Toy Story (1995)
1,17,4.5,Toy Story (1995)
...,...,...,...
160341,610,2.5,Bloodmoon (1997)
160527,610,4.5,Sympathy for the Underdog (1971)
160836,610,3.0,Hazard (2005)
163937,610,3.5,Blair Witch (2016)


In [6]:
# Reshape to a user x title matrix of ratings; movies a user has not rated
# show up as NaN. Ratings sharing the same (userId, title) pair are combined
# by pivot_table's default aggregation.
R = pd.pivot_table(R, values='rating', index='userId', columns='title')
R.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# Nearest-neighbour imputer used to fill in the missing ratings;
# no arguments are passed, so the library defaults apply.
imputer = KNNImputer()

In [8]:
# Preview of the imputed ratings as a raw array; the result is only
# displayed here, not stored.
imputer.fit_transform(R)

array([[4. , 4. , 3.5, ..., 2. , 4. , 1. ],
       [4. , 4. , 3.5, ..., 2. , 3. , 1. ],
       [4. , 4. , 3.5, ..., 2. , 2.7, 1. ],
       ...,
       [4. , 4. , 3.5, ..., 2. , 3. , 1. ],
       [4. , 4. , 3.5, ..., 2. , 3.2, 1. ],
       [4. , 4. , 3.5, ..., 1.5, 3.5, 1. ]])

In [9]:
# Wrap the imputed array back into a DataFrame that carries the original
# user ids (rows) and movie titles (columns).
R_imputed = pd.DataFrame(
    data=imputer.fit_transform(R),
    columns=R.columns,
    index=R.index,
)
R_imputed

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,4.0,3.5,5.0,4.0,1.5,4.1,3.0,4.3,3.3,...,1.5,4.1,3.666667,3.0,3.0,3.9,2.7,2.0,4.0,1.0
2,4.0,4.0,3.5,5.0,4.0,1.5,3.0,3.0,3.8,3.2,...,1.5,4.1,3.666667,3.0,3.0,4.2,3.0,2.0,3.0,1.0
3,4.0,4.0,3.5,5.0,4.0,1.5,2.3,3.0,2.8,3.1,...,1.5,4.2,3.666667,3.0,3.0,3.8,2.9,2.0,2.7,1.0
4,4.0,4.0,3.5,5.0,4.0,1.5,3.2,3.0,3.7,3.1,...,1.5,3.8,3.666667,3.0,3.0,3.3,2.2,2.0,3.0,1.0
5,4.0,4.0,3.5,5.0,4.0,1.5,2.2,3.0,3.7,3.1,...,1.5,4.2,3.666667,3.0,3.0,4.1,2.3,2.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,4.0,4.0,3.5,5.0,4.0,1.5,2.9,3.0,4.2,3.2,...,1.5,3.5,3.666667,3.0,3.0,4.1,3.2,2.0,3.3,1.0
607,4.0,4.0,3.5,5.0,4.0,1.5,3.7,3.0,3.9,3.3,...,1.5,3.8,3.666667,3.0,3.0,4.3,3.5,2.0,3.5,1.0
608,4.0,4.0,3.5,5.0,4.0,1.5,2.3,3.0,4.5,3.4,...,1.5,3.8,3.666667,3.0,3.0,4.5,3.5,2.0,3.0,1.0
609,4.0,4.0,3.5,5.0,4.0,1.5,2.8,3.0,3.7,3.1,...,1.5,4.1,3.666667,3.0,3.0,4.2,2.7,2.0,3.2,1.0


### Calculate Cosine Similarities between users

In [10]:
def cosim(vec1, vec2):
    """Calculate the cosine similarity between two vectors.

    Computed as ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``.

    Parameters
    ----------
    vec1, vec2 : array-like
        One-dimensional numeric vectors of equal length.

    Returns
    -------
    float
        Cosine of the angle between the two vectors. When either vector has
        zero norm the similarity is mathematically undefined; 0.0 is returned
        in that case (instead of NaN from a 0/0 division), which is the same
        convention scikit-learn's ``cosine_similarity`` follows.
    """
    num = np.dot(vec1, vec2)
    denom = np.sqrt(np.dot(vec1, vec1)) * np.sqrt(np.dot(vec2, vec2))

    # Guard against division by zero for all-zero vectors.
    if denom == 0:
        return 0.0
    return num / denom

In [11]:
# Pull the imputed rating rows of two users to compare them by hand
user_500, user_420 = (R_imputed.loc[uid] for uid in (500, 420))

In [12]:
# Cosine similarity of the two sample users, using the hand-written helper
cosim(user_500, user_420)

0.9969419702187148

In [13]:
# Pairwise user-user cosine similarities, labelled by userId on both axes
cs = pd.DataFrame(
    data=cosine_similarity(R_imputed),
    columns=R_imputed.index,
    index=R_imputed.index,
)

In [14]:
# Display the user-by-user similarity matrix
cs

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.996211,0.992444,0.993506,0.995570,0.996140,0.995129,0.996734,0.997106,0.994213,...,0.997515,0.996156,0.991197,0.996082,0.995464,0.994854,0.996360,0.992643,0.994802,0.996078
2,0.996211,1.000000,0.995213,0.995291,0.996922,0.996245,0.996133,0.997005,0.996963,0.995731,...,0.996929,0.996468,0.993636,0.996948,0.996761,0.996474,0.996317,0.994785,0.996968,0.995922
3,0.992444,0.995213,1.000000,0.993960,0.995386,0.993634,0.994230,0.994429,0.994259,0.994311,...,0.993657,0.994415,0.992351,0.994681,0.995649,0.993686,0.993521,0.993290,0.995953,0.993475
4,0.993506,0.995291,0.993960,1.000000,0.995787,0.994333,0.995823,0.995547,0.995117,0.995180,...,0.994391,0.995278,0.992579,0.995570,0.995616,0.994764,0.994700,0.991820,0.995816,0.993542
5,0.995570,0.996922,0.995386,0.995787,1.000000,0.995587,0.996272,0.997072,0.996718,0.995513,...,0.996408,0.996713,0.993839,0.996920,0.996515,0.996315,0.996157,0.994203,0.997523,0.995357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.994854,0.996474,0.993686,0.994764,0.996315,0.994868,0.995466,0.996696,0.996013,0.994772,...,0.996098,0.995951,0.993311,0.996874,0.996172,1.000000,0.995449,0.993918,0.996507,0.995030
607,0.996360,0.996317,0.993521,0.994700,0.996157,0.996769,0.996299,0.996881,0.996569,0.995245,...,0.997043,0.996538,0.992349,0.996872,0.996296,0.995449,1.000000,0.993436,0.996061,0.995500
608,0.992643,0.994785,0.993290,0.991820,0.994203,0.992559,0.993148,0.994404,0.993988,0.992733,...,0.994239,0.994311,0.991026,0.993976,0.994477,0.993918,0.993436,1.000000,0.994842,0.992711
609,0.994802,0.996968,0.995953,0.995816,0.997523,0.995611,0.996253,0.997115,0.996372,0.996049,...,0.995801,0.996761,0.993872,0.997044,0.996784,0.996507,0.996061,0.994842,1.000000,0.994668
