In [299]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation
import math

%matplotlib inline

from subprocess import check_output
#print(check_output(["ls", "../ml-100k"]).decode("utf8"))

In [2]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('./ml-100k/u.user', sep='|', names=u_cols,
                    encoding='utf8', parse_dates=True) 

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('./ml-100k/u.data', sep='\t', names=r_cols,
                      encoding='latin-1')

m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('./ml-100k/u.item', sep='|', names=m_cols, usecols=range(5),
                     encoding='latin-1')

In [3]:
movie_ratings = pd.merge(movies, ratings)
df = pd.merge(movie_ratings, users)

df.head(2)

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,user_id,rating,unix_timestamp,age,sex,occupation,zip_code
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,308,4,887736532,60,M,retired,95076
1,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,308,5,887737890,60,M,retired,95076


In [4]:
df

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,user_id,rating,unix_timestamp,age,sex,occupation,zip_code
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,308,4,887736532,60,M,retired,95076
1,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,308,5,887737890,60,M,retired,95076
2,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),308,4,887739608,60,M,retired,95076
3,7,Twelve Monkeys (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,308,4,887738847,60,M,retired,95076
4,8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),308,5,887736696,60,M,retired,95076
5,9,Dead Man Walking (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Dead%20Man%20...,308,4,887737194,60,M,retired,95076
6,11,Seven (Se7en) (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Se7en%20(1995),308,5,887737837,60,M,retired,95076
7,12,"Usual Suspects, The (1995)",14-Aug-1995,,http://us.imdb.com/M/title-exact?Usual%20Suspe...,308,5,887737243,60,M,retired,95076
8,15,Mr. Holland's Opus (1995),29-Jan-1996,,http://us.imdb.com/M/title-exact?Mr.%20Holland...,308,3,887739426,60,M,retired,95076
9,17,From Dusk Till Dawn (1996),05-Feb-1996,,http://us.imdb.com/M/title-exact?From%20Dusk%2...,308,4,887739056,60,M,retired,95076


In [5]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [6]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [7]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [8]:
movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995)


In [73]:
m_genre = ['genre', 'no']
genre = pd.read_csv('./ml-100k/u.genre', sep='|', names=m_genre,
                     encoding='latin-1',usecols=range(1))


In [100]:
genre

Unnamed: 0,genre
0,unknown
1,Action
2,Adventure
3,Animation
4,Children's
5,Comedy
6,Crime
7,Documentary
8,Drama
9,Fantasy


In [82]:
genre['genre'].values.tolist()

['unknown',
 'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

### One-Hot-Encoding

In [84]:
m_cols2 = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
m_cols2 = m_cols2+genre['genre'].values.tolist()
m_cols2

['movie_id',
 'title',
 'release_date',
 'video_release_date',
 'imdb_url',
 'unknown',
 'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [87]:
movies2 = pd.read_csv('./ml-100k/u.item', sep='|', names=m_cols2 , encoding='latin-1')
movies2.tail()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1681,1682,Scream of Stone (Schrei aus Stein) (1991),08-Mar-1996,,http://us.imdb.com/M/title-exact?Schrei%20aus%...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Data Pre-Processing

In [88]:
df.drop(df.columns[[3,4,7]], axis=1, inplace=True)
ratings.drop( "unix_timestamp", inplace = True, axis = 1 ) 
movies.drop(movies.columns[[3,4]], inplace = True, axis = 1 )
#Dropping all the columns that are not really needed
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 9 columns):
movie_id        100000 non-null int64
title           100000 non-null object
release_date    99991 non-null object
user_id         100000 non-null int64
rating          100000 non-null int64
age             100000 non-null int64
sex             100000 non-null object
occupation      100000 non-null object
zip_code        100000 non-null object
dtypes: int64(4), object(5)
memory usage: 7.6+ MB


### Movie Rating

In [90]:
movie_stats = df.groupby('title').agg({'rating': [np.size, np.mean]})
movie_stats.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
'Til There Was You (1997),9,2.333333
1-900 (1994),5,2.6
101 Dalmatians (1996),109,2.908257
12 Angry Men (1957),125,4.344
187 (1997),41,3.02439


In [91]:
min_50 = movie_stats['rating']['size'] >= 50
movie_stats[min_50].sort_values([('rating', 'mean')], ascending=False).head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"Close Shave, A (1995)",112,4.491071
Schindler's List (1993),298,4.466443
"Wrong Trousers, The (1993)",118,4.466102
Casablanca (1942),243,4.45679
Wallace & Gromit: The Best of Aardman Animation (1996),67,4.447761


In [94]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [95]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


Rating

In [209]:
ratings_matrix = ratings.pivot_table(index=['movie_id'],columns=['user_id'],values='rating').reset_index(drop=True)

In [210]:
ratings_matrix

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
0,5.0,4.0,,,4.0,4.0,,,,4.0,...,2.0,3.0,4.0,,4.0,,,5.0,,
1,3.0,,,,3.0,,,,,,...,4.0,,,,,,,,,5.0
2,4.0,,,,,,,,,,...,,,4.0,,,,,,,
3,3.0,,,,,,5.0,,,4.0,...,5.0,,,,,,2.0,,,
4,3.0,,,,,,,,,,...,,,,,,,,,,
5,5.0,,,,,,,,5.0,,...,,,5.0,,,,,,,
6,4.0,,,,,2.0,5.0,3.0,4.0,4.0,...,,,4.0,,4.0,,4.0,4.0,,
7,1.0,,,,,4.0,5.0,,,,...,,,,,,,5.0,,,
8,5.0,,,,,4.0,5.0,,,4.0,...,,1.0,4.0,5.0,3.0,5.0,3.0,,,3.0
9,3.0,2.0,,,,,4.0,,,,...,,,,,,,,,,


In [212]:
ratings_matrix.fillna( 0, inplace = True )
ratings_matrix.head(5)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
0,5.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,4.0,...,2.0,3.0,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
1,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Cosine Similarity

In [227]:
user_similarity = pairwise_distances( ratings_matrix, metric="cosine" )
#np.fill_diagonal( user_similarity, 0 ) #Filling diagonals with 0s for future use when sorting is done
ratings_matrix_u = pd.DataFrame(user_similarity )
ratings_matrix_u

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.000000,0.597618,0.669755,0.545062,0.713286,0.883656,0.379021,0.518886,0.503712,0.726065,...,0.964613,1.000000,1.000000,1.000000,0.964613,1.0,1.0,1.0,0.952817,0.952817
1,0.597618,0.000000,0.726931,0.497429,0.681164,0.916437,0.616597,0.662998,0.744748,0.828918,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.0,1.0,0.921701,0.921701
2,0.669755,0.726931,0.000000,0.675134,0.787043,0.893278,0.627079,0.799206,0.726331,0.841896,...,1.000000,1.000000,1.000000,1.000000,0.967708,1.0,1.0,1.0,1.000000,0.903125
3,0.545062,0.497429,0.675134,0.000000,0.665761,0.909692,0.510717,0.509764,0.580956,0.747439,...,1.000000,1.000000,0.905978,0.905978,0.962391,1.0,1.0,1.0,0.943587,0.924782
4,0.713286,0.681164,0.787043,0.665761,0.000000,0.962701,0.665231,0.740839,0.727552,0.944547,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.0,1.0,1.000000,0.905789
5,0.883656,0.916437,0.893278,0.909692,0.962701,0.000000,0.860383,0.916124,0.848936,0.796903,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.0,1.0,1.000000,1.000000
6,0.379021,0.616597,0.627079,0.510717,0.665231,0.860383,0.000000,0.576485,0.472538,0.681377,...,1.000000,0.948502,1.000000,1.000000,0.948502,1.0,1.0,1.0,0.948502,0.948502
7,0.518886,0.662998,0.799206,0.509764,0.740839,0.916124,0.576485,0.000000,0.575571,0.732236,...,1.000000,0.917967,0.934373,0.934373,0.917967,1.0,1.0,1.0,0.917967,1.000000
8,0.503712,0.744748,0.726331,0.580956,0.727552,0.848936,0.472538,0.575571,0.000000,0.711486,...,1.000000,1.000000,0.942640,0.942640,0.928300,1.0,1.0,1.0,0.942640,0.928300
9,0.726065,0.828918,0.841896,0.747439,0.944547,0.796903,0.681377,0.732236,0.711486,0.000000,...,1.000000,1.000000,0.919736,0.919736,1.000000,1.0,1.0,1.0,1.000000,1.000000


In [216]:
movie_similarity = pairwise_distances( ratings_matrix.as_matrix(), metric="cosine" )
np.fill_diagonal( movie_similarity, 0 ) #Filling diagonals with 0s for future use when sorting is done
ratings_matrix_f = pd.DataFrame( movie_similarity )
ratings_matrix_f

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.000000,0.597618,0.669755,0.545062,0.713286,0.883656,0.379021,0.518886,0.503712,0.726065,...,0.964613,1.000000,1.000000,1.000000,0.964613,1.0,1.0,1.0,0.952817,0.952817
1,0.597618,0.000000,0.726931,0.497429,0.681164,0.916437,0.616597,0.662998,0.744748,0.828918,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.0,1.0,0.921701,0.921701
2,0.669755,0.726931,0.000000,0.675134,0.787043,0.893278,0.627079,0.799206,0.726331,0.841896,...,1.000000,1.000000,1.000000,1.000000,0.967708,1.0,1.0,1.0,1.000000,0.903125
3,0.545062,0.497429,0.675134,0.000000,0.665761,0.909692,0.510717,0.509764,0.580956,0.747439,...,1.000000,1.000000,0.905978,0.905978,0.962391,1.0,1.0,1.0,0.943587,0.924782
4,0.713286,0.681164,0.787043,0.665761,0.000000,0.962701,0.665231,0.740839,0.727552,0.944547,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.0,1.0,1.000000,0.905789
5,0.883656,0.916437,0.893278,0.909692,0.962701,0.000000,0.860383,0.916124,0.848936,0.796903,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.0,1.0,1.000000,1.000000
6,0.379021,0.616597,0.627079,0.510717,0.665231,0.860383,0.000000,0.576485,0.472538,0.681377,...,1.000000,0.948502,1.000000,1.000000,0.948502,1.0,1.0,1.0,0.948502,0.948502
7,0.518886,0.662998,0.799206,0.509764,0.740839,0.916124,0.576485,0.000000,0.575571,0.732236,...,1.000000,0.917967,0.934373,0.934373,0.917967,1.0,1.0,1.0,0.917967,1.000000
8,0.503712,0.744748,0.726331,0.580956,0.727552,0.848936,0.472538,0.575571,0.000000,0.711486,...,1.000000,1.000000,0.942640,0.942640,0.928300,1.0,1.0,1.0,0.942640,0.928300
9,0.726065,0.828918,0.841896,0.747439,0.944547,0.796903,0.681377,0.732236,0.711486,0.000000,...,1.000000,1.000000,0.919736,0.919736,1.000000,1.0,1.0,1.0,1.000000,1.000000


### Recomemdation

In [219]:
try:
    #user_inp=input('Enter the reference movie title based on which recommendations are to be made: ')
    user_inp="Toy Story (1995)"
    inp=movies[movies['title']==user_inp].index.tolist()
    inp=inp[0]
    
    movies['similarity'] = ratings_matrix_f.iloc[inp]
    movies.columns = ['movie_id', 'title', 'release_date','similarity']
    movies.head(2)
    
except:
    print("Sorry, the movie is not in the database!")

In [220]:
print("Recommended movies based on your choice of ",user_inp ,": \n", movies.sort_values( ["similarity"], ascending = False )[1:20])

Recommended movies based on your choice of  Toy Story (1995) : 
       movie_id                                         title release_date  \
1571      1572                Wend Kuuni (God's Gift) (1982)  01-Jan-1982   
1623      1624                                   Hush (1998)  10-Mar-1998   
1624      1625                             Nightwatch (1997)  22-Apr-1997   
1569      1570                        Quartier Mozart (1992)  01-Jan-1992   
1568      1569      Vie est belle, La (Life is Rosey) (1987)  01-Jan-1987   
1567      1568                  Vermont Is For Lovers (1992)  01-Jan-1992   
1566      1567                                Careful (1992)  01-Jan-1992   
1565      1566               Man from Down Under, The (1943)  01-Jan-1943   
1564      1565                                  Daens (1992)  01-Jan-1992   
1563      1564                   To Cross the Rubicon (1991)  01-Jan-1991   
1562      1563        Promise, The (Versprechen, Das) (1994)  01-Jan-1994   
1561      1

In [181]:
user_inp="Terminator, The (1984)"
inp=movies[movies['title']==user_inp].index.tolist()
inp=inp[0]
inp

194

In [184]:
movies['similarity'] = ratings_matrix.iloc[inp]

In [185]:
movies

Unnamed: 0,movie_id,title,release_date,similarity
0,1,Toy Story (1995),01-Jan-1995,
1,2,GoldenEye (1995),01-Jan-1995,5.0
2,3,Four Rooms (1995),01-Jan-1995,0.0
3,4,Get Shorty (1995),01-Jan-1995,0.0
4,5,Copycat (1995),01-Jan-1995,0.0
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,01-Jan-1995,0.0
6,7,Twelve Monkeys (1995),01-Jan-1995,4.0
7,8,Babe (1995),01-Jan-1995,5.0
8,9,Dead Man Walking (1995),01-Jan-1995,5.0
9,10,Richard III (1995),22-Jan-1996,0.0


In [224]:
def computeCosineSimilarity(firstProdeucIndex,secondProductIndex):    
    sumdotAB = 0
    sumMagA = 0
    sumMagB = 0  
    for i in range(0,len(ratings_matrix)):
        a = ratings_matrix.iloc[:,firstProdeucIndex].values[i]
        b = ratings_matrix.iloc[:,secondProductIndex].values[i]
        #print(a)
        #print(b)
        sumdotAB = sumdotAB + np.dot(a,b) 
        
        sumMagA = sumMagA + np.dot(a,a) 
        sumMagB = sumMagB + np.dot(b,b) 
      
    #print(sumdotAB)
    #print(sumMagA)
    #print(sumMagB)
    cosine_similarity = sumdotAB / (math.sqrt(sumMagA)*math.sqrt(sumMagB))
    return cosine_similarity

In [None]:
def computeUserCosineSimilarity(uerdf):
    

In [203]:
cosAC = computeCosineSimilarity(0,1)
print(cosAC)

0.166930983869


In [152]:
ratings_matrix.head(2)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
0,5.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,4.0,...,2.0,3.0,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
1,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [162]:
for i in range(0,100):
    cosAC = computeCosineSimilarity(0,i)
    if cosAC > 0.5:
        print(i,' ',cosAC)

0   1.0
91   0.540533561184


In [168]:
ratings_matrix.iloc[:,0]

0       5.0
1       3.0
2       4.0
3       3.0
4       3.0
5       5.0
6       4.0
7       1.0
8       5.0
9       3.0
10      2.0
11      5.0
12      5.0
13      5.0
14      5.0
15      5.0
16      3.0
17      4.0
18      5.0
19      4.0
20      1.0
21      4.0
22      4.0
23      3.0
24      4.0
25      3.0
26      2.0
27      4.0
28      1.0
29      3.0
       ... 
1652    0.0
1653    0.0
1654    0.0
1655    0.0
1656    0.0
1657    0.0
1658    0.0
1659    0.0
1660    0.0
1661    0.0
1662    0.0
1663    0.0
1664    0.0
1665    0.0
1666    0.0
1667    0.0
1668    0.0
1669    0.0
1670    0.0
1671    0.0
1672    0.0
1673    0.0
1674    0.0
1675    0.0
1676    0.0
1677    0.0
1678    0.0
1679    0.0
1680    0.0
1681    0.0
Name: 1, Length: 1682, dtype: float64

In [171]:
ratings_matrix.iloc[:,94]

0       5.0
1       2.0
2       1.0
3       0.0
4       0.0
5       0.0
6       5.0
7       5.0
8       0.0
9       0.0
10      0.0
11      0.0
12      0.0
13      5.0
14      4.0
15      0.0
16      0.0
17      0.0
18      0.0
19      0.0
20      0.0
21      4.0
22      0.0
23      3.0
24      3.0
25      3.0
26      0.0
27      4.0
28      0.0
29      0.0
       ... 
1652    0.0
1653    0.0
1654    0.0
1655    0.0
1656    0.0
1657    0.0
1658    0.0
1659    0.0
1660    0.0
1661    0.0
1662    0.0
1663    0.0
1664    0.0
1665    0.0
1666    0.0
1667    0.0
1668    0.0
1669    0.0
1670    0.0
1671    0.0
1672    0.0
1673    0.0
1674    0.0
1675    0.0
1676    0.0
1677    0.0
1678    0.0
1679    0.0
1680    0.0
1681    0.0
Name: 95, Length: 1682, dtype: float64

In [179]:
movies.iloc[194]

movie_id                           195
title           Terminator, The (1984)
release_date               01-Jan-1984
similarity                    0.999285
Name: 194, dtype: object

In [222]:
cosAC = computeCosineSimilarity(0,6)
print(cosAC)

0.998561492302


In [223]:
cosAC = computeCosineSimilarity(0,1)
print(cosAC)

0.995213420545


In [232]:
user_rating_matrix = (ratings.pivot_table(index=['movie_id'],
                                          columns=['user_id'],values='rating').reset_index(drop=True)).T

In [235]:
user_rating_matrix.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [283]:
user_rating_matrix.fillna( 0, inplace = True )
user_rating_matrix

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,5.0,0.0,0.0,5.0,5.0,5.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [282]:
user_rating_matrix.T.iloc[:,942]

0       0.0
1       5.0
2       0.0
3       0.0
4       0.0
5       0.0
6       0.0
7       0.0
8       3.0
9       0.0
10      4.0
11      5.0
12      0.0
13      0.0
14      0.0
15      0.0
16      0.0
17      0.0
18      0.0
19      0.0
20      0.0
21      4.0
22      4.0
23      4.0
24      0.0
25      0.0
26      4.0
27      4.0
28      0.0
29      0.0
       ... 
1652    0.0
1653    0.0
1654    0.0
1655    0.0
1656    0.0
1657    0.0
1658    0.0
1659    0.0
1660    0.0
1661    0.0
1662    0.0
1663    0.0
1664    0.0
1665    0.0
1666    0.0
1667    0.0
1668    0.0
1669    0.0
1670    0.0
1671    0.0
1672    0.0
1673    0.0
1674    0.0
1675    0.0
1676    0.0
1677    0.0
1678    0.0
1679    0.0
1680    0.0
1681    0.0
Name: 943, Length: 1682, dtype: float64

In [262]:
len(user_rating_matrix.T)


1682

In [297]:
def computeUserCosineSimilarity(userIndex,userRatingDf):    
    sumdotAB = 0
    sumMagA = 0
    sumMagB = 0  
    for i in range(0,10):
        for j in range(0,10):
            a = userRatingDf.iloc[:,userIndex].values[j]
            b = userRatingDf.iloc[:,i].values[j]
            #print(a)
            #print(b)
            sumdotAB = sumdotAB + np.dot(a,b) 
        
            sumMagA = sumMagA + np.dot(a,a) 
            sumMagB = sumMagB + np.dot(b,b) 
    
    print(sumdotAB)
        
    return sumdotAB

In [298]:
computeUserCosineSimilarity(0,user_rating_matrix)

834168.0


834168.0