In [9]:
import pandas as pd
import numpy as np

In [10]:
movies_df = pd.read_csv("D:\Mymovie .csv")

In [11]:
movies_df.shape

(8992, 3)

In [12]:
movies_df[0:5]

Unnamed: 0,userId,movie,rating
0,3,Toy Story (1995),4.0
1,6,Toy Story (1995),5.0
2,8,Toy Story (1995),4.0
3,10,Toy Story (1995),4.0
4,11,Toy Story (1995),4.5


In [14]:
#number of unique users in the dataset
len(movies_df.userId.unique())

4081

In [15]:
len(movies_df.movie.unique())

10

In [16]:
user_movies_df = movies_df.pivot(index='userId',
                                 columns='movie',
                                 values='rating').reset_index(drop=True)

In [17]:
user_movies_df

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
0,,,,,3.5,,,,,
1,,,4.0,,,,,,,
2,,,,,,,,,4.0,
3,,4.0,,3.0,,,,,,
4,,,,,3.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
4076,4.0,,,,,,,,,
4077,3.5,,,,,,,,4.0,
4078,,3.0,4.0,5.0,,3.0,1.0,,4.0,
4079,,,,,,,,,5.0,


In [19]:
# Set unique user-id's from original dataset as index for new created pivot matrix
user_movies_df.index = movies_df.userId.unique()

In [20]:
user_movies_df.head()

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
3,,,,,3.5,,,,,
6,,,4.0,,,,,,,
8,,,,,,,,,4.0,
10,,4.0,,3.0,,,,,,
11,,,,,3.0,,,,,


In [21]:
#Impute those NaNs with 0 values
user_movies_df.fillna(0, inplace=True)

In [22]:
user_movies_df

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
3,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
10,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7044,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7070,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
7080,0.0,3.0,4.0,5.0,0.0,3.0,1.0,0.0,4.0,0.0
7087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0


In [23]:
#Calculating Cosine Similarity between Users
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [24]:
user_sim = 1 - pairwise_distances( user_movies_df.values,metric='cosine')

In [25]:
user_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.55337157],
       [0.        , 1.        , 0.        , ..., 0.45883147, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       ...,
       [0.        , 0.45883147, 0.45883147, ..., 1.        , 0.45883147,
        0.47607054],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       [0.55337157, 0.        , 0.62254302, ..., 0.47607054, 0.62254302,
        1.        ]])

In [26]:
#Store the results in a dataframe
user_sim_df = pd.DataFrame(user_sim)

In [27]:
#Set the index and column names to user ids 
user_sim_df.index = movies_df.userId.unique()
user_sim_df.columns = movies_df.userId.unique()

In [28]:
# Similarity scores for first five customers (3 & 11 are similar customers)
user_sim_df.iloc[0:5, 0:5]

Unnamed: 0,3,6,8,10,11
3,1.0,0.0,0.0,0.0,1.0
6,0.0,1.0,0.0,0.0,0.0
8,0.0,0.0,1.0,0.0,0.0
10,0.0,0.0,0.0,1.0,0.0
11,1.0,0.0,0.0,0.0,1.0


In [29]:
# Correlation with self is always 1, so replace diagonal values of correlation matrix with zeros
np.fill_diagonal(user_sim, 0)
user_sim_df.iloc[0:5, 0:5]

Unnamed: 0,3,6,8,10,11
3,0.0,0.0,0.0,0.0,1.0
6,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0
11,1.0,0.0,0.0,0.0,0.0


In [20]:
#Most Similar Users
user_sim_df.idxmax(axis=1)[0:]

# 3 & 11 are highly similar followed by 6 & 168 and so on

3         11
6        168
8         16
10      4047
11         3
        ... 
7044      80
7070    1808
7080     708
7087       8
7105    4110
Length: 4081, dtype: int64

In [28]:
movies_df[(movies_df['userId']==8) | (movies_df['userId']==16)]

Unnamed: 0,userId,movie,rating
2,8,Toy Story (1995),4.0
8,16,Toy Story (1995),3.0
3727,8,Grumpier Old Men (1995),5.0
5205,8,Heat (1995),3.0
5207,16,Heat (1995),3.0
7445,8,GoldenEye (1995),4.0


In [22]:
user_1=movies_df[movies_df['userId']==6]

In [23]:
user_2=movies_df[movies_df['userId']==168]

In [24]:
user_2.movie

60    Toy Story (1995)
Name: movie, dtype: object

In [25]:
user_1.movie

1              Toy Story (1995)
3725    Grumpier Old Men (1995)
6464             Sabrina (1995)
Name: movie, dtype: object

In [26]:
# Join movies watched by both and then we can recommend the ones which have not been watched
# so we can recommend Grumpier and Sabrina top user id: 168 based on below matrix
pd.merge(user_1,user_2,on='movie',how='outer')

Unnamed: 0,userId_x,movie,rating_x,userId_y,rating_y
0,6,Toy Story (1995),5.0,168.0,4.5
1,6,Grumpier Old Men (1995),3.0,,
2,6,Sabrina (1995),5.0,,
