In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

rating_header = ["user_id", "movie_id", "rating", "timestamp"]
df = pd.read_csv("u.data", sep = '\t', header = None, names=rating_header)
df.drop(["timestamp","rating"],axis=1,inplace=True)

df.head()

Unnamed: 0,user_id,movie_id
0,196,242
1,186,302
2,22,377
3,244,51
4,166,346


In [2]:
movie_header = ["movie_id", "title", "release_date", "video_release_date", "IMDb_URL",
         "unknown", "Action", "Adventure", "Animation","Children's", "Comedy", "Crime",
         "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", 
         "Romance", "Sci-Fi", "Thriller", "War", "Western"]
movies = pd.read_csv("u.item", sep = '|', header = None, encoding = 'latin1', names = movie_header)
movies = movies.drop(columns=["video_release_date","title", "release_date", "IMDb_URL"])
movies.head()

Unnamed: 0,movie_id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [3]:
merge_df_movie = pd.merge(df, movies, on='movie_id')
merge_df_movie.head()

Unnamed: 0,user_id,movie_id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,63,242,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,226,242,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,154,242,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,306,242,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
movie_vec = movies
movie_vec = movie_vec.drop(columns=["movie_id"])
movie_vec.head()

Unnamed: 0,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [5]:
user_vec = merge_df_movie
user_vec.drop(['movie_id'],axis=1,inplace=True)
user_vec = user_vec.groupby("user_id").mean()
user_vec.head()

Unnamed: 0_level_0,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0.003676,0.275735,0.154412,0.044118,0.091912,0.334559,0.091912,0.018382,0.393382,0.007353,0.003676,0.047794,0.047794,0.018382,0.161765,0.158088,0.191176,0.091912,0.022059
2,0.0,0.16129,0.048387,0.016129,0.064516,0.258065,0.145161,0.0,0.564516,0.016129,0.032258,0.032258,0.016129,0.064516,0.258065,0.064516,0.193548,0.048387,0.0
3,0.0,0.259259,0.074074,0.0,0.0,0.222222,0.185185,0.018519,0.407407,0.0,0.037037,0.092593,0.037037,0.203704,0.092593,0.148148,0.388889,0.092593,0.0
4,0.0,0.333333,0.166667,0.0,0.0,0.166667,0.166667,0.041667,0.25,0.0,0.0,0.041667,0.041667,0.208333,0.125,0.25,0.458333,0.083333,0.0
5,0.005714,0.32,0.188571,0.08,0.165714,0.468571,0.051429,0.0,0.154286,0.011429,0.005714,0.16,0.068571,0.017143,0.108571,0.188571,0.108571,0.08,0.011429


In [6]:
user_movie_similarity_matrix = cosine_similarity(user_vec.values,movie_vec.values)
user_movie_similarity_matrix = pd.DataFrame(user_movie_similarity_matrix, index=user_vec.index,columns=movie_vec.index)
user_movie_similarity_matrix.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.389184,0.513844,0.273847,0.830056,0.559451,0.563494,0.558575,0.678031,0.563494,0.491546,...,0.472927,0.563494,0.563494,0.563494,0.563494,0.563494,0.357488,0.562299,0.479233,0.563494
2,0.262152,0.312086,0.259463,0.761489,0.699072,0.756768,0.596272,0.686589,0.756768,0.580983,...,0.336358,0.756768,0.756768,0.756768,0.756768,0.756768,0.428093,0.77974,0.345951,0.756768
3,0.170561,0.554322,0.516984,0.682242,0.753309,0.541603,0.522233,0.483255,0.541603,0.47001,...,0.609272,0.541603,0.541603,0.541603,0.541603,0.541603,0.452602,0.47001,0.29542,0.541603
4,0.124154,0.713886,0.591364,0.558694,0.651809,0.322562,0.456172,0.310385,0.322562,0.304114,...,0.722272,0.322562,0.322562,0.322562,0.322562,0.322562,0.5322,0.342129,0.215041,0.322562
5,0.575055,0.496847,0.151396,0.759072,0.253024,0.215141,0.338062,0.63486,0.215141,0.231009,...,0.422577,0.215141,0.215141,0.215141,0.215141,0.215141,0.214106,0.259181,0.653392,0.215141


In [7]:
def get_the_most_similar_movies(user_id, user_movie_matrix,num):
    user_vec = user_movie_matrix.loc[user_id].values 
    sorted_index = np.argsort(user_vec)[::-1][:num]
    return list(user_movie_matrix.columns[sorted_index])
    #Find the top-n movies most similar to the user

In [8]:
def get_the_most_similar_users(movie_id, user_movie_matrix,num):
    movie_vec = user_movie_matrix.loc[:,movie_id].values 
    sorted_index = np.argsort(movie_vec)[::-1][:num]
    return list(user_movie_matrix.index[sorted_index])    
    #Find the top-n users most similar to the movie

In [9]:
get_the_most_similar_movies(1, user_movie_similarity_matrix,10)

[3, 73, 1137, 336, 44, 1630, 1556, 1632, 1240, 1307]

In [10]:
get_the_most_similar_users(1, user_movie_similarity_matrix,10)

[55, 386, 217, 37, 564, 513, 619, 359, 671, 396]