In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
# Ratings table: one row per (userId, movieId) rating, with a timestamp column
ratings_df = pd.read_csv(filepath_or_buffer='ratings.csv')
# Movies table: movieId together with its title and pipe-separated genres
movies_df = pd.read_csv(filepath_or_buffer='movies.csv')


In [4]:
# Preview both source tables (head() defaults to the first 5 rows):
# ratings first, then movies, each printed on its own lines
print(ratings_df.head(), movies_df.head(), sep='\n')


   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [5]:
# Attach movie metadata to every rating by joining on the shared 'movieId' key.
# how='inner' is the pandas default, spelled out here for the reader.
ratings_movies_df = ratings_df.merge(movies_df, how='inner', on='movieId')


In [6]:
# Remove the 'timestamp' column.
# Rebinding the name (rather than inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column has already been
# dropped no longer raises KeyError.
ratings_movies_df = ratings_movies_df.drop(columns='timestamp', errors='ignore')


In [8]:
# Show the first 5 merged rows (rating plus movie title/genres)
ratings_movies_df.head(n=5)

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [9]:
# Build the user x movie rating matrix: one row per userId, one column per movieId.
# aggfunc='mean' is the pivot_table default (duplicate user/movie pairs are averaged);
# combinations with no rating are left as NaN at this point.
user_movie_ratings = ratings_movies_df.pivot_table(
    values='rating',
    index='userId',
    columns='movieId',
    aggfunc='mean',
)


In [13]:
# Show the first 5 rows of the user x movie matrix.
# NOTE(review): the saved output shows 0.0 for unrated movies, so this cell appears
# to have last been run after the NaN -> 0 fill cell further down (In [13] vs In [12]);
# on a fresh top-to-bottom run it would presumably show NaN instead.
user_movie_ratings.head(n=5)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Treat "no rating" as 0 so every user row is a dense numeric vector
# (the matrix is modified in place)
user_movie_ratings.fillna(value=0, inplace=True)


In [14]:
# Pairwise user-user cosine similarity: entry [i, j] compares row i with row j
# of the user x movie matrix
cosine_sim_matrix = cosine_similarity(X=user_movie_ratings)


In [15]:
def get_similar_users(user_id, cosine_sim_matrix, top_n=10, exclude_self=False):
    """Return the ids of the users most similar to ``user_id``.

    Row/column ``i`` of ``cosine_sim_matrix`` is taken to belong to user id
    ``i + 1``; the function converts between the two with a fixed offset of 1.
    NOTE(review): this only holds if user ids are the contiguous range 1..N in
    matrix row order -- confirm against how the matrix was built.

    Parameters
    ----------
    user_id : int
        1-based id of the user to look up.
    cosine_sim_matrix : array-like, indexable by row
        Square user-user similarity matrix.
    top_n : int, default 10
        Maximum number of user ids to return.
    exclude_self : bool, default False
        When False (the original behaviour) the queried user is ranked like
        everyone else and therefore normally appears first, since a user's
        similarity with themself is the maximum. Pass True to drop the queried
        user from the result.

    Returns
    -------
    numpy.ndarray
        Up to ``top_n`` 1-based user ids, ordered by descending similarity.

    Raises
    ------
    IndexError
        If ``user_id`` is outside ``1..len(cosine_sim_matrix)``. Without this
        check, ``user_id <= 0`` would wrap around through negative indexing and
        silently return another user's neighbours.
    ValueError
        If ``top_n`` is negative.
    """
    n_users = len(cosine_sim_matrix)
    if not 1 <= user_id <= n_users:
        raise IndexError(
            f"user_id {user_id} is out of range; expected a value in 1..{n_users}"
        )
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Convert the 1-based user id into a 0-based row position
    user_index = user_id - 1
    # Similarity of this user against every user (including themself)
    user_scores = np.asarray(cosine_sim_matrix[user_index])
    # Negate so that ascending argsort yields positions in descending-score order
    ranked_positions = np.argsort(-user_scores)
    if exclude_self:
        # Filter by position rather than assuming self is ranked first: another
        # user with an identical rating vector can tie at the top.
        ranked_positions = ranked_positions[ranked_positions != user_index]
    # Convert 0-based row positions back to 1-based user ids
    return ranked_positions[:top_n] + 1

# Look up the users ranked closest to user 1 in the similarity matrix,
# then print their ids
similar_users = get_similar_users(user_id=1, cosine_sim_matrix=cosine_sim_matrix)
print(similar_users)

[  1 266 313 368  57  91 469  39 288 452]
