In [1]:
import pandas as pd 
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [4]:
# Read ratings.csv and drop "timestamp" column
ratings = pd.read_csv('../csv/ratings.csv')
print(ratings.head())
print(ratings.shape)
  
# Read movies.csv and drop "genres" column
movies = pd.read_csv('../csv/movies.csv')
print(movies.head())
print(movies.shape)

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
(100836, 4)
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
(9742, 3)


In [3]:
# Join the two dataframes on movieId
df = pd.merge(ratings, movies, on='movieId')
print(df.head())

   userId  movieId  rating   timestamp             title  \
0       1        1     4.0   964982703  Toy Story (1995)   
1       5        1     4.0   847434962  Toy Story (1995)   
2       7        1     4.5  1106635946  Toy Story (1995)   
3      15        1     2.5  1510577970  Toy Story (1995)   
4      17        1     4.5  1305696483  Toy Story (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1  Adventure|Animation|Children|Comedy|Fantasy  
2  Adventure|Animation|Children|Comedy|Fantasy  
3  Adventure|Animation|Children|Comedy|Fantasy  
4  Adventure|Animation|Children|Comedy|Fantasy  


In [16]:
# Using pd.pivot_table, transform the table into a matrix so that each row represents a movie and each column represents a user
# (index should be "movieId")
pivot_table = pd.pivot_table(df, index='movieId', columns='userId', values='rating')

# fill NaN values with 0
pivot_table.fillna(0, inplace=True)
print(pivot_table.head())
# store the pivot table in a excel file
pivot_table.to_excel('../csv/pivot_table.xlsx')
movie_id_ = 318
# get data of the movie with id 318
movie_data = pivot_table.loc[movie_id_].values.reshape(1, -1)
print(movie_data)

userId   1    2    3    4    5    6    7    8    9    10   ...  601  602  603  \
movieId                                                    ...                  
1        4.0  0.0  0.0  0.0  4.0  0.0  4.5  0.0  0.0  0.0  ...  4.0  0.0  4.0   
2        0.0  0.0  0.0  0.0  0.0  4.0  0.0  4.0  0.0  0.0  ...  0.0  4.0  0.0   
3        4.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4        0.0  0.0  0.0  0.0  0.0  3.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
5        0.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

userId   604  605  606  607  608  609  610  
movieId                                     
1        3.0  4.0  2.5  4.0  2.5  3.0  5.0  
2        5.0  3.5  0.0  0.0  2.0  0.0  0.0  
3        0.0  0.0  0.0  0.0  2.0  0.0  0.0  
4        0.0  0.0  0.0  0.0  0.0  0.0  0.0  
5        3.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 610 columns]


KeyboardInterrupt: 

In [32]:
# Create a function that finds the similar movies to a given movie id.
def similar_movies(df, k, movie_id):
    # df is the movie-user matrix 
    # k is the number of similar movies to find
    # movie_id is the movie id to find similar movies to
    # Build a NearestNesighbors model
    neighbors = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='cosine')
    # Fit the model to using df
    neighbors.fit(df)
    # Find the closest neighbors using .kneighbors and passing the rankings associated with the movie_id
    # This step will return a list of movie ids
    film = df.loc[movie_id].values.reshape(1, -1)
    distances, indices = neighbors.kneighbors(film)
    # return a list with the recommended movie titles
    title = []
    indices = indices.tolist()[0]
    for i in indices:
            title.append(df.iloc[i].name)
    return title

In [33]:
movie_name = 'Shawshank Redemption, The (1994)'
# Find the movieId associated with the movie_name
movie_id = movies[movies['title'] == movie_name]['movieId'].values[0]
# Use similar_movies() to find 10 movie recommendations
recommended_movies = similar_movies(pivot_table, 10, movie_id)
print("Since you watched %s: \n" % movie_name)
for i, m in enumerate(recommended_movies):
    movie_title = movies[movies['movieId'] == m]['title'].values[0]
    print("%d: %s" % (i+1, movie_title))

318
Since you watched Shawshank Redemption, The (1994): 

1: Shawshank Redemption, The (1994)
2: Forrest Gump (1994)
3: Pulp Fiction (1994)
4: Silence of the Lambs, The (1991)
5: Usual Suspects, The (1995)
6: Schindler's List (1993)
7: Fight Club (1999)
8: Braveheart (1995)
9: Matrix, The (1999)
10: Apollo 13 (1995)


In [34]:
## Modify the fuction to take the user id as well and ensure that the recomended movies are not already watched by the user. 

def similar_movies_2(df, k, movie_id, user_id):
    neighbors = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='cosine')
    neighbors.fit(df)
    film = df.loc[movie_id].values.reshape(1, -1)
    distances, indices = neighbors.kneighbors(film)
    title = []
    indices = indices.tolist()[0]
    for i in indices:
        title.append(df.iloc[i].name)
    # get the movies that the user has already watched
    user_movies = df.loc[:, user_id]
    user_movies = user_movies[user_movies > 0].index.values.tolist()
    # remove the watched movies from the recommended movies
    recommended_movies = [m for m in title if m not in user_movies]
    return recommended_movies[:k]


In [38]:
movie_name = 'Shawshank Redemption, The (1994)'
user_id = 5

recommended_movies = similar_movies_2(pivot_table, 10, movie_id, user_id)

print("Since you watched %s: \n" % movie_name)
for i, m in enumerate(recommended_movies):
    movie_title = movies[movies['movieId'] == m]['title'].values[0]
    print("%d: %s" % (i+1, movie_title))

Since you watched Shawshank Redemption, The (1994): 

1: Forrest Gump (1994)
2: Silence of the Lambs, The (1991)
3: Fight Club (1999)
4: Matrix, The (1999)
