## Creating the matrix

In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import NMF 


In [2]:
movies = pd.read_csv("./data/movies.csv")
ratings = pd.read_csv("./data/ratings.csv")


In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [5]:
ratings = ratings.rename(columns={'movieId': 'movie_id'})
movies = movies.rename(columns={'movieId': 'movie_id'})
ratings = ratings.rename(columns={'userId': 'user_id'})


In [6]:
# calculate the number of ratings per movie
rating_count = ratings.groupby('movie_id')[['rating']].count()

In [7]:
# filter for movies with more than 20 ratings and extract the index
popular_movies = rating_count[rating_count['rating']>20].index

In [8]:
# filter the ratings matrix and only keep the popular movies
df = ratings[ratings['movie_id'].isin(popular_movies)].copy()

In [9]:
# need to remake user ids and movie ids since they are not sequential
user_ids = df['user_id'].unique()
user_id_map = {v:k for k,v in enumerate(user_ids)}
df['user_id'] = df['user_id'].map(user_id_map)

In [10]:
movie_ids = df['movie_id'].unique()
movie_id_map = {v:k for k,v in enumerate(movie_ids)}
df['movie_id'] = df['movie_id'].map(movie_id_map)

In [11]:
df.tail()

Unnamed: 0,user_id,movie_id,rating,timestamp
100803,609,808,4.0,1493847175
100808,609,643,4.0,1493846503
100829,609,809,5.0,1493845631
100830,609,644,4.0,1493879365
100834,609,810,5.0,1493846352


In [12]:
# filter out unpopular movies
movies = movies[movies['movie_id'].isin(movie_ids)]

In [13]:
# redefine movie ids
movies['movie_id'] = movies['movie_id'].map(movie_id_map)

In [14]:
movie_title = movies.sort_values('movie_id')['title']
movie_title

0                     Toy Story (1995)
2              Grumpier Old Men (1995)
5                          Heat (1995)
43         Seven (a.k.a. Se7en) (1995)
46          Usual Suspects, The (1995)
                     ...              
1435        Terms of Endearment (1983)
2968               Little Nicky (2000)
3158                   Joe Dirt (2001)
2249                  RoboCop 2 (1990)
1488    Poseidon Adventure, The (1972)
Name: title, Length: 1235, dtype: object

In [15]:
from scipy.sparse import csr_matrix
R = csr_matrix((df['rating'], (df['user_id'], df['movie_id'])))

In [16]:
R.shape

(610, 1235)

In [17]:
R

<610x1235 sparse matrix of type '<class 'numpy.float64'>'
	with 66658 stored elements in Compressed Sparse Row format>

In [18]:
type(R)

scipy.sparse._csr.csr_matrix

In [19]:
#R[:10,:10].toarray()

In [20]:
Rating = pd.DataFrame(R.todense(), columns=movie_title)
Rating

title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
0,4.0,4.0,4.0,5.0,5.0,3.0,5.0,4.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,2.5,0.0,0.0,3.0,4.5,4.0,0.0,3.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
606,4.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,2.5,2.0,0.0,4.5,4.5,3.0,0.0,4.0,0.0,3.0,...,0.0,0.5,0.0,0.0,4.5,0.0,2.5,1.0,1.5,0.0
608,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Neighborhood Based Collaborative Filtering Updated

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

# machine-learning libraries
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise
#from scipy.sparse import csr_matrix

# miscellaneous
import pickle

In [22]:
R.shape

(610, 1235)

In [23]:
movies

Unnamed: 0,movie_id,title,genres
0,0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,325,Jumanji (1995),Adventure|Children|Fantasy
2,1,Grumpier Old Men (1995),Comedy|Romance
4,326,Father of the Bride Part II (1995),Comedy
5,2,Heat (1995),Action|Crime|Thriller
...,...,...,...
9162,808,"Big Short, The (2015)",Drama
9223,643,Zootopia (2016),Action|Adventure|Animation|Children|Comedy
9392,809,Arrival (2016),Sci-Fi
9433,644,Rogue One: A Star Wars Story (2016),Action|Adventure|Fantasy|Sci-Fi


In [24]:
# initialize the unsupervised model NearestNeighbors
model = NearestNeighbors(metric='cosine')


In [25]:
# fit it to the Ratings matrix
model.fit(R)

In [26]:
with open('./distance_recommender.pkl', 'wb') as file:
    pickle.dump(model, file)

In [27]:
with open('./distance_recommender.pkl', 'rb') as file:
    model = pickle.load(file)

In [28]:
new_user_query = {"Toy Story (1995)": 4,
                 "Big Short, The (2015)":3,
                 "Arrival (2016)":3.5,
                 "Logan (2017)":5}

In [31]:
new_user_dataframe =  pd.DataFrame(new_user_query, columns=movie_title, index=['new_user'])
new_user_dataframe

title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
new_user,4,,,,,,,,,,...,,,,,,,,,,


In [32]:
# filling the missing values
new_user_dataframe_imputed = new_user_dataframe.fillna(0)
new_user_dataframe_imputed

title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
new_user,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# calculates the distances to all other users in the data!
similarity_scores, neighbor_ids = model.kneighbors(
    new_user_dataframe_imputed,
    n_neighbors=5,
    return_distance=True
)



In [34]:


# sklearn returns a list of predictions
# extract the first and only value of the list

neighbors_df = pd.DataFrame(
    data = {'neighbor_id': neighbor_ids[0], 'similarity_score': similarity_scores[0]}
)

neighbors_df

Unnamed: 0,neighbor_id,similarity_score
0,470,0.735897
1,566,0.775979
2,251,0.796163
3,278,0.844609
4,144,0.846016


In [36]:
# only look at ratings for users that are similar!
neighborhood = Rating.iloc[neighbor_ids[0]]
neighborhood

title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
470,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
566,3.5,0.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
251,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278,3.0,0.0,0.0,0.0,3.5,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
144,5.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
neighborhood_filtered = neighborhood.drop(new_user_query.keys(), axis=1)
neighborhood_filtered

title,Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),Billy Madison (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
566,0.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278,0.0,0.0,0.0,3.5,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
144,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# calculate the summed up rating for each movie
# summing up introduces a bias for popular movies
# averaging introduces bias for movies only seen by few users in the neighboorhood

df_score = neighborhood_filtered.sum()
df_score

title
Grumpier Old Men (1995)           0.0
Heat (1995)                       0.0
Seven (a.k.a. Se7en) (1995)       0.0
Usual Suspects, The (1995)        9.5
From Dusk Till Dawn (1996)        0.0
                                 ... 
Terms of Endearment (1983)        0.0
Little Nicky (2000)               0.0
Joe Dirt (2001)                   0.0
RoboCop 2 (1990)                  0.0
Poseidon Adventure, The (1972)    0.0
Length: 1231, dtype: float64

In [39]:
df_score_ranked = df_score.sort_values(ascending=False).index.tolist()
df_score_ranked

['Inception (2010)',
 'Pulp Fiction (1994)',
 'Finding Nemo (2003)',
 'Toy Story 3 (2010)',
 'Incredibles, The (2004)',
 'Usual Suspects, The (1995)',
 'Shaun of the Dead (2004)',
 'Grand Budapest Hotel, The (2014)',
 'Avengers, The (2012)',
 'There Will Be Blood (2007)',
 'Forrest Gump (1994)',
 'Scott Pilgrim vs. the World (2010)',
 'Mad Max: Fury Road (2015)',
 'Blade Runner (1982)',
 'Life Is Beautiful (La Vita è bella) (1997)',
 'The Lego Movie (2014)',
 'Ratatouille (2007)',
 'Wreck-It Ralph (2012)',
 'Interstellar (2014)',
 'Skyfall (2012)',
 'Prestige, The (2006)',
 'Lord of the Rings: The Two Towers, The (2002)',
 'Dark Knight, The (2008)',
 'Rogue One: A Star Wars Story (2016)',
 'Matrix, The (1999)',
 'Inglourious Basterds (2009)',
 'The Revenant (2015)',
 'Lion King, The (1994)',
 'Fight Club (1999)',
 'Shawshank Redemption, The (1994)',
 'WALL·E (2008)',
 'V for Vendetta (2006)',
 'Star Wars: Episode VII - The Force Awakens (2015)',
 'Lord of the Rings: The Fellowship of t

In [40]:
recommendations = df_score_ranked[:3]
recommendations

['Inception (2010)', 'Pulp Fiction (1994)', 'Finding Nemo (2003)']