### Non Negative Matrix Factorization [ NMF ] for recommender Systems

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise
from scipy.sparse import csr_matrix
import sklearn
import pickle


Bad key "text.kerning_factor" on line 4 in
/home/seun/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


#### 0. candiate generation

### 1. Load in the database that contains the movie infomation

In [2]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')

### Preprocessing
##### filter out movies rated by less than 20 users
##### filter out movies with an average rating lower than 2
##### create a sparse user item matrix

In [3]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# calculate the number of ratings per movie
rating_per_movie=ratings.groupby('movieId')['userId'].count()
rating_per_movie = pd.DataFrame(rating_per_movie)
rating_per_movie.head(5)

Unnamed: 0_level_0,userId
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49


In [5]:
# filter the ratings matrix and only keep the popular movies
popular_movies=rating_per_movie.loc[rating_per_movie['userId'] > 19]
popular_movies.head(5)

Unnamed: 0_level_0,userId
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
5,49
6,102


In [6]:
# filter the rating matrix and only keep the popular movies
ratings = ratings.set_index('movieId').loc[popular_movies.index]
ratings = ratings.reset_index()
ratings.head(5)

Unnamed: 0,movieId,userId,rating,timestamp
0,1,1,4.0,964982703
1,1,5,4.0,847434962
2,1,7,4.5,1106635946
3,1,15,2.5,1510577970
4,1,17,4.5,1305696483


### User rating 

In [7]:
df = ratings.merge(movies, on='movieId', how='outer')
df.head(5)

Unnamed: 0,movieId,userId,rating,timestamp,title,genres
0,1,1.0,4.0,964982700.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,5.0,4.0,847435000.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,1,7.0,4.5,1106636000.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,1,15.0,2.5,1510578000.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,1,17.0,4.5,1305696000.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [8]:
df.drop(['timestamp'],axis=1)
comedy_movie=df[df["genres"] == "Comedy|Drama"]
comedy_movie = comedy_movie.dropna()
comedy_movie=comedy_movie[['movieId','userId','rating','genres']]
comedy_movie.head(4)

Unnamed: 0,movieId,userId,rating,genres
5448,235,1.0,4.0,Comedy|Drama
5449,235,4.0,2.0,Comedy|Drama
5450,235,8.0,3.0,Comedy|Drama
5451,235,18.0,4.5,Comedy|Drama


In [9]:
#pd.set_option('display.max_rows', 500)
#values, counts = np.unique(comedy_movie['userId'], return_counts=True)
#pd.DataFrame (counts, columns = ['count'])
#pd.DataFrame (values, columns = ['values'])

In [10]:
comedy_movie=comedy_movie[comedy_movie['userId']==474]
comedy_movie=comedy_movie[comedy_movie.rating > 2]

# transform to dictionary
comedy_movie_dict=comedy_movie.to_dict()
query=comedy_movie_dict['rating']
query

# make a list of rated movie
comedy_movie_movies_rated=list(query.keys())
comedy_movie.head(5)

Unnamed: 0,movieId,userId,rating,genres
5502,235,474.0,4.0,Comedy|Drama
6263,272,474.0,3.5,Comedy|Drama
7179,308,474.0,4.0,Comedy|Drama
8367,345,474.0,3.5,Comedy|Drama
9624,371,474.0,3.0,Comedy|Drama


### Training data 

In [11]:
#df3 = df[~((df['userId'] == 474.0) & (df['genres'] == 'Comedy|Drama'))]#
#df3 = df3[df3['userId'].notnull()]
#df3=df3[['movieId','userId','rating','timestamp']]

df3 = ratings[~((ratings['userId'] == 474.0) & (ratings['rating'] == '4|5'))]
df3.shape

(67898, 4)

In [12]:
# Initialize a sparse user-item rating matrix 
# (data, (row_ind, col_ind)
R=csr_matrix((df3['rating'],(df3['userId'],df3['movieId'])))
R

<611x168253 sparse matrix of type '<class 'numpy.float64'>'
	with 67898 stored elements in Compressed Sparse Row format>

### 2. Training the model
#### initialize the model,  AND fit it on the user item matrix
-optionally, tune the number of components (hidden features): what happens if you set the number of components to a really low number?
-decrease the tol to train for a longer time

In [13]:
# initialize the unsupervised model
# 55 hidden features, F=55
model = NMF(n_components=400, init='nndsvd', max_iter=20000, tol=0.01, verbose=0)

# fit it to the user-item rating matrix
model.fit(R)

NMF(init='nndsvd', max_iter=20000, n_components=400, tol=0.01)

### 3. Model inspection

In [15]:
R

<611x168253 sparse matrix of type '<class 'numpy.float64'>'
	with 67898 stored elements in Compressed Sparse Row format>

In [16]:
model.components_.shape

(400, 168253)

In [17]:
# user-'genre' matrix [611x55]
P=model.transform(R)

# movie-'genre' matrix [55x168253]
Q=model.components_

P.shape, Q.shape

((611, 400), (400, 168253))

In [18]:
# R -> encoding -> P -> decoding -> Rhat
R_hat = model.inverse_transform(model.transform(R))

### 4. Error reconstruction ¶


In [19]:
R.shape, R_hat.shape

((611, 168253), (611, 168253))

In [20]:
model.reconstruction_err_

297.86588233651366

### 5. Model deployment: Make recommendations for a new user

##### Save the trained model on the hard drive

In [None]:
with open('./NMF_recommender.pkl', 'wb') as file:
    pickle.dump(model, file)

#### Read the model from hard drive

In [None]:
with open('./NMF_recommender.pkl', 'rb') as file:
    model = pickle.load(file)

In [21]:
model.reconstruction_err_

297.86588233651366

#### Construct a user vector to test

In [22]:
list(query.values())

[4.0,
 3.5,
 4.0,
 3.5,
 3.0,
 2.5,
 4.0,
 4.5,
 4.0,
 3.5,
 3.0,
 3.5,
 3.0,
 4.0,
 3.5,
 4.0,
 3.0,
 4.0,
 4.0,
 4.0,
 3.5,
 4.0,
 4.0,
 4.0,
 3.5]

In [23]:
data = list(query.values())   # the ratings of the new user
row_ind = [0]*len(data)       # we use just a single row 0 for this user 
col_ind = list(query.keys())  # the columns (=movieId) of the ratings
#data, row_ind, col_ind


In [24]:
# new user vector: needs to have the same format as the training data

user_vec = csr_matrix((data, (row_ind, col_ind)), shape=(1, R.shape[1]))
user_vec

<1x168253 sparse matrix of type '<class 'numpy.float64'>'
	with 25 stored elements in Compressed Sparse Row format>

#### Calculate the score
* Transform the user vector to its dense representation (encoding), 
* Inverse transform the dense vector into the sparse representation (decoding)

In [25]:
# user_vec -> encoding -> p_user_vec -> decoding -> user_vec_hat

scores = model.inverse_transform(model.transform(user_vec))

# convert to a pandas series
scores = pd.Series(scores[0])
scores

0         0.000000
1         0.020578
2         0.017790
3         0.001466
4         0.000000
            ...   
168248    0.000000
168249    0.000000
168250    0.000000
168251    0.000000
168252    0.004075
Length: 168253, dtype: float64

In [26]:
# sort the scores from high to low 
scores = scores.sort_values(ascending=False)
scores

5502     0.182906
5445     0.157510
2571     0.153142
5349     0.145997
1097     0.138374
           ...   
56762    0.000000
56763    0.000000
56764    0.000000
56765    0.000000
84126    0.000000
Length: 168253, dtype: float64

In [29]:
# get the movieIds of the top 10 entries
recommendations = scores.head(15).index
recommendations

Int64Index([5502, 5445, 2571, 5349, 1097, 5378,  480, 3793, 3527, 1214, 1200,
            6333, 1527, 5218, 1356],
           dtype='int64')

In [28]:
movies.set_index('movieId').loc[recommendations]

Unnamed: 0,title,genres
5502,Signs (2002),Horror|Sci-Fi|Thriller
5445,Minority Report (2002),Action|Crime|Mystery|Sci-Fi|Thriller
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
5349,Spider-Man (2002),Action|Adventure|Sci-Fi|Thriller
1097,E.T. the Extra-Terrestrial (1982),Children|Drama|Sci-Fi
5378,Star Wars: Episode II - Attack of the Clones (...,Action|Adventure|Sci-Fi|IMAX
480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
3793,X-Men (2000),Action|Adventure|Sci-Fi
3527,Predator (1987),Action|Sci-Fi|Thriller
1214,Alien (1979),Horror|Sci-Fi


## Make the model a function file

In [30]:
# collaborative filtering = look at ratings only!
def recommend_nmf(query, model, ratings, k=10):
    """
    Filters and recommends the top k movies for any given input query based on a trained NMF model. 
    Returns a list of k movie ids.
    """
    R = csr_matrix((ratings['rating'], (ratings['userId'], ratings['movieId'])))
    model.fit(R)
    R_hat = model.inverse_transform(model.transform(R))
    
    data = list(query.values())   # the ratings of the new user
    row_ind = [0]*len(data) 
    col_ind = list(query.keys()) 
    
    # new user vector: needs to have the same format as the training data

    user_vec = csr_matrix((data, (row_ind, col_ind)), shape=(1, R.shape[1]))
    user_vec
    
   
    # 2. scoring
    
    scores = model.inverse_transform(model.transform(user_vec))

    # convert to a pandas series
    scores = pd.Series(scores[0])
    scores[comedy_movie['movieId']]=0
    scores = scores.sort_values(ascending=False)
    
    
    recommendations = scores.head(k).index
    
    return movies.set_index('movieId').loc[recommendations]

 

In [31]:
query

{5502: 4.0,
 6263: 3.5,
 7179: 4.0,
 8367: 3.5,
 9624: 3.0,
 11787: 2.5,
 12971: 4.0,
 19056: 4.5,
 25304: 4.0,
 28884: 3.5,
 31141: 3.0,
 31442: 3.5,
 35498: 3.0,
 41902: 4.0,
 42291: 3.5,
 42659: 4.0,
 42816: 3.0,
 43524: 4.0,
 48388: 4.0,
 49653: 4.0,
 52014: 3.5,
 55060: 4.0,
 58298: 4.0,
 59076: 4.0,
 59411: 3.5}

In [32]:
recommend_nmf(query, model, ratings, 10)

Unnamed: 0,title,genres
5502,Signs (2002),Horror|Sci-Fi|Thriller
5349,Spider-Man (2002),Action|Adventure|Sci-Fi|Thriller
5445,Minority Report (2002),Action|Crime|Mystery|Sci-Fi|Thriller
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
6333,X2: X-Men United (2003),Action|Adventure|Sci-Fi|Thriller
3793,X-Men (2000),Action|Adventure|Sci-Fi
1527,"Fifth Element, The (1997)",Action|Adventure|Comedy|Sci-Fi
1097,E.T. the Extra-Terrestrial (1982),Children|Drama|Sci-Fi
5378,Star Wars: Episode II - Attack of the Clones (...,Action|Adventure|Sci-Fi|IMAX
1214,Alien (1979),Horror|Sci-Fi
