## Creating the matrix

In [7]:
import pandas as pd
import numpy as np

from sklearn.decomposition import NMF 


In [8]:
movies = pd.read_csv("./data/movies.csv")
ratings = pd.read_csv("./data/ratings.csv")


In [9]:
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [10]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [11]:
ratings = ratings.rename(columns={'movieId': 'movie_id'})
movies = movies.rename(columns={'movieId': 'movie_id'})
ratings = ratings.rename(columns={'userId': 'user_id'})


In [12]:
movies.tail()

Unnamed: 0,movie_id,title,genres
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation
9741,193609,Andrew Dice Clay: Dice Rules (1991),Comedy


In [13]:
# calculate the number of ratings per movie
rating_count = ratings.groupby('movie_id')[['rating']].count()

In [14]:
# filter for movies with more than 20 ratings and extract the index
popular_movies = rating_count[rating_count['rating']>20].index
popular_movies

Index([     1,      2,      3,      5,      6,      7,     10,     11,     16,
           17,
       ...
       122920, 122922, 134130, 134853, 139385, 148626, 152081, 164179, 166528,
       168252],
      dtype='int64', name='movie_id', length=1235)

In [15]:
# filter the ratings matrix and only keep the popular movies
df = ratings[ratings['movie_id'].isin(popular_movies)].copy()

In [16]:
# need to remake user ids and movie ids since they are not sequential
user_ids = df['user_id'].unique()
user_id_map = {v:k for k,v in enumerate(user_ids)}
df['user_id'] = df['user_id'].map(user_id_map)

In [17]:
movie_ids = df['movie_id'].unique()
movie_id_map = {v:k for k,v in enumerate(movie_ids)}
df['movie_id'] = df['movie_id'].map(movie_id_map)

In [18]:
df.shape

(66658, 4)

In [19]:
# filter out unpopular movies
movies = movies[movies['movie_id'].isin(movie_ids)]

In [20]:
# redefine movie ids
movies['movie_id'] = movies['movie_id'].map(movie_id_map)

In [21]:
movie_title = movies.sort_values('movie_id')['title']
movie_title

0                     Toy Story (1995)
2              Grumpier Old Men (1995)
5                          Heat (1995)
43         Seven (a.k.a. Se7en) (1995)
46          Usual Suspects, The (1995)
                     ...              
1435        Terms of Endearment (1983)
2968               Little Nicky (2000)
3158                   Joe Dirt (2001)
2249                  RoboCop 2 (1990)
1488    Poseidon Adventure, The (1972)
Name: title, Length: 1235, dtype: object

In [22]:
from scipy.sparse import csr_matrix
R = csr_matrix((df['rating'], (df['user_id'], df['movie_id'])))

In [23]:
R.shape

(610, 1235)

In [24]:
R

<610x1235 sparse matrix of type '<class 'numpy.float64'>'
	with 66658 stored elements in Compressed Sparse Row format>

In [25]:
type(R)

scipy.sparse._csr.csr_matrix

In [26]:
#R[:10,:10].toarray()

In [28]:
R_df = pd.DataFrame(R.todense(), columns=movie_title)
R_df


title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
0,4.0,4.0,4.0,5.0,5.0,3.0,5.0,4.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,2.5,0.0,0.0,3.0,4.5,4.0,0.0,3.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
606,4.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,2.5,2.0,0.0,4.5,4.5,3.0,0.0,4.0,0.0,3.0,...,0.0,0.5,0.0,0.0,4.5,0.0,2.5,1.0,1.5,0.0
608,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Non-Negative Matrix Factorization for Recommender Systems

In [29]:
nmf_model = NMF(n_components = 1000, max_iter = 1000, verbose = 2)

In [30]:
nmf_model.fit(R_df)

violation: 1.0
violation: 0.20437059427541024
violation: 0.07837163599197207
violation: 0.03959409904530152
violation: 0.02278357226825259
violation: 0.014801217380561208
violation: 0.010109630300460165
violation: 0.007415061044807685
violation: 0.00569186691402991
violation: 0.004342664244371478
violation: 0.0034330231702382375
violation: 0.0027889329219550063
violation: 0.002312335216762691
violation: 0.0019565652379100076
violation: 0.001711916203084992
violation: 0.0015358778037237704
violation: 0.0013688348522457494
violation: 0.0012124852059685306
violation: 0.0010695711861092862
violation: 0.0009555049468149999
violation: 0.0008548971827635442
violation: 0.0007658083408439466
violation: 0.0006901706438162434
violation: 0.0006258227267669954
violation: 0.0005704635128466077
violation: 0.0005224075149461674
violation: 0.00047688531371153946
violation: 0.00043604914292253875
violation: 0.0004014956316747097
violation: 0.00036978782407514016
violation: 0.0003422452310935086
violatio

In [32]:
Q_matrix = nmf_model.components_
Q_matrix

array([[0.00700934, 0.0002379 , 0.        , ..., 0.00289633, 0.        ,
        0.        ],
       [0.00031998, 0.        , 0.00152697, ..., 0.        , 0.        ,
        0.        ],
       [0.0305277 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [33]:
Q_matrix.shape

(1000, 1235)

1. *Get the user-feature matrix/dataframe $P$ by using the model method transform on the full imputed user/movie dataframe/matrix*

In [47]:
# to dataframe for better visualization
Q = pd.DataFrame(Q_matrix, columns=movie_title, index= nmf_model.get_feature_names_out())
Q

title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
nmf0,0.007009,0.000238,0.000000,0.031149,7.943170e-03,0.000000,2.232522e-07,0.000000e+00,0.0,0.046818,...,0.000000,0.0,7.127996e-07,0.000009,0.000035,0.000000,0.0,0.002896,0.0,0.0
nmf1,0.000320,0.000000,0.001527,0.000000,2.426287e-02,0.000000,0.000000e+00,0.000000e+00,0.0,0.035491,...,0.000037,0.0,0.000000e+00,0.000000,0.000143,0.000000,0.0,0.000000,0.0,0.0
nmf2,0.030528,0.000000,0.000000,0.120303,6.049743e-02,0.000005,0.000000e+00,0.000000e+00,0.0,0.000000,...,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.009237,0.0,0.000000,0.0,0.0
nmf3,0.058329,0.002082,0.000000,0.176844,4.324464e-05,0.000000,0.000000e+00,0.000000e+00,0.0,0.066567,...,0.000000,0.0,0.000000e+00,0.000024,0.000045,0.000000,0.0,0.052232,0.0,0.0
nmf4,0.095930,0.000000,0.000000,0.014889,1.804305e-04,0.000000,0.000000e+00,0.000000e+00,0.0,0.032615,...,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nmf995,0.000000,0.000000,0.000000,0.000000,3.668177e-07,0.000000,0.000000e+00,1.692703e-07,0.0,0.000000,...,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
nmf996,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.0,0.000000,...,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
nmf997,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.0,0.000000,...,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
nmf998,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.0,0.000000,...,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0


In [34]:
P_matrix = nmf_model.transform(R_df)
P_matrix

violation: 1.0
violation: 1.0589401013675224
violation: 0.6885528938576028
violation: 0.3097049874009922
violation: 0.13950866937895723
violation: 0.06832424314522664
violation: 0.03341310819878629
violation: 0.017311249450905578
violation: 0.011278218418327582
violation: 0.007741542926395385
violation: 0.005647929170916368
violation: 0.004332011516617054
violation: 0.0029550812251183394
violation: 0.0024061036350385033
violation: 0.002147630527771599
violation: 0.0015777510873401428
violation: 0.0012818861674331612
violation: 0.0010654446381756888
violation: 0.000875176340629976
violation: 0.0007307716404220853
violation: 0.0006252599047055455
violation: 0.0005099441288182714
violation: 0.00042395102157727495
violation: 0.00036882234504288797
violation: 0.0003204236849114726
violation: 0.0002824896964703272
violation: 0.0002494944381050829
violation: 0.00023273787344708823
violation: 0.00021237841076320925
violation: 0.00018641706092731658
violation: 0.00017264070938902278
violation: 

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.08874513],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.30092311, 0.        , 0.        , ..., 0.        , 0.07261369,
        0.00942916],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.30524661],
       [0.        , 0.        , 0.        , ..., 0.        , 0.64080725,
        0.        ]])

In [35]:
R_hat_matrix = np.dot(P_matrix, Q_matrix)
R_hat_matrix

array([[3.99991710e+00, 3.99881314e+00, 4.00248681e+00, ...,
        1.07926597e-06, 1.69527102e-09, 1.48846406e-04],
       [3.08309074e-05, 1.74984211e-07, 7.58619225e-06, ...,
        9.53387016e-11, 5.31395250e-18, 0.00000000e+00],
       [3.83249069e-06, 1.39431535e-11, 2.12490132e-10, ...,
        2.40224174e-13, 7.12695398e-08, 5.42056806e-09],
       ...,
       [2.49988094e+00, 1.99783356e+00, 1.00079776e-01, ...,
        1.00031095e+00, 1.50048219e+00, 0.00000000e+00],
       [3.00066369e+00, 4.62323361e-03, 2.53411351e-03, ...,
        3.05207123e-10, 5.09093146e-05, 0.00000000e+00],
       [4.99958540e+00, 2.24166621e-04, 4.99625531e+00, ...,
        3.00001895e+00, 1.31143964e-04, 1.59053275e-16]])

In [36]:
# This is the sum or error so it is fine.
nmf_model.reconstruction_err_

6.734862197642844

In [37]:
import pickle

with open('nmf_model1.pkl',mode='wb') as file:
    pickle.dump(nmf_model,file)

In [38]:
with open('nmf_model1.pkl','rb') as file:
    loaded_model = pickle.load(file)

In [39]:
loaded_model

In [40]:
new_user_query = {"Toy Story (1995)": 5,
                 "Grumpier Old Men (1995)":2,
                 "Heat (1995)":3.5}


In [41]:
new_user_dataframe =  pd.DataFrame(new_user_query, columns=movie_title, index=["new_user"])
new_user_dataframe

title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
new_user,5,2,3.5,,,,,,,,...,,,,,,,,,,


In [42]:
new_user_dataframe_imputed = new_user_dataframe.fillna(0)
new_user_dataframe_imputed

title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
new_user,5,2,3.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
P_new_user_matrix = loaded_model.transform(new_user_dataframe_imputed)
P_new_user_matrix

violation: 1.0
violation: 2.437198153353398
violation: 1.4315237694339609
violation: 0.21879880887531608
violation: 0.1556572184061561
violation: 0.06178911069071979
violation: 0.016458718854186236
violation: 0.009224578270520093
violation: 0.00233012395092348
violation: 0.0009454354827115188
violation: 0.0007042513193686026
violation: 0.00018168095090319352
violation: 2.6945226310217965e-05
Converged at iteration 14


array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.000000

In [44]:
# get as dataframe for a better visualizarion
P_new_user = pd.DataFrame(P_new_user_matrix, 
                         columns = loaded_model.get_feature_names_out(),
                         index = ['new_user'])

In [45]:
P_new_user

Unnamed: 0,nmf0,nmf1,nmf2,nmf3,nmf4,nmf5,nmf6,nmf7,nmf8,nmf9,...,nmf990,nmf991,nmf992,nmf993,nmf994,nmf995,nmf996,nmf997,nmf998,nmf999
new_user,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.117472,0.000572,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
R_hat_new_user_matrix = np.dot(P_new_user, Q)
R_hat_new_user_matrix

array([[4.99858871, 0.45253305, 1.61357773, ..., 0.        , 0.        ,
        0.        ]])

In [50]:
# get as dataframe for a better visualizarion
R_hat_new_user = pd.DataFrame(data=R_hat_new_user_matrix,
                         columns=movie_title,
                         index = ['new_user'])
R_hat_new_user

title,Toy Story (1995),Grumpier Old Men (1995),Heat (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
new_user,4.998589,0.452533,1.613578,0.110271,0.053865,0.008155,0.0,0.10386,0.0,0.051461,...,1.342441e-11,0.0,0.000277,0.0,0.0,0.003076,0.0,0.0,0.0,0.0


In [51]:
new_user_query.keys()

dict_keys(['Toy Story (1995)', 'Grumpier Old Men (1995)', 'Heat (1995)'])

In [52]:
R_hat_new_user_filtered=R_hat_new_user.drop(new_user_query.keys(), axis=1)
R_hat_new_user_filtered

title,Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",From Dusk Till Dawn (1996),Bottle Rocket (1996),Braveheart (1995),Rob Roy (1995),Desperado (1995),Billy Madison (1995),Clerks (1994),Dumb & Dumber (Dumb and Dumber) (1994),...,Freaky Friday (2003),Weekend at Bernie's (1989),"Darjeeling Limited, The (2007)","American Werewolf in London, An (1981)",eXistenZ (1999),Terms of Endearment (1983),Little Nicky (2000),Joe Dirt (2001),RoboCop 2 (1990),"Poseidon Adventure, The (1972)"
new_user,0.110271,0.053865,0.008155,0.0,0.10386,0.0,0.051461,0.011358,0.102705,0.071684,...,1.342441e-11,0.0,0.000277,0.0,0.0,0.003076,0.0,0.0,0.0,0.0


In [53]:
ranked = R_hat_new_user_filtered.T.sort_values(by=["new_user"], ascending=False).index.to_list()
ranked

['Rock, The (1996)',
 'Trainspotting (1996)',
 'Broken Arrow (1996)',
 'Independence Day (a.k.a. ID4) (1996)',
 'Mulholland Falls (1996)',
 'Father of the Bride Part II (1995)',
 'Silence of the Lambs, The (1991)',
 'Executive Decision (1996)',
 'Birdcage, The (1996)',
 'Terminator 2: Judgment Day (1991)',
 'Willy Wonka & the Chocolate Factory (1971)',
 'Craft, The (1996)',
 'Goodfellas (1990)',
 'Casino (1995)',
 'Fugitive, The (1993)',
 'Hunt for Red October, The (1990)',
 'Indiana Jones and the Last Crusade (1989)',
 '101 Dalmatians (1996)',
 'Untouchables, The (1987)',
 "Mr. Holland's Opus (1995)",
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)',
 'Get Shorty (1995)',
 'Ransom (1996)',
 'Jurassic Park (1993)',
 'Clueless (1995)',
 'Eraser (1996)',
 'Fargo (1996)',
 'Cape Fear (1991)',
 'Sleepless in Seattle (1993)',
 'Sabrina (1995)',
 'Glory (1989)',
 'Terminator, The (1984)',
 'GoldenEye (1995)',
 'Heavy Metal (1981)',
 'Courage Under Fire (1996)',
 'Time to Kill, A (1996)',
 'Strip

In [54]:
recommendation = ranked[:3]
recommendation

['Rock, The (1996)', 'Trainspotting (1996)', 'Broken Arrow (1996)']