In [1]:
import numpy as np
from sklearn.decomposition import NMF
import pandas as pd

In [2]:
# decide on the movies
movies = ['Titanic', 'Tiffany', 'StarTrek', 'Terminator', 'StarWars']
users = ['Julius', 'Dina', 'Flann', 'Alex', 'Xaver']

In [3]:
# movie, ratings by users
data = [
    [2, 4, 2, 5, 2],
    [3, 5, 0, 5, 3],
    [2, 0, 4, 3, 5],
    [2, 3, 4, 3, 4],
    [5, 0, 3, 4, 3]
]

In [4]:
# build the user_movie_ratings_matrix
R = pd.DataFrame(data, index=users, columns=movies)
R

Unnamed: 0,Titanic,Tiffany,StarTrek,Terminator,StarWars
Julius,2,4,2,5,2
Dina,3,5,0,5,3
Flann,2,0,4,3,5
Alex,2,3,4,3,4
Xaver,5,0,3,4,3


In [5]:
#create a model
# model assumes R ~ PQ'
model = NMF(n_components=2)
model.fit(R)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=2, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [6]:
# create movie-genre matrix
Q = pd.DataFrame(model.components_, columns=movies, index=['feature1', 'feature2'])
Q

Unnamed: 0,Titanic,Tiffany,StarTrek,Terminator,StarWars
feature1,1.511426,0.0,2.089755,1.501122,2.112315
feature2,0.878604,2.264529,0.0,1.993976,0.727743


In [7]:
# create user-genre matrix
P = pd.DataFrame(model.transform(R), columns=['feature1', 'feature2'], index=users)
P

Unnamed: 0,feature1,feature2
Julius,0.636789,1.778866
Dina,0.365153,2.303797
Flann,1.978556,0.0
Alex,1.44152,0.864799
Xaver,1.785283,0.393376


In [8]:
# Build and print the reconstructed matrix!
Rhat = pd.DataFrame(np.dot(P, Q), index=users, columns=movies)
Rhat

Unnamed: 0,Titanic,Tiffany,StarTrek,Terminator,StarWars
Julius,2.525379,4.028293,1.330733,4.502915,2.639656
Dina,2.576029,5.217016,0.763081,5.141856,2.447891
Flann,2.99044,0.0,4.134696,2.970055,4.179333
Alex,2.938566,1.958362,3.012423,3.888286,3.674295
Xaver,3.043944,0.890812,3.730803,3.46431,4.057355


In [9]:
R

Unnamed: 0,Titanic,Tiffany,StarTrek,Terminator,StarWars
Julius,2,4,2,5,2
Dina,3,5,0,5,3
Flann,2,0,4,3,5
Alex,2,3,4,3,4
Xaver,5,0,3,4,3


In [10]:
# look at the reconstruction error
model.reconstruction_err_

3.818859426888038

In [11]:
# Add a new user
Stefan = [[5, 0, 0, 0, 5]]

In [12]:
Rhat = pd.concat([Rhat, pd.DataFrame(np.dot(model.transform(Stefan), Q), index=['Stefan'], columns=movies)])

In [13]:
R = pd.concat([R, pd.DataFrame(Stefan, index=['Stefan'], columns=movies)])

In [14]:
Rhat

Unnamed: 0,Titanic,Tiffany,StarTrek,Terminator,StarWars
Julius,2.525379,4.028293,1.330733,4.502915,2.639656
Dina,2.576029,5.217016,0.763081,5.141856,2.447891
Flann,2.99044,0.0,4.134696,2.970055,4.179333
Alex,2.938566,1.958362,3.012423,3.888286,3.674295
Xaver,3.043944,0.890812,3.730803,3.46431,4.057355
Stefan,2.05127,0.026172,2.822123,2.050246,2.861001


In [15]:
R

Unnamed: 0,Titanic,Tiffany,StarTrek,Terminator,StarWars
Julius,2,4,2,5,2
Dina,3,5,0,5,3
Flann,2,0,4,3,5
Alex,2,3,4,3,4
Xaver,5,0,3,4,3
Stefan,5,0,0,0,5


In [16]:
# Add a new movie
Moonlight = [[0, 0, 4, 0, 0, 4]]

In [17]:
model2 = NMF(n_components=2)
model2.fit(R.T)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=2, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [18]:
pd.DataFrame(model2.components_, columns=users + ['Stefan'])

Unnamed: 0,Julius,Dina,Flann,Alex,Xaver,Stefan
0,0.95596,1.038734,1.86153,1.54768,1.875835,1.720977
1,1.482024,1.658239,0.223974,0.845144,0.415935,0.0


In [19]:
model2.transform(R.T)

array([[2.03701712, 0.11199629],
       [0.        , 2.84770556],
       [1.46126377, 0.04085459],
       [0.89357627, 2.63932681],
       [2.4249332 , 0.        ]])

In [20]:
model2.transform(Moonlight)

array([[0.999726, 0.      ]])

### Imputation

Ideas for imputation could be:

- Write your own NMF algorithm to impute missing values
- Fill missing values with 0
- Fill missing values with user mean
- Fill missing values with movie mean
- Fill missing values using KNNImputer from sklearn:

In [46]:
from sklearn.impute import KNNImputer

In [47]:
imputer = KNNImputer(n_neighbors=2)

In [48]:
R

Unnamed: 0,Titanic,Tiffany,StarTrek,Terminator,StarWars
Julius,2,4.0,2.0,5,2
Dina,3,5.0,,5,3
Flann,2,,4.0,3,5
Alex,2,3.0,4.0,3,4
Xaver,5,0.0,3.0,4,3


In [49]:
imputer.fit_transform(R)

array([[2., 4., 2., 5., 2.],
       [3., 5., 3., 5., 3.],
       [2., 4., 4., 3., 5.],
       [2., 3., 4., 3., 4.],
       [5., 0., 3., 4., 3.]])

### Save a model

In [60]:
import pickle

binary = pickle.dumps(model)
open('nmf_model.bin', 'wb').write(binary)

670

In [61]:
binary = open('nmf_model.bin', 'rb').read()
nmf = pickle.loads(binary)

In [63]:
nmf.components_

array([[1.51142584, 0.        , 2.08975471, 1.50112247, 2.11231473],
       [0.87860437, 2.26452883, 0.        , 1.993976  , 0.72774299]])