In [1]:
import pandas as pd
import numpy as np
import os
from scipy.sparse import coo_matrix, csr_matrix
from collections import namedtuple
from sklearn.decomposition import NMF
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Load the four CSV inputs; paths are relative to the notebook's working directory.
MV_users, MV_movies, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/users.csv',
        'data/movies.csv',
        'data/train.csv',
        'data/test.csv',
    )
)

In [3]:
class RecSys():
    """Hold the ratings data together with ID lookups and a dense rating matrix.

    Parameters
    ----------
    data : object exposing ``users``, ``movies``, ``train`` and ``test`` tables
        ``users`` needs a ``uID`` column; ``movies`` needs ``mID``, ``title``
        and ``year`` columns (every other column is collected into
        ``self.genres``); ``train`` and ``test`` need ``uID``, ``mID`` and
        ``rating`` columns.

    Attributes set by ``__init__``
    ------------------------------
    allusers, allmovies : lists of the user / movie IDs in table order.
    genres : list of the movie-table column names other than mID/title/year.
    uid2idx, mid2idx : dicts mapping an ID to its row position in the
        users / movies table; these positions are the matrix indices.
    Mr : dense array of shape (#allusers, #allmovies) built from ``train``.
    Mm : placeholder, initialised to None.
    sim : zero-initialised array of shape (#allmovies, #allmovies).
    """

    def __init__(self,data):
        self.data=data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        self.genres = list(self.data.movies.columns.drop(['mID', 'title', 'year']))
        # Position in the movies/users table doubles as the column/row index of Mr.
        self.mid2idx = {mid: idx for idx, mid in enumerate(self.data.movies.mID)}
        self.uid2idx = {uid: idx for idx, uid in enumerate(self.data.users.uID)}
        self.Mr=self.rating_matrix()
        self.Mm=None
        self.sim=np.zeros((len(self.allmovies),len(self.allmovies)))

    def rating_matrix(self):
        """
        Convert the training ratings to a numpy array of shape (#allusers,#allmovies).

        Rows follow ``uid2idx`` and columns follow ``mid2idx``. Cells with no
        training rating are 0. A KeyError is raised if ``train`` contains a
        uID or mID that is absent from the users / movies tables.
        """
        ind_movie = [self.mid2idx[x] for x in self.data.train.mID]
        ind_user = [self.uid2idx[x] for x in self.data.train.uID]
        rating_train = list(self.data.train.rating)

        sparse_ratings = coo_matrix(
            (rating_train, (ind_user, ind_movie)),
            shape=(len(self.allusers), len(self.allmovies)),
        )
        # toarray() already returns a fresh ndarray, so no extra np.array() copy
        # is needed. Repeated (user, movie) pairs are summed by the COO-to-dense
        # conversion.
        return sparse_ratings.toarray()

    def rmse(self,yp):
        """
        Root-mean-squared error of predictions ``yp`` against the test ratings.

        ``yp`` is array-like (ndarray or list), ordered like ``self.data.test``.
        NaN predictions are scored as a rating of 3. The caller's object is
        never modified: the imputation happens on a private float copy.
        """
        # np.array(...) copies, so the NaN fill below cannot leak into the
        # caller's array; it also lets plain Python lists be passed in.
        yp = np.array(yp, dtype=float)
        yp[np.isnan(yp)]=3 #In case there is nan values in prediction, it will impute to 3.
        yt=np.array(self.data.test.rating)
        return np.sqrt(((yt-yp)**2).mean())

In [4]:
# Preview the first rows of the training ratings to confirm the columns loaded as expected.
train.head()

Unnamed: 0,uID,mID,rating
0,744,1210,5
1,3040,1584,4
2,1451,1293,5
3,5455,3176,2
4,2507,3074,5


In [5]:
# Preview the first rows of the held-out test ratings to confirm the columns loaded as expected.
test.head()

Unnamed: 0,uID,mID,rating
0,2233,440,4
1,4274,587,5
2,2498,454,3
3,2868,2336,5
4,1636,2686,5


In [6]:
# Bundle the four frames into one record so RecSys can reach them as
# data.users, data.movies, data.train and data.test.
Data = namedtuple('Data', ['users','movies','train','test'])
data = Data(
    users=MV_users,
    movies=MV_movies,
    train=train,
    test=test,
)

# Building the system also builds the dense user x movie rating matrix.
rating_system = RecSys(data)

In [7]:
# Sanity-check the sizes and show the (truncated) dense rating matrix.
print(f"Users {len(rating_system.allusers)}, Movies {len(rating_system.allmovies)} ")
print(
    "Rating matrix rows: users, column movie_rating",
    rating_system.Mr,
    rating_system.Mr.shape,
    sep="\n",
)

Users 6040, Movies 3883 
Rating matrix rows: users, column movie_rating
[[5 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [3 0 0 ... 0 0 0]]
(6040, 3883)


In [8]:
# Factorise the dense ratings matrix into non-negative factors, Mr ~= W @ H,
# using 18 latent components.
rs = RecSys(data)
ratingsMatrix = rs.Mr
model = NMF(
    n_components=18,
    random_state=42,
    init="nndsvda",
    solver="mu",
    beta_loss="kullback-leibler",
    max_iter=1000,
)
model.fit(ratingsMatrix)
# H: one row per latent component, one column per movie.
H = model.components_
# W: one row per user, one column per latent component.
# NOTE(review): transform() solves for W again with H held fixed; fit_transform()
# would return the W found during fitting in one pass - confirm which is intended.
W = model.transform(ratingsMatrix)

In [11]:
# Confirm the factor shapes: W has one row per user, H has one column per movie.
print(f"W shape{W.shape} H shape {H.shape}")

W shape(6040, 18) H shape (18, 3883)


In [12]:
# Reconstruct user data as predictions from NMF.
# inverse_transform maps the factor weights W back to the original
# (#users, #movies) rating space, giving a predicted score for every pair.
X = model.inverse_transform(W)
# Display the shape to confirm it matches the rating matrix.
X.shape

(6040, 3883)

In [13]:
# Adapt the predict method of RecSys() to make predictions from the reconstructed user data, rather than using the baseline / imputation methods.
# Translate every test (uID, mID) pair into its (row, column) position in X and
# gather all predictions with one fancy-indexing call instead of building a
# Series per row with .iloc. An ID missing from the lookup tables still raises
# KeyError, exactly as the per-row loop did.
n_test = len(rs.data.test)
test_rows = np.array([rs.uid2idx[uid] for uid in rs.data.test.uID], dtype=np.intp)
test_cols = np.array([rs.mid2idx[mid] for mid in rs.data.test.mID], dtype=np.intp)
# Kept as a list (one prediction per test row, in test order) so later cells
# that expect the original list-valued yhat keep working.
yhat = list(X[test_rows, test_cols])

In [14]:
# Score the NMF predictions against the held-out test ratings.
yhat = np.asarray(yhat, dtype=float)
# Neutral fallback rating for any NaN prediction (same value RecSys.rmse uses).
yhat[np.isnan(yhat)] = 3
# Ground-truth ratings, in the same order as yhat.
labs = np.array(rs.data.test.rating)
# Reuse the class's metric instead of repeating the formula inline, so every
# model in the notebook is scored the same way.
RMSE = rs.rmse(yhat)

# NOTE(review): Mr stores unrated (user, movie) pairs as 0 and NMF is fit on
# that matrix as-is, so the reconstruction is presumably pulled towards 0 for
# unseen pairs - a likely reason this RMSE is high. Confirm before comparing
# against other methods.
print("The RMSE of the predictions made using NMF was:", RMSE)

The RMSE of the predictions made using NMF was: 2.8850867946900705
