In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix, csr_matrix
from scipy.spatial.distance import jaccard, cosine 
from pytest import approx

In [2]:
# Read the user table, the movie table, and the train/test rating splits
# (in that order) from the local data/ directory.
MV_users, MV_movies, train, test = map(
    pd.read_csv,
    ('data/users.csv', 'data/movies.csv', 'data/train.csv', 'data/test.csv'),
)

In [3]:
from collections import namedtuple
# Bundle the four tables into a single immutable record so the recommender
# classes can receive them as one argument.
Data = namedtuple(
    'Data',
    ['users', 'movies', 'train', 'test'],
)
data = Data(users=MV_users, movies=MV_movies, train=train, test=test)

In [4]:
class RecSys():
    """
    Base recommender: keeps the data tables, builds ID-to-index lookups and the
    dense user x movie rating matrix, and offers baseline / similarity-based
    predictors together with an RMSE scorer.

    `data` must expose `users`, `movies`, `train` and `test` DataFrames. The
    code reads the columns `uID` (users, train, test), `mID` (movies, train,
    test), `title` and `year` (movies) and `rating` (train, test).
    """

    def __init__(self, data):
        self.data = data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        # Every movie column that is not an identifier/title/year is a genre column.
        self.genres = list(self.data.movies.columns.drop(['mID', 'title', 'year']))
        # Row position of each movie / user in its table doubles as its matrix index.
        self.mid2idx = dict(zip(self.data.movies.mID, range(len(self.data.movies))))
        self.uid2idx = dict(zip(self.data.users.uID, range(len(self.data.users))))
        self.Mr = self.rating_matrix()
        self.Mm = None
        # Placeholder item-item similarity; subclasses overwrite it.
        self.sim = np.zeros((len(self.allmovies), len(self.allmovies)))

    def rating_matrix(self):
        """
        Convert the training ratings to a numpy array of shape (#allusers, #allmovies).

        Entries without a training rating are 0. Raises KeyError if the training
        data references a user or movie ID missing from the users/movies tables.
        """
        ind_movie = [self.mid2idx[x] for x in self.data.train.mID]
        ind_user = [self.uid2idx[x] for x in self.data.train.uID]
        # Use the ratings of *this instance's* training table, not a notebook global.
        rating_train = list(self.data.train.rating)
        shape = (len(self.allusers), len(self.allmovies))
        return coo_matrix((rating_train, (ind_user, ind_movie)), shape=shape).toarray()

    def predict_everything_to_3(self):
        """
        Predict a rating of 3 for every row of the test data.
        Returns a numpy array of shape (# of rows in test data,).
        """
        return np.ones(len(self.data.test.rating)) * 3

    def predict_to_user_average(self):
        """
        Predict each test row as the mean training rating of that row's user.

        Returns a numpy array of shape (# of rows in test data,). Users without
        any training rating get NaN (which `rmse` imputes to 3).
        """
        user_mean = self.data.train.groupby('uID')['rating'].mean()
        # Series.map looks each test uID up in user_mean's index; misses become NaN.
        return self.data.test['uID'].map(user_mean).to_numpy(dtype=float)

    def predict_from_sim(self, uid, mid):
        """
        Predict a user rating on a movie given userID and movieID.

        The prediction is the similarity-weighted average of the user's known
        ratings. Returns NaN when the similarities to the user's rated movies
        sum to zero (e.g. the user has no training ratings), since no weighted
        average is defined in that case.
        """
        index_user = self.uid2idx[uid]
        index_movie = self.mid2idx[mid]

        similar_movie = self.sim[index_movie]
        rating_user = self.Mr[index_user]

        # Only movies the user actually rated (non-zero entries) contribute weight.
        weight_total = np.sum(similar_movie[np.nonzero(rating_user)])
        if weight_total == 0:
            return np.nan
        return np.dot(similar_movie, rating_user) / weight_total

    def predict(self):
        """
        Predict ratings in the test data. Returns predicted rating in a numpy array of size (# of rows in testdata,)
        """
        pred = [
            self.predict_from_sim(uid, mid)
            for uid, mid in zip(self.data.test.uID, self.data.test.mID)
        ]
        return np.array(pred)

    def rmse(self, yp):
        """
        Root-mean-squared error of predictions `yp` against the test ratings.

        NaN predictions are imputed to 3 before scoring. `yp` may be any
        array-like of length (# of rows in test data); it is copied first so the
        caller's object is left untouched.
        """
        yp = np.array(yp, dtype=float)  # copy: never write into the caller's data
        yp[np.isnan(yp)] = 3
        yt = np.array(self.data.test.rating)
        return np.sqrt(((yt - yp) ** 2).mean())

    
class ContentBased(RecSys):
    """
    Content-based recommender: item-item similarity is derived from the movies'
    genre columns instead of from the ratings.
    """

    def __init__(self, data):
        # The base initialiser already stores `data` and builds the rating matrix.
        super().__init__(data)
        self.Mm = self.calc_movie_feature_matrix()

    def calc_movie_feature_matrix(self):
        """
        Create movie feature matrix in a numpy array of shape (#allmovies, #genres)
        """
        # Read from this instance's data rather than a notebook-level global, so
        # the class works with whatever dataset it was constructed with.
        return self.data.movies.drop(['mID', 'title', 'year'], axis=1).to_numpy()

    def calc_item_item_similarity(self):
        """
        Create item-item similarity using Jaccard similarity.

        Stores a (#allmovies, #allmovies) matrix in `self.sim`, computed as
        1 - Jaccard distance between the rows of `self.Mm`.
        """
        from scipy.spatial.distance import pdist, squareform

        # pdist returns the condensed pairwise distances; squareform expands
        # them into a full symmetric matrix with a zero diagonal.
        jaccard_distance = squareform(pdist(self.Mm, metric='jaccard'))
        self.sim = 1 - jaccard_distance
        
        
                
class Collaborative(RecSys):
    """
    Item-item collaborative-filtering recommender: similarity between movies is
    computed from the user x movie rating matrix `self.Mr`.
    """

    def __init__(self, data):
        super().__init__(data)

    def calc_item_item_similarity(self, simfunction, *X):
        """
        Create item-item similarity using similarity function.
        X is an optional transformed matrix of Mr

        The result of `simfunction` is stored in `self.sim`.
        """
        if not X:
            self.sim = simfunction()
        else:
            # *X arrives as a tuple (X,), so X[0] is the actual transformed matrix.
            self.sim = simfunction(X[0])

    def cossim(self):
        """
        Calculates item-item similarity for all pairs of items using cosine similarity (values from 0 to 1) on utility matrix
        Returns a cosine similarity matrix of size (#all movies, #all movies)

        Each user's ratings are centred on that user's mean rating; unrated
        entries end up as 0 after centring. The raw cosine values are then
        min-max rescaled so the smallest becomes 0 and the largest 1.
        """
        from sklearn.metrics.pairwise import cosine_similarity

        rm = self.Mr.astype('float64')  # astype returns a copy; self.Mr is untouched
        rated = rm != 0

        # Per-user mean over rated movies only. A user with no ratings gets a
        # mean of 0 instead of 0/0 = NaN, which would otherwise poison the whole
        # row and make cosine_similarity reject the input.
        rating_sum = rm.sum(axis=1)
        rating_count = rated.sum(axis=1)
        user_mean = np.divide(
            rating_sum,
            rating_count,
            out=np.zeros_like(rating_sum),
            where=rating_count > 0,
        )

        # Same result as filling unrated cells with the user mean and then
        # subtracting that mean: rated cells become (rating - mean), others 0.
        centered = np.where(rated, rm - user_mean[:, None], 0.0)

        cs = cosine_similarity(csr_matrix(centered.T))
        cs = (cs - np.min(cs)) / (np.max(cs) - np.min(cs))

        return cs

    def jacsim(self, Xr):
        """
        Calculates item-item similarity for all pairs of items using jaccard similarity (values from 0 to 1)
        Xr is the transformed rating matrix.

        Movies are the columns of Xr, hence the transpose before computing
        pairwise distances. Returns 1 - Jaccard distance as a square matrix.
        """
        from scipy.spatial.distance import pdist, squareform

        jaccard_distance = squareform(pdist(Xr.T, metric='jaccard'))
        return 1 - jaccard_distance
    
    

In [5]:
# Base recommender instance; gives access to the rating matrix and ID-to-index maps.
rs = RecSys(data=data)

In [6]:
# Show the movie table's column names (identifier/title/year followed by the genre columns).
rs.data.movies.columns

Index(['mID', 'title', 'year', 'Doc', 'Com', 'Hor', 'Adv', 'Wes', 'Dra', 'Ani',
       'War', 'Chi', 'Cri', 'Thr', 'Sci', 'Mys', 'Rom', 'Fil', 'Fan', 'Act',
       'Mus'],
      dtype='object')

There are 18 movie genres: every column above except `mID`, `title`, and `year`.

In [7]:
from sklearn.decomposition import NMF

# Factorise the user x movie rating matrix into user factors (nmf_features)
# and movie factors (components).
# NOTE(review): 19 components are requested while the movie table has 18 genre
# columns -- confirm whether 19 is intentional.
# NOTE(review): rs.Mr holds 0 for unrated entries, so the factorisation fits
# them as actual zero ratings; this probably inflates the RMSE -- confirm.
model = NMF(n_components=19, random_state=42)
nmf_features = model.fit_transform(rs.Mr)
components = model.components_

In [12]:
# Predicted rating for each test row = dot product of the user's latent factors
# with the movie's latent factors.
# The uID / mID columns are addressed by name (as the RecSys class does) rather
# than by position, so the result does not depend on the column order of the
# test table, and the per-row scalar .iloc lookups are avoided.
preds = [
    np.dot(nmf_features[rs.uid2idx[uid]], components[:, rs.mid2idx[mid]])
    for uid, mid in zip(rs.data.test['uID'], rs.data.test['mID'])
]

In [18]:
# Root-mean-squared error of the NMF predictions against the held-out test ratings.
rmse = np.sqrt(np.square(rs.data.test.rating.to_numpy() - preds).mean())

In [19]:
# Display the NMF test RMSE computed in the previous cell.
rmse

2.857168439056352