In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import math
#from IPython.core import display
from sklearn.decomposition import NMF
from scipy import sparse

from tqdm import tqdm
from sklearn.cluster import KMeans
from kneed import KneeLocator
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold
import time

In [2]:
## Read in data

path = '/Users/ving2000/Downloads/CDS 403/Final_project/uci-restaurant-consumer-data/'

users = pd.read_csv('cleaned_userprofile.csv').set_index('userID')
usercuisine = pd.read_csv('user_cuisine.csv')

restaurants = pd.read_csv('cleaned_restaurants.csv')
rescuisine = pd.read_csv('restaurant_cuisine.csv')

ratings = pd.read_csv(os.path.join(path, 'rating_final.csv'))

In [3]:
ucuisine = pd.get_dummies(usercuisine, columns = ['category'], prefix = '', prefix_sep = '').groupby('userID', as_index = False).max()
users = users.merge(ucuisine.drop('Rcuisine', axis = 1), on = 'userID', how = 'left').set_index('userID')

<br><br><br><br>

In [4]:
def GetRMatrix (actual_df):
    """
    Transforms actual_df to a userID x placeID matrix 
    actual_df : ratings dataframe
    """
   
    ratings_df = actual_df.copy()
    ratings_df['rating'] = ratings_df['rating'] + 1
    df = pd.DataFrame(columns = restaurants.placeID, index = users.index)
    for i, row in df.iterrows():
        fdf = ratings_df[ratings_df['userID'] == i]
        dct = dict(zip(fdf.placeID, fdf.rating))
        df.loc[i] = dct
    return df.fillna(0).values

#----------------------------------------------------

def GetRMatrix2 (actual_df, restaurant_clusters):
    """
    Transforms actual_df to a userID x placeID matrix 
    actual_df : ratings dataframe
    """
   
    ratings_df = actual_df.copy()
    ## Add 1 to ratings
    ratings_df['rating'] = ratings_df['rating'] + 1
    df = pd.DataFrame(columns = restaurant_clusters.placeID, index = users.index)
    ## i = userID
    for i, row in df.iterrows():
        ## Get all actual ratings from user i
        fdf = ratings_df[ratings_df['userID'] == i]
        ## Store in a dictionary (placeID-column : rating)
        dct = dict(zip(fdf.placeID, fdf.rating))
        ## Get the cluster of each restaurant
        clusters = fdf.merge(restaurant_clusters, on = 'placeID', how = 'left')
        ## Calculate average rating of each restaurant group
        ## index  rating
        ## 0      3
        ## 5      6
        avg_clusters = clusters[['rating', 'cluster']].groupby('cluster').mean().to_dict()['rating']
        
        ## Loop thru every restaurant group user i has visited
        for n in avg_clusters.keys():
            ## Get all restaurants in group n
            ids = restaurant_clusters[restaurant_clusters['cluster'] == n].placeID
            ## Remove the ones that we already put in dct
            ids = ids[~ids.isin(fdf.placeID)]
            for res in ids:
                dct[res] = avg_clusters[n]
            
        df.loc[i] = dct
    return df.fillna(0).values

#----------------------------------------------------
def GroupRestaurants (restaurants):
    res = restaurants.copy().drop(['latitude', 'longitude', 'name'], 1).set_index('placeID')
    
    SSE = []
    for cluster in tqdm(range(6,21)):
        kmeans = KMeans(n_jobs = -1, n_clusters = cluster, init='k-means++')
        kmeans.fit(res)
        SSE.append(kmeans.inertia_)
        
    kn = KneeLocator(np.arange(6, 21), SSE, curve='convex', direction='decreasing').knee
    kmeans = KMeans(n_jobs = -1, n_clusters = kn, init='k-means++')
    kmeans.fit(res)
    preds = kmeans.predict(res)
    res['cluster'] = preds
    
    return res.reset_index()[['placeID', 'cluster']]

#----------------------------------------------------

def NMFModel (k, Rdf):
    """
    Non-neg matrix factorization
    
    Inputs: 
    k: number of latent features
    Rdf: R matrix from GetRMatrix
    """
    
    nmf = NMF(k, max_iter = 25)
    transformed_df = nmf.fit_transform(Rdf)
    inverse_df = nmf.inverse_transform(transformed_df) - 1
    
    #Clipping
    inverse_df[inverse_df > 2] = 2.                
    inverse_df[inverse_df < 0] = 0. 
    return inverse_df

#----------------------------------------------------

def Recommend (restaurants, R_preds, userID, num_recom = 10):
    
    """
    Sorts ratings and matches placeIDs with names
    """
    
    df = pd.DataFrame(R_preds, index = users.index, columns = restaurants.placeID)
    predictions = df.loc[userID].to_dict()
    spredictions = sorted(predictions.items(), key = lambda x: (-x[1], x[0]))[:num_recom]
    
    recoms = pd.DataFrame()
    for i in spredictions:
        dff = restaurants[restaurants['placeID'] == i[0]][['placeID', 'name']]
        dff['predicted'] = i[1]
        recoms = pd.concat((recoms, dff), axis = 0)
        
    return recoms, df

In [5]:
len(R.nonzero()[0]) / float(R.shape[0] * R.shape[1])

NameError: name 'R' is not defined

In [57]:
nmf_model = NMF(n_components=4)     # starts with 20 latents factors

# Matrix factorization               # V ~ W.H  (Find two non-negative matrices (W, H) whose product approximates the non- negative matrix X. )
nmf_model.fit(R)                     # R can be array-like or sparse, here it is array-like (dense)
Theta = nmf_model.transform(R)       # user latent factors (= W, called the features matrix)
M = nmf_model.components_.T          # item latent factors (= H.T) (H is called the coefficient matrix)

# Making the predictions
R_pred = M.dot(Theta.T)              # See http://stackoverflow.com/questions/24739121/nonnegative-matrix-factorization-in-sklearn
R_pred = R_pred.T   

In [8]:
import random
ratings = ratings.sample(frac = 1)
R_train = ratings.iloc[:round(ratings.shape[0]*0.70), :]
R_test = ratings[~ratings.index.isin(R_train.index)]

In [10]:
parametersNMF = {
                    'n_components' : 20,     # number of latent factors
                    'init' : 'random', 
                    'random_state' : 0, 
                    'alpha' : 0.01,          # regularization term
                    'l1_ratio' : 0,          # set regularization = L2 
                    'max_iter' : 15
                }

estimator = NMF(**parametersNMF)

In [11]:
def get_rmse(pred, actual):
    pred = pred[actual.nonzero()].flatten()     # Ignore nonzero terms
    actual = actual[actual.nonzero()].flatten() # Ignore nonzero terms
    return np.sqrt(mean_squared_error(pred, actual))

In [103]:
err = 0
n_iter = 0.
n_folds = 5
rclusters = GroupRestaurants(restaurants)

kf = KFold(n_splits=2)
for train_index, test_index in kf.split(ratings_df):   
    
    R_train = GetRMatrix(ratings.iloc[train_index, :])
    R_test = GetRMatrix(ratings.iloc[test_index, :])

    # Training (matrix factorization)
    t0 = time.time()
    estimator.fit(R_train)  
    Theta = estimator.transform(R_train)       # user features
    M = estimator.components_.T                # item features
    print("Fit in %0.3fs" % (time.time() - t0))
    n_iter += estimator.n_iter_ 

    # Making the predictions
    R_pred = M.dot(Theta.T)
    R_pred = R_pred.T      
    
    # Clipping values                                                    
    R_pred[R_pred > 2] = 2.           # clips ratings above 5             
    R_pred[R_pred < 0] = 0.           # clips ratings below 1

    # Computing the error on the validation set 
    err += get_rmse(R_pred, R_test)
    print (get_rmse(R_pred, R_test))
    
print("*** RMSE Error : ", err / n_folds)
print("Mean number of iterations:", n_iter / n_folds)

Fit in 0.007s
2.3592180896136195
Fit in 0.007s
2.2890733309021964
*** RMSE Error :  0.9296582841031633
Mean number of iterations: 5.6


<br><br><br>
# GRID SEARCH

In [12]:
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)
cv.split(ratings)

<generator object BaseShuffleSplit.split at 0x12d8f82a0>

In [22]:
param =        {
                    'n_components' : np.arange(2, 21),
                    'alpha' : [0, 0.001, 0.01],
                    'l1_ratio' : [0], 
                    'max_iter' : [15, 20, 25, 30]
                }

# Keep track of RMSE and parameters
grid_search = pd.DataFrame([[0, 0, 0, 0, 0]])
grid_search.columns = ['n_components', 'alpha', 'l1_ratio', 'max_iter'] + ['RMSE']
cv = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)
rclusters = GroupRestaurants(restaurants)
# nb of folds in ShuffleSplit CV
n_folds = 5      
i = 0

# Performing the Grid search
for n_components in param['n_components']:
    for alpha in param['alpha']:
        for l1_ratio in param['l1_ratio']:
            for max_iter in param['max_iter']:

                err = 0
                n_iter = 0
                print('Search', i, '/', 3 ** 3 - 1)
                for train_index, test_index in cv.split(ratings):

                    R_train = GetRMatrix2(ratings.iloc[train_index, :], rclusters)
                    R_test = GetRMatrix2(ratings.iloc[test_index, :], rclusters)
                    
                    # updating the parameters
                    parametersNMF = {
                    'n_components' : n_components,
                    'init' : 'random', 
                    'random_state' : 0, 
                    'alpha' : alpha,
                    'l1_ratio' : l1_ratio,
                    'max_iter' : max_iter}
                    estimator = NMF(**parametersNMF)
                
                    # Training (matrix factorization)
                    t0 = time.time()
                    transformed_df = estimator.fit_transform(R_train)
                    n_iter += estimator.n_iter_
                    R_pred = estimator.inverse_transform(transformed_df) - 1
                    
                    # Clipping values                                                    
                    R_pred[R_pred > 2] = 2.           # clips ratings above 5             
                    R_pred[R_pred < 0] = 0.           # clips ratings below 1

                    # Computing the error on the validation set 
                    err += get_rmse(R_pred, R_test)
    
                #print "RMSE Error : ", err / n_folds
                grid_search.loc[i] = [n_components, alpha, l1_ratio, max_iter, err / n_folds]
                print(grid_search.loc[i].tolist(), "Mean number of iterations:", n_iter / n_folds)
                i += 1

best_params = grid_search.sort_values('RMSE')[:1]
print('*** best params ***')
print(best_params)

100%|██████████| 15/15 [00:33<00:00,  2.21s/it]


Search 0 / 26
[2.0, 0.0, 0.0, 15.0, 1.733921322166172] Mean number of iterations: 14.0
Search 1 / 26
[2.0, 0.0, 0.0, 20.0, 1.733871798625744] Mean number of iterations: 19.0
Search 2 / 26
[2.0, 0.0, 0.0, 25.0, 1.7337355488593058] Mean number of iterations: 24.0
Search 3 / 26
[2.0, 0.0, 0.0, 30.0, 1.7336775148856045] Mean number of iterations: 29.0
Search 4 / 26
[2.0, 0.001, 0.0, 15.0, 1.7339252858835976] Mean number of iterations: 14.0
Search 5 / 26
[2.0, 0.001, 0.0, 20.0, 1.7338759377268833] Mean number of iterations: 19.0
Search 6 / 26
[2.0, 0.001, 0.0, 25.0, 1.7337398205148555] Mean number of iterations: 24.0
Search 7 / 26
[2.0, 0.001, 0.0, 30.0, 1.733681881582677] Mean number of iterations: 29.0
Search 8 / 26
[2.0, 0.01, 0.0, 15.0, 1.7339609930937603] Mean number of iterations: 14.0
Search 9 / 26
[2.0, 0.01, 0.0, 20.0, 1.7339132251159353] Mean number of iterations: 19.0
Search 10 / 26
[2.0, 0.01, 0.0, 25.0, 1.7337783076256472] Mean number of iterations: 24.0
Search 11 / 26
[2.0, 0.

In [23]:
grid_search.sort_values('RMSE')

Unnamed: 0,n_components,alpha,l1_ratio,max_iter,RMSE
219,20.0,0.000,0.0,30.0,1.686705
223,20.0,0.001,0.0,30.0,1.686711
227,20.0,0.010,0.0,30.0,1.686766
207,19.0,0.000,0.0,30.0,1.686800
211,19.0,0.001,0.0,30.0,1.686806
...,...,...,...,...,...
5,2.0,0.001,0.0,20.0,1.733876
9,2.0,0.010,0.0,20.0,1.733913
0,2.0,0.000,0.0,15.0,1.733921
4,2.0,0.001,0.0,15.0,1.733925
