In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import cv2

import tensorflow as tf
from tensorflow.keras.preprocessing import image
from scipy.sparse.linalg import svds

In [2]:
from src.models import (load_data
                        , data_summary
                        , combine_tables
                        , collab_mat
                        , svd_mat
                        , top_biz_pred
                       )

In [5]:
# Load the six raw tables through the project loader. Going by the summary
# printed in the next cell these are presumably the business, review,
# check-in, photo, tip and user tables, in that order.
(
    biz_df,
    rev_df,
    ckin_df,
    pho_df,
    tip_df,
    user_df,
) = load_data()

In [6]:
# Display the name plus row and column counts of each loaded table.
data_summary(
    biz_df,
    rev_df,
    ckin_df,
    pho_df,
    tip_df,
    user_df,
)

name,rows,colums
business,192609,14
review,6685900,9
checkin,161950,2
photo,200000,4
tip,1223094,5
user,1637138,22


In [7]:
# Combine the user, review and business tables into the single frame that
# every later cell works from (join logic lives in src.models.combine_tables).
user_rev_biz = combine_tables(user_df, rev_df, biz_df)

In [8]:
# Five cities with the most rows in the combined table.
user_rev_biz['city'].value_counts().iloc[:5]

Las Vegas     1757375
Phoenix        645716
Toronto        434349
Scottsdale     329826
Charlotte      273989
Name: city, dtype: int64

In [9]:
# Build the collaborative-filtering matrix for a single city.
# Presumably rows are users and columns are businesses (the shape shown in
# the next cell matches the unique user/business counts) — confirm in
# src.models.collab_mat.
user_biz_collab_mat = collab_mat('Scottsdale', user_rev_biz)

In [10]:
# Dimensions of the Scottsdale matrix as (rows, columns).
user_biz_collab_mat.shape

(140940, 7081)

### SVD

In [11]:
# Factorise the matrix. k=40 is passed straight through to svd_mat —
# presumably the number of singular values / latent factors kept;
# TODO confirm in src.models.svd_mat.
sigma, user_biz_predictions = svd_mat(
    user_biz_collab_mat,
    k=40,
)

### Predictions

In [12]:
# Top-5 recommendations for one user id; the call returns the user's name
# together with a list of business names.
top_biz_pred(
    '--2HUmLkcNHZp0xw6AMBPg',
    df_all=user_rev_biz,
    df_mat=user_biz_collab_mat,
    df_pred=user_biz_predictions,
    n=5,
)

('Kristin',
 ['True Food Kitchen',
  'Cafe Monarch',
  'The Herb Box',
  'Sweet Republic',
  'Talking Stick Resort'])

In [13]:
# First five business names on rows belonging to the same user, for an
# eyeball comparison against the recommendations above.
user_rev_biz.loc[
    user_rev_biz['user_id'] == '--2HUmLkcNHZp0xw6AMBPg',
    'biz_name',
].iloc[:5]

3360      Butters Pancakes & Café
312378                 Wildflower
600627               Flower Child
624904               Viet Kitchen
744479                 Wildflower
Name: biz_name, dtype: object

### Validation

In [47]:
from surprise import (SVD
                      , SVDpp
                      , SlopeOne
                      , NMF
                      , NormalPredictor
                      , KNNBaseline
                      , KNNBasic
                      , KNNWithMeans
                      , KNNWithZScore
                      , BaselineOnly
                      , CoClustering)
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy

In [17]:
# (user, item, rating) triples for Scottsdale, in the column order that
# Surprise's Dataset.load_from_df expects.
# NOTE(review): 'average_stars' looks like a per-user average rather than the
# rating given in each individual review — confirm against combine_tables
# that this is the intended rating column.
data = user_rev_biz.loc[
    user_rev_biz['city'] == 'Scottsdale',
    ['user_id', 'business_id', 'average_stars'],
]


In [40]:
# Number of rating rows and columns in the Scottsdale subset.
data.shape

(329826, 3)

In [37]:
# Distinct users in the subset (missing values, if any, count as one value).
data['user_id'].nunique(dropna=False)

140940

In [38]:
# Distinct businesses in the subset (missing values, if any, count as one value).
data['business_id'].nunique(dropna=False)

7081

In [41]:
# Wrap the ratings frame in a Surprise Dataset using a default Reader.
# NOTE: `data` is rebound from a DataFrame to a Dataset here, so this cell
# cannot be re-run without first re-running the cell that builds the frame.
reader = Reader()
data = Dataset.load_from_df(df=data, reader=reader)

In [43]:
# Hold out 25% of the ratings for evaluation. `train_test_split` is the
# Surprise version here: it is imported after the scikit-learn one and
# shadows it. A fixed random_state makes the split, and therefore the
# reported RMSE, repeatable across kernel restarts.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [44]:
# Fit Surprise's SVD on the training split. The seed fixes the random
# initialisation so repeated runs train the same model; the trailing
# semicolon hides the estimator repr, whose memory address changes per run.
algo = SVD(random_state=42)
algo.fit(trainset);

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f4476f5e750>

In [45]:
# Score the fitted model on the held-out split.
predictions = algo.test(testset)

In [48]:
# RMSE on the held-out predictions; the call prints the rounded value and
# also returns the full-precision value, which the cell displays.
accuracy.rmse(predictions)

RMSE: 0.7033


0.7033168968886507

In [50]:
# 5-fold cross-validation of SVD on the full Scottsdale dataset.
# NOTE(review): no seed is passed, so fold assignment and scores may differ
# slightly between runs.
svd_results = cross_validate(SVD(), data, cv=5)

# Display the per-fold metrics: an assignment alone produces no cell output.
svd_results

{'test_rmse': array([0.69714995, 0.69688364, 0.7011446 , 0.70340271, 0.70042614]),
 'test_mae': array([0.48845975, 0.48699912, 0.49104552, 0.49082408, 0.4913452 ]),
 'fit_time': (12.882615327835083,
  12.572933673858643,
  12.632320165634155,
  14.210466384887695,
  13.12132453918457),
 'test_time': (0.3999161720275879,
  0.4119558334350586,
  0.479907751083374,
  0.4399549961090088,
  0.4253082275390625)}

In [54]:
# Squaring the per-fold RMSE gives the per-fold MSE.
np.square(svd_results['test_rmse'])

array([0.48601805, 0.48564681, 0.49160375, 0.49477537, 0.49059678])

In [None]:
# 5-fold cross-validation of NMF on the same dataset, for comparison with SVD.
NMF_results = cross_validate(algo=NMF(), data=data, cv=5)

In [None]:
# Squaring the per-fold RMSE gives the per-fold MSE.
np.square(NMF_results['test_rmse'])

In [56]:

# Benchmark several Surprise algorithms with 3-fold cross-validation and
# rank them by mean test RMSE (lowest first).
benchmark = []
for algorithm in [SVD(), NMF(), NormalPredictor()]:
    # Per-fold RMSE plus fit/test timings for this algorithm.
    results = cross_validate(
        algorithm,
        data,
        measures=['RMSE'],
        cv=3,
        verbose=True,
    )

    # Average every metric across folds and tag the row with the class name.
    # Plain dict records are collected because Series.append no longer exists
    # in pandas 2.x, and type(...).__name__ replaces parsing the object repr.
    row = pd.DataFrame.from_dict(results).mean(axis=0).to_dict()
    row['Algorithm'] = type(algorithm).__name__
    benchmark.append(row)

pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.7101  0.7091  0.7089  0.7094  0.0005  
Fit time          10.35   10.55   11.35   10.75   0.43    
Test time         1.25    1.27    1.23    1.25    0.02    
Evaluating RMSE of algorithm NMF on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.6611  0.6554  0.6598  0.6588  0.0024  
Fit time          18.59   18.64   18.76   18.66   0.07    
Test time         0.66    0.59    0.65    0.63    0.03    
Evaluating RMSE of algorithm NormalPredictor on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.1027  1.1025  1.1039  1.1030  0.0006  
Fit time          0.37    0.52    0.52    0.47    0.07    
Test time         0.69    0.72    0.72    0.71    0.01    


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NMF,0.658756,18.662019,0.633255
SVD,0.709367,10.751169,1.250202
NormalPredictor,1.103032,0.47073,0.710494
