In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import cv2

import tensorflow as tf
from tensorflow.keras.preprocessing import image
from scipy.sparse.linalg import svds

In [109]:
%load_ext autoreload
%autoreload 2

In [113]:
from src.models import (load_data
                        , data_summary
                        , combine_tables
                        , collab_mat
                        , svd_mat
                        , top_biz_pred
                        , surprise_validate
                        , NMF_Mat
                       )

In [3]:
biz_df, rev_df, ckin_df, pho_df, tip_df, user_df = load_data()

In [4]:
data_summary(biz_df
             , rev_df
             , ckin_df
             , pho_df
             , tip_df
             , user_df)

name,rows,colums
business,192609,14
review,6685900,9
checkin,161950,2
photo,200000,4
tip,1223094,5
user,1637138,22


In [5]:
user_rev_biz = combine_tables(user_df
                              , rev_df
                              , biz_df)

In [6]:
user_rev_biz.city.value_counts()[0:5]

Las Vegas     1757375
Phoenix        645716
Toronto        434349
Scottsdale     329826
Charlotte      273989
Name: city, dtype: int64

In [7]:
user_biz_collab_mat = collab_mat('Scottsdale'
                                 , user_rev_biz)

In [8]:
user_biz_collab_mat.shape

(140940, 7081)

### SVD

In [9]:
sigma, user_biz_predictions = svd_mat(user_biz_collab_mat, k=40)

### Precictions

In [10]:
top_biz_pred('--2HUmLkcNHZp0xw6AMBPg'
             , df_all = user_rev_biz
             , df_mat = user_biz_collab_mat
             , df_pred = user_biz_predictions
             , n=5)

('Kristin',
 ['True Food Kitchen',
  'Cafe Monarch',
  'The Herb Box',
  'Sweet Republic',
  'Talking Stick Resort'])

In [11]:
user_rev_biz['biz_name'].loc[user_rev_biz.user_id == '--2HUmLkcNHZp0xw6AMBPg'][0:5]

3360      Butters Pancakes & Café
312378                 Wildflower
600627               Flower Child
624904               Viet Kitchen
744479                 Wildflower
Name: biz_name, dtype: object

### Validation

In [12]:
from surprise import (SVD
                      , SVDpp
                      , SlopeOne
                      , NMF
                      , NormalPredictor
                      , KNNBaseline
                      , KNNBasic
                      , KNNWithMeans
                      , KNNWithZScore
                      , BaselineOnly
                      , CoClustering)
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy

In [13]:
data = user_rev_biz[['user_id'
                    , 'business_id'
                    , 'average_stars']].loc[user_rev_biz.city == 'Scottsdale']


In [26]:
data.shape

(329826, 3)

In [27]:
len(data.user_id.unique())

140940

In [28]:
len(data.business_id.unique())

7081

In [30]:
reader = Reader()
data = Dataset.load_from_df(data, reader)

In [31]:
trainset, testset = train_test_split(data, test_size = 0.25)

In [32]:
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f71d09f0410>

In [33]:
predictions = algo.test(testset)

In [34]:
accuracy.rmse(predictions)

RMSE: 0.7049


0.7048506562909388

In [35]:
svd_results = cross_validate(SVD(), data, cv = 5)

In [36]:
svd_results['test_rmse']

array([0.70360468, 0.696282  , 0.7002084 , 0.70009012, 0.69800289])

In [37]:

benchmark = []
# Iterate over all algorithms
for algorithm in [SVD()
                  , NMF()
                  , NormalPredictor()
                  , CoClustering()
                  , BaselineOnly()
                 ]:
    # Perform cross validation
    results = cross_validate(algorithm
                             , data
                             , measures=['RMSE']
                             , cv=5
                             , verbose=True)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7034  0.6985  0.7003  0.6988  0.6977  0.6997  0.0020  
Fit time          13.00   13.20   13.19   12.81   12.43   12.93   0.29    
Test time         1.24    0.47    0.42    0.42    1.19    0.75    0.38    
Evaluating RMSE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6368  0.6392  0.6423  0.6381  0.6406  0.6394  0.0019  
Fit time          23.11   23.81   20.35   19.18   19.26   21.15   1.95    
Test time         1.32    0.46    0.31    1.13    0.32    0.71    0.43    
Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1044  1.1076  1.1030  1.1017  1.1044  1.1042  0.0020  
Fit time          0.39    0.53    0.54    0.55    0.54    0.51    0.06    
Test time       

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NMF,0.63939,21.145291,0.707996
SVD,0.699744,12.926535,0.747334
BaselineOnly,0.718198,1.829644,0.277519
CoClustering,0.804113,14.380662,0.710643
NormalPredictor,1.104213,0.511061,0.558704


### NMF

In [None]:
NMF_results = cross_validate(NMF(), data, cv = 5)

In [None]:
NMF_results['test_rmse']

In [99]:
from sklearn.decomposition import NMF

In [100]:
model_nmf = NMF(n_components = 40
               , init = 'random'
               , random_state = 0)

In [101]:
m = model_nmf.fit_transform(user_biz_collab_mat)

In [103]:
m.shape

(140940, 40)

In [104]:
h = model_nmf.components_

In [105]:
h.shape

(40, 7081)

In [106]:
nmf_mat = m @ h

In [107]:
nmf_mat.shape

(140940, 7081)

In [108]:
top_biz_pred('--2HUmLkcNHZp0xw6AMBPg'
             , df_all = user_rev_biz
             , df_mat = user_biz_collab_mat
             , df_pred = nmf_mat
             , n=5)

('Kristin',
 ['True Food Kitchen',
  'North Italia',
  "Portillo's Hot Dogs",
  "Mastro's Ocean Club",
  'Sweet Republic'])

In [110]:
user_rev_biz['biz_name'].loc[user_rev_biz.user_id == '--2HUmLkcNHZp0xw6AMBPg'][0:10]

3360        Butters Pancakes & Café
312378                   Wildflower
600627                 Flower Child
624904                 Viet Kitchen
744479                   Wildflower
746301     Original Breakfast House
771054      Andreoli Italian Grocer
838710                 Barrio Queen
1147492        Rocket Burger & Subs
1155989                 Rustic Cafe
Name: biz_name, dtype: object

### Neural Network Recommender



In [None]:
# from: https://www.youtube.com/watch?v=KmLJgq18r28

In [114]:
nmf_mat.shape

(140940, 7081)

In [115]:
user_rev_biz_scott.shape

(329826, 37)

In [116]:
n_users = user_rev_biz_scott.user_id.nunique()

In [117]:
n_users

140940

In [118]:
n_biz = user_rev_biz_scott.business_id.nunique()

In [119]:
n_biz

7081

In [120]:
n_factors = 50

In [None]:
def embedding_input(name, n_in, n_out, reg):
    inp = Input(shape=(1,)
               , dtype='int64'
               , name=name)
    return inp, Embedding(n_in
                         , n_out
                         , input_length = 1
                         , W_regularizer =12(reg))(inp)

### Using Texts to Model Business to Business Similarity

In [38]:
user_rev_biz.fillna('', inplace = True)

In [39]:
user_rev_biz.isna().sum()

user_id               0
user_name             0
user_review_count     0
yelping_since         0
useful_user_sent      0
funny_user_sent       0
cool_user_sent        0
elite                 0
friends               0
fans                  0
average_stars         0
compliment_hot        0
compliment_more       0
compliment_profile    0
compliment_cute       0
compliment_list       0
compliment_note       0
compliment_plain      0
compliment_cool       0
compliment_funny      0
compliment_writer     0
compliment_photos     0
review_id             0
business_id           0
stars_rev             0
useful_rev            0
funny_rev             0
cool_rev              0
rev_text              0
date                  0
biz_name              0
city                  0
biz_star              0
biz_review_count      0
attributes            0
categories            0
dtype: int64

In [40]:
# combine re_text and categories
def combine(rows):
    return rows['rev_text']+' '+rows['categories']

In [41]:
user_rev_biz['text'] = user_rev_biz.apply(combine
                                          , axis = 1)

In [42]:
from sklearn.feature_extraction.text import CountVectorizer

In [50]:
user_rev_biz_scott = user_rev_biz.loc[user_rev_biz.city == 'Scottsdale']

In [60]:
len(user_rev_biz_scott.user_id.value_counts())

140940

In [66]:
urbs_cond = user_rev_biz_scott.drop_duplicates(subset = 'business_id')

In [67]:
urbs_cond.shape

(7081, 37)

In [68]:
count_matrix = CountVectorizer().fit_transform(urbs_cond['text'])

In [69]:
from sklearn.metrics.pairwise import cosine_similarity

In [70]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [72]:
cosine_sim.shape

(7081, 7081)

In [81]:
biz = cosine_sim[0].argsort()[-6:][::-1][1:]
biz

array([3617, 3450, 6212, 2862, 3694])

In [95]:
biz_perc = cosine_sim[0][biz]

In [96]:
biz_perc

array([0.51604396, 0.4233622 , 0.42140704, 0.41996103, 0.41851107])

In [None]:
biz_names = urbs_cond['biz_name'].loc[urbs_cond.user_id == name_id].unique()[0]

In [87]:
urbs_cond.business_id

614        lV3qjLJF_5QujITWHZ64gg
637        p-8AMN9Q0qwym-3Pdle20Q
1125       -LRlx2j9_LB3evsRRcC9MA
1197       9E1q2uEMd881wnruicNTUA
1340       WluvnTRuDWnWDDm0YqOOYQ
                    ...          
6685321    bZ9qEOtg80nrsAbwYDNNXw
6685336    9UsSHoqWdBZ1ec32rofuoA
6685351    5HPBIG3BVMIfy4_-D9zPIQ
6685804    k-qjgxB45hV0627P6KRYJQ
6685868    a4PI8XlT_nXrH93vAo28UQ
Name: business_id, Length: 7081, dtype: object

In [86]:
biz_dict = {x: y for x in urbs_cond.business_id for y in urbs_cond.biz_name}

In [88]:
biz_dict['lV3qjLJF_5QujITWHZ64gg']

'Commercial Waste Company'

In [90]:
len(biz_dict)

7081

In [92]:
biz_df = urbs_cond[['business_id', 'biz_name']]

In [93]:
biz_df

Unnamed: 0,business_id,biz_name
614,lV3qjLJF_5QujITWHZ64gg,Janet Kirkman
637,p-8AMN9Q0qwym-3Pdle20Q,Pita Jungle
1125,-LRlx2j9_LB3evsRRcC9MA,Kaibab Animal Hospital
1197,9E1q2uEMd881wnruicNTUA,Buffalo Wild Wings
1340,WluvnTRuDWnWDDm0YqOOYQ,Cobblestone Auto Spa
...,...,...
6685321,bZ9qEOtg80nrsAbwYDNNXw,Merchants Benefit Admin
6685336,9UsSHoqWdBZ1ec32rofuoA,Caliber Match
6685351,5HPBIG3BVMIfy4_-D9zPIQ,Meng Chinese Academy
6685804,k-qjgxB45hV0627P6KRYJQ,Classified Realty


In [94]:
for idx in biz:
    print(biz_df.biz_name.iloc[idx])

Ferguson Bath, Kitchen & Lighting Gallery
Scottsdale Rooter Plumbers
Troon North Custom Tailor
The Flower Cart
Gainey Fabricare Cleaners
