In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import cv2

import tensorflow as tf
from tensorflow.keras.preprocessing import image
from scipy.sparse.linalg import svds

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from src.models import (load_data
                        , data_summary
                        , combine_tables
                        , collab_mat
                        , svd_mat
                        , top_biz_pred
                        , surprise_validate
                        , NMF_Mat
                       )

In [4]:
biz_df, rev_df, ckin_df, pho_df, tip_df, user_df = load_data()

In [5]:
data_summary(biz_df
             , rev_df
             , ckin_df
             , pho_df
             , tip_df
             , user_df)

name,rows,colums
business,192609,14
review,6685900,9
checkin,161950,2
photo,200000,4
tip,1223094,5
user,1637138,22


In [6]:
user_rev_biz = combine_tables(user_df
                              , rev_df
                              , biz_df)

In [7]:
user_rev_biz.city.value_counts()[0:5]

Las Vegas     1757375
Phoenix        645716
Toronto        434349
Scottsdale     329826
Charlotte      273989
Name: city, dtype: int64

In [8]:
user_biz_collab_mat = collab_mat('Scottsdale'
                                 , user_rev_biz)

In [9]:
user_biz_collab_mat.shape

(140940, 7081)

### SVD

In [10]:
sigma, user_biz_predictions = svd_mat(user_biz_collab_mat, k=40)

### Precictions

In [11]:
top_biz_pred('--2HUmLkcNHZp0xw6AMBPg'
             , df_all = user_rev_biz
             , df_mat = user_biz_collab_mat
             , df_pred = user_biz_predictions
             , n=5)

('Kristin',
 ['True Food Kitchen',
  'Cafe Monarch',
  'The Herb Box',
  'Sweet Republic',
  'Talking Stick Resort'])

In [12]:
user_rev_biz['biz_name'].loc[user_rev_biz.user_id == '--2HUmLkcNHZp0xw6AMBPg'][0:5]

3360      Butters Pancakes & Café
312378                 Wildflower
600627               Flower Child
624904               Viet Kitchen
744479                 Wildflower
Name: biz_name, dtype: object

### Validation

In [13]:
from surprise import (SVD
                      , SVDpp
                      , SlopeOne
                      , NMF
                      , NormalPredictor
                      , KNNBaseline
                      , KNNBasic
                      , KNNWithMeans
                      , KNNWithZScore
                      , BaselineOnly
                      , CoClustering)
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy

In [14]:
data = user_rev_biz[['user_id'
                    , 'business_id'
                    , 'average_stars']].loc[user_rev_biz.city == 'Scottsdale']


In [15]:
data.shape

(329826, 3)

In [16]:
len(data.user_id.unique())

140940

In [17]:
len(data.business_id.unique())

7081

In [18]:
reader = Reader()
data = Dataset.load_from_df(data, reader)

In [19]:
trainset, testset = train_test_split(data, test_size = 0.25)

In [20]:
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f4e0e820790>

In [21]:
predictions = algo.test(testset)

In [22]:
accuracy.rmse(predictions)

RMSE: 0.7041


0.7040508827856145

In [23]:
svd_results = cross_validate(SVD(), data, cv = 5)

In [24]:
svd_results['test_rmse']

array([0.69884866, 0.70561836, 0.70258484, 0.69554777, 0.69757561])

In [25]:

benchmark = []
# Iterate over all algorithms
for algorithm in [SVD()
                  , NMF()
                  , NormalPredictor()
                  , CoClustering()
                  , BaselineOnly()
                 ]:
    # Perform cross validation
    results = cross_validate(algorithm
                             , data
                             , measures=['RMSE']
                             , cv=5
                             , verbose=True)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6975  0.7004  0.6985  0.7009  0.6987  0.6992  0.0012  
Fit time          14.11   14.43   13.55   14.46   13.62   14.03   0.39    
Test time         0.50    0.44    0.83    0.87    0.40    0.61    0.20    
Evaluating RMSE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6416  0.6402  0.6391  0.6393  0.6360  0.6393  0.0018  
Fit time          21.65   19.56   20.62   26.83   26.59   23.05   3.06    
Test time         0.36    0.33    0.40    0.49    0.41    0.40    0.05    
Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1032  1.0972  1.0994  1.1071  1.1022  1.1018  0.0034  
Fit time          0.47    0.63    0.63    0.65    0.64    0.60    0.07    
Test time       

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NMF,0.639269,23.050251,0.397056
SVD,0.699194,14.034492,0.606469
BaselineOnly,0.718313,2.109087,0.350295
CoClustering,0.802002,17.989719,0.700436
NormalPredictor,1.101808,0.603883,0.6302


### NMF

In [26]:
NMF_results = cross_validate(NMF(), data, cv = 5)

In [27]:
NMF_results['test_rmse']

array([0.63116312, 0.63708406, 0.63967966, 0.64178033, 0.64351751])

In [28]:
from sklearn.decomposition import NMF

In [29]:
model_nmf = NMF(n_components = 40
               , init = 'random'
               , random_state = 0)

In [30]:
m = model_nmf.fit_transform(user_biz_collab_mat)

In [31]:
m.shape

(140940, 40)

In [32]:
h = model_nmf.components_

In [33]:
h.shape

(40, 7081)

In [34]:
nmf_mat = m @ h

In [35]:
nmf_mat.shape

(140940, 7081)

In [36]:
top_biz_pred('--2HUmLkcNHZp0xw6AMBPg'
             , df_all = user_rev_biz
             , df_mat = user_biz_collab_mat
             , df_pred = nmf_mat
             , n=5)

('Kristin',
 ['True Food Kitchen',
  'North Italia',
  "Portillo's Hot Dogs",
  "Mastro's Ocean Club",
  'Sweet Republic'])

In [37]:
user_rev_biz['biz_name'].loc[user_rev_biz.user_id == '--2HUmLkcNHZp0xw6AMBPg'][0:10]

3360        Butters Pancakes & Café
312378                   Wildflower
600627                 Flower Child
624904                 Viet Kitchen
744479                   Wildflower
746301     Original Breakfast House
771054      Andreoli Italian Grocer
838710                 Barrio Queen
1147492        Rocket Burger & Subs
1155989                 Rustic Cafe
Name: biz_name, dtype: object

### Neural Network Recommender



In [38]:
# from: https://www.youtube.com/watch?v=KmLJgq18r28

In [39]:
nmf_mat.shape

(140940, 7081)

In [46]:
user_rev_biz_scott = user_rev_biz[['user_id'
                                   , 'business_id'
                                   , 'average_stars']].loc[user_rev_biz.city == 'Scottsdale']

In [97]:
from sklearn.model_selection import train_test_split

In [101]:
X = user_rev_biz_scott[['user_id', 'business_id']]
y = user_rev_biz_scott.average_stars
X_train, X_test, y_train, y_test = train_test_split(X
                                                   , y
                                                   , test_size=0.25
                                                   , random_state=42)

In [47]:
n_users = user_rev_biz_scott.user_id.nunique()

In [48]:
n_users

140940

In [49]:
n_biz = user_rev_biz_scott.business_id.nunique()

In [50]:
n_biz

7081

In [51]:
n_factors = 50

In [96]:
from tensorflow.keras.layers import (Input
                                     , Embedding
                                     , concatenate
                                     , Flatten
                                     , Dropout
                                     , Dense)
from tensorflow.keras.models import Model

In [91]:
def embedding_input(name, n_in, n_out):
    inp = Input(shape=(1,)
               , dtype='int64'
               , name=name)
    return inp, Embedding(n_in
                         , n_out
                         , input_length = 1
                         )(inp)

In [92]:
user_in, u = embedding_input('user_in'
                            , n_users
                            , n_factors)
biz_in, b = embedding_input('biz_in'
                           , n_biz
                           , n_factors)

In [98]:
x = concatenate([u, b], axis=-1)
x = Flatten()(x)
x = Dropout(0.3)(x)
x = Dense(1)(x)
nn = Model([user_in, biz_in]
          , X)
nn.compile(optimizer = 'adam'
           , loss='mse'
           , metrics=['accuracy']
          )

tf.keras.Model: https://www.tensorflow.org/api_docs/python/tf/keras/Model

In [104]:
nn.fit([X.user_id, X.business_id]
      , y
      , batch_size=128
      , epochs = 1
      , validation_split = 0.25)

Train on 247369 samples, validate on 82457 samples
   128/247369 [..............................] - ETA: 12:27

UnimplementedError:  Cast string to int64 is not supported
	 [[node Cast (defined at <ipython-input-104-bad64cfca393>:5) ]] [Op:__inference_distributed_function_1247]

Function call stack:
distributed_function


### Using Texts to Model Business to Business Similarity

In [None]:
user_rev_biz.fillna('', inplace = True)

In [None]:
user_rev_biz.isna().sum()

In [None]:
# combine re_text and categories
def combine(rows):
    return rows['rev_text']+' '+rows['categories']

In [None]:
user_rev_biz['text'] = user_rev_biz.apply(combine
                                          , axis = 1)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
user_rev_biz_scott = user_rev_biz.loc[user_rev_biz.city == 'Scottsdale']

In [None]:
len(user_rev_biz_scott.user_id.value_counts())

In [None]:
urbs_cond = user_rev_biz_scott.drop_duplicates(subset = 'business_id')

In [None]:
urbs_cond.shape

In [None]:
count_matrix = CountVectorizer().fit_transform(urbs_cond['text'])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [None]:
cosine_sim.shape

In [None]:
biz = cosine_sim[0].argsort()[-6:][::-1][1:]
biz

In [None]:
biz_perc = cosine_sim[0][biz]

In [None]:
biz_perc

In [None]:
biz_names = urbs_cond['biz_name'].loc[urbs_cond.user_id == name_id].unique()[0]

In [None]:
urbs_cond.business_id

In [None]:
biz_dict = {x: y for x in urbs_cond.business_id for y in urbs_cond.biz_name}

In [None]:
biz_dict['lV3qjLJF_5QujITWHZ64gg']

In [None]:
len(biz_dict)

In [None]:
biz_df = urbs_cond[['business_id', 'biz_name']]

In [None]:
biz_df

In [None]:
for idx in biz:
    print(biz_df.biz_name.iloc[idx])