In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Set matplotlib's default font size to 20 for every figure created in this notebook.
plt.rcParams.update({'font.size':20})


import tensorflow as tf
from tensorflow.keras.preprocessing import image
from scipy.sparse.linalg import svds

from surprise import (SVD
                      , SVDpp
                      , NMF
                      , NormalPredictor
                      , BaselineOnly)
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection.validation import cross_validate
from surprise import accuracy

from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from matplotlib import rcParams, cm

In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
from src.models import (load_data
                        , data_summary
                        , combine_tables
                        , collab_mat
                        , svd_mat
                        , svd_model
                        , surprise_bench
                        , top_biz_pred
                        , NMF_Mat
                        , NN_Model
                        , NN_Results_df
                        , con_bas_biz_rec
                       )

### Load Data

In [None]:
# Unpack the six tables returned by the project's load_data() helper.
# NOTE(review): judging by the names these are business, review, check-in, photo,
# tip and user tables — confirm against src.models.load_data.
biz_df, rev_df, ckin_df, pho_df, tip_df, user_df = load_data()

In [None]:
# Hand all six loaded tables to the project's data_summary helper,
# in the same order they were unpacked from load_data().
data_summary(biz_df, rev_df, ckin_df, pho_df, tip_df, user_df)

In [None]:
# Build the combined table from the user, review and business tables
# via the project's combine_tables helper.
user_rev_biz = combine_tables(user_df, rev_df, biz_df)

In [None]:
# Five most frequent cities (value_counts orders by descending count).
user_rev_biz['city'].value_counts().head(5)

In [None]:
# Restrict the combined table to rows whose city is exactly 'Scottsdale',
# then display the subset's (rows, columns).
user_rev_biz_scott = user_rev_biz[user_rev_biz['city'].eq('Scottsdale')]
user_rev_biz_scott.shape

In [None]:
# Number of distinct user_id values in the Scottsdale subset
# (dropna=False keeps a missing id counted, exactly as len(unique()) does).
user_rev_biz_scott['user_id'].nunique(dropna=False)

In [None]:
# Number of distinct business_id values in the Scottsdale subset
# (dropna=False keeps a missing id counted, exactly as len(unique()) does).
user_rev_biz_scott['business_id'].nunique(dropna=False)

In [None]:
# Frequency table of the average_stars values, as a one-column DataFrame.
average_user_stars_given = user_rev_biz_scott['average_stars'].value_counts().to_frame()

In [None]:
# Mean of the average_stars column over all Scottsdale rows.
user_rev_biz_scott.average_stars.mean()

In [None]:
# Histogram of average_stars for the Scottsdale rows, with the mean marked by a
# vertical line. The mean and the line height are derived from the data rather
# than typed in by hand, so the marker and its legend label always match the plot.
avg_user_stars = user_rev_biz_scott.average_stars.mean()

plt.figure(figsize=(10,8))

# plt.hist returns (counts, bin_edges, patches); the tallest bin bounds the marker.
bin_counts = plt.hist(user_rev_biz_scott.average_stars, color='salmon')[0]
plt.vlines(avg_user_stars, 0, bin_counts.max(), label=f'Avg={avg_user_stars:.2f}')
plt.xlabel('Stars')
plt.ylabel('Frequency')
plt.title('User Stars Given')
plt.legend();
plt.savefig('png/user_stars_given.png')

In [None]:
# Frequency table of biz_star values, ordered by star value, as a one-column DataFrame.
biz_stars_df = user_rev_biz_scott['biz_star'].value_counts().sort_index().to_frame()

In [None]:
# Mean of the biz_star column over all Scottsdale rows.
user_rev_biz_scott.biz_star.mean()

In [None]:
# Histogram of biz_star for the Scottsdale rows, with the mean marked by a
# vertical line. The mean and the line height are computed from the data so the
# marker and its legend label cannot drift from what is plotted.
avg_biz_star = user_rev_biz_scott.biz_star.mean()

plt.figure(figsize=(10,8))
# plt.hist returns (counts, bin_edges, patches); the tallest bin bounds the marker.
bin_counts = plt.hist(user_rev_biz_scott.biz_star, color='salmon')[0]
plt.vlines(avg_biz_star, 0, bin_counts.max(), label=f'Avg={avg_biz_star:.2f}')
plt.xlabel('Stars')
plt.ylabel('Frequency')
plt.title('Business Star Rating')
plt.legend();
plt.savefig('png/biz_stars_rating.png')

In [None]:
# Median of biz_review_count over all Scottsdale rows.
user_rev_biz_scott.biz_review_count.median()

In [None]:
# Histogram of biz_review_count (values below 400 only) with the median marked.
# The median is taken over ALL Scottsdale rows, not just the plotted ones, and is
# computed here instead of being hard-coded so the line and label stay in sync
# with the data.
median_biz_reviews = user_rev_biz_scott.biz_review_count.median()

plt.figure(figsize=(10,8))
# plt.hist returns (counts, bin_edges, patches); the tallest bin bounds the marker.
bin_counts = plt.hist(user_rev_biz_scott.biz_review_count.loc[user_rev_biz_scott.biz_review_count < 400], color='salmon')[0]
plt.vlines(median_biz_reviews, 0, bin_counts.max(), label=f'Median={median_biz_reviews:g}')
plt.xlabel('Number of Reviews')
plt.ylabel('Frequency')
plt.title('Business Review Count')
plt.legend();
plt.savefig('png/biz_rev_count.png')

In [None]:
# Median of user_review_count over all Scottsdale rows.
user_rev_biz_scott.user_review_count.median()

In [None]:
# Histogram of user_review_count (values below 50 only) with the median marked.
# The median is taken over ALL Scottsdale rows, not just the plotted ones, and is
# computed here instead of being hard-coded so the line and label stay in sync
# with the data.
median_user_reviews = user_rev_biz_scott.user_review_count.median()

plt.figure(figsize=(10,8))
# plt.hist returns (counts, bin_edges, patches); the tallest bin bounds the marker.
bin_counts = plt.hist(user_rev_biz_scott.user_review_count.loc[user_rev_biz_scott.user_review_count < 50], color='salmon')[0]
plt.vlines(median_user_reviews, 0, bin_counts.max(), label=f'Median={median_user_reviews:g}')
plt.xlabel('Number of Reviews')
plt.ylabel('Frequency')
plt.title('User Review Count')
plt.legend();
plt.savefig('png/user_rev_count.png')

In [None]:
# Take the n most frequent `categories` values and shorten each label to its
# first nine characters so it fits on a chart axis.
n = 34
top_categories = user_rev_biz_scott.categories.value_counts()[0:n].index
cat_short = np.array([label[:9] for label in top_categories])
cat_short

In [None]:
# Horizontal bar chart of the n most frequent categories, most frequent at the top
# (the y axis is inverted after drawing).
# NOTE(review): the title and file name say "Top 20" while n, set in an earlier
# cell, controls how many categories are drawn — confirm which is intended.
plt.figure(figsize=(10, 8))

category_counts = user_rev_biz_scott.categories.value_counts()[0:n].values
plt.barh(cat_short, category_counts, color='salmon')
plt.xticks(rotation=90)
plt.ylabel('Categories')
plt.gca().invert_yaxis()
plt.title('Top 20 Business Categories');
plt.savefig('png/top_20_biz_cat.png')

In [None]:
# Join every lower-cased review text into one space-separated string.
all_rev_words_scott = ' '.join(user_rev_biz_scott['rev_text'].str.lower())

# Build a word cloud (at most 200 words, white background) from that string.
wordcloud = WordCloud(
    max_words=200,
    background_color='white',
).generate(all_rev_words_scott)

# Render the cloud without axes.
plt.figure(figsize=(12, 9))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show();
# Saving is currently disabled: plt.savefig('png/wordcloud_reviews.png')

In [None]:
# Count whitespace-separated review tokens that are not wordcloud stop words,
# then chart the ten most frequent ones.
stopwords = STOPWORDS

filtered_words = [
    token
    for token in all_rev_words_scott.split()
    if token not in stopwords
]
counted_words = Counter(filtered_words)

# Split the (word, count) pairs into two parallel lists for plotting.
top_ten = counted_words.most_common(10)
words = [token for token, _ in top_ten]
counts = [frequency for _, frequency in top_ten]

# One rainbow colour per bar.
colors = cm.rainbow(np.linspace(0, 1, 10))
plt.figure(figsize=(10, 8))

plt.ylabel('Words')
plt.xlabel('Count')
plt.title('Top 10 Words in Reviews')
plt.gca().invert_yaxis()
plt.barh(words, counts, color=colors);
plt.savefig('png/top_10_words_rev.png')

In [None]:
# Array of the distinct business_id values present in the Scottsdale subset;
# later cells use it to filter the tip and check-in tables.
biz_id_scott = user_rev_biz_scott.business_id.unique()

In [None]:
# Display the array of Scottsdale business ids.
biz_id_scott

In [None]:
# Number of distinct Scottsdale business ids.
len(biz_id_scott)

In [None]:
# Column names of the tip table.
tip_df.columns

In [None]:
# (rows, columns) of the full tip table, before filtering to Scottsdale.
tip_df.shape

In [None]:
# Keep only the tips whose business_id belongs to a Scottsdale business.
tip_df_scott = tip_df.loc[tip_df['business_id'].isin(biz_id_scott)]

In [None]:
# (rows, columns) of the Scottsdale-only tip table.
tip_df_scott.shape

In [None]:
# Join every lower-cased Scottsdale tip text into one space-separated string.
all_tip_words_scott = ' '.join(tip_df_scott['text'].str.lower())

# Build a word cloud (at most 100 words, white background) from that string.
wordcloud = WordCloud(
    max_words=100,
    background_color='white',
).generate(all_tip_words_scott)

# Render the cloud without axes.
plt.figure(figsize=(12, 9))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off');
# Saving is currently disabled: plt.savefig('png/wordcloud_tips.png')

In [None]:
# Chart the ten most frequent tip tokens that are not stop words.
#
# Work on a private copy of wordcloud's STOPWORDS: `stopwords = STOPWORDS` only
# aliases the library's module-level set, so calling `.add('&')` on it would
# change the stop words seen by every other use of STOPWORDS in this kernel
# and make results depend on the order in which cells were run.
stopwords = set(STOPWORDS) | {'&'}

# Tokens are whitespace-separated pieces of the lower-cased tip text.
filtered_words = [word for word in all_tip_words_scott.split() if word not in stopwords]
counted_words = Counter(filtered_words)

# Split the (word, count) pairs into two parallel lists for plotting.
words = []
counts = []
for word, count in counted_words.most_common(10):
    words.append(word)
    counts.append(count)

# One rainbow colour per bar.
colors = cm.rainbow(np.linspace(0, 1, 10))
plt.figure(figsize=[10, 8])

plt.title('Top 10 Words Tips')
plt.xlabel('Count')
plt.ylabel('Words')
plt.gca().invert_yaxis()
plt.barh(words, counts, color=colors);
plt.savefig('png/top_10_words_tip.png')

In [None]:
# (rows, columns) of the full check-in table, before filtering to Scottsdale.
ckin_df.shape

In [None]:
# Keep only the check-in rows whose business_id belongs to a Scottsdale business.
ckin_df_scott = ckin_df.loc[ckin_df['business_id'].isin(biz_id_scott)]

In [None]:
# (rows, columns) of the Scottsdale-only check-in table.
ckin_df_scott.shape

In [None]:
# Column names of the Scottsdale check-in table.
ckin_df_scott.columns

In [None]:
# Show the first row's raw `date` value to see how check-ins are stored.
# NOTE(review): a later cell splits this column on ', ', so it is presumably one
# comma-separated string of timestamps per business — confirm.
ckin_df_scott['date'].iloc[0]

In [None]:
# Preview the first rows of the Scottsdale check-in table.
ckin_df_scott.head()

In [None]:
# Add a `checkins` column: the number of ', '-separated entries in each row's
# `date` string. `assign` returns a new DataFrame, so this never writes a column
# into a filtered selection of another frame (which triggers pandas'
# SettingWithCopyWarning) and the cell gives the same result when re-run.
ckin_df_scott = ckin_df_scott.assign(
    checkins=ckin_df_scott.date.str.split(', ').apply(len)
)

In [None]:
# Preview the Scottsdale check-in table again, after the `checkins` column was added.
ckin_df_scott.head()

In [None]:
# How many rows have each `checkins` value.
ckin_df_scott.checkins.value_counts()

In [None]:
# Median of the `checkins` column.
ckin_df_scott.checkins.median()

In [None]:
# Histogram of `checkins` (values below 40 only) with the median marked.
# The median is taken over ALL rows of ckin_df_scott, not just the plotted ones,
# and is computed here instead of being hard-coded so the line and its legend
# label always reflect the current data.
median_checkins = ckin_df_scott.checkins.median()

plt.figure(figsize=[10,8])
# plt.hist returns (counts, bin_edges, patches); the tallest bin bounds the marker.
bin_counts = plt.hist(ckin_df_scott.checkins.loc[ckin_df_scott.checkins < 40], color = 'salmon')[0]
plt.vlines(median_checkins, 0, bin_counts.max(), label=f'Median={median_checkins:g}')
plt.xlabel('Checkins')
plt.ylabel('Frequency')
plt.title('Checkin Frequencies')
plt.legend();
plt.savefig('png/checkin_frequencies.png')

### SVD

In [None]:
# Build the matrix used by the SVD / NMF cells below with the project's
# collab_mat helper, passing the city name and the combined table.
user_biz_collab_mat = collab_mat('Scottsdale', user_rev_biz)

In [None]:
# Run the project's svd_mat helper on the matrix with k=40; it returns two values,
# bound here as `sigma` and the prediction table used by top_biz_pred below.
# NOTE(review): k is presumably the number of latent factors kept — confirm in src.models.
sigma, user_biz_predictions = svd_mat(user_biz_collab_mat, k=40)

### SVD Predictions

In [None]:
# Ask the project's top_biz_pred helper for n=10 results for one specific user,
# using the SVD-based predictions.
top_biz_pred(
    '--2HUmLkcNHZp0xw6AMBPg',
    df_all=user_rev_biz,
    df_mat=user_biz_collab_mat,
    df_pred=user_biz_predictions,
    n=10,
)

In [None]:
# First ten business names on this user's rows of the combined table,
# for comparison with the predictions above.
user_rev_biz.loc[user_rev_biz['user_id'] == '--2HUmLkcNHZp0xw6AMBPg', 'biz_name'].iloc[0:10]

### NMF

In [None]:
# Run the project's NMF_Mat helper on the same matrix used for SVD; the result is
# passed as the prediction table (df_pred) to top_biz_pred in the next cell.
nmf_mat = NMF_Mat(user_biz_collab_mat)

### NMF Predictions

In [None]:
# Ask the project's top_biz_pred helper for n=10 results for the same user,
# this time using the NMF-based predictions.
top_biz_pred(
    '--2HUmLkcNHZp0xw6AMBPg',
    df_all=user_rev_biz,
    df_mat=user_biz_collab_mat,
    df_pred=nmf_mat,
    n=10,
)

In [None]:
# First ten business names on this user's rows of the combined table,
# for comparison with the NMF predictions above.
user_rev_biz.loc[user_rev_biz['user_id'] == '--2HUmLkcNHZp0xw6AMBPg', 'biz_name'].iloc[0:10]

### Validation

In [None]:
# Run the project's svd_model helper on the combined table; it returns three
# values, bound here as `data` (reused for the NMF cross-validation below),
# `svd_acc` and `svd_cv`.
# NOTE(review): presumably a Surprise dataset, a hold-out accuracy and
# cross-validation results — confirm in src.models.
data, svd_acc, svd_cv = svd_model(df = user_rev_biz)

In [None]:
# Display the second value returned by svd_model.
svd_acc

In [None]:
# Display the third value returned by svd_model.
svd_cv

In [None]:
# 5-fold cross-validation of Surprise's NMF with default hyper-parameters on `data`
# (the first value returned by svd_model).
NMF_results = cross_validate(NMF(), data, cv = 5)

In [None]:
# Test RMSE entries from the NMF cross-validation results.
NMF_results['test_rmse']

In [None]:
# Run the project's surprise_bench helper on the combined table.
# NOTE(review): presumably benchmarks several Surprise algorithms and returns a
# comparison table — confirm in src.models.
bench_df = surprise_bench(df=user_rev_biz)

In [None]:
# Display the result of surprise_bench.
bench_df

### Neural Network Predictor

In [None]:
# Source: https://www.youtube.com/watch?v=4vwNkHFuZBk

In [None]:
from tensorflow.keras.layers import (Input
                                     , Embedding
                                     , Dot
                                     , Flatten
                                    )
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import History
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.callbacks import Callback

import os

In [None]:
# Run the project's NN_Model helper on the combined table with 10 factors and
# ep = 16, and unpack its eight return values.
# NOTE(review): this rebinds `user_df` and `biz_df`, replacing the raw tables
# loaded by load_data(); earlier cells that use those names will see different
# objects if re-run after this cell.
(
    user_id_dict,
    biz_id_dict,
    user_df,
    biz_df,
    X,
    X_test,
    model,
    history,
) = NN_Model(df=user_rev_biz, n_factors=10, ep=16)

In [None]:
# Hard-coded test-set loss values, 16 entries (the NN_Model call above uses ep = 16).
# NOTE(review): presumably copied from the output of an earlier training run, one
# value per epoch — they will not change if the model is retrained; confirm source.
test_loss = [13.972044689264932
             , 10.206517683802899
             , 8.2220497374806
             , 7.152508585348682
             , 6.552307478416431
             , 6.19530198773488
             , 5.988184054647287
             , 5.869163104445376
             , 5.796763119668136
             , 5.749054270822196
             , 5.717364028320827
             , 5.69821242415697
             , 5.6822034114588895
             , 5.668045834375318
             , 5.6577604609729235
             , 5.649560857059431
            ]

# Hard-coded test-set accuracy values, 16 entries, plotted against the training
# accuracy in the figure cell below.
test_accuracy = [0.0
                 , 0.0004972288734279573
                 , 0.0011036054929718375
                 , 0.001467431546188891
                 , 0.0015765792923048139
                 , 0.0016129618743434548
                 , 0.0016735995886847377
                 , 0.0015765792923048139
                 , 0.0016857271548360586
                 , 0.0016129618743434548
                 , 0.0016614720225334167
                 , 0.0015523242764174938
                 , 0.0016250894404947758
                 , 0.0015765792923048139
                 , 0.0016372170066460967
                 , 0.0015523242764174938
                ]

In [None]:
# Sanity check: print the number of entries in each hard-coded metric list, one per line.
print(len(test_loss), len(test_accuracy), sep='\n')

In [None]:
# Train vs. test curves for the neural-network model: loss on the left panel,
# accuracy on the right. Train series come from `history` (returned by NN_Model);
# test series are the hard-coded test_loss / test_accuracy lists.
fig, ax = plt.subplots(1, 2, figsize=(12,7), tight_layout=True)

# Left panel: loss per epoch.
ax[0].plot(history.history['loss'], label='Train')
ax[0].plot(test_loss, label='Test')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Error')
ax[0].set_title('MSE Error')  # fixed typo: the title previously read 'MSE Errorr'
ax[0].legend()

# Right panel: accuracy per epoch.
ax[1].plot(history.history['accuracy'], label='Train')
ax[1].plot(test_accuracy, label='Test')
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Accuracy')
ax[1].set_title('Accuracy')
ax[1].legend();
plt.savefig('png/NN_loss_acc.png')

In [None]:
# Pass the trained model and the test split to the project's NN_Results_df helper with n=15.
NN_Results_df(mod=model, xtest=X_test, n=15)

In [None]:
# Display the first value returned by NN_Model.
# NOTE(review): presumably a user-id lookup dict; it is shown in full, which may be very long.
user_id_dict

In [None]:
 # Display the second value returned by NN_Model.
 # NOTE(review): presumably a business-id lookup dict; it is shown in full, which may be very long.
 biz_id_dict

In [None]:
# Business names on the rows of X whose user_num is 1 (the same user number the
# prediction cells below score).
X[X.user_num==1].biz_name.values

In [None]:
# Score every business in biz_df for user number 1.
# The user-number array must have exactly one entry per business index passed
# alongside it, so its length is derived from biz_df instead of the previously
# hard-coded 7081, which silently breaks as soon as the data changes.
# NOTE(review): assumes the model's first input is the user number and the
# second the business index — confirm against NN_Model.
n_businesses = len(biz_df)
predictions = model.predict([np.array([1] * n_businesses)
                             , biz_df.index.values])

In [None]:
# Per-user results table for user number 1 ('Kristin'): the business table plus
# the user's name, number and the model's prediction for each business.
# Copy first — `kristin_df = biz_df` would only alias the same DataFrame, so the
# three column assignments below would also modify biz_df for every later cell.
kristin_df = biz_df.copy()
kristin_df['user_name'] = 'Kristin'
kristin_df['user_num'] = 1
kristin_df['model_pred'] = predictions
kristin_df.head()

In [None]:
# Names of the 28 businesses with the highest model_pred for this user.
kristin_df.nlargest(28, ['model_pred']).biz_name.values

### Using Texts to Model Business to Business Similarity

In [None]:
# Run the project's con_bas_biz_rec helper on the combined table with n = 5.
# NOTE(review): presumably a content-based (text-similarity) business recommender
# returning 5 results — confirm in src.models.
con_bas_biz_rec(df = user_rev_biz, n = 5)

In [None]:
# Preview the first rows of the Scottsdale subset.
user_rev_biz_scott.head()
    


In [None]:
# Keep one row per business_id from the Scottsdale subset
# (drop_duplicates keeps the first occurrence by default).
urbs_cond = user_rev_biz_scott.drop_duplicates(subset = 'business_id')

In [None]:
# business_id and biz_name of the first row in the de-duplicated table.
urbs_cond[['business_id', 'biz_name']].iloc[0]

In [None]:
# Preview the first rows of the one-row-per-business table.
urbs_cond.head()