## Sentiment Classification Yelp deployment

In [1]:
import sys
print(sys.version)

3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:53) 
[GCC 9.4.0]


In [2]:
import os
import requests
import pprint
from joblib import dump, load
import numpy as np
import pandas as pd

In [3]:
pd.set_option('display.max_colwidth', 500)

#### Copy files to local FS from GCP bucket

In [4]:
def get_gcs_data(bucket_name, folder_name, file_name, path_local):
    """Download one publicly readable GCS object into a local directory.

    The object is fetched over HTTPS from
    ``https://storage.googleapis.com/<bucket_name>/<folder_name>/<file_name>``
    and saved as ``<path_local>/<file_name>``.

    Parameters
    ----------
    bucket_name : str
        Name of the bucket.
    folder_name : str
        Object prefix ("folder") inside the bucket, without leading or
        trailing slash.
    file_name : str
        Name of the object; also used as the local file name.
    path_local : str
        Existing local directory to write into.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Nothing is written to
        ``path_local`` in that case, so an error page can never be mistaken
        for a model file.
    """
    url = 'https://storage.googleapis.com/' + bucket_name + '/' + folder_name + '/' + file_name
    target_path = os.path.join(path_local, file_name)
    # Download next to the final location and rename on success, so an
    # interrupted transfer never leaves a truncated file under the real name.
    part_path = target_path + '.part'

    try:
        # stream=True keeps large objects out of memory; the timeout stops a
        # stalled connection from hanging the notebook forever.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(part_path, 'wb') as out_file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    out_file.write(chunk)
        os.replace(part_path, target_path)
    finally:
        # Only present if the transfer failed part-way through.
        if os.path.exists(part_path):
            os.remove(part_path)

In [5]:
# Local directories used by this notebook.
path_yelp_data = '/home/jupyter/yelp'
path_yelp_model = '/home/jupyter/data/yelp/yelp_model/'

# Create both up front; exist_ok=True keeps re-runs of this cell from failing.
for local_dir in (path_yelp_data, path_yelp_model):
    os.makedirs(local_dir, exist_ok=True)

In [6]:
# Bucket and prefix that hold the serialized Yelp sentiment models.
bucket_name = 'msca-bdp-data-open'
folder_name = 'yelp/yelp_model'

# Files to fetch, in download order.
file_name = [
    'model.joblib',
    'nb.joblib',
    'logreg.joblib',
    'svm.joblib',
    'nb_small.joblib',
    'logreg_small.joblib',
    'svm_small.joblib',
]
path_local = path_yelp_model

# Make sure the destination exists before anything is written into it.
os.makedirs(path_local, exist_ok=True)

for file in file_name:
    get_gcs_data(bucket_name, folder_name, file, path_local)
    print(f'Downloaded: {file}')

Downloaded: model.joblib
Downloaded: nb.joblib
Downloaded: logreg.joblib
Downloaded: svm.joblib
Downloaded: nb_small.joblib
Downloaded: logreg_small.joblib
Downloaded: svm_small.joblib


In [7]:
!ls -l /home/jupyter/data/yelp/yelp_model/

total 1874904
-rw-r--r-- 1 root root 522995808 Oct 26 15:10 logreg.joblib
-rw-r--r-- 1 root root   7420344 Oct 26 15:10 logreg_small.joblib
-rw-r--r-- 1 root root   7661465 Oct 26 15:10 model.joblib
-rw-r--r-- 1 root root 839186606 Oct 26 15:10 nb.joblib
-rw-r--r-- 1 root root  12203294 Oct 26 15:10 nb_small.joblib
-rw-r--r-- 1 root root 522996092 Oct 26 15:10 svm.joblib
-rw-r--r-- 1 root root   7420628 Oct 26 15:10 svm_small.joblib


#### Load Models

In [8]:
# Naive Bayes
# %time clf = load(os.path.join(path_yelp_model, 'nb.joblib'))

In [9]:
# Logistic Regression -- small model (logreg_small.joblib)
# NOTE: joblib.load is pickle-based, so only load files from a source you trust.
%time clf_small = load(os.path.join(path_yelp_model, 'logreg_small.joblib'))

CPU times: user 1.99 s, sys: 747 ms, total: 2.73 s
Wall time: 1.92 s


In [10]:
# Logistic Regression -- large model (logreg.joblib)
# NOTE: joblib.load is pickle-based, so only load files from a source you trust.
%time clf_large = load(os.path.join(path_yelp_model, 'logreg.joblib'))

CPU times: user 1min 34s, sys: 2.82 s, total: 1min 37s
Wall time: 1min 36s


In [11]:
# Support Vector Machine
# %time clf = load(os.path.join(path_yelp_model, 'svm.joblib'))

#### Make a single prediction -- small model

In [12]:
content = ['The restaurant was awful']

In [13]:
# Predict a label for every text in `content` with the small model; %time reports the cost of the call.
%time y_pred = clf_small.predict(content)
# Predictions below 1 are labelled 'Negative', everything else 'Positive'.
y_pred_class = np.where(y_pred < 1, 'Negative', 'Positive')

# One row per text; this notebook reads column 0 as negative and column 1 as positive.
y_pred_prob = clf_small.predict_proba(content)

CPU times: user 1.52 ms, sys: 132 µs, total: 1.65 ms
Wall time: 1.54 ms


In [14]:
print(f'Text: {content[0]} \
    \nSentiment: {y_pred_class[0]} \
    \nProbability Negative: = {y_pred_prob[0][0]}\
    \nProbability Positive = {y_pred_prob[0][1]}\
    \n\n')

Text: The restaurant was awful     
Sentiment: Positive     
Probability Negative: = 0.44329372484011365    
Probability Positive = 0.5567062751598864    




#### Make a single prediction -- large model

In [15]:
content = ['The restaurant was awful']

In [16]:
# Predict a label for every text in `content` with the large model; %time reports the cost of the call.
%time y_pred = clf_large.predict(content)
# Predictions below 1 are labelled 'Negative', everything else 'Positive'.
y_pred_class = np.where(y_pred < 1, 'Negative', 'Positive')

# One row per text; this notebook reads column 0 as negative and column 1 as positive.
y_pred_prob = clf_large.predict_proba(content)

CPU times: user 855 µs, sys: 75 µs, total: 930 µs
Wall time: 860 µs


In [17]:
print(f'Text: {content[0]} \
    \nSentiment: {y_pred_class[0]} \
    \nProbability Negative: = {y_pred_prob[0][0]}\
    \nProbability Positive = {y_pred_prob[0][1]}\
    \n\n')

Text: The restaurant was awful     
Sentiment: Negative     
Probability Negative: = 0.9645534022764568    
Probability Positive = 0.03544659772354324    




#### Multiple predictions -- small model

In [18]:
content = ['The restaurant was awful', 'This is a fantastic restaurant with delicious food',\
    'Best pizza in town']

In [19]:
# Predict a label for every text in `content` with the small model; %time reports the cost of the call.
%time y_pred = clf_small.predict(content)
# Predictions below 1 are labelled 'Negative', everything else 'Positive'.
y_pred_class = np.where(y_pred < 1, 'Negative', 'Positive')

# One row per text; this notebook reads column 0 as negative and column 1 as positive.
y_pred_prob = clf_small.predict_proba(content)

CPU times: user 538 µs, sys: 47 µs, total: 585 µs
Wall time: 455 µs


In [20]:
# for i, text in enumerate(x, start=0):
#     print(f'Text: {text} \
#     \nSentiment: {y_pred_class[i]} \
#     \nProbability Negative: = {y_pred_prob[i][0]}\
#     \nProbability Positive = {y_pred_prob[i][1]}\
#     \n\n')

In [21]:
# Turn the predictions into plain strings, one entry per input text.
row_ids = range(len(content))
sentiment = [str(y_pred_class[k]) for k in row_ids]
prob_neg = [str(y_pred_prob[k][0]) for k in row_ids]
prob_pos = [str(y_pred_prob[k][1]) for k in row_ids]

# One (text, sentiment, negative prob, positive prob) tuple per input text.
sent_list = list(zip(content, sentiment, prob_neg, prob_pos))

In [22]:
sent_df_small = pd.DataFrame(sent_list,columns=['Text', 'Sentiment', 'Prob_Neg', 'Prob_Pos'])

In [23]:
sent_df_small

Unnamed: 0,Text,Sentiment,Prob_Neg,Prob_Pos
0,The restaurant was awful,Positive,0.4432937248401136,0.5567062751598864
1,This is a fantastic restaurant with delicious food,Positive,0.1982245375088199,0.80177546249118
2,Best pizza in town,Positive,0.2879813130367562,0.7120186869632438


In [24]:
sent_list_dict = sent_df_small.to_dict('records')

In [25]:
type(sent_list_dict)

list

In [26]:
sent_list_dict

[{'Text': 'The restaurant was awful',
  'Sentiment': 'Positive',
  'Prob_Neg': '0.44329372484011365',
  'Prob_Pos': '0.5567062751598864'},
 {'Text': 'This is a fantastic restaurant with delicious food',
  'Sentiment': 'Positive',
  'Prob_Neg': '0.19822453750881996',
  'Prob_Pos': '0.80177546249118'},
 {'Text': 'Best pizza in town',
  'Sentiment': 'Positive',
  'Prob_Neg': '0.2879813130367562',
  'Prob_Pos': '0.7120186869632438'}]

#### Multiple predictions -- large model

In [27]:
content = ['The restaurant was awful', 'This is a fantastic restaurant with delicious food',\
    'Best pizza in town']

In [28]:
# Predict a label for every text in `content` with the large model; %time reports the cost of the call.
%time y_pred = clf_large.predict(content)
# Predictions below 1 are labelled 'Negative', everything else 'Positive'.
y_pred_class = np.where(y_pred < 1, 'Negative', 'Positive')

# One row per text; this notebook reads column 0 as negative and column 1 as positive.
y_pred_prob = clf_large.predict_proba(content)

CPU times: user 0 ns, sys: 584 µs, total: 584 µs
Wall time: 506 µs


In [29]:
# for i, text in enumerate(x, start=0):
#     print(f'Text: {text} \
#     \nSentiment: {y_pred_class[i]} \
#     \nProbability Negative: = {y_pred_prob[i][0]}\
#     \nProbability Positive = {y_pred_prob[i][1]}\
#     \n\n')

In [30]:
# Turn the predictions into plain strings, one entry per input text.
row_ids = range(len(content))
sentiment = [str(y_pred_class[k]) for k in row_ids]
prob_neg = [str(y_pred_prob[k][0]) for k in row_ids]
prob_pos = [str(y_pred_prob[k][1]) for k in row_ids]

# One (text, sentiment, negative prob, positive prob) tuple per input text.
sent_list = list(zip(content, sentiment, prob_neg, prob_pos))

In [31]:
sent_df_large = pd.DataFrame(sent_list,columns=['Text', 'Sentiment', 'Prob_Neg', 'Prob_Pos'])

In [32]:
sent_df_large

Unnamed: 0,Text,Sentiment,Prob_Neg,Prob_Pos
0,The restaurant was awful,Negative,0.9645534022764568,0.0354465977235432
1,This is a fantastic restaurant with delicious food,Positive,0.0107433593615976,0.9892566406384024
2,Best pizza in town,Positive,0.0259150451105344,0.9740849548894656


In [33]:
sent_list_dict = sent_df_large.to_dict('records')
# type(sent_list_dict)
pprint.pprint(sent_list_dict)

[{'Prob_Neg': '0.9645534022764568',
  'Prob_Pos': '0.03544659772354324',
  'Sentiment': 'Negative',
  'Text': 'The restaurant was awful'},
 {'Prob_Neg': '0.010743359361597649',
  'Prob_Pos': '0.9892566406384024',
  'Sentiment': 'Positive',
  'Text': 'This is a fantastic restaurant with delicious food'},
 {'Prob_Neg': '0.02591504511053444',
  'Prob_Pos': '0.9740849548894656',
  'Sentiment': 'Positive',
  'Text': 'Best pizza in town'}]


#### Compare the confidence ratings between Small and Large models

In [34]:
sent_df_small

Unnamed: 0,Text,Sentiment,Prob_Neg,Prob_Pos
0,The restaurant was awful,Positive,0.4432937248401136,0.5567062751598864
1,This is a fantastic restaurant with delicious food,Positive,0.1982245375088199,0.80177546249118
2,Best pizza in town,Positive,0.2879813130367562,0.7120186869632438


In [35]:
sent_df_large

Unnamed: 0,Text,Sentiment,Prob_Neg,Prob_Pos
0,The restaurant was awful,Negative,0.9645534022764568,0.0354465977235432
1,This is a fantastic restaurant with delicious food,Positive,0.0107433593615976,0.9892566406384024
2,Best pizza in town,Positive,0.0259150451105344,0.9740849548894656


In [36]:
import datetime
import pytz

datetime.datetime.now(pytz.timezone('US/Central')).strftime("%a, %d %B %Y %H:%M:%S")

'Wed, 26 October 2022 10:12:33'