## Create small sample model files
Once we are happy with the model results, we can convert to sklearn pipeline to simplify deployment

In [1]:
import sys
print(sys.version)

3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:53) 
[GCC 9.4.0]


In [2]:
import sklearn
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn import metrics

from joblib import dump, load

In [3]:
pd.set_option('display.max_colwidth', 200)

### Load Data

In [4]:
os.makedirs('/home/jupyter/yelp', exist_ok=True)
os.makedirs('/home/jupyter/data/yelp/yelp_model', exist_ok=True)

In [5]:
directory = 'https://storage.googleapis.com/msca-bdp-data-open/yelp/'
fileName = 'yelp_train_sentiment.json'

path = directory + fileName

In [6]:
%%time

yelp = pd.read_json(path, orient='records', lines=True)
yelp.shape

CPU times: user 1.69 s, sys: 645 ms, total: 2.34 s
Wall time: 3.38 s


(255717, 3)

In [7]:
yelp = yelp.sample(frac=0.01, replace=False, random_state=1).reset_index(drop=True)

In [8]:
pd.set_option('display.max_colwidth', 200)

#### Interpreting results
label = 0 >> Negative Sentiment  
label = 1 >> Positive Sentiment  

In [9]:
# examine the first 5 rows
yelp.head(5)

Unnamed: 0,text,label,lang
0,"Cute place. I wanted it to be good. Very disappointing. The chicken fried steak was actual very thin steak with gristle. The eggs, bacon and hash browns were fine. But how hard is it to mess ...",0,en
1,I SO thought this was going to be awesome as I've heard great things. Don't know what happened but it was not good. My mother asked what vegetables they had to put in the omelette- LOLLL- and she ...,0,en
2,Called them to come and give me an estimate on my two balconies that need repair. Daniel came out the 21st of Dec. and said he would send me an email with the estimate later that evening. No est...,0,en
3,"Wow. We are Cumbrae and Royal Beef fans, and these guys blow them out of the water! Chicken that tastes like chicken. Never knew most chicken is bleached in Canada. Now I know better!",1,en
4,"I drove from the North Valley to Mesa to have a birthday dinner with one of my girl friends. She picked Peir de Orleans and from what I saw the place didn't seem too bad. A couple bad reviews, but...",0,en


In [10]:
# define X and y
X = yelp['text']
y = yelp['label']
print(X.shape)
print(y.shape)

(2557,)
(2557,)


In [11]:
# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1917,)
(640,)
(1917,)
(640,)


## Creating sklearn pipelines

### Naive Bayes Model

In [12]:
pipe_nb = make_pipeline(
    CountVectorizer(lowercase=False, stop_words='english', ngram_range=(1,3)),
    MultinomialNB()
)

In [13]:
%time pipe_nb.fit(X_train, y_train);

CPU times: user 1.04 s, sys: 0 ns, total: 1.04 s
Wall time: 1.04 s


Pipeline(steps=[('countvectorizer',
                 CountVectorizer(lowercase=False, ngram_range=(1, 3),
                                 stop_words='english')),
                ('multinomialnb', MultinomialNB())])

In [14]:
%time y_pred = pipe_nb.predict(X_test)

CPU times: user 154 ms, sys: 0 ns, total: 154 ms
Wall time: 152 ms


In [15]:
print(f"Test Accuracy: {metrics.accuracy_score(y_test, y_pred) * 100:.1f}%")

Test Accuracy: 90.8%


In [16]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.94      0.91       319
           1       0.94      0.88      0.90       321

    accuracy                           0.91       640
   macro avg       0.91      0.91      0.91       640
weighted avg       0.91      0.91      0.91       640



In [17]:
%time dump(pipe_nb, "/home/jupyter/data/yelp/yelp_model/nb_small.joblib")

CPU times: user 2.29 s, sys: 29 ms, total: 2.32 s
Wall time: 2.35 s


['/home/jupyter/data/yelp/yelp_model/nb_small.joblib']

### Logistic Regression Model

In [18]:
pipe_logreg = make_pipeline(
    CountVectorizer(lowercase=False, stop_words='english', ngram_range=(1,3)),
    LogisticRegression(max_iter=1000)
)

In [19]:
%time pipe_logreg.fit(X_train, y_train)

CPU times: user 4.57 s, sys: 7.64 s, total: 12.2 s
Wall time: 2.94 s


Pipeline(steps=[('countvectorizer',
                 CountVectorizer(lowercase=False, ngram_range=(1, 3),
                                 stop_words='english')),
                ('logisticregression', LogisticRegression(max_iter=1000))])

In [20]:
%time y_pred = pipe_logreg.predict(X_test)

CPU times: user 268 ms, sys: 340 ms, total: 608 ms
Wall time: 188 ms


In [21]:
print(f"Test Accuracy: {metrics.accuracy_score(y_test, y_pred) * 100:.1f}%")

Test Accuracy: 90.6%


In [22]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.89      0.90       319
           1       0.90      0.92      0.91       321

    accuracy                           0.91       640
   macro avg       0.91      0.91      0.91       640
weighted avg       0.91      0.91      0.91       640



In [23]:
%time dump(pipe_logreg, "/home/jupyter/data/yelp/yelp_model/logreg_small.joblib")

CPU times: user 2.25 s, sys: 28.8 ms, total: 2.28 s
Wall time: 2.29 s


['/home/jupyter/data/yelp/yelp_model/logreg_small.joblib']

### Support Vector Machine

In [24]:
pipe_svm = make_pipeline(
    CountVectorizer(lowercase=False, stop_words='english', ngram_range=(1,3)),
    SGDClassifier(max_iter=100, tol=None)
)

In [25]:
%time pipe_svm.fit(X_train, y_train)

CPU times: user 1.4 s, sys: 635 ms, total: 2.03 s
Wall time: 1.24 s


Pipeline(steps=[('countvectorizer',
                 CountVectorizer(lowercase=False, ngram_range=(1, 3),
                                 stop_words='english')),
                ('sgdclassifier', SGDClassifier(max_iter=100, tol=None))])

In [26]:
y_pred = pipe_svm.predict(X_test)

In [27]:
# calculate accuracy of class predictions
print(metrics.accuracy_score(y_test, y_pred))

0.9109375


In [28]:
# calculate precision and recall
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91       319
           1       0.92      0.90      0.91       321

    accuracy                           0.91       640
   macro avg       0.91      0.91      0.91       640
weighted avg       0.91      0.91      0.91       640



In [29]:
%time dump(pipe_svm, "/home/jupyter/data/yelp/yelp_model/svm_small.joblib")

CPU times: user 2.22 s, sys: 15.9 ms, total: 2.24 s
Wall time: 2.25 s


['/home/jupyter/data/yelp/yelp_model/svm_small.joblib']

In [30]:
!ls -l /home/jupyter/data/yelp/yelp_model/

total 1874912
-rw-r--r-- 1 root root 522995808 Oct 26 13:45 logreg.joblib
-rw-r--r-- 1 root root   7420344 Oct 26 13:51 logreg_small.joblib
-rw-r--r-- 1 root root   7661465 Oct 26 13:45 model.joblib
-rw-r--r-- 1 root root 839186606 Oct 26 13:45 nb.joblib
-rw-r--r-- 1 root root  12203294 Oct 26 13:51 nb_small.joblib
-rw-r--r-- 1 root root 522996092 Oct 26 13:45 svm.joblib
-rw-r--r-- 1 root root   7420628 Oct 26 13:51 svm_small.joblib


In [31]:
# !gsutil -m cp -n /home/jupyter/data/yelp/yelp_model/*.joblib gs://msca-bdp-data/yelp/yelp_model/

In [32]:
# !gsutil -m cp -n /home/jupyter/data/yelp/yelp_model/*.joblib gs://msca-bdp-data-open/yelp/yelp_model/

In [33]:
import datetime
import pytz

datetime.datetime.now(pytz.timezone('US/Central')).strftime("%a, %d %B %Y %H:%M:%S")

'Wed, 26 October 2022 08:52:00'