## Sentiment Classification Yelp pipeline
Once we are happy with the model results, we can convert to sklearn pipeline to simplify deployment

In [1]:
import sys
print(sys.version)

3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:53) 
[GCC 9.4.0]


In [2]:
import sklearn
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn import metrics

from joblib import dump, load

### Load Data

In [3]:
os.makedirs('/home/jupyter/yelp', exist_ok=True)
os.makedirs('/home/jupyter/data/yelp/yelp_model', exist_ok=True)

In [4]:
directory = 'https://storage.googleapis.com/msca-bdp-data-open/yelp/'
fileName = 'yelp_train_sentiment.json'

path = directory + fileName

In [5]:
%%time

yelp = pd.read_json(path, orient='records', lines=True)
yelp.shape

CPU times: user 1.69 s, sys: 581 ms, total: 2.27 s
Wall time: 2.69 s


(255717, 3)

In [6]:
pd.set_option('display.max_colwidth', 200)

#### Interpreting results
label = 0 >> Negative Sentiment  
label = 1 >> Positive Sentiment  

In [7]:
# examine the first 5 rows
yelp.head(5)

Unnamed: 0,text,label,lang
0,"I love Deagan's. I do. I really do. The atmosphere is cozy and festive. The shrimp tacos and house fries are my standbys. The fries are sometimes good and sometimes great, and the spicy dipping sa...",1,en
1,I love the classes at this gym. Zumba and. Radio Hip Hop are my favorite. This is such a great fun and I love that it is so reasonably priced!,1,en
2,The tables and floor were dirty. I was the only customer on a Saturday nite and the person working the counter ignored me I had a corned beef sandwich. I took three bites and threw it in the trash,0,en
3,"I had an oil change at the 15515 N Scottsdale Road location. When the car was delivered to me, there were two engine warning lights on that had not been on when I drove the car in. The technicia...",0,en
4,The absolute WORST apartment complex I have ever lived in. Moved here from out of state. Hoped to find a decently priced apartment until I got myself settled in. Wow this place has been trash. Lan...,0,en


In [8]:
# define X and y
X = yelp['text']
y = yelp['label']
print(X.shape)
print(y.shape)

(255717,)
(255717,)


In [9]:
# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(191787,)
(63930,)
(191787,)
(63930,)


## Creating sklearn pipelines

### Naive Bayes Model

In [10]:
pipe_nb = make_pipeline(
    CountVectorizer(lowercase=False, stop_words='english', ngram_range=(1,3)),
    MultinomialNB()
)

In [11]:
%time pipe_nb.fit(X_train, y_train);

CPU times: user 1min 59s, sys: 2.97 s, total: 2min 2s
Wall time: 2min 2s


Pipeline(steps=[('countvectorizer',
                 CountVectorizer(lowercase=False, ngram_range=(1, 3),
                                 stop_words='english')),
                ('multinomialnb', MultinomialNB())])

In [12]:
%time y_pred = pipe_nb.predict(X_test)

CPU times: user 15.3 s, sys: 51.6 ms, total: 15.3 s
Wall time: 15.3 s


In [13]:
print(f"Test Accuracy: {metrics.accuracy_score(y_test, y_pred) * 100:.1f}%")

Test Accuracy: 94.6%


In [14]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95     32016
           1       0.97      0.92      0.94     31914

    accuracy                           0.95     63930
   macro avg       0.95      0.95      0.95     63930
weighted avg       0.95      0.95      0.95     63930



In [15]:
%time dump(pipe_nb, "/home/jupyter/data/yelp/yelp_model/nb.joblib")

CPU times: user 2min 27s, sys: 2.4 s, total: 2min 30s
Wall time: 2min 30s


['/home/jupyter/data/yelp/yelp_model/nb.joblib']

### Logistic Regression Model

In [16]:
pipe_logreg = make_pipeline(
    CountVectorizer(lowercase=False, stop_words='english', ngram_range=(1,3)),
    LogisticRegression(max_iter=1000)
)

In [17]:
%time pipe_logreg.fit(X_train, y_train)

CPU times: user 11min 54s, sys: 9min 9s, total: 21min 3s
Wall time: 8min 33s


Pipeline(steps=[('countvectorizer',
                 CountVectorizer(lowercase=False, ngram_range=(1, 3),
                                 stop_words='english')),
                ('logisticregression', LogisticRegression(max_iter=1000))])

In [18]:
%time y_pred = pipe_logreg.predict(X_test)

CPU times: user 15.2 s, sys: 86.6 ms, total: 15.3 s
Wall time: 15.3 s


In [19]:
print(f"Test Accuracy: {metrics.accuracy_score(y_test, y_pred) * 100:.1f}%")

Test Accuracy: 97.3%


In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97     32016
           1       0.97      0.97      0.97     31914

    accuracy                           0.97     63930
   macro avg       0.97      0.97      0.97     63930
weighted avg       0.97      0.97      0.97     63930



In [21]:
%time dump(pipe_logreg, "/home/jupyter/data/yelp/yelp_model/logreg.joblib")

CPU times: user 2min 26s, sys: 2.9 s, total: 2min 29s
Wall time: 2min 29s


['/home/jupyter/data/yelp/yelp_model/logreg.joblib']

### Support Vector Machine

In [22]:
pipe_svm = make_pipeline(
    CountVectorizer(lowercase=False, stop_words='english', ngram_range=(1,3)),
    SGDClassifier(max_iter=100, tol=None)
)

In [23]:
%time pipe_svm.fit(X_train, y_train)

CPU times: user 2min 25s, sys: 4.04 s, total: 2min 29s
Wall time: 2min 28s


Pipeline(steps=[('countvectorizer',
                 CountVectorizer(lowercase=False, ngram_range=(1, 3),
                                 stop_words='english')),
                ('sgdclassifier', SGDClassifier(max_iter=100, tol=None))])

In [24]:
y_pred = pipe_svm.predict(X_test)

In [25]:
# calculate accuracy of class predictions
print(metrics.accuracy_score(y_test, y_pred))

0.9734396996715157


In [26]:
# calculate precision and recall
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97     32016
           1       0.97      0.97      0.97     31914

    accuracy                           0.97     63930
   macro avg       0.97      0.97      0.97     63930
weighted avg       0.97      0.97      0.97     63930



In [27]:
%time dump(pipe_svm, "/home/jupyter/data/yelp/yelp_model/svm.joblib")

CPU times: user 2min 25s, sys: 2.57 s, total: 2min 28s
Wall time: 2min 28s


['/home/jupyter/data/yelp/yelp_model/svm.joblib']

In [28]:
!ls -l /home/jupyter/data/yelp/yelp_model/

total 1867420
-rw-r--r-- 1 root root 522995808 Oct 23 14:06 logreg.joblib
-rw-r--r-- 1 root root   7420344 Oct 22 21:26 logreg_small.joblib
-rw-r--r-- 1 root root 839186606 Oct 23 13:54 nb.joblib
-rw-r--r-- 1 root root  12203294 Oct 22 21:26 nb_small.joblib
-rw-r--r-- 1 root root 522996092 Oct 23 14:11 svm.joblib
-rw-r--r-- 1 root root   7420628 Oct 22 21:26 svm_small.joblib


In [None]:
# !gsutil -m cp -r /home/jupyter/data/yelp/yelp_model/ gs://msca-bdp-data/yelp/

In [30]:
import datetime
import pytz

datetime.datetime.now(pytz.timezone('US/Central')).strftime("%a, %d %B %Y %H:%M:%S")

'Sun, 23 October 2022 09:11:21'