# Sentiment Classification on Yelp

In [1]:
# Record the interpreter version the notebook was executed with.
import sys
print(sys.version)

3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:53) 
[GCC 9.4.0]


In [2]:
# Silence every FutureWarning for the rest of the session to keep cell output readable.
# NOTE(review): a blanket filter also hides library deprecation notices - confirm this is intended.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
import sklearn
import pandas as pd

import wordcloud
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics
import numpy as np

import eli5

### Load Data

In [4]:
# Location of the training data: base URL of the bucket folder plus the file name.
directory = 'https://storage.googleapis.com/msca-bdp-data-open/yelp/'
fileName = 'yelp_train_sentiment.json'

# The directory already ends with '/', so plain concatenation yields the full URL.
path = f"{directory}{fileName}"

In [5]:
%%time

# Read the reviews into a DataFrame; lines=True parses the file as
# newline-delimited JSON (one record per line).
yelp = pd.read_json(path, orient='records', lines=True)
# Display (rows, columns) as a quick sanity check of the load.
yelp.shape

CPU times: user 1.75 s, sys: 581 ms, total: 2.33 s
Wall time: 3.16 s


(255717, 3)

In [6]:
# Show up to 200 characters of each text cell when displaying DataFrames.
pd.options.display.max_colwidth = 200

#### Interpreting results
label = 0 >> Negative Sentiment  
label = 1 >> Positive Sentiment  

In [7]:
# Preview the first five reviews (head() returns five rows by default).
yelp.head()

Unnamed: 0,text,label,lang
0,"I love Deagan's. I do. I really do. The atmosphere is cozy and festive. The shrimp tacos and house fries are my standbys. The fries are sometimes good and sometimes great, and the spicy dipping sa...",1,en
1,I love the classes at this gym. Zumba and. Radio Hip Hop are my favorite. This is such a great fun and I love that it is so reasonably priced!,1,en
2,The tables and floor were dirty. I was the only customer on a Saturday nite and the person working the counter ignored me I had a corned beef sandwich. I took three bites and threw it in the trash,0,en
3,"I had an oil change at the 15515 N Scottsdale Road location. When the car was delivered to me, there were two engine warning lights on that had not been on when I drove the car in. The technicia...",0,en
4,The absolute WORST apartment complex I have ever lived in. Moved here from out of state. Hoped to find a decently priced apartment until I got myself settled in. Wow this place has been trash. Lan...,0,en


In [8]:
# Features are the raw review text; the target is the 0/1 sentiment label.
X, y = yelp['text'], yelp['label']

# Report the size of each series.
for name, values in (("X", X), ("y", y)):
    print(f"{name} Shape: {values.shape}")

X Shape: (255717,)
y Shape: (255717,)


In [9]:
# Hold out part of the data for testing; random_state makes the split repeatable.
split = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split

# Report how many records landed in each partition.
print(f"Training records, X_train: {X_train.shape} y_train: {y_train.shape}")
print(f"Testing records, X_test: {X_test.shape} y_test: {y_test.shape}")

Training records, X_train: (191787,) y_train: (191787,)
Testing records, X_test: (63930,) y_test: (63930,)


## Part 3: Initialize vectorizer

In [10]:
# Bag-of-n-grams features: unigrams through trigrams, English stop words removed.
# lowercase=False preserves case, so differently-cased spellings of a word
# (e.g. "Worst" and "worst") are counted as separate features.
vect = CountVectorizer(lowercase=False, stop_words='english', ngram_range=(1, 3))

## Part 4: Building and evaluating models

### Naive Bayes Model

In [11]:
# Instantiate a Multinomial Naive Bayes classifier with its default settings;
# it is trained on the n-gram count matrix produced by `vect`.
nb = MultinomialNB()

In [12]:
# Learn the vocabulary from the training text only, then fit Naive Bayes on the
# resulting counts. The reported time covers vectorization as well as model fitting.
%time nb.fit(vect.fit_transform(X_train), y_train)

CPU times: user 1min 55s, sys: 3.02 s, total: 1min 58s
Wall time: 1min 58s


MultinomialNB()

In [13]:
# Encode the held-out reviews with the vocabulary learned on the training set,
# then predict a class label for each of them.
X_test_dtm = vect.transform(X_test)
y_pred = nb.predict(X_test_dtm)

In [14]:
# Share of test reviews whose predicted label matches the true label, as a percentage.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy * 100:.1f}%")

Test Accuracy: 94.8%


In [15]:
# Per-class precision, recall, F1 and support on the test set.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.98      0.95     32217
           1       0.97      0.92      0.95     31713

    accuracy                           0.95     63930
   macro avg       0.95      0.95      0.95     63930
weighted avg       0.95      0.95      0.95     63930



In [16]:
# Confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[31430   787]
 [ 2509 29204]]


In [17]:
# Spot-check a single held-out review with the Naive Bayes model.
element = 0  # positional index of the review within X_test
clf = nb

text = X_test.iloc[element]
# The model is called on a one-item list, so the only prediction is at position 0.
# Do not index with `element`: that raises IndexError for any review but the first.
prediction = np.where(clf.predict(vect.transform([text])) < 1, "Negative", "Positive").tolist()[0]
print('Text: >>> ' + text + '\n' + 'Sentiment: >>> ' + prediction)

Text: >>> Cute place.  I wanted it to be good.  Very disappointing.  The chicken fried steak was actual very thin steak with gristle.  The eggs, bacon and hash browns were fine.  But how hard is it to mess these up?  The gravy on the steak and also biscuits is different than traditional gravy for these items. Underwhelming.  We won't come back.  But you might want to try it and see what you think.  The portions were big.  The flavor was a fail.  I suggest more pancake options.  Go with a white gravy and get rid of the poor quality steak.
The food is pricey for the quality of food you get.
Sentiment: >>> Negative


### Logistic Regression Model

In [18]:
# Instantiate a logistic regression model; max_iter=200 raises the solver's
# iteration cap above scikit-learn's default of 100.
logreg = LogisticRegression(max_iter=200)

In [19]:
# Train the model.
# fit_transform re-learns the vectorizer vocabulary from X_train before fitting,
# so the reported time covers vectorization as well as the solver.
%time logreg.fit(vect.fit_transform(X_train), y_train)

CPU times: user 11min 38s, sys: 8min 54s, total: 20min 32s
Wall time: 8min 23s


LogisticRegression(max_iter=200)

In [20]:
# Encode the held-out reviews with the fitted vectorizer, then predict their labels
# with the logistic regression model.
X_test_dtm = vect.transform(X_test)
y_pred = logreg.predict(X_test_dtm)

In [21]:
# Share of test reviews classified correctly by logistic regression, as a percentage.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy * 100:.1f}%")

Test Accuracy: 97.3%


In [22]:
# Per-class precision, recall, F1 and support for the logistic regression predictions.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.97      0.97     32217
           1       0.97      0.97      0.97     31713

    accuracy                           0.97     63930
   macro avg       0.97      0.97      0.97     63930
weighted avg       0.97      0.97      0.97     63930



In [23]:
# Confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[31393   824]
 [  900 30813]]


In [24]:
# Spot-check a single held-out review with the logistic regression model.
element = 0  # positional index of the review within X_test
clf = logreg

text = X_test.iloc[element]
# The model is called on a one-item list, so the only prediction is at position 0.
# Do not index with `element`: that raises IndexError for any review but the first.
prediction = np.where(clf.predict(vect.transform([text])) < 1, "Negative", "Positive").tolist()[0]
print('Text: >>> ' + text + '\n' + 'Sentiment: >>> ' + prediction)

Text: >>> Cute place.  I wanted it to be good.  Very disappointing.  The chicken fried steak was actual very thin steak with gristle.  The eggs, bacon and hash browns were fine.  But how hard is it to mess these up?  The gravy on the steak and also biscuits is different than traditional gravy for these items. Underwhelming.  We won't come back.  But you might want to try it and see what you think.  The portions were big.  The flavor was a fail.  I suggest more pancake options.  Go with a white gravy and get rid of the poor quality steak.
The food is pricey for the quality of food you get.
Sentiment: >>> Negative


#### Most important Features

In [25]:
# Build a table pairing every vocabulary n-gram with its logistic regression coefficient.
clf = logreg

# get_feature_names() is deprecated since scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# coef_[0] is the single row of weights of the binary model, one per feature.
coefs_with_fns = zip(feature_names, clf.coef_[0])

coefs_with_fns_df = pd.DataFrame(coefs_with_fns,
                    columns=['feature', 'coefficient'])

In [26]:
# Order the table in place from the most negative coefficient upward
# (ascending order is the default), then show the ten most negative features.
coefs_with_fns_df.sort_values('coefficient', inplace=True)
coefs_with_fns_df.iloc[:10]

Unnamed: 0,feature,coefficient
13021529,worst,-3.908755
10352525,rude,-3.242115
2585749,Worst,-3.210558
6911565,horrible,-2.859865
1071611,Horrible,-2.735237
11770178,terrible,-2.729198
5084249,disappointing,-2.694351
8204773,mediocre,-2.635744
2068904,Terrible,-2.530562
3440002,bland,-2.451377


In [27]:
# Work on a copy whose coefficients are multiplied by -10, so the most negative
# features receive the largest positive values.
top_features_df = coefs_with_fns_df.assign(
    coefficient=coefs_with_fns_df['coefficient'] * (-10)
)

# Map each feature to its rescaled value.
# NOTE(review): `data` is not used by any later cell shown here - confirm whether it is still needed.
features = top_features_df['feature'].tolist()
scaled_weights = top_features_df['coefficient'].tolist()
data = dict(zip(features, scaled_weights))

In [28]:
# Re-order the table in place from the largest coefficient downward and show
# the ten most positive features.
coefs_with_fns_df.sort_values('coefficient', ascending=False, inplace=True)
coefs_with_fns_df.iloc[:10]

Unnamed: 0,feature,coefficient
4859241,delicious,3.303973
451946,Best,3.109538
2824716,amazing,3.083276
816485,Excellent,3.076312
960012,Great,3.027862
3142648,awesome,2.986845
304205,Amazing,2.869425
1326302,Love,2.844344
390155,Awesome,2.780031
5574598,excellent,2.778517


### Support Vector Machine

In [29]:
# Instantiate a linear SVM trained with stochastic gradient descent
# (SGDClassifier's default hinge loss).
# tol=None turns off the tolerance-based stopping rule, so all 100 epochs run.
# random_state fixes the data shuffling so reruns produce the same model; the
# value matches the seed used for the train/test split.
svm = SGDClassifier(max_iter=100, tol=None, random_state=1)

In [30]:
# Train the model.
# fit_transform re-learns the vectorizer vocabulary from X_train before fitting,
# so the reported time covers vectorization as well as the SGD epochs.
%time svm.fit(vect.fit_transform(X_train), y_train)

CPU times: user 2min 25s, sys: 3.88 s, total: 2min 29s
Wall time: 2min 28s


SGDClassifier(max_iter=100, tol=None)

In [31]:
# Encode the held-out reviews with the fitted vectorizer, then predict their labels
# with the SVM.
X_test_dtm = vect.transform(X_test)
y_pred = svm.predict(X_test_dtm)

In [32]:
# calculate accuracy of class predictions, reported as a percentage in the same
# format as the Naive Bayes and logistic regression sections
print(f"Test Accuracy: {metrics.accuracy_score(y_test, y_pred) * 100:.1f}%")

0.9736117628656343


In [33]:
# Per-class precision, recall, F1 and support for the SVM predictions.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.97      0.97     32217
           1       0.97      0.97      0.97     31713

    accuracy                           0.97     63930
   macro avg       0.97      0.97      0.97     63930
weighted avg       0.97      0.97      0.97     63930



In [34]:
# Confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[31396   821]
 [  866 30847]]


In [35]:
# Spot-check a single held-out review with the SVM.
element = 0  # positional index of the review within X_test
clf = svm

text = X_test.iloc[element]
# The model is called on a one-item list, so the only prediction is at position 0.
# Do not index with `element`: that raises IndexError for any review but the first.
prediction = np.where(clf.predict(vect.transform([text])) < 1, "Negative", "Positive").tolist()[0]
print('Text: >>> ' + text + '\n' + 'Sentiment: >>> ' + prediction)

Text: >>> Cute place.  I wanted it to be good.  Very disappointing.  The chicken fried steak was actual very thin steak with gristle.  The eggs, bacon and hash browns were fine.  But how hard is it to mess these up?  The gravy on the steak and also biscuits is different than traditional gravy for these items. Underwhelming.  We won't come back.  But you might want to try it and see what you think.  The portions were big.  The flavor was a fail.  I suggest more pancake options.  Go with a white gravy and get rid of the poor quality steak.
The food is pricey for the quality of food you get.
Sentiment: >>> Negative


## Part 6.  Troubleshooting results
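Focus on the predictions that the logistic regression model makes with low confidence, i.e. test reviews where neither class probability reaches 0.6.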

In [36]:
# Turn the test text and labels into single-column frames numbered 0..n-1, so they
# line up row-by-row with the prediction frames built from plain arrays.
X_test_df = X_test.to_frame().reset_index(drop=True)
y_test_df = y_test.to_frame().reset_index(drop=True)

#### Scoring the test set and appending both the predicted class and the class probabilities

In [37]:
# Vectorize the test reviews once and reuse the matrix for both calls instead of
# transforming the whole test set twice.
X_test_dtm = vect.transform(X_test)

# Class probabilities (one column per class) and hard labels from logistic regression.
# y_pred previously held the SVM predictions, so it is recomputed here.
y_pred_prob = logreg.predict_proba(X_test_dtm)
y_pred = logreg.predict(X_test_dtm)

In [38]:
# Wrap the probability matrix and the predicted labels in named DataFrames.
y_pred_prob_df = pd.DataFrame(y_pred_prob, columns=['0-prob', '1-prob'])
y_pred_df = pd.DataFrame(y_pred, columns=['predicted'])

#### Combining the results and focusing on low confidence levels

In [39]:
# Assemble one row per test review: text, true label, predicted label, class probabilities.
# All four frames share the same 0..n-1 index, so the index joins align row-for-row.
results_df = (
    X_test_df
    .join(y_test_df)
    .join(y_pred_df)
    .join(y_pred_prob_df)
)

In [40]:
# Keep only the reviews where neither class probability reaches 0.6,
# i.e. the model is close to undecided.
negative_unsure = results_df['0-prob'].lt(0.6)
positive_unsure = results_df['1-prob'].lt(0.6)
results_review_df = results_df.loc[negative_unsure & positive_unsure]
results_review_df.shape

(692, 5)

In [41]:
# Widen text columns to 2000 characters so the reviews below are shown in full.
pd.options.display.max_colwidth = 2000

#### Interpreting results
label = 0 >> Negative Sentiment  
label = 1 >> Positive Sentiment  

In [42]:
# Show the first ten low-confidence reviews for manual inspection.
results_review_df.iloc[:10]

Unnamed: 0,text,label,predicted,0-prob,1-prob
160,"While attending CES, I was in the mood for BBQ. I had tried the Niagara Falls & Philadelphia locations of this restaurant. So, I decided to check it out. This restaurant was right next to McCormick and Schmick Steakhouse but compared to that restaurant, it was packed. I went there lunch time. The food was exquisite. The meats, as always, were cooked to perfection. The salad bar has many tempting items but don't fill up on those because BBQ is the main attraction of this place. As long as you show your ""green"" card on your table the waiters will keep bringing you BBQ meats at your table. My favourites include Lamb Chops, Fillet Minion, and Rib Eye. Lunch buffet is roughly $35/person. Deserts are extra.",1,1,0.455603,0.544397
315,6.99 for 3 chicken wings and their left over chips....\nSighs\n\nQuality has gone down dramatically. Loved this place,0,1,0.454619,0.545381
548,"I was referred to see Dr. Bradley Gettleman by my general dentist. I had a cracked tooth and Dr Gettleman established the crack was too far to save the tooth and it would need to be removed. He proceeded to do a root canal on the tooth. I feel I was duped. The Dr did not inquire on my pain level. He should have been my advocate and made a decision not to do a root canal on a tooth that needed to be removed. I'm out of $300 which I was informed after the procedure that my dental insurance would not cover the procedure, described as incomplete endo tx, inoperable. I now have to pay again for an oral surgeon to remove the tooth. I feel he could have educated me in order for me to make a decision for the root canal or not.",0,1,0.424458,0.575542
780,"Gel peeled off the next day after my first visit. \nSecond visit: my pedicure was bumpy, two toenails diagonal. I was very bothered by how my pedicure looked so I got it redone somewhere else and it looked amazing!",0,0,0.562812,0.437188
1004,"For every person with negative points in regards to Dads being to oily, greesey, over loaded with calories... um duh. When I first pulled up the menu online, my first thought wasn't ""God, I hope this isn't too unhealthy for me"" it's GRILLED CHEESE. Nothing about the name of the sandwhich indicates this is great for my health food fanatic self... Nope, I was delighted with the taste, the selection, and was impressed by the staff. I recommend this place to anyone and everyone that I know, and have been there now 4x since I discovered that it was right down the street from where I live, and work... AWESOME!",1,1,0.484888,0.515112
1065,I have used Maid Pro for several years. The service is reliable and very convenient. I received a schedule for six months of cleaning on my time frame. I have it done bi-weekly. The cleaners are trustworthy and listen to my requests. I have and will continue to recommend their service.,1,0,0.525126,0.474874
1082,I'd just like to know who just throws a breakfast burrito half wrapped in the bag. Ohhhh yeah that's right this Burger King. Also you may want to tell the drivethru chick to smile.,0,1,0.403954,0.596046
1211,"A wonderful bookstore just south of Bloor on Spadina. Crammed full of books at great prices with a helpful owner. This is the sort of store that you can spend an hour in just discovering new finds and old friends. It's a real shame that U of T is intent on putting up a student residence on this site; a university destroying a bookstore, ironic and sad. Ten Editions is still open and has a sale going ($4 a book). When it's finally closed down it will be sorely missed.",1,0,0.503704,0.496296
1236,"Everyone loves Chipotle. If you're on a diet, use the one on Cochran road. They have the smallest portions of any of the chipotles around. Disappointing.",0,1,0.452226,0.547774
1357,"I have been to places that charge $100.00 for the kind of clean job they do. They are the best. I come from Chardon to get my car cleaned. They won't do your car if pets are allowed in your car. The hair clogs their vacuums and they would have to close and service them. This is financially prohibitive. I recomend you tip generously. The lady who owns the car wash works as hard as the young people and she is in her 70s. They don't just clean your car, they detail it.",1,1,0.40756,0.59244


## Part 7.  Visualizing predictions and most important features with eli5

#### eli5 + Logistic Regression.  All features

In [43]:
# Show the 20 strongest logistic regression weights, with feature names taken
# from the fitted vectorizer.
clf = logreg
eli5.show_weights(
    clf,
    vec=vect,
    top=20,
)

Weight?,Feature
+3.304,delicious
+3.110,Best
+3.083,amazing
+3.076,Excellent
+3.028,Great
+2.987,awesome
+2.869,Amazing
+2.844,Love
+2.780,Awesome
+2.779,excellent


#### eli5 + Logistic Regression.  Individual predictions

In [44]:
clf = logreg
text = X_test.iloc[0]
targets = 1 # Target for positive sentiment to align color of positive = green, negative = red 

eli5.show_prediction(clf, text, vec=vect, target_names=['Negative', 'Positive'], targets=[targets], top=20)

Contribution?,Feature
… 39 more positive …,… 39 more positive …
… 50 more negative …,… 50 more negative …
-0.470,<BIAS>
-6.751,Highlighted in text (sum)


In [45]:
clf = logreg
text = X_test.iloc[1]
targets = 1 # Target for positive sentiment to align color of positive = green, negative = red 

eli5.show_prediction(clf, text, vec=vect, target_names=['Negative', 'Positive'], targets=[targets], top=20)

Contribution?,Feature
… 25 more positive …,… 25 more positive …
… 55 more negative …,… 55 more negative …
-0.470,<BIAS>
-7.831,Highlighted in text (sum)


In [46]:
clf = logreg
text = X_test.iloc[2]
targets = 1 # Target for positive sentiment to align color of positive = green, negative = red 

eli5.show_prediction(clf, text, vec=vect, target_names=['Negative', 'Positive'], targets=[targets], top=20)

Contribution?,Feature
… 18 more positive …,… 18 more positive …
… 38 more negative …,… 38 more negative …
-0.470,<BIAS>
-6.448,Highlighted in text (sum)


In [47]:
clf = logreg
text = X_test.iloc[3]
targets = 1 # Target for positive sentiment to align color of positive = green, negative = red 

eli5.show_prediction(clf, text, vec=vect, target_names=['Negative', 'Positive'], targets=[targets], top=20)

Contribution?,Feature
… 6 more positive …,… 6 more positive …
… 8 more negative …,… 8 more negative …
-0.058,Highlighted in text (sum)
-0.470,<BIAS>


#### eli5 + SVM.  All features

In [48]:
# Show the 20 strongest SVM weights, with feature names taken from the fitted vectorizer.
clf = svm
eli5.show_weights(
    clf,
    vec=vect,
    top=20,
)

Weight?,Feature
+0.778,Best
+0.752,Excellent
+0.750,delicious
+0.724,Awesome
+0.717,Great
+0.710,Love
+0.705,awesome
+0.705,amazing
+0.696,Amazing
+0.666,Delicious


#### eli5 + SVM.  Individual predictions

In [49]:
clf = svm
text = X_test.iloc[0]
targets = 1 # Target for positive sentiment to align color of positive = green, negative = red 

eli5.show_prediction(clf, text, vec=vect, target_names=['Negative', 'Positive'], targets=[targets], top=20)

Contribution?,Feature
… 39 more positive …,… 39 more positive …
… 36 more negative …,… 36 more negative …
-0.119,<BIAS>
-1.723,Highlighted in text (sum)


In [50]:
clf = svm
text = X_test.iloc[1]
targets = 1 # Target for positive sentiment to align color of positive = green, negative = red 

eli5.show_prediction(clf, text, vec=vect, target_names=['Negative', 'Positive'], targets=[targets], top=20)

Contribution?,Feature
… 30 more positive …,… 30 more positive …
… 36 more negative …,… 36 more negative …
-0.119,<BIAS>
-2.075,Highlighted in text (sum)


In [51]:
clf = svm
text = X_test.iloc[2]
targets = 1 # Target for positive sentiment to align color of positive = green, negative = red 

eli5.show_prediction(clf, text, vec=vect, target_names=['Negative', 'Positive'], targets=[targets], top=20)

Contribution?,Feature
… 20 more positive …,… 20 more positive …
… 26 more negative …,… 26 more negative …
-0.119,<BIAS>
-1.575,Highlighted in text (sum)


In [52]:
clf = svm
text = X_test.iloc[3]
targets = 1 # Target for positive sentiment to align color of positive = green, negative = red 

eli5.show_prediction(clf, text, vec=vect, target_names=['Negative', 'Positive'], targets=[targets], top=20)

Contribution?,Feature
+0.012,Highlighted in text (sum)
… 4 more positive …,… 4 more positive …
… 5 more negative …,… 5 more negative …
-0.119,<BIAS>


In [53]:
# Stamp the notebook with the time this cell was executed, expressed in the
# 'US/Central' time zone and formatted like "Sat, 22 October 2022 16:03:42".
import datetime
import pytz

datetime.datetime.now(pytz.timezone('US/Central')).strftime("%a, %d %B %Y %H:%M:%S")

'Sat, 22 October 2022 16:03:42'