# Sentiment Analysis with Naive Bayes

In [120]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [204]:
yelp_path = 'textdata/yelp_labelled.txt'
yelp_raw = pd.read_csv(yelp_path, delimiter='\t', header=None)
yelp_raw.columns = ['review', 'positive']
yelp_raw.head()

X_train, X_test, y_train, y_test = train_test_split(yelp_raw['review'], yelp_raw['positive'], test_size = 0.2, random_state=20)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

X_train_reset = X_train
X_test_reset = X_test

X_train.shape

(800, 1)

In [166]:
words = ['good', 'great', 'bad', 'awful', 'not', 'angry', 'happy', 'glad', 'thrilled', 'sucks', 'recommend', 'wow', 'love', 'hate', 'nasty', 'service', 'food', 'atmosphere', 'loud', 'delicious', 'tasty', 'gross']
for word in words:
    X_train[str(word)] = X_train['review'].str.contains(str(word), case=False)
    X_test[str(word)] = X_test['review'].str.contains(str(word), case=False)
    
def print_results(y_test, y_pred, y_train, y_train_pred):
    cm = confusion_matrix(y_test, y_pred)
    accuracy = (y_test == y_pred).sum()/len(y_test)
    cm_train = confusion_matrix(y_train, y_train_pred)
    accuracy_train = (y_train == y_train_pred).sum()/len(y_train)
    print('---------------     Results     ---------------\n')
    print('---------------  Training Data  ---------------\n')
    print('Accuracy: \t{}%'.format(round(accuracy_train*100,1)))
    print('Sensitivity: \t{}%'.format(round(cm_train[1][1]*100/sum(cm_train[1]),1)))
    print('Specificity: \t{}%\n'.format(round(cm_train[0][0]*100/sum(cm_train[0]),1)))
    print('---------------   Out of Sample ---------------\n')
    print('Accuracy: \t{}%'.format(round(accuracy*100,1)))
    print('Sensitivity: \t{}%'.format(round(cm[1][1]*100/sum(cm[1]),1)))
    print('Specificity: \t{}%'.format(round(cm[0][0]*100/sum(cm[0]),1)))
    print('\n----------------Confusion Matrix---------------\n')
    print('                      Predicted Value')
    print('\t\t |  Positive | Negative  |')
    print('\t\t  -----------------------')
    print('Actual \t|Positive|\t{}   |    {}\t |'.format(cm[0][0], cm[0][1]))
    print('Value    --------------------------------')
    print('\t|Negative|\t{}   |    {}\t |'.format(cm[1][0], cm[1][1]))
    print('\n')
    

In [154]:
bnb = BernoulliNB()

bnb.fit(X_train[words], y_train)

y_pred = bnb.predict(X_test[words])

y_pred_train = bnb.predict(X_train[words])

print_results(y_test, y_pred, y_train, y_pred_train)

---------------     Results     ---------------

---------------  Training Data  ---------------

Accuracy: 	66.5%
Sensitivity: 	43.8%
Specificity: 	89.4%

---------------   Out of Sample ---------------

Accuracy: 	71.0%
Sensitivity: 	48.0%
Specificity: 	93.1%

----------------Confusion Matrix---------------

                      Predicted Value
		 |  Positive | Negative  |
		  -----------------------
Actual 	|Positive|	95   |    7	 |
Value    --------------------------------
	|Negative|	51   |    47	 |





Our first model is accurate 71% of the time on a test sample.

In [207]:
allwords = []

for review in X_train.review:
    review = review.lower()
    review = review.replace('.','').replace('!','').replace(',','').replace('?','').replace(':','').replace(';','')
    review = review.replace('"','').replace("'",'').replace('-','').replace('(','').replace(')','').replace('&','and')
    review = review.replace('$','').replace("%",'').replace('*','').replace('+','').replace('/','')
    allwords += review.split()


In [208]:
commonwords = list(pd.Series(allwords).value_counts().index[:100])

In [209]:
X_train = pd.DataFrame(X_train['review'])
X_test = pd.DataFrame(X_test['review'])

In [210]:
for word in commonwords:
    X_train[str(word)] = X_train['review'].str.contains(str(word), case=False)
    X_test[str(word)] = X_test['review'].str.contains(str(word), case=False)

bnb = BernoulliNB()

bnb.fit(X_train[commonwords], y_train)

y_pred = bnb.predict(X_test[commonwords])

y_pred_train = bnb.predict(X_train[commonwords])

print_results(y_test, y_pred, y_train, y_pred_train)

---------------     Results     ---------------

---------------  Training Data  ---------------

Accuracy: 	75.2%
Sensitivity: 	78.1%
Specificity: 	72.4%

---------------   Out of Sample ---------------

Accuracy: 	66.5%
Sensitivity: 	71.4%
Specificity: 	61.8%

----------------Confusion Matrix---------------

                      Predicted Value
		 |  Positive | Negative  |
		  -----------------------
Actual 	|Positive|	63   |    39	 |
Value    --------------------------------
	|Negative|	28   |    70	 |




Our model using the top 100 most common review words was slightly less accurate, at 66.5%. We got better at identifying negative reviews, but lost some of our accuracy we had for predicting positive reviews.

In [211]:
X_train = pd.DataFrame(X_train['review'])
X_test = pd.DataFrame(X_test['review'])

In [212]:
commonwords = list(pd.Series(allwords).value_counts().index[:400])

for word in commonwords:
    X_train[str(word)] = X_train['review'].str.contains(str(word), case=False)
    X_test[str(word)] = X_test['review'].str.contains(str(word), case=False)


bnb = BernoulliNB()

bnb.fit(X_train[commonwords], y_train)

y_pred = bnb.predict(X_test[commonwords])

y_pred_train = bnb.predict(X_train[commonwords])

print_results(y_test, y_pred, y_train, y_pred_train)

---------------     Results     ---------------

---------------  Training Data  ---------------

Accuracy: 	84.2%
Sensitivity: 	88.1%
Specificity: 	80.4%

---------------   Out of Sample ---------------

Accuracy: 	73.0%
Sensitivity: 	81.6%
Specificity: 	64.7%

----------------Confusion Matrix---------------

                      Predicted Value
		 |  Positive | Negative  |
		  -----------------------
Actual 	|Positive|	66   |    36	 |
Value    --------------------------------
	|Negative|	18   |    80	 |




We increased our ability to detect negative reviews, without affecting our ability to detect positive ones. We have a fairly steep drop in accuracy and the associated measurements as we go from training data to testing. This should alert us to the possibility that our model is overfitting.

In [216]:
# Having a column named 'review' was giving us trouble. When a review with the word "review" was encountered,
# the column containing our reviews would be replaced with a boolean type category. The columns are renamed
# "og_review," from here onward to resolve this issue

X_train.rename(columns={'review': 'og_review'}, inplace=True)
X_test.rename(columns={'review': 'og_review'}, inplace=True)


X_train = pd.DataFrame(X_train['og_review'])
X_test = pd.DataFrame(X_test['og_review'])

In [None]:
# Find which words are most correlated with either sentiment

In [217]:
print(X_train.shape)

X_train.rename(columns={'review': 'og_review'}, inplace=True)

errors = []

for word in np.unique(allwords):
    try:
        X_train[str(word)] = X_train['og_review'].str.contains(str(word), case=False)
    except:
        errors.append(word)
    
print(yelp_raw.shape)

(800, 1)
(1000, 2081)


In [236]:
corr_df = pd.concat([y_train, X_train], axis=1)
corr_list = corr_df.corr().iloc[0,:]


Index(['positive', '1', '10', '100', '1199', '12', '15', '15lb', '2', '20'], dtype='object')

In [248]:
corr_list.nlargest(20)[1:]

great        0.257740
good         0.161075
del          0.160331
nice         0.143316
eat          0.140505
delicious    0.137543
amazing      0.132095
az           0.131020
friendly     0.131020
perfect      0.117486
love         0.117446
fantastic    0.111948
friend       0.110485
fan          0.108110
ice          0.106928
awesome      0.106136
excellent    0.106136
loved        0.106136
an           0.105550
Name: positive, dtype: float64

In [249]:
corr_list.nsmallest(20)

no         -0.271282
not        -0.270048
do         -0.150424
bad        -0.134129
her        -0.133954
minutes    -0.129168
or         -0.128560
too        -0.118099
got        -0.115063
min        -0.113706
terrible   -0.113073
3          -0.109395
being      -0.109395
much       -0.109395
other      -0.108663
ok         -0.107893
nut        -0.107500
bus        -0.107202
would      -0.106261
worst      -0.103455
Name: positive, dtype: float64