# Sentiment Analysis with Naive Bayes

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
# Load Yelp review data
yelp_path = 'textdata/yelp_labelled.txt'
yelp_raw = pd.read_csv(yelp_path, delimiter='\t', header=None)
yelp_raw.columns = ['review', 'positive']

# Create train and test data sets
X_train, X_test, y_train, y_test = train_test_split(yelp_raw['review'], yelp_raw['positive'], test_size = 0.2, random_state=20)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)



(800, 1)

Our first model tests 22 words that I believe would be indicative of a positive or negative review.

In [3]:
# Specify words
words = ['good', 'great', 'bad', 'awful', 'not', 'angry', 'happy', 'glad', 'thrilled', 'sucks', 'recommend', 'wow', 'love', 'hate', 'nasty', 'service', 'food', 'atmosphere', 'loud', 'delicious', 'tasty', 'gross']

# Create dummies indicating if a review contains a specified word
for word in words:
    X_train[str(word)] = X_train['review'].str.contains(str(word), case=False)
    X_test[str(word)] = X_test['review'].str.contains(str(word), case=False)

# Create function printing results of a model's prediction
def print_results(y_test, y_pred, y_train, y_train_pred):
    
    # Compute measurements for test data
    cm = confusion_matrix(y_test, y_pred)
    accuracy = (y_test == y_pred).sum()/len(y_test)
    
    # Computer measurements for training data
    cm_train = confusion_matrix(y_train, y_train_pred)
    accuracy_train = (y_train == y_train_pred).sum()/len(y_train)
    
    # Print Accuracy and associated meaasurements for training data
    print('---------------     Results     ---------------\n')
    print('---------------  Training Data  ---------------\n')
    print('Accuracy: \t{}%'.format(round(accuracy_train*100,1)))
    print('Sensitivity: \t{}%'.format(round(cm_train[1][1]*100/sum(cm_train[1]),1)))
    print('Specificity: \t{}%\n'.format(round(cm_train[0][0]*100/sum(cm_train[0]),1)))
    
    # Print Accuracy and associated meaasurements for test sample predictions
    print('---------------   Out of Sample ---------------\n')
    print('Accuracy: \t{}%'.format(round(accuracy*100,1)))
    print('Sensitivity: \t{}%'.format(round(cm[1][1]*100/sum(cm[1]),1)))
    print('Specificity: \t{}%'.format(round(cm[0][0]*100/sum(cm[0]),1)))
    
    # Print confusion matrix for test sample predictions
    print('\n----------------Confusion Matrix---------------\n')
    print('                      Predicted Value')
    print('\t\t |  Positive | Negative  |')
    print('\t\t  -----------------------')
    print('Actual \t|Positive|\t{}   |    {}\t |'.format(cm[0][0], cm[0][1]))
    print('Value    --------------------------------')
    print('\t|Negative|\t{}   |    {}\t |'.format(cm[1][0], cm[1][1]))
    print('\n')
    

In [4]:
# Initialize model
bnb = BernoulliNB()

# Compute parameter estimates with training data
bnb.fit(X_train[words], y_train)

# Predict target values for our test data
y_pred = bnb.predict(X_test[words])

# Predict target values with data the model was trained on
y_pred_train = bnb.predict(X_train[words])

# Print results
print_results(y_test, y_pred, y_train, y_pred_train)

---------------     Results     ---------------

---------------  Training Data  ---------------

Accuracy: 	66.5%
Sensitivity: 	43.8%
Specificity: 	89.4%

---------------   Out of Sample ---------------

Accuracy: 	71.0%
Sensitivity: 	48.0%
Specificity: 	93.1%

----------------Confusion Matrix---------------

                      Predicted Value
		 |  Positive | Negative  |
		  -----------------------
Actual 	|Positive|	95   |    7	 |
Value    --------------------------------
	|Negative|	51   |    47	 |




Our first model is accurate 71% of the time on a test sample.

In [5]:
# Create a list of all (cleaned) words in the training data

allwords = []

for review in X_train.review:
    review = review.lower()
    review = review.replace('.','').replace('!','').replace(',','').replace('?','').replace(':','').replace(';','')
    review = review.replace('"','').replace("'",'').replace('-','').replace('(','').replace(')','').replace('&','and')
    review = review.replace('$','').replace("%",'').replace('*','').replace('+','').replace('/','')
    allwords += review.split()

# Create list of 100 most common words in the data set
commonwords = list(pd.Series(allwords).value_counts().index[:100])

The next model uses the 100 most common words in our sample as factors.

In [8]:
# Remove factors from previous model
X_train = pd.DataFrame(X_train['review'])
X_test = pd.DataFrame(X_test['review'])

# Factor Dummies
for word in commonwords:
    X_train[str(word)] = X_train['review'].str.contains(str(word), case=False)
    X_test[str(word)] = X_test['review'].str.contains(str(word), case=False)

# Estimate model and test on our test sample
bnb = BernoulliNB()
bnb.fit(X_train[commonwords], y_train)
y_pred = bnb.predict(X_test[commonwords])
y_pred_train = bnb.predict(X_train[commonwords])
print_results(y_test, y_pred, y_train, y_pred_train)

---------------     Results     ---------------

---------------  Training Data  ---------------

Accuracy: 	75.2%
Sensitivity: 	78.1%
Specificity: 	72.4%

---------------   Out of Sample ---------------

Accuracy: 	66.5%
Sensitivity: 	71.4%
Specificity: 	61.8%

----------------Confusion Matrix---------------

                      Predicted Value
		 |  Positive | Negative  |
		  -----------------------
Actual 	|Positive|	63   |    39	 |
Value    --------------------------------
	|Negative|	28   |    70	 |




Our model using the top 100 most common review words was slightly less accurate, at 66.5%. We got better at identifying negative reviews, but lost some of our accuracy we had for predicting positive reviews. We'll try again with the top 400 words.

In [10]:
# Remove factors from previous model
X_train = pd.DataFrame(X_train['review'])
X_test = pd.DataFrame(X_test['review'])

# List of 400 most common words
commonwords = list(pd.Series(allwords).value_counts().index[:400])

# Create factor dummies
for word in commonwords:
    X_train[str(word)] = X_train['review'].str.contains(str(word), case=False)
    X_test[str(word)] = X_test['review'].str.contains(str(word), case=False)

# Estimate and predict
bnb = BernoulliNB()
bnb.fit(X_train[commonwords], y_train)
y_pred = bnb.predict(X_test[commonwords])
y_pred_train = bnb.predict(X_train[commonwords])
print_results(y_test, y_pred, y_train, y_pred_train)

---------------     Results     ---------------

---------------  Training Data  ---------------

Accuracy: 	84.8%
Sensitivity: 	87.8%
Specificity: 	81.7%

---------------   Out of Sample ---------------

Accuracy: 	73.0%
Sensitivity: 	82.7%
Specificity: 	63.7%

----------------Confusion Matrix---------------

                      Predicted Value
		 |  Positive | Negative  |
		  -----------------------
Actual 	|Positive|	65   |    37	 |
Value    --------------------------------
	|Negative|	17   |    81	 |




We increased our ability to detect negative reviews, without affecting our ability to detect positive ones. We have a fairly steep drop in accuracy and the associated measurements as we go from training data to testing. This should alert us to the possibility that our model is overfitting.

In [11]:
# Having a column named 'review' was giving us trouble. When a review with the word "review" was encountered,
# the column containing our reviews would be replaced with a boolean type category. The columns are renamed
# "og_review," from here onward to resolve this issue

X_train.rename(columns={'review': 'og_review'}, inplace=True)
X_test.rename(columns={'review': 'og_review'}, inplace=True)


X_train = pd.DataFrame(X_train['og_review'])
X_test = pd.DataFrame(X_test['og_review'])

In [12]:
# Find which words are most correlated with either sentiment

In [13]:
# Create factor dummies for every word in our training data set
for word in np.unique(allwords):
    try:
        X_train[str(word)] = X_train['og_review'].str.contains(str(word), case=False)
    except:
        errors.append(word)

# List of correlation coefficients between factors and target variable
corr_df = pd.concat([y_train, X_train], axis=1)
corr_list = corr_df.corr().iloc[0,:]

# List of the 25 words most positively and negatively correlated with target variable
corr_words = list(corr_list.nlargest(26)[1:].index)
corr_words += list(corr_list.nsmallest(25).index)

(800, 1)
(1000, 2)


In [38]:
# Remove factors from previous model
X_train = pd.DataFrame(X_train['og_review'])
X_test = pd.DataFrame(X_test['og_review'])

# Create factor dummies
for word in corr_words:
    X_train[str(word)] = X_train['og_review'].str.contains(str(word), case=False)
    X_test[str(word)] = X_test['og_review'].str.contains(str(word), case=False)

# Estimate and predict
bnb = BernoulliNB()
bnb.fit(X_train[corr_words], y_train)
y_pred = bnb.predict(X_test[corr_words])
y_pred_train = bnb.predict(X_train[corr_words])
print_results(y_test, y_pred, y_train, y_pred_train)

---------------     Results     ---------------

---------------  Training Data  ---------------

Accuracy: 	76.9%
Sensitivity: 	81.8%
Specificity: 	71.9%

---------------   Out of Sample ---------------

Accuracy: 	69.0%
Sensitivity: 	77.6%
Specificity: 	60.8%

----------------Confusion Matrix---------------

                      Predicted Value
		 |  Positive | Negative  |
		  -----------------------
Actual 	|Positive|	62   |    40	 |
Value    --------------------------------
	|Negative|	22   |    76	 |




It's possible a word could be highly correlated with our target but not very common in our data set. Our model would be overly sensitive to the word or words that fit this description.

In [52]:
# Create list of words that are highly correlated with our target and in the list of top 400 words
# List was adjusted to have roughly the same number of factors as the last model
corr_words = list(corr_list.nlargest(36)[1:].index) + list(corr_list.nsmallest(35).index)
corr_common_words = list(set(corr_words) & set(commonwords))
len(corr_common_words)

51

In [53]:
# Remove factors from previous model
X_train = pd.DataFrame(X_train['og_review'])
X_test = pd.DataFrame(X_test['og_review'])

# Create factor dummies
for word in corr_common_words:
    X_train[str(word)] = X_train['og_review'].str.contains(str(word), case=False)
    X_test[str(word)] = X_test['og_review'].str.contains(str(word), case=False)

# Estimate and predict
bnb = BernoulliNB()
bnb.fit(X_train[corr_common_words], y_train)
y_pred = bnb.predict(X_test[corr_common_words])
y_pred_train = bnb.predict(X_train[corr_common_words])
print_results(y_test, y_pred, y_train, y_pred_train)

---------------     Results     ---------------

---------------  Training Data  ---------------

Accuracy: 	78.6%
Sensitivity: 	83.3%
Specificity: 	73.9%

---------------   Out of Sample ---------------

Accuracy: 	70.0%
Sensitivity: 	77.6%
Specificity: 	62.7%

----------------Confusion Matrix---------------

                      Predicted Value
		 |  Positive | Negative  |
		  -----------------------
Actual 	|Positive|	64   |    38	 |
Value    --------------------------------
	|Negative|	22   |    76	 |




In [46]:
# Create model from top 100 words most positively and negatively correlated with our target, 
# that are also in the top 400 words
corr_words = list(corr_list.nlargest(101)[1:].index) + list(corr_list.nsmallest(100).index)
corr_common_words = list(set(corr_words) & set(commonwords))
len(corr_common_words)

136

In [47]:
# Remove factors from previous model
X_train = pd.DataFrame(X_train['og_review'])
X_test = pd.DataFrame(X_test['og_review'])

# Create factor dummies
for word in corr_common_words:
    X_train[str(word)] = X_train['og_review'].str.contains(str(word), case=False)
    X_test[str(word)] = X_test['og_review'].str.contains(str(word), case=False)

# Estimate and predict
bnb = BernoulliNB()
bnb.fit(X_train[corr_common_words], y_train)
y_pred = bnb.predict(X_test[corr_common_words])
y_pred_train = bnb.predict(X_train[corr_common_words])
print_results(y_test, y_pred, y_train, y_pred_train)

---------------     Results     ---------------

---------------  Training Data  ---------------

Accuracy: 	85.5%
Sensitivity: 	88.6%
Specificity: 	82.4%

---------------   Out of Sample ---------------

Accuracy: 	73.0%
Sensitivity: 	78.6%
Specificity: 	67.6%

----------------Confusion Matrix---------------

                      Predicted Value
		 |  Positive | Negative  |
		  -----------------------
Actual 	|Positive|	69   |    33	 |
Value    --------------------------------
	|Negative|	21   |    77	 |




Similar accuracy to our model with the top 400 words, with approximately 1/3 of the factors.