# Amazon Review Sentiment Analysis
The purpose of this exercise is to determine whether sentiment of Amazon reviews can be determined from features comprised solely of words from the review text. Then the transferability of the predictions needs to be tested on other categories of Amazon reviews

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gzip
import requests
import os.path

from sklearn.metrics import confusion_matrix, roc_auc_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

In [2]:
url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Home_v1_00.tsv.gz'
fn = 'ahome_reviews.csv'
if not os.path.isfile(fn):
    r = requests.get(url, stream=True)
    with open('ahome_reviews.csv', 'wb') as handle:
        for block in r.iter_content(chunk_size=1024):
            handle.write(block)

In [3]:
home_df = pd.read_csv(gzip.open('ahome_reviews.csv'), delimiter='\t', nrows=10000,error_bad_lines=False)

b'Skipping line 5100: expected 15 fields, saw 22\n'


In [4]:
home_df = home_df.loc[:, ['star_rating', 'review_body']]
home_df.head()

Unnamed: 0,star_rating,review_body
0,1,Don't buy it clapse on me within in 5 minutes
1,5,"Love this ,I bought this for my hurricane kit ..."
2,5,"Nice style, colors and design. Best of all it..."
3,2,Meeeh. Love Banksy's work but the quality of t...
4,4,Review by John Crescitelli using Wifey's login...


In [5]:
home_df.isna().sum()

star_rating    0
review_body    3
dtype: int64

In [6]:
home_df.dropna(how='any', axis=0, inplace=True)

In [7]:
positive_words = ['good', 'better', 'best', 'great', 'greatest', 'excellent', 'amazing', 'awesome', 'incredible', 'nice']

for word in positive_words:
    home_df[word] = home_df['review_body'].apply(lambda x: 1 if word in x.lower() else 0)

In [8]:
home_df['sentiment'] = home_df['star_rating'].apply(lambda x: 1 if x>3 else 0)
home_df.describe()

Unnamed: 0,star_rating,good,better,best,great,greatest,excellent,amazing,awesome,incredible,nice,sentiment
count,9997.0,9997.0,9997.0,9997.0,9997.0,9997.0,9997.0,9997.0,9997.0,9997.0,9997.0,9997.0
mean,4.251075,0.113634,0.031209,0.025708,0.20096,0.0007,0.023807,0.014904,0.018506,0.0008,0.097329,0.797439
std,1.254595,0.317382,0.173892,0.15827,0.400738,0.026454,0.152456,0.121177,0.134777,0.028279,0.29642,0.401928
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
X = home_df.iloc[:, 2:-1]
Y = home_df.iloc[:, -1]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1337)

In [10]:
bnb = BernoulliNB()
bnb.fit(X_train, Y_train)
bnb.score(X_test, Y_test)

0.806

In [11]:
 cross_val_score(bnb, X, Y, cv=8)

array([0.79696243, 0.7976    , 0.7976    , 0.7976    , 0.79743795,
       0.79743795, 0.79743795, 0.79743795])

In [12]:
pred = bnb.predict(X_test)
confusion_matrix(Y_test, pred)

array([[   0,  485],
       [   0, 2015]], dtype=int64)

In [13]:
clf = GradientBoostingClassifier(loss='exponential', learning_rate=0.5, n_estimators=500, subsample=0.9)
clf.fit(X_train, Y_train)
clf.score(X_train, Y_train)

0.7947178871548619

In [14]:
gb_pred = clf.predict(X_test)
confusion_matrix(Y_test, gb_pred)

array([[   0,  485],
       [   6, 2009]], dtype=int64)

In [15]:
def balanced_ttsplit(X, Y, test_split=0.25):
    '''A function that takes in data frames or arrays with features and targets
    and returns balanced training and test splits for those features and targets'''
    # Determine number of classes
    target_values = np.unique(Y)
    
    # Create a dictionary linking the classes to their indices
    indices = {}
    for value in target_values:
        indices[value] = np.where(Y==value)
        
    # Determine the smallest class and determine n_test/train based on that class
    min_length = min([len(v[0]) for k,v in indices.items()])
    
    n_test = int(test_split * min_length)
    n_train = int(min_length - n_test)
    
    X_train, X_test = pd.DataFrame(), pd.DataFrame()
    Y_train, Y_test = pd.DataFrame(), pd.DataFrame()
    
    for value in target_values:
        # Randomly select indices for test and train sets
        test_indices = np.random.choice(indices[value][0], size=n_test, replace=False)
        train_choices = [e for e in indices[value][0] if e not in test_indices]
        train_indices = np.random.choice(train_choices, size=n_train, replace=False)
        
        # Append randomly selected features to the feature sets
        X_train = pd.concat([X_train, X.iloc[train_indices]])
        X_test = pd.concat([X_test, X.iloc[test_indices]])
        
        # Append randomly selected targets to the target sets
        Y_train = pd.concat([Y_train, Y.iloc[train_indices]])
        Y_test = pd.concat([Y_test, Y.iloc[test_indices]])
       
        
    return X_train, X_test, Y_train, Y_test
  

In [16]:
X_train, X_test, Y_train, Y_test = balanced_ttsplit(X, Y)

In [17]:
bnb = BernoulliNB()
bnb.fit(X_train, Y_train)
bnb.score(X_test, Y_test)

  y = column_or_1d(y, warn=True)


0.6205533596837944

In [18]:
pred = bnb.predict(X_test)
confusion_matrix(Y_test, pred)

array([[414,  92],
       [292, 214]], dtype=int64)

In [19]:
clf = GradientBoostingClassifier(n_estimators=250)
clf.fit(X_train, Y_train)
clf.score(X_train, Y_train)

  y = column_or_1d(y, warn=True)


0.6379196840026333

In [20]:
gb_pred = clf.predict(X_test)
confusion_matrix(Y_test, gb_pred)

array([[399, 107],
       [255, 251]], dtype=int64)

In [21]:
pos_df = home_df[['star_rating', 'review_body', 'sentiment']].iloc[list(np.where(home_df['sentiment']==1))[0], :]
neg_df = home_df[['star_rating', 'review_body', 'sentiment']].iloc[list(np.where(home_df['sentiment']==0))[0], :]

In [22]:
vect = CountVectorizer(max_df=0.9, min_df=0.02)
vect.fit_transform(home_df['review_body'])
print(vect.get_feature_names())

['34', 'about', 'after', 'again', 'all', 'also', 'am', 'an', 'and', 'another', 'any', 'are', 'arrived', 'as', 'at', 'back', 'be', 'beautiful', 'because', 'bed', 'been', 'before', 'best', 'better', 'bit', 'bought', 'br', 'but', 'buy', 'by', 'came', 'can', 'clean', 'color', 'colors', 'could', 'cute', 'definitely', 'did', 'didn', 'do', 'does', 'doesn', 'don', 'easily', 'easy', 'enough', 'even', 'every', 'exactly', 'excellent', 'expected', 'far', 'fast', 'few', 'first', 'fit', 'for', 'from', 'get', 'good', 'got', 'great', 'had', 'happy', 'has', 'have', 'how', 'if', 'in', 'into', 'is', 'it', 'item', 'its', 'just', 'keep', 'light', 'like', 'little', 'long', 'look', 'looking', 'looks', 'lot', 'love', 'loved', 'made', 'make', 'me', 'more', 'much', 'my', 'need', 'needed', 'new', 'nice', 'no', 'not', 'now', 'of', 'off', 'on', 'one', 'only', 'or', 'ordered', 'other', 'our', 'out', 'over', 'perfect', 'perfectly', 'pillow', 'pretty', 'price', 'product', 'purchase', 'purchased', 'put', 'quality', 'r

In [23]:
stpwrds = ['34', 'about', 'after', 'again', 'all', 'also', 'am', 'an', 'and', 'another', 'any', 'are', 'arrived', 'as',
           'at', 'back', 'be', 'because', 'bed', 'been', 'before', 'bit', 'bought', 'br',
           'but', 'buy', 'by', 'came', 'can', 'color', 'colors', 'could', 'definitely', 'did', 'didn',
           'do', 'does', 'doesn', 'don', 'easily', 'enough', 'even', 'every', 'exactly', 'expected',
           'far', 'few', 'first', 'fit', 'for', 'from', 'get', 'got', 'had', 'has',
           'have', 'how', 'if', 'in', 'into', 'is', 'it', 'item', 'its', 'just', 'keep', 'light', 'like', 'little',
           'long', 'look', 'looking', 'looks', 'lot', 'made', 'make', 'me', 'more', 'much', 'my',
           'need', 'needed', 'new', 'no', 'not', 'now', 'of', 'off', 'on', 'one', 'only', 'or', 'ordered',
           'other', 'our', 'out', 'over', 'pillow', 'price', 'product', 'purchase',
           'purchased', 'put', 'really', 'received', 'recommend', 'room', 'see', 'set', 'sheets',
           'size', 'small', 'so', 'some', 'still', 'sturdy', 'than', 'that', 'the', 'them', 'then',
           'there', 'these', 'they', 'thing', 'this', 'time', 'to', 'too', 'two', 'up', 'use', 'used', 'using', 've', 
           'very', 'wanted', 'was', 'way', 'we', 'well', 'were', 'what', 'when', 'which', 'will', 'with', 'would', 'you', 
           'your']
vect = CountVectorizer(max_df=0.9, min_df=0.02, stop_words=stpwrds)
vect.fit_transform(pos_df['review_body'])
pos_words = vect.get_feature_names()

In [24]:
vect = CountVectorizer(max_df=0.9, min_df=0.02, stop_words=stpwrds)
vect.fit_transform(neg_df['review_body'])
neg_words = vect.get_feature_names()
negative_words = [w for w in neg_words if w not in pos_words]

In [25]:
pos_words = [w for w in pos_words if w not in neg_words]

In [26]:
new_df = home_df[['star_rating', 'review_body', 'sentiment']]
for word in pos_words:
    new_df[word] = new_df['review_body'].apply(lambda x: 1 if word in x.lower() else 0)
for word in negative_words:
    new_df[word] = new_df['review_body'].apply(lambda x: -1 if word in x.lower() else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [27]:
X = new_df.iloc[:, 3:]
Y = new_df.iloc[:, 2]

X_train, X_test, Y_train, Y_test = balanced_ttsplit(X, Y)

bnb = BernoulliNB()
bnb.fit(X_train, Y_train)
bnb.score(X_test, Y_test)

  y = column_or_1d(y, warn=True)


0.6205533596837944

In [28]:
pred = bnb.predict(X_test)
confusion_matrix(Y_test, pred)

array([[458,  48],
       [336, 170]], dtype=int64)

In [29]:
clf = GradientBoostingClassifier(n_estimators=250)
clf.fit(X_train, Y_train)
clf.score(X_train, Y_train)

  y = column_or_1d(y, warn=True)


0.7597103357472021

In [30]:
gb_pred = clf.predict(X_test)
confusion_matrix(Y_test, gb_pred)

array([[323, 183],
       [ 76, 430]], dtype=int64)

In [31]:
'''roc_auc = make_scorer(roc_auc_score)

param_dict = {'n_estimators': [100, 250, 500], 'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.25],
              'max_depth': [2, 3, 4, None], 'loss': ['deviance', 'exponential'], 
              'subsample': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}

gscv = GridSearchCV(clf, param_dict, scoring=roc_auc, iid=False, cv=6)
gscv.fit(X, Y)
gscv.score(X, Y)
gscv.best_params_
gscv_pred = gscv.predict(X_test)
confusion_matrix(Y_test, gscv_pred)'''

"roc_auc = make_scorer(roc_auc_score)\n\nparam_dict = {'n_estimators': [100, 250, 500], 'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.25],\n              'max_depth': [2, 3, 4, None], 'loss': ['deviance', 'exponential'], \n              'subsample': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}\n\ngscv = GridSearchCV(clf, param_dict, scoring=roc_auc, iid=False, cv=6)\ngscv.fit(X, Y)\ngscv.score(X, Y)\ngscv.best_params_\ngscv_pred = gscv.predict(X_test)\nconfusion_matrix(Y_test, gscv_pred)"

In [32]:
#grid_score_df = pd.DataFrame(gscv.cv_results_)
#grid_score_df.head()

In [33]:
'''pd.options.display.max_columns=None
grid_score_df = grid_score_df.sort_values('rank_test_score')
grid_score_df.to_csv('amazon_reviews_gscv_params.csv')'''

"pd.options.display.max_columns=None\ngrid_score_df = grid_score_df.sort_values('rank_test_score')\ngrid_score_df.to_csv('amazon_reviews_gscv_params.csv')"

In [34]:
rfc = RandomForestClassifier()

param_dict = {'n_estimators': [50, 100, 500],
              'max_depth': [2, 3, None], 'criterion': ['gini', 'entropy']}

gscv = GridSearchCV(rfc, param_dict, scoring='accuracy', iid=False, cv=6)
gscv.fit(X_train, Y_train)
gscv.score(X_train, Y_train)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

0.8469387755102041

In [35]:
gscv_pred = gscv.predict(X_test)
confusion_matrix(Y_test, gscv_pred)

array([[355, 151],
       [117, 389]], dtype=int64)

In [36]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=1, penalty='l2', solver='liblinear', max_iter=1000)

# Fit the model.
fit = lr.fit(X_train, Y_train)

# Display.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_lr = lr.predict(X_test)

print('\n Percentage accuracy')
print(lr.score(X_train, Y_train))

Coefficients
[[ 2.33126511  1.26091554  1.09372848  0.73067701  1.50256653  0.86607583
   1.58025501  1.13224901  0.03488916  1.30760661  2.23746334 -0.0201076
   1.6204229   0.35113592 -0.32461106  0.18631558 -0.31931909  0.46741116
   1.01752084  0.55875011  0.00444142  0.42253783  1.81332961 -0.27028791
   1.30917321 -0.03471483  0.26020354  0.38214261  1.69670435  0.57071517
   0.20547899  0.04208272  0.51961453 -0.46397511  0.17480597  0.24858532
   1.54952907 -0.01376132  0.18795703  0.86653474  0.81752907  0.96325676
   0.52957591  0.41924065  0.11772186  0.20502657  0.50604947  0.45295587
   0.73923044  0.39355883  0.48256408  2.41150842 -0.24729067  2.52903178
   0.19904314  0.73770041  0.5656392   0.1918377   1.1485485   0.53275282
   0.29680532  0.54989308 -0.11494333  0.2468225   0.80913782  0.41760558
   0.08169245  0.20218944  0.02629201 -0.06089366  0.56663622  0.50408056
   0.91959176 -0.07763109 -0.03181416  0.32154532 -0.06220041  0.21747574
  -0.41121836  0.59091613 

  y = column_or_1d(y, warn=True)


In [37]:
confusion_matrix(Y_test, pred_lr)

array([[328, 178],
       [ 81, 425]], dtype=int64)

In [38]:
lr = LogisticRegression(C=10, penalty='l1', solver='liblinear', max_iter=1000)

# Fit the model.
fit = lr.fit(X_train, Y_train)

# Display.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_lr = lr.predict(X_test)

print('\n Percentage accuracy')
print(lr.score(X_train, Y_train))

Coefficients
[[ 3.33888539e+00  1.39557226e+00  1.25388944e+00  7.87980215e-01
   1.83131037e+00  9.47220351e-01  1.88440213e+00  1.27893070e+00
   2.58007946e-02  1.58945458e+00  2.49970659e+00 -2.01658378e-01
   2.13736937e+00  3.69601817e-01 -4.02221172e-01  2.23618469e-01
  -3.97866615e-01  5.47765457e-01  1.22614648e+00  6.99587787e-01
   0.00000000e+00  4.57563700e-01  2.38764028e+00 -8.52148940e-01
   1.41763763e+00 -6.82325689e-02  2.91186967e-01  4.38276331e-01
   1.92429525e+00  6.51228398e-01  1.99160265e-01  6.30862220e-02
   5.67135124e-01 -4.90768682e-01  1.85078661e-01  2.78431839e-01
   1.79726738e+00 -3.48477755e-03  1.72656366e-01  9.84278633e-01
   8.99630729e-01  1.06845435e+00  5.86970577e-01  4.78621619e-01
   9.66328944e-02  2.06538801e-01  5.87876916e-01  4.27332040e-01
   8.12544072e-01  4.37758446e-01  5.14230977e-01  3.66054678e+00
  -2.72108081e-01  3.23131589e+00 -3.23352378e-01  8.25726539e-01
   6.05548302e-01  1.90793115e-01  1.34731912e+00  6.58414066e-

  y = column_or_1d(y, warn=True)


In [39]:
confusion_matrix(Y_test, pred_lr)

array([[328, 178],
       [ 82, 424]], dtype=int64)

In [40]:
url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Health_Personal_Care_v1_00.tsv.gz'
fn = 'ahealth_reviews.csv'
if not os.path.isfile(fn):
    r = requests.get(url, stream=True)
    with open('ahealth_reviews.csv', 'wb') as handle:
        for block in r.iter_content(chunk_size=1024):
            handle.write(block)
health_df = pd.read_csv(gzip.open('ahealth_reviews.csv'), delimiter='\t', nrows=5000)

In [41]:
health_df = health_df.loc[:, ['star_rating', 'review_body']]
health_df.dropna(how='any', axis=0, inplace=True)

for word in pos_words:
    health_df[word] = health_df['review_body'].apply(lambda x: 1 if word in x.lower() else 0)
for word in negative_words:
    health_df[word] = health_df['review_body'].apply(lambda x: -1 if word in x.lower() else 0)
    
health_df['sentiment'] = health_df['star_rating'].apply(lambda x: 1 if x>3 else 0)

X = health_df.iloc[:, 2:-1]
Y = health_df.iloc[:, -1]

In [42]:
models = [bnb, clf, gscv, lr]
for model in models:
    pred = model.predict(X)
    print('Accuracy for {} model: {}'.format(str(model).split('(')[0], model.score(X,Y)))

Accuracy for BernoulliNB model: 0.37695078031212487
Accuracy for GradientBoostingClassifier model: 0.7318927571028412
Accuracy for GridSearchCV model: 0.6884753901560624
Accuracy for LogisticRegression model: 0.7384953981592637


In [43]:
max_class = max(len(np.where(Y==1)[0]), len(np.where(Y==0)[0]))
print('Null Accuracy: {:2f}'.format(max_class/len(Y)))

Null Accuracy: 0.798319


In [44]:
url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Ebook_Purchase_v1_00.tsv.gz'
fn = 'aebook_reviews.csv'
if not os.path.isfile(fn):
    r = requests.get(url, stream=True)
    with open('aebook_reviews.csv', 'wb') as handle:
        for block in r.iter_content(chunk_size=1024):
            handle.write(block)
ebook_df = pd.read_csv(gzip.open('aebook_reviews.csv'), delimiter='\t', nrows=5000)

In [45]:
ebook_df = ebook_df.loc[:, ['star_rating', 'review_body']]
ebook_df.dropna(how='any', axis=0, inplace=True)

for word in pos_words:
    ebook_df[word] = ebook_df['review_body'].apply(lambda x: 1 if word in x.lower() else 0)
for word in negative_words:
    ebook_df[word] = ebook_df['review_body'].apply(lambda x: -1 if word in x.lower() else 0)
    
ebook_df['sentiment'] = ebook_df['star_rating'].apply(lambda x: 1 if x>3 else 0)

X = ebook_df.iloc[:, 2:-1]
Y = ebook_df.iloc[:, -1]

In [46]:
models = [bnb, clf, gscv, lr]
for model in models:
    pred = model.predict(X)
    print('Accuracy for {} model: {}'.format(str(model).split('(')[0], model.score(X,Y)))

Accuracy for BernoulliNB model: 0.3606
Accuracy for GradientBoostingClassifier model: 0.6996
Accuracy for GridSearchCV model: 0.6336
Accuracy for LogisticRegression model: 0.7018


In [47]:
max_class = max(len(np.where(Y==1)[0]), len(np.where(Y==0)[0]))
print('Null Accuracy: {:2f}'.format(max_class/len(Y)))

Null Accuracy: 0.848000


## Conclusion

In all models, in all cases, I have been unsuccessful in beating null accuracy (predicting only the dominant class). Possible ways to get around this road block:

- Select features other than words in the review body
- Use n-grams to catch phrases that may be more helpful than individual words
- Use the counts of the positive words and the negative counts of the negative words
- Create a cost function that heavily penalizes false predictions and iterate over a model designed to optimize based on that cost function