In [77]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [78]:
# Grab and process the raw data.

df = pd.read_csv('/Users/whaight/Downloads/sentiment labelled sentences/amazon_cells_labelled.txt', delimiter= '\t', header=None)
df.columns = ['review', 'positive']

df.head()

Unnamed: 0,review,positive
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [79]:
# Let's get those good positive and negative words:

pos_words = pd.read_csv('/Users/whaight/Downloads/sentiment labelled sentences/positive_words.csv')
pos_words.head()


Unnamed: 0,positive_sentiment_list
0,a+
1,abound
2,abounds
3,abundance
4,abundant


In [80]:
pos_word_list = pos_words['positive_sentiment_list'].unique()
print(pos_word_list[:10])

['a+' 'abound' 'abounds' 'abundance' 'abundant' 'accessable' 'accessible'
 'acclaim' 'acclaimed' 'acclamation']


In [81]:
keywords = pos_word_list

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    df[str(key)] = df.review.str.contains(
        ' ' + str(key),
        case=False
    )

In [82]:
df['pos_sentiment'] = (df['positive'] == 1)
# Note that if you run this cell a second time everything will become false.
# So... Don't.

In [83]:
data = df[keywords]

In [84]:
data.head()

Unnamed: 0,a+,abound,abounds,abundance,abundant,accessable,accessible,acclaim,acclaimed,acclamation,...,wow,wowed,wowing,wows,yay,youthful,zeal,zenith,zest,zippy
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [85]:
target = df['pos_sentiment']

In [86]:
# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 227


In [87]:
# Now test it out on another data set

test_df = pd.read_csv('/Users/whaight/Downloads/sentiment labelled sentences/imdb_labelled.txt', delimiter= '\t', header=None)
test_df.columns = ['review', 'positive']

test_df.head()

Unnamed: 0,review,positive
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [89]:
test_df['pos_sentiment'] = (test_df['positive'] == 1)

In [88]:
for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    test_df[str(key)] = test_df.review.str.contains(
        ' ' + str(key),
        case=False
    )

In [90]:
test_data = test_df[keywords]

In [91]:
test_target = test_df['pos_sentiment']

In [93]:
# Classify, storing the result in a new variable.
test_y_pred = bnb.predict(test_data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (test_target != test_y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 257


In [94]:
# Grab and process the raw data.

df = pd.read_csv('/Users/whaight/Downloads/sentiment labelled sentences/amazon_cells_labelled.txt', delimiter= '\t', header=None)
df.columns = ['review', 'positive']

df.head()

Unnamed: 0,review,positive
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [95]:
neg_words = pd.read_csv('/Users/whaight/Downloads/sentiment labelled sentences/negative_words_reduced.csv')
neg_words.head()

Unnamed: 0,negative_sentiment_list
0,2-faced
1,abnormal
2,abominable
3,abominably
4,abomination


In [96]:
neg_word_list = neg_words['negative_sentiment_list'].unique()
print(neg_word_list[:10])

['2-faced' 'abnormal' 'abominable' 'abominably' 'abomination' 'abrasive'
 'abrupt' 'absence' 'absurd' 'abuse']


In [97]:
keywords = neg_word_list

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    df[str(key)] = df.review.str.contains(
        ' ' + str(key),
        case=False
    )

In [98]:
df['neg_sentiment'] = (df['positive'] == 0)
# Note that if you run this cell a second time everything will become false.
# So... Don't.

In [99]:
# sns.heatmap(df.corr())

In [100]:
data = df[keywords]

In [101]:
data.head()

Unnamed: 0,2-faced,abnormal,abominable,abominably,abomination,abrasive,abrupt,absence,absurd,abuse,...,wrongful,wrongly,wrought,yawn,zap,zapped,zaps,zealot,zealous,zombie
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [102]:
target = df['neg_sentiment']

In [103]:
# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))


Number of mislabeled points out of a total 1000 points : 289


In [104]:
# Now test it out on another data set

test_df = pd.read_csv('/Users/whaight/Downloads/sentiment labelled sentences/imdb_labelled.txt', delimiter= '\t', header=None)
test_df.columns = ['review', 'positive']

test_df.head()

Unnamed: 0,review,positive
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [105]:
test_df['neg_sentiment'] = (test_df['positive'] == 0)

In [106]:
for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    test_df[str(key)] = test_df.review.str.contains(
        ' ' + str(key),
        case=False
    )

In [107]:
test_data = test_df[keywords]

In [108]:
test_target = test_df['neg_sentiment']

In [109]:
# Classify, storing the result in a new variable.
test_y_pred = bnb.predict(test_data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (test_target != test_y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 233
