In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
import sklearn
import scipy
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
%matplotlib inline

# Thinkful's Iterate and Evaluate Your Classifier Challenge

Data Source: https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences

This notebook uses evaluation techniques (confusion matrices, holdout groups, and cross-validation) to look at my classifier's performance in more detail.

In [4]:
# Load the Amazon sentences from the Sentiment Labelled Sentences data set.
# The file is tab-separated and has no header row, so column names are given.
df = pd.read_csv(
    'sentiment_labelled_sentences/amazon_cells_labelled.txt',
    sep='\t',
    header=None,
    names=['text', 'score'],
)
df.head()

Unnamed: 0,text,score
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [8]:
# First attempt - good words only.

# Lowercase only the review text so keyword matching ignores capitalisation.
# The label column is deliberately left alone: lowercasing every column via
# `astype(str)` would also turn the numeric `score` labels into strings.
df['text'] = df['text'].astype(str).str.lower()

# Keywords associated with good reviews.
keywords = ['good', 'great', 'excellent', 'beautiful', 'best', 'satisfied']

# One boolean feature column per keyword. The keyword is padded with spaces so
# it matches as a whole word rather than as part of a longer word.
# NOTE(review): because of the padding, a keyword at the very start or end of
# a sentence, or directly next to punctuation (e.g. "great."), is not matched.
# Consider a word-boundary pattern if that is not intended.
for key in keywords:
    df[key] = df['text'].str.contains(' ' + key + ' ', case=True)

data = df[keywords]
target = df['score']

# The features are boolean, so a Bernoulli naive Bayes classifier is used.
bnb = BernoulliNB()
bnb.fit(data, target)

# Classify the same rows the model was fitted on (in-sample evaluation).
y_pred = bnb.predict(data)

total_points = data.shape[0]
wrong_points = (target != y_pred).sum()
score = bnb.score(data, target)

# Rows are the true classes, columns are the predicted classes.
confusion = confusion_matrix(target, y_pred)
confusion

array([[487,  13],
       [405,  95]])

In [6]:
# Test the model with a holdout group and compare against in-sample accuracy.

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=20
)

# Accuracy on the held-out rows after fitting on the remaining 80%.
bnb.fit(X_train, y_train)
holdout_accuracy = bnb.score(X_test, y_test)
print('With 20% Holdout: ' + str(holdout_accuracy))

# Accuracy on the full sample after refitting on every row.
bnb.fit(data, target)
sample_accuracy = bnb.score(data, target)
print('Testing on Sample: ' + str(sample_accuracy))

With 20% Holdout: 0.585
Testing on Sample: 0.582


Because the holdout score (0.585) and the in-sample score (0.582) are very similar, the model does not appear to be overfitting.

In [7]:
# Cross-validation with 10 folds (cv=10); note this is not leave-one-out.
from sklearn.model_selection import cross_val_score

# Returns one score per fold, using the classifier's default scorer.
cross_val_score(estimator=bnb, X=data, y=target, cv=10)

array([0.58, 0.59, 0.59, 0.57, 0.59, 0.6 , 0.57, 0.58, 0.61, 0.53])

In [10]:
# Second attempt - bad words only.

# Lowercase only the review text so keyword matching ignores capitalisation.
# Applying `astype(str)` to every column would also stringify the labels and
# any feature columns already added to `df`.
df['text'] = df['text'].astype(str).str.lower()

# Keywords associated with bad reviews.
keywords = ['unsatisfactory', 'disappointed', 'disappoint', 'junk', 'painful', 'unusable', 'negative']

# One boolean feature column per keyword. The keyword is padded with spaces so
# it matches as a whole word rather than as part of a longer word.
# NOTE(review): because of the padding, a keyword at the very start or end of
# a sentence, or directly next to punctuation, is not matched.
for key in keywords:
    df[key] = df['text'].str.contains(' ' + key + ' ', case=False)

data = df[keywords]
target = df['score']

# The features are boolean, so a Bernoulli naive Bayes classifier is used.
bnb = BernoulliNB()
bnb.fit(data, target)

# Classify the same rows the model was fitted on (in-sample evaluation).
y_pred = bnb.predict(data)

total_points = data.shape[0]
wrong_points = (target != y_pred).sum()
score = bnb.score(data, target)

# Rows are the true classes, columns are the predicted classes.
confusion = confusion_matrix(target, y_pred)
confusion

array([[  8, 492],
       [  0, 500]])

In [12]:
# Split into training and test groups: 20% of the rows are held out, and
# random_state keeps the split fixed between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=20
)

# Accuracy on the held-out rows after fitting on the remaining 80%.
bnb.fit(X_train, y_train)
holdout_accuracy = bnb.score(X_test, y_test)
print('With 20% Holdout: ' + str(holdout_accuracy))

# Accuracy on the full sample after refitting on every row.
bnb.fit(data, target)
sample_accuracy = bnb.score(data, target)
print('Testing on Sample: ' + str(sample_accuracy))

With 20% Holdout: 0.47
Testing on Sample: 0.508


In [13]:
# Cross-validation with 10 folds (cv=10); note this is not leave-one-out.
from sklearn.model_selection import cross_val_score

# Returns one score per fold, using the classifier's default scorer.
cross_val_score(estimator=bnb, X=data, y=target, cv=10)

array([0.52, 0.5 , 0.51, 0.5 , 0.51, 0.5 , 0.5 , 0.5 , 0.5 , 0.52])

In [27]:
# Clear the interactive namespace before the next attempt; -f skips the
# confirmation prompt.
%reset -f

In [30]:
#third attempt - good and bad words together

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
import sklearn
import scipy
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
%matplotlib inline

df = pd.read_csv('sentiment_labelled_sentences/amazon_cells_labelled.txt',
                 delimiter='\t',
                 header=None,
                 names=['text', 'score'])

#cast everything to lowercase to make it easier to match words

df = df.apply(lambda x: x.astype(str).str.lower())
    
#picking keywords associated with good reviews
good = ['good', 'great', 'excellent', 'beautiful', 'best', 'satisfied']
    
#picking keywords associated with bad reviews
bad = ['unsatisfactory', 'disappointed', 'disappoint', 'junk', 'painful', 'unusable', 'negative']
    
keywords = good + bad

for key in bad:
    # Noted adding spaces around keyword to make sure matching on word and not pattern
    df[str(key)] = df.text.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )
    
for key in good:
    # Noted adding spaces around keyword to make sure matching on word and not pattern
    df[str(key)] = df.text.str.contains(
        ' ' + str(key) + ' ',
        case=True
    )
    

data = df[keywords]
target = df['score']
    
# Our data is binary / boolean so using Bernoulli classifier.


# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)
    
total_points = data.shape[0]
wrong_points = (target != y_pred).sum()
score = bnb.score(data, target)
    
confusion = confusion_matrix(target, y_pred)
confusion

#noting here this is no better than the good words only

array([[487,  13],
       [405,  95]])

In [31]:
# Split into training and test groups and compare holdout with in-sample accuracy.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=20
)

# Accuracy on the held-out rows after fitting on the remaining 80%.
bnb.fit(X_train, y_train)
holdout_accuracy = bnb.score(X_test, y_test)
print('With 20% Holdout: ' + str(holdout_accuracy))

# Accuracy on the full sample after refitting on every row.
bnb.fit(data, target)
sample_accuracy = bnb.score(data, target)
print('Testing on Sample: ' + str(sample_accuracy))

With 20% Holdout: 0.585
Testing on Sample: 0.582


In [32]:
# Cross-validation with 10 folds (cv=10); note this is not leave-one-out.
from sklearn.model_selection import cross_val_score

# Returns one score per fold, using the classifier's default scorer.
cross_val_score(estimator=bnb, X=data, y=target, cv=10)

array([0.58, 0.59, 0.59, 0.57, 0.59, 0.6 , 0.57, 0.58, 0.61, 0.53])

In [33]:
# Clear the interactive namespace so nothing from the earlier attempts carries
# over into the next one; -f skips the confirmation prompt.

%reset -f

In [35]:
#fourth attempt - I can't imagine this making it better but not casting to lowercase

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
import sklearn
import scipy
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
%matplotlib inline

df = pd.read_csv('sentiment_labelled_sentences/amazon_cells_labelled.txt',
                 delimiter='\t',
                 header=None,
                 names=['text', 'score'])

#picking keywords associated with good reviews
good = ['good', 'great', 'excellent', 'beautiful', 'best', 'satisfied']
    
#picking keywords associated with bad reviews
bad = ['unsatisfactory', 'disappointed', 'disappoint', 'junk', 'painful', 'unusable', 'negative']
    
keywords = good + bad

for key in bad:
    # Noted adding spaces around keyword to make sure matching on word and not pattern
    df[str(key)] = df.text.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )
    
for key in good:
    # Noted adding spaces around keyword to make sure matching on word and not pattern
    df[str(key)] = df.text.str.contains(
        ' ' + str(key) + ' ',
        case=True
    )
    

data = df[keywords]
target = df['score']
    
# Our data is binary / boolean so using Bernoulli classifier.


# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)
    
total_points = data.shape[0]
wrong_points = (target != y_pred).sum()
score = bnb.score(data, target)
    
confusion = confusion_matrix(target, y_pred)
confusion

#noting exactly the same as take three

array([[487,  13],
       [411,  89]])

In [25]:
# Split into training and test groups and compare holdout with in-sample accuracy.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=20
)

# Accuracy on the held-out rows after fitting on the remaining 80%.
bnb.fit(X_train, y_train)
holdout_accuracy = bnb.score(X_test, y_test)
print('With 20% Holdout: ' + str(holdout_accuracy))

# Accuracy on the full sample after refitting on every row.
bnb.fit(data, target)
sample_accuracy = bnb.score(data, target)
print('Testing on Sample: ' + str(sample_accuracy))

With 20% Holdout: 0.565
Testing on Sample: 0.576


In [26]:
# Cross-validation with 10 folds (cv=10); note this is not leave-one-out.
from sklearn.model_selection import cross_val_score

# Returns one score per fold, using the classifier's default scorer.
cross_val_score(estimator=bnb, X=data, y=target, cv=10)

array([0.57, 0.58, 0.59, 0.56, 0.59, 0.59, 0.57, 0.57, 0.61, 0.52])

In [36]:
# Fifth attempt - good words only, with a longer list of good words.

# Lowercase only the review text so keyword matching ignores capitalisation.
# Applying `astype(str)` to every column would also stringify the labels and
# any feature columns already added to `df`.
df['text'] = df['text'].astype(str).str.lower()

# Keywords associated with good reviews. The stray empty-string entry that
# ended this list was removed: padded with spaces it became a two-space
# pattern stored under an empty column name, not a real keyword.
keywords = ['good', 'great', 'excellent', 'beautiful', 'best', 'satisfied',
            'love', 'loved', 'impressed', 'good quality', 'nice']

# One boolean feature column per keyword. The keyword is padded with spaces so
# it matches as a whole word rather than as part of a longer word.
# NOTE(review): because of the padding, a keyword at the very start or end of
# a sentence, or directly next to punctuation, is not matched.
for key in keywords:
    df[key] = df['text'].str.contains(' ' + key + ' ', case=True)

data = df[keywords]
target = df['score']

# The features are boolean, so a Bernoulli naive Bayes classifier is used.
bnb = BernoulliNB()
bnb.fit(data, target)

# Classify the same rows the model was fitted on (in-sample evaluation).
y_pred = bnb.predict(data)

total_points = data.shape[0]
wrong_points = (target != y_pred).sum()
score = bnb.score(data, target)

# Rows are the true classes, columns are the predicted classes.
confusion = confusion_matrix(target, y_pred)
confusion

array([[484,  16],
       [379, 121]])

In [37]:
# Split into training and test groups and compare holdout with in-sample accuracy.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=20
)

# Accuracy on the held-out rows after fitting on the remaining 80%.
bnb.fit(X_train, y_train)
holdout_accuracy = bnb.score(X_test, y_test)
print('With 20% Holdout: ' + str(holdout_accuracy))

# Accuracy on the full sample after refitting on every row.
bnb.fit(data, target)
sample_accuracy = bnb.score(data, target)
print('Testing on Sample: ' + str(sample_accuracy))

With 20% Holdout: 0.6
Testing on Sample: 0.605


In [38]:
# Cross-validation with 10 folds (cv=10); note this is not leave-one-out.
from sklearn.model_selection import cross_val_score

# Returns one score per fold, using the classifier's default scorer.
cross_val_score(estimator=bnb, X=data, y=target, cv=10)

array([0.6 , 0.63, 0.63, 0.58, 0.59, 0.62, 0.6 , 0.59, 0.62, 0.54])

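The fifth classifier works best (roughly 0.60 accuracy, versus 0.58 or lower for the other attempts), most likely because it has the most keywords to match on. Adding more good keywords appears to improve performance, although adding the bad keywords to the good ones in the third attempt changed nothing. None of my models seems to overfit: the holdout and cross-validation scores stay close to the in-sample scores.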