In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
import sklearn
import scipy
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
%matplotlib inline

# Thinkful's Feedback Analysis Challenge

Data Source: https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences

Building my own Naive Bayes Model to classify feedback sentiment.

In [3]:
#import data
df = pd.read_csv('sentiment_labelled_sentences/amazon_cells_labelled.txt',
                 delimiter='\t',
                 header=None,
                 names=['text', 'score'])
df.head()

Unnamed: 0,text,score
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [5]:
#first attempt... basically copying the thinkful naive bayes spam filter guided example

def naive_bayes_take_one (dataframe):
    
    """Train and test naive bayes 
    
    Parameters
    --------
        dataframe (pd.DataFrame): 
            dataframe to use
        
    Return
    --------
        total_points: 
            total number of texts classified
        wrong_points: 
            number of texts incorrectly classified
        score: 
            percent correctly classified
        
    """
    
    #cast everything to lowercase to make it easier to match words
    df = dataframe
    df = df.apply(lambda x: x.astype(str).str.lower())
    
    #picking keywords associated with good reviews
    keywords = ['good', 'great', 'excellent', 'beautiful', 'best', 'satisfied']

    for key in keywords:
        # Noted adding spaces around keyword to make sure matching on word and not pattern
        df[str(key)] = df.text.str.contains(
            ' ' + str(key) + ' ',
            case=True
        )

    data = df[keywords]
    target = df['score']
    
    # Our data is binary / boolean so using Bernoulli classifier.


    # Instantiate our model and store it in a new variable.
    bnb = BernoulliNB()

    # Fit our model to the data.
    bnb.fit(data, target)

    # Classify, storing the result in a new variable.
    y_pred = bnb.predict(data)
    
    total_points = data.shape[0]
    wrong_points = (target != y_pred).sum()
    score = bnb.score(data, target)
    
    
    confusion = confusion_matrix(target, y_pred)
    
    return total_points, wrong_points, score

In [6]:
#womp womp
naive_bayes_take_one(df)

(1000, 418, 0.582)

In [7]:
#second attempt... basically copying the thinkful naive bayes spam filter guided example

def naive_bayes_take_two (dataframe):
    
    """Train and test naive bayes 
    
    Parameters
    --------
        dataframe (pd.DataFrame): 
            dataframe to use
        
    Return
    --------
        total_points: 
            total number of texts classified
        wrong_points: 
            number of texts incorrectly classified
        score: 
            percent correctly classified
        
    """
    
    #cast everything to lowercase to make it easier to match words
    df = dataframe
    df = df.apply(lambda x: x.astype(str).str.lower())
    
    #picking keywords associated with bad reviews
    keywords = ['unsatisfactory', 'disappointed', 'disappoint', 'junk', 'painful', 'unusable', 'negative']

    for key in keywords:
        # Noted adding spaces around keyword to make sure matching on word and not pattern
        df[str(key)] = df.text.str.contains(
            ' ' + str(key) + ' ',
            case=False
        )

    data = df[keywords]
    target = df['score']
    
    # Our data is binary / boolean so using Bernoulli classifier.


    # Instantiate our model and store it in a new variable.
    bnb = BernoulliNB()

    # Fit our model to the data.
    bnb.fit(data, target)

    # Classify, storing the result in a new variable.
    y_pred = bnb.predict(data)
    
    total_points = data.shape[0]
    wrong_points = (target != y_pred).sum()
    score = bnb.score(data, target)
    
    return total_points, wrong_points, score

In [8]:
#so the positive words were better...
naive_bayes_take_two(df)

(1000, 492, 0.508)

In [14]:
#third attempt... what happens if I combine the positive and negative words together?

def naive_bayes_take_three (dataframe):
    
    """Train and test naive bayes 
    
    Parameters
    --------
        dataframe (pd.DataFrame): 
            dataframe to use
        
    Return
    --------
        total_points: 
            total number of texts classified
        wrong_points: 
            number of texts incorrectly classified
        score: 
            percent correctly classified
        
    """
    
    #cast everything to lowercase to make it easier to match words
    df = dataframe
    df = df.apply(lambda x: x.astype(str).str.lower())
    
    #picking keywords associated with good reviews
    good = ['good', 'great', 'excellent', 'beautiful', 'best', 'satisfied']
    
    #picking keywords associated with bad reviews
    bad = ['unsatisfactory', 'disappointed', 'disappoint', 'junk', 'painful', 'unusable', 'negative']
    
    keywords = good + bad

    for key in bad:
        # Noted adding spaces around keyword to make sure matching on word and not pattern
        df[str(key)] = df.text.str.contains(
            ' ' + str(key) + ' ',
            case=False
        )
    
    for key in good:
        # Noted adding spaces around keyword to make sure matching on word and not pattern
        df[str(key)] = df.text.str.contains(
            ' ' + str(key) + ' ',
            case=True
        )
    

    data = df[keywords]
    target = df['score']
    
    # Our data is binary / boolean so using Bernoulli classifier.


    # Instantiate our model and store it in a new variable.
    bnb = BernoulliNB()

    # Fit our model to the data.
    bnb.fit(data, target)

    # Classify, storing the result in a new variable.
    y_pred = bnb.predict(data)
    
    total_points = data.shape[0]
    wrong_points = (target != y_pred).sum()
    score = bnb.score(data, target)
    
    return total_points, wrong_points, score

In [15]:
#not any more accurate than the first attempt
naive_bayes_take_three(df)

(1000, 418, 0.582)

My feature engineering process was to look at several reviews and to select keywords that are associated with being either good or bad.