In [1]:
import pandas as pd
import numpy as np

### From the readme, all labels are balanced. There are 500 of each positive and negative sentiment sentences.

In [2]:
yelp_df = pd.read_csv('yelp_labelled.txt', sep = '\t', header = None, names = ['sentence', 'score'])
amazon_df = pd.read_csv('amazon_cells_labelled.txt', sep = '\t', header = None, names = ['sentence', 'score'])

# tab doesnt work because the text file is a liar
imdb_df = pd.read_csv('imdb_labelled.txt', sep = '  ', header = None, names = ['sentence', 'score'])

  """


Our preprocessing included converting text to lowercase, expanding contractions, keeping only alphanumeric values, removing any accents, removing stop words, and lemmatization.  We chose these methods to essentially reduce variance and extract the most meaningful words out of a sentence. This helps us standardize what words should be in our word dictionary and reduces any chance of having multiple words that would add variance and noise.

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re
import unidecode

In [4]:
def remove_contractions(text):
    contraction_map = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }
    for word in text.split():
        if word in contraction_map:
            text = text.replace(word, contraction_map[word])
    return(text)

In [5]:
# cleans data from our various dataframes

def clean_data(text):
    # convert to lower case
    text = text.lower()
    
    # remove contractions using a given dictionary
    text = remove_contractions(text)
    
    # keep only alphanumeric values
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # remove accents
    text = unidecode.unidecode(text)
    
    return text

In [6]:
# clean data using created function, tokenize words with nltk

# clean data
yelp_df['sentence'] = yelp_df['sentence'].apply(lambda x: clean_data(x))
amazon_df['sentence'] = amazon_df['sentence'].apply(lambda x: clean_data(x))
imdb_df['sentence'] = imdb_df['sentence'].apply(lambda x: clean_data(x))

# tokenize words
yelp_df['sentence'] = yelp_df['sentence'].apply(lambda x: nltk.word_tokenize(x))
amazon_df['sentence'] = amazon_df['sentence'].apply(lambda x: nltk.word_tokenize(x))
imdb_df['sentence'] = imdb_df['sentence'].apply(lambda x: nltk.word_tokenize(x))

In [7]:
# remove stop words using created function

def remove_stopwords(text):
    words = []
    for w in text:
        if w not in stopwords.words('english'):
            words.append(w)
    return words

# remove stop words
yelp_df['sentence'] = yelp_df['sentence'].apply(lambda x: remove_stopwords(x))
amazon_df['sentence'] = amazon_df['sentence'].apply(lambda x: remove_stopwords(x))
imdb_df['sentence'] = imdb_df['sentence'].apply(lambda x: remove_stopwords(x))

In [8]:
# lemmatization of words using created function

lem = WordNetLemmatizer()

def word_lem(text):
    lem_text = []
    for i in text:
        lem_text.append(lem.lemmatize(i))
    return lem_text

yelp_df['sentence'] = yelp_df['sentence'].apply(lambda x: word_lem(x))
amazon_df['sentence'] = amazon_df['sentence'].apply(lambda x: word_lem(x))
imdb_df['sentence'] = imdb_df['sentence'].apply(lambda x: word_lem(x))

In [9]:
# sort dataframe, concat to make training and testing dataframes

yelp_df = yelp_df.sort_values('score').reset_index(drop = True)
amazon_df = amazon_df.sort_values('score').reset_index(drop = True)
imdb_df = imdb_df.sort_values('score').reset_index(drop = True)

train_frames = [yelp_df[:400], yelp_df[500:900], amazon_df[:400], amazon_df[500:900], 
                imdb_df[:400], imdb_df[500:900]]
test_frames = [yelp_df[400:500], yelp_df[900:], amazon_df[400:500], amazon_df[900:],
               imdb_df[400:500], imdb_df[900:]]

train_df = pd.concat(train_frames)
test_df = pd.concat(test_frames)

train_df = train_df.reset_index(drop = True)
test_df = test_df.reset_index(drop = True)

At this point we can only fill the word dictionary with our training data because doing else would cause overfitting. We want potentially new words in our test set to be handled exactly how they are - as new words. This in turn will give us a more robust model.

In [10]:
# create dictionary of words of the training data

def fill_training_dict(df):
    training_dict = {}
    for sentence in df['sentence']:
        for word in sentence:
            training_dict[word] = 0
    return training_dict

word_dict = fill_training_dict(train_df)            
        
# function to fill a dictionary's value based on occurance of a word, alternative to collections library
def fill_dict_values(value_dict, dataframe):
    for sentence in dataframe['sentence']:
        for word in sentence:
            if word in value_dict:
                value_dict[word] = value_dict[word] + 1
    return word_dict

word_dict = fill_dict_values(word_dict, train_df)
word_dict = fill_dict_values(word_dict, test_df)

In [11]:
# create a list of lists
# sublists are of length = len(word_dict)
# values 1 for word appearing in word_dict, 0 otherwise
    
def bag_of_words(df):
    sentence_vector = []
    for sentence in df['sentence']:
        sentence_x = []
        for key in word_dict.keys():            
            if key in sentence:
#                 sentence_x.append(1)
                sentence_x.append(sentence.count(key))
            else:
                sentence_x.append(0)
        sentence_vector.append(sentence_x)
    return sentence_vector

bow_train = bag_of_words(train_df)

In [12]:
# feature vectors for top two lines of Bag of Words model

print('Tranformed Sentence: \n{}\nBag of Words Sentence: \n{}'.format(train_df['sentence'][0], bow_train[0]))
print('Tranformed Sentence: \n{}\nBag of Words Sentence: \n{}'.format(train_df['sentence'][1], bow_train[1]))

Tranformed Sentence: 
['wasted', 'enough', 'life', 'poured', 'salt', 'wound', 'drawing', 'time', 'took', 'bring', 'check']
Bag of Words Sentence: 
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

We decided to use L2 Normalization as our only post-processing strategy. This strategy helped us best combat the huge variance of the elements in our feature vectors. 

In [13]:
# Postprocessing strategy 
# Chosen: l2 norm

def l2_norm(vector):
    return np.linalg.norm(vector)
        
def normalize_matrix(matrix):
    # iterates through each row of the dataframe/matrix
    # calculates the l2 norm of that matrix
    # divides each element in the row by the norm and appends into a new row
    # append each new row into a matrix, then return
    norm_matrix = []
    for row in matrix:
        norm_row = []
        norm = l2_norm(row)
        if norm == 0:
            norm = 1
        for element in row:
            norm_row.append(element/norm)
        norm_matrix.append(norm_row)
    return norm_matrix

The words that played the most important roles in deciding the sentiment of reviews are:
    'adapter', 'provide', 'enough', 'charging', 'current'

We also see that our Logistic Regression model has performed much better than the Gaussian Naive Bayes model,  and similarly to the Bernoulli Naive Bayes model.

In [14]:
# models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

# metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


In [15]:
# normalize test data
# set up training and test data

bow_test = bag_of_words(test_df)

# l2 norm makes results worse
# bow_test = normalize_matrix(bow_test)
# bow_train = normalize_matrix(bow_train)

X_train = bow_train
y_train = train_df['score']

X_test = bow_test
y_test = test_df['score']

In [16]:
# Calculating the most impactful sentence

clf = LogisticRegression()
clf.fit(X_train, y_train)
acc = clf.score(X_test, y_test)

max_value = max(clf.coef_[0])
max_index = np.argmax(clf.coef_[0])
print('Most Pos. Value: {}\nMost Pos. Index: {}\nMost Pos. Sentence: {}'.format(max_value, max_index, train_df.iloc[max_index][0]))

min_value = min(clf.coef_[0])
min_index = np.argmin(clf.coef_[0])
print('\nMost Neg. Value: {}\nMost Neg. Index: {}\nMost Neg. Sentence: {}'.format(min_value, min_index, train_df.iloc[min_index][0]))

Most Pos. Value: 2.9531754403519623
Most Pos. Index: 1016
Most Pos. Sentence: ['adapter', 'provide', 'enough', 'charging', 'current']

Most Neg. Value: -2.0877427773291934
Most Neg. Index: 161
Most Neg. Sentence: ['dessert', 'bit', 'strange']




In [17]:
# create models and print information

def run_ml(X_train, y_train, X_test, y_test):
    models = {LogisticRegression(): 'Logistic Regression' , 
              GaussianNB(): 'Gaussian Naive Bayes', 
              BernoulliNB(): 'Bernoulli Naive Bayes'}

    for clf in models:
        # fit on training data(X_train) and training labels (y_train)
        clf.fit(X_train, y_train)

        # score the model bosed on testing data(X_test) and testing labels (y_test)
        acc = clf.score(X_test, y_test)

        # predict labels of the training data
        y_pred = clf.predict(X_test)

        # create confusion matrix, precision and recall scores
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)

        print('\nModel: {}'.format(models[clf]))
        print('Accuracy: {}\nPrecision Score: {}\nRecall Score: {}\n'.format(round(acc,3), round(prec,3), round(rec,3)))
        print('True Negatives:  {} False Positives: {}\nFalse Negatives: {}  True Positives:  {}\n'.format(tn, fp, fn, tp))
        print('-------------------------')
run_ml(X_train, y_train, X_test, y_test)





Model: Logistic Regression
Accuracy: 0.763
Precision Score: 0.762
Recall Score: 0.767

True Negatives:  228 False Positives: 72
False Negatives: 70  True Positives:  230

-------------------------

Model: Gaussian Naive Bayes
Accuracy: 0.633
Precision Score: 0.706
Recall Score: 0.457

True Negatives:  243 False Positives: 57
False Negatives: 163  True Positives:  137

-------------------------

Model: Bernoulli Naive Bayes
Accuracy: 0.783
Precision Score: 0.776
Recall Score: 0.797

True Negatives:  231 False Positives: 69
False Negatives: 61  True Positives:  239

-------------------------


In [18]:
from nltk.util import ngrams

In [19]:
# n-grams, n = 2
def n_grams_matrix(df):
    sentences = []
    for sentence in df['sentence']:
        sentences.append(list(ngrams(sentence, 2)))
    return sentences

n_grams_train = n_grams_matrix(train_df)
n_grams_test = n_grams_matrix(test_df)

n_grams_train = pd.DataFrame({'sentence':n_grams_train})
n_grams_test = pd.DataFrame({'sentence':n_grams_test})

word_dict = fill_training_dict(n_grams_train)
word_dict = fill_dict_values(word_dict, n_grams_train)
word_dict = fill_dict_values(word_dict, n_grams_test)


# l2 norm makes accuracy worse 
ng_train = bag_of_words(n_grams_train)
# ng_train = normalize_matrix(ng_train)

ng_test = bag_of_words(n_grams_test)
# ng_test = normalize_matrix(ng_test)

In [20]:
# training and test data split

X_train = ng_train
y_train = train_df['score']

X_test = ng_test
y_test = test_df['score']

In [21]:
run_ml(X_train, y_train, X_test, y_test)




Model: Logistic Regression
Accuracy: 0.632
Precision Score: 0.742
Recall Score: 0.403

True Negatives:  258 False Positives: 42
False Negatives: 179  True Positives:  121

-------------------------

Model: Gaussian Naive Bayes
Accuracy: 0.637
Precision Score: 0.789
Recall Score: 0.373

True Negatives:  270 False Positives: 30
False Negatives: 188  True Positives:  112

-------------------------

Model: Bernoulli Naive Bayes
Accuracy: 0.642
Precision Score: 0.771
Recall Score: 0.403

True Negatives:  264 False Positives: 36
False Negatives: 179  True Positives:  121

-------------------------


In [22]:
# help from: https://stats.stackexchange.com/questions/134282/relationship-between-svd-and-pca-how-to-use-svd-to-perform-pca/134283

def implement_pca(X_train, X_test, n):
    # ensure data is centered
    X_train = X_train - np.mean(X_train)
    X_test = X_test - np.mean(X_test)
    
    u, s, vh = np.linalg.svd(X_train)
    
    X_train_pca = np.matmul(u[:, :n], np.diag(s[:n]))
    X_test_pca = np.matmul(X_test, vh.T[:, :n])

    return X_train_pca, X_test_pca

In [None]:
y_train = train_df['score']
y_test = test_df['score']

bow_data = []
ng_data = []
pca_dims = [10, 50, 100]

for dims in pca_dims:
    bow_data.append(implement_pca(bow_train, bow_test, dims))
bow_data.append([bow_train, bow_test])
    
for dims in pca_dims:
    ng_data.append(implement_pca(ng_train, ng_test, dims))
ng_data.append([ng_train, ng_test])  

bow_names = ['bow10', 'bow50', 'bow100', 'bow_OG']
ng_names = ['ng10', 'ng50', 'ng100', 'ng_OG']
    
for i in range(len(bow_data)):
    print('Iteration: {}, Using: {}\n'.format(i, bow_names[i]))
    run_ml(bow_data[i][0], y_train, bow_data[i][1], y_test)

for i in range(len(ng_data)):
    print('Iteration: {}, Using: {}\n'.format(i, ng_names[i]))
    run_ml(ng_data[i][0], y_train, ng_data[i][1], y_test)

The method that performs the best surprisingly is the regular Bag of Words model. This is because the N-Grams model is too wide of a dataset with an awkward amount of feature vectors that do not have any results because they were of length < N. I learned that expressions such as ('enough' and 'current') will make a review postive and ('bit' and 'strange') with make a review negative. 