In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict, GridSearchCV, RandomizedSearchCV, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, ComplementNB

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score, precision_score, recall_score, plot_confusion_matrix

from sklearn.pipeline import Pipeline

import spacy
import contractions
import re

In [3]:
# Load the small English spaCy model with the parser and NER components disabled.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)

In [4]:
# Read the processed reviews; `cleaned_text` is parsed back from its string
# form with literal_eval while loading.
data = pd.read_csv(
    'data/processed_reviews.csv',
    converters={'cleaned_text': literal_eval},
)

In [None]:
# Summary of the loaded frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Peek at the first rows of the loaded frame.
data.head()

In [None]:
# Relative frequency of each sentiment label.
data.sentiment.value_counts(normalize = True)

In [6]:
# Handle on spaCy's English stop-word set, used below only to inspect its size.
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [None]:
# Size of the stop-word set at this point in the notebook.
len(stop_words)

In [5]:
# Words that the loop in the next cell takes OUT of spaCy's stop words.
stop_list = [
    'cannot',
    'not',
    'nor',
    'no',
]

In [7]:
# Un-flag every word in `stop_list` as a stop word so that it is no longer
# dropped by the `token.is_stop` filter in clean_text.
for item in stop_list:
    # Remove the word from the default stop-word set. Use lowercase!
    # discard() rather than remove(): re-running this cell must not raise
    # KeyError just because the word is already gone.
    nlp.Defaults.stop_words.discard(item)

    # Clear the stop-word flag on the lexeme itself
    nlp.vocab[item].is_stop = False

In [8]:
# Re-check the size of the stop-word set after the removal loop.
len(stop_words)

322

In [9]:
# Raw text of one example review (row label 132512).
data.reviews[132512]

"I got these headphones as a xmas for my wife (yeah for me too). I own multiple pairs of corded headphones and one other pair of cheap bluetooth headphones (Kinivo BTH240). My daily go to headphones are a pair of Sennheiser hd 280 pro cans and I have a pair of Audio-Technica ATH-M30 that I use at work.\n\nRatings:\n\nSound: 8/10\n\nThe sound is fairly detailed and well balanced. The bass end is not overpowering nor underwhelming. Midrange is clear and distinct. High end has detail and it is not too tinny. YMMV. I would say, for the headphones I use, the sound reproduction is somewhere between my Audio-Technica ATH-M30 but not quite as good as my Sennheiser hd 280. Which is on par for the price range they are in. Come on people, stop comparing these to headphones that cost 3-5 times as much.\n\nComfort: 10/10\n\nThese fit over my ears well and these may very well be the most comfortable headphones I presently own. Most headphones I wear do not bother me much and these are no exception.\

In [10]:
# Previously stored token list for the same example review.
data.cleaned_text[132512]

['get',
 'headphone',
 'xma',
 'wife',
 'multiple',
 'pair',
 'cord',
 'headphone',
 'pair',
 'cheap',
 'bluetooth',
 'headphone',
 'daily',
 'headphone',
 'pair',
 'can',
 'pair',
 'use',
 'work',
 'sound',
 'sound',
 'fairly',
 'detailed',
 'balance',
 'bass',
 'end',
 'overpower',
 'underwhelme',
 'clear',
 'distinct',
 'high',
 'end',
 'detail',
 'tinny',
 'headphone',
 'use',
 'sound',
 'reproduction',
 'good',
 'par',
 'price',
 'range',
 'people',
 'stop',
 'compare',
 'headphone',
 'cost',
 'time',
 'fit',
 'ear',
 'comfortable',
 'headphone',
 'presently',
 'headphone',
 'wear',
 'bother',
 'exception',
 'build',
 'quality',
 'headphone',
 'solid',
 'feel',
 'lot',
 'high',
 'end',
 'car',
 'door',
 'time',
 'tell',
 'durable',
 'lot',
 'confidence',
 'cancellation',
 'read',
 'review',
 'astounded',
 'people',
 'audacious',
 'compare',
 'headphone',
 'frankly',
 'ridiculous',
 'start',
 'move',
 'forgive',
 'give',
 'price',
 'range',
 'sound',
 'quality',
 'average',
 'outst

In [None]:
# NOTE(review): clean_text is defined in the cell BELOW this one, so this call
# fails with NameError on Restart & Run All unless that cell has been run first.
clean_text(data.reviews[132512])

In [11]:
def clean_text(text):
    """Clean one raw review and return its lemmas as a list of lowercase strings.

    The text is stripped of HTML remnants and URLs, contractions are expanded,
    everything except ASCII letters and whitespace is dropped, and the result
    is run through the module-level spaCy pipeline `nlp`. Punctuation, stop
    words, whitespace tokens and one-character tokens are discarded.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. NaN for a missing review)
        yields an empty list instead of raising.

    Returns
    -------
    list of str
        Lower-cased, stripped, non-empty lemmas in document order.
    """
    # Missing reviews have nothing to tokenize.
    if not isinstance(text, str):
        return []

    # Replace &nbsp; with regular space
    text = text.replace("&nbsp;", " ")
    # Replace HTML tags with a space: deleting them outright fuses the words
    # on either side ("great.<br />Sound" -> "greatSound" after cleaning).
    text = re.sub(r"<[^>]+>", " ", text)
    # Same for URLs
    text = re.sub(r"http\S+", " ", text)
    # Remove line breaks and extra whitespace
    text = re.sub(r"\s+", " ", text)
    # Expand contractions
    text = contractions.fix(text)
    # Keep only ASCII letters and whitespace (this already removes all digits,
    # so no separate \d+ pass is needed)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse whitespace again: stripping characters leaves runs of spaces
    # behind, and spaCy turns those runs into whitespace tokens.
    text = re.sub(r"\s+", " ", text).strip()

    # Tokenize and lemmatize
    doc = nlp(text)
    lemmas = []
    for token in doc:
        if token.is_punct or token.is_stop or token.is_space or len(token) <= 1:
            continue
        # Lower-casing and stripping happen on the lemma, not on the raw text
        lemma = token.lemma_.lower().strip()
        if lemma:
            lemmas.append(lemma)

    return lemmas

In [12]:
# Re-clean every raw review with clean_text and store the token lists
# in a new column.
data['cleaned_text_sent'] = data.reviews.apply(clean_text)

In [15]:
# Persist the frame (including the new column) without the index.
data.to_csv(
    'data/cleaned_reviews.csv',
    index=False,
)

In [14]:
# Compare `cleaned_text` with the new `cleaned_text_sent` column on the first rows.
data.head()

Unnamed: 0,reviews,sentiment,cleaned_text,dominant_topic,cleaned_text_sent
0,"Get the SportaPros instead. They look better,...",1,"[instead, look, well, wear, street, configurat...",4,"[sportapros, instead, look, well, wear, street..."
1,I've been looking for a lighter alternative to...,1,"[look, light, alternative, absolutely, perfect...",2,"[look, light, alternative, absolutely, perfect..."
2,The finest headphones available. You can spend...,1,"[fine, headphone, available, spend, vast, amou...",2,"[fine, headphone, available, spend, vast, amou..."
3,3rd pair of these I've purchased. My wife has...,1,"[pair, purchase, wife, pair, pair, glove, box,...",3,"[rd, pair, purchase, wife, pair, pair, glove, ..."
4,My old Koss Porta Pros finally got beat to dea...,1,"[old, finally, get, beat, death, year, ago, la...",4,"[old, koss, porta, pros, finally, get, beat, d..."


In [None]:
def dummy(doc):
    """Identity function: return the argument untouched.

    Passed as both `tokenizer` and `preprocessor` to the vectorizers in this
    notebook, whose input documents are already lists of tokens.
    """
    unchanged = doc
    return unchanged

In [None]:
# Unigram counter over pre-tokenized documents (the identity function stands
# in for both the tokenizer and the preprocessor).
cv = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=None,
    tokenizer=dummy,
    preprocessor=dummy,
)

In [None]:
def top_words(text, n, vect):
    """Return the `n` most frequent vocabulary entries as a DataFrame.

    Parameters
    ----------
    text : iterable of documents accepted by `vect.fit_transform`.
    n : int
        Number of leading rows to keep.
    vect : vectorizer exposing `fit_transform` and `vocabulary_`.
        It is (re)fitted on `text` by this call.

    Returns
    -------
    pandas.DataFrame with columns 'word' and 'count', sorted by count in
    descending order; ties keep the iteration order of `vect.vocabulary_`.
    """
    doc_term = vect.fit_transform(text)
    # Column totals: one count per vocabulary entry.
    totals = doc_term.sum(axis=0)

    pairs = []
    for term, column in vect.vocabulary_.items():
        pairs.append((term, totals[0, column]))
    pairs.sort(key=lambda pair: pair[1], reverse=True)

    ranked = pd.DataFrame(pairs, columns=['word', 'count'])
    return ranked.iloc[:n]

In [None]:
# Top 30 unigrams per sentiment class over all reviews.
# The original passed `vect = vect`, but no `vect` is defined in any visible
# cell (NameError); the unigram CountVectorizer built for this purpose is `cv`.
top30_pos = top_words(text = data[data.sentiment == 1].cleaned_text,
                      n = 30,
                      vect = cv)
top30_neg = top_words(text = data[data.sentiment == 0].cleaned_text,
                      n = 30,
                      vect = cv)

In [None]:
# Same ranking restricted to reviews whose dominant topic is 2.
# This rebinds top30_pos / top30_neg.
in_topic_2 = data.dominant_topic == 2
top30_pos = top_words(
    text = data[(data.sentiment == 1) & in_topic_2].cleaned_text,
    n = 30,
    vect = cv,
)
top30_neg = top_words(
    text = data[(data.sentiment == 0) & in_topic_2].cleaned_text,
    n = 30,
    vect = cv,
)

In [None]:
# Token lists of positive / negative reviews whose dominant topic is 2
# (the names say "3", the filter uses topic label 2).
topic_2_rows = data.dominant_topic == 2
pos3 = data[(data.sentiment == 1) & topic_2_rows].cleaned_text
neg3 = data[(data.sentiment == 0) & topic_2_rows].cleaned_text

In [None]:
# Side-by-side horizontal bar charts of the most frequent words per class.
fig, axs = plt.subplots(1, 2, figsize=(15,10))
word_panels = [
    (top30_pos, 'Top 30 Frequent Positive Words'),
    (top30_neg, 'Top 30 Frequent Negative Words'),
]
for panel_ax, (freq_table, panel_title) in zip(axs, word_panels):
    sns.barplot(data = freq_table, x = 'count', y = 'word', orient = 'h', ax = panel_ax)
    panel_ax.set_title(panel_title)
fig.tight_layout();

In [None]:
# Bigram counter over pre-tokenized documents; bigrams that occur in more
# than half of the documents are ignored (max_df = 0.5).
cv_bigram = CountVectorizer(
    ngram_range=(2, 2),
    stop_words=None,
    tokenizer=dummy,
    preprocessor=dummy,
    max_df=0.5,
)

In [None]:
# Top 30 bigrams per sentiment class over all reviews (positive first, then negative).
top30_pos_bigrams, top30_neg_bigrams = [
    top_words(text = data[data.sentiment == label].cleaned_text,
              n = 30,
              vect = cv_bigram)
    for label in (1, 0)
]

In [None]:
# Top 30 bigrams for the topic-filtered subsets; rebinds the two bigram tables.
top30_pos_bigrams, top30_neg_bigrams = [
    top_words(text = subset, n = 30, vect = cv_bigram)
    for subset in (pos3, neg3)
]

In [None]:
# Side-by-side horizontal bar charts of the most frequent bigrams per class.
fig, axs = plt.subplots(1, 2, figsize=(15,10))
bigram_panels = [
    (top30_pos_bigrams, 'Top 30 Frequent Positive Bigrams'),
    (top30_neg_bigrams, 'Top 30 Frequent Negative Bigrams'),
]
for panel_ax, (freq_table, panel_title) in zip(axs, bigram_panels):
    sns.barplot(data = freq_table, x = 'count', y = 'word', orient = 'h', ax = panel_ax)
    panel_ax.set_title(panel_title)
fig.tight_layout();

In [None]:
# Trigram counter over pre-tokenized documents.
cv_trigram = CountVectorizer(
    ngram_range=(3, 3),
    stop_words=None,
    tokenizer=dummy,
    preprocessor=dummy,
)

In [None]:
# Top 30 trigrams per sentiment class over all reviews (positive first, then negative).
top30_pos_trigrams, top30_neg_trigrams = [
    top_words(text = data[data.sentiment == label].cleaned_text,
              n = 30,
              vect = cv_trigram)
    for label in (1, 0)
]

In [None]:
# Top 30 trigrams for the topic-filtered subsets; rebinds the two trigram tables.
top30_pos_trigrams, top30_neg_trigrams = [
    top_words(text = subset, n = 30, vect = cv_trigram)
    for subset in (pos3, neg3)
]

In [None]:
# Side-by-side horizontal bar charts of the most frequent trigrams per class.
fig, axs = plt.subplots(1, 2, figsize=(15,10))
trigram_panels = [
    (top30_pos_trigrams, 'Top 30 Frequent Positive Trigrams'),
    (top30_neg_trigrams, 'Top 30 Frequent Negative Trigrams'),
]
for panel_ax, (freq_table, panel_title) in zip(axs, trigram_panels):
    sns.barplot(data = freq_table, x = 'count', y = 'word', orient = 'h', ax = panel_ax)
    panel_ax.set_title(panel_title)
fig.tight_layout();

## Train Test Split

In [None]:
# Features are the stored token lists, target is the sentiment label.
# NOTE(review): the models use `cleaned_text`, not the `cleaned_text_sent`
# column built earlier in this notebook - confirm that this is intended.
X, y = data.cleaned_text, data.sentiment

In [None]:
# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Label proportions in the training split.
y_train.value_counts(normalize = True)

In [None]:
# Label proportions in the test split.
y_test.value_counts(normalize = True)

In [None]:
def dummy(doc):
    """Pass-through used as tokenizer/preprocessor for pre-tokenized input.

    NOTE(review): an identical `dummy` is already defined earlier in this
    notebook; this definition simply rebinds the name to the same behaviour.
    """
    passthrough = doc
    return passthrough

## Count Vectorizer

In [None]:
# Unigram bag-of-words: keep terms seen in at least 10 documents and in at
# most 70% of them.
# NOTE(review): the next cell rebinds `count_vect`, so this configuration is
# only used if that cell is skipped.
count_vect = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=None,
    tokenizer=dummy,
    preprocessor=dummy,
    min_df=10,
    max_df=0.7,
)


In [None]:
# Bag of 1- to 3-grams: keep terms seen in at least 10 documents and in at
# most 90% of them. Rebinds `count_vect`.
count_vect = CountVectorizer(
    ngram_range=(1, 3),
    stop_words=None,
    tokenizer=dummy,
    preprocessor=dummy,
    min_df=10,
    max_df=0.9,
)

### Logistic Regression

In [None]:
# Class-weighted logistic regression fitted with the saga solver.
logr = LogisticRegression(
    class_weight='balanced',
    solver='saga',
    max_iter=500,
)


In [None]:
# Count vectorizer followed by logistic regression.
logr_cv_steps = [('vectorizer', count_vect), ('logreg', logr)]
logr_cv_model = Pipeline(logr_cv_steps)

In [None]:
# 5-fold cross-validation on the training split, scoring F1, precision and recall.
cv_logr = cross_validate(
    logr_cv_model,
    X_train,
    y_train,
    cv=5,
    scoring=['f1', 'precision', 'recall'],
)

In [None]:
# Empty results table; one row is added per evaluated model further down.
result_columns = ['Model', 'mean f1_score', 'std f1_score', 'mean precision', 'mean recall']
model_results = pd.DataFrame(columns=result_columns)

In [None]:
# Add the count-vectorizer + logistic-regression CV summary to the results table.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# The label previously read 'logred_cv_model' (typo for the pipeline's name).
logr_cv_row = pd.DataFrame([{
    'Model': 'logr_cv_model',
    'mean f1_score': round(cv_logr['test_f1'].mean(), 4),
    'std f1_score': round(cv_logr['test_f1'].std(), 4),
    'mean precision': round(cv_logr['test_precision'].mean(), 4),
    'mean recall': round(cv_logr['test_recall'].mean(), 4),
}])
model_results = pd.concat([model_results, logr_cv_row], ignore_index = True)

In [None]:
# Display the results table collected so far.
model_results

### Naive Bayes

In [None]:
# Multinomial naive Bayes with fixed class priors.
# NOTE(review): 0.23 / 0.77 presumably mirror the label shares of the data -
# confirm against the value_counts output.
mnb = MultinomialNB(
    class_prior=[0.23, 0.77],
)

In [None]:
# Count vectorizer followed by multinomial naive Bayes.
mnb_cv_steps = [('vectorizer', count_vect), ('mnb', mnb)]
mnb_cv_model = Pipeline(mnb_cv_steps)

In [None]:
# 5-fold cross-validation of the naive Bayes pipeline on the training split.
cv_mnb = cross_validate(
    mnb_cv_model,
    X_train,
    y_train,
    cv=5,
    scoring=['precision', 'recall', 'f1'],
)

In [None]:
# Add the count-vectorizer + naive-Bayes CV summary to the results table.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
mnb_cv_row = pd.DataFrame([{
    'Model': 'mnb_cv_model',
    'mean f1_score': round(cv_mnb['test_f1'].mean(), 4),
    'std f1_score': round(cv_mnb['test_f1'].std(), 4),
    'mean precision': round(cv_mnb['test_precision'].mean(), 4),
    'mean recall': round(cv_mnb['test_recall'].mean(), 4),
}])
model_results = pd.concat([model_results, mnb_cv_row], ignore_index = True)

In [None]:
# Display the results table collected so far.
model_results

### Random Forest

In [None]:
# Random forest with default hyper-parameters and a fixed seed.
rfc = RandomForestClassifier(
    random_state=42,
)

In [None]:
# Count vectorizer followed by the random forest.
rfc_cv_steps = [('vectorizer', count_vect), ('rfc', rfc)]
rfc_cv_model = Pipeline(rfc_cv_steps)

In [None]:
# 5-fold cross-validation of the random-forest pipeline on the training split.
cv_rf = cross_validate(
    rfc_cv_model,
    X_train,
    y_train,
    cv=5,
    scoring=['precision', 'recall', 'f1'],
)

In [None]:
# Add the count-vectorizer + random-forest CV summary to the results table.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
rfc_cv_row = pd.DataFrame([{
    'Model': 'rfc_cv_model',
    'mean f1_score': round(cv_rf['test_f1'].mean(), 4),
    'std f1_score': round(cv_rf['test_f1'].std(), 4),
    'mean precision': round(cv_rf['test_precision'].mean(), 4),
    'mean recall': round(cv_rf['test_recall'].mean(), 4),
}])
model_results = pd.concat([model_results, rfc_cv_row], ignore_index = True)

In [None]:
# Display the results table collected so far.
model_results

## TF-IDF

In [None]:
# Unigram TF-IDF over pre-tokenized documents: keep terms seen in at least
# 10 documents and in at most 90% of them.
tfidf = TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words=None,
    tokenizer=dummy,
    preprocessor=dummy,
    min_df=10,
    max_df=0.9,
)

### Logistic Regression

In [None]:
# TF-IDF vectorizer followed by logistic regression.
logr_tfidf_steps = [('vectorizer', tfidf), ('logreg', logr)]
logr_tfidf_model = Pipeline(logr_tfidf_steps)

In [None]:
# 5-fold cross-validation of the TF-IDF + logistic-regression pipeline.
tfidf_logr = cross_validate(
    logr_tfidf_model,
    X_train,
    y_train,
    cv=5,
    scoring=['precision', 'recall', 'f1'],
)

In [None]:
# Add the TF-IDF + logistic-regression CV summary to the results table.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
tfidf_logr_row = pd.DataFrame([{
    'Model': 'tfidf_logr_model',
    'mean f1_score': round(tfidf_logr['test_f1'].mean(), 4),
    'std f1_score': round(tfidf_logr['test_f1'].std(), 4),
    'mean precision': round(tfidf_logr['test_precision'].mean(), 4),
    'mean recall': round(tfidf_logr['test_recall'].mean(), 4),
}])
model_results = pd.concat([model_results, tfidf_logr_row], ignore_index = True)

In [None]:
# Display the results table collected so far.
model_results

### Naive Bayes

In [None]:
# TF-IDF vectorizer followed by multinomial naive Bayes.
mnb_tfidf_steps = [('vectorizer', tfidf), ('clf', mnb)]
mnb_tfidf_model = Pipeline(mnb_tfidf_steps)

In [None]:
# 5-fold cross-validation of the TF-IDF + naive-Bayes pipeline.
tfidf_mnb = cross_validate(
    mnb_tfidf_model,
    X_train,
    y_train,
    cv=5,
    scoring=['precision', 'recall', 'f1'],
)

In [None]:
# Add the TF-IDF + naive-Bayes CV summary to the results table and show it.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
tfidf_mnb_row = pd.DataFrame([{
    'Model': 'tfidf_mnb_model',
    'mean f1_score': round(tfidf_mnb['test_f1'].mean(), 4),
    'std f1_score': round(tfidf_mnb['test_f1'].std(), 4),
    'mean precision': round(tfidf_mnb['test_precision'].mean(), 4),
    'mean recall': round(tfidf_mnb['test_recall'].mean(), 4),
}])
model_results = pd.concat([model_results, tfidf_mnb_row], ignore_index = True)
model_results

### Random Forest

In [None]:
# TF-IDF vectorizer followed by the random forest.
rfc_tfidf_steps = [('vectorizer', tfidf), ('clf', rfc)]
rfc_tfidf_model = Pipeline(rfc_tfidf_steps)

In [None]:
# 5-fold cross-validation of the TF-IDF + random-forest pipeline.
tfidf_rf = cross_validate(
    rfc_tfidf_model,
    X_train,
    y_train,
    cv=5,
    scoring=['precision', 'recall', 'f1'],
)

In [None]:
# Add the TF-IDF + random-forest CV summary to the results table and show it.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
tfidf_rf_row = pd.DataFrame([{
    'Model': 'tfidf_rf_model',
    'mean f1_score': round(tfidf_rf['test_f1'].mean(), 4),
    'std f1_score': round(tfidf_rf['test_f1'].std(), 4),
    'mean precision': round(tfidf_rf['test_precision'].mean(), 4),
    'mean recall': round(tfidf_rf['test_recall'].mean(), 4),
}])
model_results = pd.concat([model_results, tfidf_rf_row], ignore_index = True)
model_results

## Word Embeddings

In [None]:
# Alias for the training documents; the vocabulary below is built from these only.
reviews = X_train

In [None]:
# Every distinct token that occurs in the training documents.
total_vocabulary = {word for review in reviews for word in review}

In [None]:
# Number of distinct tokens in the training documents.
len(total_vocabulary)

In [None]:
# Build a word -> vector lookup from the GloVe file, keeping only words that
# occur in the training vocabulary.
glove = {}
with open('data/glove.6B.50d.txt', 'rb') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        # First field is the word (bytes, decoded as UTF-8); the rest are its components.
        token = fields[0].decode('utf-8')
        if token not in total_vocabulary:
            continue
        glove[token] = np.array(fields[1:], dtype=np.float32)

In [None]:
# Spot-check: the stored vector for one word.
glove['great']

In [None]:
class W2vVectorizer(object):
    """Represent each tokenized document by the mean of its word vectors.

    Parameters
    ----------
    w2v : dict
        Mapping from word to a 1-D vector; all vectors are expected to have
        the same length.

    Attributes
    ----------
    w2v : the mapping passed in.
    dimensions : length of the vectors in `w2v` (0 when the mapping is empty).
    """

    def __init__(self, w2v):
        # Takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            # Measure the vectors of the mapping that was passed in. The
            # original looked the key up in the global `glove`, which breaks
            # for any other mapping and when `glove` is not defined.
            self.dimensions = len(next(iter(w2v.values())))

    # Note: Even though it doesn't do anything, it's required that this object implement a fit method or else
    # it can't be used in a scikit-learn pipeline
    def fit(self, X, y=None):
        """No-op; returns self. `y` is optional so `fit(X)` also works."""
        return self

    def transform(self, X):
        """Return an array with one row per document in `X`.

        Each row is the element-wise mean of the vectors of the document's
        words that are present in `w2v`; a document with no known words
        (or no words at all) becomes a zero vector of length `dimensions`.
        """
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                    or [np.zeros(self.dimensions)], axis=0)
            for words in X
        ])

In [None]:
# Averaged-GloVe vectorizer followed by logistic regression.
logr_w2v_steps = [('vectorizer', W2vVectorizer(glove)), ('logreg', logr)]
logr_w2v_model = Pipeline(logr_w2v_steps)

In [None]:
# 5-fold cross-validation of the GloVe + logistic-regression pipeline.
# The original cross-validated `logr_tfidf_model` here, so the "w2v" row in
# the results table merely repeated the TF-IDF scores.
w2v_logr = cross_validate(logr_w2v_model, X_train, y_train, cv = 5, scoring = ['precision', 'recall', 'f1'])

In [None]:
# Add the GloVe + logistic-regression CV summary to the results table.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
w2v_logr_row = pd.DataFrame([{
    'Model': 'w2v_logr_model',
    'mean f1_score': round(w2v_logr['test_f1'].mean(), 4),
    'std f1_score': round(w2v_logr['test_f1'].std(), 4),
    'mean precision': round(w2v_logr['test_precision'].mean(), 4),
    'mean recall': round(w2v_logr['test_recall'].mean(), 4),
}])
model_results = pd.concat([model_results, w2v_logr_row], ignore_index = True)

In [None]:
# Display the results table collected so far.
model_results

In [None]:
# NOTE(review): `w2v_mnb` is not assigned in any visible cell, so this cell
# raises NameError on a fresh run. MultinomialNB also rejects negative feature
# values, which averaged word embeddings presumably contain - confirm whether
# this row should exist at all.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
mnb_w2v_row = pd.DataFrame([{
    'Model': 'mnb_w2v_model',
    'mean f1_score': round(w2v_mnb['test_f1'].mean(), 4),
    'std f1_score': round(w2v_mnb['test_f1'].std(), 4),
    'mean precision': round(w2v_mnb['test_precision'].mean(), 4),
    'mean recall': round(w2v_mnb['test_recall'].mean(), 4),
}])
model_results = pd.concat([model_results, mnb_w2v_row], ignore_index = True)

model_results

In [None]:
# Random forest on averaged GloVe vectors: build the pipeline, cross-validate
# it on the training split, and record the summary.
rfc = RandomForestClassifier(random_state=42)

rfc_w2v_model = Pipeline([
    ('vectorizer',  W2vVectorizer(glove)),
    ('rfc', rfc)
])

w2v_rf = cross_validate(rfc_w2v_model, X_train, y_train, cv = 5, scoring = ['precision', 'recall', 'f1'])

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
rfc_w2v_row = pd.DataFrame([{
    'Model': 'rfc_w2v_model',
    'mean f1_score': round(w2v_rf['test_f1'].mean(), 4),
    'std f1_score': round(w2v_rf['test_f1'].std(), 4),
    'mean precision': round(w2v_rf['test_precision'].mean(), 4),
    'mean recall': round(w2v_rf['test_recall'].mean(), 4),
}])
model_results = pd.concat([model_results, rfc_w2v_row], ignore_index = True)

model_results