In [1]:
from lib.amazon_reviews_loader import AmazonReviewsDS
from lib.amazon_reviews_cfg import DS_CFG_NO_SW, DS_CFG_SW

_POS_REV_FILE = 'dataset/pos.txt'
_NEG_REV_FILE = 'dataset/neg.txt'

def review_list_of_tokens_to_corpus(review_list):
    return [" ".join(review) for review in review_list]

if __name__ ==  '__main__':
    print('Retrieving Amazon Reviews Dataset with No Stopwords')
    amazon_rev_no_sw = AmazonReviewsDS(_POS_REV_FILE, _NEG_REV_FILE, DS_CFG_NO_SW)
    print('Retrieving Amazon Reviews Dataset with Stopwords')
    amazon_rev_sw = AmazonReviewsDS(_POS_REV_FILE, _NEG_REV_FILE, DS_CFG_SW)

    print('Splitting the datasets into train, validation and test sets')
    amazon_rev_no_sw_splits = {
            'train' : (review_list_of_tokens_to_corpus(amazon_rev_no_sw.get_train_data()[0]),
                amazon_rev_no_sw.get_train_data()[1]),
            'val': (review_list_of_tokens_to_corpus(amazon_rev_no_sw.get_val_data()[0]),
                amazon_rev_no_sw.get_val_data()[1]),
            'test' : (review_list_of_tokens_to_corpus(amazon_rev_no_sw.get_test_data()[0]),
                amazon_rev_no_sw.get_test_data()[1])
            }

    amazon_rev_sw_splits = {
            'train' : (review_list_of_tokens_to_corpus(amazon_rev_sw.get_train_data()[0]),
                amazon_rev_sw.get_train_data()[1]),
            'val': (review_list_of_tokens_to_corpus(amazon_rev_sw.get_val_data()[0]),
                amazon_rev_sw.get_val_data()[1]),
            'test' : (review_list_of_tokens_to_corpus(amazon_rev_sw.get_test_data()[0]),
                amazon_rev_sw.get_test_data()[1])
            }

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/varsrao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Retrieving Amazon Reviews Dataset with No Stopwords
----- Dataset Synthesis Start -----
Loading Positive Reviews from dataset/pos.txt
Loading Negative Reviews from dataset/neg.txt
Generating data and labels
Tokenizing the data
Removing stop words
Shuffling the data
----- Dataset Synthesis Complete -----
Retrieving Amazon Reviews Dataset with Stopwords
----- Dataset Synthesis Start -----
Loading Positive Reviews from dataset/pos.txt
Loading Negative Reviews from dataset/neg.txt
Generating data and labels
Tokenizing the data
Shuffling the data
----- Dataset Synthesis Complete -----
Splitting the datasets into train, validation and test sets


In [2]:
print('Creating Count Vectorizers')
from sklearn.feature_extraction.text import CountVectorizer
unigram_count_vec = CountVectorizer()
bigram_count_vec = CountVectorizer(ngram_range=(2,2))
unigram_bigram_count_vec = CountVectorizer(ngram_range=(1,2))

Creating Count Vectorizers


In [3]:
print('--------------------------------------------------')
print('Fitting Count Vectorizers to the Entire DataSet: With Stopwords')
amazon_rev_sw_uni_counts = unigram_count_vec.fit_transform(
    amazon_rev_sw_splits['train'][0] +
    amazon_rev_sw_splits['val'][0] +
    amazon_rev_sw_splits['test'][0])
amazon_rev_sw_big_counts = bigram_count_vec.fit_transform(
    amazon_rev_sw_splits['train'][0] +
    amazon_rev_sw_splits['val'][0] +
    amazon_rev_sw_splits['test'][0])
amazon_rev_sw_uni_big_counts = unigram_bigram_count_vec.fit_transform(
    amazon_rev_sw_splits['train'][0] +
    amazon_rev_sw_splits['val'][0] +
    amazon_rev_sw_splits['test'][0])
print(f'Unigram Count Shape: {amazon_rev_sw_uni_counts.shape}')
print(f'Bigram Count Shape: {amazon_rev_sw_big_counts.shape}')
print(f'Unigram+Bigram Count Shape: {amazon_rev_sw_uni_big_counts.shape}')
print('--------------------------------------------------')
print('Fitting Count Vectorizers to the Entire DataSet: Without Stopwords')
amazon_rev_no_sw_uni_counts = unigram_count_vec.fit_transform(
    amazon_rev_no_sw_splits['train'][0] + 
    amazon_rev_no_sw_splits['val'][0] +
    amazon_rev_no_sw_splits['test'][0])
amazon_rev_no_sw_big_counts = bigram_count_vec.fit_transform(
    amazon_rev_no_sw_splits['train'][0] +
    amazon_rev_no_sw_splits['val'][0] +
    amazon_rev_no_sw_splits['test'][0])
amazon_rev_no_sw_uni_big_counts = unigram_bigram_count_vec.fit_transform(
    amazon_rev_no_sw_splits['train'][0] +
    amazon_rev_no_sw_splits['val'][0] +
    amazon_rev_no_sw_splits['test'][0])
print(f'Unigram Count Shape: {amazon_rev_no_sw_uni_counts.shape}')
print(f'Bigram Count Shape: {amazon_rev_no_sw_big_counts.shape}')
print(f'Unigram+Bigram Count Shape: {amazon_rev_no_sw_uni_big_counts.shape}')
print('--------------------------------------------------')

'''
If we use 3 seperate count vectorizers for train, val & test set, since the vocab
may be different in both sets, the feature (token count) representations will
be different (because either the vocab or the vocab size would be different)
'''

--------------------------------------------------
Fitting Count Vectorizers to the Entire DataSet: With Stopwords
Unigram Count Shape: (800000, 80956)
Bigram Count Shape: (800000, 1651114)
Unigram+Bigram Count Shape: (800000, 1732070)
--------------------------------------------------
Fitting Count Vectorizers to the Entire DataSet: Without Stopwords
Unigram Count Shape: (800000, 80813)
Bigram Count Shape: (800000, 2206262)
Unigram+Bigram Count Shape: (800000, 2287075)
--------------------------------------------------


'\nIf we use 3 seperate count vectorizers for train, val & test set, since the vocab\nmay be different in both sets, the feature (token count) representations will\nbe different (because either the vocab or the vocab size would be different)\n'

In [4]:
len_sw_train = len(amazon_rev_sw_splits['train'][1])
len_sw_val = len(amazon_rev_sw_splits['val'][1])
len_no_sw_train = len(amazon_rev_no_sw_splits['train'][1])
len_no_sw_val = len(amazon_rev_no_sw_splits['val'][1])

In [5]:
from sklearn.naive_bayes import MultinomialNB
import numpy as np
def calculate_avg_accuracy(pred, gt):
    return np.mean(pred==gt)

'''
Train only using the train count vectors
Validate only using the val count vectors
'''

alpha_no_sw = alpha_sw = 0
best_val_seen_so_far_no_sw = best_val_seen_so_far_sw = 0

print('Training & Validating Classifiers')
for alpha in np.arange(0.1, 1.1, 0.1):
    print(f'Alpha = {alpha}')
    # Training Without SW
    unigram_no_sw_classifier = MultinomialNB(alpha=alpha).fit(
        amazon_rev_no_sw_uni_counts[:len_no_sw_train],
        amazon_rev_no_sw_splits['train'][1])
    bigram_no_sw_classifier = MultinomialNB(alpha=alpha).fit(
        amazon_rev_no_sw_big_counts[:len_no_sw_train],
        amazon_rev_no_sw_splits['train'][1])
    unigram_bigram_no_sw_classifier = MultinomialNB(alpha=alpha).fit(
        amazon_rev_no_sw_uni_big_counts[:len_no_sw_train],
        amazon_rev_no_sw_splits['train'][1])
    
    # Training With SW
    unigram_sw_classifier = MultinomialNB(alpha=alpha).fit(
        amazon_rev_sw_uni_counts[:len_sw_train],
        amazon_rev_sw_splits['train'][1])
    bigram_sw_classifier = MultinomialNB(alpha=alpha).fit(
        amazon_rev_sw_big_counts[:len_sw_train],
        amazon_rev_sw_splits['train'][1])
    unigram_bigram_sw_classifier = MultinomialNB(alpha=alpha).fit(
        amazon_rev_sw_uni_big_counts[:len_sw_train],
        amazon_rev_sw_splits['train'][1])

    # Validation Without SW
    pred_no_sw_uni = unigram_no_sw_classifier.predict(
        amazon_rev_no_sw_uni_counts[
            len_no_sw_train:len_no_sw_train+len_no_sw_val])
    pred_no_sw_big = bigram_no_sw_classifier.predict(
        amazon_rev_no_sw_big_counts[
            len_no_sw_train:len_no_sw_train+len_no_sw_val])
    pred_no_sw_uni_big = unigram_bigram_no_sw_classifier.predict(
        amazon_rev_no_sw_uni_big_counts[
            len_no_sw_train:len_no_sw_train+len_no_sw_val])
    
    # Validation With SW
    pred_sw_uni = unigram_sw_classifier.predict(
        amazon_rev_sw_uni_counts[
            len_sw_train:len_sw_train+len_sw_val])
    pred_sw_big = bigram_sw_classifier.predict(
        amazon_rev_sw_big_counts[
            len_sw_train:len_sw_train+len_sw_val])
    pred_sw_uni_big = unigram_bigram_sw_classifier.predict(
        amazon_rev_sw_uni_big_counts[
            len_sw_train:len_sw_train+len_sw_val])
    
    # Validation Accuracy Without SW
    uni_no_sw_acc = calculate_avg_accuracy(
        pred_no_sw_uni, amazon_rev_sw_splits["val"][1])
    big_no_sw_acc = calculate_avg_accuracy(
        pred_no_sw_big, amazon_rev_no_sw_splits["val"][1])
    uni_big_no_sw_acc = calculate_avg_accuracy(
        pred_no_sw_uni_big, amazon_rev_no_sw_splits["val"][1])
    
    sum_acc_no_sw = uni_no_sw_acc + big_no_sw_acc + uni_big_no_sw_acc
    if (sum_acc_no_sw > best_val_seen_so_far_no_sw):
        best_val_seen_so_far_no_sw = sum_acc_no_sw
        alpha_no_sw = alpha
    
    # Validation Accuracy With SW
    uni_sw_acc = calculate_avg_accuracy(
        pred_sw_uni, amazon_rev_sw_splits["val"][1])
    big_sw_acc = calculate_avg_accuracy(
        pred_sw_big, amazon_rev_sw_splits["val"][1])
    uni_big_sw_acc = calculate_avg_accuracy(
        pred_sw_uni_big, amazon_rev_sw_splits["val"][1])
    
    sum_acc_sw = uni_sw_acc + big_sw_acc + uni_big_sw_acc
    if (sum_acc_sw > best_val_seen_so_far_sw):
        best_val_seen_so_far_sw = sum_acc_sw
        alpha_sw = alpha

print(f'Best Alpha for Without Stopwords: {alpha_no_sw}')
print(f'Best Alpha for With Stopwords: {alpha_sw}')

Training & Validating Classifiers
Alpha = 0.1
Alpha = 0.2
Alpha = 0.30000000000000004
Alpha = 0.4
Alpha = 0.5
Alpha = 0.6
Alpha = 0.7000000000000001
Alpha = 0.8
Alpha = 0.9
Alpha = 1.0
Best Alpha for Without Stopwords: 0.7000000000000001
Best Alpha for With Stopwords: 0.5


In [6]:
print('Retraining Model After Selecting Best Alpha')
# Training Without SW
unigram_no_sw_classifier = MultinomialNB(alpha=alpha_no_sw).fit(
    amazon_rev_no_sw_uni_counts[:len_no_sw_train],
    amazon_rev_no_sw_splits['train'][1])
bigram_no_sw_classifier = MultinomialNB(alpha=alpha_no_sw).fit(
    amazon_rev_no_sw_big_counts[:len_no_sw_train],
    amazon_rev_no_sw_splits['train'][1])
unigram_bigram_no_sw_classifier = MultinomialNB(alpha=alpha_no_sw).fit(
    amazon_rev_no_sw_uni_big_counts[:len_no_sw_train],
    amazon_rev_no_sw_splits['train'][1])

# Training With SW
unigram_sw_classifier = MultinomialNB(alpha=alpha_sw).fit(
    amazon_rev_sw_uni_counts[:len_sw_train],
    amazon_rev_sw_splits['train'][1])
bigram_sw_classifier = MultinomialNB(alpha=alpha_sw).fit(
    amazon_rev_sw_big_counts[:len_sw_train],
    amazon_rev_sw_splits['train'][1])
unigram_bigram_sw_classifier = MultinomialNB(alpha=alpha_sw).fit(
    amazon_rev_sw_uni_big_counts[:len_sw_train],
    amazon_rev_sw_splits['train'][1])

Retraining Model After Selecting Best Alpha


In [7]:
print('Testing Classifiers: Without Stopwords')
pred_no_sw_uni = unigram_no_sw_classifier.predict(
    amazon_rev_no_sw_uni_counts[-len_no_sw_val:])
pred_no_sw_big = bigram_no_sw_classifier.predict(
    amazon_rev_no_sw_big_counts[-len_no_sw_val:])
pred_no_sw_uni_big = unigram_bigram_no_sw_classifier.predict(
    amazon_rev_no_sw_uni_big_counts[-len_no_sw_val:])

print('Testing Classifiers: With Stopwords')
pred_sw_uni = unigram_sw_classifier.predict(
    amazon_rev_sw_uni_counts[-len_sw_val:])
pred_sw_big = bigram_sw_classifier.predict(
    amazon_rev_sw_big_counts[-len_sw_val:])
pred_sw_uni_big = unigram_bigram_sw_classifier.predict(
    amazon_rev_sw_uni_big_counts[-len_sw_val:])

print('Calculating Accuracies')
print('--------------------------------------------------')
print('With Stopwords:')
print('--------------------------------------------------')
print(f'Unigrams: {calculate_avg_accuracy(pred_sw_uni, amazon_rev_sw_splits["test"][1])}')
print(f'Bigrams: {calculate_avg_accuracy(pred_sw_big, amazon_rev_sw_splits["test"][1])}')
print(f'Unigrams+Bigrams: {calculate_avg_accuracy(pred_sw_uni_big, amazon_rev_sw_splits["test"][1])}')
print('--------------------------------------------------')
print('Without Stopwords:')
print('--------------------------------------------------')
print(f'Unigrams: {calculate_avg_accuracy(pred_no_sw_uni, amazon_rev_no_sw_splits["test"][1])}')
print(f'Bigrams: {calculate_avg_accuracy(pred_no_sw_big, amazon_rev_no_sw_splits["test"][1])}')
print(f'Unigrams+Bigrams: {calculate_avg_accuracy(pred_no_sw_uni_big, amazon_rev_no_sw_splits["test"][1])}')

Testing Classifiers: Without Stopwords
Testing Classifiers: With Stopwords
Calculating Accuracies
--------------------------------------------------
With Stopwords:
--------------------------------------------------
Unigrams: 0.8081375
Bigrams: 0.8282875
Unigrams+Bigrams: 0.835225
--------------------------------------------------
Without Stopwords:
--------------------------------------------------
Unigrams: 0.80405
Bigrams: 0.7886625
Unigrams+Bigrams: 0.82225
