# Sentiment Analysis of Amazon Reviews: Doc2Vec
## By Timothy Pace
This program builds a Doc2Vec model for sentiment analysis of Amazon reviews. The original Kaggle dataset can be found at: https://www.kaggle.com/bittlingmayer/amazonreviews

In [1]:
import numpy as np
import pandas as pd
import bz2
import re
import os
import gc
from six.moves import zip, range

from nltk.corpus import stopwords
from nltk import PorterStemmer

import gensim
from gensim import models

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, roc_auc_score, auc
from sklearn import preprocessing
from collections import Counter, OrderedDict

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import multiprocessing

### Gets the Data

In [2]:
# Open the bzip2-compressed train and test review files for reading.
# Nothing is read yet; the handles are consumed by the following cell.
train = bz2.BZ2File('./data/amazonreviews/train.ft.txt.bz2')
test = bz2.BZ2File('./data/amazonreviews/test.ft.txt.bz2')


In [3]:
# Read every line of both files into memory and close the underlying file
# handles afterwards. The open BZ2File objects are used as context managers;
# rebinding `train`/`test` inside the block is safe because the `with`
# statement keeps its own reference to each handle and closes it on exit.
with train, test:
    train = train.readlines()
    test = test.readlines()


In [4]:
# Force a garbage-collection pass; the cell displays gc.collect()'s return value.
gc.collect()


5

### Pre-Processes the Data

In [5]:
# The files were read in binary mode, so turn every raw byte line into text.
train = [str(raw_line, 'utf-8') for raw_line in train]
test = [str(raw_line, 'utf-8') for raw_line in test]


In [6]:
# Each line is "<tag> <review text>\n". The tag '__label__1' maps to 0 and
# any other tag maps to 1; the review text is everything after the first
# space with its final character (the line terminator) removed.
train_labels = [int(line.split(' ', 1)[0] != '__label__1') for line in train]
train_corpus = [line.split(' ', 1)[1][:-1] for line in train]

# Preview the first three labels and reviews, one list per output line.
print(train_labels[:3], train_corpus[:3], sep='\n')


[1, 1, 1]
['Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^', "The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.", 'Amazing!: This soundtrack is my favorite musi

In [7]:
# The decoded lines are no longer needed once labels and text are extracted.
del train
gc.collect()


0

In [8]:
# Assemble the parallel text/label lists into a two-column DataFrame.
train_dict = dict(text=train_corpus, labels=train_labels)
train_df = pd.DataFrame(data=train_dict)

# Show the first rows as the cell output.
train_df.head()


Unnamed: 0,text,labels
0,Stuning even for the non-gamer: This sound tra...,1
1,The best soundtrack ever to anything.: I'm rea...,1
2,Amazing!: This soundtrack is my favorite music...,1
3,Excellent Soundtrack: I truly like this soundt...,1
4,"Remember, Pull Your Jaw Off The Floor After He...",1


In [9]:
# Summary statistics of the numeric `labels` column for the full training set.
train_df.describe()


Unnamed: 0,labels
count,3600000.0
mean,0.5
std,0.5
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [10]:
# Release the intermediate containers now that train_df holds the data.
# train_dict must be deleted as well: it still references both lists, so
# leaving it behind would keep every review string alive even after
# train_df is later replaced by a small subsample.
del train_labels, train_corpus, train_dict
gc.collect()


7

In [11]:
# Keep a random 25,000-review subset of the training data. A fixed
# random_state makes the same rows get drawn on every Restart & Run All.
train_df = train_df.sample(n=25000, random_state=42)


In [12]:
# Summary statistics of `labels` after subsampling (checks the class balance).
train_df.describe()


Unnamed: 0,labels
count,25000.0
mean,0.49872
std,0.500008
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [13]:
# Same parsing as for the training lines: the tag '__label__1' maps to 0 and
# any other tag maps to 1; the review text is everything after the first
# space with its final character (the line terminator) removed.
test_labels = [int(line.split(' ', 1)[0] != '__label__1') for line in test]
test_corpus = [line.split(' ', 1)[1][:-1] for line in test]

# Preview the first three labels and reviews, one list per output line.
print(test_labels[:3], test_corpus[:3], sep='\n')


[1, 1, 0]
['Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"', "One of the best game music soundtracks - for a game I didn't really play: Despite the fact that I have only played a small portion of the game, the music I heard (plus the connection to Chrono Trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. There is an incredible mix of fun, epic, and emotional songs. Those sad and beautiful tracks I especially like, as there's not too many of those k

In [14]:
# The decoded test lines are no longer needed once labels and text are extracted.
del test
gc.collect()


7

In [15]:
# Assemble the parallel text/label lists into a two-column DataFrame.
test_dict = dict(text=test_corpus, labels=test_labels)
test_df = pd.DataFrame(data=test_dict)

# Show the first rows as the cell output.
test_df.head()


Unnamed: 0,text,labels
0,Great CD: My lovely Pat has one of the GREAT v...,1
1,One of the best game music soundtracks - for a...,1
2,Batteries died within a year ...: I bought thi...,0
3,"works fine, but Maha Energy is better: Check o...",1
4,Great for the non-audiophile: Reviewed quite a...,1


In [16]:
# Summary statistics of the numeric `labels` column for the full test set.
test_df.describe()


Unnamed: 0,labels
count,400000.0
mean,0.5
std,0.500001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [17]:
# Release the intermediate containers now that test_df holds the data.
# test_dict must be deleted as well: it still references both lists, so
# leaving it behind would keep every review string alive even after
# test_df is later replaced by a small subsample.
del test_corpus, test_labels, test_dict
gc.collect()


7

In [18]:
# Keep a random 5,000-review subset of the test data. A fixed random_state
# makes the same rows get drawn on every Restart & Run All.
test_df = test_df.sample(n=5000, random_state=42)


In [19]:
# Summary statistics of `labels` after subsampling (checks the class balance).
test_df.describe()


Unnamed: 0,labels
count,5000.0
mean,0.5056
std,0.500019
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [20]:
# Matches a run of non-word characters or a run of digits.
RE_PREPROCESS = r'\W+|\d+'


def _normalize_text(raw_text):
    """Replace every RE_PREPROCESS match with one space, then lower-case."""
    return re.sub(RE_PREPROCESS, ' ', raw_text).lower()


# Pull labels out as arrays and build arrays of normalised review strings.
train_labels = train_df['labels'].values
train_corpus = np.array([_normalize_text(review) for review in train_df['text'].values])
test_labels = test_df['labels'].values
test_corpus = np.array([_normalize_text(review) for review in test_df['text'].values])


In [21]:
# Labels and cleaned text now live in separate arrays, so drop the DataFrames.
del train_df, test_df
gc.collect()


7

In [22]:
# English stop-word list from the NLTK stopwords corpus, loaded once.
eng_stopwords = stopwords.words('english')
def removeStopwords(corpus, stop_words=None):
    """Tokenise each document on whitespace and drop stop words.

    Args:
        corpus: Sequence of document strings.
        stop_words: Optional iterable of words to remove. When omitted, the
            module-level ``eng_stopwords`` list is used.

    Returns:
        A list with one list of remaining tokens per document, in the same
        order as ``corpus``.
    """
    if stop_words is None:
        stop_words = eng_stopwords
    # Build the lookup set once per call: reloading the stop-word list for
    # every token and scanning it as a list dominates the runtime otherwise.
    stop_set = frozenset(stop_words)
    return [[word for word in doc.split() if word not in stop_set]
            for doc in corpus]

stemmer = PorterStemmer()
def stemWords(corpus):
    """Stem every token of every document with the module-level ``stemmer``.

    Args:
        corpus: Sequence of documents, each one a sequence of token strings.

    Returns:
        A list with one list of stemmed tokens per document, in input order.
    """
    stem = stemmer.stem
    return [list(map(stem, corpus[idx])) for idx in range(len(corpus))]

# Stop-word removal (which also tokenises) runs first, then stemming of the
# remaining tokens; the training split is processed before the test split.
train_corpus = stemWords(removeStopwords(train_corpus))
test_corpus = stemWords(removeStopwords(test_corpus))


In [23]:
def _tag_documents(token_lists, labels):
    """Wrap each token list in a TaggedDocument tagged with its label.

    Note that the tag is the class label itself, not a unique document id,
    so every document of a class shares the same tag.
    """
    return [models.doc2vec.TaggedDocument(words=token_lists[idx], tags=[label])
            for idx, label in enumerate(labels)]


train_set = _tag_documents(train_corpus, train_labels)
test_set = _tag_documents(test_corpus, test_labels)

# Show one example from each split.
print("Example Training Set Doc2Vec Labeled Sentence Object: ")
print(train_set[0])
print()
print("Example Test Set Doc2Vec Labeled Sentence Object: ")
print(test_set[0])


Example Training Set Doc2Vec Labeled Sentence Object: 
TaggedDocument(['lousi', 'write', 'bad', 'book', 'subject', 'matter', 'innat', 'fascin', 'preston', 'write', 'poor', 'constantli', 'distract', 'attempt', 'novelist', 'word', 'load', 'scene', 'imagin', 'detail', 'victim', 'think', 'exampl', 'fond', 'repeat', 'ad', 'nauseum', 'expeci', 'oh', 'clever', 'metaphor', 'like', 'slate', 'wiper', 'popul', 'thinner', 'etc', 'singl', 'worst', 'line', 'book', 'sound', 'like', 'intent', 'joke', 'want', 'experi', 'ebola', 'want', 'ebola', 'experi', 'explain', 'line', 'mean', 'pleas', 'let', 'know'], [0])

Example Test Set Doc2Vec Labeled Sentence Object: 
TaggedDocument(['bride', 'head', 'revisit', 'product', 'omposs', 'watch', 'sequenc', 'episod', 'mix', 'upfor', 'exampl', 'episod', 'abl', 'play', 'press', 'episod', 'three', 'episod', 'disc', 'middl', 'episod', 'would', 'respondand', 'could', 'play', 'amaz', 'receiv', 'inferior', 'product', 'amazon'], [0])


In [24]:
# Tokens and labels are now carried by train_set / test_set, so drop the originals.
del train_corpus, test_corpus, train_labels, test_labels
gc.collect()


0

### Trains the Doc2Vec Model

In [25]:
# Number of CPU cores, used below as the Doc2Vec worker count.
cores = multiprocessing.cpu_count()
# Fail early unless gensim reports a non-negative FAST_VERSION
# (presumably meaning its compiled training routines are available — confirm).
assert gensim.models.doc2vec.FAST_VERSION > -1


In [26]:
# Configure the Doc2Vec model: dm=1 with negative sampling (negative=5, hs=0),
# 100-dimensional vectors, a context window of 10, words seen fewer than
# 2 times dropped, and no down-sampling of frequent words (sample=0).
# NOTE(review): alpha=0.001 is far below gensim's default initial learning
# rate of 0.025 — confirm this is intentional.
train_model = models.Doc2Vec(dm=1, vector_size=100, window=10, negative=5, hs=0, min_count=2, sample=0,
                             epochs=20, workers=cores, alpha=0.001)
# Build the vocabulary from the training documents only, then train for the
# configured number of epochs over that same corpus.
train_model.build_vocab(train_set)
train_model.train(train_set, total_examples=train_model.corpus_count, epochs=train_model.epochs)

# Persist the trained model to disk and reload it from that file.
train_model.save("amzn_model.doc2vec")
train_model = models.Doc2Vec.load('amzn_model.doc2vec')


In [27]:
def _labels_and_vectors(tagged_docs):
    """Return (labels, vectors) for the given tagged documents.

    The label is each document's first tag; the vector is inferred from the
    document's words with the trained model.
    """
    pairs = [(doc.tags[0], train_model.infer_vector(doc.words)) for doc in tagged_docs]
    return zip(*pairs)


# Training documents are processed before test documents.
train_labels, train_targets = _labels_and_vectors(train_set)
test_labels, test_targets = _labels_and_vectors(test_set)


In [28]:
# Labels and inferred vectors have been extracted, so drop the tagged documents.
del train_set, test_set
gc.collect()


0

### Evaluates the Doc2Vec Model

In [29]:
# Learn the label -> integer mapping from the training labels only and reuse
# that same mapping for the test labels. Re-fitting the encoder on the test
# labels could silently yield a different mapping if the test split did not
# contain exactly the same classes; with a single fit, an unseen test label
# raises an error instead.
le = preprocessing.LabelEncoder()
train_labels_binary = le.fit_transform(train_labels)
test_labels_binary = le.transform(test_labels)

In [30]:
# Fit a logistic-regression classifier (default settings) on the inferred
# training vectors, then get per-class probabilities for the test vectors.
clf = LogisticRegression()
# NOTE(review): this fits on train_labels, not train_labels_binary — confirm intended.
mdl = clf.fit(train_targets, train_labels)
y_score = mdl.predict_proba(test_targets)


In [31]:
# Precision-recall curve from the encoded test labels and column 1 of y_score.
precision_curve, recall_curve, pr_thresholds = precision_recall_curve(test_labels_binary, y_score[:, 1])
# Area under that curve, with recall on the x-axis and precision on the y-axis.
auc_val = auc(recall_curve, precision_curve)
# '{0:1f}' is a minimum width of 1 with the default six decimal places.
print('AUC-PR: {0:1f}'.format(auc_val))


AUC-PR: 0.880742
