# Amazon Book Review Sentiment Analysis

In this project, I performed the sentiment analysis on the Amazon book review dataset using machinine learning models. 

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import json
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import svm
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

### Step 1 Pre-process the Amazon book review dataset

In [2]:
# Check how many records in the dataset
count = 0
with open ("data/Books_5.json") as Train_json:
    for i in Train_json:
        count += 1
print ("Total number of reviews is {}".format(count))

Total number of reviews is 8898041


In [3]:
## Check the first example in the dataset
with open ("data/Books_5.json") as Train_json:
    first_review = next(Train_json)
    first_review = json.loads(first_review)
    print (first_review)
    print (type(first_review))

{'reviewerID': 'A10000012B7CGYKOMPQ4L', 'asin': '000100039X', 'reviewerName': 'Adam', 'helpful': [0, 0], 'reviewText': 'Spiritually and mentally inspiring! A book that allows you to question your morals and will help you discover who you really are!', 'overall': 5.0, 'summary': 'Wonderful!', 'unixReviewTime': 1355616000, 'reviewTime': '12 16, 2012'}
<class 'dict'>


In [4]:
## Store the usefull information from this huge dataset
dataset = {}
dataset["review_text"] = []
dataset['rating'] = []
count = 0
with open ('data/Books_5.json') as Train_json:
    for i in Train_json:
        count+=1
        if count % 20 == 0:
            item = json.loads(i)
            dataset["review_text"].append(item["reviewText"])
            dataset["rating"].append(item["overall"])

In [5]:
## Convert Dataset to dataframe
dataset_df = pd.DataFrame(dataset)
print('There are {} records in the dataset.'.format(len(dataset_df)))
dataset_df.head()

There are 444902 records in the dataset.


Unnamed: 0,review_text,rating
0,"This book is everything that is simple, delica...",5.0
1,When I first started writing poetry at age 12 ...,5.0
2,"Khalil Gibran's book, The Prophet, has the pow...",5.0
3,I was given this book by a writer friend who c...,5.0
4,A book to be treasured. A tremendous poet deal...,5.0


In [6]:
dataset_df_5 = dataset_df[dataset_df["rating"]==5.0]
dataset_df_4 = dataset_df[dataset_df["rating"]==4.0]
dataset_df_3 = dataset_df[dataset_df["rating"]==3.0]
dataset_df_2 = dataset_df[dataset_df["rating"]==2.0]
dataset_df_1 = dataset_df[dataset_df["rating"]==1.0]

In [7]:
dataset_df_1_train = dataset_df_1.iloc[0:5000,]
dataset_df_1_test = dataset_df_1.iloc[5000:6000,]
dataset_df_2_train = dataset_df_2.iloc[0:5000,]
dataset_df_2_test = dataset_df_2.iloc[5000:6000,]
dataset_df_3_train = dataset_df_3.iloc[0:5000,]
dataset_df_3_test = dataset_df_3.iloc[5000:6000,]
dataset_df_4_train = dataset_df_4.iloc[0:5000,]
dataset_df_4_test = dataset_df_4.iloc[5000:6000,]
dataset_df_5_train = dataset_df_5.iloc[0:5000,]
dataset_df_5_test = dataset_df_5.iloc[5000:6000,]

In [8]:
frames_train = [dataset_df_5_train, dataset_df_4_train, dataset_df_3_train,dataset_df_2_train,dataset_df_1_train]
frames_test = [dataset_df_5_test, dataset_df_4_test, dataset_df_3_test,dataset_df_2_test,dataset_df_1_test]

dataset_df_train = pd.concat(frames_train,ignore_index = True)
dataset_df_test = pd.concat(frames_test, ignore_index = True)

In [9]:
## Clean the reviews
stops_eng = set(stopwords.words("english"))
stemmer = nltk.wordnet.WordNetLemmatizer()
def data_cleaning (raw_review_str):
    ## remove potential HTML tag
    review_text = BeautifulSoup(raw_review_str).get_text()
    ## remove non-letter 
    review_letter = re.sub("[^a-zA-Z]"," ",review_text)
    ## convert to lower case and split the list
    review_lower_list = review_letter.lower().split()
    ## remove stop words and stemming
    review_nostopwords_stemmed = [stemmer.lemmatize(w) for w in review_lower_list if w not in stops_eng]
    ## return 
    return " ".join(review_nostopwords_stemmed)

In [10]:
dataset_df_train["review_cleaned"] = dataset_df_train["review_text"].apply(data_cleaning)
dataset_df_test["review_cleaned"] = dataset_df_test["review_text"].apply(data_cleaning)
#Save the train and test datasets as csv files
dataset_df_train.to_csv(path_or_buf="data/Amazon_book_review_train.csv")
dataset_df_test.to_csv(path_or_buf="data/Amazon_book_review_test.csv")

In [18]:
## Generate Features (bag of words)
vectorizer = TfidfVectorizer(analyzer = "word", ngram_range=(1, 2), tokenizer = None, preprocessor = None, stop_words = None,max_features=2000)
train_data_features = vectorizer.fit_transform(dataset_df_train["review_cleaned"])
test_data_features = vectorizer.transform(dataset_df_test["review_cleaned"]) 
print('The shape of training sample is {}.'.format(train_data_features.shape))
print('The shape of test sample is {}.'.format(test_data_features.shape))

The shape of training sample is (25000, 2000).
The shape of test sample is (5000, 2000).


In [19]:
## Convert to sparse matrix
train_data_features = train_data_features.toarray()
test_data_features = test_data_features.toarray()

### Step 2. Building Machine Learning Models 

#### 1. LDA Model

In [20]:
clf = LinearDiscriminantAnalysis()
clf.fit(train_data_features, dataset_df_train["rating"])

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [21]:
prediction_lda = clf.predict(test_data_features)

In [22]:
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_lda).sum()/len(dataset_df_test)))

prediction accuracy is 0.452000


#### 2. QDA Model

In [23]:
qda = QuadraticDiscriminantAnalysis()
qda.fit(train_data_features, dataset_df_train["rating"])



QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariance=False, store_covariances=None, tol=0.0001)

In [24]:
prediction_qda = qda.predict(test_data_features)

In [25]:
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_qda).sum()/len(dataset_df_test)))

prediction accuracy is 0.388000


#### 3. Logistic Regression

In [26]:
logreg = LogisticRegression(C=1e5,multi_class = 'auto',solver = 'lbfgs')
logreg.fit(train_data_features, dataset_df_train["rating"])



LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='auto', n_jobs=None, penalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [27]:
prediction_logreg = logreg.predict(test_data_features)

In [28]:
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_logreg).sum()/len(dataset_df_test)))

prediction accuracy is 0.439800


#### 4. Random Forest

In [29]:
forest = RandomForestClassifier(n_estimators = 200) 
forest = forest.fit(train_data_features, dataset_df_train["rating"])

In [30]:
# Use the random forest to make sentiment label predictions
prediction_rm = forest.predict(test_data_features)

In [31]:
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_rm).sum()/len(dataset_df_test)))

prediction accuracy is 0.445000


#### 5. Navie Bayes

In [32]:
gnb = GaussianNB()
prediction_gnb = gnb.fit(train_data_features, dataset_df_train["rating"]).predict(test_data_features)

In [33]:
print("Number of mislabeled points out of a total %d points : %d" % (len(dataset_df_test),(dataset_df_test['rating'] != prediction_gnb).sum()))
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_gnb).sum()/len(dataset_df_test)))

Number of mislabeled points out of a total 5000 points : 2990
prediction accuracy is 0.402000


In [34]:
mnb=MultinomialNB()
prediction_mnb = mnb.fit(train_data_features, dataset_df_train["rating"]).predict(test_data_features)

In [35]:
print("Number of mislabeled points out of a total %d points : %d" % (len(dataset_df_test),(dataset_df_test['rating'] != prediction_mnb).sum()))
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_mnb).sum()/len(dataset_df_test)))

Number of mislabeled points out of a total 5000 points : 2654
prediction accuracy is 0.469200


In [36]:
bnb = BernoulliNB()
prediction_bnb = bnb.fit(train_data_features, dataset_df_train["rating"]).predict(test_data_features)

In [37]:
print("Number of mislabeled points out of a total %d points : %d" % (len(dataset_df_test),(dataset_df_test['rating'] != prediction_bnb).sum()))
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_bnb).sum()/len(dataset_df_test)))

Number of mislabeled points out of a total 5000 points : 3118
prediction accuracy is 0.376400


#### 5. SVM

In [38]:
clf = svm.SVC(decision_function_shape='ovo')
clf.fit(train_data_features, dataset_df_train["rating"])



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovo', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [39]:
prediction_svm = clf.predict(test_data_features)

In [40]:
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_svm).sum()/len(dataset_df_test)))

prediction accuracy is 0.407200


### Step 3 Deep Learning based Vectorization 

In [41]:
## Put all the review of train_set in a list to generate the features
clean_train_reviews = dataset_df_train["review_cleaned"].tolist()
clean_test_reviews = dataset_df_test["review_cleaned"].tolist()
clean_total_reviews = clean_train_reviews + clean_test_reviews

In [42]:
tagged_data_total = [TaggedDocument(words=word_tokenize(_d), tags=[str(i)]) for i, _d in enumerate(clean_total_reviews)]

In [43]:
max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)

'''Note: dm defines the training algorithm. If dm=1 means ‘distributed memory’ (PV-DM) and 
dm =0 means ‘distributed bag of words’ (PV-DBOW). 
Distributed Memory model preserves the word order in a document whereas 
Distributed Bag of words just uses the bag of words approach, 
which doesn’t preserve any word order.'''
  
model.build_vocab(tagged_data_total)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data_total,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("data/d2v.model")
print("Model Saved")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

In [44]:
#model= Doc2Vec.load("d2v.model")
# to find most similar doc using tags
similar_doc = model.docvecs.most_similar('1')
print(similar_doc)

[('1860', 0.839777410030365), ('25027', 0.825043261051178), ('6523', 0.8079653978347778), ('27059', 0.8006597757339478), ('552', 0.7985435724258423), ('13618', 0.7852596044540405), ('25130', 0.7847905158996582), ('25072', 0.7800161838531494), ('1304', 0.7787582874298096), ('5537', 0.7751613855361938)]


In [45]:
# to find vector of doc in training data using tags or in other words, printing the vector of document at index 1 in training data
print(model.docvecs['0'])
print(type(model.docvecs['0']))
print(model.docvecs['0'].shape)

[ -1.1123402    3.2690628   -0.08276898   2.590324     3.652088
  -3.1616101   -0.8301031  -14.866494     2.1729636   -9.439779
  -2.44938      3.7817602   -2.142102    -2.9509847    6.4941072
  -0.49279016  -6.75984     -0.12138461  -1.5224811    1.3682069 ]
<class 'numpy.ndarray'>
(20,)


In [46]:
clean_train_reviews_vec = []
for sen in clean_train_reviews:
    train_data = word_tokenize(sen)
    clean_train_reviews_vec.append(model.infer_vector(train_data))

In [47]:
print(len(clean_train_reviews_vec))
print(clean_train_reviews_vec[0])

25000
[-0.07696462 -0.0739723   0.07602923 -0.12349915 -0.05102457  0.07629477
  0.03056202 -0.59252876  0.10122231 -0.24336916 -0.38375524  0.18210688
  0.29690853 -0.03024114  0.02419298  0.19032781  0.11945209 -0.00609656
  0.19144987 -0.24580282]


In [48]:
clean_test_reviews_vec = []
for sen in clean_test_reviews:
    test_data = word_tokenize(sen)
    clean_test_reviews_vec.append(model.infer_vector(test_data))

In [49]:
print(len(clean_test_reviews_vec))
print(clean_test_reviews_vec[0])

5000
[-0.07914875 -0.06068993  0.14605258 -0.13449569 -0.03347416 -0.01190743
  0.00775984 -0.44641715  0.07136312 -0.16189079 -0.26087117  0.10519265
  0.2719514  -0.02194577 -0.01086485  0.21809791  0.12636918 -0.07747196
  0.13089037 -0.34770945]


In [50]:
clf = LinearDiscriminantAnalysis()
clf.fit(np.array(clean_train_reviews_vec), dataset_df_train["rating"])

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [51]:
prediction_lda = clf.predict(clean_test_reviews_vec)

In [52]:
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_lda).sum()/len(dataset_df_test)))

prediction accuracy is 0.335600
