# Amazon Book Review Sentiment Analysis

In this project, I performed the sentiment analysis on the Amazon book review dataset using text mining machinine learning models. 

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import json
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import svm
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

ModuleNotFoundError: No module named 'gensim'

### Step 1 Pre-process the Amazon book review dataset

In [3]:
# Check how many records in the dataset
count = 0
with open ("data/Books_5.json") as Train_json:
    for i in Train_json:
        count += 1
print ("Total number of reviews is {}".format(count))

Total number of reviews is 8898041


In [6]:
## Check the first example in the dataset
with open ("data/Books_5.json") as Train_json:
    first_review = next(Train_json)
    first_review = json.loads(first_review)
    print (first_review)
    print (type(first_review))
#The json record is converted to a dict

{'reviewerID': 'A10000012B7CGYKOMPQ4L', 'asin': '000100039X', 'reviewerName': 'Adam', 'helpful': [0, 0], 'reviewText': 'Spiritually and mentally inspiring! A book that allows you to question your morals and will help you discover who you really are!', 'overall': 5.0, 'summary': 'Wonderful!', 'unixReviewTime': 1355616000, 'reviewTime': '12 16, 2012'}
<class 'dict'>


In [3]:
## Store the usefull information from this huge dataset
dataset = {}
dataset["review_text"] = []
dataset['rating'] = []
count = 0
with open ('data/Books_5.json') as Train_json:
    for i in Train_json:
        count+=1
        if count % 20 == 0:
            item = json.loads(i)
            dataset["review_text"].append(item["reviewText"])
            dataset["rating"].append(item["overall"])

In [4]:
## Convert Dataset to dataframe
dataset_df = pd.DataFrame(dataset)
print(len(dataset_df))
dataset_df.head()

444902


Unnamed: 0,review_text,rating
0,"This book is everything that is simple, delica...",5.0
1,When I first started writing poetry at age 12 ...,5.0
2,"Khalil Gibran's book, The Prophet, has the pow...",5.0
3,I was given this book by a writer friend who c...,5.0
4,A book to be treasured. A tremendous poet deal...,5.0


In [5]:
dataset_df_5 = dataset_df[dataset_df["rating"]==5.0]
dataset_df_4 = dataset_df[dataset_df["rating"]==4.0]
dataset_df_3 = dataset_df[dataset_df["rating"]==3.0]
dataset_df_2 = dataset_df[dataset_df["rating"]==2.0]
dataset_df_1 = dataset_df[dataset_df["rating"]==1.0]

In [6]:
dataset_df_1_train = dataset_df_1.iloc[0:5000,]
dataset_df_1_test = dataset_df_1.iloc[5000:6000,]
dataset_df_2_train = dataset_df_2.iloc[0:5000,]
dataset_df_2_test = dataset_df_2.iloc[5000:6000,]
dataset_df_3_train = dataset_df_3.iloc[0:5000,]
dataset_df_3_test = dataset_df_3.iloc[5000:6000,]
dataset_df_4_train = dataset_df_4.iloc[0:5000,]
dataset_df_4_test = dataset_df_4.iloc[5000:6000,]
dataset_df_5_train = dataset_df_5.iloc[0:5000,]
dataset_df_5_test = dataset_df_5.iloc[5000:6000,]

In [7]:
frames_train = [dataset_df_5_train, dataset_df_4_train, dataset_df_3_train,dataset_df_2_train,dataset_df_1_train]
frames_test = [dataset_df_5_test, dataset_df_4_test, dataset_df_3_test,dataset_df_2_test,dataset_df_1_test]

dataset_df_train = pd.concat(frames_train,ignore_index = True)
dataset_df_test = pd.concat(frames_test, ignore_index = True)

In [8]:
## Clean the reviews
stops_eng = set(stopwords.words("english"))
stemmer = nltk.wordnet.WordNetLemmatizer()
def data_cleaning (raw_review_str):
    ## remove potential HTML tag
    review_text = BeautifulSoup(raw_review_str).get_text()
    ## remove non-letter 
    review_letter = re.sub("[^a-zA-Z]"," ",review_text)
    ## convert to lower case and split the list
    review_lower_list = review_letter.lower().split()
    ## remove stop words and stemming
    review_nostopwords_stemmed = [stemmer.lemmatize(w) for w in review_lower_list if w not in stops_eng]
    ## return 
    return " ".join(review_nostopwords_stemmed)

In [9]:
dataset_df_train["review_cleaned"] = dataset_df_train["review_text"].apply(data_cleaning)
dataset_df_test["review_cleaned"] = dataset_df_test["review_text"].apply(data_cleaning)
#Save the train and test datasets as csv files
dataset_df_train.to_csv(path_or_buf="data/Amazon_book_review_train.csv")
dataset_df_test.to_csv(path_or_buf="data/Amazon_book_review_test.csv")

In [10]:
## Generate Features (bag of words)
vectorizer = TfidfVectorizer(analyzer = "word", ngram_range=(1, 2), tokenizer = None, preprocessor = None, stop_words = None,max_features=2000)
train_data_features = vectorizer.fit_transform(dataset_df_train["review_cleaned"])
test_data_features = vectorizer.transform(dataset_df_test["review_cleaned"]) 

In [53]:
train_data_features.shape

(25000, 2000)

In [54]:
test_data_features.shape

(5000, 2000)

In [11]:
## Convert to sparse matrix
train_data_features = train_data_features.toarray()
test_data_features = test_data_features.toarray()

### Step 2. Building Machine Learning Models 

#### 1. LDA Model

In [59]:
clf = LinearDiscriminantAnalysis()
clf.fit(train_data_features, dataset_df_train["rating"])

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [60]:
prediction_lda = clf.predict(test_data_features)

In [61]:
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_lda).sum()/len(dataset_df_test)))

prediction accuracy is 0.452000


#### 2. QDA Model

In [63]:
qda = QuadraticDiscriminantAnalysis()
qda.fit(train_data_features, dataset_df_train["rating"])



QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariance=False, store_covariances=None, tol=0.0001)

In [64]:
prediction_qda = qda.predict(test_data_features)

In [65]:
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_qda).sum()/len(dataset_df_test)))

prediction accuracy is 0.388000


#### 3. Logistic Regression

In [66]:
logreg = LogisticRegression(C=1e5,multi_class = 'auto',solver = 'lbfgs')
logreg.fit(train_data_features, dataset_df_train["rating"])



LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [67]:
prediction_logreg = logreg.predict(test_data_features)

In [68]:
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_logreg).sum()/len(dataset_df_test)))

prediction accuracy is 0.436600


#### 4. Random Forest

In [70]:
forest = RandomForestClassifier(n_estimators = 200) 
forest = forest.fit(train_data_features, dataset_df_train["rating"])

In [71]:
# Use the random forest to make sentiment label predictions
prediction_rm = forest.predict(test_data_features)

In [72]:
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_rm).sum()/len(dataset_df_test)))

prediction accuracy is 0.444600


#### 5. Navie Bayes

In [76]:
gnb = GaussianNB()
prediction_gnb = gnb.fit(train_data_features, dataset_df_train["rating"]).predict(test_data_features)

In [77]:
print("Number of mislabeled points out of a total %d points : %d" % (len(dataset_df_test),(dataset_df_test['rating'] != prediction_gnb).sum()))
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_gnb).sum()/len(dataset_df_test)))

Number of mislabeled points out of a total 5000 points : 2990
prediction accuracy is 0.402000


In [78]:
mnb=MultinomialNB()
prediction_mnb = mnb.fit(train_data_features, dataset_df_train["rating"]).predict(test_data_features)

In [79]:
print("Number of mislabeled points out of a total %d points : %d" % (len(dataset_df_test),(dataset_df_test['rating'] != prediction_mnb).sum()))
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_mnb).sum()/len(dataset_df_test)))

Number of mislabeled points out of a total 5000 points : 2654
prediction accuracy is 0.469200


In [80]:
bnb = BernoulliNB()
prediction_bnb = bnb.fit(train_data_features, dataset_df_train["rating"]).predict(test_data_features)

In [81]:
print("Number of mislabeled points out of a total %d points : %d" % (len(dataset_df_test),(dataset_df_test['rating'] != prediction_bnb).sum()))
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_bnb).sum()/len(dataset_df_test)))

Number of mislabeled points out of a total 5000 points : 3118
prediction accuracy is 0.376400


#### 5. SVM

In [None]:
clf = svm.SVC(decision_function_shape='ovo')
clf.fit(train_data_features, dataset_df_train["rating"])



In [None]:
prediction_svm = clf.predict(test_data_features)

In [None]:
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_svm).sum()/len(dataset_df_test)))

### Step 3 Deep Learning based Vectorization 

In [12]:
## Put all the review of train_set in a list to generate the features
clean_train_reviews = dataset_df_train["review_cleaned"].tolist()
clean_test_reviews = dataset_df_test["review_cleaned"].tolist()
clean_total_reviews = clean_train_reviews + clean_test_reviews

In [15]:
tagged_data_total = [TaggedDocument(words=word_tokenize(_d), tags=[str(i)]) for i, _d in enumerate(clean_total_reviews)]

In [None]:
max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)

'''Note: dm defines the training algorithm. If dm=1 means ‘distributed memory’ (PV-DM) and 
dm =0 means ‘distributed bag of words’ (PV-DBOW). 
Distributed Memory model preserves the word order in a document whereas 
Distributed Bag of words just uses the bag of words approach, 
which doesn’t preserve any word order.'''
  
model.build_vocab(tagged_data_total)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data_total,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("data/d2v.model")
print("Model Saved")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5


In [None]:
#model= Doc2Vec.load("d2v.model")
# to find most similar doc using tags
similar_doc = model.docvecs.most_similar('1')
print(similar_doc)

In [None]:
# to find vector of doc in training data using tags or in other words, printing the vector of document at index 1 in training data
print(model.docvecs['0'])
print(type(model.docvecs['0']))
print(model.docvecs['0'].shape)

In [None]:
clean_train_reviews_vec = []
for sen in clean_train_reviews:
    train_data = word_tokenize(sen)
    clean_train_reviews_vec.append(model.infer_vector(train_data))

In [None]:
print(len(clean_train_reviews_vec))
print(clean_train_reviews_vec[0])

In [None]:
clean_test_reviews_vec = []
for sen in clean_test_reviews:
    test_data = word_tokenize(sen)
    clean_test_reviews_vec.append(model.infer_vector(test_data))