In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import json
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

In [2]:
## Store the usefull information from this huge dataset
dataset = {}
dataset["review_text"] = []
dataset['rating'] = []
count = 0
with open ('Books_5.json') as Train_json:
    for i in Train_json:
        count+=1
        if count % 20 == 0:
            item = json.loads(i)
            dataset["review_text"].append(item["reviewText"])
            dataset["rating"].append(item["overall"]) 

In [3]:
## Convert Dataset to dataframe
dataset_df = pd.DataFrame(dataset)
print(len(dataset_df))
dataset_df.head()

444902


Unnamed: 0,rating,review_text
0,5.0,"This book is everything that is simple, delica..."
1,5.0,When I first started writing poetry at age 12 ...
2,5.0,"Khalil Gibran's book, The Prophet, has the pow..."
3,5.0,I was given this book by a writer friend who c...
4,5.0,A book to be treasured. A tremendous poet deal...


In [4]:
dataset_df_5 = dataset_df[dataset_df["rating"]==5.0]
dataset_df_4 = dataset_df[dataset_df["rating"]==4.0]
dataset_df_3 = dataset_df[dataset_df["rating"]==3.0]
dataset_df_2 = dataset_df[dataset_df["rating"]==2.0]
dataset_df_1 = dataset_df[dataset_df["rating"]==1.0]

In [5]:
dataset_df_1_train = dataset_df_1.iloc[0:5000,]
dataset_df_1_test = dataset_df_1.iloc[5000:6000,]
dataset_df_2_train = dataset_df_2.iloc[0:5000,]
dataset_df_2_test = dataset_df_2.iloc[5000:6000,]
dataset_df_3_train = dataset_df_3.iloc[0:5000,]
dataset_df_3_test = dataset_df_3.iloc[5000:6000,]
dataset_df_4_train = dataset_df_4.iloc[0:5000,]
dataset_df_4_test = dataset_df_4.iloc[5000:6000,]
dataset_df_5_train = dataset_df_5.iloc[0:5000,]
dataset_df_5_test = dataset_df_5.iloc[5000:6000,]

In [6]:
frames_train = [dataset_df_5_train, dataset_df_4_train, dataset_df_3_train,dataset_df_2_train,dataset_df_1_train]
frames_test = [dataset_df_5_test, dataset_df_4_test, dataset_df_3_test,dataset_df_2_test,dataset_df_1_test]

dataset_df_train = pd.concat(frames_train)
dataset_df_test = pd.concat(frames_test)

In [7]:
## Clean the reviews
def data_cleaning (raw_review_str):
    ## remove potential HTML tag
    review_text = BeautifulSoup(raw_review_str).get_text()
    ## remove non-letter 
    review_letter = re.sub("[^a-zA-Z]"," ",review_text)
    ## convert to lower case and split the list
    review_lower_list = review_letter.lower().split()
    ## remove stop words and stemming
    stops_eng = set(stopwords.words("english"))
    stemmer = nltk.wordnet.WordNetLemmatizer()
    review_nostopwords_stemmed = [stemmer.lemmatize(w) for w in review_lower_list]
    review_nostopwords_stemmed = [stemmer.lemmatize(w) for w in review_lower_list if w not in stops_eng]
    ## return 
    return " ".join(review_nostopwords_stemmed)

In [8]:
dataset_df_train["review_cleaned"] = dataset_df_train["review_text"].apply(data_cleaning)
dataset_df_test["review_cleaned"] = dataset_df_test["review_text"].apply(data_cleaning)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html.parser")

  markup_type=markup_type))


In [9]:
## Put all the review of train_set in a list to generate the features
clean_train_reviews = []
for i in dataset_df_train["review_cleaned"]:
    clean_train_reviews.append(i)

clean_test_reviews = []
for i in dataset_df_test["review_cleaned"]:
    clean_test_reviews.append(i) 

In [26]:
## Generate Features (bag of words)
vectorizer = TfidfVectorizer(analyzer = "word", ngram_range=(1, 2), tokenizer = None, preprocessor = None, stop_words = None,max_features=20000)
train_data_features = vectorizer.fit_transform(clean_train_reviews)
test_data_features = vectorizer.transform(clean_test_reviews)

In [27]:
vocab = vectorizer.get_feature_names()
#print (vocab)
print(len(vocab))

20000


In [28]:
train_data_features.shape

(25000, 20000)

In [29]:
test_data_features.shape

(5000, 20000)

In [30]:
train_data_features = train_data_features.toarray()

In [31]:
test_data_features = test_data_features.toarray()

In [32]:
gnb = GaussianNB()
y_pred_gnb = gnb.fit(train_data_features, dataset_df_train["rating"]).predict(test_data_features)

In [33]:
print("Number of mislabeled points out of a total %d points : %d" % (len(dataset_df_test),(dataset_df_test['rating'] != y_pred_gnb).sum()))
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == y_pred_gnb).sum()/len(dataset_df_test)))

Number of mislabeled points out of a total 5000 points : 3278
prediction accuracy is 0.344400


In [34]:
mnb=MultinomialNB()
y_pred_mnb = mnb.fit(train_data_features, dataset_df_train["rating"]).predict(test_data_features)

In [35]:
print("Number of mislabeled points out of a total %d points : %d" % (len(dataset_df_test),(dataset_df_test['rating'] != y_pred_mnb).sum()))
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == y_pred_mnb).sum()/len(dataset_df_test)))

Number of mislabeled points out of a total 5000 points : 2633
prediction accuracy is 0.473400


In [36]:
bnb = BernoulliNB()
y_pred_bnb = bnb.fit(train_data_features, dataset_df_train["rating"]).predict(test_data_features)

In [37]:
print("Number of mislabeled points out of a total %d points : %d" % (len(dataset_df_test),(dataset_df_test['rating'] != y_pred_bnb).sum()))
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == y_pred_bnb).sum()/len(dataset_df_test)))

Number of mislabeled points out of a total 5000 points : 2948
prediction accuracy is 0.410400
