In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
import json
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm

In [2]:
directory = os.getcwd()
filename = 'Books_5.json'
datafile = os.path.join(directory, filename)

In [3]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/xinmu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
## Store the usefull information from this huge dataset
dataset = {}
dataset["review_text"] = []
dataset['rating'] = []
count = 0
with open (datafile) as Train_json:
    for i in Train_json:
        count+=1
        if count % 20 == 0:
            item = json.loads(i)
            dataset["review_text"].append(item["reviewText"])
            dataset["rating"].append(item["overall"])

In [5]:
## Convert Dataset to dataframe
dataset_df = pd.DataFrame(dataset)
print(len(dataset_df))
dataset_df.head()

444902


Unnamed: 0,rating,review_text
0,5.0,"This book is everything that is simple, delica..."
1,5.0,When I first started writing poetry at age 12 ...
2,5.0,"Khalil Gibran's book, The Prophet, has the pow..."
3,5.0,I was given this book by a writer friend who c...
4,5.0,A book to be treasured. A tremendous poet deal...


In [6]:
dataset_df_5 = dataset_df[dataset_df["rating"]==5.0]
dataset_df_4 = dataset_df[dataset_df["rating"]==4.0]
dataset_df_3 = dataset_df[dataset_df["rating"]==3.0]
dataset_df_2 = dataset_df[dataset_df["rating"]==2.0]
dataset_df_1 = dataset_df[dataset_df["rating"]==1.0]
dataset_df_0 = dataset_df[dataset_df["rating"]==0.0]

In [7]:
print (len(dataset_df_0))
print (len(dataset_df_1))
print (len(dataset_df_2))
print (len(dataset_df_3))
print (len(dataset_df_4))
print (len(dataset_df_5))

0
16183
20931
47619
111070
249099


In [8]:
dataset_df_1.iloc[0,1]

'This book was a philosophical touchstone for insecure Boomers of the mid-1970s. Everybody I knew in those days had to be able to discuss it intelligently or risk being thought to have a gap in our characters as people. A folk/ talking blues singer of that era (I forget who) referred to this book in a lyric:"...a copy of Kahlil Gibran\'s \'The Prophet\' with all the significant passages highlighted--the whole damn BOOK was highlited..."In truth, the whole phenomenon was symptomatic of the societal immaturity of my generation as young adults. It taught us all sorts of theoretical concepts of human nature that were not necessarily reflective of the real world--consideration, the dignity of each person, peace, love, repudiation of prejudice--all of this in a world that anything but reflected such beliefs in Gibran\'s day. And except for the hippy-dippy pseudo-enlightenment we tried to cram down the world\'s throat in our day, our peculiar era was no better. We just used Gibran and other s

In [9]:
dataset_df_1_train = dataset_df_1.iloc[0:5000]
dataset_df_1_test = dataset_df_1.iloc[5000:6000]
dataset_df_2_train = dataset_df_2.iloc[0:5000]
dataset_df_2_test = dataset_df_2.iloc[5000:6000]
dataset_df_3_train = dataset_df_3.iloc[0:5000]
dataset_df_3_test = dataset_df_3.iloc[5000:6000]
dataset_df_4_train = dataset_df_4.iloc[0:5000]
dataset_df_4_test = dataset_df_4.iloc[5000:6000]
dataset_df_5_train = dataset_df_5.iloc[0:5000]
dataset_df_5_test = dataset_df_5.iloc[5000:6000]

In [10]:
dataset_df_1_train.head()

Unnamed: 0,rating,review_text
7,1.0,This book was a philosophical touchstone for i...
28,1.0,I really wish publishers would rate books like...
50,1.0,"like many others, i was disappointed by the di..."
89,1.0,I've decided I just can't finish this book. Th...
99,1.0,"Some parts are entertaining enough, but left m..."


In [11]:
frames_train = [dataset_df_5_train, dataset_df_4_train, dataset_df_3_train,dataset_df_2_train,dataset_df_1_train]
frames_test = [dataset_df_5_test, dataset_df_4_test, dataset_df_3_test,dataset_df_2_test,dataset_df_1_test]

dataset_df_train = pd.concat(frames_train)
dataset_df_test = pd.concat(frames_test)

In [12]:
type(dataset_df_train)

pandas.core.frame.DataFrame

In [14]:
## Clean the reviews
def data_cleaning (raw_review_str):
    ## remove potential HTML tag
    review_text = BeautifulSoup(raw_review_str).get_text()
    ## remove non-letter 
    review_letter = re.sub("[^a-zA-Z]"," ",review_text)
    ## convert to lower case and split the list
    review_lower_list = review_letter.lower().split()
    ## remove stop words and stemming
    stops_eng = set(stopwords.words("english"))
    stemmer = nltk.wordnet.WordNetLemmatizer()
    #review_nostopwords_stemmed = [stemmer.lemmatize(w) for w in review_lower_list]
    review_nostopwords_stemmed = [stemmer.lemmatize(w) for w in review_lower_list if w not in stops_eng]
    ## return 
    return " ".join(review_lower_list)

In [15]:
dataset_df_train["review_cleaned"] = dataset_df_train["review_text"].apply(data_cleaning)
dataset_df_test["review_cleaned"] = dataset_df_test["review_text"].apply(data_cleaning)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html.parser")

  markup_type=markup_type))


In [16]:
## Put all the review of train_set in a list to generate the features
clean_train_reviews = []
for i in dataset_df_train["review_cleaned"]:
    clean_train_reviews.append(i)

clean_test_reviews = []
for i in dataset_df_test["review_cleaned"]:
    clean_test_reviews.append(i) 

In [17]:
## Generate Features (bag of words)
vectorizer = TfidfVectorizer(analyzer = "word", ngram_range=(1, 2), tokenizer = None, preprocessor = None, stop_words = None,max_features=2000)
train_data_features = vectorizer.fit_transform(clean_train_reviews)
test_data_features = vectorizer.transform(clean_test_reviews)

In [18]:
train_data_features = train_data_features.toarray()
test_data_features = test_data_features.toarray()

In [20]:
clf = svm.SVC(decision_function_shape='ovo')
clf.fit(train_data_features, dataset_df_train["rating"])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
prediction_svm = clf.predict(test_data_features)

In [22]:
print(type(prediction_svm))
print(type(dataset_df_test['rating']))

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


In [23]:
prediction_svm.shape

(5000,)

In [24]:
dataset_df_test['rating'].shape

(5000,)

In [25]:
print("prediction accuracy is %f" % ((dataset_df_test['rating'] == prediction_svm).sum()/len(dataset_df_test)))

prediction accuracy is 0.387800
