# Load Books_small.json to a list of reviews

In [1]:
# For consistency
import random

class Sentiment:
    POSITIVE = 'POSITIVE'
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'




class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            # Score = 4 or 5
            return Sentiment.POSITIVE
class Review_distribute:
    def __init__ (self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]

    # Distribute negative and positive reviews equally
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews ))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews ))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
        print('negative = ',len(negative))
        print('positive_shrunk = ',len(positive_shrunk))
        

# Load data

In [2]:
import json

file_name = 'Books_small_10000.json'

reviews=[]

with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        #print(review['reviewText'])
        #print(review['overall'])
        reviews.append(Review(review['reviewText'],review['overall']))

print(reviews[23].text)
print(reviews[23].score)
print(reviews[23].sentiment)

        

I enjoyed the backstory of Tiger Lily and her tribe. I especially loved the parts of the story that referred to the Peter Pan story itself; I could picture Hook, Smee, Wendy, and the other characters. Also, I really enjoyed the narration by Tinkerbell. It didn't seem in Peter Pan that Tink was so observant, but she's my favorite Disney character so I enjoyed getting the background on Tink and her faerie family's life as well. It took me longer than usual to get through this story, as I wasn't riveted by the story itself, but it was very well written and the story flowed well and had spectacular imagery that related to the Peter Pan characters. I recommend Tiger Lily if you're at all a big fan of Peter & Neverland; you'll really appreciate the detail that went into the other characters of Tink, Tiger Lily, the Indians, and the Pirates. Smee and Hook's backgrounds were especially interesting.
4.0
POSITIVE


# Prepare data

In [3]:
len(reviews)

10000

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(reviews, test_size = 0.33, random_state = 5)

train_distribute = Review_distribute(train_data)
test_distribute = Review_distribute(test_data)

train_distribute.evenly_distribute()
test_distribute.evenly_distribute()

#print(len(train_distribute.reviews))
print(len(train_data))
print(len(test_data))

negative =  432
positive_shrunk =  432
negative =  212
positive_shrunk =  212
6700
3300


In [5]:
print(train_data[20].text)
print(train_data[20].score)
print(train_data[20].sentiment)

I've read other books (the Night Stalker series) by this author, and really enjoyed them. But this book is very, very special. Don't expect non-stop action. It isn't slow, but it is leisurely - it takes place over a year, and the pace of the book reflects that. It allows you to get to know the characters, realize that they are worth knowing and caring about, flaws and all, and root for them to grow out of insecurities and into a full, rich life. I loved it.
5.0
POSITIVE


In [6]:
x_train = train_distribute.get_text()
y_train = train_distribute.get_sentiment()

x_test = test_distribute.get_text()
y_test = test_distribute.get_sentiment()

print(len(x_train))
print(len(y_train))
print(x_train[20])
print(y_train[20])
print('---------------')
print(len(x_test))
print(x_test[150])
print(y_test[150])


864
864
I finished it, but that is the best that I can say.The whole book is a thinly veiled attempt to push a right wing, sexist, divisive, god-fearing and gun-toting agenda. the foreword by Newt Gingrich had my alarm bells ringing, but I thought, &#34;hey, give it a chance&#34;.Grammar and spelling mistakes abound, so I won't go into deep detail, but &#34;should of/would of&#34; don't mean anything. Either use &#34;should have/would have&#34; or &#34;should've/would've&#34; are also acceptable.Anyway, I do like the premise for the book even though i'm not a prepper, but find post apocalyptic style fiction entertaining, if done well. This was not done well at all and was chokingly cliche.I wanted to like it, but found myself unable. Sorry.
NEGATIVE
---------------
424
This book resonated with me because my mother and I have always lived close to each other, but our relationship isn't close. Since my dad died, my husband and I have become my mother's caretakers. After reading Hafner's 

# Bag of words vectorization

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# CountVectorizer treats words equally
# https://monkeylearn.com/blog/what-is-tf-idf/

vectorizer = TfidfVectorizer()
x_train_vectors = vectorizer.fit_transform(x_train)

x_test_vectors = vectorizer.transform(x_test)

print(x_train[20])
print(x_train_vectors[20].toarray())
print(x_train_vectors[20])


I finished it, but that is the best that I can say.The whole book is a thinly veiled attempt to push a right wing, sexist, divisive, god-fearing and gun-toting agenda. the foreword by Newt Gingrich had my alarm bells ringing, but I thought, &#34;hey, give it a chance&#34;.Grammar and spelling mistakes abound, so I won't go into deep detail, but &#34;should of/would of&#34; don't mean anything. Either use &#34;should have/would have&#34; or &#34;should've/would've&#34; are also acceptable.Anyway, I do like the premise for the book even though i'm not a prepper, but find post apocalyptic style fiction entertaining, if done well. This was not done well at all and was chokingly cliche.I wanted to like it, but found myself unable. Sorry.
[[0. 0. 0. ... 0. 0. 0.]]
  (0, 7357)	0.07814313046631408
  (0, 8256)	0.10341288423157695
  (0, 5280)	0.07198694403921489
  (0, 1552)	0.11336187510294804
  (0, 1477)	0.1256848208106615
  (0, 2444)	0.16016900613868465
  (0, 520)	0.11336187510294804
  (0, 602

# Classification

### Linear SVM

In [8]:
from sklearn import svm

clf_svm = svm.SVC(kernel = 'linear')

clf_svm.fit(x_train_vectors, y_train)

# Predict demo
print(x_test[50])
print(y_test[50])
#print(x_test[50])
#print(x_test_vectors[0])

clf_svm.predict(x_test_vectors[50])

read this for book club (sadly, I recommended it from a friend who recommended it).  BORING....not my cup of tea in the least....
NEGATIVE


array(['NEGATIVE'], dtype='<U8')

# Decision Tree

In [9]:

from sklearn.tree import DecisionTreeClassifier

dec_tree = DecisionTreeClassifier()

dec_tree.fit(x_train_vectors, y_train)

# Predict demo
# Wrong
print(x_test[120])
print(y_test[120])


dec_tree.predict(x_test_vectors[120]) 

Oh how I wanted to absolutely love you..... I am a major mc and marine fan. Anything that includes one of not both is sure to be a winner. Although, this didn't quite make it to spectacular. The instant love between the two characters is just such a major turn off. The number one cardinal rule for me is do not under any circumstance make characters immediately fall head over heels in a romance. This makes everything in the story to follow cheap and unnatural leading readers to question motive and validity. Everything about Wreck You oozed with potential, but with such a drastic faux pas in the first chapter I was ruined for the remainder.
NEGATIVE


array(['NEGATIVE'], dtype='<U8')

# Random Forest Classifier

In [10]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier()

random_forest.fit(x_train_vectors, y_train)

# Predict demo
# Wrong
print(x_test[120])
print(y_test[120])


random_forest.predict(x_test_vectors[120]) 



Oh how I wanted to absolutely love you..... I am a major mc and marine fan. Anything that includes one of not both is sure to be a winner. Although, this didn't quite make it to spectacular. The instant love between the two characters is just such a major turn off. The number one cardinal rule for me is do not under any circumstance make characters immediately fall head over heels in a romance. This makes everything in the story to follow cheap and unnatural leading readers to question motive and validity. Everything about Wreck You oozed with potential, but with such a drastic faux pas in the first chapter I was ruined for the remainder.
NEGATIVE


array(['POSITIVE'], dtype='<U8')

# Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

logistic_re = LogisticRegression()

logistic_re.fit(x_train_vectors, y_train)

# Predict demo

print(x_test[280])
print(y_test[280])


logistic_re.predict(x_test_vectors[280]) 



If you like James Patterson's books, I say read this one, especially If you have read NYPD.  Personally, I read a lot of his books.  This is one of my favorite series at this time.  Love the characters and just when you think you have figured out  the plot, then it will turn in different direction for big surprise.  Enjoy!!!
NEGATIVE


array(['POSITIVE'], dtype='<U8')

# Evaluation

In [12]:
# Mean Accuracy
models = [logistic_re,  dec_tree , clf_svm, random_forest]

for model in models:
    scr = model.score(x_test_vectors, y_test)
    print(str(model), scr, sep='           ')

LogisticRegression()           0.8655660377358491
DecisionTreeClassifier()           0.6910377358490566
SVC(kernel='linear')           0.8655660377358491
RandomForestClassifier()           0.8537735849056604


In [13]:
# F1 Score
from sklearn.metrics import f1_score
models = [logistic_re,  dec_tree , clf_svm, random_forest]
for model in models:
    
    f1 = f1_score(y_test, model.predict(x_test_vectors), average = None, 
                 labels = [Sentiment.POSITIVE,Sentiment.NEGATIVE])
    print(str(model))
    print(f1)
   
    
# Conclusion : SVM and Logistic Regression perform best


LogisticRegression()
[0.86524823 0.86588235]
DecisionTreeClassifier()
[0.69030733 0.69176471]
SVC(kernel='linear')
[0.86713287 0.86396181]
RandomForestClassifier()
[0.85514019 0.85238095]


### Conclusion: Models perform like shit with NEGATIVE and NEUTRAL reviews

### Issue only happen to Book_small_1000

In [77]:
# Reason

print(y_train.count(Sentiment.POSITIVE))
print(y_train.count(Sentiment.NEGATIVE))
print(y_train.count(Sentiment.NEUTRAL))
#print(y_test.count(Sentiment.POSITIVE))

432
432
0


### Play around with the model



In [16]:
play_set = ['fucking brilliant', ' grateful', ' remarkable', 'delicious', 'magnificient']

new_play_set = vectorizer.transform(play_set)

print(logistic_re.predict(new_play_set))
print(clf_svm.predict(new_play_set))


['POSITIVE' 'POSITIVE' 'POSITIVE' 'POSITIVE' 'POSITIVE']
['POSITIVE' 'POSITIVE' 'POSITIVE' 'POSITIVE' 'POSITIVE']


## Tuning model

In [26]:
from sklearn.model_selection import GridSearchCV

# Modify svm
parameters = {'kernel' : ('linear','rbf','poly','sigmoid'), 'C':(1,2,4,8,16,32)}

svc = svm.SVC()
svc_tuned = GridSearchCV(svc, parameters, cv = 10)

print(svc_tuned.fit(x_train_vectors, y_train))

GridSearchCV(cv=10, estimator=SVC(),
             param_grid={'C': (1, 2, 4, 8, 16, 32),
                         'kernel': ('linear', 'rbf', 'poly', 'sigmoid')})


In [27]:
svc_tuned.score(x_test_vectors, y_test)

0.8655660377358491

## Saving models


In [28]:
import pickle
with open('review_classifier.pkl','wb') as f:
    pickle.dump(svc_tuned,f)

### Load model

In [29]:
with open('review_classifier.pkl','rb') as f:
    loaded_svc_tuned = pickle.load(f)

In [32]:
print(x_test[239])
loaded_svc_tuned.predict(x_test_vectors[239])


For me, this was a very unhappy, depressing book. While I like to learn about the cultures of other countries, this just didn't do it for me. Three generations of unhappiness is a bit too much.


array(['NEGATIVE'], dtype='<U8')