In [106]:
# Created classes to help keep code readable and minimal

import random

class Sentiment:
    """String labels for the three sentiment classes a review can be given."""
    NEGATIVE = 'NEGATIVE'
    # Misspelled original name, kept as an alias because existing code
    # refers to Sentiment.NEGAVTIVE; both names hold the same label.
    NEGAVTIVE = NEGATIVE
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'
    
class Review:
    """One review: its text, its numeric score, and the label derived from the score."""

    def __init__(self, text, score):
        self.text, self.score = text, score
        # The label is computed once, at construction time, from the score.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a Sentiment label.

        A score of 2 or lower is negative, a score equal to 3 is neutral,
        and any other score is positive.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGAVTIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE
        
class ReviewContainer:
    """Holds a list of reviews and exposes their texts and sentiment labels."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every held review, in order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every held review, in order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self, seed=None):
        """Rebalance ``self.reviews`` in place to equal negative/positive counts.

        Reviews with any other label (e.g. neutral) are dropped. Both
        classes are truncated to the size of the smaller one, then the
        combined list is shuffled.

        Args:
            seed: Optional seed for a reproducible shuffle. When None
                (the default) the module-level ``random.shuffle`` is used.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGAVTIVE]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]

        # Cut BOTH classes to the smaller size. Shrinking only the positives
        # leaves the data unbalanced whenever negatives are the majority.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]

        if seed is None:
            random.shuffle(self.reviews)
        else:
            # A private generator keeps the global random state untouched.
            random.Random(seed).shuffle(self.reviews)
        

In [107]:
import pandas as pd
import json
import sklearn

file_name = 'Books_small_10000.json'

# Each line of the file is parsed as its own JSON object; the review text and
# the overall rating of each object become one Review.
reviews = []
# Decode explicitly as UTF-8 rather than relying on the platform's default
# encoding, which varies between systems.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Spot-check the derived label of one review.
reviews[45].sentiment

'POSITIVE'

Preparing Training/Test Data

In [108]:
from sklearn.model_selection import train_test_split

# Reserve 33% of the reviews for testing; the fixed random_state pins the
# split so reruns produce the same partition.
train, test = train_test_split(reviews, random_state=42, test_size=0.33)

# Wrap each partition in a ReviewContainer for its text/label accessors.
train_container, test_container = (ReviewContainer(part) for part in (train, test))


In [109]:
# Balance the training reviews (this replaces the container's list in place),
# then pull out the parallel text and label lists.
train_container.evenly_distribute()
training_x, training_y = train_container.get_text(), train_container.get_sentiment()

# Same treatment for the test reviews.
test_container.evenly_distribute()
test_x, test_y = test_container.get_text(), test_container.get_sentiment()

# Sanity check: positive count, then negative count, in the training labels.
print(
    training_y.count(Sentiment.POSITIVE),
    training_y.count(Sentiment.NEGAVTIVE),
    sep='\n',
)

436
436


In [122]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer





# Fit the TF-IDF vectorizer on the training text only, then reuse the fitted
# vectorizer (transform, no refit) to encode the test text.
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(training_x)
test_x_vectors = vectorizer.transform(test_x)

# Show the first training review followed by its densified TF-IDF row.
print(training_x[0], train_x_vectors[0].toarray(), sep='\n')



haven't had the opportunity to open this book.  As it turns out, I have no use for this particular book. can I get a refund.
[[0. 0. 0. ... 0. 0. 0.]]


Classification

In [124]:
#SVM Classifier
from sklearn import svm

# Build and fit a linear-kernel SVM on the TF-IDF training vectors
# (fit returns the estimator itself, so the calls can be chained).
clf_svm = svm.SVC(kernel='linear').fit(train_x_vectors, training_y)

# Spot-check one test review: print its text, then display the predicted label.
print(test_x[6])
clf_svm.predict(test_x_vectors[6])





I am part way through and loving it.  I love his prospective on the south, the legal system - both local and federal, and, in this book, the federal prison system.  I am part way through and can't stop picking it up to read more.  As always, I will probably feel sad when it's over.


array(['POSITIVE'], dtype='<U8')

In [125]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier


# Decision-tree fitting is randomized (features are permuted at each split),
# so pin random_state to make the fitted tree and its scores repeatable for a
# given training set, matching the seeded train_test_split used earlier.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, training_y)

# Spot-check the same test review used for the SVM.
print(test_x[6])
clf_dec.predict(test_x_vectors[6])

I am part way through and loving it.  I love his prospective on the south, the legal system - both local and federal, and, in this book, the federal prison system.  I am part way through and can't stop picking it up to read more.  As always, I will probably feel sad when it's over.


array(['NEGATIVE'], dtype='<U8')

In [126]:
from sklearn.naive_bayes import GaussianNB

# GaussianNB is given dense arrays here, so the sparse TF-IDF matrices are
# converted with .toarray() both for fitting and for prediction.
clf_gnb = GaussianNB().fit(train_x_vectors.toarray(), training_y)

# Spot-check the same test review used for the other classifiers.
print(test_x[6])
clf_gnb.predict(test_x_vectors[6].toarray())

I am part way through and loving it.  I love his prospective on the south, the legal system - both local and federal, and, in this book, the federal prison system.  I am part way through and can't stop picking it up to read more.  As always, I will probably feel sad when it's over.


array(['POSITIVE'], dtype='<U8')

Evaluating Classifiers

In [127]:
#Mean Accuracy

# Test-set score of each classifier, one per line: SVM, decision tree,
# Gaussian naive Bayes (which is fed the dense form of the test matrix).
print(
    clf_svm.score(test_x_vectors, test_y),
    clf_dec.score(test_x_vectors, test_y),
    clf_gnb.score(test_x_vectors.toarray(), test_y),
    sep='\n',
)

0.8076923076923077
0.6418269230769231
0.6610576923076923


In [130]:
#F1 Score
from sklearn.metrics import f1_score

# Per-class F1 for the SVM on the test set. With average=None one score is
# returned per label, in the order given by `labels` (positive, then negative).
f1_score(
    test_y,
    clf_svm.predict(test_x_vectors),
    average=None,
    labels=[Sentiment.POSITIVE, Sentiment.NEGAVTIVE],
)




array([0.80582524, 0.80952381])

In [129]:
# A few ad-hoc sentences to try the fitted SVM on.
test_set = ['I think this book is amazing', 'please do not buy this book', 'time well spent']

# Encode with the already-fitted vectorizer (transform only, no refit).
new_test = vectorizer.transform(test_set)

# Display the predicted label for each sentence.
clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'POSITIVE'], dtype='<U8')

Tuning Model Using Grid Search

In [133]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: two kernels crossed with four values of C.
parameters = {
    'kernel': ('linear', 'rbf'),
    'C': (1, 4, 16, 32),
}

svc = svm.SVC()

# Cross-validated (cv=5) search over every kernel/C combination.
clf = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)

clf.fit(train_x_vectors, training_y)

In [134]:
# Test-set score of the grid-searched model; the untuned linear SVM scored
# about .807 on the same test data.
print(clf.score(test_x_vectors, test_y))

0.8197115384615384


Saving Model

In [135]:
import pickle

# Serialize the fitted grid-search object to disk with pickle.
# NOTE(review): only `clf` is written; the fitted `vectorizer` is not, yet the
# classifier is fed vectorized text, so a fresh session would need it as well.
# NOTE(review): absolute, user-specific path — consider a configurable,
# project-relative location instead.
with open('/Users/zairefrazier/Documents/Developer/models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

Loading the Model for Use

In [136]:
# Read the pickled classifier back from disk into `loaded_clf`.
# Unpickling can execute arbitrary code, so only load files you created or
# otherwise trust.
with open('/Users/zairefrazier/Documents/Developer/models/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)
    

In [137]:
# Check the reloaded classifier on one test review: print the review text,
# then display the label predicted from its vectorized form.
print(test_x[45])

loaded_clf.predict(test_x_vectors[45])

I found it very hard to connect with the author throughout this book.  The story came across to me as insincere, and more about selling books than sharing a belief or story.  Like other reviewers, I found Eben to be arrogant.  My wife reads a lot of books like this, and comparing notes afterwards she felt the same way.  The ending also seemed forced or contrived for dramatic effect.


array(['NEGATIVE'], dtype='<U8')