In [1]:
# Created classes to help keep code readable and minimal

import random

class Sentiment:
    """String labels for the sentiment classes a review can be assigned."""
    NEGATIVE = 'NEGATIVE'
    # Misspelled legacy name, kept as an alias because other cells in this
    # notebook reference it.
    NEGAVTIVE = NEGATIVE
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'
    
class Review:
    """One review: its text, its numeric score and the label derived from the score."""

    def __init__(self, text, score):
        self.text, self.score = text, score
        # The label is computed once, at construction time, from the score.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the label for this review's score.

        Scores of 2 or less are negative, a score equal to 3 is neutral and
        every other score is positive.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGAVTIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE
        
class ReviewContainer:
    """Holds a list of review objects and exposes their texts and labels."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every held review, in the current order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every held review, in the current order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self):
        """Rebalance ``self.reviews`` to equal numbers of negatives and positives.

        Neutral reviews are dropped. Both classes are cut down to the size of
        the smaller one, then the combined list is shuffled in place with the
        ``random`` module. Replaces ``self.reviews``; returns nothing.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGAVTIVE]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]

        # Truncate both sides so the result is balanced whichever class is in
        # the majority (truncating only the positives leaves a
        # negative-majority set unbalanced).
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)
        

In [31]:
import pandas as pd
import json
import sklearn
import numpy as np

# Review dump that is parsed one JSON object per line.
file_name = './Data/Books_small_10000.json'

reviews = []
# Decode explicitly as UTF-8 instead of relying on the platform default.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline), which json.loads rejects.
        if not line.strip():
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Spot-check: label of one loaded review.
reviews[45].sentiment

'POSITIVE'

Preparing Training/Test Data

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; the fixed seed keeps the split repeatable.
train, test = train_test_split(
    reviews,
    test_size=0.33,
    random_state=42,
)

# Wrap each split so its texts and labels can be pulled out together.
train_container, test_container = ReviewContainer(train), ReviewContainer(test)


In [4]:
# Seed Python's RNG first: evenly_distribute() shuffles with the `random`
# module, so without a fixed seed the row order (and every index-based
# example further down, such as test_x[6]) changes from run to run.
random.seed(42)

# Balance the training split, then pull out texts and labels in matching order.
train_container.evenly_distribute()
training_x = train_container.get_text()
training_y = train_container.get_sentiment()

# Same treatment for the test split.
test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

# Class counts in the balanced training set.
print(training_y.count(Sentiment.POSITIVE))
print(training_y.count(Sentiment.NEGAVTIVE))

436
436


In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Fit the TF-IDF vectorizer on the training texts only and vectorize them.
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(training_x)

# The test texts are transformed with the already-fitted vectorizer.
test_x_vectors = vectorizer.transform(test_x)

# Show the first training review followed by its dense feature vector.
print(training_x[0], train_x_vectors[0].toarray(), sep='\n')



I thought I'd be interested in this book after hearing the author interviewed on NPR, but this is just one boring recitation after another in an endless recounting of doping throughout cycling history.  Yes, Lance Armstrong is every bit the unappealing jerk he is reputed to be, and there are endless boring recitations of this as well .. and then he did this bad thing, and then he did that bad thing, and then he was a complete jerk to this person, and then he was a complete jerk to that person.  Okay already .. everybody dopes, everybody always has doped, everybody probably always will dope, this nameless person you've never heard of aided and abetted, that nameless person you've never heard of aided and abetted, this nameless rider you've never heard of had his career ruined, that nameless rider you've never heard of had his career ruined, Lance is a jerk, Lance is a jerk, Lance is a jerk.  Etc.  Doubt I'll finish the book ..
[[0. 0. 0. ... 0. 0. 0.]]


Classification

In [6]:
#SVM Classifier
from sklearn import svm

# Linear-kernel support vector classifier, trained on the training vectors.
# fit() returns the estimator itself, so construction and training are chained.
clf_svm = svm.SVC(kernel='linear').fit(train_x_vectors, training_y)

# Spot-check one test review: its text, then the predicted label.
print(test_x[6])
clf_svm.predict(test_x_vectors[6])





oMG!!!!!!!!!!!! I just can't get enough of Sean and Avery. I can't wait until the next arrangement and see what new twist is going to have but please make the story longer. I literally read this book on my way to work in less than an hour.


array(['POSITIVE'], dtype='<U8')

In [7]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier


# Decision tree trained on the same TF-IDF vectors. random_state is pinned
# (same seed as the train/test split) so the fitted tree, and therefore its
# predictions and score, are repeatable between runs.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, training_y)

# Spot-check one test review: its text, then the predicted label.
print(test_x[6])
clf_dec.predict(test_x_vectors[6])

oMG!!!!!!!!!!!! I just can't get enough of Sean and Avery. I can't wait until the next arrangement and see what new twist is going to have but please make the story longer. I literally read this book on my way to work in less than an hour.


array(['POSITIVE'], dtype='<U8')

In [8]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes. The sparse TF-IDF matrix is converted to a dense
# array before being handed to this estimator.
dense_train = train_x_vectors.toarray()
clf_gnb = GaussianNB().fit(dense_train, training_y)
del dense_train  # scratch copy only; keep the notebook namespace unchanged

# Spot-check one test review: its text, then the predicted label.
print(test_x[6])
clf_gnb.predict(test_x_vectors[6].toarray())

oMG!!!!!!!!!!!! I just can't get enough of Sean and Avery. I can't wait until the next arrangement and see what new twist is going to have but please make the story longer. I literally read this book on my way to work in less than an hour.


array(['NEGATIVE'], dtype='<U8')

Evaluating Classifiers

In [9]:
# Mean accuracy on the test set, one line per model, in the order:
# SVM, decision tree, Gaussian naive Bayes (scored on a dense array).
print(
    clf_svm.score(test_x_vectors, test_y),
    clf_dec.score(test_x_vectors, test_y),
    clf_gnb.score(test_x_vectors.toarray(), test_y),
    sep='\n',
)

0.8076923076923077
0.6418269230769231
0.6610576923076923


In [10]:
#F1 Score
from sklearn.metrics import f1_score

# Per-class F1 of the SVM on the test set. With average=None one score is
# returned per entry of `labels`, in the order given (positive, then negative).
f1_score(
    test_y,
    clf_svm.predict(test_x_vectors),
    labels=[Sentiment.POSITIVE, Sentiment.NEGAVTIVE],
    average=None,
)
#f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels= [Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGAVTIVE])




array([0.80582524, 0.80952381])

In [11]:
# A few hand-written sentences to spot-check the SVM.
test_set = [
    'I think this book is amazing',
    'please do not buy this book',
    'time well spent',
]

# Vectorize with the already-fitted vectorizer, then classify.
new_test = vectorizer.transform(test_set)
clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'POSITIVE'], dtype='<U8')

Tuning Model Using Grid Search

In [12]:
from sklearn.model_selection import GridSearchCV

# Search space: two kernels crossed with four values of C.
parameters = {
    'kernel': ('linear', 'rbf'),
    'C': (1, 4, 16, 32),
}

svc = svm.SVC()

# 5-fold cross-validated search over every parameter combination.
clf = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)
clf.fit(train_x_vectors, training_y)

In [13]:
# Accuracy of the grid-searched classifier on the test set, for comparison
# with the earlier linear SVM score of about .807.
print(clf.score(test_x_vectors, test_y))

0.8100961538461539


Saving Model

In [14]:
import pickle

# Serialize the grid-search object `clf` to disk in binary mode.
# NOTE(review): the path is an absolute, machine-specific location, and the
# fitted `vectorizer` is not written here — confirm it is persisted somewhere
# if the model is to be used outside this session.
with open('/Users/zairefrazier/Documents/Developer/models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

Loading Model to use

In [15]:
# Read the pickled classifier back from the same path it was written to.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
with open('/Users/zairefrazier/Documents/Developer/models/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)
    

In [16]:
# Show one test review, then the label the reloaded classifier assigns to it.
print(test_x[45])

loaded_clf.predict(test_x_vectors[45])

As I was reading this book I kept wondering why we were getting so much endless detail about the inn and the businesses in the town of Boonsboro; seriously, this read almost like a brochure put out by the Boonsboro chamber of commerce, with special emphasis on the inn.  Then I googled Boonsboro and discovered that the inn in question exists in "real life" and is owned by the author, as is the bookshop.  This use of a novel to promote a business venture would be fine if the story were interesting.  It's not.  The romance is only a side story -- the real story is about all the details relating to the refurbishment of the Inn, and the plans to buy stuff to put in it.  The heroine is perfectly acceptable but seems untouched by her past, the Hero is unbelievably perfect, and there is absolutely no tension in the relationship at all.  No angst, no obstacles in the way, nothing.  At the end I felt as if I'd been manipulated into reading a long ad for the inn.  I won't be reading the others in

array(['NEGATIVE'], dtype='<U8')

In [26]:
# Read the response file into a list, one whitespace-stripped entry per line.
# The context manager closes the handle as soon as reading is done, and the
# encoding is stated explicitly rather than left to the platform default.
with open('/Users/zairefrazier/Documents/Developer/NLP-Classifier/ceb_rep.txt', 'r', encoding='utf-8') as f:
    ceb_rep = [line.strip() for line in f]

In [29]:
# Show one entry of the loaded responses as a sanity check.
print(ceb_rep[5])

Good things as I’m like help me with work


In [36]:
# Hand-typed sample responses (not used by the statements below).
ceb_Data = [
    'Helping me with my homework',
    'College programs',
    'Helping me with hard work and Guiding me',
    'They have gotten on me about things that are important and it has helped',
    'We don’t talk about my academics ',
]

# Vectorize every response read from the file with the fitted vectorizer,
# then classify them with the reloaded model.
ceb_test = vectorizer.transform(ceb_rep)
loaded_clf.predict(ceb_test)

# Disabled tally of the predictions. NOTE(review): `ceb_res` is not assigned
# in any visible cell, so these lines would need the predictions stored first.
# p_count = np.count_nonzero(ceb_res == 'POSITIVE')

# n_count = np.count_nonzero(ceb_res == 'NEGATIVE')

# print(n_count)

array(['POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE',
       'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE',
       'NEGATIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'NEGATIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE',
       'NEGATIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE',
       'NEGATIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'NEGATIVE'], dtype='<U8')

Final