# Data Class

In [1]:
import random

class Sentiment:
    """String labels for the three sentiment classes used in this notebook."""

    # Plain string constants: each name is bound to a string with the same spelling.
    NEGATIVE, NEUTRAL, POSITIVE = "NEGATIVE", "NEUTRAL", "POSITIVE"
    
    
class Review:
    """A single review: its text, its numeric score and the derived sentiment."""

    def __init__(self, text, score) -> None:
        """Store ``text`` and ``score`` and derive ``sentiment`` from the score.

        Args:
            text: The review text.
            score: The numeric rating; it is compared against 2 and 3.
        """
        self.text = text
        self.score = score
        # Computed once at construction; later changes to ``score`` do not
        # update ``sentiment`` automatically.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self) -> str:
        """Map ``self.score`` to a ``Sentiment`` label.

        Returns:
            ``Sentiment.NEGATIVE`` for a score of 2 or less,
            ``Sentiment.NEUTRAL`` for a score equal to 3, and
            ``Sentiment.POSITIVE`` for every other score.

        Raises:
            TypeError: If the score cannot be compared with an int
                (for example when it is ``None``).
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        if self.score == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE
        
class ReviewContainer:
    """Hold a list of reviews and expose their texts and sentiment labels.

    Items only need ``text`` and ``sentiment`` attributes.
    """

    def __init__(self, reviews):
        """Store ``reviews`` (a list of review objects) on the container."""
        self.reviews = reviews

    def get_text(self):
        """Return the ``text`` of every review, in container order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the ``sentiment`` label of every review, in container order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self, seed=None):
        """Replace ``self.reviews`` with a shuffled, class-balanced subset.

        Keeps the same number of NEGATIVE and POSITIVE reviews (the size of
        the smaller class) and drops every other review, NEUTRAL included.

        Args:
            seed: Optional seed for a private random generator, giving a
                reproducible order. When ``None`` (the default) the
                module-level ``random`` generator is used.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]

        # Trim BOTH classes to the smaller one; trimming only the positives
        # leaves the data unbalanced whenever negatives are the majority.
        keep = min(len(negative), len(positive))
        balanced = negative[:keep] + positive[:keep]

        if seed is None:
            random.shuffle(balanced)
        else:
            random.Random(seed).shuffle(balanced)
        self.reviews = balanced

# Load Data

In [2]:
import json

# Read the data
PATH = 'data/sentiment/Books_small_10000.json'

reviews = []
# The file is read as one JSON object per line. Name the encoding explicitly
# so the result does not depend on the platform's default text encoding.
with open(PATH, 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            continue  # skip blank lines instead of failing in json.loads
        review = json.loads(line)
        # Only the review text and the overall rating are kept.
        reviews.append(Review(review.get('reviewText'), review.get('overall')))

# Spot-check one loaded review
reviews[10].text

"My only complaint about this book is that it is much too short. I love this author and this series, and I can't wait for the next installment."

# Prep Data

In [3]:
from sklearn.model_selection import train_test_split

# Seed Python's global RNG: evenly_distribute() shuffles with random.shuffle,
# so without a seed the sample order changes on every run.
random.seed(42)

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

# Balance the training data so NEGATIVE and POSITIVE are equally represented
train_container = ReviewContainer(training)
train_container.evenly_distribute()

# Balance the test data the same way
test_container = ReviewContainer(test)
test_container.evenly_distribute()

In [4]:
import numpy as np

# Pull the inputs (review text) and targets (sentiment labels) out of each container
train_x, train_y = train_container.get_text(), train_container.get_sentiment()
test_x, test_y = test_container.get_text(), test_container.get_sentiment()

# Count how many training samples carry each label
unique_values, counts = np.unique(train_y, return_counts=True)

# Print one "label count" line per class
for unique_value, count in zip(unique_values, counts):
    print(f"{unique_value} {count}")

NEGATIVE 436
POSITIVE 436


## Bag of Words Vectorization

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Text-to-feature step: a TfidfVectorizer with default settings
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer (transform, no refit) for the test text.
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)
 
# Show the first training review next to its feature row
print(train_x[0])
print(train_x_vectors[0].toarray())

Lots of sex, should have a stronger story. I'm not a fan of plot HANGING stories. Does not flow together, jumps from one scene to another poorly.
[[0. 0. 0. ... 0. 0. 0.]]


## Classification

### Linear SVM

In [6]:
from sklearn.svm import SVC

# Create a support vector classifier with a linear kernel and fit it on the training vectors
clf_svm = SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

In [7]:
# Spot check: predict the label of the first test review with the SVM
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [8]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on the same training vectors; random_state is fixed so
# repeated fits are reproducible.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

In [9]:
# Spot check: predict the label of the first test review with the decision tree
clf_dec.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

### Naive Bayes

In [10]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes. Unlike the other models in this notebook, it is given
# a dense array (.toarray()) rather than the vectorizer output directly.
clf_nvb = GaussianNB()
clf_nvb.fit(train_x_vectors.toarray(), train_y)

In [11]:
# Spot check on the first TEST review, like the other model cells. Predicting
# on a training row would only show how the model does on data it was fitted on.
clf_nvb.predict(test_x_vectors[0].toarray())

array(['NEGATIVE'], dtype='<U8')

### Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on the training vectors
clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

In [13]:
# Spot check: predict the label of the first test review with logistic regression
clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Evaluation

In [14]:
# GaussianNB was fitted on dense arrays, so it is scored on a dense copy too
test_x_dense = test_x_vectors.toarray()

# (printed label, fitted model, test features) for each classifier, in report order
scored_models = (
    ('SVM ', clf_svm, test_x_vectors),
    ('NVB ', clf_nvb, test_x_dense),
    ('Dec Tree ', clf_dec, test_x_vectors),
    ('Log Reg ', clf_log, test_x_vectors),
)

# Print each model's score on the test set
for label, model, features in scored_models:
    print(label, model.score(features, test_y))

SVM  0.8076923076923077
NVB  0.6610576923076923
Dec Tree  0.6370192307692307
Log Reg  0.8052884615384616


In [15]:
# F1 Score

from sklearn.metrics import f1_score

# Classes to report; each result array lists its scores in this order
f1_labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE]


def per_class_f1(model, features):
    """Return the per-class F1 scores of ``model``'s predictions against ``test_y``."""
    predictions = model.predict(features)
    return f1_score(test_y, predictions, average=None, labels=f1_labels)


f1_svm = per_class_f1(clf_svm, test_x_vectors)
# GaussianNB was fitted on dense arrays, so it predicts on a dense copy
f1_nvb = per_class_f1(clf_nvb, test_x_vectors.toarray())
f1_dec = per_class_f1(clf_dec, test_x_vectors)
f1_log = per_class_f1(clf_log, test_x_vectors)

for title, scores in (
    ('SVM ', f1_svm),
    ('NVB ', f1_nvb),
    ('Dec Tree ', f1_dec),
    ('Log Reg ', f1_log),
):
    print(title, scores)

SVM  [0.80582524 0.80952381]
NVB  [0.65693431 0.66508314]
Dec Tree  [0.63080685 0.643026  ]
Log Reg  [0.80291971 0.80760095]


In [16]:
# A few short hand-written phrases to try the classifier on
test_set = ['not great', 'book good', 'very fun']
# Vectorize them with the already-fitted vectorizer (transform only, no refit)
new_test = vectorizer.transform(test_set)

# Predicted label for each phrase, in the same order as test_set
clf_svm.predict(new_test)

array(['NEGATIVE', 'POSITIVE', 'POSITIVE'], dtype='<U8')

### Tuning our model (Grid Search)

In [17]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the SVC: two kernels and five values of C
parameters = {
    'kernel' : ('linear', 'rbf'),
    'C' : (1,4,8,16,32)
}

# Grid search over the parameters above with cv=5, fitted on the training vectors
svc = SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

In [18]:
# Parameter combination selected by the grid search
clf.best_params_

{'C': 1, 'kernel': 'linear'}

In [19]:
# Per-class F1 of the grid-searched model on the test set, reported as [POSITIVE, NEGATIVE]
f1_score(test_y, clf.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])

array([0.80582524, 0.80952381])

In [20]:
# Score the grid-searched model (`clf`), not the untuned `clf_svm` fitted earlier
print('SVM ', clf.score(test_x_vectors, test_y))

SVM  0.8076923076923077


## Saving Model

In [21]:
import os
import pickle

# The output folder may not exist yet; open(..., 'wb') would raise
# FileNotFoundError in that case, so create it first.
os.makedirs('models', exist_ok=True)

# NOTE(review): only the classifier is pickled. The fitted `vectorizer` is not
# saved here, yet it is what turns raw text into the features `clf` is given.
with open('models/sentiment_classifier.pkl', 'wb') as file:
    pickle.dump(clf, file)

In [23]:
# Reload the pickled model from disk.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('models/sentiment_classifier.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [26]:
# Show the first test review and its true label ...
print(test_x[0])
print(test_y[0])

# ... then the reloaded model's prediction for that same review
loaded_model.predict(test_x_vectors[0])

This second volume was just as rewarding as the first. To be fair, some of it is just a little boring, but the impact is huge. My preferred way to read these is to read all of a month's given content in that month (i.e., all the July 1862 material I read in July 2012). So, it takes me a year to read it, but it's easier to digest and enjoy that way.Can't wait for the Third Year!
POSITIVE


array(['POSITIVE'], dtype='<U8')