# Data classes

In [162]:
import random
class Sentiment:
    NEGATIVE = "NEGATIVE"
    POSITIVE = "POSITIVE"
    NEUTRAL = "NEUTRAL"

class Review:
    def __init__(self, text,score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE

class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews

    def get_text(self):
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenlyDistrubtedComments(self):
        negative = list(filter(lambda x : x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x : x.sentiment == Sentiment.POSITIVE, self.reviews))

        print("Positive data ", len(positive))
        print("Negative data", len(negative))

        ## Now just make number of positive == number of negative . positive are 5000+ and negative are 400+
        positive_shrunk = positive[:len(negative)]
        total_reviews = negative + positive_shrunk
        self.reviews = total_reviews

        #too shuffle it randomly
        random.shuffle(self.reviews)
        


        

# Load Data using file

In [164]:
import json
# filename = "C:/Users/91997/OneDrive/Desktop/Books_small.json"
filename = "C:/Users/91997/Downloads/Books_small_10000.json"

reviews = []
with open(filename) as f:
    for lines in f:
        JsonData = json.loads(lines)
        review_text = JsonData["reviewText"]
        review_score = JsonData["overall"]
        reviews.append(Review(review_text, review_score))

print(reviews[5].sentiment)



POSITIVE


# Prep Data

In [166]:
from sklearn.model_selection import train_test_split

# spliting the data for training the model and testing the model
training_data, test_data  = train_test_split(reviews, test_size=0.33, random_state= 42) #training=670, test=330

train_container = ReviewContainer(training_data)
test_container = ReviewContainer(test_data)
container.evenlyDistrubtedComments()

print("distrubuted equally ", len(container.reviews))


Positive data  436
Negative data 436
distrubuted equally  872


In [98]:

train_container.evenlyDistrubtedComments()
#take to columns, 1. for description[text] and 2. for sentiment["Neg", "Pos", "Neut"]

# training_x = [x.text for x in training_data]
# training_y = [x.sentiment for x in training_data] 

training_x = train_container.get_text()
training_y = train_container.get_sentiment()

# test_x = [x.text for x in test_data]
# test_y = [x.sentiment for x in test_data]
test_container.evenlyDistrubtedComments()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

training_x[0]
training_y[0]

test_x[0]
test_y[0]

print(training_y.count(Sentiment.POSITIVE))
print(training_y.count(Sentiment.NEGATIVE))

Positive data  436
Negative data 436
Positive data  208
Negative data 208
436
436


# Bag of words vectorization

In [168]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer()
training_x_vectors = vectorizer.fit_transform(training_x)
test_x_vectors = vectorizer.transform(test_x)

print(training_x[0])
print(training_x_vectors.toarray())


This book &#34;Easy to Build Birdhouses - A Natural Approach: Must Know Info to Attract and Keep the Birds You Want&#34; by A. J. Hamler a Popular Woodworking Paperback, is 143 pages of colorful pictures and bird house designs.  There is even a house designed like an old Box Camera, with viewer hole, winder and pushbutton.  This is one of my most visual Birdhouse Books, recently purchased to sate' my desire to buy everything out there on bird houses. Some of these houses laid out in the book are quite basic, like a nesting box for ducks mounted on a pole and then there are several designs that are almost like works of art, detailed, painted like a newly built home and with details that rival the doll houses made for a favorite little girl.  These are not necessarily in the line of a natural approach, but they are attractive and well built.  Each house is shown as completed in a full page color picture of it, where the author has it mounted for use and display.  Feeders are also shown i

# Classifications (Model selection)

#### Linear SVM

In [170]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(training_x_vectors, training_y)

test_x[0]

clf_svm.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

# Decesion tree

In [172]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(training_x_vectors, training_y)

clf_dec.predict(test_x_vectors[0])


array(['NEGATIVE'], dtype='<U8')

# Naive Bayes

In [174]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(training_x_vectors.toarray(), training_y)

clf_gnb.predict(test_x_vectors[0].toarray())


array(['NEGATIVE'], dtype='<U8')

# Logistic Regression

In [176]:
from sklearn.linear_model import LogisticRegression

clf_lgr = LogisticRegression()
clf_lgr.fit(training_x_vectors.toarray(), training_y)

clf_lgr.predict(test_x_vectors[0].toarray())

array(['NEGATIVE'], dtype='<U8')

# Evaluation

In [178]:
#mean Accuracy

print(clf_svm.score(test_x_vectors, test_y))
print(clf_lgr.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors.toarray(), test_y))
print(clf_dec.score(test_x_vectors, test_y))

0.8076923076923077
0.8028846153846154
0.6610576923076923
0.6490384615384616


# F1 Score

In [180]:
from sklearn.metrics import f1_score
import math

svm = f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
lgr = f1_score(test_y, clf_lgr.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
gnb = f1_score(test_y, clf_gnb.predict(test_x_vectors.toarray()), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
dec = f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])

print("Liear SVM", [f"{math.ceil(x*100)}%" for x in svm])
print("Linear Regression",[f"{math.ceil(x*100)}%" for x in lgr])
print("Naive Bayes",[f"{math.ceil(x*100)}%" for x in gnb])
print("Decision tree", [f"{math.ceil(x*100)}%" for x in dec])

Liear SVM ['81%', '0%', '81%']
Linear Regression ['81%', '0%', '81%']
Naive Bayes ['66%', '0%', '67%']
Decision tree ['65%', '0%', '66%']


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [182]:
training_y.count(Sentiment.NEGATIVE)

436

In [184]:
training_y.count(Sentiment.POSITIVE)

436

In [186]:
training_y.count(Sentiment.NEUTRAL)

0

In [78]:
len(reviews)

10000

In [188]:
test_set = ["Good", "bad book do not read", "horrible waste of time"]
test_set2 = ["not Good", "bad book do not read", "enjoyed every moment"]
new_test = vectorizer.transform(test_set2)

clf_svm.predict(new_test)

array(['NEGATIVE', 'NEGATIVE', 'POSITIVE'], dtype='<U8')

# Turning our model(with Grid Search)

In [196]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm

parameters = {'kernel': ('linear','rbf'), 'C': (1,2,8,16,32)}

svc = svm.SVC( )
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(training_x_vectors, training_y)

In [202]:
print(clf.score(test_x_vectors, test_y))

0.8076923076923077


# Saving Model

In [222]:
import pickle

with open("C:/Users/91997/OneDrive/Desktop/Machine learning project/sentiment_classifier.pkl", "wb") as f:
    pickle.dump(clf, f)
    

# Load model

In [224]:
with open("C:/Users/91997/OneDrive/Desktop/Machine learning project/sentiment_classifier.pkl", "rb") as f:
    loaded_clf = pickle.load(f)

In [210]:
print(test_x[0])

I got this book for free and boy, let me just say that I'm glad I didn't pay for it. The writing wasn't bad, but the story itself was not worth the 5 star reviews it's received. No, I wouldn't recommend this story.


In [212]:
loaded_clf.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')