In [3]:
class Sentiment:
    """String labels used as the sentiment classes of a review."""

    NEGATIVE, NEUTRAL, POSITIVE = 'NEGATIVE', 'NEUTRAL', 'POSITIVE'
    
class Review:
    """One review: its text, its numeric score, and the sentiment derived from the score."""

    def __init__(self, text, score):
        self.text, self.score = text, score
        # Derived once at construction; later changes to `score` do not refresh it.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the Sentiment label for this review's score.

        Scores of 2 or less are NEGATIVE, a score of exactly 3 is NEUTRAL,
        and every other score is POSITIVE.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE
            
        

In [4]:
import json

filename = 'Books_small.json'

# The file is read as JSON Lines: one JSON object per line, from which the
# 'reviewText' and 'overall' fields are taken.
reviews = []
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(filename, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. an empty line at the end of the file
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Only the last expression of a cell is displayed, so the first two lines
# below are evaluated but not shown.
reviews[5].text # the review text
reviews[5].score # the numeric rating
reviews[5].sentiment # the derived POSITIVE/NEUTRAL/NEGATIVE label

'POSITIVE'

# Prep Data

In [5]:
pip install -U scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [6]:
from sklearn.model_selection import train_test_split


# Keep 90% of the reviews for training and hold out the rest for testing;
# the fixed random_state makes the split repeatable.
training, test = train_test_split(
    reviews,
    train_size=0.9,
    random_state=42,
)

In [7]:
# Separate each partition into model inputs (review text) and targets (sentiment label).
train_x = [review.text for review in training]
train_y = [review.sentiment for review in training]

test_x = [review.text for review in test]
test_y = [review.sentiment for review in test]


# Peek at the first training example; only the label (the last expression) is displayed.
train_x[0]
train_y[0]

'POSITIVE'

# Bag-of-Words Vectorization

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vectorizer on the training text only and encode that text as vectors.
vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

# Encode the test text with the already-fitted vectorizer (transform, not fit_transform).
test_x_vectors = vectorizer.transform(test_x)


print(train_x[0])
print(train_x_vectors[0].toarray()) # the first review's vector, converted to an array for printing

One of the best books I've ever read. The research for the history was spot on. Plus the love story held lots of surprises. Love learning about history with this quality of a book.
[[0 0 0 ... 0 0 0]]


# Classification

In [9]:
test_x[0]

# This is supposed to be a positive review.
# Let's see whether our classifiers predict it correctly.

"Every new Myke Cole book is better than the last, and this is no exception. If you haven't read the Shadow Ops series before start with Control Point, but go ahead and order Fortress Frontier and Breach Zone as well - you're going to want them."

In [10]:
## Linear SVM

from sklearn import svm # support vector machine classifier

# Support vector classifier with a linear kernel, trained on the vectorized training text.
clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

# Evaluated but not displayed: only the cell's last expression is shown.
test_x[0]

# Predict the sentiment of the first test review.
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

# Decision Tree

In [11]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed so the fitted tree is repeatable: the tree shuffles candidate
# features at each split, so equally good splits can otherwise be chosen
# differently from run to run. Same seed value as the train/test split.
# NOTE(review): outputs recorded before the seed was added may differ after a re-run.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

# Predict the sentiment of the first test review.
clf_dec.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

# Evaluation
<br>This time we score the classifiers on the whole test set rather than on a single review.

In [12]:
# Mean accuracy of the linear SVM over the whole test set.
clf_svm.score(test_x_vectors, test_y)

0.88

In [13]:
# Mean accuracy of the decision tree over the whole test set.
clf_dec.score(test_x_vectors, test_y)

0.76

In [15]:
#F1 Scores
from sklearn.metrics import f1_score

# Per-class F1 scores for the SVM (average=None), one per label listed in `labels`.
f1_score(
    test_y,
    clf_svm.predict(test_x_vectors),
    average=None,
    labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE],
)

# The model predicts POSITIVE well but does poorly on NEUTRAL and NEGATIVE.
# The cause may lie in the data rather than in the model.

array([0.93922652, 0.33333333, 0.28571429])

In [17]:
train_y.count(Sentiment.POSITIVE)

# The model will be heavily biased because so many of the training labels are POSITIVE.

744