In [11]:
class Review:
    """A single review: its raw text, its numeric score, and a derived sentiment label.

    The label is computed once, at construction time, from ``score``.
    """

    def __init__(self, text,score):
        self.text, self.score = text, score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return 'POSITIVE' for a score above 3, 'NEUTRAL' for exactly 3, else 'NEGATIVE'."""
        rating = self.score
        if rating > 3:
            return 'POSITIVE'
        if rating == 3:
            return 'NEUTRAL'
        return 'NEGATIVE'

In [19]:
import json

filename = 'Books_small.json'
reviews = []
# The file is read as JSON Lines: every non-blank line is parsed as its own JSON object.
# The encoding is given explicitly so decoding does not depend on the platform's locale default.
with open(filename, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at the end of the file)
        record = json.loads(line)
        # Keep only the two fields the rest of the notebook uses: review text and overall score.
        reviews.append(Review(record['reviewText'], record['overall']))

Machine learning models don't work with raw text as input, so we need a way to convert text into a numeric vector, e.g. with a bag-of-words representation.

Prep Data

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; the fixed seed makes the split repeatable.
train, test = train_test_split(
    reviews,
    test_size=0.2,
    random_state=42,
)

In [31]:
# Separate each partition into model inputs (review text) and targets (sentiment label).
train_x = [review.text for review in train]
train_y = [review.sentiment for review in train]
test_x = [review.text for review in test]
test_y = [review.sentiment for review in test]

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of words: the vocabulary is learned from the training text only (fit_transform),
# and the test text is then encoded with that same fitted vectorizer (transform).
count_vect = CountVectorizer()
train_x_vector, test_x_vector = (
    count_vect.fit_transform(train_x),
    count_vect.transform(test_x),
)

#### Support Vector Machines

In [47]:
from sklearn import svm

# Linear-kernel SVM trained on the vectorized training reviews (fit returns the estimator itself).
clf_svm = svm.SVC(kernel='linear').fit(train_x_vector, train_y)

# Mean accuracy on the held-out reviews; as the cell's last expression it is what gets displayed.
clf_svm.score(test_x_vector, test_y)

An intriguing book, but I am ashamed to admit I sometimes did not fully comprehend what was said, particularly when Chesterton referred to other people (writers) that I am not knowledgeable of.  At any rate a good read.


0.83

#### Decision Trees

In [49]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on the vectorized training reviews.
# random_state is fixed (same seed as the train/test split) so that re-running the
# notebook builds the same tree and reports the same score.
clf_tree = DecisionTreeClassifier(random_state = 42)
clf_tree.fit(train_x_vector, train_y)

# Mean accuracy on the held-out reviews; as the cell's last expression it is what gets displayed.
clf_tree.score(test_x_vector, test_y)

0.765

#### Naive Bayes

In [61]:
from sklearn.naive_bayes import GaussianNB
import numpy as np
# Gaussian Naive Bayes; the vectorized reviews are converted to dense arrays
# with .toarray() before being passed to the estimator.
clf_nb = GaussianNB()
clf_nb.fit(train_x_vector.toarray(), train_y)

# Mean accuracy on the held-out reviews; as the cell's last expression it is what gets displayed.
clf_nb.score(test_x_vector.toarray(), test_y)

0.79

In [79]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes, fitted and scored directly on the vectorizer output
# (no intermediate array copies or dense conversion are needed).
clf_multi = MultinomialNB()
clf_multi.fit(train_x_vector, train_y)

# Mean accuracy on the held-out reviews; as the cell's last expression it is what gets displayed.
clf_multi.score(test_x_vector, test_y)

0.87

#### Logistic Regression

In [70]:
from sklearn.linear_model import LogisticRegression
# Logistic regression in one-vs-rest mode over the three sentiment classes.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class` argument —
# confirm against the installed version before upgrading.
clf_log = LogisticRegression(multi_class = 'ovr')
clf_log.fit(train_x_vector, train_y)

# Mean accuracy on the held-out reviews; as the cell's last expression it is what gets displayed.
clf_log.score(test_x_vector, test_y)

0.85

#### Evaluation using F1 score

In [81]:
from sklearn.metrics import f1_score

# Per-class F1 for each classifier: one printed row per model, with the columns
# in the order given by sentiment_labels.
sentiment_labels = ['POSITIVE', 'NEUTRAL', 'NEGATIVE']
for model, features in (
    (clf_svm, test_x_vector),
    (clf_tree, test_x_vector),
    (clf_nb, test_x_vector.toarray()),  # GaussianNB gets dense input, as it did when fitted
    (clf_multi, test_x_vector),
    (clf_log, test_x_vector),
):
    print(f1_score(test_y, model.predict(features), average=None, labels=sentiment_labels))

[0.91428571 0.27777778 0.14285714]
[0.86627907 0.12903226 0.16      ]
[0.88700565 0.06451613 0.        ]
[0.93048128 0.         0.        ]
[0.92436975 0.27586207 0.14285714]


The lack of accuracy in predicting neutral or negative comments is actually due to a data problem, not a modelling problem. We have far more positive examples than neutral or negative ones, so the models never learn to identify neutral or negative comments to a similar degree.

The other flaw with the bag-of-words approach is that non-adjective words such as 'I', 'the', etc. are given the same weight as sentiment-heavy words such as 'great', 'bad', etc.

#### TfidfVectorizer: words that occur in more documents carry less weight

In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-vectorize the reviews with TF-IDF and refit every classifier on the new features.
# The names from the bag-of-words cells (count_vect, train_x_vector, test_x_vector, clf_*)
# are rebound here, so the F1 cell that follows evaluates the TF-IDF models.
# Despite its name, count_vect now holds a TfidfVectorizer.
count_vect = TfidfVectorizer()
train_x_vector = count_vect.fit_transform(train_x)
test_x_vector = count_vect.transform(test_x)

# Linear SVM
clf_svm = svm.SVC(kernel = 'linear')
clf_svm.fit(train_x_vector, train_y)

# Decision tree (seeded so that repeated runs build the same tree)
clf_tree = DecisionTreeClassifier(random_state = 42)
clf_tree.fit(train_x_vector, train_y)

# Gaussian Naive Bayes (fitted on a dense copy of the features)
clf_nb = GaussianNB()
clf_nb.fit(train_x_vector.toarray(), train_y)

# Multinomial Naive Bayes (fitted directly on the vectorizer output and the label list)
clf_multi = MultinomialNB()
clf_multi.fit(train_x_vector, train_y)

# Logistic regression, one-vs-rest. Kept as the last expression so the fitted
# estimator is what the cell displays.
clf_log = LogisticRegression(multi_class = 'ovr')
clf_log.fit(train_x_vector, train_y)

LogisticRegression(multi_class='ovr')

In [89]:
# Per-class F1 for each classifier: one printed row per model, with the columns
# in the order given by sentiment_labels.
sentiment_labels = ['POSITIVE', 'NEUTRAL', 'NEGATIVE']
for model, features in (
    (clf_svm, test_x_vector),
    (clf_tree, test_x_vector),
    (clf_nb, test_x_vector.toarray()),  # GaussianNB gets dense input, as it did when fitted
    (clf_multi, test_x_vector),
    (clf_log, test_x_vector),
):
    print(f1_score(test_y, model.predict(features), average=None, labels=sentiment_labels))

[0.93333333 0.         0.        ]
[0.85630499 0.10526316 0.19047619]
[0.88450704 0.06666667 0.        ]
[0.93333333 0.         0.        ]
[0.93333333 0.         0.        ]
