In [162]:
import random

class Review:
    """A single review: its text, its numeric score, and the derived sentiment label.

    Attributes:
        text: the review body, stored exactly as passed in.
        score: the rating, stored exactly as passed in.
        sentiment: 'Negative', 'Neutral' or 'Positive', computed once at
            construction time from ``score``.
    """

    def __init__(self, text, score):
        self.text, self.score = text, score
        # Derived eagerly, so later changes to ``score`` are not reflected here.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map the score to a label: <= 2 is negative, exactly 3 is neutral, anything else positive."""
        rating = self.score
        if rating == 3:
            return 'Neutral'
        return 'Negative' if rating <= 2 else 'Positive'
        
class ReviewContainer:
    """Wraps a list of review objects and exposes them as parallel lists.

    Each review is expected to provide ``.text`` and ``.sentiment`` attributes.
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in container order."""
        return [review.text for review in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in container order."""
        return [review.sentiment for review in self.reviews]

    def evenly_distribute(self):
        """Balance the container in place to equal numbers of positive and negative reviews.

        Reviews with any other sentiment (e.g. 'Neutral') are dropped. The
        larger class is truncated to the size of the smaller one, keeping the
        earliest reviews, and the result is shuffled with the global ``random``
        module (seed it beforehand for a reproducible order).
        """
        negative = [review for review in self.reviews if review.sentiment == "Negative"]
        positive = [review for review in self.reviews if review.sentiment == "Positive"]
        # Truncate BOTH classes: cutting only the positives leaves the data
        # imbalanced whenever negatives outnumber positives.
        keep = min(len(positive), len(negative))
        self.reviews = positive[:keep] + negative[:keep]
        random.shuffle(self.reviews)  # interleave the two classes

In [163]:
import json

# NOTE(review): absolute, user-specific path; consider a configurable data directory.
file_path = '/Users/tongzhu/python_projects/ml/text/books_small_10000.json'

# The file is read as JSON Lines: one JSON object per line, each carrying a
# 'reviewText' and an 'overall' field.
reviews = []
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(file_path, encoding='utf-8') as f:
    for raw_line in f:
        if not raw_line.strip():
            continue  # tolerate blank lines, which json.loads would reject
        line = json.loads(raw_line)
        reviews.append(Review(line['reviewText'], line['overall']))  # Review(text, score)

reviews[5].text

'I hoped for Mia to have some peace in this book, but her story is so real and raw.  Broken World was so touching and emotional because you go from Mia\'s trauma to her trying to cope.  I love the way the story displays how there is no "just bouncing back" from being sexually assaulted.  Mia showed us how those demons come for you every day and how sometimes they best you. I was so in the moment with Broken World and hurt with Mia because she was surrounded by people but so alone and I understood her feelings.  I found myself wishing I could give her some of my courage and strength or even just to be there for her.  Thank you Lizzy for putting a great character\'s voice on a strong subject and making it so that other peoples story may be heard through Mia\'s.'

In [164]:
# Train/test split: hold out 33% of the reviews for evaluation
from sklearn.model_selection import train_test_split

# random_state is fixed so the same split is produced on every run
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

# Wrap each split in a ReviewContainer so text/sentiment lists can be pulled out of it
train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

In [165]:
# evenly_distribute() shuffles with the global `random` module; seed it so the
# sample order (and anything downstream that depends on it, such as
# cross-validation folds) is the same on every run.
random.seed(42)

# Balance the training split, then extract features and labels in matching order
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

# Same treatment for the held-out split
test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count('Positive'))  # number of positive reviews
print(train_y.count('Negative'))  # number of negative reviews

436
436


In [188]:
# Bag-of-words features weighted by TF-IDF (TfidfVectorizer is what is actually used below)
# NOTE(review): CountVectorizer is imported but not used in this cell
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)  # fit on the training text only, then transform it

test_x_vectors = vectorizer.transform(test_x)  # reuse the fitted vectorizer: transform only, no refit on test text

In [202]:
# classifications

In [203]:
# Linear support vector machine (SVM) classifier
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

# Train on the TF-IDF training vectors; as the cell's last expression the fitted estimator is echoed
clf_svm.fit(train_x_vectors, train_y)

SVC(kernel='linear')

In [204]:
# Decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# Fixed random_state: the tree permutes features randomly at each split, so
# without a seed the fitted tree (and its scores) can change between runs.
clf_dec = DecisionTreeClassifier(random_state=42)

# Train on the TF-IDF training vectors
clf_dec.fit(train_x_vectors, train_y)

DecisionTreeClassifier()

In [205]:
# Gaussian naive Bayes classifier
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()

# The training vectors are converted to a dense array with .toarray() before fitting
clf_gnb.fit(train_x_vectors.toarray(), train_y)

GaussianNB()

In [206]:
# Logistic regression classifier (constructed with default settings)
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()

# Train on the TF-IDF training vectors
clf_log.fit(train_x_vectors, train_y)

LogisticRegression()

In [207]:
# evaluation

In [208]:
# Mean accuracy on the test set, printed in the order:
# SVM, decision tree, naive Bayes, logistic regression
evaluations = (
    (clf_svm, test_x_vectors),
    (clf_dec, test_x_vectors),
    (clf_gnb, test_x_vectors.toarray()),  # dense input, matching how clf_gnb was fitted
    (clf_log, test_x_vectors),
)
for classifier, features in evaluations:
    print(classifier.score(features, test_y))

0.8076923076923077
0.6298076923076923
0.6610576923076923
0.8052884615384616


In [218]:
# F1 scores, reported per class (average=None) for the labels 'Positive' and 'Negative'
from sklearn.metrics import f1_score

# SVM: this is the model the later tuning step focuses on optimizing
f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=['Positive', 'Negative'])

array([0.80582524, 0.80952381])

In [211]:
# Decision tree: per-class F1 on the test set (labels: 'Positive', 'Negative')
f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=['Positive', 'Negative'])

array([0.62068966, 0.63849765])

In [212]:
# Naive Bayes: per-class F1 on the test set; predictions use the dense (.toarray()) test vectors
f1_score(test_y, clf_gnb.predict(test_x_vectors.toarray()), average=None, labels=['Positive', 'Negative'])

array([0.65693431, 0.66508314])

In [213]:
# Logistic regression: per-class F1 on the test set (labels: 'Positive', 'Negative')
f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=['Positive', 'Negative'])

array([0.80291971, 0.80760095])

In [214]:
# Report the class balance of the training labels (a blank line separates the two counts)
positive_count = train_y.count('Positive')
negative_count = train_y.count('Negative')
print(
    f"Number of positive reviews in the training set: {positive_count}\n\n"
    f"Number of negative reviews in the training set: {negative_count}"
)

Number of positive reviews in the training set: 436

Number of negative reviews in the training set: 436


In [254]:
# Sanity check on a few hand-written example reviews
random_test = ['Too bad DO NOT buy', 'not fun at all', 'so damn good']
# Vectorize with the already-fitted vectorizer (transform only, no refit)
random_test_vectors = vectorizer.transform(random_test)

# Predicted sentiment label for each example, using the linear SVM
clf_svm.predict(random_test_vectors)

array(['Negative', 'Negative', 'Positive'], dtype='<U8')

In [227]:
# grid search
from sklearn.model_selection import GridSearchCV

# Search space: both kernels crossed with the integer C values 1 through 10
parameters = {
    'kernel': ('linear', 'rbf'),
    'C': tuple(range(1, 11)),
}

untuned_clf_svm = svm.SVC()
# 5-fold cross-validated grid search over `parameters`; fit() returns the search object itself
tuned_clf_svm = GridSearchCV(
    estimator=untuned_clf_svm,
    param_grid=parameters,
    cv=5,
).fit(train_x_vectors, train_y)

print("Best parameters found: ", tuned_clf_svm.best_params_)
print("Best score achieved: ", tuned_clf_svm.best_score_)

Best parameters found:  {'C': 2, 'kernel': 'rbf'}
Best score achieved:  0.8463054187192119


In [245]:
# Compute test-set metrics for the original and the grid-searched SVM first, then report them
accuracy_before = clf_svm.score(test_x_vectors, test_y)
f1_before = f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=["Positive", "Negative"])
accuracy_after = tuned_clf_svm.score(test_x_vectors, test_y)
f1_after = f1_score(test_y, tuned_clf_svm.predict(test_x_vectors), average=None, labels=["Positive", "Negative"])

print('Before tuning:')
print(f'Mean Accuracy before tuning: {accuracy_before}')
print(f'F1 Score before tuning: {f1_before}')
print('After tuning:')
print(f'Mean Accuracy after tuning: {accuracy_after}')
print(f'F1 Score after tuning: {f1_after}')

Before tuning:
Mean Accuracy before tuning: 0.8076923076923077
F1 Score before tuning: [0.80582524 0.80952381]
After tuning:
Mean Accuracy after tuning: 0.8173076923076923
F1 Score after tuning: [0.82075472 0.81372549]


In [251]:
# saving model
import pickle

# Persist the tuned classifier and the fitted vectorizer as separate pickle files;
# both are needed later, since new text must be vectorized before it can be classified.
artifacts = (
    ('/Users/tongzhu/python_projects/models/text_classifier.pkl', tuned_clf_svm),
    ('/Users/tongzhu/python_projects/models/text_vectorizer.pkl', vectorizer),
)
for target_path, artifact in artifacts:
    with open(target_path, 'wb') as f:
        pickle.dump(artifact, f)

In [257]:
# Load the pickled classifier and vectorizer back from disk
# NOTE(review): pickle.load can execute arbitrary code; only load files that are trusted
with open('/Users/tongzhu/python_projects/models/text_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

with open('/Users/tongzhu/python_projects/models/text_vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)  

In [261]:
# Check that the reloaded classifier and vectorizer work together on new text
random_test2 = ['amazing book', 'great book', 'not good']

# Vectorize with the reloaded vectorizer (transform only)
random_test_vectors2 = loaded_vectorizer.transform(random_test2)

# Predicted sentiment label for each example
loaded_clf.predict(random_test_vectors2)

array(['Positive', 'Positive', 'Negative'], dtype='<U8')