In [51]:
import random

class Review:
    """One review: its raw text, its numeric score and a derived sentiment label.

    The sentiment is computed once, at construction time, from ``score``.
    """

    def __init__(self, text, score):
        self.text = text    # exposed so callers can read review.text
        self.score = score  # exposed so callers can read review.score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to 'Negative', 'Neutral' or 'Positive'.

        A score of exactly 3 is 'Neutral', a score of 2 or less is 'Negative',
        and every other score is 'Positive' (this includes non-integer scores
        that lie strictly between 2 and 3).
        """
        if self.score == 3:
            return 'Neutral'
        return 'Negative' if self.score <= 2 else 'Positive'
        
class ReviewContainer:
    """Holds a list of review objects and exposes their fields as parallel lists.

    Every item in ``reviews`` is expected to provide ``.text`` and
    ``.sentiment`` attributes.
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in container order."""
        return [review.text for review in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in container order."""
        return [review.sentiment for review in self.reviews]

    def evenly_distribute(self, seed=None):
        """Keep equally many 'Positive' and 'Negative' reviews, then shuffle.

        Reviews with any other sentiment (e.g. 'Neutral') are dropped. Both
        classes are cut down to the size of the smaller one, so the result is
        balanced no matter which class is larger. ``self.reviews`` is rebound
        to a new list; the list originally passed in is not modified.

        Args:
            seed: Optional seed for a private ``random.Random`` instance, for a
                reproducible shuffle. With the default ``None`` the
                module-level ``random.shuffle`` is used, so the global random
                state decides the order.
        """
        negative = [review for review in self.reviews if review.sentiment == "Negative"]
        positive = [review for review in self.reviews if review.sentiment == "Positive"]

        # Truncate BOTH classes: cutting only the positives leaves the data
        # unbalanced whenever the negatives are the majority.
        size = min(len(negative), len(positive))
        balanced = positive[:size] + negative[:size]

        if seed is None:
            random.shuffle(balanced)
        else:
            random.Random(seed).shuffle(balanced)
        self.reviews = balanced

In [52]:
import json

# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
file_path = '/Users/tongzhu/python_projects/ml/text/books_small_10000.json'

# The file is read line by line: every non-blank line is parsed as one JSON
# object that provides the 'reviewText' and 'overall' keys.
reviews = []
with open(file_path, encoding='utf-8') as f:  # explicit encoding instead of the platform default
    for raw_line in f:
        if not raw_line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        line = json.loads(raw_line)
        reviews.append(Review(line['reviewText'], line['overall']))  # Review(text, score)

In [53]:
# train test split
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; the fixed random_state keeps the split repeatable.
training, test = train_test_split(
    reviews,
    test_size=0.33,
    random_state=8,
)

# Wrap each split so its texts and sentiment labels can be pulled out as lists.
train_container, test_container = ReviewContainer(training), ReviewContainer(test)

In [54]:
# Balance each split (training first, then test), then read texts and labels back out.
train_container.evenly_distribute()
test_container.evenly_distribute()

train_x, train_y = train_container.get_text(), train_container.get_sentiment()
test_x, test_y = test_container.get_text(), test_container.get_sentiment()

# Positive count, then negative count, of the balanced training set (one per line).
print(train_y.count('Positive'), train_y.count('Negative'), sep='\n')

428
428


In [55]:
# bag of words, weighted by TF-IDF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TfidfVectorizer is used with its default settings; CountVectorizer is imported but not used in this cell.
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)  # fit on the training texts only, then transform them

test_x_vectors = vectorizer.transform(test_x)  # transform only: the test texts are never used for fitting

In [56]:
# classifiers

# linear Support Vector Machine (SVM)
from sklearn import svm

# Only the kernel is set explicitly; all other SVC settings are left at their defaults.
clf_svm = svm.SVC(kernel='linear')

# Fit on the TF-IDF training vectors and their sentiment labels.
clf_svm.fit(train_x_vectors, train_y)

SVC(kernel='linear')

In [57]:
# decision tree
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): no random_state is passed, so the fitted tree (and its scores)
# may differ between runs; confirm whether reproducibility matters here.
clf_dec = DecisionTreeClassifier()

# Fit on the same TF-IDF training vectors and labels as the other classifiers.
clf_dec.fit(train_x_vectors, train_y)

DecisionTreeClassifier()

In [58]:
# naive bayes
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()

# This classifier is fit on a dense copy of the training vectors (.toarray());
# the same conversion is applied again wherever clf_gnb scores or predicts.
clf_gnb.fit(train_x_vectors.toarray(), train_y)

GaussianNB()

In [59]:
# logistic regression
from sklearn.linear_model import LogisticRegression

# Created with default settings only.
clf_log = LogisticRegression()

# Fit on the same TF-IDF training vectors and labels as the other classifiers.
clf_log.fit(train_x_vectors, train_y)

LogisticRegression()

In [60]:
# evaluation

# mean accuracy on the balanced test set, printed in this order:
# SVM, decision tree, naive Bayes (needs dense input), logistic regression
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors.toarray(), test_y))
print(clf_log.score(test_x_vectors, test_y))

0.8611111111111112
0.6851851851851852
0.6597222222222222
0.8402777777777778


In [61]:
# F1 scores
from sklearn.metrics import f1_score

# svm: average=None requests one F1 value per label, listed in the order of `labels`
f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=['Positive', 'Negative'])
# the SVM is the classifier tuned later in this notebook (see the grid search cell)

array([0.85849057, 0.86363636])

In [62]:
# decision tree: per-label F1 for ['Positive', 'Negative'] on the test set
f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=['Positive', 'Negative'])

array([0.67307692, 0.69642857])

In [63]:
# naive bayes: per-label F1 for ['Positive', 'Negative']; predictions use the dense test vectors
f1_score(test_y, clf_gnb.predict(test_x_vectors.toarray()), average=None, labels=['Positive', 'Negative'])

array([0.66361556, 0.6557377 ])

In [64]:
# logistic regression: per-label F1 for ['Positive', 'Negative'] on the test set
f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=['Positive', 'Negative'])

array([0.83610451, 0.84424379])

In [65]:
# Report the class balance of the training labels. The explicit \n plus the
# literal line break in the string produce a blank line between the two counts.
print(f'''Number of positive reviews in the training set: {train_y.count('Positive')}\n
Number of negative reviews in the training set: {train_y.count('Negative')}''')

Number of positive reviews in the training set: 428

Number of negative reviews in the training set: 428


In [66]:
# hand-written sample sentences for a quick sanity check (they are fixed, not randomly generated)
random_test = ['Too bad DO NOT buy', 'not fun at all', 'so damn good']
random_test_vectors = vectorizer.transform(random_test)  # reuse the vectorizer fitted on the training texts

# Predict with the linear SVM trained earlier; the result is the cell's displayed output.
clf_svm.predict(random_test_vectors)

array(['Negative', 'Negative', 'Positive'], dtype='<U8')

In [67]:
# grid search
from sklearn.model_selection import GridSearchCV

# Candidate SVC settings: the search tries every (kernel, C) combination.
parameters = dict(
    kernel=('linear', 'rbf'),
    C=(0.25, 0.5, 0.75, 1, 5, 10),
)

# Search with 5-fold cross-validation, using the training vectors only.
untuned_clf_svm = svm.SVC()
tuned_clf_svm = GridSearchCV(
    estimator=untuned_clf_svm,
    param_grid=parameters,
    cv=5,
)
tuned_clf_svm.fit(train_x_vectors, train_y)

print("Best parameters found: ", tuned_clf_svm.best_params_)
print("Best score achieved: ", tuned_clf_svm.best_score_)

Best parameters found:  {'C': 5, 'kernel': 'rbf'}
Best score achieved:  0.8621039031687747


In [68]:
# Compare the original linear SVM (clf_svm) with the grid-searched model
# (tuned_clf_svm) on the same test vectors: mean accuracy, then per-label F1
# in the order ["Positive", "Negative"].
print('Before tuning:')
print(f'Mean Accuracy before tuning: {clf_svm.score(test_x_vectors, test_y)}')
print(f'F1 Score before tuning: {f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=["Positive", "Negative"])}')
print('After tuning:')
print(f'Mean Accuracy after tuning: {tuned_clf_svm.score(test_x_vectors, test_y)}')
print(f'F1 Score after tuning: {f1_score(test_y, tuned_clf_svm.predict(test_x_vectors), average=None, labels=["Positive", "Negative"])}')

Before tuning:
Mean Accuracy before tuning: 0.8611111111111112
F1 Score before tuning: [0.85849057 0.86363636]
After tuning:
Mean Accuracy after tuning: 0.8564814814814815
F1 Score after tuning: [0.85377358 0.85909091]


In [69]:
# saving model
import pickle

# NOTE(review): absolute, machine-specific output paths; open() does not create
# missing directories, so the target folder must already exist.
# The whole GridSearchCV object is pickled here, not just one estimator.
with open('/Users/tongzhu/python_projects/models/text_classifier.pkl', 'wb') as f:
    pickle.dump(tuned_clf_svm, f)

# The fitted vectorizer is saved too, so new text can be transformed the same way before predicting.
with open('/Users/tongzhu/python_projects/models/text_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [70]:
# load model
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files you trust (these two are the files written by the save cell).
with open('/Users/tongzhu/python_projects/models/text_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

with open('/Users/tongzhu/python_projects/models/text_vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)  

In [71]:
# test if the classifier and vectorizer are loaded correctly
# (fixed, hand-written sentences; this is a smoke test, not an evaluation)
random_test2 = ['amazing book', 'great book', 'not good']

# Vectorize with the loaded vectorizer, then predict with the loaded classifier.
random_test_vectors2 = loaded_vectorizer.transform(random_test2)

loaded_clf.predict(random_test_vectors2)

array(['Positive', 'Positive', 'Negative'], dtype='<U8')