In [3]:
import sklearn as sk
import random
import numpy as np
import pandas as pd

In [4]:
import json

In [5]:
# Load data
# Load text data in json format

In [6]:
# Peek at the raw data: print only the first line of the file
with open("Books_small.json") as f:
    first_line = f.readline()  # empty string when the file has no content
    if first_line:
        print(first_line)

{"reviewerID": "A1E5ZR1Z4OQJG", "asin": "1495329321", "reviewerName": "Pure Jonel \"Pure Jonel\"", "helpful": [0, 0], "reviewText": "Da Silva takes the divine by storm with this unique new novel.  She develops a world unlike any others while keeping it firmly in the real world.  This is a very well written and entertaining novel.  I was quite impressed and intrigued by the way that this solid storyline was developed, bringing the readers right into the world of the story.  I was engaged throughout and definitely enjoyed my time spent reading it.I loved the character development in this novel.  Da Silva creates a cast of high school students who actually act like high school students.  I really appreciated the fact that none of them were thrown into situations far beyond their years, nor did they deal with events as if they had decades of life experience under their belts.  It was very refreshing and added to the realism and impact of the novel.  The friendships between the characters i

In [7]:
# Parse the first record as JSON and show the two fields used later on
with open("Books_small.json") as f:
    first_line = f.readline()
    if first_line:
        review = json.loads(first_line)  # convert the JSON text into a Python dict
        print("REVIEW = ", review["reviewText"])
        print("SCORE = ", review["overall"])

REVIEW =  Da Silva takes the divine by storm with this unique new novel.  She develops a world unlike any others while keeping it firmly in the real world.  This is a very well written and entertaining novel.  I was quite impressed and intrigued by the way that this solid storyline was developed, bringing the readers right into the world of the story.  I was engaged throughout and definitely enjoyed my time spent reading it.I loved the character development in this novel.  Da Silva creates a cast of high school students who actually act like high school students.  I really appreciated the fact that none of them were thrown into situations far beyond their years, nor did they deal with events as if they had decades of life experience under their belts.  It was very refreshing and added to the realism and impact of the novel.  The friendships between the characters in this novel were also truly touching.Overall, this novel was fantastic.  I can&#8217;t wait to read more and to find out w

In [8]:
# Raw way of creating training data: one (text, score) tuple per line of the file
with open("Books_small.json") as f:
    reviews = [
        (record["reviewText"], record["overall"])
        for record in map(json.loads, f)
    ]

In [9]:
# Retrieve a random sample from the training data.
# random.choice draws from whatever `reviews` currently holds, so no dataset
# size (previously a hard-coded 999) is baked into the cell.
random.choice(reviews)

('Information on the CCRN dose not change that much, so using an outdated book will not effect you too much. I used this and looked up the blue print from AACN. This book is a bit winded, and goes into a lot of detail about anatomy and such that you might not really need to know it pass. If you want to be know all that information, then you are a far more studious nurse then I.  The book is broken up well so you can easily skip this part, and just study the patho. The CD is great! I fell like that helped me a ton!',
 4.0)

In [10]:
class Review:
    """A single review: its text, its numeric score and a derived sentiment label.

    Attributes:
        text: the review text passed to the constructor.
        score: the numeric rating passed to the constructor.
        sentiment: "POSITIVE", "NEUTRAL" or "NEGATIVE", derived from `score`
            once, at construction time.
    """

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Computed once here; changing `score` afterwards does not refresh it.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map the score to a sentiment label.

        Returns:
            "POSITIVE" for scores of 4 and above, "NEUTRAL" for scores from 3
            up to (but not including) 4, and "NEGATIVE" for anything below 3.
        """
        if self.score >= 4:
            return "POSITIVE"
        elif self.score >= 3:
            # Range check instead of `== 3`, so a fractional score such as 3.5
            # is not silently classified as NEGATIVE.
            return "NEUTRAL"
        else:
            return "NEGATIVE"

# Structured way of creating training data: one Review object per line of the file
with open("Books_small.json") as f:
    reviews = [
        Review(record["reviewText"], record["overall"])
        for record in map(json.loads, f)
    ]

In [11]:
# Retrieve from training data.
# Rather than remembering the position of each field in the raw tuples, the
# structured storage lets us access a field by its name (text, score, sentiment).
# The upper bound is taken from the data (randint is inclusive on both ends),
# so the cell keeps working when the dataset size is not exactly 1000.
index = random.randint(0, len(reviews) - 1)
print(reviews[index].text)
print(reviews[index].score)
print(reviews[index].sentiment)

I love it! rock and roll + time travel = Love. I really enjoyed Megan and Davy's built up to romance. I love the twist. Who hasn't dreamed about time travel where you find you true love.I received a copy of this book for a honest review.
5.0
POSITIVE


In [12]:
# Prepare the data

In [13]:
# train test split

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# train_test_split accepts a plain list (as here), a numpy array or a pandas DataFrame.
# 33% of the reviews are held out for testing; the fixed random_state keeps the
# split the same from run to run.
train, test = train_test_split(
    reviews,
    random_state=42,
    test_size=0.33,
)

In [16]:
# Sanity-check the split: container type first, then the train and test sizes
print(type(train))
print(len(train), len(test), sep="\n")

<class 'list'>
670
330


In [17]:
# Split features (review text) and targets (sentiment label) by hand
x_train = [review.text for review in train]
y_train = [review.sentiment for review in train]
x_test = [review.text for review in test]
y_test = [review.sentiment for review in test]

In [18]:
# Validate the x/y split: features and targets must pair up in both sets,
# then spot-check one random sample from each set.
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))
# The same index is used for both sets, so bound it by the smaller one rather
# than a hard-coded 299 (randint is inclusive on both ends).
index = random.randint(0, min(len(x_train), len(x_test)) - 1)
print(x_train[index], y_train[index])
print(x_test[index], y_test[index])

670 670
330 330
Since I have been on a Sandra Brown reading binge, I got this one from the library and I am certainly glad that I did not buy this one. The story is about a family in a small southern town called Heaven, Louisiana.The heroine is Schyler Crandall who has returned home to Heaven from London, where she escaped to after her sister, Tricia betrays her.  Schyler has come back to the family home named Belle Terre, an old southern home that she loves, because her father has had a major heart attack.  Since her father is in the hospital, Schyler takes on the family business.The book is heavy on very crass language and sex scenes.  If you are looking for a romance novel, then this book is hot what I would call romantic, The romantic involvement for Schyler is a Cajun who lives on the bayou, Cash Boudreaux.  He is as foul mouthed and crude as they come.  How any woman could find anything romantic about this man is beyound me.There are a number of other sex scenes with other charac

In [19]:
# Bag of words

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# Vectorize each review (bag of words).
# The vectorizer is fitted on the training texts only; the test texts are
# transformed later with this same fitted instance.
vectorizer = CountVectorizer() # initialize sklearn vectorizer with its default settings
vectorizer.fit(x_train) # pass in the list of training texts; as the cell's last expression, the fitted vectorizer is displayed

CountVectorizer()

In [22]:
# After fit, the vectorizer holds a dictionary of words; transform uses this
# dictionary to turn each sentence or paragraph into a vector of integers.
# NOTE(review): x_train is rebound from the list of raw texts to the transformed
# matrix, so running this cell a second time would feed the matrix back into
# transform -- re-run the x/y split cell first.
x_train = vectorizer.transform(x_train)

In [23]:
# Validate the vectorized training data
print(type(x_train), x_train.shape, sep="\n")
print(type(x_train.toarray()))  # .toarray() converts the scipy matrix to a numpy array
print(x_train[0])  # stored entries of the first review's row
x_train[0]

<class 'scipy.sparse.csr.csr_matrix'>
(670, 7372)
<class 'numpy.ndarray'>
  (0, 350)	2
  (0, 539)	1
  (0, 562)	1
  (0, 1148)	1
  (0, 1515)	1
  (0, 1558)	1
  (0, 1800)	1
  (0, 2007)	1
  (0, 2895)	1
  (0, 3054)	1
  (0, 3545)	1
  (0, 5197)	1
  (0, 6475)	1
  (0, 6593)	1
  (0, 6595)	1
  (0, 7086)	1
  (0, 7353)	1


<1x7372 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [24]:
# Models and classification
from sklearn.metrics import f1_score

In [25]:
# Prepare the test dataset: transform the test texts with the vectorizer that
# was fitted on the training texts.
# NOTE(review): x_test is rebound from the list of raw texts to the transformed
# matrix, so running this cell a second time would feed the matrix back into
# transform -- re-run the x/y split cell first.
x_test = vectorizer.transform(x_test)

In [26]:
# Model 1
# Support vector classifier with a linear kernel
from sklearn.svm import SVC

# C is the inverse of the regularization strength: smaller values mean
# stronger regularization.
linear_svm_model = SVC(C=0.5, kernel="linear").fit(x_train, y_train)

print("Training Accuracy = ", linear_svm_model.score(x_train, y_train))
print("Testing Accuracy = ", linear_svm_model.score(x_test, y_test))

# Classification F1 score.
# average=None returns one F1 score per class, in the order given by `labels`.
# The default average ('binary') only applies to binary targets, so a
# multiclass problem has to set `average` explicitly.
print(
    "Testing F1 Score = ",
    f1_score(
        y_true=y_test,
        y_pred=linear_svm_model.predict(x_test),
        labels=["POSITIVE", "NEUTRAL", "NEGATIVE"],
        average=None,
    ),
)

Training Accuracy =  0.9985074626865672
Testing Accuracy =  0.8242424242424242
Testing F1 Score =  [0.91289199 0.23728814 0.22222222]


In [27]:
# Model 2
# Support vector classifier with an RBF kernel
from sklearn.svm import SVC

# C is the inverse of the regularization strength: smaller values mean
# stronger regularization.
# gamma defaults to 'scale'; an explicit float is used here instead.
rbf_svm_model = SVC(C=11, gamma=0.005, kernel="rbf").fit(x_train, y_train)

print("Training Accuracy = ", rbf_svm_model.score(x_train, y_train))
print("Testing Accuracy = ", rbf_svm_model.score(x_test, y_test))

# Per-class F1 scores on the test set, ordered as in `labels`
print(
    "Testing F1 Score = ",
    f1_score(
        y_true=y_test,
        y_pred=rbf_svm_model.predict(x_test),
        labels=["POSITIVE", "NEUTRAL", "NEGATIVE"],
        average=None,
    ),
)

Training Accuracy =  0.991044776119403
Testing Accuracy =  0.8515151515151516
Testing F1 Score =  [0.92561983 0.05128205 0.        ]


In [28]:
# Model 3
# Support vector classifier with a polynomial kernel
from sklearn.svm import SVC

# C is the inverse of the regularization strength: smaller values mean
# stronger regularization.
# degree is the degree of the polynomial kernel (3 is also the default).
poly_svm_model = SVC(C=10000, degree=3, kernel="poly").fit(x_train, y_train)

print("Training Accuracy = ", poly_svm_model.score(x_train, y_train))
print("Testing Accuracy = ", poly_svm_model.score(x_test, y_test))

# Per-class F1 scores on the test set, ordered as in `labels`
print(
    "Testing F1 Score = ",
    f1_score(
        y_true=y_test,
        y_pred=poly_svm_model.predict(x_test),
        labels=["POSITIVE", "NEUTRAL", "NEGATIVE"],
        average=None,
    ),
)

Training Accuracy =  0.9925373134328358
Testing Accuracy =  0.7787878787878788
Testing F1 Score =  [0.88214286 0.22222222 0.14285714]


In [29]:
# Model 4
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# max_depth limits the depth of the tree (the default, None, leaves it unlimited).
# random_state is fixed (same seed as the train/test split) so that re-running
# the notebook reproduces the same tree and the same scores.
decision_tree_model = DecisionTreeClassifier(max_depth=5, random_state=42)
decision_tree_model.fit(x_train, y_train)

print("Training Accuracy = ", decision_tree_model.score(x_train, y_train))
print("Testing Accuracy = ", decision_tree_model.score(x_test, y_test))

# Per-class F1 scores on the test set, ordered as in `labels`
print("Testing F1 Score = ", f1_score(y_test, decision_tree_model.predict(x_test), average=None, labels=["POSITIVE", "NEUTRAL", "NEGATIVE"]))

Training Accuracy =  0.8731343283582089
Testing Accuracy =  0.8151515151515152
Testing F1 Score =  [0.89864865 0.13333333 0.        ]


In [30]:
# Helper: convert both feature matrices from scipy matrices to numpy arrays.
# Use the matrix's own .toarray() for this rather than np.array(matrix).
x_train, x_test = x_train.toarray(), x_test.toarray()

In [31]:
# Model 5
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

gaussian_nb_model = GaussianNB().fit(x_train, y_train)

print("Training Accuracy = ", gaussian_nb_model.score(x_train, y_train))
print("Testing Accuracy = ", gaussian_nb_model.score(x_test, y_test))

# Per-class F1 scores on the test set, ordered as in `labels`
print(
    "Testing F1 Score = ",
    f1_score(
        y_true=y_test,
        y_pred=gaussian_nb_model.predict(x_test),
        labels=["POSITIVE", "NEUTRAL", "NEGATIVE"],
        average=None,
    ),
)

Training Accuracy =  0.9776119402985075
Testing Accuracy =  0.8121212121212121
Testing F1 Score =  [0.89678511 0.08510638 0.09090909]


In [32]:
# Model 6
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# C is the inverse of the regularization strength and must be a positive float;
# as with support vector machines, smaller values mean stronger regularization.
# C=1 is the default; max_iter is raised from its default of 100.
logistic_reg_model = LogisticRegression(max_iter=1000, C=1).fit(x_train, y_train)

print("Training Accuracy = ", logistic_reg_model.score(x_train, y_train))
print("Testing Accuracy = ", logistic_reg_model.score(x_test, y_test))

# Per-class F1 scores on the test set, ordered as in `labels`
print(
    "Testing F1 Score = ",
    f1_score(
        y_true=y_test,
        y_pred=logistic_reg_model.predict(x_test),
        labels=["POSITIVE", "NEUTRAL", "NEGATIVE"],
        average=None,
    ),
)

Training Accuracy =  0.9985074626865672
Testing Accuracy =  0.8303030303030303
Testing F1 Score =  [0.91370558 0.12244898 0.1       ]


In [33]:
# Model 7
# K-nearest neighbours
from sklearn.neighbors import KNeighborsClassifier

# 3 neighbours instead of the default of 5
knn_model = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)

print("Training Accuracy = ", knn_model.score(x_train, y_train))
print("Testing Accuracy = ", knn_model.score(x_test, y_test))

# Per-class F1 scores on the test set, ordered as in `labels`
print(
    "Testing F1 Score = ",
    f1_score(
        y_true=y_test,
        y_pred=knn_model.predict(x_test),
        labels=["POSITIVE", "NEUTRAL", "NEGATIVE"],
        average=None,
    ),
)

Training Accuracy =  0.8686567164179104
Testing Accuracy =  0.7636363636363637
Testing F1 Score =  [0.87017544 0.07272727 0.11428571]


In [34]:
# Model 8
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# n_estimators - number of trees in the forest - default is 100
# max_depth - depth limit of each tree - default is None
# random_state is fixed (same seed as the train/test split) so that re-running
# the notebook reproduces the same forest and the same scores.
random_forest_model = RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42)
random_forest_model.fit(x_train, y_train)

print("Training Accuracy = ", random_forest_model.score(x_train, y_train))
print("Testing Accuracy = ", random_forest_model.score(x_test, y_test))

# Per-class F1 scores on the test set, ordered as in `labels`
print("Testing F1 Score = ", f1_score(y_test, random_forest_model.predict(x_test), average=None, labels=["POSITIVE", "NEUTRAL", "NEGATIVE"]))

Training Accuracy =  0.8253731343283582
Testing Accuracy =  0.8575757575757575
Testing F1 Score =  [0.9233279 0.        0.       ]


In [35]:
# Model 9
# AdaBoost classifier
from sklearn.ensemble import AdaBoostClassifier

# random_state is fixed (same seed as the train/test split) so that re-running
# the notebook reproduces the same ensemble and the same scores.
adaboost_model = AdaBoostClassifier(learning_rate=1, random_state=42)
adaboost_model.fit(x_train, y_train)

print("Training Accuracy = ", adaboost_model.score(x_train, y_train))
print("Testing Accuracy = ", adaboost_model.score(x_test, y_test))

# Per-class F1 scores on the test set, ordered as in `labels`
print("Testing F1 Score = ", f1_score(y_test, adaboost_model.predict(x_test), average=None, labels=["POSITIVE", "NEUTRAL", "NEGATIVE"]))

Training Accuracy =  0.8552238805970149
Testing Accuracy =  0.8121212121212121
Testing F1 Score =  [0.89761092 0.1509434  0.0952381 ]


In [36]:
# Model 10
# Gaussian Process Classifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# The kernel is an RBF kernel (length scale 1.0) scaled by a constant factor of 1.0
gauss_process_model = GaussianProcessClassifier(
    kernel=1.0 * RBF(length_scale=1.0)
).fit(x_train, y_train)

print("Training Accuracy = ", gauss_process_model.score(x_train, y_train))
print("Testing Accuracy = ", gauss_process_model.score(x_test, y_test))

# Per-class F1 scores on the test set, ordered as in `labels`
print(
    "Testing F1 Score = ",
    f1_score(
        y_true=y_test,
        y_pred=gauss_process_model.predict(x_test),
        labels=["POSITIVE", "NEUTRAL", "NEGATIVE"],
        average=None,
    ),
)



Training Accuracy =  0.8238805970149253
Testing Accuracy =  0.8575757575757575
Testing F1 Score =  [0.9233279 0.        0.       ]


In [37]:
# Model 11
# Quadratic Discriminant Analysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

quadratic_discriminant_model = QuadraticDiscriminantAnalysis().fit(x_train, y_train)

print("Training Accuracy = ", quadratic_discriminant_model.score(x_train, y_train))
print("Testing Accuracy = ", quadratic_discriminant_model.score(x_test, y_test))

# Per-class F1 scores on the test set, ordered as in `labels`
print(
    "Testing F1 Score = ",
    f1_score(
        y_true=y_test,
        y_pred=quadratic_discriminant_model.predict(x_test),
        labels=["POSITIVE", "NEUTRAL", "NEGATIVE"],
        average=None,
    ),
)



Training Accuracy =  1.0
Testing Accuracy =  0.8333333333333334
Testing F1 Score =  [0.90878939 0.05128205 0.        ]


In [38]:
# Model 12
# Neural Network in sklearn
from sklearn.neural_network import MLPClassifier

# max_iter default is 200
# alpha default = 0.0001
# random_state is fixed (same seed as the train/test split) so that re-running
# the notebook reproduces the same trained network and the same scores.
nn_model = MLPClassifier(alpha=0.01, max_iter=1000, random_state=42)
nn_model.fit(x_train, y_train)

print("Training Accuracy = ", nn_model.score(x_train, y_train))
print("Testing Accuracy = ", nn_model.score(x_test, y_test))

# Per-class F1 scores on the test set, ordered as in `labels`
print("Testing F1 Score = ", f1_score(y_test, nn_model.predict(x_test), average=None, labels=["POSITIVE", "NEUTRAL", "NEGATIVE"]))

Training Accuracy =  1.0
Testing Accuracy =  0.8636363636363636
Testing F1 Score =  [0.92763158 0.11111111 0.125     ]


In [39]:
# Model 13
# Gradient Boost Classifier
from sklearn.ensemble import GradientBoostingClassifier

# random_state is fixed (same seed as the train/test split) so that re-running
# the notebook reproduces the same ensemble and the same scores.
gradient_boost_model = GradientBoostingClassifier(learning_rate=0.1, max_depth=4, random_state=42)
gradient_boost_model.fit(x_train, y_train)

print("Training Accuracy = ", gradient_boost_model.score(x_train, y_train))
print("Testing Accuracy = ", gradient_boost_model.score(x_test, y_test))

# Per-class F1 scores on the test set, ordered as in `labels`
print("Testing F1 Score = ", f1_score(y_test, gradient_boost_model.predict(x_test), average=None, labels=["POSITIVE", "NEUTRAL", "NEGATIVE"]))

Training Accuracy =  0.9865671641791045
Testing Accuracy =  0.8272727272727273
Testing F1 Score =  [0.91 0.   0.  ]


In [40]:
# make training data diverse
# Balance dataset

In [41]:
# Distinct sentiment labels present in the training targets
pd.Series(y_train).unique()

array(['POSITIVE', 'NEGATIVE', 'NEUTRAL'], dtype=object)

In [42]:
# Number of training samples per sentiment label -- shows how imbalanced the classes are
pd.DataFrame(y_train).value_counts()

POSITIVE    552
NEUTRAL      71
NEGATIVE     47
dtype: int64

In [43]:
# Total number of (non-null) training labels
pd.Series(y_train).count()

670