In [24]:
class Sentiment:
    """Namespace holding the string labels used as classification targets."""

    # Each constant's value is its own name, so labels read naturally in output.
    NEGATIVE, NEUTRAL, POSITIVE = "NEGATIVE", "NEUTRAL", "POSITIVE"

class Review:
    """One review: its text, its numeric score, and the label derived from it.

    Attributes (all set in ``__init__``):
        text: the review body, stored as passed in.
        score: the numeric rating, stored as passed in.
        sentiment: a ``Sentiment`` label computed from ``score``.
    """

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Computed eagerly so callers can read ``.sentiment`` as a plain attribute.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the ``Sentiment`` label for ``self.score``.

        score <= 2       -> Sentiment.NEGATIVE
        2 < score <= 3   -> Sentiment.NEUTRAL
        anything higher  -> Sentiment.POSITIVE
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        # ``<= 3`` rather than ``== 3``: a fractional score such as 2.5 must
        # not fall through to POSITIVE.
        if self.score <= 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE

### Load Data

In [25]:
import json

file_name = './data/sentiment/books_small.json'

# The file is read one line at a time and each line is parsed as its own JSON
# object. One Review is built per record from its 'reviewText' and 'overall'
# fields.
reviews = []
# Pass the encoding explicitly so decoding does not depend on the platform's
# locale default (JSON text is UTF-8).
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # A blank line (e.g. a trailing newline) is not a record.
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
# Spot-check: the score of one loaded review.
reviews[5].score

4.0

### Prep Data

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Hold out 33% of the reviews for testing; random_state is fixed so the split
# is the same on every run. The review text is the model input (X) and the
# sentiment label (NEGATIVE, NEUTRAL or POSITIVE) is the target (y).
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

In [28]:
# Training split: raw review text as input, sentiment label as target.
train_x = [sample.text for sample in training]
train_y = [sample.sentiment for sample in training]

# Test split, unpacked the same way.
test_x = [sample.text for sample in test]
test_y = [sample.sentiment for sample in test]

In [29]:
# Spot-check the raw text of one training review.
train_x[5]

"I was pleasantly surprised with this book. Very well written. Fast paced..with more than kne story line yet not too many characters  that I couldn't  keep  up  with the plot. I will definitely be reading more of this author. I won't give away  the ending...but was surprised..and that was fun. Thank you, Simon Gould."

## Bag of words vectorization

In [31]:
from sklearn.feature_extraction.text import CountVectorizer 

In [40]:
# Bag-of-words: each review becomes a vector of token counts.
vectorizer = CountVectorizer()
# Fit the vocabulary on the training texts only...
train_x_vectors = vectorizer.fit_transform(train_x)
# ...and reuse that fitted vocabulary to encode the test texts.
test_x_vectors = vectorizer.transform(test_x)

## Classification 

### SVM

In [57]:
from sklearn import svm

# Linear-kernel support vector classifier trained on the bag-of-words counts.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

# Predict the label of the first test review (its text is test_x[0] and its
# true label is test_y[0]). Only the last expression of a cell is displayed,
# so the prediction is kept as the sole trailing expression.
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [55]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree trained on the bag-of-words counts. random_state is pinned
# because tree fitting involves randomness; without a seed the fitted tree,
# and therefore its predictions, may differ between runs.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

# Predict the label of the first test review.
clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Naive Bayes

In [60]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes trained on a dense copy of the bag-of-words counts.
# .toarray() yields a plain ndarray; .todense() would yield np.matrix, which
# current scikit-learn rejects.
clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.toarray(), train_y)

# Slice the single row first, then densify, so the whole test matrix is not
# converted just to predict one review. The result is still 2-D (1 x n_features).
clf_gnb.predict(test_x_vectors[0].toarray())

array(['POSITIVE'], dtype='<U8')

### Logistic Regression 

In [61]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the bag-of-words counts.
# NOTE(review): the default iteration limit may be too low for the solver to
# converge on this data; check for a ConvergenceWarning when fitting.
clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

# Predict the label of the first test review.
clf_log.predict(test_x_vectors[0])



array(['POSITIVE'], dtype='<U8')

## Evaluation