# 1 - Preparing data

In [34]:
import pandas as pd
from sklearn.utils import shuffle

In [35]:
# Load the line-delimited JSON file of book reviews (one record per line).
books_data = pd.read_json("./data/Books_small_10000.json", lines=True)

In [36]:
# Preview the first rows of the loaded reviews.
books_data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1F2H80A1ZNN1N,B00GDM3NQC,Connie Correll,"[0, 0]","I bought both boxed sets, books 1-5. Really a...",5,Can't stop reading!,1390435200,"01 23, 2014"
1,AI3DRTKCSK4KX,B00A5MREAM,Grandma,"[0, 0]",I enjoyed this short book. But it was way way ...,3,A leaf on the wind of all hallows,1399593600,"05 9, 2014"
2,A3KAKFHY9DAC8A,0446547573,"toobusyreading ""Inspired Kathy""","[1, 1]",I love Nicholas Sparks. I&#8217;ve read everyt...,4,Great writing from Nicholas Sparks.,1404518400,"07 5, 2014"
3,ATYBCYD6BIXVL,0955809215,Chrissie,"[0, 0]",I really enjoyed this adventure and look forwa...,4,great,1389225600,"01 9, 2014"
4,A17K95SEU3J68U,0991500776,"Sirde ""artist761""","[0, 0]",It was a decent read.. typical story line. Not...,3,It was a decent read.. typical story line ...,1404864000,"07 9, 2014"


In [37]:
# (rows, columns) of the loaded reviews frame.
books_data.shape

(10000, 9)

In [38]:
def get_sentiment(data):
    """Translate a numeric rating into a sentiment label.

    Parameters
    ----------
    data : number
        The rating to classify.

    Returns
    -------
    str
        'NEGATIVE' when ``data <= 2``, 'NEUTRAL' when ``data == 3``,
        and 'POSITIVE' for every other value.
    """
    # Guard clauses: the first matching rule wins, anything left is positive.
    if data <= 2:
        return 'NEGATIVE'
    if data == 3:
        return 'NEUTRAL'
    return 'POSITIVE'

In [39]:
# Derive a sentiment label for every review from its 'overall' rating.
books_data['sentiment'] = books_data['overall'].map(get_sentiment)

In [40]:
# Show the first 10 rows, now including the derived 'sentiment' column.
books_data.head(10)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentiment
0,A1F2H80A1ZNN1N,B00GDM3NQC,Connie Correll,"[0, 0]","I bought both boxed sets, books 1-5. Really a...",5,Can't stop reading!,1390435200,"01 23, 2014",POSITIVE
1,AI3DRTKCSK4KX,B00A5MREAM,Grandma,"[0, 0]",I enjoyed this short book. But it was way way ...,3,A leaf on the wind of all hallows,1399593600,"05 9, 2014",NEUTRAL
2,A3KAKFHY9DAC8A,0446547573,"toobusyreading ""Inspired Kathy""","[1, 1]",I love Nicholas Sparks. I&#8217;ve read everyt...,4,Great writing from Nicholas Sparks.,1404518400,"07 5, 2014",POSITIVE
3,ATYBCYD6BIXVL,0955809215,Chrissie,"[0, 0]",I really enjoyed this adventure and look forwa...,4,great,1389225600,"01 9, 2014",POSITIVE
4,A17K95SEU3J68U,0991500776,"Sirde ""artist761""","[0, 0]",It was a decent read.. typical story line. Not...,3,It was a decent read.. typical story line ...,1404864000,"07 9, 2014",NEUTRAL
5,AAPAK68BFPHA1,B00HPHL9OW,Michelle Dodd,"[0, 0]",I hoped for Mia to have some peace in this boo...,5,Absolutely compelling book I felt Mia's Story ...,1389830400,"01 16, 2014",POSITIVE
6,A1ETRQ3VMHZYLK,0307451305,Kahoutek,"[0, 0]",The book has the fevered intensity of Oliver S...,2,A Fevered Mess,1394668800,"03 13, 2014",NEGATIVE
7,A3LVKEP5T8HO6D,0843960221,Beck,"[0, 0]","This is the First book in the Trilogy, and I'm...",5,The Road to a Hanging,1399075200,"05 3, 2014",POSITIVE
8,A14PRVP4JK88E7,B00JKW55EA,Rafase282,"[0, 0]",After reading the other book about introvert p...,5,The follow up!,1397606400,"04 16, 2014",POSITIVE
9,A3SOAJ7WJFHVFN,1493770691,Sara S.,"[2, 2]","I really, really like this book because the he...",5,Decency prevails!,1397779200,"04 18, 2014",POSITIVE


## Imbalanced data 

In [41]:
# Count reviews per sentiment label to see how balanced the classes are.
books_data['sentiment'].value_counts()

POSITIVE    8378
NEUTRAL      978
NEGATIVE     644
Name: sentiment, dtype: int64

In [42]:
# Balance the two classes by undersampling: keep every NEGATIVE review and
# the same number of POSITIVE reviews; NEUTRAL reviews are dropped.
positive_reviews = books_data[books_data['sentiment'] == 'POSITIVE']
negative_reviews = books_data[books_data['sentiment'] == 'NEGATIVE']
size_negative = negative_reviews.shape[0]

# Take the first `size_negative` positive rows so both classes have equal size.
new_positive_reviews = positive_reviews.iloc[:size_negative, :]
reviews = pd.concat([negative_reviews, new_positive_reviews], axis=0)

# Seed the shuffle: without a fixed random_state the row order (and therefore
# the train/test split and every metric below) changes on each run.
books_data = shuffle(reviews, random_state=0)

In [43]:
# Re-count the labels after undersampling.
books_data['sentiment'].value_counts()

POSITIVE    644
NEGATIVE    644
Name: sentiment, dtype: int64

# 2 - Train_test split

In [44]:
# Features are the raw review texts; targets are the sentiment labels.
X, y = books_data['reviewText'], books_data['sentiment']

In [45]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed seed pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [46]:
# Print the first element of each split, in the order test X/y then train X/y.
for split in (X_test, y_test, X_train, y_train):
    print(split.head(1))

314    This was a book that was hard to put down.  It...
Name: reviewText, dtype: object
314    POSITIVE
Name: sentiment, dtype: object
100    Good follow up to the original.  A little fast...
Name: reviewText, dtype: object
100    POSITIVE
Name: sentiment, dtype: object


In [47]:
# Label distribution in the training split.
y_train.value_counts()

POSITIVE    439
NEGATIVE    423
Name: sentiment, dtype: int64

In [48]:
# Label distribution in the test split.
y_test.value_counts()

NEGATIVE    221
POSITIVE    205
Name: sentiment, dtype: int64

# 3 - Bag of Words 

### Each distinct word in the input becomes a column of the matrix, each phrase becomes a row, and each value is the number of times that word appears in that phrase

In [49]:
from sklearn.feature_extraction.text import CountVectorizer

In [50]:
# Learn the vocabulary from the training texts only and turn them into
# a document-term count matrix.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)

In [51]:
# Encode the test texts with the vocabulary fitted on the training texts.
X_test_vectorized = vectorizer.transform(X_test)

In [86]:
# Expose the count matrices as dense DataFrames with one column per vocabulary
# word.
#
# The frames are rebuilt from `vectorizer.transform(...)` rather than from the
# previous value of X_*_vectorized, so the cell can be re-run safely: the old
# form called `.toarray()` on its own output and failed on a second execution.
#
# `get_feature_names()` was removed from newer scikit-learn releases in favour
# of `get_feature_names_out()`; use whichever the installed version provides.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

X_train_vectorized = pd.DataFrame(
    data=vectorizer.transform(X_train).toarray(),
    columns=feature_names,
)
X_test_vectorized = pd.DataFrame(
    data=vectorizer.transform(X_test).toarray(),
    columns=feature_names,
)

# 4 - Classifiers

In [81]:
import numpy as np
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC,SVC

In [53]:
 dict_classifiers = {
     # Display name -> unfitted estimator. Seeds are fixed wherever the
     # estimator accepts one.
     "Random Forest": RandomForestClassifier(random_state=0, n_jobs=3),
     "Nearest Neighborns": KNeighborsClassifier(n_jobs=3),
     "Logistic Regression": LogisticRegression(solver="liblinear", random_state=0),
     "Gradient Boosting": GradientBoostingClassifier(random_state=0),
     "LinearSVC": LinearSVC(random_state=0),
     "SVC": SVC(random_state=0),
 }

In [54]:
# Parallel lists: display names and their estimators, in dictionary order.
classifiers_names = [name for name in dict_classifiers]
classifiers_values = [dict_classifiers[name] for name in classifiers_names]


In [55]:
def train_model(model):
    """Fit ``model`` on the training matrix, report test metrics, and time it.

    The model is fitted on ``X_train_vectorized`` / ``y_train``, used to predict
    ``X_test_vectorized``, and the resulting classification report is printed.

    Parameters
    ----------
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    float
        Wall-clock seconds spent fitting and predicting.
    """
    # Local import keeps this cell self-contained.
    import time

    # Previously this function returned the global `times` list instead of a
    # measurement, so callers appending the result got the list nested in itself.
    start = time.perf_counter()
    model.fit(X_train_vectorized, y_train)
    y_test_pred = model.predict(X_test_vectorized)
    elapsed = time.perf_counter() - start

    print(classification_report(y_test, y_test_pred))
    print('\n')
    return elapsed

In [56]:
# Accumulators for per-classifier results.
times, precs = [], []

# Train and report every classifier in turn, under a banner with its name,
# collecting whatever train_model returns.
for key, value in zip(classifiers_names, classifiers_values):
    print(f'------------------{key}------------------')
    Time = train_model(value)
    times.append(Time)

------------------Random Forest------------------
              precision    recall  f1-score   support

    NEGATIVE       0.82      0.84      0.83       221
    POSITIVE       0.82      0.80      0.81       205

    accuracy                           0.82       426
   macro avg       0.82      0.82      0.82       426
weighted avg       0.82      0.82      0.82       426



------------------Nearest Neighborns------------------
              precision    recall  f1-score   support

    NEGATIVE       0.70      0.57      0.62       221
    POSITIVE       0.61      0.74      0.67       205

    accuracy                           0.65       426
   macro avg       0.65      0.65      0.65       426
weighted avg       0.66      0.65      0.65       426



------------------Logistic Regression------------------
              precision    recall  f1-score   support

    NEGATIVE       0.85      0.86      0.85       221
    POSITIVE       0.84      0.84      0.84       205

    accuracy     

In [59]:
# Baseline logistic regression, fitted on the training matrix.
model = LogisticRegression(solver="liblinear", random_state=0)
model.fit(X_train_vectorized, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

# 5 - Grid search

In [62]:
from sklearn.model_selection import GridSearchCV

In [64]:
# Candidate hyper-parameters: regularisation type and values of C.
penalty = ['l1', 'l2']
C = [1, 10, 100, 1000]

parameters = {'penalty': penalty, 'C': C}

# Exhaustive search over the grid with 3-fold cross-validation on the
# training data, using 3 parallel workers.
grid = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    cv=3,
    verbose=2,
    n_jobs=3,
)
grid.fit(X_train_vectorized, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  24 out of  24 | elapsed:    0.9s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=0, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=3,
             param_grid={'C': [1, 10, 100, 1000], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [65]:
# With refit=True (the setting shown in the GridSearchCV repr above),
# `best_estimator_` has already been refitted on the training data passed to
# `grid.fit`, so fitting it again on that same data is redundant work.
model = grid.best_estimator_
model

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

# 6 - Evaluation

In [84]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns

In [87]:
# 5-fold cross-validated accuracy over the train and test rows stacked together.
X_all = pd.concat([X_train_vectorized, X_test_vectorized])
y_all = pd.concat([y_train, y_test])
scores = cross_val_score(model, X_all, y_all, cv=5, scoring='accuracy')
scores

array([0.79844961, 0.80620155, 0.8372093 , 0.82490272, 0.85214008])

In [66]:
# Predict the held-out test set and print per-class precision/recall/F1.
y_test_pred = model.predict(X_test_vectorized)
report = classification_report(y_test, y_test_pred)
print(report)

              precision    recall  f1-score   support

    NEGATIVE       0.83      0.84      0.84       221
    POSITIVE       0.82      0.82      0.82       205

    accuracy                           0.83       426
   macro avg       0.83      0.83      0.83       426
weighted avg       0.83      0.83      0.83       426



In [76]:
# Confusion matrix of true test labels against the model's predictions.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_test_pred)

In [79]:
# Express each confusion-matrix cell as a share of all test samples.
n_test = y_test.shape[0]
for label, count in [
    ('Prop. of True Negatives = ', conf_matrix[0, 0]),
    ('Prop. of True Positives = ', conf_matrix[1, 1]),
    ('Prop. of False Positives - Type I Error = ', conf_matrix[0, 1]),
    ('Prop. of False Negatives - Type II Error = ', conf_matrix[1, 0]),
]:
    print(label, round((count / n_test), 3))

Prop. of True Negatives =  0.434
Prop. of True Positives =  0.394
Prop. of False Positives - Type I Error =  0.085
Prop. of False Negatives - Type II Error =  0.087


In [67]:
# Hand-written reviews for a quick sanity check of the tuned model.
test = [
    'Very good book ',
    'Wonderful',
    'Waste of time',
    'great',
    'I really enjoyed the experience with this book, it has give me many thoughtful moments',
]
# Encode them with the vocabulary fitted earlier, then predict their labels.
vetorized_test = vectorizer.transform(test)
model.predict(vetorized_test)

array(['POSITIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE'],
      dtype=object)

# Not the best but quite good :)