In [120]:
import spacy
import numpy as np
import pandas as pd
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
nlp = spacy.load("en_core_web_sm")

In [2]:
data=pd.read_csv('review.csv')
data.head()

Unnamed: 0,reviewTime,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime
0,2014-05-21,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400
1,2014-01-14,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600
2,2014-06-26,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800
3,2014-04-03,A8AJS1DW7L3JJ,3998899561,Agata Majchrzak,"[1, 1]",This is a fantastic case. Very stylish and pro...,5,Perfect Case,1396483200
4,2014-04-13,A2YO4SCWAWNYBI,3998899561,Alex Maslakov,"[0, 0]",this case fits perfectly on the s4 and keeps m...,5,Just what I needed,1397347200


In [3]:
data.isnull().sum()

reviewTime          0
reviewerID          0
asin                0
reviewerName      714
helpful             0
reviewText         31
overall             0
summary             0
unixReviewTime      0
dtype: int64

In [4]:
data.shape

(55045, 9)

In [5]:
data = data.dropna(axis=0, subset=['reviewText'])

In [6]:
def preprocess(string):
    doc = nlp(string)
    lemma = [token.lemma_ for token in doc if token.lemma_.isalpha() or token.lemma_ not in STOP_WORDS]
    return ' '.join(lemma)

In [7]:
data['prepro'] = data['reviewText'].apply(preprocess)
data['sentiment'] = data['overall'].map({1:0, 2:0, 3:1, 4:1, 5:1})

In [9]:
X, y = data['prepro'], data['sentiment']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [11]:
vector = TfidfVectorizer()

In [12]:
X_vtrain = vector.fit_transform(X_train)
X_vtest = vector.transform(X_test)

In [17]:
nb = MultinomialNB()
lr = LogisticRegression(solver='liblinear')

In [18]:
nb.fit(X_vtrain, y_train)
lr.fit(X_vtrain, y_train)

In [19]:
y_nb = nb.predict(X_vtest)
y_lr = lr.predict(X_vtest)

In [136]:
print('NB: {}'.format(accuracy_score(y_test, y_nb)))
print(f'LR: {accuracy_score(y_test, y_lr)}')
print("F1F1F1F1F1")
print('NB: {}'.format(f1_score(y_test, y_nb)))
print(f'LR: {f1_score(y_test, y_lr)}')
print("ROC_AUC_SCORE")
print('NB: {}'.format(roc_auc_score(y_test, nb.predict_proba(X_vtest)[:,1])))
print(f'LR: {roc_auc_score(y_test, lr.predict_proba(X_vtest)[:,1])}')
print("Confusion Matrix")
print('NB: {}'.format(confusion_matrix(y_test, y_nb).T))
print(f'LR: {confusion_matrix(y_test, y_lr).T}')

NB: 0.8856675452149414
LR: 0.9190220848859402
F1F1F1F1F1
NB: 0.9393559583494022
LR: 0.9556385362210605
ROC_AUC_SCORE
NB: 0.8235143587011595
LR: 0.9261963819364003
Confusion Matrix
NB: [[   2    0]
 [1258 9743]]
LR: [[ 515  146]
 [ 745 9597]]


In [166]:
sent = "I don't like it, it normally hangs when I'm doing big things with it and that drives me crazy, though it seemed to be the when it was produced"

In [167]:
sent = preprocess(sent)

In [168]:
sent = vector.transform([sent])

In [169]:
lr.predict(sent)

array([1], dtype=int64)

In [165]:
nb.predict(sent)

array([1], dtype=int64)