In [1]:
from itertools import islice

import numpy as np
import pandas as pd
import json
import nltk
from nltk.stem import WordNetLemmatizer

In [2]:
from sklearn.model_selection import train_test_split # function for splitting data to train and test sets

from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier

import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

In [14]:
import re
import nltk
from nltk.stem import WordNetLemmatizer

REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)|(=)")

def remove_punc(line):
    review = REPLACE_NO_SPACE.sub("", line.lower())
    review = REPLACE_WITH_SPACE.sub(" ", review)
    return review

def lemmatize(line):
    lemmatizer = WordNetLemmatizer()
    line = remove_punc(line)
    return " ".join([lemmatizer.lemmatize(word, pos='v') for word in nltk.word_tokenize(line)])


def preprocess_review(review):
    lines = [x for x in nltk.sent_tokenize(review) if len(x)>1]
    return ". ".join([lemmatize(line) for line in lines])
    
    
preprocess_review("Ribs = amazing\n2 hour wait time= not so amazing, but understandable. \n\nThis place would get 5 stars if they expanded their BBQ restaurant. Their ribs are AMAZING. You get SO much food for the price and it tastes sooo good. Plus, the two hour wait isnt always a bad thing because it gives you an excuse to drink and gamble while you wait!")
    


'rib amaze 2 hour wait time not so amaze but understandable.this place would get 5 star if they expand their bbq restaurant.their rib be amaze.you get so much food for the price and it taste sooo good.plus the two hour wait isnt always a bad thing because it give you an excuse to drink and gamble while you wait'

In [4]:
file = 'processed_reviews.json'
with open(file,encoding="utf-8") as f:
    data = [next(f).strip() for x in range(100000)]

positive=[]
negative=[]

test=[]
   
for x in data[300:]:
    x = json.loads(x)
    stars = x["stars"]
    if stars<2:
        #negative.extend([y for y in x["text"].split(".")])
        negative.append(x["text"])
    if stars>4:
        positive.append(x["text"])
        
total = len(negative)+len(positive)
print(len(negative),len(positive))

15231 40022


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

train_data = []
train_data.extend(positive[:12500])
train_data.extend(negative[:12500])

cv = CountVectorizer(binary=True)
cv.fit(train_data)
X = cv.transform(train_data)
X_test = cv.transform(test)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

target = [5 if i < 12500 else 1 for i in range(25000)]

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for each sentence C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.94736
Accuracy for C=0.05: 0.95728
Accuracy for C=0.25: 0.9584
Accuracy for C=0.5: 0.95856
Accuracy for C=1: 0.95904


In [10]:
final_model = LogisticRegression(C=0.5)
final_model.fit(X, target)

x_input=[]
x_val = []
for x in data[:300]:
    x = json.loads(x)
    if x["stars"] in [1,5]:
        new_data = [y for y in x["text"].split(".")]
        x_input.extend(new_data)
        x_val.extend([int(x["stars"])]*len(new_data))

X_test = cv.transform(x_input)
print ("Accuracy : %s" 
       % ( accuracy_score(x_val, final_model.predict(X_test))))
    

Accuracy for each sentence: 0.760425909494


In [11]:

feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names(), final_model.coef_[0]
    )
}
print(len(feature_to_coef))
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:10]:
    print (best_positive)
    
    
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:10]:
    print (best_negative)
    

36939
('delicious', 2.7752735465862237)
('excellent', 2.384848753208368)
('amaze', 2.345414104551343)
('fantastic', 2.0501342303566248)
('best', 2.0444112550176241)
('great', 2.0015307342766238)
('awesome', 1.9769120363137864)
('perfect', 1.9321084525477419)
('love', 1.5856131545219361)
('wonderful', 1.4702294543982148)
('worst', -3.2027793816067187)
('horrible', -2.6572909207199409)
('terrible', -2.6475747502441789)
('awful', -2.2326144417316396)
('mediocre', -2.2276312441795754)
('overprice', -2.1890423835888493)
('bland', -2.1294695601694147)
('rude', -2.0507657285868768)
('poor', -1.842755937210296)
('disappointment', -1.7873470950430868)
