In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

from collections import defaultdict

In [2]:
import nltk
import random
#from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [3]:
def read_files(parent_dir='../Data Sets/'):
    """Read the project's CSV data sets into a dict of DataFrames.

    Parameters
    ----------
    parent_dir : str, optional
        Directory that contains the CSV files. Defaults to '../Data Sets/'.

    Returns
    -------
    collections.defaultdict
        Maps 'B', 'C', 'R', 'T' and 'U' to the frames read from
        business_s.csv, checkin.csv, review_s.csv, tip.csv and user.csv,
        each with its 'Unnamed: 0' column removed.

    Raises
    ------
    KeyError
        If one of the files has no 'Unnamed: 0' column.
    """
    d = defaultdict(list)

    path_dic = {'B': 'business_s.csv', 'C': 'checkin.csv', 'R': 'review_s.csv',
                'T': 'tip.csv', 'U': 'user.csv'}

    for key, filename in path_dic.items():
        frame = pd.read_csv(os.path.join(parent_dir, filename))
        # Keyword form on purpose: the positional `axis` argument of
        # DataFrame.drop (the old `.drop(label, 1)`) was removed in pandas 2.0.
        d[key] = frame.drop(columns='Unnamed: 0')
    return d
def show():
    """Apply seaborn's ``despine()`` to the current plot, then display it with ``plt.show()``."""
    sns.despine()
    plt.show()
    
def clean_format(w):
    """Return ``w`` lower-cased with every '.', ',' and '!' removed."""
    # A single translation pass deletes the three punctuation characters.
    strip_table = str.maketrans('', '', '.,!')
    return w.lower().translate(strip_table)
                    


In [4]:
# Load every CSV once; `d` maps a short key ('B', 'C', 'R', 'T', 'U') to its DataFrame.
d = read_files()

In [5]:
# Join each review to its business, keep only rows with a known `is_open`,
# and give the two `stars` columns produced by the merge unambiguous names.
d['RB'] = (
    d['R']
    .merge(d['B'], how='inner', on='business_id')
    .dropna(subset=['is_open'])
    .rename(columns={'stars_x': 'review_star', 'stars_y': 'buz_star'})
)
d['RB'].head(1)

Unnamed: 0,business_id,cool,date,funny,review_id,review_star,text,useful,user_id,address,...,latitude,longitude,name,neighborhood,postal_code,review_count,buz_star,state,price,credit_card
0,fjMXGgOr3aCxnN48kovZ_Q,0,2015-03-09,0.0,3BBCHVND9tDPNliTFoLCHA,5.0,We recently decided to give this place another...,0.0,bCrpStRCku_gEX3Iwuv94A,5051 W Craig Rd,...,36.238959,-115.211568,Craig Road Animal Hospital,Northwest,89130,192,4.0,NV,,


In [6]:
# List the column names available after the merge.
print(d['RB'].columns.tolist())

['business_id', 'cool', 'date', 'funny', 'review_id', 'review_star', 'text', 'useful', 'user_id', 'address', 'attributes', 'categories', 'city', 'hours', 'is_open', 'latitude', 'longitude', 'name', 'neighborhood', 'postal_code', 'review_count', 'buz_star', 'state', 'price', 'credit_card']


# Data preparation

In [18]:
# Binary target: reviews with 4 or more stars are positive (1), everything else
# that remains is negative (0); 3-star reviews are dropped first.
# .copy() makes the filtered frame independent of d['RB'], so adding the 'pos'
# column no longer raises pandas' SettingWithCopyWarning; reset_index(drop=True)
# renumbers the rows 0..n-1.
df = d['RB'][d['RB']['review_star'] != 3].copy().reset_index(drop=True)
df['pos'] = np.where(df['review_star'] >= 4, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [19]:
# Class balance check: share of reviews labelled positive.
positive_rate = df['pos'].mean()
print("Positive rate: ", positive_rate)

Positive rate:  0.7429245283018868


In [20]:
# Stem every whitespace-separated token of each review.
# Series.map goes row by row, so it does not depend on the index being exactly
# 0..n-1 (the previous df['text'][i] lookups did), and .assign builds a new
# frame instead of writing into a possible slice, which avoids the
# SettingWithCopyWarning.
# NOTE(review): tokens keep any attached punctuation (e.g. "amazing!"), which is
# likely why both 'amaz' and 'amazing' appear as features further down —
# consider tokenizing before stemming.
ps = PorterStemmer()
df = df.assign(
    text=df['text'].map(lambda review: ' '.join(ps.stem(w) for w in review.split()))
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [21]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection (available since 0.18).
from sklearn.model_selection import train_test_split

# Keep 3000/3816 of the rows for training; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['pos'], train_size = 3000/3816, random_state = 47)

In [22]:
# Peek at one stemmed training document and confirm the split sizes.
# NOTE(review): position 1 is the second row of X_train, although the label says "First".
sample_review = X_train.iloc[1]
print("First review: ", sample_review)
print()
for label, part in (("Shape of X_train: ", X_train), ("Shape of X_test: ", X_test)):
    print(label, part.shape)

First review:  I find there are veri few establish that do what I would realli call 'good business' still left in the world. I'm sure there are plenti I just run in to them everi onc in a rare while. The Keg is one of them. Two stories: The first time I went here my friend and I just want a good steak so we search out yelp and found The Keg wa close. Right off the bat I notic the atmospher wa pretti nice, noth over the top, but enjoyable. I also notic that the wait staff knew their shit. Our waitress explain all the special and recommend her favorit without hesitation. The real highlight wa befor our food came when the manag came over and ask us if it wa our first time. He then brought us a shrimp cocktail on the hous to kick off our meal. The second visit wa for a friend' birthday. We order a round of chocol cake shot and the manag show up to mix them himself and bought us our first round. He also gave us each recip to take home with us. The food on both occas wa great, I haven't had 

# Count Vectorizer

In [75]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the bag-of-words vocabulary from the training reviews only.
vect = CountVectorizer()
vect.fit(X_train)

# Show every 2000th vocabulary term as a quick sanity check.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); switch when upgrading.
vocabulary_terms = vect.get_feature_names()
vocabulary_terms[::2000]

['00', 'bundt', 'domest', 'helen', 'middl', 'pumps', 'stepchild', 'worthi']

In [76]:
# The fitted vocabulary has one entry per feature, so its size is the feature count.
n_features = len(vect.vocabulary_)
print("Total number of features: ", n_features)

Total number of features:  14187


In [77]:
# Encode the training reviews with the fitted vocabulary; the result is a
# sparse document-term matrix (one row per review, one column per term).
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

<3000x14187 sparse matrix of type '<class 'numpy.int64'>'
	with 218054 stored elements in Compressed Sparse Row format>

In [78]:
# Baseline model: logistic regression with default settings on the count features.
# fit() returns the estimator itself, so the fitted model can be bound directly.
clf = LogisticRegression().fit(X_train_vectorized, y_train)
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [79]:
from sklearn.metrics import confusion_matrix, roc_auc_score

# Score the held-out reviews using the vocabulary learned on the training set.
pred = clf.predict(vect.transform(X_test))

# Build the confusion matrix once and reuse it; its diagonal counts the correct predictions.
cm = confusion_matrix(y_true = y_test, y_pred = pred)
print(cm)
print("Test accuracy: ", (cm[0][0] + cm[1][1])/len(X_test))
# NOTE(review): this AUC is computed from hard 0/1 predictions rather than from
# decision scores — confirm that is intended.
print('AUC: ', roc_auc_score(y_test, pred))

[[156  64]
 [ 26 570]]
Test accuracy:  0.889705882353
AUC:  0.832733374009


In [80]:
feature_names = np.array(vect.get_feature_names())

# argsort is ascending: the first positions hold the most negative coefficients,
# the last positions the most positive ones.
sorted_coef_index = np.argsort(clf.coef_[0])
most_negative = feature_names[sorted_coef_index[:15]]
most_positive = feature_names[sorted_coef_index[::-1][:15]]
print("Smallest Coefs: \n{}\n".format(most_negative))
print("Biggest Coefs: \n{}\n".format(most_positive))

Smallest Coefs: 
['lack' 'worst' 'terrible' 'ok' 'money' 'poor' 'okay' 'rude' 'won' 'wouldn'
 'bland' 'horribl' 'gone' 'disappointed' 'not']

Biggest Coefs: 
['great' 'definit' 'best' 'love' 'amazing' 'delicious' 'everyth'
 'recommend' 'awesome' 'fresh' 'alway' 'thank' 'excel' 'amaz' 'delici']



# TF-IDF

Tf–idf, or Term frequency-inverse document frequency, allows us to weight terms based on how important they are to a document. High weight is given to terms that appear often in a particular document, but don't appear often in the corpus. Features with low tf–idf are either commonly used across all documents or rarely used and only occur in long documents. 

In [181]:
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=2 keeps only terms that occur in at least two training reviews.
vect = TfidfVectorizer(min_df = 2)
vect.fit(X_train)

# Vocabulary size after the document-frequency filter.
len(vect.vocabulary_)

7324

In [182]:
# Fit a linear SVM on the TF-IDF features and evaluate it on the held-out reviews.
X_train_vectorized = vect.transform(X_train)

clf = LinearSVC().fit(X_train_vectorized, y_train)

pred = clf.predict(vect.transform(X_test))

# One confusion matrix, reused for both the printout and the accuracy.
cm = confusion_matrix(y_true = y_test, y_pred = pred)
print(cm)
print("Test accuracy: ", (cm[0][0] + cm[1][1])/len(X_test))
print('AUC: ', roc_auc_score(y_test, pred))

[[156  64]
 [ 19 577]]
Test accuracy:  0.898284313725
AUC:  0.83860585723


In [183]:
feature_names = np.array(vect.get_feature_names())

# Ascending order of the SVM weights: negative-leaning terms first, positive-leaning last.
sorted_coef_index = np.argsort(clf.coef_[0])
most_negative = feature_names[sorted_coef_index[:25]]
most_positive = feature_names[sorted_coef_index[::-1][:25]]
print("Smallest Coefs: \n{}\n".format(most_negative))
print("Biggest Coefs: \n{}\n".format(most_positive))

Smallest Coefs: 
['not' 'lack' 'worst' 'ok' 'okay' 'no' 'poor' 'rude' 'wouldn' 'told'
 'order' 'money' 'terrible' 'bland' 'horribl' 'won' 'custom' '20' 'paid'
 'left' 'horrible' 'noth' 'disappoint' 'water' 'frozen']

Biggest Coefs: 
['great' 'best' 'love' 'definit' 'alway' 'everyth' 'recommend' 'good'
 'amazing' 'thank' 'awesome' 'fresh' 'delicious' 'amaz' 'excel' 'pretti'
 'easi' 'delici' 'friendly' 'awesom' 'vegas' 'seat' 'is' 'sweet' 'happi']



In [184]:
# Limitation of single-word features: negations such as "do not recommend" or
# "not good" are scored word by word, and both sentences below are predicted
# positive (1) in the recorded output.
print(clf.predict(vect.transform(['do not recommend this place',
                                 'this place is not good'])))

[1 1]


# n-grams

In [185]:
# ngram_range=(1, 2) adds two-word phrases on top of the single words.
vect = TfidfVectorizer(min_df = 2, ngram_range = (1,2))
vect.fit(X_train)

# Vocabulary size once bigrams are included.
len(vect.vocabulary_)

39926

In [186]:
# Same linear SVM as before, now trained on the unigram + bigram TF-IDF features.
X_train_vectorized = vect.transform(X_train)

clf = LinearSVC().fit(X_train_vectorized, y_train)

pred = clf.predict(vect.transform(X_test))

# One confusion matrix, reused for both the printout and the accuracy.
cm = confusion_matrix(y_true = y_test, y_pred = pred)
print(cm)
print("Test accuracy: ", (cm[0][0] + cm[1][1])/len(X_test))
print('AUC: ', roc_auc_score(y_test, pred))

[[154  66]
 [ 11 585]]
Test accuracy:  0.905637254902
AUC:  0.840771812081


In [187]:
feature_names = np.array(vect.get_feature_names())

# Ascending order of the SVM weights: negative-leaning terms first, positive-leaning last.
sorted_coef_index = np.argsort(clf.coef_[0])
most_negative = feature_names[sorted_coef_index[:25]]
most_positive = feature_names[sorted_coef_index[::-1][:25]]
print("Smallest Coefs: \n{}\n".format(most_negative))
print("Biggest Coefs: \n{}\n".format(most_positive))

Smallest Coefs: 
['not' 'no' 'order' 'worst' 'rude' 'poor' 'lack' 'told' 'ok' 'custom' 'bad'
 'won' 'wouldn' 'okay' 'noth' 'water' 'the worst' 'horribl' 'horrible'
 'better' 'left' 'terrible' 'tri to' 'money' 'bland']

Biggest Coefs: 
['great' 'love' 'definit' 'the best' 'alway' 'best' 'good' 'and' 'amazing'
 'everyth' 'recommend' 'amaz' 'delicious' 'thank' 'you' 'awesome' 'is'
 'fresh' 'delici' 'awesom' 'excel' 'pretti' 'littl' 'easi' 'perfect']



In [188]:
# With bigram features in the vocabulary, the negation problem shown earlier is
# gone: both sentences are predicted negative (0) in the recorded output.
print(clf.predict(vect.transform(['do not recommend this place',
                                 'this place is not good'])))

[0 0]


## Trying the voting clf

In [189]:
np.random.seed(4747)

# Three linear models, each with class_weight='balanced'.
clf_LogisticRegression = LogisticRegression(class_weight = 'balanced')
clf_LinearSVC = LinearSVC(class_weight = 'balanced')
clf_SGDClassifier = SGDClassifier(class_weight = 'balanced')

# Vectorize the test reviews once; every model is scored on the same matrix.
X_test_vectorized = vect.transform(X_test)

# Fit and report in a fixed order (LogisticRegression, LinearSVC, SGDClassifier).
for label, model in (("LogisticRegression", clf_LogisticRegression),
                     ("LinearSVC", clf_LinearSVC),
                     ("SGDClassifier", clf_SGDClassifier)):
    model.fit(X_train_vectorized, y_train)

    pred = model.predict(X_test_vectorized)
    cm = confusion_matrix(y_true = y_test, y_pred = pred)
    print(label + " accuracy: ", (cm[0][0] + cm[1][1])/len(X_test))
    print(cm)
    print('AUC: ', roc_auc_score(y_test, pred))

LogisticRegression accuracy:  0.897058823529
[[178  42]
 [ 42 554]]
AUC:  0.869310555217
LinearSVC accuracy:  0.906862745098
[[169  51]
 [ 25 571]]
AUC:  0.863117754728
SGDClassifier accuracy:  0.901960784314
[[174  46]
 [ 34 562]]
AUC:  0.866931055522


Combining the three classifiers above into a majority-vote classifier

In [195]:
class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over already-fitted binary (0/1) classifiers.

    Every wrapped classifier must expose ``predict(features)`` returning one
    0/1 label per sample. Any number of classifiers (at least one) is
    supported; previously exactly three were assumed.
    """

    def __init__(self, *classifiers):
        """Store the classifiers to poll.

        Raises
        ------
        ValueError
            If no classifier is given.
        """
        if not classifiers:
            raise ValueError("VoteClassifier needs at least one classifier")
        self._classifiers = classifiers

    def _positive_votes(self, features):
        """Return, for each sample, how many classifiers predicted 1."""
        predictions = [np.asarray(c.predict(features)) for c in self._classifiers]
        return np.sum(predictions, axis=0)

    def predict(self, features):
        """Return the majority label (0 or 1) of every sample as a list of ints.

        A sample is labelled 1 when more than half of the classifiers vote 1.
        With an even number of classifiers, a tie is labelled 0.
        """
        votes = self._positive_votes(features)
        return (2 * votes > len(self._classifiers)).astype(int).tolist()

    def confidence(self, features):
        """Return the share of classifiers that agree with the majority label.

        Only the FIRST sample in ``features`` is considered, so the result is
        a single number even when several samples are passed.

        Raises
        ------
        IndexError
            If ``features`` contains no samples.
        """
        votes = self._positive_votes(features)[0]
        positive_share = votes / len(self._classifiers)
        # The majority side is whichever of "voted 1" / "voted 0" is larger.
        return max(positive_share, 1 - positive_share)
    
# Ensemble of the three balanced linear models fitted above.
voted_classifier = VoteClassifier(
    clf_LogisticRegression,
    clf_LinearSVC,
    clf_SGDClassifier,
)

pred = voted_classifier.predict(vect.transform(X_test))

# Correct predictions sit on the diagonal of the confusion matrix.
cm = confusion_matrix(y_true = y_test, y_pred = pred)
print("voted_classifier accuracy: ", (cm[0][0] + cm[1][1])/len(X_test))

voted_classifier accuracy:  0.908088235294


In [196]:
# Sanity check on the two negated sentences: the ensemble labels both as
# negative (0) in the recorded output.
voted_classifier.predict(vect.transform(['do not recommend this place',
                                 'this place is not good']))

[0, 0]

In [197]:
def sentiment(text):
    """Classify raw review text with the voting ensemble.

    Parameters
    ----------
    text : str or iterable of str
        One review or several reviews. A bare string is treated as a single
        review.

    Returns
    -------
    tuple
        ``(labels, confidence)``: ``labels`` is the list of 0/1 predictions,
        one per review; ``confidence`` is whatever
        ``voted_classifier.confidence`` returns for the vectorized input
        (as defined in this notebook, it reflects the first review only).
    """
    if isinstance(text, str):
        text = [text]
    # The vectorizer was fitted on Porter-stemmed reviews, so new text has to be
    # stemmed the same way or its words will not match the learned vocabulary.
    stemmed = [' '.join(ps.stem(w) for w in doc.split()) for doc in text]
    # Vectorize once and reuse the matrix for both calls.
    features = vect.transform(stemmed)
    return (voted_classifier.predict(features), voted_classifier.confidence(features))

sentiment(['I do not recommend this place'])

([0], 1.0)

In [198]:
# Mixed-sentiment example (regret about not returning, but "liked"); the
# recorded output labels it positive (1).
sentiment(["It's sad that we cannot come back again. We liked here."])


([1], 1.0)