In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

from collections import defaultdict

In [2]:
import nltk
import random
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split



In [3]:
parent_dir = '../Data Sets/'
path_dic = {'B': 'business_s.csv', 'R':'review_text.csv', 'U':'user.csv', 'I':'review_info.csv'}

def read_files():
    d = defaultdict(list)
    for key in path_dic:
        d[key] = pd.read_csv(parent_dir + path_dic[key]).drop('Unnamed: 0', 1)
    return d
def show():
    sns.despine()
    plt.show()
    
def clean_format(w):
    w = w.lower().replace('.', '').replace(',', '').replace('!', '')
    return w
d = read_files()

In [4]:
for key in d:
    print(path_dic[key] + ' : ', d[key].shape)

business_s.csv :  (102497, 25)
review_text.csv :  (4736897, 2)
user.csv :  (968039, 17)
review_info.csv :  (4736897, 10)


In [5]:
print(d['R'].shape)
d['R'] = d['R'].merge(d['I'][['review_id', 'business_id', 'review_stars']], on = 'review_id', how = 'inner')
print(d['R'].shape)

(4736897, 2)
(4736897, 4)


In [6]:
pd.set_option('max_columns', 100)
d['RB'] = d['R'].merge(d['B'], on = 'business_id', how = 'left')
d['RB'] = d['RB'].dropna(subset = ['is_open'])
print(d['RB'].shape)
d['RB'].head(1)

(4265614, 28)


Unnamed: 0,text_stem,review_id,business_id,review_stars,address,attributes,categories,city,hours,is_open,latitude,longitude,buz_name,neighborhood,postal_code,state,price,credit_card,buz_review_count,buz_cool_mean,buz_funny_mean,buz_useful_mean,buz_star_mean,buz_star_std,buz_polarity_mean,buz_polarity_std,buz_subjectivity_mean,buz_subjectivity_std
16,"thi place is horribl , we were so excit to tri...",ByRzJ8rF2KJWLr-cUNU6EA,jQsNFOzDpxPmOurSWCg1vQ,1.0,"14155 W Bell Rd, Ste 113","{'OutdoorSeating': True, 'WiFi': 'no', 'Restau...","['Fast Food', 'Gluten-Free', 'Asian Fusion', '...",Surprise,"{'Sunday': '10:30-21:00', 'Wednesday': '10:30-...",1.0,33.638228,-112.365259,Pei Wei,,85374,AZ,2.0,True,92.0,0.336957,0.271739,0.73913,3.26087,1.443969,0.164035,0.198044,0.53314,0.167233


In [7]:
print(d['RB'].shape)
print(list(d['RB']))

(4265614, 28)
['text_stem', 'review_id', 'business_id', 'review_stars', 'address', 'attributes', 'categories', 'city', 'hours', 'is_open', 'latitude', 'longitude', 'buz_name', 'neighborhood', 'postal_code', 'state', 'price', 'credit_card', 'buz_review_count', 'buz_cool_mean', 'buz_funny_mean', 'buz_useful_mean', 'buz_star_mean', 'buz_star_std', 'buz_polarity_mean', 'buz_polarity_std', 'buz_subjectivity_mean', 'buz_subjectivity_std']


# Data preparation

In [8]:
# Define pos as 4 or above, drop 3 star reviews
np.random.seed(47)
df = d['RB'][['text_stem', 'review_stars']]
df = df.sample(frac = 0.05, replace = False)
df = df[df['review_stars'] != 3]
df['pos'] = np.where(df['review_stars'] >= 4, 1, 0)
df = df.drop('review_stars', 1)
df.index = range(len(df))

d = []

In [9]:
print(df.shape)
print("Positive rate: ", np.mean(df['pos']))

(187890, 2)
Positive rate:  0.7482463143328544


In [10]:
print("Stemmed: ")
print(df['text_stem'][0])

Stemmed: 
the restaur is kinda hidden in the plaza on dobson and guadalup . the place is small but clean . servic wa good and we did n't have to wait to long for the food . We order some gogi , spici chicken , tofu soup and some korean street taco . everyth tast delici and the portion were gener . I highli recommend the korean street taco !


In [11]:
# print("before the merge: ", df.shape)

# merged = df.merge(d['U'], on = 'user_id', how = 'left')
# print("after the merge: ", merged.shape)
# merged.dropna(subset = ['review_text'])
# print("after the merge: ", merged.shape)

# pd.set_option('display.max_columns', 100)
# merged.head(3)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df['text_stem'], df['pos'], train_size = .75, random_state = 47)
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (140917,)
Shape of X_test:  (46973,)


# Count Vectorizer

In [109]:
vect = CountVectorizer(max_df = 0.95, min_df = 3, stop_words = 'english').fit(X_train)
vect.get_feature_names()[::2000]

['00',
 'aroma',
 'burlesque',
 'cosmo',
 'effortless',
 'gaurante',
 'imperfect',
 'loaner',
 'navigate',
 'pizooki',
 'robl',
 'somon',
 'tomi',
 'whittl']

In [54]:
print("Total number of features: ", len(vect.get_feature_names()))

Total number of features:  20067


In [55]:
# Now we vectorize the X_train data
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

<140917x20067 sparse matrix of type '<class 'numpy.int64'>'
	with 6082079 stored elements in Compressed Sparse Row format>

In [56]:
# Try using a logistic regression
clf = LinearSVC()
clf.fit(X_train_vectorized, y_train)

pred = clf.predict(vect.transform(X_test))

print(confusion_matrix(y_true = y_test, y_pred = pred))
print("Test accuracy: ", (confusion_matrix(y_true = y_test, y_pred = pred)[0][0] + confusion_matrix(y_true = y_test, y_pred = pred)[1][1])/len(X_test))
print('AUC: ', roc_auc_score(y_test, pred))

[[10080  1745]
 [ 1519 33629]]
Test accuracy:  0.930513273583
AUC:  0.90460701844


In [58]:
feature_names = np.array(vect.get_feature_names())

sored_coef_index = clf.coef_[0].argsort()
print("Smallest Coefs: \n{}\n".format(feature_names[sored_coef_index[:15]]))
print("Biggest Coefs: \n{}\n".format(feature_names[sored_coef_index[:-16:-1]]))

Smallest Coefs: 
['slowest' 'loo' 'mehh' 'hoity' 'canon' 'keyword' 'eerili' 'pail'
 'horrible' 'placat' 'drafti' 'symposium' 'calorie' 'wp' 'downhil']

Biggest Coefs: 
['restauranteur' 'hart' 'wac' 'kat' 'unobtrus' 'caress' 'chrysanthemum'
 'recheck' 'loungi' 'crackl' 'josi' '33rd' 'erica' 'lebanon' 'ralphi']



# TF-IDF

Tf–idf, or Term frequency-inverse document frequency, allows us to weight terms based on how important they are to a document. High weight is given to terms that appear often in a particular document, but don't appear often in the corpus. Features with low tf–idf are either commonly used across all documents or rarely used and only occur in long documents. 

In [104]:
vect = TfidfVectorizer(max_df = .95, min_df = 3, stop_words = 'english').fit(X_train)
len(vect.get_feature_names())

26755

In [60]:
X_train_vectorized = vect.transform(X_train)

clf = LinearSVC()
clf.fit(X_train_vectorized, y_train)

pred = clf.predict(vect.transform(X_test))

print(confusion_matrix(y_true = y_test, y_pred = pred))
print("Test accuracy: ", (confusion_matrix(y_true = y_test, y_pred = pred)[0][0] + confusion_matrix(y_true = y_test, y_pred = pred)[1][1])/len(X_test))
print('AUC: ', roc_auc_score(y_test, pred))

[[10321  1504]
 [ 1118 34030]]
Test accuracy:  0.944180699551
AUC:  0.920501743037


In [61]:
feature_names = np.array(vect.get_feature_names())

sored_coef_index = clf.coef_[0].argsort()
print("Smallest Coefs: \n{}\n".format(feature_names[sored_coef_index[:25]]))
print("Biggest Coefs: \n{}\n".format(feature_names[sored_coef_index[:-26:-1]]))

Smallest Coefs: 
['worst' 'bland' 'mediocr' 'terribl' 'downhil' 'aw' 'horribl' 'meh'
 'tasteless' 'rude' 'flavorless' 'underwhelm' 'poor' 'disgust' 'wast'
 'poorli' 'overr' 'overpr' 'atroci' 'ined' 'lack' 'slowest' 'filthi'
 'uninspir' 'disappoint']

Biggest Coefs: 
['delici' 'amaz' 'great' 'excel' 'perfect' 'awesom' 'fantast' 'love'
 'pleasantli' 'best' 'skeptic' 'perfectli' 'outstand' 'highli' 'notch'
 'definit' 'glad' 'downsid' 'heaven' 'gem' 'erica' 'fabul' 'phenomen'
 'genuin' 'exceed']



In [62]:
# The issue with n-grams: do not, not recommend, not good
print(clf.predict(vect.transform(['do not recommend this place',
                                 'this place is not good'])))

[1 1]


# n-grams

In [13]:
# vect = TfidfVectorizer(max_df = .95, min_df = 3, ngram_range = (1,2)).fit(X_train)
vect = TfidfVectorizer(max_df = .95, min_df = 3, ngram_range = (1,2)).fit(X_train)
print(len(vect.get_feature_names()))

# save_classifier = open("pickled_algos/vect.pickle","wb")
# pickle.dump(vect, save_classifier)
# save_classifier.close()

444315


In [64]:
X_train_vectorized = vect.transform(X_train)

clf = LinearSVC()
clf.fit(X_train_vectorized, y_train)

pred = clf.predict(vect.transform(X_test))

print(confusion_matrix(y_true = y_test, y_pred = pred))
print("Test accuracy: ", (confusion_matrix(y_true = y_test, y_pred = pred)[0][0] + confusion_matrix(y_true = y_test, y_pred = pred)[1][1])/len(X_test))
print('AUC: ', roc_auc_score(y_test, pred))

[[10820  1005]
 [  661 34487]]
Test accuracy:  0.964532816725
AUC:  0.94810218993


In [65]:
feature_names = np.array(vect.get_feature_names())

sored_coef_index = clf.coef_[0].argsort()
print("Smallest Coefs: \n{}\n".format(feature_names[sored_coef_index[:45]]))
print("Biggest Coefs: \n{}\n".format(feature_names[sored_coef_index[:-46:-1]]))

Smallest Coefs: 
['worst' 'two star' 'not' 'disappoint' 'not worth' 'bland' 'terribl'
 'horribl' 'mediocr' 'veri disappoint' 'rude' 'meh' 'aw' 'at best' 'overpr'
 'poor' 'lack' 'not good' 'wors' 'not recommend' 'disgust' 'dirti' 'no'
 'never again' 'will never' 'no thank' 'wast' 'to love' 'wo be' 'noth'
 'not impress' 'will not' 'gross' 'poorli' 'not great' 'wo' 'unfortun'
 'underwhelm' 'elsewher' 'unprofession' 'suck' 'not veri' 'ruin'
 'never come' 'definit not']

Biggest Coefs: 
['delici' 'great' 'amaz' 'awesom' 'excel' 'love' 'perfect' 'best'
 'not disappoint' 'fantast' 'good' 'be disappoint' 'definit' 'you wo'
 'highli recommend' 'outstand' 'perfectli' 'not bad' 'realli good' 'happi'
 'wonder' 'not too' 'friendli' 'never disappoint' 'thank' 'better than'
 'alway' 'love thi' 'my onli' 'four star' 'go wrong' 'the best'
 'will definit' 'not onli' 'fun' 'yummi' 'easi' 'tasti' 'so good'
 'profession' 'ca wait' 'recommend' 'fabul' 'love the' 'veri good']



In [66]:
pos_words = pd.DataFrame(feature_names[sored_coef_index[:200]], columns = ['positive_words'])
pos_words.to_csv('Positive Words.csv')
neg_words = pd.DataFrame(feature_names[sored_coef_index[:-200:-1]], columns = ['negative_words'])
neg_words.to_csv('Negative Words.csv')

In [67]:
# NO MORE issue with n-grams: do not, not recommend, not good
print(clf.predict(vect.transform(['do not recommend this place',
                                 'this place is not good'])))

[0 0]


## Trying the voting clf

In [68]:
np.random.seed(47)
# LogisticRegression
clf_LogisticRegression = LogisticRegression(class_weight = 'balanced') #class_weight = 'balanced'
clf_LogisticRegression.fit(X_train_vectorized, y_train)

pred = clf_LogisticRegression.predict(vect.transform(X_test))
print("LogisticRegression accuracy: ", (confusion_matrix(y_true = y_test, y_pred = pred)[0][0] + confusion_matrix(y_true = y_test, y_pred = pred)[1][1])/len(X_test))
print(confusion_matrix(y_true = y_test, y_pred = pred))
print('AUC: ', roc_auc_score(y_test, pred))

# LinearSVC
clf_LinearSVC = LinearSVC() #class_weight = 'balanced'
clf_LinearSVC.fit(X_train_vectorized, y_train)

pred = clf_LinearSVC.predict(vect.transform(X_test))
print("LinearSVC accuracy: ", (confusion_matrix(y_true = y_test, y_pred = pred)[0][0] + confusion_matrix(y_true = y_test, y_pred = pred)[1][1])/len(X_test))
print(confusion_matrix(y_true = y_test, y_pred = pred))
print('AUC: ', roc_auc_score(y_test, pred))

# SGDClassifier
clf_SGDClassifier = SGDClassifier(class_weight = 'balanced') #class_weight = 'balanced'
clf_SGDClassifier.fit(X_train_vectorized, y_train)

pred = clf_SGDClassifier.predict(vect.transform(X_test))
print("SGDClassifier accuracy: ", (confusion_matrix(y_true = y_test, y_pred = pred)[0][0] + confusion_matrix(y_true = y_test, y_pred = pred)[1][1])/len(X_test))
print(confusion_matrix(y_true = y_test, y_pred = pred))
print('AUC: ', roc_auc_score(y_test, pred))

LogisticRegression accuracy:  0.954888978775
[[11244   581]
 [ 1538 33610]]
AUC:  0.953554491776
LinearSVC accuracy:  0.964532816725
[[10820  1005]
 [  661 34487]]
AUC:  0.94810218993




SGDClassifier accuracy:  0.947075979818
[[11221   604]
 [ 1882 33266]]
AUC:  0.947688383113


Trying out the voted clf

In [115]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def predict(self, features):
        pred_list = []
        for c in self._classifiers:
            pred_list.append(c.predict(features))
        
        res = []
        for i in range(len(pred_list[0])):
            a = pred_list[0][i]
            b = pred_list[1][i]
            c = pred_list[2][i]
            if (a+b+c)< 2:
                res.append(0)
            else:
                res.append(1)
        return res

    def confidence(self, features):
        pred_list = []
        for c in self._classifiers:
            pred_list.append(c.predict(features))
        
        votes = pred_list[0][0] + pred_list[1][0] + pred_list[2][0]
        if votes< 2:
            return 1-votes/3
        else:
            return votes/3
    
voted_classifier = VoteClassifier(clf_LogisticRegression,
                                  clf_LinearSVC, 
                                  clf_SGDClassifier) 

pred = voted_classifier.predict(vect.transform(X_test))
print("voted_classifier accuracy: ", (confusion_matrix(y_true = y_test, y_pred = pred)[0][0] + confusion_matrix(y_true = y_test, y_pred = pred)[1][1])/len(X_test))

voted_classifier accuracy:  0.955208311157


In [116]:
# save_classifier = open("pickled_algos/clf_LogisticRegression.pickle","wb")
# pickle.dump(clf_LogisticRegression, save_classifier)
# save_classifier.close()

# save_classifier = open("pickled_algos/clf_LinearSVC.pickle","wb")
# pickle.dump(clf_LinearSVC, save_classifier)
# save_classifier.close()

# save_classifier = open("pickled_algos/clf_SGDClassifier.pickle","wb")
# pickle.dump(clf_SGDClassifier, save_classifier)
# save_classifier.close()

In [70]:
voted_classifier.predict(vect.transform(['do not recommend this place',
                                 'this place is not good']))

[0, 0]

In [73]:
def sentiment(text):
    return (voted_classifier.predict(vect.transform(text))[0], voted_classifier.confidence(vect.transform(text)))

sentiment(['I do not recommend this place'])

(0, 1.0)

In [75]:
sentiment(["we will come back again."])


(1, 1.0)

In [89]:
ps = PorterStemmer()
example_sec = "The python programmer named pythoner is pythoning a game pythonly"

from nltk.corpus import stopwords
stopWords = list(set(stopwords.words('english')))

print([w for w in word_tokenize(example_sec)])
print([clean_format(w) for w in word_tokenize(example_sec)])
print([clean_format(w) for w in word_tokenize(example_sec) if w not in stopWords])
print([ps.stem(clean_format(w)) for w in word_tokenize(example_sec) if w not in stopWords])

['The', 'python', 'programmer', 'named', 'pythoner', 'is', 'pythoning', 'a', 'game', 'pythonly']
['the', 'python', 'programmer', 'named', 'pythoner', 'is', 'pythoning', 'a', 'game', 'pythonly']
['the', 'python', 'programmer', 'named', 'pythoner', 'pythoning', 'game', 'pythonly']
['the', 'python', 'programm', 'name', 'python', 'python', 'game', 'pythonli']


In [111]:
sens = ["The python", "The python programmer named pythoner ", "The python programmer named pythoner is pythoning a game pythonly"]
samp_list = np.array([" ".join([ps.stem(clean_format(w)) for w in word_tokenize(example_sec) if w not in stopWords]) for example_sec in sens])
# samp_list
transed = vect.transform(samp_list)

In [119]:
ps.stem('amazing')

'amaz'