In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

from collections import defaultdict

In [38]:
import nltk
import random
#from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [225]:
def read_files(parent_dir='../Data Sets/'):
    """Load the project's CSV data sets into a dict of DataFrames.

    Parameters
    ----------
    parent_dir : str, optional
        Directory that holds the CSV files. Defaults to '../Data Sets/'.

    Returns
    -------
    collections.defaultdict
        Maps 'B', 'C', 'R', 'T' and 'U' to the frames read from
        business_s.csv, checkin.csv, review_s.csv, tip.csv and user.csv,
        each with its 'Unnamed: 0' column removed.

    Raises
    ------
    FileNotFoundError
        If one of the CSV files is missing from ``parent_dir``.
    KeyError
        If a file has no 'Unnamed: 0' column to drop.
    """
    d = defaultdict(list)

    path_dic = {'B': 'business_s.csv', 'C': 'checkin.csv', 'R': 'review_s.csv',
                'T': 'tip.csv', 'U': 'user.csv'}

    for key, file_name in path_dic.items():
        frame = pd.read_csv(os.path.join(parent_dir, file_name))
        # Keyword form: pandas 2.x no longer accepts `axis` positionally.
        d[key] = frame.drop(columns='Unnamed: 0')
    return d
def show() -> None:
    """Tidy the current axes with `sns.despine()` and then render via `plt.show()`."""
    sns.despine()
    plt.show()
    
def clean_format(w):
    """Lower-case a token and strip '.', ',' and '!' from it.

    Other punctuation (for example '+', '(' and ')') is left in place.
    """
    punctuation_to_drop = str.maketrans('', '', '.,!')
    return w.lower().translate(punctuation_to_drop)
                    


In [3]:
# Load every CSV once; frames are keyed by short code ('B', 'C', 'R', 'T', 'U').
d = read_files()

# NLTK: creating classifiers and pickling them

In [4]:
# Inner-join reviews onto businesses by business_id, then keep only the rows
# whose `is_open` value is present.
d['RB'] = (
    d['R']
    .merge(d['B'], on='business_id', how='inner')
    .dropna(subset=['is_open'])
)

In [12]:
# The merge suffixed the two `stars` columns; give them descriptive names.
star_column_names = {'stars_x': 'review_star', 'stars_y': 'buz_star'}
d['RB'] = d['RB'].rename(columns=star_column_names)
d['RB'].head(1)

Unnamed: 0,business_id,cool,date,funny,review_id,review_star,text,useful,user_id,address,...,latitude,longitude,name,neighborhood,postal_code,review_count,buz_star,state,price,credit_card
0,fjMXGgOr3aCxnN48kovZ_Q,0,2015-03-09,0.0,3BBCHVND9tDPNliTFoLCHA,5.0,We recently decided to give this place another...,0.0,bCrpStRCku_gEX3Iwuv94A,5051 W Craig Rd,...,36.238959,-115.211568,Craig Road Animal Hospital,Northwest,89130,192,4.0,NV,,


In [13]:
# Show the column labels of the merged review/business frame.
print(d['RB'].columns.tolist())

['business_id', 'cool', 'date', 'funny', 'review_id', 'review_star', 'text', 'useful', 'user_id', 'address', 'attributes', 'categories', 'city', 'hours', 'is_open', 'latitude', 'longitude', 'name', 'neighborhood', 'postal_code', 'review_count', 'buz_star', 'state', 'price', 'credit_card']


# NLTK preparation

In [226]:
# Pair every review text with its star rating: (text, review_star).
documents = list(zip(d['RB']['text'], d['RB']['review_star']))

# The output folder may not exist yet on a fresh checkout.
os.makedirs("pickled_algos", exist_ok=True)
# `with` closes the handle even if pickling raises.
with open("pickled_algos/documents.pickle", "wb") as save_documents:
    pickle.dump(documents, save_documents)

Let's take a look at the first review:

In [227]:
# Each document is a (review text, star rating) tuple.
print(documents[0])

("We recently decided to give this place another try after having issues at our normal vet and I'm very glad we did! We transferred all of our pets records over here and plan on using them and only them. The receptionists are always on top of everything, friendly and helpful. Both the vet techs as well as the vets have also been very friendly, helpful and knowledgeable. We had an incident with one of or dogs, Miracle, and Dr.Finder took GREAT care of her and now Miracle LOVES her. We are very pleased with the treatment our animals have been receiving here!", 5.0)


In [228]:
# A single Porter stemmer instance is reused for every token in the notebook.
ps = PorterStemmer()
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]

# Quick demonstration of what stemming does to related word forms.
print(list(map(ps.stem, example_words)))

['python', 'python', 'python', 'python', 'pythonli']


Remove stop words? Something to consider, but it turned out that this decreases accuracy...

In [229]:
from nltk.corpus import stopwords

data = "This is nonsense. I hate this place. The food is bad and the service is terrible"
stopWords = list(set(stopwords.words('english')))

# stopWords[:5]
# The comparison is case-sensitive and tokens are not lower-cased first.
kept_tokens = [token for token in data.split() if token not in stopWords]
print(' '.join(kept_tokens))

This nonsense. I hate place. The food bad service terrible


In [253]:
# Collect the cleaned, stemmed form of every whitespace-separated token
# across all review texts, then count occurrences.
stemmed_tokens = []
for (t, star) in documents:
    stemmed_tokens.extend(ps.stem(clean_format(word)) for word in t.split())

all_words = nltk.FreqDist(stemmed_tokens)
print(all_words.most_common(15))
print("stupid appeared: " + str(all_words['stupid']) + " times")

[('the', 25207), ('and', 17402), ('i', 13434), ('a', 12866), ('to', 12566), ('wa', 9003), ('of', 7497), ('it', 6576), ('for', 6096), ('is', 5996), ('in', 5677), ('my', 4878), ('that', 4374), ('with', 4131), ('we', 4044)]
stupid appeared: 26 times


In [264]:
# Vocabulary: the 5000 most frequent stemmed tokens, most common first.
words_features = [token for token, _count in all_words.most_common(5000)]

# `with` closes the handle even if pickling raises.
with open("pickled_algos/word_features5k.pickle", "wb") as save_word_features:
    pickle.dump(words_features, save_word_features)

print(words_features[:200])

['the', 'and', 'i', 'a', 'to', 'wa', 'of', 'it', 'for', 'is', 'in', 'my', 'that', 'with', 'we', 'thi', 'they', 'but', 'you', 'on', 'have', 'had', 'not', 'so', 'were', 'be', 'at', 'place', 'are', 'food', 'good', 'as', 'time', 'me', 'great', 'veri', 'there', 'like', 'get', 'all', 'go', 'out', 'if', 'our', 'just', 'here', 'servic', 'one', 'from', 'order', 'when', 'would', 'their', 'back', 'up', 'or', 'an', 'he', "it'", 'will', 'about', 'tri', 'realli', 'she', 'your', 'which', 'what', 'been', 'some', 'come', 'love', 'no', 'us', 'also', 'even', 'becaus', 'more', 'onli', 'other', 'can', 'do', '-', 'got', 'by', 'nice', 'look', 'want', 'them', 'make', "don't", 'wait', 'price', 'best', 'after', 'ha', "i'm", 'too', 'well', "i'v", "didn't", 'came', 'restaur', 'than', 'went', 'ask', 'did', 'day', 'friendli', 'work', 'over', 'first', 'littl', 'never', 'alway', 'staff', 'drink', 'her', 'could', 'menu', 'definit', 'peopl', 'know', 'take', 'recommend', 'say', 'thing', 'experi', 'need', 'tast', 'much',

In [257]:
def find_features(document):
    """Flag each distinct token of ``document`` as in or out of the vocabulary.

    Parameters
    ----------
    document : str
        Raw text; it is split on whitespace.

    Returns
    -------
    dict
        Maps every cleaned (``clean_format``) and stemmed (``ps.stem``) token
        to True when it appears in the notebook-level ``words_features`` list,
        and to False otherwise.
    """
    # Hash the vocabulary once per call so each lookup is O(1) instead of a
    # linear scan of the list for every token.
    vocabulary = set(words_features)
    features = {}
    for raw_word in set(document.split()):
        token = ps.stem(clean_format(raw_word))
        features[token] = token in vocabulary
    return features

# One (feature dict, star rating) pair per review.
featuresets = [(find_features(doc), star) for (doc, star) in documents]

# Persist the feature sets themselves, as the file name says. The name
# `word_features` is not defined anywhere in this notebook.
with open("pickled_algos/featuresets.pickle", "wb") as save_word_features:
    pickle.dump(featuresets, save_word_features)

print(featuresets[0])

({'top': True, 'use': True, 'both': True, 'an': True, 'care': True, 'of': True, 'help': True, 'now': True, 'her': True, 'them': True, 'give': True, 'are': True, 'treatment': True, 'at': True, 'also': True, 'or': True, 'after': True, 'vet': True, 'to': True, 'pet': True, "i'm": True, 'veri': True, 'and': True, 'we': True, 'plan': True, 'the': True, 'record': True, 'transfer': True, 'alway': True, 'drfinder': False, 'great': True, 'glad': True, 'one': True, 'with': True, 'over': True, 'normal': True, 'decid': True, 'love': True, 'knowledg': True, 'on': True, 'place': True, 'miracl': True, 'thi': True, 'everyth': True, 'anim': True, 'been': True, 'here': True, 'our': True, 'took': True, 'issu': True, 'incid': True, 'have': True, 'dog': True, 'onli': True, 'friendli': True, 'tech': True, 'tri': True, 'did': True, 'had': True, 'as': True, 'recent': True, 'receptionist': True, 'pleas': True, 'all': True, 'well': True, 'receiv': True, 'anoth': True}, 5.0)


In [243]:
# Number of (features, stars) pairs, one per review in `documents`.
len(featuresets)

4349

In [244]:
# POS to NEG encode
# Ratings of 4 and above become 'pos', ratings of 2 and below become 'neg';
# anything in between is left out of `temp`.
temp = []
pos_count = 0
for feats, stars in featuresets:
    if stars <= 2:
        temp.append((feats, 'neg'))
    elif stars >= 4:
        temp.append((feats, 'pos'))
        pos_count += 1
print("pos rate: ", pos_count / len(temp))
print("len of temp: ", len(temp))

pos rate:  0.7429245283018868
len of temp:  3816


# Trying out some clfs

In [245]:
# The first 3000 labelled reviews train the models; the remainder is held out.
split_at = 3000
training_set, testing_set = temp[:split_at], temp[split_at:]

for subset in (training_set, testing_set):
    print(len(subset))

3000
816


In [246]:
np.random.seed(4747)

# Baseline: NLTK's own Naive Bayes on the boolean word features.
clf = nltk.NaiveBayesClassifier.train(training_set)
nb_accuracy = nltk.classify.accuracy(clf, testing_set)
print("Naive Bayes Classifier accuracy: ", nb_accuracy * 100)
clf.show_most_informative_features(15)

Naive Bayes Classifier accuracy:  46.200980392156865
Most Informative Features
                  refund = True              neg : pos    =     49.9 : 1.0
                    zero = True              neg : pos    =     23.2 : 1.0
                   bewar = True              neg : pos    =     21.7 : 1.0
                    rude = True              neg : pos    =     21.7 : 1.0
                    dirt = True              neg : pos    =     19.8 : 1.0
                  garbag = True              neg : pos    =     19.8 : 1.0
                   ignor = True              neg : pos    =     18.6 : 1.0
                 terribl = True              neg : pos    =     18.5 : 1.0
              underwhelm = True              neg : pos    =     17.9 : 1.0
               tasteless = True              neg : pos    =     17.9 : 1.0
                knowledg = True              pos : neg    =     17.8 : 1.0
                  intent = True              neg : pos    =     16.0 : 1.0
                    s

## Combining multiple classifiers

In [247]:
np.random.seed(4747)


def _report_and_pickle(model, label, pickle_path):
    """Print ``label`` with the model's accuracy (percent) on ``testing_set``, then pickle the model."""
    print(label, (nltk.classify.accuracy(model, testing_set))*100)
    # `with` closes the handle even if pickling raises.
    with open(pickle_path, "wb") as save_classifier:
        pickle.dump(model, save_classifier)


def _fit_sklearn(estimator, label, pickle_path):
    """Wrap a scikit-learn estimator for NLTK, train it on ``training_set``, report and pickle it."""
    model = SklearnClassifier(estimator)
    model.train(training_set)
    _report_and_pickle(model, label, pickle_path)
    return model


# The training order below is kept fixed because the estimators run after a
# single seeding call at the top of the cell.
classifier = nltk.NaiveBayesClassifier.train(training_set)
_report_and_pickle(classifier,
                   "Original Naive Bayes Algo accuracy percent:",
                   "pickled_algos/originalnaivebayes5k.pickle")
classifier.show_most_informative_features(15)

MNB_classifier = _fit_sklearn(
    MultinomialNB(),
    "MNB_classifier accuracy percent:",
    "pickled_algos/MNB_classifier5k.pickle")

BernoulliNB_classifier = _fit_sklearn(
    BernoulliNB(),
    "BernoulliNB_classifier accuracy percent:",
    "pickled_algos/BernoulliNB_classifier5k.pickle")

LogisticRegression_classifier = _fit_sklearn(
    LogisticRegression(),
    "LogisticRegression_classifier accuracy percent:",
    "pickled_algos/LogisticRegression_classifier5k.pickle")

LinearSVC_classifier = _fit_sklearn(
    LinearSVC(),
    "LinearSVC_classifier accuracy percent:",
    "pickled_algos/LinearSVC_classifier5k.pickle")

# The NuSVC pickle now receives the NuSVC model; previously the LinearSVC
# model was written to this file.
NuSVC_classifier = _fit_sklearn(
    NuSVC(),
    "NuSVC_classifier accuracy percent:",
    "pickled_algos/NuSVC_classifier5k.pickle")

SGDC_classifier = _fit_sklearn(
    SGDClassifier(),
    "SGDClassifier accuracy percent:",
    "pickled_algos/SGDC_classifier5k.pickle")

Original Naive Bayes Algo accuracy percent: 46.200980392156865
Most Informative Features
                  refund = True              neg : pos    =     49.9 : 1.0
                    zero = True              neg : pos    =     23.2 : 1.0
                   bewar = True              neg : pos    =     21.7 : 1.0
                    rude = True              neg : pos    =     21.7 : 1.0
                    dirt = True              neg : pos    =     19.8 : 1.0
                  garbag = True              neg : pos    =     19.8 : 1.0
                   ignor = True              neg : pos    =     18.6 : 1.0
                 terribl = True              neg : pos    =     18.5 : 1.0
              underwhelm = True              neg : pos    =     17.9 : 1.0
               tasteless = True              neg : pos    =     17.9 : 1.0
                knowledg = True              pos : neg    =     17.8 : 1.0
                  intent = True              neg : pos    =     16.0 : 1.0
           

Now let's combine the clfs!

In [248]:
class VoteClassifier(ClassifierI):
    """Ensemble that labels a feature set by majority vote of its member classifiers."""

    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def _ballots(self, features):
        """Return each member's label for ``features``, in member order."""
        return [member.classify(features) for member in self._classifiers]

    def classify(self, features):
        """Return the label chosen by `mode` over the members' votes."""
        return mode(self._ballots(features))

    def confidence(self, features):
        """Return the fraction of members that voted for the winning label."""
        ballots = self._ballots(features)
        winner = mode(ballots)
        return ballots.count(winner) / len(ballots)
    
# The SGD wrapper trained earlier is bound to `SGDC_classifier`; the name
# `SGDClassifier_classifier` is never defined and fails on a fresh kernel.
# MNB_classifier and NuSVC_classifier are deliberately left out of the vote.
voted_classifier = VoteClassifier(SGDC_classifier,
                                  LogisticRegression_classifier,
                                  LinearSVC_classifier)
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)


voted_classifier accuracy percent: 91.05392156862744


91.05 % is really impressive!

In [239]:
def sentiment(text):
    """Classify raw review text with the voting ensemble.

    Returns a ``(label, confidence)`` tuple, where confidence is the value
    reported by ``voted_classifier.confidence`` for the same features.
    """
    feats = find_features(text)
    label = voted_classifier.classify(feats)
    agreement = voted_classifier.confidence(feats)
    return label, agreement

In [265]:
for i in range(1):
    # Each testing_set row is (feature dict, label). Index 1 is the label
    # string, so classify the already-extracted features at index 0 rather
    # than passing the label to `sentiment` as if it were review text.
    row_features = testing_set[i][0]

    print(testing_set[i])
    print((voted_classifier.classify(row_features),
           voted_classifier.confidence(row_features)))
    print()


({'you': True, 'diesel': True, 'earn': True, 'back': True, 'i': True, 'nice': True, 'a': True, 'it': True, 'are': True, 'guy': True, 'these': True, 'help': True, 'busi': True, 'compani': True, 'have': True, 'your': True, 'to': True, 'if': True, 'do': True, 'am': True, 'friendli': True, 'and': True, 'will': True, 'take': True, 'the': True, 'super': True, 'real': True, 'veri': True, 'what': True, 'they': True, 'respect': True, 'go': True, 'deal': True}, 'pos')
('pos', 1.0)



# Looking at the misclassified ones...

In [None]:
# Collect the ensemble's label and confidence for every held-out review.
result = defaultdict(list)
for i in range(len(testing_set)):
    # Index 0 holds the feature dict; index 1 is the true label, not text.
    row_features = testing_set[i][0]
    pred = voted_classifier.classify(row_features)
    conf = voted_classifier.confidence(row_features)
    result['pred'].append(pred)
    # Store the confidence under its own key instead of appending `pred` twice.
    result['conf'].append(conf)
