In [108]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import nltk

In [120]:
# Load the review sample and keep only the rows that have both a review body
# ("reviewText") and a star rating ("overall"); the cells below read both.
beauty = (
    pd.read_csv("../data/beauty5k.csv")
    .dropna(subset=["reviewText", "overall"])
)

# encode sentiment to "pos" and "neg"
# remove punctuation and tokenize reviewText
def sentiment(x):
    """Map a star rating to a binary sentiment label.

    Ratings strictly greater than 3.0 are labelled "pos"; every other value,
    including exactly 3.0, is labelled "neg".
    """
    return "pos" if x > 3.0 else "neg"

def remove_punc(s):
    """Return ``s`` with every character of ``string.punctuation`` removed.

    The earlier ``s.translate(None, string.punctuation)`` form is only valid
    for Python 2 byte strings: it raises TypeError for ``unicode`` values and
    on Python 3. Filtering character by character deletes the same characters
    and works for all of those string types.
    """
    return "".join(ch for ch in s if ch not in string.punctuation)

def low(s):
    """Return a lowercased copy of the string ``s``."""
    lowered = s.lower()
    return lowered

# Label every review from its star rating.
beauty["sentiment"] = beauty["overall"].apply(sentiment)

# Normalise the review text step by step: lowercase, strip punctuation, tokenize.
lowered_text = beauty["reviewText"].apply(low)
unpunctuated_text = lowered_text.apply(remove_punc)
beauty["token"] = unpunctuated_text.apply(word_tokenize)

# POS tag the full token lists (stop words are still present at this point).
beauty["tagged_token"] = beauty["token"].apply(nltk.pos_tag)

# remove stop words
from nltk.corpus import stopwords
# English stop-word list, held as a set because the filter helpers below test
# every token for membership in it.
# NOTE(review): punctuation is stripped before tokenizing, so a contraction
# such as "don't" reaches the filters as "dont" -- confirm the stop list still
# matches such tokens.
stop = set(stopwords.words('english'))

def token_remove_stop(token_list):
    """Return the tokens of ``token_list`` that are not in ``stop``, in order."""
    kept = []
    for word in token_list:
        if word in stop:
            continue
        kept.append(word)
    return kept

def tagged_token_remove_stop(list_tuple):
    """Return the ``(token, tag)`` pairs whose token is not in ``stop``.

    The surviving pairs keep their original order; only the token half of
    each pair is checked against the stop-word set.
    """
    kept_pairs = []
    for word, pos in list_tuple:
        if word not in stop:
            kept_pairs.append((word, pos))
    return kept_pairs

# Strip stop words from both token views: the plain tokens go to a new
# "token_no_stop" column, while "tagged_token" is overwritten with its
# filtered version.
stop_word_filters = (
    ("token", "token_no_stop", token_remove_stop),
    ("tagged_token", "tagged_token", tagged_token_remove_stop),
)
for source_column, target_column, remove_stop in stop_word_filters:
    beauty[target_column] = beauty[source_column].apply(remove_stop)

In [110]:
# Preview the first two reviews to check the derived columns.
beauty.head(2)

Unnamed: 0.1,Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentiment,token,tagged_token,token_no_stop
0,27677,A1ZNEYW8GJIF1P,B00006IV2F,Pat Sims,"[0, 0]","Works well, no burning fingers or neck like a...",5.0,Value buy,1382832000.0,"10 27, 2013",pos,"[works, well, no, burning, fingers, or, neck, ...","[(works, NNS), (well, RB), (burning, VBG), (fi...","[works, well, burning, fingers, neck, like, cu..."
1,27716,A37V39UMYM2JIJ,B00006IV2F,rescueAdog,"[1, 1]",Finally found a curling iron that does all tha...,5.0,Perfect,1329264000.0,"02 15, 2012",pos,"[finally, found, a, curling, iron, that, does,...","[(finally, RB), (found, VBD), (curling, NN), (...","[finally, found, curling, iron, promises, grea..."


In [111]:
# Spot-check the second review: its last five tokens, and the Python type
# stored in the "token_no_stop" column.
# Single-argument print(...) behaves the same on Python 2 and Python 3, whereas
# the bare print statement is a syntax error on Python 3.
print(beauty.iloc[1]["token"][-5:])
print(type(beauty.iloc[1]["token_no_stop"]))

['slipping', 'out', 'like', 'others', 'do']
<type 'list'>


In [112]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing. The split is seeded so that a fresh
# kernel run reproduces the same train/test rows, and therefore the same class
# counts and accuracies in the cells below; 111 is the seed the KFold cell
# further down already uses.
train, test = train_test_split(beauty, test_size=0.2, random_state=111)

In [113]:
# Number of reviews per sentiment label in each split.
# Single-argument print(...) runs identically on Python 2 and Python 3; the
# bare print statement is Python 2 only.
print(train.groupby("sentiment").size())
print(test.groupby("sentiment").size())

sentiment
neg    2396
pos    1603
dtype: int64
sentiment
neg    603
pos    397
dtype: int64


In [114]:
# Build the (token, label) pairs used to train NLTK's Naive Bayes classifier:
# every stop-word-free token of a review is paired with that review's sentiment.
train_token_label = []
test_token_label = []

for tokens, label in zip(train["token_no_stop"], train["sentiment"]):
    train_token_label.extend((token, label) for token in tokens)
for tokens, label in zip(test["token_no_stop"], test["sentiment"]):
    test_token_label.extend((token, label) for token in tokens)
    
def nbfeature(token):
    """Wrap ``token`` in a single-entry feature dict keyed by ``"token"``."""
    return dict(token=token)

# Turn each (token, label) pair into a (feature dict, label) example.
train_feat = [(nbfeature(word), label) for word, label in train_token_label]
test_feat = [(nbfeature(word), label) for word, label in test_token_label]

# Parallel views of the same examples: feature dicts (x) and labels (y).
train_x = [example[0] for example in train_feat]
train_y = [example[1] for example in train_feat]
test_x = [example[0] for example in test_feat]
test_y = [example[1] for example in test_feat]

In [115]:
# Inspect one training example: a (feature dict, sentiment label) pair.
train_feat[1]

({'token': 'covers'}, 'pos')

In [116]:
from nltk.classify import NaiveBayesClassifier
# Fit NLTK's Naive Bayes classifier on the (feature dict, label) examples.
nb_classifier = NaiveBayesClassifier.train(train_feat)
# Display the labels reported by the trained classifier.
nb_classifier.labels()

['neg', 'pos']

In [117]:
# Accuracy over test_feat. Every element of test_feat is one single-feature
# example labelled with its review's sentiment, so this scores individual
# examples rather than whole reviews.
nltk.classify.accuracy(nb_classifier, test_feat)

0.6373919963619827

In [118]:
# Use unigrams, bigrams and trigrams as features.
# Note: this cell rebinds train_token_label / test_token_label and
# train_feat / test_feat, replacing the unigram-only versions.
from nltk.util import bigrams, trigrams


def _ngram_label_pairs(frame):
    """Pair every unigram, bigram and trigram of each row with its sentiment.

    For each row of ``frame`` the pairs are emitted in the order unigrams,
    bigrams, trigrams, all built from the row's "token_no_stop" list.
    """
    pairs = []
    for tokens, label in zip(frame["token_no_stop"], frame["sentiment"]):
        for grams in (tokens, bigrams(tokens), trigrams(tokens)):
            pairs.extend((gram, label) for gram in grams)
    return pairs


train_token_label = _ngram_label_pairs(train)
test_token_label = _ngram_label_pairs(test)

train_feat = [(nbfeature(gram), label) for gram, label in train_token_label]
test_feat = [(nbfeature(gram), label) for gram, label in test_token_label]

In [123]:
# Retrain on the current train_feat and report accuracy on test_feat.
# Single-argument print(...) runs identically on Python 2 and Python 3; the
# bare print statement is Python 2 only.
nb_classifier2 = NaiveBayesClassifier.train(train_feat)
print(nltk.classify.accuracy(nb_classifier2, test_feat))

0.650645526172


In [135]:
# Train five Naive Bayes models, one per K-fold training split of train_feat.
# Each model is fit on the examples that remain after one of the five folds is
# held out; the held-out indices (val_index) are not used. These are K-fold
# subsets drawn without replacement, not bootstrap resamples.
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, random_state=111, shuffle=True)
nb_5fold = [
    NaiveBayesClassifier.train([train_feat[index] for index in train_index])
    for train_index, val_index in kf.split(train_feat)
]
# Single-argument print(...) runs identically on Python 2 and Python 3.
print(len(nb_5fold))


# train_x = [token for (token, sent) in train_feat]
# train_y = [sent for (token, sent) in train_feat]
# test_x = [token for (token, sent) in test_feat]
# test_y = [sent for (token, sent) in test_feat]

# from sklearn.ensemble import BaggingClassifier
# nb = NaiveBayesClassifier
# nfold = 10
# subsample = 0.9
# nb_10fold = BaggingClassifier(base_estimator = nb,
#                              n_estimators = nfold,
#                              max_samples = subsample,
#                              random_state = 111)

# clf = nb_10fold.fit(train_x, train_y)
# print clf.score(test_x, test_y)

5


In [136]:
# Majority vote of the fold models on every test example. Any prediction other
# than "pos" counts as a "neg" vote, and a tie is resolved as "pos".
correct = 0
for features, gold in test_feat:
    votes = [model.classify(features) for model in nb_5fold]
    pos_votes = votes.count("pos")
    neg_votes = len(votes) - pos_votes
    majority = "neg" if pos_votes < neg_votes else "pos"
    if majority == gold:
        correct += 1
# Fraction of test examples on which the vote matches the label.
accuracy = correct / float(len(test_feat))

In [137]:
# Accuracy of the majority vote; print(...) with one argument runs identically
# on Python 2 and Python 3, unlike the bare print statement.
print(accuracy)

0.650635775021
