#### Build a Naive Bayes Text Classifier Based on the 2014-15 ABSA Data

In [1]:
import numpy as np
import pandas as pd

from ABSAParser import parse_data_2014, parse_data_2015
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score, confusion_matrix

#### Read in ABSA Data

In [2]:
# Module-level accumulator: stack_data() reads this frame, and each load call
# below rebinds the name to the enlarged result.
absa_data = pd.DataFrame()  # start from an empty container

In [3]:
def stack_data(parse_function, xml_path):
    """Parse one XML file and stack its rows under the running dataset.

    Parameters
    ----------
    parse_function : callable
        Invoked as ``parse_function(xml_path)``; its result is handed to
        ``pd.concat`` together with the current dataset.
    xml_path : str
        Path passed straight through to ``parse_function``.

    Returns
    -------
    The row-wise (``axis=0``) concatenation of the module-level ``absa_data``
    and the freshly parsed frame. The global is only read here; the caller is
    expected to rebind ``absa_data`` to the returned value.
    """
    parsed = parse_function(xml_path)
    frames = [absa_data, parsed]
    return pd.concat(frames, axis=0)

In [8]:
# Reset the accumulator first: stack_data() always appends to whatever
# `absa_data` currently holds, so without this reset re-running the cell would
# stack the same files on top of rows (and columns) left over from an earlier
# run. On a fresh kernel this is a no-op.
absa_data = pd.DataFrame()

# (parser, path) pairs in load order.
absa_sources = [
    (parse_data_2014, "data/ABSA/2014/Restaurants_Train_v2.xml"),
    (parse_data_2014, "data/ABSA/2014/restaurants_Trial.xml"),
    (parse_data_2014, "data/ABSA/2014/Restaurants_Test_Data_phaseB.xml"),
    (parse_data_2015, "data/ABSA/2015/ABSA-15_Restaurants_Train_Final.xml"),
    (parse_data_2015, "data/ABSA/2015/ABSA15_Restaurants_Test.xml"),
]
for parser, xml_path in absa_sources:
    absa_data = stack_data(parser, xml_path)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(absa_data.shape)
absa_data.head(5)

(13494, 7)


Unnamed: 0,category,sentence_id,text,topic_ambience,topic_food,topic_service,topic_value
0,,1000,"The food is good, especially their more basic ...",0,0,0,0
1,,1001,"This is a great place to take out-of-towners, ...",0,0,0,0
2,,1002,"The view is spectacular, and the food is great.",0,0,0,0
3,,1003,Wonderful strawberry daiquiries as well!,0,0,0,0
4,,1004,Definitely worth the trip to Battery Park City!,0,0,0,0


#### Set Topic Binaries and Aggregate to Sentence Level

In [9]:
# Tally the raw category labels; dropna=False keeps rows without a category
# (NaN) in the count.
absa_data["category"].value_counts(dropna=False)

NaN                         6143
food                        1691
anecdotes/miscellaneous     1412
FOOD#QUALITY                 852
service                      777
ambience                     556
SERVICE#GENERAL              443
RESTAURANT#GENERAL           416
price                        416
AMBIENCE#GENERAL             260
FOOD#STYLE_OPTIONS           133
RESTAURANT#MISCELLANEOUS     100
FOOD#PRICES                   85
RESTAURANT#PRICES             83
DRINKS#QUALITY                46
DRINKS#STYLE_OPTIONS          32
LOCATION#GENERAL              28
DRINKS#PRICES                 20
FOOD#GENERAL                   1
Name: category, dtype: int64

In [10]:
# Raw category labels that count towards each topic. Categories that appear in
# the data but are not listed here set no topic flag.
food = ["food","FOOD#QUALITY","FOOD#STYLE_OPTIONS","DRINKS#QUALITY","DRINKS#STYLE_OPTIONS","FOOD#GENERAL"]
service = ["service","SERVICE#GENERAL"]
ambience = ["ambience","AMBIENCE#GENERAL"]
value = ["price","FOOD#PRICES","RESTAURANT#PRICES","DRINKS#PRICES"]

topics = ["topic_food","topic_service","topic_ambience","topic_value"]
topic_categories = {
    "topic_food": food,
    "topic_service": service,
    "topic_ambience": ambience,
    "topic_value": value,
}

for topic in topics:
    # Default every row to 0, then flag the rows whose category belongs to the
    # topic. `.loc` is used instead of `.ix`, which is deprecated and no longer
    # exists in pandas >= 1.0; for a boolean mask plus a column label the two
    # select the same cells.
    absa_data[topic] = 0
    absa_data.loc[absa_data.category.isin(topic_categories[topic]), topic] = 1

# Collapse to one row per (sentence_id, text): taking the max means a topic
# flag is 1 when any of that sentence's category rows set it.
# NOTE: this rebinds `absa_data` to a frame without the `category` column, so
# the load cell must be re-run before this cell can be executed again.
absa_data = absa_data.groupby(by=["sentence_id","text"], as_index=False)[topics].max()

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(absa_data.shape)
for topic in topics:
    print(absa_data[topic].value_counts(dropna=False))
absa_data.head(10)

(5845, 6)
0    3404
1    2441
Name: topic_food, dtype: int64
0    4661
1    1184
Name: topic_service, dtype: int64
0    5065
1     780
Name: topic_ambience, dtype: int64
0    5265
1     580
Name: topic_value, dtype: int64


Unnamed: 0,sentence_id,text,topic_food,topic_service,topic_ambience,topic_value
0,1000,"The food is good, especially their more basic ...",1,0,0,0
1,1001,"This is a great place to take out-of-towners, ...",0,0,1,0
2,1002,"The view is spectacular, and the food is great.",1,0,1,0
3,1003,Wonderful strawberry daiquiries as well!,1,0,0,0
4,1004,Definitely worth the trip to Battery Park City!,0,0,0,0
5,1004293:0,Judging from previous posts this used to be a ...,0,0,0,0
6,1004293:1,"We, there were four of us, arrived at noon - t...",0,1,0,0
7,1004293:2,"They never brought us complimentary noodles, i...",0,1,0,0
8,1004293:3,The food was lousy - too sweet or too salty an...,1,0,0,0
9,1004293:4,"After all that, they complained to me about th...",0,1,0,0


#### Split into Training and Test

In [13]:
# Hold out 25% of the sentence-level rows as the test set; the fixed
# random_state keeps the split the same from run to run.
absa_train, absa_test = train_test_split(
    absa_data,
    test_size=0.25,
    random_state=4444,
)
print(absa_train.shape)
print(absa_test.shape)

(4383, 6)
(1462, 6)


#### Train Naive Bayes Classifier & Output Accuracy and Confusion Matrix

In [14]:
def naive_bayes(train, test, topic):
    """Fit a Bernoulli Naive Bayes model for one topic and print test metrics.

    Parameters
    ----------
    train, test : DataFrame
        Must each provide a ``"text"`` column and the column named by ``topic``.
    topic : str
        Name of the label column to predict (e.g. ``"topic_food"``).

    Prints the test-set accuracy, then the confusion matrix (scikit-learn
    convention: rows are true labels, columns are predicted labels), then an
    empty line. Returns ``None`` so that calling it as the last statement of a
    cell produces no extra cell output.
    """
    # Binary unigram presence features. The token pattern only matches runs of
    # three or more letters a-z; min_df/max_df drop terms found in fewer than
    # 2 documents or in more than 20% of documents.
    vectorizer = CountVectorizer(ngram_range=(1,1), min_df=2, max_df=0.2, stop_words="english",
                                 binary=True, token_pattern="\\b[a-z][a-z][a-z]+\\b")

    # The vocabulary is learned from the training text only; fit_transform does
    # the fit and the transform in a single pass over that text.
    train_X = vectorizer.fit_transform(train["text"])
    test_X = vectorizer.transform(test["text"])

    train_y = train[topic]
    test_y = test[topic]

    clf = BernoulliNB().fit(train_X, train_y)

    # Predict once and reuse the result for both metrics.
    predictions = clf.predict(test_X)

    print(accuracy_score(test_y, predictions))
    print(confusion_matrix(test_y, predictions))
    print("")

In [16]:
# Evaluate one classifier per topic on the original (unbalanced) training set.
baseline_runs = [
    ("Food: ", "topic_food"),
    ("Service: ", "topic_service"),
    ("Ambience: ", "topic_ambience"),
    ("Value: ", "topic_value"),
]
for label, topic_column in baseline_runs:
    print(label)
    naive_bayes(absa_train, absa_test, topic_column)

Food: 
0.883720930233
[[816  54]
 [116 476]]

Service: 
0.919288645691
[[1131   24]
 [  94  213]]

Ambience: 
0.925444596443
[[1249   15]
 [  94  104]]

Value: 
0.932968536252
[[1311    9]
 [  89   53]]



#### Model Suffers Slightly on Recall (It Underpredicts Topics); Upsample to Balance Classes

In [18]:
def upsampler(df, topic, p1, random_state=None):
    """Build a class-balanced training frame by sampling with replacement.

    Parameters
    ----------
    df : DataFrame
        Source rows; must contain the 0/1 column named by ``topic``.
    topic : str
        Name of the binary label column to balance on.
    p1 : int
        Number of rows to draw from EACH class, so the result has ``2 * p1``
        rows split 50/50 between ``topic == 1`` and ``topic == 0``.
        NOTE(review): the previous body used undefined names ``n1``/``n0`` and
        never read ``p1``, and its comment described ``p1`` as a percentage.
        The per-class-count reading is chosen because the calls in this
        notebook pass row counts (1450, 2200) -- confirm this is the intent.
    random_state : int or None, optional
        Forwarded to ``DataFrame.sample`` for both draws; pass an integer to
        make the resampling reproducible. ``None`` (default) keeps the draws
        unseeded.

    Returns
    -------
    DataFrame with the sampled ``topic == 1`` rows followed by the sampled
    ``topic == 0`` rows. Original index labels are kept, so they may repeat.
    """
    # Both draw sizes come from the argument rather than from notebook globals.
    n1 = n0 = int(p1)

    positives = df[df[topic] == 1]
    negatives = df[df[topic] == 0]

    df_1 = positives.sample(n=n1, replace=True, random_state=random_state)
    df_0 = negatives.sample(n=n0, replace=True, random_state=random_state)

    return pd.concat([df_1, df_0], axis=0)

# One resampled training frame per topic. The third argument is the value
# handed to upsampler() for that topic (approx as if the training set was
# split 50/50 on each).
absa_train_balance_food = upsampler(
    absa_train, "topic_food", 1450
)
absa_train_balance_serv = upsampler(
    absa_train, "topic_service", 2200
)
absa_train_balance_amb = upsampler(
    absa_train, "topic_ambience", 2200
)
absa_train_balance_val = upsampler(
    absa_train, "topic_value", 2200
)

In [19]:
# Re-run the per-topic evaluation, training on each topic's resampled frame
# and scoring against the same untouched test set.
balanced_runs = [
    ("Food: ", absa_train_balance_food, "topic_food"),
    ("Service: ", absa_train_balance_serv, "topic_service"),
    ("Ambience: ", absa_train_balance_amb, "topic_ambience"),
    ("Value: ", absa_train_balance_val, "topic_value"),
]
for label, balanced_train, topic_column in balanced_runs:
    print(label)
    naive_bayes(balanced_train, absa_test, topic_column)

Food: 
0.876196990424
[[788  82]
 [ 99 493]]

Service: 
0.846785225718
[[1029  126]
 [  98  209]]

Ambience: 
0.913132694938
[[1169   95]
 [  32  166]]

Value: 
0.870041039672
[[1155  165]
 [  25  117]]



#### TODO: Experiment with Different Upsampling Ratios and Oversampling Rates