In [146]:
import os
from bs4 import BeautifulSoup
import pandas as pd
import re
import sklearn.metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier


In [78]:
# Read the Semeval data and return a dataframe
def read_semeval_data(filename):
    '''
    Description: reads Semantic Evaluation XML dataset and converts into a 
                 dataframe
    Arguments:
                 filename: string with file path (including filename)
    Returns :    pandas data frame
    Important:   this function only puts positive and neutral reviews in dataframe                          
    '''
    f = open('data/train.xml', 'r')
    raw_training_data = f.read()
    xmldoc = BeautifulSoup(raw_training_data,'lxml-xml')
    sentences = xmldoc.Reviews.find_all('sentences')
    opinions = xmldoc.Reviews.find_all('Opinions')
    reviews = []
    for i in range(0,len(sentences)):
        record = {}
        entity_aspect_pairs = opinions[i].find_all('Opinion')
        for ea_pair in entity_aspect_pairs:
            ea = ea_pair.attrs['category']
            polarity = ea_pair.attrs['polarity']
            if(polarity == 'positive'):
                record[ea] = 1
            elif(polarity == 'negative'):
                record[ea] = -1
            else:
                record[ea] = 0           
        record['TEXT'] = sentences[i].get_text()
        reviews.append(record)
    #Create a dataframe
    df=pd.DataFrame(reviews)
    #Change order of the columns so that text appears first
    cols = df.columns.tolist()
    cols.sort()
    cols.reverse()
    df = df[cols]
    df.fillna(0, inplace=True)
    return df

entity_labels = ['Food','Drinks','Service','Ambience','Location','Restaurant']
<br/>attributes_labels = ['General','Prices','Quality','Style&Options','Miscellaneous']

##### Possible Combinations of Entities and Attributes #####

<img src='images/entity_attributes_combinations.jpg' style='width:50;height:50'>

In [98]:
df_training = read_semeval_data('data/train.xml')
df_testing = read_semeval_data('data/test.xml')

In [99]:
df_testing.shape

(335, 13)

In the dataframe above a +1 indicates a positive polarity and -1 indicates negative polarity whereas 0 means that this aspect is not found or found to be neutral

In [81]:
#Function to clean the text data
#Remove punctuations, newline characters and convert to lowercase.
#Note that we are not removing dot to mark sentence boundary
def clean_text_data(data):
    '''
    Description: Given text returns cleaned version
    Arguments:
                  data: string with raw review text
    Returns  :
                  cleaned: string with unwanted characters removed
    '''
    prog = re.compile('[\t\n\r\f\v\d\']', re.UNICODE)
    data = re.sub(prog, ' ', data).lower()
    prog = re.compile('[!\"#$%&\'()*+\,-/:;<=>?@[\]^_`{|}~]', re.UNICODE)
    cleaned = re.sub(prog, ' ', data)
    return cleaned

In [82]:
df_training['TEXT'] = df_training['TEXT'].apply(clean_text_data)

In [83]:
df_training.head()

Unnamed: 0,TEXT,SERVICE#GENERAL,RESTAURANT#PRICES,RESTAURANT#MISCELLANEOUS,RESTAURANT#GENERAL,LOCATION#GENERAL,FOOD#STYLE_OPTIONS,FOOD#QUALITY,FOOD#PRICES,DRINKS#STYLE_OPTIONS,DRINKS#QUALITY,DRINKS#PRICES,AMBIENCE#GENERAL
0,judging from previous posts this used to be ...,-1.0,0.0,0.0,-1,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0
1,i have eaten at saul many times the food i...,0.0,1.0,0.0,1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,went on a day oyster binge with fish brin...,1.0,1.0,0.0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,every time in new york i make it a point to ...,1.0,0.0,0.0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,we ate outside at haru s sake bar because ha...,1.0,0.0,0.0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [100]:
X_train = df_training['TEXT']
y_train = df_training.drop('TEXT',axis=1)
X_test = df_testing['TEXT']
y_test = df_testing.drop('TEXT',axis=1)

In [101]:
y_train.head()

Unnamed: 0,SERVICE#GENERAL,RESTAURANT#PRICES,RESTAURANT#MISCELLANEOUS,RESTAURANT#GENERAL,LOCATION#GENERAL,FOOD#STYLE_OPTIONS,FOOD#QUALITY,FOOD#PRICES,DRINKS#STYLE_OPTIONS,DRINKS#QUALITY,DRINKS#PRICES,AMBIENCE#GENERAL
0,-1.0,0.0,0.0,-1,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,1.0,1.0,0.0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [86]:
df.shape

(335, 13)

In [102]:
classifier = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier(n_estimators=50,min_samples_leaf=2, min_samples_split=4, 
                                      max_depth=3,class_weight='balanced',oob_score=True)))])
classifier.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...
            oob_score=True, random_state=None, verbose=0, warm_start=False),
           n_jobs=1))])

In [105]:
y_predicted = classifier.predict(X_test)

In [151]:
f1_sum = 0
for column_index in range(y_test.shape[1]):
    f1=metrics.f1_score(np.array(y_test)[:,column_index], y_predicted[:,column_index], average='macro')
    f1_sum = f1_sum + f1 

In [153]:
print(f1_sum/12)

0.858367046608


In [140]:
print(a)

range(0, 11)
