# Sentence Classification Model Building
## Training the model with Naive Bayes and SVM

In [1]:
import spacy
#import en_core_web_lg
#nlp = en_core_web_lg.load()
# Load the large English spaCy model. In the cells visible here, `nlp` is only
# referenced by the commented-out neuralcoref pronoun-replacement code below.
nlp = spacy.load('en_core_web_lg')

#import neuralcoref
#neuralcoref.add_to_pipe(nlp)

# Define function for replacing pronouns using neuralcoref
#def replace_pronouns(text):
 #   doc = nlp(text)
  #  return doc._.coref_resolved

In [2]:
import pandas as pd
# Read the annotated review sentences: the labeled data used for training and evaluation.
# NOTE(review): the original note said these live in the "pickled files" folder, but the
# paths below are CSV files relative to the working directory - confirm the intended location.
aspect_train_df = pd.read_csv('aspect_train.csv')
aspect_test_df = pd.read_csv('aspect_test.csv')
# Preview the first three training rows.
aspect_train_df.head(3)

Unnamed: 0,Sentence,Aspect
0,LOVED THE HAMPTON INN SEAPORT!!!!!!!!!!!!!!!!!!,['OTHER']
1,Just returned from a 3 night stay.,['NOTRELATED']
2,This is a FABULOUS hotel.,['OTHER']


In [3]:
# Create a new column for text whose pronouns have been replaced
#aspect_train_df["text_pro"] = aspect_train_df.Sentence.map(lambda x: replace_pronouns(x))
#aspect_test_df["text_pro"] = aspect_test_df.Sentence.map(lambda x: replace_pronouns(x))
#aspect_train_df.head(3)

In [4]:
def clean_y(y):
    """Parse stringified label lists into real lists of label strings.

    Each element of ``y`` is expected to be the text form of a Python list,
    e.g. ``"['ROOMS', 'FOOD']"``. The surrounding brackets are dropped, the
    remainder is split on commas, and quotes/spaces are stripped from each
    label.

    Unlike a purely in-place version, this function does NOT modify ``y``:
    the input (typically the ``.values`` array backing a DataFrame column)
    is left untouched, so the cell that calls it can safely be re-run.

    Parameters
    ----------
    y : iterable
        Stringified label lists. Elements that are not strings (for example
        lists produced by an earlier call) are copied through unchanged.

    Returns
    -------
    list of list of str
        One list of labels per input element. An empty list literal
        (``"[]"``) yields an empty label list.
    """
    parsed = []
    for raw in y:
        if isinstance(raw, str):
            # Drop the enclosing brackets, then split the comma-separated labels.
            labels = [piece.strip("' ") for piece in raw[1:-1].split(',')]
            # "[]" splits into a single empty string; do not treat it as a label.
            parsed.append([label for label in labels if label])
        else:
            # Already a sequence of labels: keep its content as a fresh list.
            parsed.append(list(raw))
    return parsed

In [5]:
from sklearn.preprocessing import MultiLabelBinarizer
import pickle
# Parse the stringified label lists of both splits into lists of label strings.
y_train = clean_y(aspect_train_df["Aspect"].values)
y_test = clean_y(aspect_test_df["Aspect"].values)
# Convert the multi-labels into binary indicator arrays.
# NOTE: no `classes=` argument is given, so MultiLabelBinarizer sorts the labels;
# the indicator columns follow `mlb.classes_`, not the order written below.
mlb = MultiLabelBinarizer()
mlb.fit([["ROOMS", "CLEANLINESS", "VALUE", "SERVICE", "LOCATION", "CHECKIN", "BUSINESS", "FOOD", "BUILDING", "OTHER", "NOTRELATED"]])

y_train = mlb.transform(y_train)
y_test = mlb.transform(y_test)
X_train = aspect_train_df.Sentence
X_test = aspect_test_df.Sentence

# Save the fitted binarizer.
# This is important: it records how the multi-labels were binarized, so it must be
# loaded wherever predictions are decoded in order to recover the correct labels.
filename = 'mlb.pkl'
# Use a context manager so the file is flushed and closed even if pickling fails.
with open(filename, 'wb') as mlb_file:
    pickle.dump(mlb, mlb_file)

In [6]:
def bce(y_pred, y_true):
    """Return the average number of mismatched label entries per sample.

    Despite its name, this is not a cross-entropy: it counts the positions
    where ``y_pred`` and ``y_true`` differ and divides that count by the
    number of rows in ``y_pred``.
    """
    mismatch_positions = np.argwhere(y_pred != y_true)
    n_samples = y_pred.shape[0]
    return len(mismatch_positions) / n_samples

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import LabelPowerset
import numpy as np

# Multinomial naive Bayes baseline: unigram + bigram counts, TF-IDF weighting,
# and a LabelPowerset wrapper so the classifier can handle the multi-label target.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', LabelPowerset(MultinomialNB(alpha=1e-1))),
])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

# Report the element-wise agreement with the test labels (as a percentage)
# together with the bce() score.
nb_accuracy_pct = np.mean(predicted == y_test) * 100
print('Accuracy is %.2f' % nb_accuracy_pct)
print('BCE is %.2f' % bce(predicted, y_test))

Accuracy is 91.30
BCE is 0.96


In [8]:
from sklearn.linear_model import SGDClassifier
# Linear SVM trained with SGD: loss='hinge' is the SGDClassifier setting that
# yields a linear SVM. Wrapped in LabelPowerset for the multi-label target.
svm_estimator = SGDClassifier(loss='hinge', penalty='l2',
                              alpha=1e-3, max_iter=10, random_state=42)
text_clf_svm = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', LabelPowerset(svm_estimator)),
])
text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)

# Report the element-wise agreement with the test labels (as a percentage)
# together with the bce() score.
svm_accuracy_pct = np.mean(predicted_svm == y_test) * 100
print('Accuracy is %.2f' % svm_accuracy_pct)
print('BCE is %.2f' % bce(predicted_svm, y_test))



Accuracy is 92.12
BCE is 0.87


In [9]:
from sklearn.metrics import multilabel_confusion_matrix
# One 2x2 confusion matrix per label column of the binarized test labels,
# computed for the SVM predictions.
per_label_confusion = multilabel_confusion_matrix(y_test, predicted_svm)
print(per_label_confusion)

[[[1199    9]
  [  51   13]]

 [[1263    2]
  [   5    2]]

 [[1223    3]
  [  36   10]]

 [[1199    2]
  [  47   24]]

 [[1116   39]
  [  41   76]]

 [[1102   21]
  [  53   96]]

 [[1110   14]
  [ 133   15]]

 [[ 678  208]
  [  94  292]]

 [[ 963   89]
  [  51  169]]

 [[1009   53]
  [  82  128]]

 [[1184   13]
  [  56   19]]]


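Here we find that the annotated dataset is highly imbalanced. The confusion matrices above are computed on the test set and are ordered by `mlb.classes_`, which `MultiLabelBinarizer` sorts alphabetically (BUILDING, BUSINESS, CHECKIN, CLEANLINESS, FOOD, LOCATION, NOTRELATED, OTHER, ROOMS, SERVICE, VALUE) rather than in the order passed to `fit`. Read in that order, only 7 test sentences are labeled 'BUSINESS', while 386 are labeled 'OTHER'. Consequently, the SVM model performs better on aspects with more labeled examples, such as "FOOD", "LOCATION", "OTHER", "ROOMS", and "SERVICE".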

In [10]:
# Decode the indicator rows back into label tuples so each test sentence can be
# read next to its predicted and true labels.
svm_label_tuples = mlb.inverse_transform(predicted_svm)
true_label_tuples = mlb.inverse_transform(y_test)
pred_df = pd.DataFrame({
    'text': X_test,
    'pred_category': svm_label_tuples,
    'true_label': true_label_tuples,
})

In [11]:
# Show full sentence text instead of truncating long cells.
# `None` is the supported "no limit" value; the older `-1` is deprecated and
# rejected by recent pandas releases.
pd.set_option('display.max_colwidth', None)
# Persist the SVM predictions next to the true labels for later inspection.
pred_df.to_csv('svm_pred.csv', index=False)
pred_df

Unnamed: 0,text,pred_category,true_label
0,pleasant enough Stayed at the,"(OTHER,)","(OTHER,)"
1,"Singel for 2 nights for a football trip, the place is easy to find, in a good location, near the station,on the Singel canal, next to a church.","(LOCATION,)","(LOCATION,)"
2,"Room was comfy and very well heated if a little basic, breakfast was good a variety of cereals, breads, cheese, meats, etc, boiled eggs, juice, teacoffeeAll in all a pleasant enough place for a couple of days, mind you we didnt get to bed until 3:30 the first night and well after 4 on the second.","(FOOD,)","(ROOMS,)"
3,The guy I roomed with works in pest control and was a stickler about vermin bed bugs etc before we went and he had no complaints unlike some of our other guys who stayed elsewhere in the city so read into that what you will,"(ROOMS,)","(CLEANLINESS,)"
4,Exceeded our expectations!,"(OTHER,)","(OTHER,)"
...,...,...,...
1267,"As I said I would recommend this resort to anyone wanting to relax, be totally pampered, and enjoy some of the benefits of the good life.","(OTHER,)","(OTHER,)"
1268,"By the way, I read somewhere that the place was overcrowded with locals paying only a small daily fee to get in.","(OTHER,)","(OTHER,)"
1269,"This is somewhat true, but undestand that these people are paying abut $50 dollars per person for only the use of the swimming pool and a few of the other pleasures of the resort.","(BUILDING,)","(OTHER,)"
1270,"A family of 4 cost about $200, so your not dealing with a bunch of out of control kids running around with one adult trying to wrangle them, these people are paying very good money for the day use only of these things, and are not a problem at all.","(ROOMS,)","(OTHER,)"


In [12]:
# Show full sentence text instead of truncating long cells.
# `None` is the supported "no limit" value; the older `-1` is deprecated and
# rejected by recent pandas releases.
pd.set_option('display.max_colwidth', None)
# Keep only the rows where the predicted label tuple differs from the true one
# and export them for manual error analysis.
sub = pred_df[pred_df['pred_category'] != pred_df['true_label']]
sub.to_csv('error_analysis.csv', index=False)

In [None]:
# Train the SVM on the full dataset (train + test) and save the model.
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)
# NOTE: this rebinds `text_clf_svm` (and `text_clf` below) to the model fitted on
# all data, so earlier evaluation cells must not be re-run after this one
# without first re-running the cells that fit on the training split only.
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', LabelPowerset(
                             SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, max_iter=10, random_state=42)))])
text_clf = text_clf_svm.fit(X, y)

# Save the model to disk.
filename = 'svm_model.pkl'
# Use a context manager so the file is flushed and closed even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(text_clf, model_file)

In [None]:
# How to load the saved model and label binarizer to make predictions:
# svm_model = pickle.load(open('svm_model.pkl', 'rb'))
# mlb = pickle.load(open('mlb.pkl', 'rb'))
# pred = mlb.inverse_transform(svm_model.predict(X_test))