# 01 - Sentence Classification Model Building


In [None]:
import pandas as pd

# Parse & clean labeled training data

In [None]:
import xml.etree.ElementTree as ET
# Parse the labeled training XML (path is relative to the notebook's working directory)
tree = ET.parse('../data/Restaurants_Train.xml')
# Root element; the cells below iterate over its "sentence" children
root = tree.getroot()

In [None]:
# Use this dataframe for multilabel classification
# (the list-valued "aspects" column must be encoded with scikit-learn's MultiLabelBinarizer)

# Build one record per <sentence>: the text of its first child, the "term"
# attribute of every aspectTerm, and the "category" attribute of every aspectCategory.
labeled_reviews = []
for sentence in root.findall("sentence"):
    aterms = []
    aspects = []
    # Compare against None explicitly: truth-testing an Element is deprecated
    # and evaluates to False for an element that has no children.
    terms_node = sentence.find("aspectTerms")
    if terms_node is not None:
        for aterm in terms_node.findall("aspectTerm"):
            aterms.append(aterm.get("term"))
    categories_node = sentence.find("aspectCategories")
    if categories_node is not None:
        for aspect in categories_node.findall("aspectCategory"):
            aspects.append(aspect.get("category"))
    # sentence[0] is the first child element of the sentence
    entry = {"text": sentence[0].text, "terms": aterms, "aspects": aspects}
    labeled_reviews.append(entry)
labeled_df = pd.DataFrame(labeled_reviews)
print("there are", len(labeled_reviews), "reviews in this training set")

In [None]:
# Save annotated reviews
# The pickle is written to the current working directory; head() previews the first rows
labeled_df.to_pickle("annotated_reviews_df.pkl")
labeled_df.head()

# Training the model with Naive Bayes
1. Replace pronouns using neuralcoref (coreference resolution)
2. Train the model with Naive Bayes

In [1]:
import neuralcoref

ModuleNotFoundError: No module named 'neuralcoref'

In [None]:
import spacy

In [None]:
# Show the installed spaCy version
spacy.__version__

In [None]:
import en_core_web_lg



In [None]:
from neuralcoref import Coref



In [None]:
# Load the spaCy English model "en_core_web_lg" by name
import spacy
nlp = spacy.load("en_core_web_lg")

# Import NeuralCoref; it is added to the spaCy pipeline in the following cell
import neuralcoref

In [None]:
# Create the NeuralCoref component from the model's vocab and register it
# in the pipeline under the name 'neuralcoref'
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')

# You're done. You can now use NeuralCoref the same way you usually manipulate a SpaCy document and its annotations.
doc = nlp(u'My sister has a dog. She loves him.')

# Both expressions are evaluated; a notebook cell only displays the last one
doc._.has_coref
doc._.coref_clusters

In [None]:
from neuralcoref import Coref
import en_core_web_lg

# Keep the loaded model under its own name. Binding it to `spacy` would shadow
# the spaCy module imported in earlier cells, so any later `spacy.load(...)` or
# `spacy.__version__` would be looked up on the model object instead.
coref_nlp = en_core_web_lg.load()
# Coreference resolver used by replace_pronouns() below
coref = Coref(nlp=coref_nlp)

# Pronoun replacement helper built on the module-level neuralcoref `coref` object
def replace_pronouns(text):
    """Return the resolved form of `text` produced by the `coref` resolver.

    Runs ``coref.one_shot_coref`` on the text, then returns the first item
    of ``coref.get_resolved_utterances()``.
    """
    coref.one_shot_coref(text)
    resolved_utterances = coref.get_resolved_utterances()
    return resolved_utterances[0]

In [None]:
# Read annotated reviews df, which is the labeled dataset for training
# This is located in the pickled files folder
# NOTE(review): the save cell earlier writes "annotated_reviews_df.pkl" to the working
# directory, while this reads from ../pickled_files/ - presumably the file is moved
# there by hand; confirm.
annotated_reviews_df = pd.read_pickle("../pickled_files/annotated_reviews_df.pkl")
annotated_reviews_df.head(3)

In [None]:
# Add a "text_pro" column: each review text passed through replace_pronouns
annotated_reviews_df["text_pro"] = annotated_reviews_df["text"].map(replace_pronouns)

In [None]:
# Uncomment the line below to pickle the new df (with the "text_pro" column)
# annotated_reviews_df.to_pickle("annotated_reviews_df2.pkl")

# Read pickled file with replaced pronouns if it exists already
# (this rebinds annotated_reviews_df to the contents of that pickle)
annotated_reviews_df = pd.read_pickle("annotated_reviews_df2.pkl")

In [None]:
# pickle is needed here, before the later cell that also imports it
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Convert the multi-labels into a binary indicator array (one column per aspect label)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(annotated_reviews_df.aspects)
X = annotated_reviews_df.text_pro

# Split data into train and test set (25% held out, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

# Save the fitted binarizer.
# This is important: it records how the multi-labels were binarized, so it must be
# loaded in the next notebook in order to undo the transformation and recover the
# correct labels.
filename = 'mlb.pkl'
# The context manager closes the file handle once the dump has finished
with open(filename, 'wb') as mlb_file:
    pickle.dump(mlb, mlb_file)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from skmultilearn.problem_transform import LabelPowerset
import numpy as np

# Multinomial naive Bayes text pipeline; LabelPowerset wraps the classifier
# so that it can handle the multi-label targets
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer(stop_words="english", ngram_range=(1, 1))),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', LabelPowerset(MultinomialNB(alpha=1e-1))),
    ]
).fit(X_train, y_train)
predicted = text_clf.predict(X_test)

# Accuracy, taken as the mean of the element-wise comparison of predictions and true labels
np.mean(predicted == y_test)

In [None]:
# Check whether an SVM (SGDClassifier with hinge loss) does better than naive Bayes
from sklearn.linear_model import SGDClassifier

text_clf_svm = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        (
            'clf-svm',
            LabelPowerset(
                SGDClassifier(
                    loss='hinge',
                    penalty='l2',
                    alpha=1e-3,
                    max_iter=6,
                    random_state=42,
                )
            ),
        ),
    ]
)
_ = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)

# Accuracy, taken as the mean of the element-wise comparison of predictions and true labels
np.mean(predicted_svm == y_test)

In [None]:
import pickle
# Train naive bayes on the full dataset (X, y rather than the train split) and save the model
text_clf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1, 1))),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', LabelPowerset(MultinomialNB(alpha=1e-1))),])
text_clf = text_clf.fit(X, y)

# Save the model to disk; the context manager closes the file handle after the dump
filename = 'naive_model1.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(text_clf, model_file)

At this point, we can move on to the 02-Sentiment analysis notebook, which loads the fitted Naive Bayes model.

In [None]:
# Pair each test sentence with the label tuples recovered from its binarized prediction
pred_df = pd.DataFrame(dict(
    text_pro=X_test,
    pred_category=mlb.inverse_transform(predicted),
))

In [None]:
# Show full cell contents instead of truncating long text.
# None is the supported "no limit" value; negative values were deprecated in
# pandas 1.0. Older releases only accept an int, so fall back to -1 when None
# is rejected.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
pred_df.head()

## Some scrap code below which wasn't used

In [None]:
# Save annotated reviews
# Same save as in the parsing section: writes the pickle to the working directory, then previews the first rows
labeled_df.to_pickle("annotated_reviews_df.pkl")
labeled_df.head()

In [None]:
# Map each aspect category to the lower-cased terms seen in sentences that carry
# only that one category. (Abandoned: the terms were not always hyponyms of the
# aspects, so the mapping turned out to be unusable.)
aspects = {"food":[],"service":[],"anecdotes/miscellaneous":[], "ambience":[], "price":[]}
for row_aspects, row_terms in zip(labeled_df.aspects, labeled_df.terms):
    # Skip sentences with zero or several categories, and sentences without terms
    if len(row_aspects) != 1 or row_terms == []:
        continue
    aspects[row_aspects[0]].extend(term.lower() for term in row_terms)
# Drop duplicate terms within each category
aspects = {category: list(set(found)) for category, found in aspects.items()}

In [None]:
# Collect the distinct aspect terms across all sentences, in first-seen order.
# dict.fromkeys de-duplicates by hashing, avoiding a scan of the result list
# for every single term.
terms = list(dict.fromkeys(
    term for row_terms in labeled_df.terms for term in row_terms
))
print("there are", len(terms),"unique terms")

In [None]:
# Use this dataframe if doing the classifications separately as binary classifications

# Build one record per <sentence> with a 0/1 flag for each known category,
# plus the text of its first child and its list of aspect terms.
labeled_reviews2 = []
for sentence in root.findall("sentence"):
    entry = {"food":0,"service":0,"anecdotes/miscellaneous":0, "ambience":0, "price":0}
    aterms = []
    # Compare against None explicitly: truth-testing an Element is deprecated
    # and evaluates to False for an element that has no children.
    terms_node = sentence.find("aspectTerms")
    if terms_node is not None:
        for aterm in terms_node.findall("aspectTerm"):
            aterms.append(aterm.get("term"))
    categories_node = sentence.find("aspectCategories")
    if categories_node is not None:
        for aspect in categories_node.findall("aspectCategory"):
            category = aspect.get("category")
            # Categories outside the five known flags are ignored
            if category in entry:
                entry[category] = 1
    # sentence[0] is the first child element of the sentence
    entry["text"], entry["terms"] = sentence[0].text, aterms
    labeled_reviews2.append(entry)
labeled_df2 = pd.DataFrame(labeled_reviews2)

In [None]:
# Column sums over the first five columns; assumes those are the five 0/1 category flags - TODO confirm column order
labeled_df2.iloc[:,:5].sum()