# This notebook:
- splits labelled data into train, validation, and test sets
- extract features and evaluation performance using 10-fold cross validation
- optimize select model with validation set

### Setup

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

### Utility Functions

In [2]:
# Data preprocessing

import gensim
import re


def split_into_sentences(text):
    '''
    Function to break text (astring) into sentences (a list of strings).
    # Ref: https://stackoverflow.com/a/31505798
    '''
    caps = "([A-Z])"
    digits = "([0-9])"
    prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
    suffixes = "(Inc|Ltd|Jr|Sr|Co)"
    starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
    acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
    websites = "[.](com|net|org|io|gov)"

    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + caps + "[.] "," \\1<prd> ",text)
    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + caps + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = [s.strip() for s in sentences if len(s.strip())>1]
    return sentences


def remove_signature(sentences):
    '''
    Function to remove the signature line from comment sentences
    '''
    closing = ['thank you','thanks','sincerely','regards']
    last_sent = sentences[-1].lower()
    if any(term in last_sent for term in closing):
        return sentences[:-1]
    return sentences


def select_sentences(text, first_n, last_m):
    '''
    Function to extract the first N and the last M sentences from text, excluding the signature line.
    '''
    sentences = split_into_sentences(text)
    sentences = remove_signature(sentences)
    return " ".join(sentences[:first_n]+sentences[-last_m:])


def hasContent(sentences):
    if len(sentences) == 0:
        return False
    if sentences[0].find('Leave your personal comment here') != -1 \
    or (len(sentences) == 2 and len(sentences[0]) < 30 and \
        sentences[0].find('Dear Secretary Ryan Zinke,') != -1):
        return False
    return True


def tokenize(text, minLength=3):
    return gensim.utils.simple_preprocess(text, deacc=True, min_len=minLength)


In [3]:
# Customize tokenizers for feature extraction

import gensim
from nltk.stem.porter import *

class PorterStemmerTokenizer(object):
    def __init__(self):
        self.stemmer = PorterStemmer()
    def __call__(self, doc):
        tokens = gensim.utils.simple_preprocess(doc, deacc=True, min_len=2)
        return [self.stemmer.stem(t) for t in tokens]

    
from nltk.stem import WordNetLemmatizer 

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        tokens = gensim.utils.simple_preprocess(doc, deacc=True, min_len=2)
        return [self.wnl.lemmatize(t) for t in tokens]

In [4]:
# Model evaluation

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics.classification import cohen_kappa_score


def run_CV(feature_matrices, labels, model_list, num_folds):
    '''
    Build a classifier with for each feature matrix and model, and evaluate with 
    a N-fold cross validation. The performance is measured with kappa statistic.
    The output is a list of performance results per model. 
    '''
    cv = KFold(num_folds)    
    result = []
    for model in model_list:
        model_scores = []
        for X in feature_matrices:
            kappa_scores = []
            for train_index, test_index in cv.split(X): # cv
                model.fit(X[train_index], labels.iloc[train_index]) # fit a logit model to the data
                ypred = model.predict(X[test_index]) # make predictions
                kappa = cohen_kappa_score(labels.iloc[test_index], ypred) # get kappa score
                kappa_scores.append(kappa)
            model_scores.append(np.mean(kappa_scores))
        result.append(model_scores)
    return result


def evaluate_on_test_set(vectorizers, model, train_data, test_data):
    scores = []
    for X in vectorizers:
        train_vectors = X.fit_transform(train_data['comment']) # construct feature vectors (train)
        clf = model.fit(train_vectors, train_data['sentiment'].cat.codes) # fit model
        test_vectors = X.transform(test_data['comment']) # construct feature vectors (test)
        predicted = clf.predict(test_vectors) # make predictions
        kappa = cohen_kappa_score(test_data['sentiment'].cat.codes, predicted) # get kappa score
        scores.append(kappa)
    print(scores)
    return scores


In [5]:
# Displaying and styling


def print_topN_features(vectorizer, model, class_labels, n):
    """
    Print features with the highest coefficient values, per class
    """
    feature_names = vectorizer.get_feature_names()
    features = {}
    for i, class_label in enumerate(class_labels):
        topN = np.argsort(model.coef_[i])[-n:]
        features[class_label] = [feature_names[j] for j in topN]
    return pd.DataFrame(features)


def highlight_max(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.max()
    return ['background-color: yellow' if v else '' for v in is_max]


## Load and split data

In [30]:
# Load in the data from CSV

labelledComments = pd.read_csv("data/comments-merged.csv", usecols=['document_id', 'comment', 'sentiment'])
print("# comments labelled:", labelledComments.shape[0])
labelledComments.describe()

# comments labelled: 3388


Unnamed: 0,document_id,comment,sentiment
count,3388,3388,3388
unique,3388,3388,3
top,DOI-2017-0002-135005,DO NOT change the national monument designatio...,pos
freq,1,1,2820


In [31]:
# View class distribution

labelledComments['sentiment'] = labelledComments['sentiment'].astype('category')
print(labelledComments.sentiment.value_counts())
print("Index for class labels:", labelledComments.sentiment.cat.categories)

pos    2820
neg     541
neu      27
Name: sentiment, dtype: int64
Index for class labels: Index(['neg', 'neu', 'pos'], dtype='object')


In [34]:
# Split labelled dataset into train (.8) and test (.2) sets 

train, test = np.split(labelledComments.sample(frac=1), [int(.8*len(labelledComments))])

train.to_csv('data/comments-train.csv', index=False)
test.to_csv('data/comments-test.csv', index=False)

print("# train:", train.shape[0])
print("# test:", validate.shape[0])

# train: 2710
# test: 1017


In [None]:
# # Load data from file

# train = pd.read_csv("data/comments-train.csv", usecols=['document_id', 'comment', 'sentiment'])
# validate = pd.read_csv("data/comments-validate.csv", usecols=['document_id', 'comment', 'sentiment'])
# test = pd.read_csv("data/comments-test.csv", usecols=['document_id', 'comment', 'sentiment'])
# print("# train:", train.shape[0])
# print("# validate:", validate.shape[0])
# print("# test:", test.shape[0])

## Extract features
- baseline
- content v.s. style test: stopwords removal, occurrence count, and tf-idf
- token-level analysis: stemming and lemmatization
- full text v.s. select sentences

### baseline

In [35]:
# Convert a collection of text documents to a bag-of-word matrix 

docs = train['comment']

# baseline: unigram + bigram (binary)
unigram_bigram_v = CountVectorizer(stop_words=None, ngram_range=(1, 2), min_df=5, binary=True)
unigram_bigram = unigram_bigram_v.fit_transform(docs)

### stopwords removal, occurrence count, and tf-idf

In [36]:
# Simply change the parameters of CountVectorizer and TfidfVectorizer 

# unigram + bigram + stopwords removal
unigram_bigram_noStopword_v = CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=5, binary=True)
unigram_bigram_noStopword = unigram_bigram_noStopword_v.fit_transform(docs)

# unigram + bigram (count)
unigram_bigram_count_v = CountVectorizer(stop_words=None, ngram_range=(1, 2), min_df=5, binary=False)
unigram_bigram_count = unigram_bigram_count_v.fit_transform(docs)

# unigram + bigram (tf-idf)
unigram_bigram_tfidf_v = TfidfVectorizer(stop_words=None, ngram_range=(1, 2), min_df=5, binary=False)
unigram_bigram_tfidf = unigram_bigram_tfidf_v.fit_transform(docs)

### stemming and lemmatization

In [37]:
# Pass customized tokenizer objects to CountVectorizer

# unigram + bigram + stemming
unigram_bigram_stem_v = CountVectorizer(tokenizer=PorterStemmerTokenizer(), stop_words=None, ngram_range=(1, 2), min_df=5, binary=True)
unigram_bigram_stem = unigram_bigram_stem_v.fit_transform(docs)

# unigram + bigram + lemmatization
unigram_bigram_lemma_v = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words=None, ngram_range=(1, 2), min_df=5, binary=True)
unigram_bigram_lemma = unigram_bigram_lemma_v.fit_transform(docs)

### select sentences
- split text into sentences
- remove the signature line if there is any
- extract the first three and last three sentences

In [38]:
# # Remove default letter opening, i.e. "Dear Secretary Ryan Zinke, "

# opening = "(Dear Secretary Ryan Zinke,|Dear Secretary Zinke,)"
# labelledComments['comment'] = labelledComments['comment'].map(lambda x: re.sub(opening, " ", x))

In [39]:
# Extract first and last 3 sentences from text and transform them to vectors

train['first3last3'] = train['comment'].map(lambda x: select_sentences(x,3,3))
sents = train['first3last3'] 

# unigram + bigram (first and last 3 sentences)
unigram_bigram_sents_v = CountVectorizer(stop_words=None, ngram_range=(1, 2), min_df=5, binary=True)
unigram_bigram_sents = unigram_bigram_sents_v.fit_transform(sents)

# unigram + bigram + stemming (first and last 3 sentences)
unigram_bigram_sents_stem_v = CountVectorizer(tokenizer=PorterStemmerTokenizer(), stop_words=None, ngram_range=(1, 2), min_df=5, binary=True)
unigram_bigram_sents_stem = unigram_bigram_sents_stem_v.fit_transform(sents)

# unigram + bigram + lemmatization (first and last 3 sentences)
unigram_bigram_sents_lemma_v = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words=None, ngram_range=(1, 2), min_df=5, binary=True)
unigram_bigram_sents_lemma = unigram_bigram_sents_lemma_v.fit_transform(sents)

In [40]:
# Evaluate models with 3-fold cross validation

logit = LogisticRegression()
svm = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=1000, random_state=1)
labels = train['sentiment'].cat.codes

matrices = [unigram_bigram, 
            unigram_bigram_noStopword, unigram_bigram_count, unigram_bigram_tfidf, 
            unigram_bigram_stem, unigram_bigram_lemma,
            unigram_bigram_sents, unigram_bigram_sents_stem, unigram_bigram_sents_lemma]

result = run_CV(matrices, labels, [logit,svm], 3)

In [41]:
# Display results in dataframe and highlight the maximum in each column

df = pd.DataFrame(result).transpose()
df.columns = ['logit', 'svm']
df.index = ['unigram_bigram', 
            'unigram_bigram_noStopword', 'unigram_bigram_count', 'unigram_bigram_tfidf', 
            'unigram_bigram_stem', 'unigram_bigram_lemma',
            'unigram_bigram_sents', 'unigram_bigram_sents_stem', 'unigram_bigram_sents_lemma']

df.style.apply(highlight_max)

Unnamed: 0,logit,svm
unigram_bigram,0.856036,0.858823
unigram_bigram_noStopword,0.855881,0.839656
unigram_bigram_count,0.854713,0.856928
unigram_bigram_tfidf,0.719346,0.805107
unigram_bigram_stem,0.865571,0.868662
unigram_bigram_lemma,0.857727,0.857129
unigram_bigram_sents,0.853581,0.853323
unigram_bigram_sents_stem,0.859907,0.848841
unigram_bigram_sents_lemma,0.843115,0.836457


In [42]:
# Choose one model to examine the most predictive features per class

print_topN_features(unigram_bigram_stem_v, svm, ['neg', 'neu', 'pos'], 20)

Unnamed: 0,neg,neu,pos
0,have access,enjoy bear,all fifteen
1,keep our,even though,marvel
2,plant and,forc,open for
3,growth,enhanc,nation recreat
4,materi,furthermor,myriad
5,may never,it archaeolog,capabl
6,low,firmli,kind of
7,of conserv,countri these,comment as
8,harm,can keep,like our
9,ad,monument boundari,is disgrac


## Experiment with additional negative examples

### Load and slice data into train and test sets

In [None]:
# Load in data

labelled = pd.read_csv("data/comments-merged.csv", usecols=['document_id', 'comment', 'sentiment']).dropna()
labelled['comment'] = labelled['comment'].map(lambda x: re.sub("\r", " ", x))

additional = pd.read_csv("data/relabelled.csv", usecols=['document_id', 'comment', 'Actual']).dropna()
additional.columns = ['document_id', 'comment', 'sentiment']
additional['comment'] = additional['comment'].map(lambda x: re.sub("\r", " ", x))
additional = additional.replace({'sentiment': {'Positive': 'pos', 'Neutral': 'neu', 'Negative': 'neg'}})

print("# labelled comments:", labelled.shape[0])
print("# additional comments:", additional.shape[0])

In [None]:
# Prepare datasets for training and testing
# Approach 1: use a subset of 3000 labelled data as test set

# divide into training set (control group) and test set 
control_train, exp_test = np.split(labelled.sample(frac=1), [int(.7*len(labelled))])

# add additional examples to the control group to form treatment group
treatment_train = control_train.append(additional, ignore_index=True)

In [None]:
# # Approach 2: use CWP samples as test set

# cwp = pd.read_csv("data/cwp-sample.csv", usecols=['ID', 'comment', 'Sentiment']).dropna()
# cwp.columns = ['document_id', 'comment', 'sentiment']
# cwp['comment'] = cwp['comment'].map(lambda x: re.sub("\r", " ", x))
# cwp = cwp.replace({'sentiment': {'Positive': 'pos', 'Neutral': 'neu', 'Negative': 'neg'}})

# cwp['sentences'] = cwp['comment'].map(lambda x: split_into_sentences(x))
# cwp = cwp[cwp['sentences'].map(hasContent)]
# cwp['first_two_sents'] = cwp['sentences'].map(lambda x: " ".join([" ".join(tokenize(sent,2)) for sent in x[:2]]))
# exp_test = cwp.drop_duplicates(subset=['first_two_sents'])[['document_id', 'comment', 'sentiment']]


# # use current labelled data as train set for the control group
# control_train = labelled

# # add additional examples to the control group to form treatment group
# treatment_train = control_train.append(additional, ignore_index=True)

In [None]:
# Drop duplicates 

# based on document id 
treatment_train = treatment_train.drop_duplicates(subset=['document_id'])

# based on first two sentences of comment texts
treatment_train['sentences'] = treatment_train['comment'].map(lambda x: split_into_sentences(x))
treatment_train = treatment_train[treatment_train['sentences'].map(hasContent)]
treatment_train['first_two_sents'] = treatment_train['sentences'].map(lambda x: " ".join([" ".join(tokenize(sent,2)) for sent in x[:2]]))
treatment_train = treatment_train.drop_duplicates(subset=['first_two_sents'])
treatment_train = treatment_train[['document_id', 'comment', 'sentiment']]

In [None]:
# Change the data type of sentiment column

control_train['sentiment'] = control_train['sentiment'].astype('category')
treatment_train['sentiment'] = treatment_train['sentiment'].astype('category')
exp_test['sentiment'] = exp_test['sentiment'].astype('category')

print("# trainging instances in control group:", control_train.shape[0])
print("# trainging instances in treatment group:", treatment_train.shape[0])
print("# testing instances:", exp_test.shape[0])

In [None]:
control_train.sentiment.value_counts()

In [None]:
treatment_train.sentiment.value_counts()

In [None]:
# (Optional) Write to file

control_train.to_csv('data/control_train.csv', index=False)
treatment_train.to_csv('data/treatment_train.csv', index=False)
exp_test.to_csv('data/exp_test.csv', index=False)

### Extract features

In [None]:
# Initialize vectorizers

# baseline: unigram + bigram (binary)
unigram_bigram_v = CountVectorizer(stop_words=None, ngram_range=(1, 2), min_df=5, binary=True)

# unigram + bigram + stopwords removal
unigram_bigram_noStopword_v = CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=5, binary=True)

# unigram + bigram (count)
unigram_bigram_count_v = CountVectorizer(stop_words=None, ngram_range=(1, 2), min_df=5, binary=False)

# unigram + bigram (tf-idf)
unigram_bigram_tfidf_v = TfidfVectorizer(stop_words=None, ngram_range=(1, 2), min_df=5, binary=False)

# unigram + bigram + stemming
unigram_bigram_stem_v = CountVectorizer(tokenizer=PorterStemmerTokenizer(), stop_words=None, ngram_range=(1, 2), min_df=5, binary=True)

# unigram + bigram + lemmatization
unigram_bigram_lemma_v = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words=None, ngram_range=(1, 2), min_df=5, binary=True)

### Evaluate models and display results

In [None]:
# Evaluate models on the test set

vectorizers = [unigram_bigram_v, 
               unigram_bigram_noStopword_v, unigram_bigram_count_v, unigram_bigram_tfidf_v, 
               unigram_bigram_stem_v, unigram_bigram_lemma_v]

logit = LogisticRegression()
logit_control = evaluate_on_test_set(vectorizers, logit, control_train, exp_test)
logit_treatment = evaluate_on_test_set(vectorizers, logit, treatment_train, exp_test)

svm = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=1000, random_state=1)
svm_control = evaluate_on_test_set(vectorizers, svm, control_train, exp_test)
svm_treatment = evaluate_on_test_set(vectorizers, svm, treatment_train, exp_test)

In [None]:
# Display results in dataframe and highlight the maximum in each column

df = pd.DataFrame([logit_control, logit_treatment, svm_control, svm_treatment]).transpose()
df.columns = ['logit_control', 'logit_treatment', 'svm_control', 'svm_treatment']
df.index = ['unigram_bigram', 
            'unigram_bigram_noStopword', 'unigram_bigram_count', 'unigram_bigram_tfidf', 
            'unigram_bigram_stem', 'unigram_bigram_lemma']

df.style.apply(highlight_max)

The results are very different each time. The additional ~400 negative examples seem to improve the performace in general but the best classifier does not always need them. Again the improvement is not consistent enough to be conclusive. 

If we adopt the second approach of using CWP samples as the test set, kappa boosts significantly (~0.97). I think that's an over-estimate because random sampling contains lots of template comments which are relatively easy to classify and less meaningful.

## Optimize using grid search

In [26]:
# Build a pipeline and perform grid search

pipe = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), min_df=5, binary=True)),
                 ('clf', SGDClassifier(loss='hinge', penalty='l2', random_state=1))])

parameters = {'vect__tokenizer': (PorterStemmerTokenizer(), LemmaTokenizer()),
              'vect__stop_words': (None, 'english'), 
              'clf__alpha': (1e-2, 1e-3),
              'clf__n_iter': (200, 500, 1000),
              'clf__class_weight': (None, 'balanced')}

grid = GridSearchCV(pipe, parameters, scoring=make_scorer(cohen_kappa_score), n_jobs=-1)
grid = grid.fit(labelledComments['comment'], labelledComments['sentiment'].cat.codes)

In [27]:
n = 'A national monument does not need 1.35 MILLION acres!!'
labelledComments.sentiment.cat.categories[grid.predict([n])[0]]

'neg'

In [28]:
p = 'Secretary Zinke, I want you to know that I greatly support the Bears Ears National monument. I live in Northern Arizona and feel the wide open spaces of Southern Utah are very special. I enjoy hiking and camping in this area and would like to see it preserved for myself and future generations. Please keep Bears Ears protected!!'
labelledComments.sentiment.cat.categories[grid.predict([p])[0]]

'pos'

In [29]:
print(grid.best_score_)                                  
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, grid.best_params_[param_name]))

0.83561856545
clf__alpha: 0.001
clf__class_weight: 'balanced'
clf__n_iter: 1000
vect__stop_words: None
vect__tokenizer: <__main__.PorterStemmerTokenizer object at 0x126eae198>


## Predict

In [43]:
comments = pd.read_csv("data/comments-cleaned.csv")
print("# comments:", comments.shape[0])
comments.head()

# comments: 246733


Unnamed: 0,document_id,comment,duplicate
0,DOI-2017-0002-0002,Our national monuments are a national treasure...,False
1,DOI-2017-0002-0003,1.We do not want National Monument protection ...,True
2,DOI-2017-0002-0004,The monuments must be preserved. the precedent...,False
3,DOI-2017-0002-0005,My name is Ryan Erik Benally and I'm from Mont...,False
4,DOI-2017-0002-0006,all protections and preservations for the enti...,False


In [49]:
comments['sentiment'] = grid.predict(comments.comment)
comments = comments.replace({'sentiment': {2: 'pos', 1: 'neu', 0: 'neg'}})
comments.head()

Unnamed: 0,document_id,comment,duplicate,prediction
0,DOI-2017-0002-0002,Our national monuments are a national treasure...,False,pos
1,DOI-2017-0002-0003,1.We do not want National Monument protection ...,True,pos
2,DOI-2017-0002-0004,The monuments must be preserved. the precedent...,False,pos
3,DOI-2017-0002-0005,My name is Ryan Erik Benally and I'm from Mont...,False,neg
4,DOI-2017-0002-0006,all protections and preservations for the enti...,False,pos


In [93]:
comments.to_csv('data/comments-labelled-complete.csv', index=False)