# This notebook:
- splits labelled data into train, validation, and test sets
- extract features and evaluation performance using 10-fold cross validation
- optimize select model with validation set

## Load and split data

In [1]:
# Load in the data from CSV

import pandas as pd
import re

labelledComments = pd.read_csv("data/comments-to-label.csv", usecols=['document_id', 'comment', 'sentiment']).dropna()
labelledComments['comment'] = labelledComments['comment'].map(lambda x: re.sub("\r", " ", x))

print("# comments labelled:", labelledComments.shape[0])
labelledComments.describe()

# comments labelled: 2500


Unnamed: 0,document_id,comment,sentiment
count,2500,2500,2500
unique,2500,2500,3
top,DOI-2017-0002-138105,I strongly urge you to continue to protect all...,pos
freq,1,1,2363


In [2]:
# View class distribution

labelledComments['sentiment'] = labelledComments['sentiment'].astype('category')
print(labelledComments.sentiment.value_counts())
print("Index for class labels:", labelledComments.sentiment.cat.categories)

pos    2363
neg     120
neu      17
Name: sentiment, dtype: int64
Index for class labels: Index(['neg', 'neu', 'pos'], dtype='object')


In [3]:
# Split labelled dataset into train (.7), validation (.2), and test (.1) sets 

import numpy as np

train, validate, test = np.split(labelledComments.sample(frac=1), [int(.7*len(labelledComments)), int(.9*len(labelledComments))])

train.to_csv('data/comments-train.csv', index=False)
validate.to_csv('data/comments-validate.csv', index=False)
test.to_csv('data/comments-test.csv', index=False)

print("# train:", train.shape[0])
print("# validate:", validate.shape[0])
print("# test:", test.shape[0])

# train: 1750
# validate: 500
# test: 250


In [None]:
# # Load data from file

# import pandas as pd

# train = pd.read_csv("data/comments-train.csv", usecols=['document_id', 'comment', 'sentiment'])
# validate = pd.read_csv("data/comments-validate.csv", usecols=['document_id', 'comment', 'sentiment'])
# test = pd.read_csv("data/comments-test.csv", usecols=['document_id', 'comment', 'sentiment'])
# print("# train:", train.shape[0])
# print("# validate:", validate.shape[0])
# print("# test:", test.shape[0])

## Extract features
- baseline
- content v.s. style test: stopwords removal, occurrence count, and tf-idf
- token-level analysis: stemming and lemmatization
- full text v.s. select sentences

### baseline

In [4]:
# Convert a collection of text documents to a bag-of-word matrix 

from sklearn.feature_extraction.text import CountVectorizer

# baseline: unigram + bigram (binary)
unigram_bigram_v = CountVectorizer(stop_words=None, ngram_range=(1, 2), min_df=5, binary=True)
unigram_bigram = unigram_bigram_v.fit_transform(train["comment"])

### stopwords removal, occurrence count, and tf-idf

In [5]:
# Simply change the parameters of CountVectorizer and TfidfVectorizer 

from sklearn.feature_extraction.text import TfidfVectorizer

# unigram + bigram + stopwords removal
unigram_bigram_noStopword_v = CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=5, binary=True)
unigram_bigram_noStopword = unigram_bigram_noStopword_v.fit_transform(train["comment"])

# unigram + bigram (count)
unigram_bigram_count_v = CountVectorizer(stop_words=None, ngram_range=(1, 2), min_df=5, binary=False)
unigram_bigram_count = unigram_bigram_count_v.fit_transform(train["comment"])

# unigram + bigram (tf-idf)
unigram_bigram_tfidf_v = TfidfVectorizer(stop_words=None, ngram_range=(1, 2), min_df=5, binary=False)
unigram_bigram_tfidf = unigram_bigram_tfidf_v.fit_transform(train["comment"])

### stemming and lemmatization

In [6]:
# Customize tokenizers

import gensim
from nltk.stem.porter import *

class PorterStemmerTokenizer(object):
    def __init__(self):
        self.stemmer = PorterStemmer()
    def __call__(self, doc):
        tokens = gensim.utils.simple_preprocess(doc, deacc=True, min_len=2)
        return [self.stemmer.stem(t) for t in tokens]

    
from nltk.stem import WordNetLemmatizer 

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        tokens = gensim.utils.simple_preprocess(doc, deacc=True, min_len=2)
        return [self.wnl.lemmatize(t) for t in tokens]

In [7]:
# Pass customized tokenizer objects to CountVectorizer

# unigram + bigram + stemming
unigram_bigram_stem_v = CountVectorizer(tokenizer=PorterStemmerTokenizer(), stop_words=None, ngram_range=(1, 2), min_df=5, binary=True)
unigram_bigram_stem = unigram_bigram_stem_v.fit_transform(train["comment"])

# unigram + bigram + lemmatization
unigram_bigram_lemma_v = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words=None, ngram_range=(1, 2), min_df=5, binary=True)
unigram_bigram_lemma = unigram_bigram_lemma_v.fit_transform(train["comment"])

### select sentences
- split text into sentences
- remove the signature line if there is any
- extract the first three and last three sentences

In [None]:
# # Remove default letter opening, i.e. "Dear Secretary Ryan Zinke, "

# opening = "(Dear Secretary Ryan Zinke,|Dear Secretary Zinke,)"
# labelledComments['comment'] = labelledComments['comment'].map(lambda x: re.sub(opening, " ", x))

In [8]:
# Write functions to split sentences, remove signature and extract sentences

import re

def split_into_sentences(text):
    '''
    Function to break text (astring) into sentences (a list of strings).
    # Ref: https://stackoverflow.com/a/31505798
    '''
    caps = "([A-Z])"
    digits = "([0-9])"
    prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
    suffixes = "(Inc|Ltd|Jr|Sr|Co)"
    starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
    acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
    websites = "[.](com|net|org|io|gov)"

    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + caps + "[.] "," \\1<prd> ",text)
    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + caps + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = [s.strip() for s in sentences if len(s.strip())>1]
    return sentences


def remove_signature(sentences):
    '''
    Function to remove the signature line from comment sentences
    '''
    closing = ['thank you','thanks','sincerely','regards']
    last_sent = sentences[-1].lower()
    if any(term in last_sent for term in closing):
        return sentences[:-1]
    return sentences


def select_sentences(text, first_n, last_m):
    '''
    Function to extract the first N and the last M sentences from text, excluding the signature line.
    '''
    sentences = split_into_sentences(text)
    sentences = remove_signature(sentences)
    return " ".join(sentences[:first_n]+sentences[-last_m:])

In [9]:
# Extract first and last 3 sentences from text and transform them to vectors

train['first3last3'] = train['comment'].map(lambda x: select_sentences(x,3,3))

# unigram + bigram (first and last 3 sentences)
unigram_bigram_sents_v = CountVectorizer(stop_words=None, ngram_range=(1, 2), min_df=5, binary=True)
unigram_bigram_sents = unigram_bigram_sents_v.fit_transform(train["first3last3"])

# unigram + bigram + stemming (first and last 3 sentences)
unigram_bigram_sents_stem_v = CountVectorizer(tokenizer=PorterStemmerTokenizer(), stop_words=None, ngram_range=(1, 2), min_df=5, binary=True)
unigram_bigram_sents_stem = unigram_bigram_sents_stem_v.fit_transform(train["first3last3"])

# unigram + bigram + lemmatization (first and last 3 sentences)
unigram_bigram_sents_lemma_v = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words=None, ngram_range=(1, 2), min_df=5, binary=True)
unigram_bigram_sents_lemma = unigram_bigram_sents_lemma_v.fit_transform(train["first3last3"])

In [10]:
# Evaluate models with 10-fold cross validation

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics.classification import cohen_kappa_score


def run_CV(feature_matrices, labels, model_list, num_folds):
    '''
    Build a classifier with for each feature matrix and model, and evaluate with 
    a N-fold cross validation. The performance is measured with kappa statistic.
    The output is a list of performance results per model. 
    '''
    cv = KFold(num_folds)    
    result = []
    for model in model_list:
        model_scores = []
        for X in feature_matrices:
            kappa_scores = []
            for train_index, test_index in cv.split(X): # cv
                model.fit(X[train_index], labels.iloc[train_index]) # fit a logit model to the data
                ypred = model.predict(X[test_index]) # make predictions
                kappa = cohen_kappa_score(labels.iloc[test_index], ypred) # get kappa score
                kappa_scores.append(kappa)
            model_scores.append(np.mean(kappa_scores))
        result.append(model_scores)
    return result


logit = LogisticRegression()
svm = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=400, random_state=1)
labels = train['sentiment'].cat.codes

matrices = [unigram_bigram, 
            unigram_bigram_noStopword, unigram_bigram_count, unigram_bigram_tfidf, 
            unigram_bigram_stem, unigram_bigram_lemma,
            unigram_bigram_sents, unigram_bigram_sents_stem, unigram_bigram_sents_lemma]

result = run_CV(matrices, labels, [logit,svm], 10)

In [11]:
# Display results in dataframe

df = pd.DataFrame(result).transpose()
df.columns = ['logit', 'svm']
df.index = ['unigram_bigram', 
            'unigram_bigram_noStopword', 'unigram_bigram_count', 'unigram_bigram_tfidf', 
            'unigram_bigram_stem', 'unigram_bigram_lemma',
            'unigram_bigram_sents', 'unigram_bigram_sents_stem', 'unigram_bigram_sents_lemma']
df

Unnamed: 0,logit,svm
unigram_bigram,0.63522,0.638145
unigram_bigram_noStopword,0.608708,0.664825
unigram_bigram_count,0.630836,0.617871
unigram_bigram_tfidf,0.235604,0.349349
unigram_bigram_stem,0.685998,0.677692
unigram_bigram_lemma,0.640121,0.6721
unigram_bigram_sents,0.601051,0.63971
unigram_bigram_sents_stem,0.632646,0.673983
unigram_bigram_sents_lemma,0.62181,0.659006


In [12]:
# Choose one model to examine the most predictive features per class

def print_topN_features(vectorizer, model, class_labels, n):
    """
    Print features with the highest coefficient values, per class
    """
    feature_names = vectorizer.get_feature_names()
    for i, class_label in enumerate(class_labels):
        topN = np.argsort(model.coef_[i])[-n:]
        print("%s: %s" % (class_label,", ".join(feature_names[j] for j in topN)))

print_topN_features(unigram_bigram_stem_v, svm, ['neg', 'neu', 'pos'], 10)

neg: mani nation, depart, first time, neighbor commun, but in, and my, it design, human be, cannot allow, list
neu: fit, explor and, re creat, joy, fate, extinct, co, over to, protect wild, by keep
pos: grand, feder, for peopl, profit for, indigen peopl, econom boost, it citizen, know the, proud of, preserv more


## Optimize
- introduce additional features
- adjust class weights
- tune parameter using grid search