## Project w266 Spring '19

By Jonathan Zhou

The dataset used can be found at https://github.com/bwallace/ACL-2014-irony


In [1]:
import pdb
import sys
import collections
from collections import defaultdict
import re
import itertools
import sqlite3
import string

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import KFold
from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

import pandas as pd
import numpy as np
import statsmodels.api as sm

from __future__ import print_function
import collections
import os
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential, load_model, Model
from keras.layers import LSTM, Embedding, Dropout, Input, TimeDistributed, Bidirectional, concatenate
from keras.layers import Dense, Activation, Flatten, Conv1D, MaxPooling1D, Embedding, Dropout
from keras.optimizers import Adam, RMSprop
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.regularizers import l2
import argparse

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Utilities

In [2]:
# SQLite database file queried by the helper functions in this notebook.
db_path = "ironate.db"
# A single module-level connection and cursor are shared by all helpers below.
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Separator text: two newlines, a rule of 50 dashes, then a newline.
comment_sep_str = "\n\n"+"-"*50+"\n"

def _make_sql_list_str(ls):
    return "(" + ",".join([str(x_i) for x_i in ls]) + ")"

# labeler_id values that the queries in this notebook restrict themselves to.
labelers_of_interest = [2,4,5,6]
# The same IDs pre-rendered as "(2,4,5,6)" for splicing into SQL "in" clauses.
labeler_id_str = _make_sql_list_str(labelers_of_interest)

def _grab_single_element(result_set, COL=0):
    return [x[COL] for x in result_set]

def get_all_comment_ids():
    """Return the distinct comment IDs labeled by any labeler of interest."""
    query = '''select distinct comment_id from irony_label where labeler_id in %s;''' % labeler_id_str
    matching_rows = cursor.execute(query)
    return _grab_single_element(matching_rows)

def get_ironic_comment_ids():
    """Return the distinct IDs of comments given label=1 by a labeler of interest."""
    query = (
        '''select distinct comment_id from irony_label 
            where label=1 and labeler_id in %s;''' % labeler_id_str)
    # execute() returns the cursor itself, so the fetch can be chained.
    ironic_rows = cursor.execute(query).fetchall()
    return _grab_single_element(ironic_rows)

def context_stats():
    """Report how often labelers recorded a forced decision, and relate it to irony.

    For every labeler of interest, prints the ratio of distinct comments they
    labeled with ``forced_decision=1`` to distinct comments they labeled with
    ``forced_decision=0``. Then fits a logistic regression whose response is
    "some labeler of interest recorded forced_decision=1 for this comment" and
    whose single predictor is "this comment was labeled ironic", and prints
    the fit summary.

    Returns:
        The fitted statsmodels Logit results object.
    """
    all_comment_ids = get_all_comment_ids()

    # Sets keep the per-comment membership tests below O(1).
    forced_decisions = set(_grab_single_element(cursor.execute(
                '''select distinct comment_id from irony_label where forced_decision=1 and labeler_id in %s;''' %
                    labeler_id_str)))

    for labeler in labelers_of_interest:
        labeler_forced_decisions = _grab_single_element(cursor.execute(
                '''select distinct comment_id from irony_label where forced_decision=1 and labeler_id = %s;''' %
                    labeler))

        # NOTE(review): despite the name, this only selects forced_decision=0
        # rows, so the ratio below is forced / unforced -- confirm that every
        # labeled comment has a forced_decision=0 row if "forced / all" is meant.
        all_labeler_decisions = _grab_single_element(cursor.execute(
                '''select distinct comment_id from irony_label where forced_decision=0 and labeler_id = %s;''' %
                    labeler))

        p_labeler_forced = float(len(labeler_forced_decisions))/float(len(all_labeler_decisions))
        # Apply the format string; passing the values as extra print()
        # arguments would emit the literal "%s" placeholders.
        print ("labeler %s: %s" % (labeler, p_labeler_forced))

    ironic_comments = set(get_ironic_comment_ids())

    # Response: 1.0 when the comment had a forced decision.
    y = [1.0 if c_id in forced_decisions else 0.0 for c_id in all_comment_ids]
    # Single predictor: 1.0 when the comment was labeled ironic.
    X = [[1.0] if c_id in ironic_comments else [0.0] for c_id in all_comment_ids]

    X = sm.add_constant(X, prepend=True)
    logit_mod = sm.Logit(y, X)
    logit_res = logit_mod.fit()

    print (logit_res.summary())
    return logit_res

def grab_comments(comment_id_list, verbose=False):
    """Fetch the text of each comment by joining its segments in index order.

    Args:
        comment_id_list: iterable of comment IDs to look up.
        verbose: when True, print each assembled comment.

    Returns:
        A list with one entry per requested ID, in input order: the comment's
        segments joined with single spaces, UTF-8 encoded and stripped.
    """
    # A bound parameter replaces string interpolation, so an ID can never
    # alter the statement text.
    segment_query = ("select text from irony_commentsegment "
                     "where comment_id=? order by segment_index")
    comments_list = []
    for comment_id in comment_id_list:
        # Bound as text so SQLite applies the column's affinity to it, the
        # same comparison a quoted literal would get.
        cursor.execute(segment_query, (str(comment_id),))
        segments = _grab_single_element(cursor.fetchall())
        comment = " ".join(segments)
        if verbose:
            print (comment)
        comments_list.append(comment.encode('utf-8').strip())
    return comments_list

def _get_entries(a_list, indices):
    return [a_list[i] for i in indices]

def get_labeled_thrice_comments():
    ''' Return the IDs of comments labeled by at least 3 distinct labelers. '''
    query = '''select comment_id from irony_label group by comment_id having count(distinct labeler_id) >= 3;'''
    # execute() returns the cursor itself, so the fetch can be chained.
    grouped_rows = cursor.execute(query).fetchall()
    return _grab_single_element(grouped_rows)

In [51]:
def data_transformation ():
    """Build aligned metadata features, cleaned comment texts and labels.

    Every returned sequence has one entry per comment ID labeled by a labeler
    of interest, in the order returned by ``get_all_comment_ids``. Metadata
    rows are looked up by comment ID rather than by query position, so row
    ``i`` of ``metadata`` always describes ``comment_texts[i]`` and ``y[i]``.

    Returns:
        (metadata, comment_texts, y) where
        metadata: numpy array with 9 columns per comment --
            0 comment id, 1 subreddit index, 2 redditor index, 3 upvotes,
            4 emoticon present, 5 '!' present, 6 '?' present,
            7 run of two or more '?'/'!' present,
            8 an upper-case token longer than 2 characters present.
        comment_texts: list of comments lower-cased with digits and
            punctuation removed.
        y: list with 1 for comments labeled ironic and -1 otherwise.

    Raises:
        ValueError: if a labeled comment has no row in ``irony_comment``.

    Note:
        ``string.maketrans`` and the two-argument ``translate`` used for
        punctuation stripping are the Python 2 byte-string API.
    """
    all_comment_ids = get_all_comment_ids()
    # Set membership keeps the label loop linear.
    ironic_comment_ids = set(get_ironic_comment_ids())

    def _clean(value):
        return value.encode('utf-8').strip()

    # Map each name to the position of its first occurrence in the distinct
    # listing (the value list.index would return), without a scan per row.
    user_index = {}
    for position, row in enumerate(cursor.execute("select distinct redditor from irony_comment")):
        user_index.setdefault(_clean(row[0]), position)
    subreddit_index = {}
    for position, row in enumerate(cursor.execute("select distinct subreddit from irony_comment")):
        subreddit_index.setdefault(_clean(row[0]), position)

    # Key metadata by comment id: this query is neither ordered nor filtered
    # by labeler, so its row positions need not match all_comment_ids.
    metadata_by_id = {}
    for row in cursor.execute("select id, subreddit, redditor, upvotes, downvotes from irony_comment where id in (select distinct comment_id from irony_label);"):
        metadata_by_id[row[0]] = [row[0], subreddit_index[_clean(row[1])], user_index[_clean(row[2])],
                                  row[3], 0, 0, 0, 0, 0]

    missing_ids = [id_ for id_ in all_comment_ids if id_ not in metadata_by_id]
    if missing_ids:
        raise ValueError("no irony_comment row for labeled comment ids: %s" % missing_ids[:10])

    metadata = np.array([metadata_by_id[id_] for id_ in all_comment_ids])

    comment_texts = grab_comments(all_comment_ids)
    y = [1 if id_ in ironic_comment_ids else -1 for id_ in all_comment_ids]

    # adding some features here; they are stored as 0/1 flags in the
    # metadata columns listed next to each pattern.
    emoticon_RE_str = '(?::|;|=)(?:-)?(?:\)|\(|D|P)'
    question_mark_RE_str = '\?'
    exclamation_point_RE_str = '\!'
    # any combination of multiple exclamation points and question marks
    interrobang_RE_str = '[\?\!]{2,}'

    flag_patterns = [
        (4, re.compile(emoticon_RE_str)),
        (5, re.compile(exclamation_point_RE_str)),
        (6, re.compile(question_mark_RE_str)),
        (7, re.compile(interrobang_RE_str)),
    ]
    identity_table = string.maketrans("", "")

    for i, comment in enumerate(comment_texts):
        # Flags are computed on the raw text, before case/punctuation are lost.
        for column, pattern in flag_patterns:
            if pattern.search(comment):
                metadata[i][column] = 1
        if any(len(s) > 2 and s.isupper() for s in comment.split(" ")):
            metadata[i][8] = 1

        comment = re.sub(r'\d+', '', comment.lower())
        comment_texts[i] = comment.translate(identity_table, string.punctuation)

    return metadata, comment_texts, y

def tokenizer(input_comments):
    """Fit a Keras ``Tokenizer`` on the comments and encode them.

    The vocabulary cap is read from the notebook-level ``MAX_NB_WORDS``.

    Returns:
        (word_index, sequences): the fitted tokenizer's ``word_index`` and the
        result of ``texts_to_sequences`` on ``input_comments``.
    """
    keras_tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    keras_tokenizer.fit_on_texts(input_comments)
    encoded_comments = keras_tokenizer.texts_to_sequences(input_comments)
    return keras_tokenizer.word_index, encoded_comments

In [4]:
## Data Profiling
print(labeler_id_str) # SQL list of the labeler IDs that the helper queries filter on
print('Number of comments: ', len(get_all_comment_ids())) # distinct comments labeled by those labelers (3550 in the recorded output)
print('Number of ironic comments: ', len(get_ironic_comment_ids())) # comments given label=1 by any of those labelers (996 in the recorded output)

# Number of distinct segment_id values in irony_label.
cursor.execute('''select count(*) from (select distinct segment_id from irony_label);''')
print (cursor.fetchall())

# Number of distinct labeled comments, this time across ALL labelers.
all_comment_ids = _grab_single_element(cursor.execute('''select distinct comment_id from irony_label'''))
print (len(all_comment_ids))

# Number of distinct ids in irony_comment.
cursor.execute('''select count(*) from (select distinct id from irony_comment);''')
print (cursor.fetchall())

# Number of distinct redditors (one group per redditor).
cursor.execute('''select redditor, count(*) from (select distinct id, redditor from irony_comment) group by redditor;''')
print (len(cursor.fetchall()))


(2,4,5,6)
Number of comments:  3550
Number of ironic comments:  996
[(11072,)]
3550
[(10039,)]
6319


## Baseline BOW Implementation from the 2014 paper by Byron C. Wallace et al.

In [15]:
def ml_bow(show_features=False):
    """Baseline linear SVM with 5-fold cross-validation plus an error analysis.

    Reads the notebook-level ``x`` (feature matrix, indexable by an index
    array) and ``y`` (labels, 1 = ironic, -1 = not ironic).
    NOTE(review): assumes ``x``/``y`` rows follow the same order as the
    ``all_comment_ids`` query below -- confirm against the cell that builds them.

    For each fold an ``SGDClassifier`` (hinge loss) is tuned over ``alpha``
    with ``GridSearchCV`` and evaluated on the held-out fold. Every held-out
    prediction is recorded as mistake / no mistake, and a logistic regression
    of "mistake" on [irony indicator, forced-decision indicator] is fitted.
    Prints the regression summary and the list of per-fold F1 scores.

    Args:
        show_features: accepted for compatibility; not used by this function.
    """
    all_comment_ids = _grab_single_element(cursor.execute(
                '''select distinct comment_id from irony_label where labeler_id in %s;''' %labeler_id_str))

    # Sets keep the per-prediction membership tests O(1).
    ironic_comment_ids = set(get_ironic_comment_ids())

    forced_decision_ids = set(_grab_single_element(cursor.execute(
                '''select distinct comment_id from irony_label where forced_decision=1 and labeler_id in %s;''' %
                    labeler_id_str)))

    # Legacy sklearn.cross_validation signature: KFold(n, n_folds=...) is
    # itself iterable and yields (train_indices, test_indices).
    kf = KFold(len(y), n_folds=5, shuffle=True)
    X_context, y_mistakes = [], []
    Fs = []
    for train, test in kf:
        test_ids = _get_entries(all_comment_ids, test)
        y_train = _get_entries(y, train)
        y_test = _get_entries(y, test)

        X_train, X_test = x[train], x[test]
        svm = SGDClassifier(loss="hinge", penalty="l2", class_weight="balanced", alpha=.01)
        parameters = {'alpha':[.001, .01,  .1]}
        clf = GridSearchCV(svm, parameters, scoring='f1')
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)

        tp, fp, tn, fn = 0,0,0,0

        for cur_id, y_i, pred_y_i in zip(test_ids, y_test, preds):
            irony_indicator = 1 if cur_id in ironic_comment_ids else 0
            forced_decision_indicator = 1 if cur_id in forced_decision_ids else 0
            # Column order fixes the coefficient names after add_constant:
            # x1 is the coefficient for irony (overall);
            # x2 is the coefficient for forced decisions (i.e., context).
            X_context.append([irony_indicator, forced_decision_indicator])

            if y_i == 1:
                # ironic
                if pred_y_i == 1:
                    # true positive
                    tp += 1
                    y_mistakes.append(0)
                else:
                    # false negative
                    fn += 1
                    y_mistakes.append(1)
            else:
                # unironic
                if pred_y_i == -1:
                    # true negative
                    tn += 1
                    y_mistakes.append(0)
                else:
                    # false positive
                    fp += 1
                    y_mistakes.append(1)

        # A fold with no positives, or no positive predictions, scores 0.0
        # instead of raising ZeroDivisionError.
        recall = tp/float(tp + fn) if (tp + fn) else 0.0
        precision = tp/float(tp + fp) if (tp + fp) else 0.0
        if precision + recall:
            f1 = 2* (precision * recall) / (precision + recall)
        else:
            f1 = 0.0
        Fs.append(f1)

    X_context = sm.add_constant(X_context, prepend=True)
    logit_mod = sm.Logit(y_mistakes, X_context)
    logit_res = logit_mod.fit()

    print (logit_res.summary())

    print (Fs)

## Improved BOW Implementation

In [66]:
def new_bow(x, metadata, y, verbose=False):
    """Train and score a linear SVM on token features joined with metadata.

    The rows are shuffled, the last ``VALIDATION_SPLIT`` fraction (a
    notebook-level constant) is held out, and an ``SGDClassifier`` (hinge
    loss) is tuned over ``alpha`` with ``GridSearchCV`` on the remainder.

    Args:
        x: 2-D array of per-comment features.
        metadata: 2-D array concatenated to ``x`` column-wise.
        y: array of labels indexable by an index array (1 / -1).
        verbose: when True, print every misclassified validation row and the
            false-positive / false-negative counts.

    Returns:
        The tuned classifier's score on the held-out rows.
    """
    features = np.concatenate((x, metadata), 1)

    order = np.arange(features.shape[0])
    np.random.shuffle(order)
    features = features[order]
    labels = y[order]
    nb_validation_samples = int(VALIDATION_SPLIT * features.shape[0])

    x_train, x_val = features[:-nb_validation_samples], features[-nb_validation_samples:]
    y_train, y_val = labels[:-nb_validation_samples], labels[-nb_validation_samples:]

    base_svm = SGDClassifier(loss="hinge", penalty="l2", class_weight="balanced", alpha=.01)
    alpha_grid = {'alpha':[.00001, .0001, .001, .01,  .1]}
    clf = GridSearchCV(base_svm, alpha_grid, scoring='accuracy')
    clf.fit(x_train, y_train)

    if verbose:
        y_preds = clf.predict(x_val)
        false_positives = 0
        false_negatives = 0

        for row, predicted, actual in zip(x_val, y_preds, y_val):
            if predicted != actual:
                print ('-----PREDICTED =', predicted, '------ACTUAL =', actual)
                print (row)
            if predicted == 1 and actual == -1:
                false_positives += 1
            if predicted == -1 and actual == 1:
                false_negatives += 1

        print ('false positives =', false_positives)
        print ('false negatives =', false_negatives)

    return clf.score(x_val, y_val)

## RNN Implementation

In [62]:
def RNN():
    """Build the bidirectional-LSTM classifier over token IDs plus metadata.

    Reads the notebook-level ``V`` (word index), ``EMBEDDING_DIM`` and
    ``MAX_SEQUENCE_LENGTH``. The embedding starts from random weights and is
    trained with the rest of the network.

    Returns:
        An uncompiled Keras ``Model`` taking ``[inputs, meta_inputs]`` (padded
        token IDs and a 9-column metadata vector) and producing one sigmoid
        probability per example.
    """
    embedding_matrix = np.random.random((len(V) + 1, EMBEDDING_DIM))

    inputs = Input(name='inputs',shape=[MAX_SEQUENCE_LENGTH])
    meta_inputs = Input(name='meta_inputs',shape=[9])
    embed = Embedding(len(V) + 1, EMBEDDING_DIM, weights=[embedding_matrix],
                      input_length=MAX_SEQUENCE_LENGTH, trainable=True)(inputs)
    layer = Bidirectional(LSTM(64, dropout=0.1, recurrent_dropout=0.1, kernel_regularizer=l2(0.01)))(embed)
    layer = Dense(60,name='FC1')(layer)
    layer = Activation('tanh')(layer)
    # Metadata features join the text representation just before the output.
    layer = concatenate([layer, meta_inputs])
    layer = Dense(1,name='out_layer')(layer)
    # Sigmoid, not softmax: softmax normalises across units, so over a single
    # unit it always outputs 1.0 and the model could never learn.
    layer = Activation('sigmoid')(layer)

    return Model(inputs=[inputs, meta_inputs], outputs=layer)

## CNN Implementation

In [63]:
def CNN():
    """Build the 1-D convolutional classifier over token IDs plus metadata.

    Reads the notebook-level ``V`` (word index), ``EMBEDDING_DIM`` and
    ``MAX_SEQUENCE_LENGTH``. The embedding starts from random weights and is
    trained with the rest of the network.

    Returns:
        An uncompiled Keras ``Model`` taking ``[sequence_input, meta_inputs]``
        (padded token IDs and a 9-column metadata vector) and producing one
        sigmoid probability per example.
    """
    embedding_matrix = np.random.random((len(V) + 1, EMBEDDING_DIM))

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    meta_inputs = Input(name='meta_inputs',shape=[9])
    embedded_sequences = Embedding(len(V) + 1, EMBEDDING_DIM, weights=[embedding_matrix],
                                   input_length=MAX_SEQUENCE_LENGTH, trainable=True)(sequence_input)
    l_cov1= Conv1D(64, 5, activation='relu')(embedded_sequences)
    l_pool1 = MaxPooling1D(5)(l_cov1)
    l_cov2 = Conv1D(64, 5, activation='relu')(l_pool1)
    l_pool2 = MaxPooling1D(5)(l_cov2)
    l_cov3 = Conv1D(64, 5, activation='relu')(l_pool2)
    # Pool size 35 equals the remaining sequence length only when
    # MAX_SEQUENCE_LENGTH is 1000 (1000 -> 996 -> 199 -> 195 -> 39 -> 35), so
    # it acts as global max pooling for that setting; other lengths change
    # the flattened width.
    l_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling
    l_flat = Flatten()(l_pool3)
    # Metadata features join the text representation just before the output.
    layer = concatenate([l_flat, meta_inputs])
    # Sigmoid, not softmax: softmax over a single unit always outputs 1.0.
    layer = Dense(1,activation='sigmoid')(layer)
    return Model(inputs=[sequence_input, meta_inputs], outputs=layer)

## Execution

In [64]:
# Baseline BOW Training
# ml_bow reads the notebook-level `x` and `y`, so a cell that defines them
# must have been executed before this one. It prints the Logit summary and
# the per-fold F1 scores.
ml_bow()

Optimization terminated successfully.
         Current function value: 0.666518
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 3550
Model:                          Logit   Df Residuals:                     3547
Method:                           MLE   Df Model:                            2
Date:                Fri, 12 Apr 2019   Pseudo R-squ.:                 0.03598
Time:                        23:27:34   Log-Likelihood:                -2366.1
converged:                       True   LL-Null:                       -2454.5
                                        LLR p-value:                 4.401e-39
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3590      0.046      7.725      0.000       0.268       0.450
x1            -1.0706      0.

In [48]:
#LSTM Training
# Hyper-parameters read as globals by tokenizer(), RNN() and new_bow().
MAX_SEQUENCE_LENGTH = 250
MAX_NB_WORDS = 2000
EMBEDDING_DIM = 30
VALIDATION_SPLIT = 0.2
BATCH_SIZE = 64

metadata, comments, y = data_transformation()
V, x = tokenizer(comments)
x = pad_sequences(x, maxlen=MAX_SEQUENCE_LENGTH)
y = np.asarray(y)
# binary_crossentropy expects targets in {0, 1}. `y` keeps its -1/1 coding
# because the BOW functions compare labels against -1 explicitly.
y_binary = (y == 1).astype('float32')

model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

model.fit([x, metadata], y_binary, batch_size=BATCH_SIZE, epochs=10,
          validation_split=VALIDATION_SPLIT, callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

# NOTE(review): run log kept from earlier executions, recorded while the
# targets were still -1/1; presumably "accuracy / loss =
# MAX_SEQUENCE_LENGTH/MAX_NB_WORDS/EMBEDDING_DIM" -- confirm and re-measure.
# .2866 / 22.746 = 250/2000/20
# .3204 / 21.6682 = 250/2000/30

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputs (InputLayer)             (None, 250)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 250, 30)      401310      inputs[0][0]                     
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 128)          48640       embedding_3[0][0]                
__________________________________________________________________________________________________
FC1 (Dense)                     (None, 60)           7740        bidirectional_2[0][0]            
__________________________________________________________________________________________________
activation

<keras.callbacks.History at 0x1c3a53f190>

In [23]:
# CNN Execution
# Hyper-parameters read as globals by tokenizer(), CNN() and new_bow().
# CNN()'s final pooling size assumes MAX_SEQUENCE_LENGTH = 1000.
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 500
EMBEDDING_DIM = 10
VALIDATION_SPLIT = 0.2
BATCH_SIZE = 64

metadata, comments, y = data_transformation()
V, x = tokenizer(comments)
x = pad_sequences(x, maxlen=MAX_SEQUENCE_LENGTH)
y = np.asarray(y)
# binary_crossentropy expects targets in {0, 1}. `y` keeps its -1/1 coding
# because the BOW functions compare labels against -1 explicitly.
y_binary = (y == 1).astype('float32')

model = CNN()
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
model.summary()

model.fit([x, metadata], y_binary, batch_size=BATCH_SIZE, epochs=10,
          validation_split=VALIDATION_SPLIT, callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

# NOTE(review): run log kept from an earlier execution, recorded while the
# targets were still -1/1; presumably "accuracy / loss =
# MAX_SEQUENCE_LENGTH/MAX_NB_WORDS/EMBEDDING_DIM" -- confirm and re-measure.
# .3204 / 21.6682 = 1000/500/10

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1000)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1000, 10)     133770      input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 996, 64)      3264        embedding_2[0][0]                
__________________________________________________________________________________________________
max_pooling1d_1 (MaxPooling1D)  (None, 199, 64)      0           conv1d_1[0][0]                   
__________________________________________________________________________________________________
conv1d_2 (

<keras.callbacks.History at 0x1c301b5b10>

In [78]:
# Improved BOW Training
# Each call reshuffles and re-splits the data, so the five scores differ;
# their mean is reported alongside the individual values.
score_list = []

for i in range(5):
    run_score = new_bow(x, metadata, y)
    score_list.append(run_score)

average_accuracy = sum(score_list) / len(score_list)
print ('Average accuracy for updated BOW =', average_accuracy)
print ('Accuracy scores =', score_list)

Average accuracy for updated BOW = 0.6867605633802818
Accuracy scores = [0.6239436619718309, 0.7225352112676057, 0.6352112676056338, 0.7380281690140845, 0.7140845070422536]
