In [1]:
import datetime
import logging
import os
import random
import time
import warnings

import csv
import gluonnlp as nlp
import matplotlib.pyplot as plt
import mxnet as mx
import numpy as np
import pandas as pd
import seaborn as sns

from bert import *
from mxboard import SummaryWriter
from mxnet import gluon
from mxnet.gluon.data import Dataset, SimpleDataset
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score,
                             classification_report, confusion_matrix)
from sklearn.model_selection import train_test_split
from sklearn import utils
from tqdm import tqdm

In [2]:
%matplotlib inline

In [3]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# below — confirm that is intended.
warnings.filterwarnings('ignore')

# Root logger: INFO level, each record prefixed with timestamp and level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

In [4]:
# set repeatable random state
# (numpy, the stdlib `random` module and mxnet are seeded separately)
np.random.seed(100)
random.seed(100)
# NOTE(review): mxnet is seeded with 10000 while the others use 100 — confirm this is deliberate.
mx.random.seed(10000)

In [5]:
# apply progress bars for pandas .apply() -> .progress_apply()
# (uses the `tqdm` imported at the top; the following cell rebinds `tqdm` and registers again)
tqdm.pandas()

In [6]:
# make tqdm jupyter friendly
# NOTE: this rebinds the name `tqdm` that was imported in the first cell.
from tqdm import tqdm_notebook as tqdm
# for .progress_apply() we have to hack it like this?
# NOTE(review): `tqdm()` creates a progress-bar instance before `.pandas()` is called on it;
# this is presumably what renders the empty widget below the cell. Check whether calling
# `tqdm.pandas()` on the class is enough with the installed tqdm version.
tqdm().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [7]:
class Timer:
    """Context manager that prints the wall-clock time spent inside its ``with`` block.

    Usage::

        with Timer("training") as t:
            ...
        t.time_delta  # datetime.timedelta of the finished block

    Attributes:
        name: optional label included in the printed message.
        time_start: ``time.time()`` value taken when the block was entered
            (``None`` before the first use).
        time_delta: ``datetime.timedelta`` of the most recently finished block
            (``None`` until a block has finished).
    """

    def __init__(self, name=None):
        self.name = name
        self.time_start = None
        self.time_delta = None

    def __enter__(self):
        self.time_start = time.time()
        # Return the timer so that ``with Timer() as t`` binds a usable object.
        return self

    def __exit__(self, *exc):
        time_end = time.time()
        # Keep the measurement on the instance so callers can reuse it after the block.
        self.time_delta = datetime.timedelta(seconds=(time_end - self.time_start))
        if self.name:
            print("Time for [{}]: {}".format(self.name, self.time_delta))
        else:
            print("Time: {}".format(self.time_delta))
        # Falsy return value: an exception raised inside the block keeps propagating.
        return False

---

In [8]:
# Tab-separated artificial evaluation set (relative path, read in the next cell).
fn_art_eval = "data/artificial_evalset/artificial_evalset.tsv"

In [9]:
# `pd.DataFrame.from_csv` is deprecated and was removed in pandas 1.0; `pd.read_csv`
# with the same arguments reads the file the same way (no index column is used).
artificial_evalset_df = pd.read_csv(fn_art_eval, sep='\t', index_col=None)

# Rename the third column to "type" by position; all other column names are kept.
new_cols = artificial_evalset_df.columns.to_list()
new_cols[2] = "type"
artificial_evalset_df.columns = new_cols

In [10]:
def fix_cols(row):
    """Add the argument-id and topic fields to ``row`` in place and return it.

    ``argument1_id`` is a copy of ``arg_id``; ``argument2_id`` is ``"<arg_id>-<type>"``;
    ``topic`` is the constant ``"gay marriage"``.
    """
    arg_id = row['arg_id']
    pair_id = "{}-{}".format(arg_id, row['type'])

    row["argument1_id"] = arg_id
    row["argument2_id"] = pair_id
    row["topic"] = "gay marriage"
    return row

# Row-wise apply (axis=1): every row is passed through fix_cols and the result replaces the frame.
artificial_evalset_df = artificial_evalset_df.apply(fix_cols, axis=1)

In [11]:
def add_tag(row):
    """Set the constant ``tag`` field on ``row`` in place and return the same object."""
    tag_value = "gay marriage"
    row["tag"] = tag_value
    return row

# Row-wise apply (axis=1): adds the constant `tag` field to every row via add_tag.
artificial_evalset_df = artificial_evalset_df.apply(add_tag, axis=1)

In [12]:
# CPU context passed to the model loader below.
ctx = mx.cpu()
# Only the second returned value (the vocabulary) is kept; the first one is discarded
# (presumably the pretrained BERT network — not needed in this notebook, verify).
_, vocabulary = nlp.model.get_model('bert_12_768_12',
                                    dataset_name='book_corpus_wiki_en_uncased',
                                    pretrained=True, ctx=ctx, use_pooler=True,
                                    use_decoder=False, use_classifier=False)
# BERT tokenizer built on that vocabulary, created with lower=True.
bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)
# Alias under which tokenize_arguments looks the tokenizer up.
tokenizer = bert_tokenizer

In [13]:
# tokenizer from BERT
def tokenize_arguments(row):
    """Tokenize both arguments of ``row`` and attach token-count features.

    Uses the module-level ``tokenizer`` callable. Adds, in this order:
    ``argument1_tokens``, ``argument2_tokens``, ``argument1_len``, ``argument2_len``,
    ``argument12_len_sum``, ``argument12_len_sum_half``, ``argument12_len_diff``
    (argument1 minus argument2) and ``argument12_len_diff_abs``.
    The row is modified in place and returned.
    """
    token_counts = []
    for side in ('argument1', 'argument2'):
        tokens = tokenizer(row[side])
        row[side + '_tokens'] = tokens
        token_counts.append(len(tokens))

    first_len, second_len = token_counts
    row['argument1_len'] = first_len
    row['argument2_len'] = second_len

    # derived features: combined size and (signed / absolute) size difference
    total_len = first_len + second_len
    len_diff = first_len - second_len
    row['argument12_len_sum'] = total_len
    row['argument12_len_sum_half'] = total_len / 2
    row['argument12_len_diff'] = len_diff
    row['argument12_len_diff_abs'] = np.abs(len_diff)
    return row


# Row-wise apply with a progress bar (progress_apply is registered by tqdm's pandas integration above).
artificial_evalset_df = artificial_evalset_df.progress_apply(tokenize_arguments, axis=1)

HBox(children=(IntProgress(value=0, max=175), HTML(value='')))




In [14]:
from nltk.tokenize import sent_tokenize, word_tokenize
# nltk.download('punkt')  # one-time download of the sentence tokenizer models (resource id is 'punkt')


# sentence splitter from NLTK
def sentenize_arguments(row):
    """Split both arguments of ``row`` into sentences and attach sentence-count features.

    Uses ``sent_tokenize``. Adds, in this order: ``argument1_sentences``,
    ``argument2_sentences``, ``argument1_sent_num``, ``argument2_sent_num``,
    ``argument12_sent_num_sum``, ``argument12_sent_num_sum_half``,
    ``argument12_sent_num_diff`` (argument1 minus argument2) and
    ``argument12_sent_num_diff_abs``. The row is modified in place and returned.
    """
    sentence_counts = []
    for side in ('argument1', 'argument2'):
        sentences = sent_tokenize(row[side])
        row[side + '_sentences'] = sentences
        sentence_counts.append(len(sentences))

    first_num, second_num = sentence_counts
    row['argument1_sent_num'] = first_num
    row['argument2_sent_num'] = second_num

    # derived features: combined sentence count and (signed / absolute) difference
    total_num = first_num + second_num
    num_diff = first_num - second_num
    row['argument12_sent_num_sum'] = total_num
    row['argument12_sent_num_sum_half'] = total_num / 2
    row['argument12_sent_num_diff'] = num_diff
    row['argument12_sent_num_diff_abs'] = np.abs(num_diff)
    return row


# Row-wise apply with a progress bar: adds the sentence lists and sentence-count features.
artificial_evalset_df = artificial_evalset_df.progress_apply(sentenize_arguments, axis=1)

HBox(children=(IntProgress(value=0, max=175), HTML(value='')))




In [15]:
def _print_unique_argument_counts(df, indent=''):
    """Print the number of distinct argument1, argument2 and combined argument texts in ``df``."""
    print(indent + 'Unique argument1:', len(df['argument1'].unique()))
    print(indent + 'Unique argument2:', len(df['argument2'].unique()))
    arguments = np.concatenate([df['argument1'].values, df['argument2'].values])
    print(indent + 'Unique total arguments:', len(set(arguments)), '\n')


def _print_length_stats(title, lengths, unit):
    """Print min / max / mean of ``lengths`` under ``title``, labelled with ``unit``."""
    print(title)
    print('\tshortest argument:', min(lengths), ' ' + unit)
    print('\tlongest argument:', max(lengths), ' ' + unit)
    print('\targument average length:', np.mean(lengths), ' ' + unit)


def get_overview(df, task='same-side', class_name='is_same_side'):
    """Print a textual overview of an argument-pair data set.

    Args:
        df: DataFrame with the columns ``tag``, ``argument1``, ``argument2``,
            ``argument1_len``, ``argument2_len``, ``argument1_sent_num`` and
            ``argument2_sent_num``; the ``class_name`` column is optional.
        task: label printed in the header line.
        class_name: name of the class column. The per-class sections are only
            printed when this column exists in ``df``.

    Returns:
        None. Everything is written to stdout.
    """
    has_class = class_name in df.columns

    print("Task: ", task)
    print('=' * 40, '\n')

    print('Total instances: ', len(df))
    print('\n')

    print('For each topic:')
    # Group by the bare column name: grouping by a one-element list yields 1-tuple
    # keys on current pandas, which would print the tag as "('tag',)".
    for tag, tag_df in df.groupby('tag'):
        print(tag, ': ', len(tag_df), ' instances')
        print('')
        _print_unique_argument_counts(tag_df, indent='\t\t')
        if has_class:
            for class_value, side_df in tag_df.groupby(class_name):
                print('\t\t', class_value, ': ', len(side_df), ' instances')
    print('\n')

    if has_class:
        print('For each class value:')
        for class_value, class_df in df.groupby(class_name):
            print(class_value, ': ', len(class_df), ' instances')
            _print_unique_argument_counts(class_df, indent='\t\t')
        print('\n')

    _print_unique_argument_counts(df)

    print('-' * 40, '\n')

    # Length statistics are computed over both argument columns together.
    token_lengths = list(df['argument1_len'].values) + list(df['argument2_len'].values)
    _print_length_stats('Words:', token_lengths, 'words')

    sentence_counts = (list(df['argument1_sent_num'].values)
                       + list(df['argument2_sent_num'].values))
    _print_length_stats('Sentences:', sentence_counts, 'sentences')

In [16]:
# Print the overview for the artificial evaluation set (default class column: is_same_side).
get_overview(artificial_evalset_df, task="same-side artificial evalset")

Task:  same-side artificial evalset

Total instances:  175


For each topic:
gay marriage :  175  instances

		Unique argument1: 25
		Unique argument2: 174
		Unique total arguments: 199 

		 False :  100  instances
		 True :  75  instances


For each class value:
False :  100  instances
		Unique argument1: 25
		Unique argument2: 100
		Unique total arguments: 125 

True :  75  instances
		Unique argument1: 25
		Unique argument2: 74
		Unique total arguments: 99 



Unique argument1: 25
Unique argument2: 174
Unique total arguments: 199 

---------------------------------------- 

Words:
	shortest argument: 7  words
	longest argument: 137  words
	argument average length: 25.597142857142856  words
Sentences:
	shortest argument: 1  sentences
	longest argument: 5  sentences
	argument average length: 1.3314285714285714  sentences


In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) shown as the cell output.
artificial_evalset_df.describe()

Unnamed: 0,arg_id,argument1_id,argument1_len,argument2_len,argument12_len_sum,argument12_len_sum_half,argument12_len_diff,argument12_len_diff_abs,argument1_sent_num,argument2_sent_num,argument12_sent_num_sum,argument12_sent_num_sum_half,argument12_sent_num_diff,argument12_sent_num_diff_abs
count,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0
mean,13.0,13.0,26.24,24.954286,51.194286,25.597143,1.285714,8.52,1.28,1.382857,2.662857,1.331429,-0.102857,0.182857
std,7.231794,7.231794,26.187423,16.712174,41.693007,20.846504,13.850954,10.977427,0.724291,0.814201,1.479959,0.73998,0.429839,0.402211
min,1.0,1.0,7.0,8.0,15.0,7.5,-26.0,0.0,1.0,1.0,2.0,1.0,-2.0,0.0
25%,7.0,7.0,15.0,15.5,30.0,15.0,-6.0,2.0,1.0,1.0,2.0,1.0,0.0,0.0
50%,13.0,13.0,19.0,20.0,39.0,19.5,-1.0,6.0,1.0,1.0,2.0,1.0,0.0,0.0
75%,19.0,19.0,32.0,27.0,54.0,27.0,5.0,10.0,1.0,1.0,3.0,1.5,0.0,0.0
max,25.0,25.0,137.0,108.0,245.0,122.5,72.0,72.0,4.0,5.0,9.0,4.5,1.0,2.0


---

In [18]:
# Column-name lists (not referenced again in the visible part of this notebook).
# Feature columns including ids and topic:
names_columns_X = ['argument1', 'argument2', 'argument1_id', 'argument2_id', 'topic']
# Reduced feature columns, with `tag` in place of the ids and `topic`:
names_columns_X2 = ['argument1', 'argument2', 'tag']
# Label column:
names_columns_y = ['is_same_side']

In [25]:
def compute_metrics(conf_mat, precision=3, dump=True):
    """Compute accuracy, precision, recall and F1 from a binary confusion matrix.

    Args:
        conf_mat: array-like with exactly four entries, read in row-major order as
            ``tn, fp, fn, tp`` (for example ``[[tn, fp], [fn, tp]]``).
        precision: number of decimal places used when printing.
        dump: if true, print the four scores.

    Returns:
        Tuple ``(precision, recall, f1, accuracy)``. A score whose denominator is
        zero (for example precision when nothing was predicted positive) is
        reported as ``0.0`` instead of NaN.

    Raises:
        ValueError: if ``conf_mat`` does not contain exactly four entries.
    """
    conf_mat = np.asarray(conf_mat)
    if conf_mat.size != 4:
        raise ValueError(
            "compute_metrics expects a 2x2 confusion matrix, got shape {}".format(
                conf_mat.shape))
    tn, fp, fn, tp = conf_mat.ravel()

    def _ratio(numerator, denominator):
        # Guard against 0/0, which would otherwise produce NaN and a RuntimeWarning.
        if denominator == 0:
            return 0.0
        return numerator / denominator

    acc = _ratio(tp + tn, tp + tn + fp + fn)
    prec = _ratio(tp, tp + fp)
    rec = _ratio(tp, tp + fn)
    f1 = _ratio(2 * (prec * rec), prec + rec)

    if dump:
        for caption, value in (("accuracy", acc), ("precision", prec),
                               ("recall", rec), ("f1-score", f1)):
            print("{:>10}: {:.{digits}f}".format(caption, value, digits=precision))

    return prec, rec, f1, acc


def compute_metrics2(labels, preds, precision=3, averaging="macro", dump=True):
    """Score ``preds`` against ``labels`` with the scikit-learn metric functions.

    Precision, recall and F1 are computed with ``labels=[0, 1]`` and
    ``average=averaging``.
    NOTE(review): with macro averaging this presumably counts a class that is
    absent from the data as a zero score — confirm this is wanted for
    single-class subsets.

    Args:
        labels: ground-truth labels.
        preds: predicted labels.
        precision: number of decimal places used when printing.
        averaging: value passed as ``average`` to the scikit-learn scorers.
        dump: if true, print the four scores.

    Returns:
        Tuple ``(precision, recall, f1, accuracy)``.
    """
    scorer_kwargs = dict(labels=[0, 1], average=averaging)

    acc = accuracy_score(labels, preds)
    prec = precision_score(labels, preds, **scorer_kwargs)
    rec = recall_score(labels, preds, **scorer_kwargs)
    f1 = f1_score(labels, preds, **scorer_kwargs)

    if dump:
        report_rows = (
            ("accuracy", acc),
            ("precision", prec),
            ("recall", rec),
            ("f1-score", f1),
        )
        for caption, value in report_rows:
            print("{:>10}: {:.{prec}f}".format(caption, value, prec=precision))

    return prec, rec, f1, acc

In [20]:
def heatconmat(y_test, y_pred):
    """Plot the confusion matrix of ``y_pred`` against ``y_test`` as an annotated heatmap.

    The tick labels are the sorted union of the labels found in ``y_test`` and
    ``y_pred``, and the same list is passed to ``confusion_matrix``. The axes
    therefore always match the matrix, even when a predicted label never occurs
    in ``y_test``.

    Args:
        y_test: ground-truth labels (rows of the matrix).
        y_pred: predicted labels (columns of the matrix).
    """
    # np.unique returns the distinct labels in sorted order.
    tick_labels = np.unique(np.concatenate([np.ravel(y_test), np.ravel(y_pred)]))

    sns.set_context('talk')
    plt.figure(figsize=(9, 6))
    sns.heatmap(confusion_matrix(y_test, y_pred, labels=tick_labels),
                annot=True,
                fmt='d',
                cbar=False,
                cmap='gist_earth_r',
                xticklabels=tick_labels,
                yticklabels=tick_labels)
    plt.xlabel('predicted')
    plt.ylabel('true')
    plt.show()


def report_training_results(y_test, y_pred, name=None, heatmap=True):
    """Print a full evaluation report for ``y_pred`` against ``y_test``.

    Prints the confusion matrix, the scores from ``compute_metrics`` and
    ``compute_metrics2``, optionally the heatmap, the rounded accuracy and the
    scikit-learn classification report.

    Args:
        y_test: ground-truth labels.
        y_pred: predicted labels.
        name: optional label shown in the report heading.
        heatmap: if true, draw the confusion matrix with ``heatconmat``.

    Returns:
        Dict with the macro- and micro-averaged F1 scores, rounded to two
        decimals, under the keys ``'macro'`` and ``'micro'``.
    """
    print('Confusion Matrix:')
    conf_mat = confusion_matrix(y_test, y_pred)
    print(conf_mat)
    print()

    compute_metrics(conf_mat)
    compute_metrics2(y_test, y_pred)
    if heatmap:
        heatconmat(y_test, y_pred)
    print()

    rounded_accuracy = round(accuracy_score(y_test, y_pred), 2)
    print('Accuracy: ', rounded_accuracy, '\n')

    heading_suffix = " for [{}]".format(name) if name else ""
    print('Report{}:'.format(heading_suffix))
    print(classification_report(y_test, y_pred))

    return {
        averaging: round(f1_score(y_pred=y_pred, y_true=y_test, average=averaging), 2)
        for averaging in ('macro', 'micro')
    }

In [21]:
# Tab-separated file with the evaluation set plus model predictions (read in the next cell).
fn = "data/artificial_evalset/artificial_evalset.pred.tsv"

In [22]:
# `pd.DataFrame.from_csv` is deprecated and was removed in pandas 1.0. Its defaults
# (first column as index, parse_dates=True) are spelled out so `pd.read_csv` loads
# the file the same way.
# NOTE: this rebinds `artificial_evalset_df`; the frame built in the cells above is replaced.
artificial_evalset_df = pd.read_csv(fn, sep='\t', index_col=0, parse_dates=True)

In [28]:
# NEG criteria are currently scored against their original (uninverted) labels.
# To invert a 0|1 value v use abs(v - 1); scoring inverted NEG subsets would mean
# applying that to both the labels and the predictions before computing the metrics.


def _dump_subset_metrics(labels, preds, *header):
    """Print ``header``, then the ``compute_metrics2`` scores for the subset, then a blank line."""
    print(*header)
    compute_metrics2(labels, preds)
    print()


def dump_art_eval_results(artificial_evalset_df):
    """Print metrics for every prediction column of the artificial evaluation set.

    A prediction column is any column whose name starts with ``"preds-"``; the
    rest of the name is reported as the model name. For each model (in sorted
    column order) metrics are printed for: all rows, each value of the ``type``
    column, all types containing ``"NEG"``, and all types not containing ``"NEG"``.

    Args:
        artificial_evalset_df: DataFrame with the columns ``is_same_side``,
            ``type`` and one ``preds-<model>`` column per model.

    Returns:
        None. Everything is written to stdout.
    """
    prefix = "preds-"
    pred_cols = sorted(
        c for c in artificial_evalset_df.columns.tolist() if c.startswith(prefix))
    if not pred_cols:
        # Nothing to report; no other column is touched in this case.
        return

    # The NEG / non-NEG subsets do not depend on the model, so build them once.
    criteria = artificial_evalset_df["type"].unique().tolist()
    neg_cols = [c for c in criteria if "NEG" in c]
    pos_cols = [c for c in criteria if "NEG" not in c]
    all_neg_df = artificial_evalset_df.loc[artificial_evalset_df["type"].isin(neg_cols)]
    all_pos_df = artificial_evalset_df.loc[artificial_evalset_df["type"].isin(pos_cols)]

    banner = "#" * 60
    for col in pred_cols:
        print(banner)
        print("Model:", col[len(prefix):])
        print(banner)
        print()

        _dump_subset_metrics(artificial_evalset_df["is_same_side"].values,
                             artificial_evalset_df[col].values,
                             "All (uninverted NEG labels):")

        for crit, crit_df in artificial_evalset_df.groupby("type"):
            # The per-criterion labels are cast from bool to int32 before scoring.
            crit_df = crit_df[["is_same_side", col]].astype({"is_same_side": "int32"})
            _dump_subset_metrics(crit_df["is_same_side"].values,
                                 crit_df[col].values,
                                 "Criterion:", crit)

        _dump_subset_metrics(all_neg_df["is_same_side"].values,
                             all_neg_df[col].values,
                             "All negs:", neg_cols)

        _dump_subset_metrics(all_pos_df["is_same_side"].values,
                             all_pos_df[col].values,
                             "All pos:", pos_cols)

        print("\n")

In [29]:
# Print the per-model metrics for the frame loaded from the predictions file.
dump_art_eval_results(artificial_evalset_df)

############################################################
Model: cross_traindev_epi128_BCE
############################################################

All (uninverted NEG labels):
  accuracy: 0.480
 precision: 0.528
    recall: 0.520
  f1-score: 0.457

Criterion: CIT
  accuracy: 0.720
 precision: 0.500
    recall: 0.360
  f1-score: 0.419

Criterion: CIT-NEG
  accuracy: 0.320
 precision: 0.500
    recall: 0.160
  f1-score: 0.242

Criterion: CON
  accuracy: 0.960
 precision: 0.500
    recall: 0.480
  f1-score: 0.490

Criterion: CON-NEG
  accuracy: 0.240
 precision: 0.500
    recall: 0.120
  f1-score: 0.194

Criterion: DIFF
  accuracy: 0.720
 precision: 0.500
    recall: 0.360
  f1-score: 0.419

Criterion: DIFF-NEG
  accuracy: 0.320
 precision: 0.500
    recall: 0.160
  f1-score: 0.242

Criterion: NEG
  accuracy: 0.080
 precision: 0.500
    recall: 0.040
  f1-score: 0.074

All negs: ['NEG', 'CON-NEG', 'DIFF-NEG', 'CIT-NEG']
  accuracy: 0.240
 precision: 0.500
    recall: 0.120
  f1-s

In [None]:
# convert bool into int32
# artificial_evalset_df.astype({"is_same_side": "int32"})