In [2]:
import pandas as pd
import random
import numpy as np
from tqdm import tqdm
import ipdb
import re
import csv
from evaluate import load

from nltk.corpus import words
word_list = words.words()
from IPython.core.debugger import set_trace

from sklearn.metrics import precision_score as prec, recall_score as recall, f1_score
import matplotlib.pyplot as plt
# import mplcursors
import seaborn as sns
%matplotlib inline
sns.set(style='darkgrid', context='notebook', rc={'figure.figsize':(14,10)}, font_scale=2)

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('chained_assignment',None)

# Set random seeds for reproducibility on a specific machine
random.seed(1)
np.random.seed(1)
np.random.RandomState(1)

RandomState(MT19937) at 0x104A83340

In [5]:
with open('../outputs/ref-labels.txt', 'r') as f:
    label_sents = [x.strip() for x in f.readlines()]

In [6]:
gold = pd.read_csv('../data/test_data.tsv', sep='\t', quoting=csv.QUOTE_NONE, escapechar="\\")
test = gold[gold.split=='test']
test.ref_expressions = test.ref_expressions.apply(lambda x: eval(x))
test.shape

(318, 21)

## Distribution of labels in test set

In [7]:
test_set = " ".join(label_sents)

print("[IN]: ", test_set.count("[IN]"))
print("[OUT]: ", test_set.count("[OUT]"))
print("[OTHER]: ", test_set.count("[OTHER]"))
print("No tag comments :", len([x for x in label_sents if not(('[IN]' in x) or ('[OUT]' in x) or ('[OTHER]' in x))]))

[IN]:  303
[OUT]:  63
[OTHER]:  46
No tag comments : 77


In [8]:
counts = {'[IN]': ' '.join(label_sents).count('[IN]'), 
          '[OUT]': ' '.join(label_sents).count('[OUT]'), 
          '[OTHER]': ' '.join(label_sents).count('[OTHER]'), 
          'None': len([x for x in label_sents if not(('[IN]' in x) or ('[OUT]' in x) or ('[OTHER]' in x))])}
all_tags_count = counts['[IN]'] + counts['[OUT]'] + counts['[OTHER]']
counts,all_tags_count

({'[IN]': 303, '[OUT]': 63, '[OTHER]': 46, 'None': 77}, 412)

In [9]:
weight_counts = {x: np.round(counts[x]/all_tags_count, 3) for x in ['[IN]', '[OUT]', '[OTHER]']}
# weight_counts['None'] = np.round(counts['None']/len(label_sents), 3)
weight_counts

{'[IN]': 0.735, '[OUT]': 0.153, '[OTHER]': 0.112}

In [10]:
# Instead of weighting F1s by their actual counts, give equal weights to all classes
weight_equal = {'[IN]': 0.34, '[OUT]': 0.33, '[OTHER]': 0.33}

## Confusion matrix and scoring functions

In [12]:
def build_conf_mat(pred_sents, label_sents):
    conf_mat = {'[IN]': 0, '[OUT]': 0, '[OTHER]': 0, 'None': 0}
    label_mat = {'[IN]': conf_mat.copy(), '[OUT]': conf_mat.copy(), '[OTHER]': conf_mat.copy(), 'None': conf_mat.copy()}

    pattern = re.compile(r'(\[IN\]|\[OUT\]|\[OTHER\])')
    for ind in range(len(label_sents)):
        pred = pred_sents[ind]
        ref = label_sents[ind]

        if len(re.findall(pattern, ref))==0:
            if len(re.findall(pattern, pred))==0:
                label_mat['None']['None'] += 1
            else:
                if '[IN]' in pred:
                    label_mat['None']['[IN]'] += 1
                elif '[OUT]' in pred:
                    label_mat['None']['[OUT]'] += 1
                elif '[OTHER]' in pred:
                    label_mat['None']['[OTHER]'] += 1

        for match in pattern.finditer(ref):
            if '[IN]' in pred[match.start()-1:match.end()+1]:
                    label_mat[match.group()]['[IN]'] += 1
            elif '[OUT]' in pred[match.start()-1:match.end()+1]:
                label_mat[match.group()]['[OUT]'] += 1
            elif '[OTHER]' in pred[match.start()-1:match.end()+1]:
                label_mat[match.group()]['[OTHER]'] += 1
            else:
                if '[IN]' in pred[match.start()-3:match.end()+3]:
                    label_mat[match.group()]['[IN]'] += 0.5
                elif '[OUT]' in pred[match.start()-3:match.end()+3]:
                    label_mat[match.group()]['[OUT]'] += 0.5
                elif '[OTHER]' in pred[match.start()-3:match.end()+3]:
                    label_mat[match.group()]['[OTHER]'] += 0.5
                else:
                    if '[IN]' in pred[match.start()-5:match.end()+5]:
                        label_mat[match.group()]['[IN]'] += 0.25
                    elif '[OUT]' in pred[match.start()-5:match.end()+5]:
                        label_mat[match.group()]['[OUT]'] += 0.25
                    elif '[OTHER]' in pred[match.start()-5:match.end()+5]:
                        label_mat[match.group()]['[OTHER]'] += 0.25
                    label_mat[match.group()]['None'] += 1
    return label_mat

def print_scores(label_mat, counts, all_tags_count):
    recs = {}
    precs = {}
    f1s = {}
    tp_count = 0
    fp_count = 0
    for tag in  ['[IN]', '[OUT]', '[OTHER]']:
        tp = label_mat[tag][tag]
        recs[tag] = np.round(tp/(counts[tag]), 3)
        precs[tag] = np.round(tp/sum([label_mat[x][tag] for x in ['[IN]', '[OUT]', '[OTHER]', 'None']]), 3)
        tp_count += tp
        fp_count += sum([label_mat[tag][x] if x!=tag else 0 for x in ['[IN]', '[OUT]', '[OTHER]', 'None']])

    recs['micro_avg'] = np.round(tp_count/all_tags_count, 3)
    precs['micro_avg'] = np.round(tp_count/(tp_count+fp_count), 3)

    for tag in  ['[IN]', '[OUT]', '[OTHER]', 'micro_avg']:
        f1s[tag] = np.round(2*precs[tag]*recs[tag]/(precs[tag]+recs[tag]), 3)

    f1_macro = np.round(np.mean([f1s[x] for x in ['[IN]', '[OUT]', '[OTHER]']]), 3)
    f1_macro_weighted = np.round(sum([f1s[x]*weight_counts[x] for x in ['[IN]', '[OUT]', '[OTHER]']]), 3)
    none_acc = np.round(label_mat['None']['None']/counts['None'], 3)
        
    table = pd.DataFrame([recs, precs, f1s], index=['recall', 'precision', 'f1'])
    display(table, f"Macro F1: {f1_macro}", f"Weighted F1: {f1_macro_weighted}", f"None accuracy: {none_acc}")

# wer = load('wer')

# Calculate recall, precision, F1 for model output of interest here

In [21]:
with open('../outputs/seed-1-sample-sents-chkpt.txt', 'r') as f:
    pred_sents = [x.strip()  for x in f.readlines()]

label_mat = build_conf_mat(pred_sents, label_sents)
print_scores(label_mat, counts, all_tags_count)

Unnamed: 0,[IN],[OUT],[OTHER],micro_avg
recall,0.553,0.365,0.337,0.5
precision,0.898,0.767,1.0,0.519
f1,0.684,0.495,0.504,0.509


'Macro F1: 0.561'

'Weighted F1: 0.635'

'None accuracy: 0.922'

In [156]:
label_mat

{'[IN]': {'[IN]': 185.25, '[OUT]': 4, '[OTHER]': 1.0, 'None': 96},
 '[OUT]': {'[IN]': 6.25, '[OUT]': 31.0, '[OTHER]': 1.5, 'None': 20},
 '[OTHER]': {'[IN]': 6.0, '[OUT]': 1, '[OTHER]': 17.25, 'None': 20},
 'None': {'[IN]': 13, '[OUT]': 0, '[OTHER]': 2, 'None': 62}}

In [162]:
label_mat

{'[IN]': {'[IN]': 169.5, '[OUT]': 2.25, '[OTHER]': 1.0, 'None': 124},
 '[OUT]': {'[IN]': 3, '[OUT]': 33.75, '[OTHER]': 0, 'None': 25},
 '[OTHER]': {'[IN]': 1, '[OUT]': 2.5, '[OTHER]': 14.0, 'None': 27},
 'None': {'[IN]': 9, '[OUT]': 1, '[OTHER]': 0, 'None': 67}}

### Chance F1 scores? With and without uniform prior

In [167]:
random.seed(5)
np.random.seed(5)

conf_mat = {'[IN]': 0, '[OUT]': 0, '[OTHER]': 0, 'None': 0}

label_mat = {'[IN]': conf_mat.copy(), '[OUT]': conf_mat.copy(), '[OTHER]': conf_mat.copy(), 'None': conf_mat.copy()}

pattern = re.compile(r'(\[IN\]|\[OUT\]|\[OTHER\])')

for ind in range(len(label_sents)):
    ref = label_sents[ind]
    if len(re.findall(pattern, ref))==0:
        elem = random.sample(['[IN]', '[OUT]', '[OTHER]', 'None'], counts=[1,1,1,1], k=1)
        label_mat['None'][elem[0]] += 1
    for match in pattern.finditer(ref):
        elem = random.sample(['[IN]', '[OUT]', '[OTHER]', 'None'], counts=[1,1,1,1], k=1)
        label_mat[match.group()][elem[0]] += 1

print_scores(label_mat, counts, all_tags_count)

Unnamed: 0,[IN],[OUT],[OTHER],micro_avg
recall,0.228,0.302,0.152,0.231
precision,0.633,0.162,0.051,0.231
f1,0.335,0.211,0.076,0.231


'Macro F1: 0.207'

'Weighted F1: 0.287'

'None accuracy: 0.247'

In [130]:
random.seed(1)
np.random.seed(1)

conf_mat = {'[IN]': 0, '[OUT]': 0, '[OTHER]': 0, 'None': 0}

label_mat = {'[IN]': conf_mat.copy(), '[OUT]': conf_mat.copy(), '[OTHER]': conf_mat.copy(), 'None': conf_mat.copy()}

pattern = re.compile(r'(\[IN\]|\[OUT\]|\[OTHER\])')

for ind in range(len(label_sents)):
    ref = label_sents[ind]
    if len(re.findall(pattern, ref))==0:
        elem = random.sample(['[IN]', '[OUT]', '[OTHER]', 'None'], counts=[counts[x] for x in ['[IN]', '[OUT]', '[OTHER]', 'None']], k=1)
        label_mat['None'][elem[0]] += 1
    for match in pattern.finditer(ref):
        elem = random.sample(['[IN]', '[OUT]', '[OTHER]', 'None'], counts=[counts[x] for x in ['[IN]', '[OUT]', '[OTHER]', 'None']], k=1)
        label_mat[match.group()][elem[0]] += 1

print_scores(label_mat, counts, all_tags_count)

Unnamed: 0,[IN],[OUT],[OTHER],micro_avg
recall,0.62,0.111,0.043,0.478
precision,0.616,0.143,0.041,0.478
f1,0.618,0.125,0.042,0.478


'Macro F1: 0.262'

'Weighted F1: 0.478'

'None accuracy: 0.143'

## Bootstrap testing different conditions for significance

In [None]:
significance_b_over_a = 0
ling_wp = [58.4, 60.1, 60.3]
ling_wp_temp = [60.4, 60.1, 60.6]
wp_temp = [57.1, 59.5, 56.2]
wp = [56, 58, 55.8]
no_wp = [55.9, 56.3, 58.8]
no_wp_temp = [57.5, 59.5, 60.8]

random.seed(1)

def bootstrap(x, y, num_samples=100000):
    num_b_greater_a = 0
    for i in range(num_samples):
        a = random.choice(x)
        b = random.choice(y)
        if b-a>0.00001:
            num_b_greater_a += 1
    p_sig = 1-np.round((num_b_greater_a/num_samples), 3)
    if p_sig<0.05:
        print("True")

bootstrap(no_wp, ling_wp_temp)