In [5]:
import jsonlines
import numpy as np
from itertools import product
from collections import defaultdict

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# Notebook-wide configuration: seed NumPy's global RNG and set the shared figure style.
np.random.seed(1)
mpl.rcParams.update({
    'figure.dpi': 100,
    'font.family': 'DeJavu Serif',
})

In [6]:
# Metadata describing the 16 name sets used in the evaluation.

# name_sets: set id (1..16) -> [popularity, gender, decade, race].
name_sets = {
    set_id: list(attributes)
    for set_id, attributes in enumerate([
        ('Top', 'Male', '2000s', 'White'),
        ('Top', 'Female', '2000s', 'White'),
        ('Medium', 'Male', '2000s', 'White'),
        ('Medium', 'Female', '2000s', 'White'),
        ('Bottom', 'Male', '2000s', 'White'),
        ('Bottom', 'Female', '2000s', 'White'),
        ('Medium', 'Male', '2000s', 'Black'),
        ('Medium', 'Female', '2000s', 'Black'),
        ('Medium', 'Male', '2000s', 'Asian'),
        ('Medium', 'Female', '2000s', 'Asian'),
        ('Medium', 'Male', '2000s', 'Hispanic'),
        ('Medium', 'Female', '2000s', 'Hispanic'),
        ('Top', 'Male', '1970s', 'White'),
        ('Top', 'Female', '1970s', 'White'),
        ('Top', 'Male', '1940s', 'White'),
        ('Top', 'Female', '1940s', 'White'),
    ], start=1)
}

# dimensions: dimension -> group -> ids of the name sets compared for that group.
# Gender uses every set (odd ids are Male, even ids are Female); the other
# dimensions each compare one Male/Female pair of sets per group.
dimensions = {
    'Gender': {
        'Male': list(range(1, 17, 2)),
        'Female': list(range(2, 17, 2)),
    },
    'Race': {
        'White': [3, 4],
        'Black': [7, 8],
        'Asian': [9, 10],
        'Hispanic': [11, 12],
    },
    'Popularity': {
        'Top': [1, 2],
        'Medium': [3, 4],
        'Bottom': [5, 6],
    },
    'Decade': {
        '2000s': [1, 2],
        '1970s': [13, 14],
        '1940s': [15, 16],
    },
}

In [7]:
# Read the gold annotations into labels[ID][position] = name.
# Each record's values are taken in key order as (ID, position, name) and turned
# into tuples -- presumably so ID and position are hashable dictionary keys.

labels = defaultdict(dict)
with jsonlines.open('../Data/Finetune/Input/labels-test.jsonl', 'r') as reader:
    for record in reader:
        ID, position, name = (tuple(field) for field in record.values())
        labels[ID][position] = name

In [8]:
# Read every model's predictions into preds[model][setup][ID][position] = name.
# A setup is ('original', 'original') for the unmodified model, or
# (context type, name type, seed) for each fine-tuned run.

models = ['spaCy', 'NeuroNER']
types_context = ['clinical', 'general']
types_name = ['diverse', 'popular']
seeds = [0, 1, 2, 3, 4]
setups = [('original', 'original')] + list(product(types_context, types_name, seeds))
preds = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

for model in models:
    for setup in setups:
        postfix = f'{setup[0]}+{setup[1]}-{model}'
        # Only the fine-tuned runs carry a seed suffix in their file name.
        is_finetuned = setup != ('original', 'original')
        if is_finetuned:
            postfix += f'-{setup[2]}'
        with jsonlines.open(f'../Data/Finetune/Output/finetunes-{postfix}.jsonl', 'r') as reader:
            for line in reader:
                ID, position, name = (tuple(field) for field in line.values())
                preds[model][setup][ID][position] = name

### Model: Each Model under Each Setup

- Model & Setup: Overall Precision, Overall Recall, Overall F1, each reported as mean +- standard deviation across seeds

In [9]:
# get the vectorized raw prediction results

def aggregate(model, setup):
    """Build aligned 0/1 label and prediction vectors for one (model, setup).

    Reads the module-level ``labels`` and ``preds`` mappings without modifying
    them: lookups use ``.get`` so that querying an ID (or model/setup) that is
    absent does not auto-create an empty entry in the defaultdicts.

    One element is emitted per gold mention (label 1; prediction 1 only when
    the same position was predicted and the first element of the predicted
    entry equals the first element of the gold entry), followed by one element
    per predicted position that is not a gold position for that ID
    (label 0, prediction 1). A prediction at a gold position whose first
    element differs is therefore counted once, as a miss.

    Parameters
    ----------
    model : key into ``preds``.
    setup : key into ``preds[model]``.

    Returns
    -------
    (vec_labels, vec_preds) : two integer NumPy arrays of equal length. The
    dtype is forced to int so that the arrays support ``&`` / ``~`` even when
    there are no mentions at all (an empty list would otherwise become a
    float64 array).
    """
    setup_preds = preds.get(model, {}).get(setup, {})

    vec_labels, vec_preds = [], []

    # Gold mentions: true positives and false negatives.
    for ID, mentions in labels.items():
        predicted = setup_preds.get(ID, {})
        for position, (name, _, _) in mentions.items():
            vec_labels.append(1)
            hit = position in predicted and name == predicted[position][0]
            vec_preds.append(1 if hit else 0)

    # Predicted positions with no gold mention: false positives.
    for ID, mentions in setup_preds.items():
        gold = labels.get(ID, {})
        for position in mentions:
            if position not in gold:
                vec_labels.append(0)
                vec_preds.append(1)

    return np.array(vec_labels, dtype=int), np.array(vec_preds, dtype=int)

In [10]:
# Precision / recall / F1 for every model and setup.
# Runs that differ only by seed share the key (context type, name type), so each
# metric list holds one score per seed; the printout shows mean +- np.std of that list.
P, R, F = 'Precision', 'Recall', 'F1'
scores = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
for model in models:
    print(f'Model: {model}')
    for setup in setups:
        vec_labels, vec_preds = aggregate(model, setup)
        # The vectors hold 0/1 integers, for which `~x & 1` is the logical negation.
        tp = (vec_labels&vec_preds).sum()
        fp = ((~vec_labels)&vec_preds).sum()
        fn = (vec_labels&(~vec_preds)).sum()
        p = tp/(tp+fp)
        r = tp/(tp+fn)
        f = 2*tp/(2*tp+fp+fn)
        per_setup = scores[model][setup[:2]]
        for metric, score in ((P, p), (R, r), (F, f)):
            per_setup[metric].append(score)
    for setup in scores[model]:
        summary = [f'{metric}: {np.mean(score):.3f} +- {np.std(score):.3f}' for metric, score in scores[model][setup].items()]
        print(f'Setup: {setup[0]} context & {setup[1]} name | ' + ' | '.join(summary))

Model: spaCy
Setup: original context & original name | Precision: 0.916 +- 0.000 | Recall: 0.623 +- 0.000 | F1: 0.741 +- 0.000
Setup: clinical context & diverse name | Precision: 0.990 +- 0.007 | Recall: 0.950 +- 0.006 | F1: 0.969 +- 0.002
Setup: clinical context & popular name | Precision: 0.998 +- 0.004 | Recall: 0.737 +- 0.072 | F1: 0.846 +- 0.046
Setup: general context & diverse name | Precision: 0.915 +- 0.072 | Recall: 0.830 +- 0.083 | F1: 0.864 +- 0.035
Setup: general context & popular name | Precision: 0.873 +- 0.110 | Recall: 0.492 +- 0.069 | F1: 0.629 +- 0.083
Model: NeuroNER
Setup: original context & original name | Precision: 0.955 +- 0.000 | Recall: 0.953 +- 0.000 | F1: 0.954 +- 0.000
Setup: clinical context & diverse name | Precision: 0.978 +- 0.014 | Recall: 0.978 +- 0.009 | F1: 0.978 +- 0.005
Setup: clinical context & popular name | Precision: 0.989 +- 0.003 | Recall: 0.865 +- 0.021 | F1: 0.923 +- 0.013
Setup: general context & diverse name | Precision: 0.958 +- 0.022 |

### Name: Each Dimension

- Dimension vs (Model * Setup): Mean Absolute Difference in Recall (Recall Equality Difference), reported as mean +- standard deviation across seeds

In [11]:
# For every gold mention, record per (name set, model, setup) whether it was
# recovered: 1 = true positive, 0 = false negative.

set2model2setup2raw = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
for ID, mentions in labels.items():
    for position, (name, _, set_) in mentions.items():
        for model in models:
            for setup in setups:
                predicted = preds[model][setup][ID]
                # A hit needs the same position and a matching first element.
                hit = position in predicted and name == predicted[position][0]
                set2model2setup2raw[set_][model][setup].append(1 if hit else 0)

In [12]:
# Pool the per-set outcomes into a single array per (dimension, group, model, setup).

dimension2group2model2setup2raw = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list))))
for dimension, group2sets in dimensions.items():
    for group, sets in group2sets.items():
        model2setup2raw = dimension2group2model2setup2raw[dimension][group]
        for model in models:
            for setup in setups:
                # Concatenate the outcomes of every name set belonging to this group.
                pooled = []
                for set_ in sets:
                    pooled.extend(set2model2setup2raw[set_][model][setup])
                model2setup2raw[model][setup] = np.array(pooled)

In [13]:
# Recall equality difference for each (dimension, model, setup):
# the mean over groups of |group recall - overall recall|, where the overall
# recall pools the mentions of all groups in the dimension.
# Runs that differ only by seed share the key (context type, name type); the
# printout shows the mean +- np.std of the per-seed differences.

for dim, group2model2setup2raw in dimension2group2model2setup2raw.items():
    print()
    print(dim)
    
    for model in models:
        setup2diff = defaultdict(list)
        for setup in setups:
            group_raws = [np.asarray(group2model2setup2raw[group][model][setup]) for group in dimensions[dim]]
            # Per-group means plus a pooled mean, rather than stacking into one 2-D
            # array: stacking only works when every group has exactly the same number
            # of mentions. For equally sized groups both forms give the same numbers.
            group_recalls = np.array([raw.mean() for raw in group_raws])
            overall_recall = np.concatenate(group_raws).mean()
            setup2diff[(setup[0], setup[1])].append(np.abs(group_recalls - overall_recall).mean())
            
        for setup, diff in setup2diff.items():
            print(f'Model: {model} | Setup: {setup[0]} context & {setup[1]} name | Recall Equality Difference: {np.mean(diff):.3f} +- {np.std(diff):.3f}')


Gender
Model: spaCy | Setup: original context & original name | Recall Equality Difference: 0.003 +- 0.000
Model: spaCy | Setup: clinical context & diverse name | Recall Equality Difference: 0.012 +- 0.004
Model: spaCy | Setup: clinical context & popular name | Recall Equality Difference: 0.012 +- 0.007
Model: spaCy | Setup: general context & diverse name | Recall Equality Difference: 0.036 +- 0.005
Model: spaCy | Setup: general context & popular name | Recall Equality Difference: 0.010 +- 0.003
Model: NeuroNER | Setup: original context & original name | Recall Equality Difference: 0.005 +- 0.000
Model: NeuroNER | Setup: clinical context & diverse name | Recall Equality Difference: 0.007 +- 0.001
Model: NeuroNER | Setup: clinical context & popular name | Recall Equality Difference: 0.008 +- 0.004
Model: NeuroNER | Setup: general context & diverse name | Recall Equality Difference: 0.016 +- 0.007
Model: NeuroNER | Setup: general context & popular name | Recall Equality Difference: 0.00