In [16]:
import os

import numpy as np
import pandas as pd
import seqeval.metrics
import sklearn.metrics
import tqdm
from nervaluate import Evaluator

from compute_f1_qa import calculate_mean_ci

In [2]:
# Root directory of the tweet-fid-application checkout that holds the pickled splits.
# Set the TWEET_FID_DIR environment variable to run this notebook on another machine;
# when it is unset, the original cluster path is used.
tweet_dir = os.environ.get('TWEET_FID_DIR', '/scratch/dzhang5/usda_project/tweet-fid-application')

In [3]:
# NOTE: pd.read_pickle unpickles the file, which can execute arbitrary code;
# only point tweet_dir at data you trust.
def _read_split(annotation, split):
    """Load the pickled `split` (train/test/dev) stored under the `annotation` folder."""
    return pd.read_pickle(f"{tweet_dir}/tweet-fid/{annotation}/{split}.p")


train_mv_data = _read_split("LREC_mv", "train")
train_expert_data = _read_split("LREC_expert_label", "train")
train_bsc_data = _read_split("LREC_BSC", "train")
test_expert_data = _read_split("LREC_expert_label", "test")
test_mv_data = _read_split("LREC_mv", "test")
test_bsc_data = _read_split("LREC_BSC", "test")
dev_expert_data = _read_split("LREC_expert_label", "dev")
dev_mv_data = _read_split("LREC_mv", "dev")
dev_bsc_data = _read_split("LREC_BSC", "dev")

In [4]:
# Stack the train and test splits of each annotation source into one frame.
# Row labels of the inputs are kept as they are (no ignore_index).
train_test_expert_data, train_test_mv_data, train_test_bsc_data = (
    pd.concat([train_part, test_part])
    for train_part, test_part in (
        (train_expert_data, test_expert_data),
        (train_mv_data, test_mv_data),
        (train_bsc_data, test_bsc_data),
    )
)

In [5]:
# Number of bootstrap resamples drawn by each confidence-interval cell below.
bootstrap_time = 200

In [6]:
# Entity tags handed to nervaluate's Evaluator through `tags=`.
# NOTE(review): the seqeval reports further down list the tags food/loc/other/symptom
# and no `keyword`, and nervaluate's 'possible' count (1391) equals the seqeval support
# of food + loc + symptom (469 + 485 + 437). It looks like `other` spans are left out
# of the nervaluate scores and `keyword` matches nothing — confirm this is intended.
labels = ['food', 'loc', 'symptom', 'keyword']

In [7]:
# Pull the token-level entity label column out of each pooled frame.
expert_label, bsc_label, mv_label = (
    frame['relevant_entity_label']
    for frame in (train_test_expert_data, train_test_bsc_data, train_test_mv_data)
)

In [8]:
# Side-by-side table of the three label sequences, one column per annotation source.
all_labels = pd.DataFrame(dict(expert=expert_label, bsc=bsc_label, mv=mv_label))

In [9]:
def pad_truncate(label, pred):
    """Return `pred` adjusted to the length of `label`.

    A `pred` that is too short is extended with 'O' tags, one that is too long
    is cut to the first ``len(label)`` items, and one of matching length is
    returned unchanged (the same object).
    """
    shortfall = len(label) - len(pred)
    if shortfall > 0:
        # Prediction is too short: fill the tail with the outside tag.
        return pred + ['O'] * shortfall
    if shortfall < 0:
        # Prediction is too long: drop the surplus tail.
        return pred[:len(label)]
    return pred

In [10]:
# Bring every 'bsc' sequence to the length of its expert sequence (pad with 'O' or cut).
all_labels['bsc'] = all_labels.loc[:, ['expert', 'bsc']].apply(
    lambda row: pad_truncate(row['expert'], row['bsc']),
    axis=1,
)

In [11]:
# Bring every 'mv' sequence to the length of its expert sequence (pad with 'O' or cut).
all_labels['mv'] = all_labels.loc[:, ['expert', 'mv']].apply(
    lambda row: pad_truncate(row['expert'], row['mv']),
    axis=1,
)

In [12]:
# Span-level comparison of the 'bsc' labels with the expert labels on the full data;
# prints the score dictionary of every scheme returned by nervaluate.
evaluator = Evaluator(
    all_labels['expert'].tolist(),
    all_labels['bsc'].tolist(),
    tags=labels,
    loader="list",
)
results, results_by_tag = evaluator.evaluate()
for scheme, scores in results.items():
    print(scheme, scores, sep='\n')

ent_type
{'correct': 1102, 'incorrect': 37, 'partial': 0, 'missed': 252, 'spurious': 920, 'possible': 1391, 'actual': 2059, 'precision': 0.5352112676056338, 'recall': 0.792235801581596, 'f1': 0.638840579710145}
partial
{'correct': 960, 'incorrect': 0, 'partial': 179, 'missed': 252, 'spurious': 920, 'possible': 1391, 'actual': 2059, 'precision': 0.5097134531325886, 'recall': 0.7544931703810208, 'f1': 0.6084057971014493}
strict
{'correct': 936, 'incorrect': 203, 'partial': 0, 'missed': 252, 'spurious': 920, 'possible': 1391, 'actual': 2059, 'precision': 0.45458960660514813, 'recall': 0.6728971962616822, 'f1': 0.542608695652174}
exact
{'correct': 960, 'incorrect': 179, 'partial': 0, 'missed': 252, 'spurious': 920, 'possible': 1391, 'actual': 2059, 'precision': 0.4662457503642545, 'recall': 0.6901509705248023, 'f1': 0.5565217391304348}


In [13]:
# Span-level comparison of the 'mv' labels with the expert labels on the full data;
# prints the score dictionary of every scheme returned by nervaluate.
evaluator = Evaluator(
    all_labels['expert'].tolist(),
    all_labels['mv'].tolist(),
    tags=labels,
    loader="list",
)
results, results_by_tag = evaluator.evaluate()
for scheme, scores in results.items():
    print(scheme, scores, sep='\n')

ent_type
{'correct': 1007, 'incorrect': 31, 'partial': 0, 'missed': 353, 'spurious': 726, 'possible': 1391, 'actual': 1764, 'precision': 0.5708616780045351, 'recall': 0.723939611790079, 'f1': 0.6383518225039619}
partial
{'correct': 957, 'incorrect': 0, 'partial': 81, 'missed': 353, 'spurious': 726, 'possible': 1391, 'actual': 1764, 'precision': 0.5654761904761905, 'recall': 0.7171099928109274, 'f1': 0.6323296354992076}
strict
{'correct': 935, 'incorrect': 103, 'partial': 0, 'missed': 353, 'spurious': 726, 'possible': 1391, 'actual': 1764, 'precision': 0.530045351473923, 'recall': 0.6721782890007189, 'f1': 0.5927099841521395}
exact
{'correct': 957, 'incorrect': 81, 'partial': 0, 'missed': 353, 'spurious': 726, 'possible': 1391, 'actual': 1764, 'precision': 0.5425170068027211, 'recall': 0.6879942487419123, 'f1': 0.6066561014263074}


In [26]:
# Bootstrap the strict-scheme precision / recall / F1 of the 'mv' labels against
# the expert labels and print the summary produced by calculate_mean_ci.
idx = all_labels.index
n_rows = idx.shape[0]
rng = np.random.RandomState(seed=12345)
f1_list, pre_list, rec_list = [], [], []
for _ in tqdm.trange(bootstrap_time):
    # Draw row positions (not index labels) and select with .iloc, so every
    # resample has exactly n_rows rows even if the concatenated train/test
    # index contains repeated labels.
    pred_idx = rng.choice(n_rows, size=n_rows, replace=True)
    resampled = all_labels.iloc[pred_idx]
    # Hand plain lists to the evaluator, as the full-data evaluation cells do.
    keep_labels = resampled['expert'].tolist()
    keep_preds = resampled['mv'].tolist()
    evaluator = Evaluator(keep_labels, keep_preds, tags=labels, loader="list")
    results, results_by_tag = evaluator.evaluate()
    strict_scores = results["strict"]
    f1_list.append(strict_scores['f1'])
    pre_list.append(strict_scores['precision'])
    rec_list.append(strict_scores['recall'])

f1_str = calculate_mean_ci(f1_list)
pre_str = calculate_mean_ci(pre_list)
rec_str = calculate_mean_ci(rec_list)
print(f'F1:\n{f1_str}\nprecision:\n{pre_str}\nrecall:\n{rec_str}')

100%|██████████| 200/200 [02:51<00:00,  1.16it/s]

F1:
mean:  0.5908 ±  0.0146
lower 95% CI:  0.5598
upper 95% CI:  0.6156
precision:
mean:  0.5286 ±  0.0165
lower 95% CI:  0.4936
upper 95% CI:  0.5607
recall:
mean:  0.6701 ±  0.0160
lower 95% CI:  0.6356
upper 95% CI:  0.7009





In [27]:
# Bootstrap the strict-scheme precision / recall / F1 of the 'bsc' labels against
# the expert labels and print the summary produced by calculate_mean_ci.
idx = all_labels.index
n_rows = idx.shape[0]
rng = np.random.RandomState(seed=12345)
f1_list, pre_list, rec_list = [], [], []
for _ in tqdm.trange(bootstrap_time):
    # Draw row positions (not index labels) and select with .iloc, so every
    # resample has exactly n_rows rows even if the concatenated train/test
    # index contains repeated labels.
    pred_idx = rng.choice(n_rows, size=n_rows, replace=True)
    resampled = all_labels.iloc[pred_idx]
    # Hand plain lists to the evaluator, as the full-data evaluation cells do.
    keep_labels = resampled['expert'].tolist()
    keep_preds = resampled['bsc'].tolist()
    evaluator = Evaluator(keep_labels, keep_preds, tags=labels, loader="list")
    results, results_by_tag = evaluator.evaluate()
    strict_scores = results["strict"]
    f1_list.append(strict_scores['f1'])
    pre_list.append(strict_scores['precision'])
    rec_list.append(strict_scores['recall'])

f1_str = calculate_mean_ci(f1_list)
pre_str = calculate_mean_ci(pre_list)
rec_str = calculate_mean_ci(rec_list)
print(f'F1:\n{f1_str}\nprecision:\n{pre_str}\nrecall:\n{rec_str}')

100%|██████████| 200/200 [02:50<00:00,  1.17it/s]

F1:
mean:  0.5414 ±  0.0141
lower 95% CI:  0.5138
upper 95% CI:  0.5694
precision:
mean:  0.4539 ±  0.0150
lower 95% CI:  0.4234
upper 95% CI:  0.4832
recall:
mean:  0.6711 ±  0.0157
lower 95% CI:  0.6388
upper 95% CI:  0.7011





In [58]:
# Per-tag seqeval report: 'bsc' labels scored against the expert labels.
cls_report = seqeval.metrics.classification_report(
    all_labels['expert'],
    all_labels['bsc'],
    digits=4,
    zero_division=1,
)
print(cls_report)

              precision    recall  f1-score   support

        food     0.4977    0.6866    0.5771       469
         loc     0.5299    0.6577    0.5869       485
       other     0.6202    0.8823    0.7284      1257
     symptom     0.3642    0.6751    0.4731       437

   micro avg     0.5316    0.7723    0.6297      2648
   macro avg     0.5030    0.7254    0.5914      2648
weighted avg     0.5397    0.7723    0.6336      2648



In [59]:
# Per-tag seqeval report: 'mv' labels scored against the expert labels.
cls_report = seqeval.metrics.classification_report(
    all_labels['expert'],
    all_labels['mv'],
    digits=4,
    zero_division=1,
)
print(cls_report)

              precision    recall  f1-score   support

        food     0.5503    0.6994    0.6160       469
         loc     0.6360    0.6557    0.6457       485
       other     0.6360    0.8799    0.7383      1257
     symptom     0.4326    0.6613    0.5231       437

   micro avg     0.5826    0.7708    0.6636      2648
   macro avg     0.5637    0.7241    0.6308      2648
weighted avg     0.5873    0.7708    0.6642      2648



In [30]:
# Sanity check: the expert and MV frames must carry the same index, element for element.
np.all(train_test_expert_data.index == train_test_mv_data.index)

True

In [33]:
# Bootstrap sentence-level metrics (accuracy, balanced accuracy, F1, precision,
# recall for class 1) of the MV sentence_class against the expert sentence_class.
idx = train_test_expert_data.index
n_rows = idx.shape[0]
# Extract both columns once as positional arrays; rows are paired by position,
# the same way the classification_report cells compare the two frames.
expert_cls = train_test_expert_data['sentence_class'].to_numpy()
mv_cls = train_test_mv_data['sentence_class'].to_numpy()
assert mv_cls.shape[0] == n_rows, "expert and MV frames must have the same number of rows"
rng = np.random.RandomState(seed=12345)
acc_list, bacc_list, f1_list, pre_list, rec_list = [], [], [], [], []
for _ in tqdm.trange(bootstrap_time):
    # Draw row positions rather than index labels, so every resample has exactly
    # n_rows rows even if the concatenated train/test index contains repeated labels.
    pred_idx = rng.choice(n_rows, size=n_rows, replace=True)
    keep_labels = expert_cls[pred_idx]
    keep_preds = mv_cls[pred_idx]
    acc_list.append(sklearn.metrics.accuracy_score(keep_labels, keep_preds))
    bacc_list.append(sklearn.metrics.balanced_accuracy_score(keep_labels, keep_preds))
    f1_list.append(sklearn.metrics.f1_score(keep_labels, keep_preds, pos_label=1))
    pre_list.append(sklearn.metrics.precision_score(keep_labels, keep_preds, pos_label=1))
    rec_list.append(sklearn.metrics.recall_score(keep_labels, keep_preds, pos_label=1))

acc_str = calculate_mean_ci(acc_list)
bacc_str = calculate_mean_ci(bacc_list)
f1_str = calculate_mean_ci(f1_list)
pre_str = calculate_mean_ci(pre_list)
rec_str = calculate_mean_ci(rec_list)
print(f'Acc:\n{acc_str}\nB.Acc:\n{bacc_str}\nF1:\n{f1_str}\nprecision:\n{pre_str}\nrecall:\n{rec_str}')

100%|██████████| 200/200 [00:01<00:00, 146.67it/s]

Acc:
mean:  0.8195 ±  0.0059
lower 95% CI:  0.8075
upper 95% CI:  0.8297
B.Acc:
mean:  0.8515 ±  0.0051
lower 95% CI:  0.8426
upper 95% CI:  0.8608
F1:
mean:  0.7759 ±  0.0082
lower 95% CI:  0.7588
upper 95% CI:  0.7907
precision:
mean:  0.6578 ±  0.0110
lower 95% CI:  0.6350
upper 95% CI:  0.6784
recall:
mean:  0.9459 ±  0.0062
lower 95% CI:  0.9348
upper 95% CI:  0.9573





In [67]:
# Sentence-level report: MV sentence_class scored against the expert sentence_class.
cls_report = sklearn.metrics.classification_report(
    train_test_expert_data['sentence_class'],
    train_test_mv_data['sentence_class'],
    digits=4,
    zero_division=1,
)
print(cls_report)

              precision    recall  f1-score   support

           0     0.9661    0.7573    0.8491      2485
           1     0.6578    0.9461    0.7760      1225

    accuracy                         0.8197      3710
   macro avg     0.8119    0.8517    0.8126      3710
weighted avg     0.8643    0.8197    0.8250      3710



In [68]:
# Sentence-level report: BSC sentence_class scored against the expert sentence_class.
# NOTE(review): the stored output of this cell is identical to the MV report;
# confirm that the BSC frame carries its own sentence_class values.
cls_report = sklearn.metrics.classification_report(
    train_test_expert_data['sentence_class'],
    train_test_bsc_data['sentence_class'],
    digits=4,
    zero_division=1,
)
print(cls_report)

              precision    recall  f1-score   support

           0     0.9661    0.7573    0.8491      2485
           1     0.6578    0.9461    0.7760      1225

    accuracy                         0.8197      3710
   macro avg     0.8119    0.8517    0.8126      3710
weighted avg     0.8643    0.8197    0.8250      3710

