___
# Automated extraction of IFI-related information
___

In [1]:
import numpy as np
import pandas as pd
import utils

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
# Seaborn "ticks" theme for all figures
sns.set_style('ticks')

# Default figure size and font sizes, set in a single rcParams update
plt.rcParams.update({
    'figure.figsize': (6, 4),
    'axes.titlesize': 22,
    'axes.labelsize': 20,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
})

___
# Detect concepts and composite concepts
### Run concept recognition and relationship detection on the development set with CV

In [2]:
# Load the development set
df = pd.read_csv("../datasets/reports_dev.csv")

# Clean data: run utils.clean_text on every report and keep the result in a new column
df['clean_text'] = df.report_text.apply(utils.clean_text)

# Learn vocabulary from k-1 folds and extract concepts from the k-th fold
# NOTE(review): the per-fold token counts in the cell output are presumably printed by this call - confirm
df = utils.extract_features_cv(df)

Number of unique tokens in each category: [29, 56, 10, 13, 12, 43, 32, 5, 9]
Number of unique tokens in each category after expanding: [29, 56, 10, 15, 12, 43, 32, 5, 9]
Number of unique tokens in each category: [28, 57, 13, 17, 11, 38, 31, 4, 9]
Number of unique tokens in each category after expanding: [28, 57, 13, 17, 11, 38, 31, 4, 9]
Number of unique tokens in each category: [31, 60, 14, 15, 13, 44, 33, 5, 9]
Number of unique tokens in each category after expanding: [31, 60, 14, 15, 13, 44, 33, 5, 9]
Number of unique tokens in each category: [26, 58, 14, 13, 13, 41, 30, 5, 9]
Number of unique tokens in each category after expanding: [26, 58, 14, 14, 13, 41, 30, 5, 9]
Number of unique tokens in each category: [30, 58, 14, 16, 13, 42, 33, 5, 8]
Number of unique tokens in each category after expanding: [30, 58, 14, 17, 13, 42, 33, 5, 8]
Number of unique tokens in each category: [30, 53, 14, 14, 12, 40, 32, 5, 7]
Number of unique tokens in each category after expanding: [30, 53, 14, 16

### Run concept recognition and relationship detection on the test set

In [3]:
# Load the test set
df_test = pd.read_csv("../datasets/reports_test.csv")

# Clean data: run utils.clean_text on every report and keep the result in a new column
df_test['clean_text'] = df_test.report_text.apply(utils.clean_text)

# Learn vocabulary from the development set
# NOTE(review): only the report ids of the development frame are passed in; presumably
# create_vocab looks up the matching annotations by id - confirm against utils
vocab = utils.create_vocab(df.report_id, expand=True)

# Extract concepts from the test set
df_test = utils.extract_features(df_test, vocab)

Number of unique tokens in each category: [32, 61, 14, 17, 13, 44, 33, 5, 9]
Number of unique tokens in each category after expanding: [32, 61, 14, 17, 13, 44, 33, 5, 9]


___
# Generate `.ann` files with detected concepts and composite concepts

In [4]:
# Define path to new annotation files
dst_path = "../datasets/automated_extraction/"

# Generate annotation files for the development set.
# utils.write_ann_file is called once per row for its side effect; the Series
# returned by apply only holds None values and is of no interest.
df.apply(utils.write_ann_file, path=dst_path, axis=1)

# Generate annotation files for the test set.
# The trailing semicolon keeps the notebook from echoing the all-None Series.
df_test.apply(utils.write_ann_file, path=dst_path, axis=1);

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
30    None
31    None
32    None
33    None
34    None
35    None
36    None
37    None
38    None
39    None
40    None
41    None
42    None
43    None
44    None
45    None
46    None
47    None
48    None
49    None
50    None
51    None
52    None
dtype: object

> We use `brat-eval` (https://github.com/READ-BioMed/brateval) to compare extracted concepts and composite concepts (`"../datasets/automated_extraction/"`) with gold standard annotations (`"../datasets/annotations_composite/"`).
>
> We run `brat-eval` on individual folds to obtain cross-validation performance. We then run `brat-eval` on the test set to evaluate the automated extraction of IFI-related information on unseen data.

___
# Evaluate the automated extraction of IFI-related information

In [5]:
# Names of the evaluated features: single concepts first, composite concepts after
feature_names = utils.get_feature_names("concepts") + utils.get_feature_names("composite")

# Load the .csv file containing the output of brat-eval (first column is the index)
# and keep only the rows of the evaluated features, in feature_names order
df = pd.read_csv("../datasets/brat_eval_output.csv", index_col=0).loc[feature_names]

# Every column except the last one holds the results of a cross-validation fold
folds = df.columns[:-1]

### Helper functions

In [6]:
def extract_results(x, i):
    """Return the i-th '|'-separated field of ``x`` as an integer.

    Parameters
    ----------
    x : str
        A '|'-delimited string. Any other value (e.g. a missing cell read as
        NaN) is treated as "no result".
    i : int
        Position of the field to read; negative values count from the end.

    Returns
    -------
    int or float
        The parsed integer, or ``np.nan`` when ``x`` is not a string, has too
        few fields, or the selected field is not an integer.
    """
    try:
        return int(x.split('|')[i])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only the expected parsing failures map to NaN; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return np.nan

def precision(tp, fp):
    """Precision: true positives divided by all predicted positives (tp + fp).

    Works on plain numbers and, element-wise, on pandas objects.
    """
    predicted_positives = tp + fp
    return tp / predicted_positives

def recall(tp, fn):
    """Recall: true positives divided by all actual positives (tp + fn).

    Works on plain numbers and, element-wise, on pandas objects.
    """
    actual_positives = tp + fn
    return tp / actual_positives

def fscore(p, r):
    """F1 score: harmonic mean of precision ``p`` and recall ``r``."""
    numerator = 2 * (p * r)
    denominator = p + r
    return numerator / denominator

def print_metrics(rows, cols):
    """Print precision, recall and F1 score for each of the selected features.

    The counts come from the module-level ``TPs``, ``FPs`` and ``FNs`` frames,
    restricted to ``rows`` (features) and ``cols`` (folds / test column).
    With more than one column the scores are reported as mean and standard
    deviation across the columns; with a single column they are printed as is.
    All values are rounded to two decimals.
    """
    true_pos = TPs.loc[rows, cols]
    p_scores = precision(true_pos, FPs.loc[rows, cols])
    r_scores = recall(true_pos, FNs.loc[rows, cols])
    f_scores = fscore(p_scores, r_scores)

    def summarise(scores):
        # Several columns: collapse them into mean/std per feature
        if len(cols) > 1:
            return scores.aggregate(['mean', 'std'], axis=1).round(2)
        return scores.round(2)

    print("Precision:\n", summarise(p_scores))
    print("\nRecall:\n", summarise(r_scores))
    print("\nF1 score:\n", summarise(f_scores))

        
def print_micro_metrics(rows, cols):
    """Print micro-averaged precision, recall and F1 score for a feature group.

    The module-level ``TPs``, ``FPs`` and ``FNs`` counts of the selected
    ``rows`` are summed per column before the metrics are computed. With more
    than one column the mean and standard deviation across columns are
    printed; with a single column the value itself is printed. All values are
    rounded to two decimals.
    """
    tp_total = TPs.loc[rows, cols].sum()
    fp_total = FPs.loc[rows, cols].sum()
    fn_total = FNs.loc[rows, cols].sum()

    micro_p = precision(tp_total, fp_total)
    micro_r = recall(tp_total, fn_total)
    micro_f = fscore(micro_p, micro_r)

    def summarise(scores):
        # Several columns: report mean/std across them
        if len(cols) > 1:
            return scores.aggregate(['mean', 'std']).round(2)
        return scores.round(2)

    print("micro-Precision:\n", summarise(micro_p))
    print("\nmicro-Recall:\n", summarise(micro_r))
    print("\nmicro-F1 score:\n", summarise(micro_f))
        
        
def print_macro_metrics(rows, cols):
    """Print macro-averaged precision, recall and F1 score for a feature group.

    Metrics are computed per feature and per column from the module-level
    ``TPs``, ``FPs`` and ``FNs`` frames. With more than one column each
    feature is first averaged across the columns and the feature means are
    then averaged; with a single column the features are averaged directly.
    All values are rounded to two decimals.
    """
    true_pos = TPs.loc[rows, cols]
    p_scores = precision(true_pos, FPs.loc[rows, cols])
    r_scores = recall(true_pos, FNs.loc[rows, cols])
    f_scores = fscore(p_scores, r_scores)

    def summarise(scores):
        if len(cols) > 1:
            # Average over columns per feature, then over features
            per_feature = scores.mean(axis=1)
            return per_feature.mean().round(2)
        return scores.mean().round(2)

    print("macro-Precision:\n", summarise(p_scores))
    print("\nmacro-Recall:\n", summarise(r_scores))
    print("\nmacro-F1 score:\n", summarise(f_scores))

### Parse `brat-eval` output to extract the number of TPs, FPs, and FNs

In [7]:
# Extract the number of TPs, FPs, FNs: every cell is a '|'-separated string and,
# counted from the end, fields 6, 5 and 4 are read as the TP, FP and FN counts
TPs, FPs, FNs = (
    df.applymap(extract_results, i=position) for position in (-6, -5, -4)
)

# Total number of detected concepts (TP + FP) in each column
(TPs + FPs).sum(axis=0)

fold1      463.0
fold2      512.0
fold3      367.0
fold4      456.0
fold5      422.0
fold6      673.0
fold7      735.0
fold8      394.0
fold9      475.0
fold10     461.0
TEST      1068.0
dtype: float64

In [8]:
# Split feature_names into groups that are evaluated separately.
# NOTE(review): the feature at index 5 is not part of any group - confirm this is intentional
ifi_concepts = feature_names[0:5]
certainty_cues = feature_names[6:9]
affirmed_concepts = feature_names[9:13]
negated_concepts = feature_names[13:17]

# Show the members of each group, one group per line
print(
    f"IFI concepts: {ifi_concepts}",
    f"Certainty cues: {certainty_cues}",
    f"Affirmed concepts: {affirmed_concepts}",
    f"Negated concepts: {negated_concepts}",
    sep="\n",
)

IFI concepts: ['ClinicalQuery', 'FungalDescriptor', 'Fungus', 'Invasiveness', 'Stain']
Certainty cues: ['positive', 'equivocal', 'negative']
Affirmed concepts: ['affirmedFungalDescriptor', 'affirmedFungus', 'affirmedInvasiveness', 'affirmedStain']
Negated concepts: ['negatedFungalDescriptor', 'negatedFungus', 'negatedInvasiveness', 'negatedStain']


### CV and test set agreement for each feature 

In [9]:
# Print precision, recall, and f-score for every feature across CV folds
# (reported as mean and standard deviation over the folds)
print("CROSS-VALIDATION")
print_metrics(feature_names, folds)

# Print precision, recall, and f-score for every feature in the test set
print("_"*80, "\nTEST")
print_metrics(feature_names, ['TEST'])

CROSS-VALIDATION
Precision:
                           mean   std
feature                             
ClinicalQuery             0.92  0.13
FungalDescriptor          0.75  0.10
Fungus                    0.82  0.30
Invasiveness              0.45  0.41
Stain                     0.94  0.05
SampleType                0.15  0.03
positive                  0.04  0.02
equivocal                 0.01  0.02
negative                  0.14  0.04
affirmedFungalDescriptor  0.64  0.20
affirmedFungus            0.76  0.27
affirmedInvasiveness      0.03  0.08
affirmedStain             0.02  0.06
negatedFungalDescriptor   0.78  0.12
negatedFungus             0.97  0.07
negatedInvasiveness       0.80  0.28
negatedStain              0.02  0.05

Recall:
                           mean   std
feature                             
ClinicalQuery             0.53  0.35
FungalDescriptor          0.93  0.04
Fungus                    0.92  0.15
Invasiveness              0.60  0.39
Stain                     0.95  0.09

### CV and test set micro-average for IFI concepts

In [10]:
# Print micro-precision, -recall, and -f-score for IFI features across CV folds
# (counts are summed over the group's features within each fold; mean and std over folds)
print("CROSS-VALIDATION")
print_micro_metrics(ifi_concepts, folds)

# Print micro-precision, -recall, and -f-score for IFI features on the test set
print("_"*80, "\nTEST")
print_micro_metrics(ifi_concepts, ['TEST'])

CROSS-VALIDATION
micro-Precision:
 mean    0.81
std     0.07
dtype: float64

micro-Recall:
 mean    0.87
std     0.09
dtype: float64

micro-F1 score:
 mean    0.84
std     0.07
dtype: float64
________________________________________________________________________________ 
TEST
micro-Precision:
 TEST    0.8
dtype: float64

micro-Recall:
 TEST    0.86
dtype: float64

micro-F1 score:
 TEST    0.83
dtype: float64


### CV and test set micro-average for certainty cues

In [11]:
# Print micro-precision, -recall, and -f-score for certainty cues across CV folds
# (counts are summed over the group's features within each fold; mean and std over folds)
print("CROSS-VALIDATION")
print_micro_metrics(certainty_cues, folds)

# Print micro-precision, -recall, and -f-score for certainty cues on the test set
print("_"*80, "\nTEST")
print_micro_metrics(certainty_cues, ['TEST'])

CROSS-VALIDATION
micro-Precision:
 mean    0.07
std     0.02
dtype: float64

micro-Recall:
 mean    0.89
std     0.08
dtype: float64

micro-F1 score:
 mean    0.12
std     0.03
dtype: float64
________________________________________________________________________________ 
TEST
micro-Precision:
 TEST    0.06
dtype: float64

micro-Recall:
 TEST    0.84
dtype: float64

micro-F1 score:
 TEST    0.11
dtype: float64


### CV and test set micro-average for affirmed concepts

In [12]:
# Print micro-precision, -recall, and -f-score for affirmed features across CV folds
# (counts are summed over the group's features within each fold; mean and std over folds)
print("CROSS-VALIDATION")
print_micro_metrics(affirmed_concepts, folds)

# Print micro-precision, -recall, and -f-score for affirmed features on the test set
print("_"*80, "\nTEST")
print_micro_metrics(affirmed_concepts, ['TEST'])

CROSS-VALIDATION
micro-Precision:
 mean    0.48
std     0.16
dtype: float64

micro-Recall:
 mean    0.69
std     0.24
dtype: float64

micro-F1 score:
 mean    0.55
std     0.17
dtype: float64
________________________________________________________________________________ 
TEST
micro-Precision:
 TEST    0.37
dtype: float64

micro-Recall:
 TEST    0.56
dtype: float64

micro-F1 score:
 TEST    0.44
dtype: float64


### CV and test set micro-average for negated concepts

In [13]:
# Print micro-precision, -recall, and -f-score for negated features across CV folds
# (counts are summed over the group's features within each fold; mean and std over folds)
print("CROSS-VALIDATION")
print_micro_metrics(negated_concepts, folds)

# Print micro-precision, -recall, and -f-score for negated features on the test set
print("_"*80, "\nTEST")
print_micro_metrics(negated_concepts, ['TEST'])

CROSS-VALIDATION
micro-Precision:
 mean    0.57
std     0.09
dtype: float64

micro-Recall:
 mean    0.96
std     0.07
dtype: float64

micro-F1 score:
 mean    0.71
std     0.08
dtype: float64
________________________________________________________________________________ 
TEST
micro-Precision:
 TEST    0.6
dtype: float64

micro-Recall:
 TEST    0.92
dtype: float64

micro-F1 score:
 TEST    0.73
dtype: float64


### CV and test set macro-average

In [14]:
# Print macro-precision, -recall, and -f-score for all concepts across CV folds
# (per-feature scores are averaged over the folds, then over the features)
print("CROSS-VALIDATION")
print_macro_metrics(feature_names, folds)

# Print macro-precision, -recall, and -f-score for all concepts on the test set
# (unweighted mean of the per-feature scores)
print("_"*80, "\nTEST")
print_macro_metrics(feature_names, ['TEST'])

CROSS-VALIDATION
macro-Precision:
 0.49

macro-Recall:
 0.81

macro-F1 score:
 0.56
________________________________________________________________________________ 
TEST
macro-Precision:
 TEST    0.47
dtype: float64

macro-Recall:
 TEST    0.67
dtype: float64

macro-F1 score:
 TEST    0.56
dtype: float64
