In [1]:
import os

import numpy as np
import pandas as pd

from utils.prod import get_feature_names
from utils import text

___
# Exploratory data analysis (EDA) of the gold-standard annotations
### Load data

In [2]:
# Define path to the CHIFIR dataset directory (a relative path, so it is
# resolved against the kernel's current working directory)
path = "../../../Data/CHIFIR/"

# Load the report-level metadata for the full dataset.
# os.path.join is used instead of string concatenation so the result has
# exactly one separator whether or not `path` ends with a slash
# (the previous `path + "/..."` produced ".../CHIFIR//chifir_metadata.csv").
# NOTE(review): the `dataset` column presumably marks the development / test
# split - only "development" rows are visible in the preview; confirm.
df = pd.read_csv(os.path.join(path, "chifir_metadata.csv"))
print(df.shape)
df.head()

(283, 6)


Unnamed: 0,patient_id,report_no,y_report,histopathology_id,val_fold,dataset
0,13,1,Positive,658,10.0,development
1,14,1,Positive,189,7.0,development
2,28,1,Negative,529,8.0,development
3,28,2,Positive,325,8.0,development
4,28,3,Negative,559,8.0,development


In [3]:
# Gold-standard concept annotations, fetched through the project's `text` helper.
# Print the (rows, columns) shape, then preview the first five rows.
concepts = text.load_annotations('concepts')
print(concepts.shape)
concepts.head(n=5)

(1155, 10)


Unnamed: 0,histopathology_id,patient_id,report_no,concept_id,concept,phrase,start_char,end_char,preceding,following
0,658,13,1,T2,Invasiveness,intravascular spaces,669,689,,
1,658,13,1,T4,Stain,PAS,715,718,,
2,658,13,1,T5,Stain,GMS,723,726,,
3,658,13,1,T9,positive,positive,700,708,False,True
4,658,13,1,T3,FungalDescriptor,necrotic fungi,651,665,,


In [4]:
# Relation annotations, fetched through the project's `text` helper.
# Print the (rows, columns) shape, then preview the first five rows.
relations = text.load_annotations('relations')
print(relations.shape)
relations.head(n=5)

(606, 7)


Unnamed: 0,histopathology_id,patient_id,report_no,relation_id,relation,arg1,arg2
0,658,13,1,R2,positive-rel,T9,T3
1,658,13,1,R3,fungus-stain-rel,T4,T3
2,658,13,1,R4,fungus-stain-rel,T5,T3
3,658,13,1,R1,positive-rel,T8,T6
4,658,13,1,R5,invasiveness-rel,T2,T3


In [5]:
# Composite concept annotations, fetched through the project's `text` helper.
# Print the (rows, columns) shape, then preview the first five rows.
composite_concepts = text.load_annotations('composite')
print(composite_concepts.shape)
composite_concepts.head(n=5)

(1497, 10)


Unnamed: 0,histopathology_id,patient_id,report_no,concept_id,concept,phrase,start_char,end_char,preceding,following
0,658,13,1,T2,Invasiveness,intravascular spaces,669,689,,
1,658,13,1,T4,Stain,PAS,715,718,,
2,658,13,1,T5,Stain,GMS,723,726,,
3,658,13,1,T9,positive,positive,700,708,False,True
4,658,13,1,T3,FungalDescriptor,necrotic fungi,651,665,,


In [6]:
# Restrict each annotation table to reports listed in the metadata so that the
# statistics below are calculated on the dev and test sets. Merging with the
# named `histopathology_id` Series is an inner join on that shared column.
concepts = pd.merge(concepts, df['histopathology_id'])
relations = pd.merge(relations, df['histopathology_id'])
composite_concepts = pd.merge(composite_concepts, df['histopathology_id'])

# Shapes after filtering, for comparison with the shapes printed at load time
(concepts.shape, relations.shape, composite_concepts.shape)

((1155, 10), (606, 7), (1497, 10))

In [7]:
# Number of reports with no annotations.
# One boolean per metadata row: True when the report's id never appears in the
# corresponding annotation table.
lacks_concepts = ~df.histopathology_id.isin(concepts.histopathology_id)
lacks_relations = ~df.histopathology_id.isin(relations.histopathology_id)

# "No annotations of any kind" means neither a concept nor a relation
print("%d reports with no annotations of any kind." % (lacks_concepts & lacks_relations).sum())

print("%d reports with no concepts." % lacks_concepts.sum())

print("%d reports with no relations." % lacks_relations.sum())

24 reports with no annotations of any kind.
24 reports with no concepts.
149 reports with no relations.


### Summary statistics for concepts

In [8]:
# Median count of concept annotations per report. Only reports that have at
# least one concept contribute a group, so unannotated reports are excluded.
(
    concepts
    .groupby('histopathology_id')
    .size()
    .median()
)

3.0

In [9]:
# Total occurrences of each concept category, listed in category (index)
# order rather than by descending frequency
(
    concepts['concept']
    .value_counts()
    .sort_index()
)

concept
ClinicalQuery        68
FungalDescriptor    295
Fungus              106
Invasiveness         39
Stain               172
SampleType          198
positive            117
equivocal             8
negative            152
Name: count, dtype: int64

In [10]:
# For every concept category, the number of distinct reports in which it
# occurs at least once
(
    concepts
    .groupby('concept', observed=False)['histopathology_id']
    .nunique()
)

concept
ClinicalQuery        53
FungalDescriptor    128
Fungus               60
Invasiveness         12
Stain               100
SampleType          179
positive             42
equivocal             5
negative            104
Name: histopathology_id, dtype: int64

In [11]:
# For every concept category, the number of distinct annotated phrases
(
    concepts
    .groupby('concept', observed=False)['phrase']
    .nunique()
)

concept
ClinicalQuery       43
FungalDescriptor    85
Fungus              19
Invasiveness        27
Stain               16
SampleType          64
positive            40
equivocal            6
negative            12
Name: phrase, dtype: int64

In [12]:
# Lexical diversity per concept category:
# (number of distinct phrases) / (total number of occurrences), rounded to 2 d.p.
distinct_phrases = concepts.groupby('concept', observed=False)['phrase'].nunique()
concept_totals = concepts['concept'].value_counts().sort_index()

# The division aligns the two Series on their shared concept index
(distinct_phrases / concept_totals).round(2)

concept
ClinicalQuery       0.63
FungalDescriptor    0.29
Fungus              0.18
Invasiveness        0.69
Stain               0.09
SampleType          0.32
positive            0.34
equivocal           0.75
negative            0.08
dtype: float64

In [13]:
# Attach the report-level label (`y_report`) to every concept row by joining
# on the column(s) shared with the metadata.
concepts = pd.merge(concepts, df[['histopathology_id', 'y_report']])

# Share of each concept's occurrences falling in Negative vs Positive reports.
# Values are proportions in [0, 1] (normalize=True), not percentages.
(
    concepts
    .groupby('concept', observed=False)['y_report']
    .value_counts(normalize=True)
    .sort_index()
    .round(2)
)

concept           y_report
ClinicalQuery     Negative    0.59
                  Positive    0.41
FungalDescriptor  Negative    0.45
                  Positive    0.55
Fungus            Negative    0.42
                  Positive    0.58
Invasiveness      Negative    0.00
                  Positive    1.00
Stain             Negative    0.72
                  Positive    0.28
SampleType        Negative    0.86
                  Positive    0.14
positive          Negative    0.03
                  Positive    0.97
equivocal         Negative    0.00
                  Positive    1.00
negative          Negative    0.86
                  Positive    0.14
Name: proportion, dtype: float64

### Summary statistics for relations

In [14]:
# Median count of relation annotations per report. Only reports that have at
# least one relation contribute a group, so reports without relations are excluded.
(
    relations
    .groupby('histopathology_id')
    .size()
    .median()
)

4.0

In [15]:
# Total occurrences of each relation category, listed in category (index)
# order rather than by descending frequency
(
    relations['relation']
    .value_counts()
    .sort_index()
)

relation
positive-rel              144
equivocal-rel               9
negative-rel              201
fungal-description-rel     41
invasiveness-rel           38
fungus-stain-rel          173
Name: count, dtype: int64

In [16]:
# For every relation category, the number of distinct reports in which it
# occurs at least once
(
    relations
    .groupby('relation', observed=False)['histopathology_id']
    .nunique()
)

relation
positive-rel               42
equivocal-rel               5
negative-rel              104
fungal-description-rel     22
invasiveness-rel           12
fungus-stain-rel           93
Name: histopathology_id, dtype: int64

In [17]:
# Attach the report-level label (`y_report`) to every relation row by joining
# on the column(s) shared with the metadata.
relations = pd.merge(relations, df[['histopathology_id', 'y_report']])

# Share of each relation's occurrences falling in Negative vs Positive reports.
# Values are proportions in [0, 1] (normalize=True), not percentages.
(
    relations
    .groupby('relation', observed=False)['y_report']
    .value_counts(normalize=True)
    .sort_index()
    .round(2)
)

relation                y_report
positive-rel            Negative    0.03
                        Positive    0.97
equivocal-rel           Negative    0.00
                        Positive    1.00
negative-rel            Negative    0.88
                        Positive    0.12
fungal-description-rel  Negative    0.00
                        Positive    1.00
invasiveness-rel        Negative    0.00
                        Positive    1.00
fungus-stain-rel        Negative    0.74
                        Positive    0.26
Name: proportion, dtype: float64

In [18]:
# Composite concepts: total occurrences of each category, listed in category
# (index) order rather than by descending frequency
(
    composite_concepts['concept']
    .value_counts()
    .sort_index()
)

concept
ClinicalQuery                68
FungalDescriptor            295
Fungus                      106
Invasiveness                 39
Stain                       172
SampleType                  198
positive                    117
equivocal                     8
negative                    152
affirmedFungalDescriptor    101
affirmedFungus               37
affirmedInvasiveness          1
affirmedStain                 2
negatedFungalDescriptor     138
negatedFungus                53
negatedInvasiveness           6
negatedStain                  4
Name: count, dtype: int64

In [19]:
# Composite concepts: for every category, the number of distinct reports in
# which it occurs at least once
(
    composite_concepts
    .groupby('concept', observed=False)['histopathology_id']
    .nunique()
)

concept
ClinicalQuery                53
FungalDescriptor            128
Fungus                       60
Invasiveness                 12
Stain                       100
SampleType                  179
positive                     42
equivocal                     5
negative                    104
affirmedFungalDescriptor     38
affirmedFungus               24
affirmedInvasiveness          1
affirmedStain                 2
negatedFungalDescriptor      96
negatedFungus                35
negatedInvasiveness           4
negatedStain                  4
Name: histopathology_id, dtype: int64

In [20]:
# Attach the report-level label (`y_report`) to every composite-concept row by
# joining on the column(s) shared with the metadata.
composite_concepts = pd.merge(composite_concepts, df[['histopathology_id', 'y_report']])

# Share of each composite concept's occurrences falling in Negative vs Positive
# reports. Values are proportions in [0, 1] (normalize=True), not percentages.
(
    composite_concepts
    .groupby('concept', observed=False)['y_report']
    .value_counts(normalize=True)
    .sort_index()
    .round(2)
)

concept                   y_report
ClinicalQuery             Negative    0.59
                          Positive    0.41
FungalDescriptor          Negative    0.45
                          Positive    0.55
Fungus                    Negative    0.42
                          Positive    0.58
Invasiveness              Negative    0.00
                          Positive    1.00
Stain                     Negative    0.72
                          Positive    0.28
SampleType                Negative    0.86
                          Positive    0.14
positive                  Negative    0.03
                          Positive    0.97
equivocal                 Negative    0.00
                          Positive    1.00
negative                  Negative    0.86
                          Positive    0.14
affirmedFungalDescriptor  Negative    0.04
                          Positive    0.96
affirmedFungus            Negative    0.00
                          Positive    1.00
affirmedInvasivenes