___
# Automated extraction of IFI-related information
___

In [1]:
import numpy as np
import pandas as pd
from utils import text, prod, eval_ner
from utils.dev import get_cv_strategy

# import warnings
# warnings.filterwarnings("ignore")

___
# Detect concepts and relations
### Run concept recognition and relationship detection on the development set with CV
**Load and prepare data**

In [2]:
# Load the development set
df = pd.read_csv("../datasets/reports_dev.csv")

# Clean data
df['clean_text'] = df.order_results.apply(text.clean_text)

**Run NER**

In [3]:
cv = get_cv_strategy()

for train_idx, val_idx in cv.split(df.clean_text, df.y_report, df.patient_id):
    
    # Learn vocabulary and termsets
    vocab = text.learn_vocab(df.loc[train_idx].histopathology_id, expand=True)
    termset = text.learn_termset(df.loc[train_idx].histopathology_id)
    
    # Load NLP pipeline
    nlp = text.build_nlp_pipeline(termset)
    
    # Match phrases in text to detect concepts
    df.loc[val_idx, 'doc'] = text.detect_concepts(df.loc[val_idx, 'clean_text'], nlp, vocab)

Number of unique tokens in each category: [34, 58, 14, 21, 13, 44, 34, 6, 10]
Number of unique tokens in each category after expanding: [34, 58, 14, 22, 13, 44, 34, 6, 10]
Number of unique tokens in each termset: [28, 6, 28, 6]
Number of unique tokens in each category: [32, 50, 14, 18, 12, 43, 32, 4, 10]
Number of unique tokens in each category after expanding: [32, 50, 14, 21, 12, 43, 32, 4, 10]
Number of unique tokens in each termset: [26, 6, 26, 6]
Number of unique tokens in each category: [31, 59, 14, 21, 12, 43, 34, 6, 8]
Number of unique tokens in each category after expanding: [31, 59, 14, 22, 12, 43, 34, 6, 8]
Number of unique tokens in each termset: [28, 5, 28, 5]
Number of unique tokens in each category: [29, 57, 13, 21, 12, 40, 33, 6, 10]
Number of unique tokens in each category after expanding: [29, 57, 13, 22, 12, 40, 33, 6, 10]
Number of unique tokens in each termset: [27, 6, 27, 6]
Number of unique tokens in each category: [30, 55, 14, 17, 12, 39, 31, 6, 10]
Number of un

**Get detected concepts (either to use directly to evaluate NER or to get concept counts)**

In [4]:
# Transform predictions to a table of concepts
detected_concepts = text.get_concepts(df)

**Evaluate NER**

In [5]:
# Load gold standard concepts
concepts = text.load_annotations('composite')

# Get feature names
feature_names = prod.get_feature_names(['concepts','composite'])

# Calculate precision and recall
eval_ner.evaluate_ner(df[['histopathology_id', 'val_fold']], 
                      concepts, detected_concepts, feature_names)

                           mean   std
concept                              
ClinicalQuery              0.98  0.08
FungalDescriptor           0.73  0.10
Fungus                     0.89  0.09
Invasiveness               0.27  0.41
Stain                      0.95  0.06
SampleType                 0.14  0.03
positive                   0.04  0.03
equivocal                  0.01  0.01
negative                   0.11  0.03
affirmed_FungalDescriptor  0.42  0.18
affirmed_Fungus            0.76  0.35
affirmed_Invasiveness      0.03  0.07
affirmed_Stain             0.04  0.08
negated_FungalDescriptor   0.81  0.12
negated_Fungus             0.96  0.08
negated_Invasiveness       0.30  0.42
negated_Stain              0.02  0.04
                           mean   std
concept                              
ClinicalQuery              0.49  0.22
FungalDescriptor           0.93  0.05
Fungus                     0.93  0.14
Invasiveness               0.35  0.42
Stain                      0.96  0.07
SampleType  

  return x.tp.sum() / (x.tp.sum()+x.fp.sum())
  ).apply(precision).groupby('concept').agg(['mean', 'std']).round(2))
  return x.tp.sum() / (x.tp.sum()+x.fn.sum())
  ).apply(recall).groupby('concept').agg(['mean', 'std']).round(2))


### Run relationship detection using gold standard annotations on the development set with CV
**Load and prepare data**

In [6]:
# Load the development set
df = pd.read_csv("../datasets/reports_dev.csv")

# Clean data
df['clean_text'] = df.order_results.apply(text.clean_text)

**Run NER**

In [7]:
cv = get_cv_strategy()

for train_idx, val_idx in cv.split(df.clean_text, df.y_report, df.patient_id):
    
    # Learn termsets
    termset = text.learn_termset(df.loc[train_idx].histopathology_id)
    
    # Load NLP pipeline
    nlp = text.build_nlp_pipeline(termset)
    
    # Label known concepts
    df.loc[val_idx, 'doc'] = text.label_concepts(df.loc[val_idx, ['histopathology_id', 'clean_text']], nlp)


Number of unique tokens in each termset: [28, 6, 28, 6]
Number of unique tokens in each termset: [26, 6, 26, 6]
Number of unique tokens in each termset: [28, 5, 28, 5]
Number of unique tokens in each termset: [27, 6, 27, 6]
Number of unique tokens in each termset: [25, 6, 25, 6]
Number of unique tokens in each termset: [24, 6, 24, 6]
Number of unique tokens in each termset: [27, 6, 27, 6]
Number of unique tokens in each termset: [25, 5, 25, 5]
Number of unique tokens in each termset: [27, 6, 27, 6]
Number of unique tokens in each termset: [25, 6, 25, 6]


**Get detected concepts (either to use directly to evaluate NER or to get concept counts)**

In [8]:
# Transform predictions to a table of concepts
detected_concepts = text.get_concepts(df)

**Evaluate NER**

In [9]:
# Load gold standard concepts
concepts = text.load_annotations('composite')

# Get feature names
feature_names = prod.get_feature_names(['concepts','composite'])

# Calculate precision and recall
eval_ner.evaluate_ner(df[['histopathology_id', 'val_fold']], 
                      concepts, detected_concepts, feature_names)

                           mean   std
concept                              
ClinicalQuery              1.00  0.00
FungalDescriptor           1.00  0.00
Fungus                     1.00  0.00
Invasiveness               1.00  0.00
Stain                      1.00  0.00
SampleType                 1.00  0.00
positive                   1.00  0.00
equivocal                  1.00  0.00
negative                   1.00  0.00
affirmed_FungalDescriptor  0.67  0.24
affirmed_Fungus            0.78  0.34
affirmed_Invasiveness      0.05  0.10
affirmed_Stain             0.04  0.08
negated_FungalDescriptor   0.97  0.05
negated_Fungus             0.98  0.08
negated_Invasiveness       0.53  0.50
negated_Stain              0.02  0.04
                           mean   std
concept                              
ClinicalQuery              1.00  0.00
FungalDescriptor           1.00  0.00
Fungus                     1.00  0.00
Invasiveness               1.00  0.00
Stain                      1.00  0.00
SampleType  

  return x.tp.sum() / (x.tp.sum()+x.fp.sum())
  ).apply(precision).groupby('concept').agg(['mean', 'std']).round(2))
  return x.tp.sum() / (x.tp.sum()+x.fn.sum())
  ).apply(recall).groupby('concept').agg(['mean', 'std']).round(2))


### Run concept recognition and relationship detection on the test set

In [10]:
# Load the development set
df_test = pd.read_csv("../datasets/reports_test.csv")

# Clean data
df_test['clean_text'] = df_test.order_results.apply(text.clean_text)

# Learn vocabulary and termsets
vocab = text.learn_vocab(df.histopathology_id, expand=True)
termset = text.learn_termset(df.histopathology_id)

# Load NLP pipeline
nlp = text.build_nlp_pipeline(termset)

# Match phrases in text to detect concepts
df_test['doc'] = text.detect_concepts(df_test.clean_text, nlp, vocab)

# Transform predictions to a table of concepts
detected_concepts = text.get_concepts(df_test)

# Load gold standard concepts
concepts = text.load_annotations('composite')

# Get feature names
feature_names = prod.get_feature_names(['concepts','composite'])

# Calculate precision and recall
eval_ner.evaluate_ner(df_test[['histopathology_id', 'val_fold']], 
                      concepts, detected_concepts, feature_names)

Number of unique tokens in each category: [34, 60, 14, 21, 13, 44, 34, 6, 10]
Number of unique tokens in each category after expanding: [34, 60, 14, 22, 13, 44, 34, 6, 10]
Number of unique tokens in each termset: [28, 6, 28, 6]
concept
ClinicalQuery                0.83
FungalDescriptor             0.80
Fungus                       0.94
Invasiveness                 0.33
Stain                        0.94
SampleType                   0.13
positive                     0.04
equivocal                    0.00
negative                     0.10
affirmed_FungalDescriptor    0.41
affirmed_Fungus              0.67
affirmed_Invasiveness        0.00
affirmed_Stain               0.00
negated_FungalDescriptor     0.84
negated_Fungus               0.89
negated_Invasiveness         1.00
negated_Stain                0.04
dtype: float64
concept
ClinicalQuery                0.71
FungalDescriptor             0.98
Fungus                       0.94
Invasiveness                 0.33
Stain                      

  print(counts.groupby('concept').apply(precision).round(2))
  print(counts.groupby('concept').apply(recall).round(2))
  return x.tp.sum() / (x.tp.sum()+x.fn.sum())


### Run relationship detection using gold standard annotations on the test set

In [11]:
# Load the development set
df_test = pd.read_csv("../datasets/reports_test.csv")

# Clean data
df_test['clean_text'] = df_test.order_results.apply(text.clean_text)

# Learn termsets
termset = text.learn_termset(df.histopathology_id)

# Load NLP pipeline
nlp = text.build_nlp_pipeline(termset)

# Label known concepts
df_test['doc'] = text.label_concepts(df_test[['histopathology_id', 'clean_text']], nlp)

# Transform predictions to a table of concepts
detected_concepts = text.get_concepts(df_test)

# Load gold standard concepts
concepts = text.load_annotations('composite')

# Get feature names
feature_names = prod.get_feature_names(['concepts','composite'])

# Calculate precision and recall
eval_ner.evaluate_ner(df_test[['histopathology_id', 'val_fold']], 
                      concepts, detected_concepts, feature_names)

Number of unique tokens in each termset: [28, 6, 28, 6]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['doc'] = df.clean_text.apply(nlp)


concept
ClinicalQuery                1.00
FungalDescriptor             1.00
Fungus                       1.00
Invasiveness                 1.00
Stain                        1.00
SampleType                   1.00
positive                     1.00
equivocal                     NaN
negative                     1.00
affirmed_FungalDescriptor    0.60
affirmed_Fungus              0.71
affirmed_Invasiveness        0.00
affirmed_Stain               0.00
negated_FungalDescriptor     0.86
negated_Fungus               0.86
negated_Invasiveness         0.67
negated_Stain                0.05
dtype: float64
concept
ClinicalQuery                1.00
FungalDescriptor             0.96
Fungus                       0.89
Invasiveness                 1.00
Stain                        0.91
SampleType                   1.00
positive                     1.00
equivocal                     NaN
negative                     0.92
affirmed_FungalDescriptor    0.53
affirmed_Fungus              0.83
affirmed_Invasive

  print(counts.groupby('concept').apply(precision).round(2))
  return x.tp.sum() / (x.tp.sum()+x.fp.sum())
  print(counts.groupby('concept').apply(recall).round(2))
  return x.tp.sum() / (x.tp.sum()+x.fn.sum())
