In [1]:
import numpy as np
import pandas as pd
from utils.dataset import *
from utils.text import clean_text

___
# Load PIFIR reports
### Load metadata

In [2]:
# Define path to the PIFIR dataset. The trailing slash is intentional: other
# cells build sub-paths by plain string concatenation (e.g. path + "reports/").
path = "../../../Data/PIFIR/"

# Load the CSV file with report IDs and labels.
# `path` already ends with "/", so the file name is appended without a leading
# slash to avoid a doubled separator ("PIFIR//pifir_metadata.csv").
df = pd.read_csv(path + "pifir_metadata.csv")
print(df.shape)
df.head()

(201, 6)


Unnamed: 0,patient_id,scan_no,imaging_id,is_ifi_episode,val_fold,dataset
0,254,2,100,True,5.0,development
1,56,1,106,True,5.0,development
2,231,2,111,True,1.0,development
3,262,1,119,True,1.0,development
4,3,1,125,False,4.0,development


### Load report texts

In [3]:
# Fetch the text for every metadata row: each row is handed to read_reports
# together with the location of the reports/ folder, and the result is stored
# in the 'order_results' column.
df['order_results'] = df.apply(
    lambda row: read_reports(row, path=path + "reports/"),
    axis=1,
)

### Summary statistics

In [4]:
# Count the reports of each patient once and reuse it for min / max / median
reports_per_patient = df.groupby('patient_id').size()

print("Total number of reports:", df.shape[0])
print("Total number of unique patients:", df.patient_id.nunique())
# The median is formatted with %g rather than %d: it can be fractional
# (e.g. 1.5) and %d would silently truncate it. Whole values still print as "1".
print("Number of reports per patient varies from %d to %d with a median value of %g." %
      (reports_per_patient.min(),
       reports_per_patient.max(),
       reports_per_patient.median())
     )
# %d deliberately reports the mean length in whole characters
print("The average document length is %d characters." % df.order_results.apply(len).mean())

Total number of reports: 201
Total number of unique patients: 156
Number of reports per patient varies from 1 to 2 with a median value of 1.
The average document length is 1808 characters.


### Examine data splits

In [5]:
# Size of the development and test splits (rows with a missing label, if any, are counted too)
df['dataset'].value_counts(dropna=False)

dataset
development    159
test            42
Name: count, dtype: int64

In [6]:
# Number of reports assigned to each cross-validation fold, listed in fold order.
# value_counts() leaves out rows that have no fold value by default.
df['val_fold'].value_counts().sort_index()

val_fold
1.0    31
2.0    35
3.0    32
4.0    30
5.0    31
Name: count, dtype: int64

### Parse files with gold standard annotations

In [7]:
# For every report, build the mapping between character positions before and
# after text pre-processing (clean_text is called with return_mapping=True)
df['pos_mapping'] = df['order_results'].apply(
    lambda text: clean_text(text, return_mapping=True)
)

# Parse the gold standard annotation files found under the annotations/ folder.
# NOTE(review): the return value is not assigned, so read_annotations presumably
# updates df in place — confirm in utils.dataset.
read_annotations(df, path=path + "annotations/")

Found 155 discontinous concepts that should be merged
Extracted 3134 concepts and 1960 relations.
After handling discontinous concepts, there are a total of 3194 concepts.
Totalling 3466 concepts and composite concepts.


### Save datasets

In [8]:
# Write the development and test splits to separate CSV files (the row index is not saved)
df.loc[df['dataset'] == 'development'].to_csv("../datasets/reports_dev.csv", index=False)
df.loc[df['dataset'] == 'test'].to_csv("../datasets/reports_test.csv", index=False)