In [1]:
import numpy as np
import pandas as pd
from utils.dataset import *
from utils.text import clean_text

___
# Load cytology and histopathology reports
### Load metadata

In [2]:
# Define path to CHIFIR dataset.
# The trailing slash matters: later cells build sub-paths by plain string
# concatenation (path + "reports/", path + "annotations/").
path = "../../../Data/CHIFIR/"

# Load a csv file with report IDs and labels.
# `path` already ends with "/", so the file name is appended without a leading
# slash (the previous form produced ".../CHIFIR//chifir_metadata.csv").
df = pd.read_csv(path + "chifir_metadata.csv")
print(df.shape)
df.head()

(283, 7)


Unnamed: 0,patient_id,report_no,y_report,is_ifi_episode,histopathology_id,val_fold,dataset
0,13,1,Positive,Yes,658,10.0,development
1,14,1,Positive,Yes,189,7.0,development
2,28,1,Negative,No,529,8.0,development
3,28,2,Positive,Yes,325,8.0,development
4,28,3,Negative,Yes,559,8.0,development


### Load report texts

In [3]:
# Read report texts.
# read_reports is called once per row (axis=1) and receives the reports
# directory through the `path` keyword; whatever it returns for a row is
# stored in the new `order_results` column.
df["order_results"] = df.apply(
    read_reports,
    axis=1,
    path=path + "reports/",
)

### Convert target variable to categorical

In [4]:
# Overwrite the label column with the output of labels2cat.
# NOTE(review): presumably a categorical encoding, going by the helper's name —
# confirm in utils.dataset.
df["y_report"] = labels2cat(df["y_report"])

### Summary statistics

In [5]:
# Number of rows (reports) contributed by each patient.
reports_per_patient = df.groupby("patient_id").size()
# Length, in characters, of every report text.
doc_lengths = df["order_results"].apply(len)

print("Total number of reports:", len(df))
print("Total number of unique patients:", df["patient_id"].nunique())

# NOTE: %d truncates fractional values (it does not round), so a median such as
# 1.5 or a non-integer mean is printed with its fractional part dropped.
per_patient_summary = (
    reports_per_patient.min(),
    reports_per_patient.max(),
    reports_per_patient.median(),
)
print(
    "Number of reports per patient varies from %d to %d with a median value of %d."
    % per_patient_summary
)
print("The average document length is %d characters." % doc_lengths.mean())

Total number of reports: 283
Total number of unique patients: 201
Number of reports per patient varies from 1 to 6 with a median value of 1.
The average document length is 1353 characters.


In [6]:
# Project helper (not defined in this notebook; presumably star-imported from
# utils.dataset). Judging by its output below, it prints the number of patients,
# the number of reports, and the proportion of reports in each class of the
# given label column.
print_stats(df, 'y_report')

Number of patients: 201
Number of reports: 283

Proportion of reports of each class:
y_report
Negative    0.86
Positive    0.14
Name: proportion, dtype: float64


### Examine data splits

In [7]:
# How many reports in dev and test sets?
# dropna=False makes reports with a missing `dataset` value show up as their
# own row instead of being silently left out.
df["dataset"].value_counts(dropna=False)

dataset
development    231
test            52
Name: count, dtype: int64

In [8]:
# How many reports in each CV fold?
# value_counts() skips NaN by default, so reports without a fold number are
# not counted here; sort_index() orders the result by fold number.
(
    df["val_fold"]
    .value_counts()
    .sort_index()
)

val_fold
1.0     21
2.0     19
3.0     19
4.0     30
5.0     19
6.0     26
7.0     29
8.0     29
9.0     18
10.0    21
Name: count, dtype: int64

In [9]:
# Proportion of positive reports in each CV fold?
# normalize=True turns the per-fold counts into within-fold proportions,
# which are then rounded to two decimals for display.
(
    df.groupby("val_fold")["y_report"]
    .value_counts(normalize=True)
    .round(2)
)

val_fold  y_report
1.0       Negative    0.95
          Positive    0.05
2.0       Negative    0.79
          Positive    0.21
3.0       Negative    0.95
          Positive    0.05
4.0       Negative    0.83
          Positive    0.17
5.0       Negative    0.89
          Positive    0.11
6.0       Negative    0.77
          Positive    0.23
7.0       Negative    0.97
          Positive    0.03
8.0       Negative    0.79
          Positive    0.21
9.0       Negative    0.83
          Positive    0.17
10.0      Negative    0.86
          Positive    0.14
Name: proportion, dtype: float64

### Parse files with gold standard annotations

In [10]:
# Map character positions before and after text pre-processing.
# clean_text is applied to every report text with return_mapping=True and its
# result is kept in a new `pos_mapping` column.
df["pos_mapping"] = df["order_results"].apply(
    clean_text,
    return_mapping=True,
)

# Parse the gold standard annotation files found under the annotations directory.
# NOTE(review): the return value is not captured, so the helper presumably
# works through `df` in place and/or its printed summary — confirm in utils.dataset.
read_annotations(df, path=path + "annotations/")

Found 0 discontinous concepts that should be merged
Extracted 1137 concepts and 606 relations.
After handling discontinous concepts there are a total of 1155 concepts.
Totalling 1497 concepts and composite concepts.


### Save datasets

In [11]:
# Write one CSV per split, development first and then test. The frame's index
# is not written (index=False).
for split_name, out_file in (
    ("development", "../datasets/reports_dev.csv"),
    ("test", "../datasets/reports_test.csv"),
):
    df[df["dataset"] == split_name].to_csv(out_file, index=False)