In [1]:
import numpy as np
import pandas as pd
import utils
from math import ceil
from sklearn.model_selection import GroupKFold

___
# Load cytology and histopathology reports
### Load metadata

In [2]:
# Read the metadata table (report IDs plus the report-level label)
metadata_csv = "../datasets/reports_metadata.csv"
df = pd.read_csv(metadata_csv)

# Show the table dimensions, then preview the first rows
print(df.shape)
df.head()

(283, 4)


Unnamed: 0,report_id,patient_id,report_no,y_report
0,754,1,1,Negative
1,214,2,1,Positive
2,125,3,1,Negative
3,859,4,1,Negative
4,381,5,1,Negative


### Read report texts

In [3]:
# Folder that holds the report texts
path = "../datasets/reports/"

# Resolve every metadata row to its text through utils.read_report and
# store the result in a new column
df['report_text'] = df.apply(lambda row: utils.read_report(row, path=path), axis=1)

### Summary statistics

In [4]:
print("Total number of reports:", df.shape[0])
print("Total number of unique patients:", df.patient_id.nunique())

# Report counts per patient, computed once and reused for min / max / median
reports_per_patient = df.groupby('patient_id').size()
print("Number of reports per patient varies from %d to %d with a median value of %d." %
      (reports_per_patient.min(), reports_per_patient.max(), reports_per_patient.median()))

# Length of every report text in characters (%d truncates the mean to an integer)
text_lengths = df.report_text.apply(len)
print("The average document length is %d characters." % text_lengths.mean())

Total number of reports: 283
Total number of unique patients: 201
Number of reports per patient varies from 1 to 6 with a median value of 1.
The average document length is 1353 characters.


In [5]:
# Print dataset-level statistics through the project helper. Judging by the
# recorded output it reports patient/report counts and the label distribution.
utils.print_stats(df)

Number of patients: 201
Number of reports: 283

Report-level annotation:
Negative    243
Positive     40
Name: y_report, dtype: int64

Proportion of positive reports: 14.1%


___
# Split data into development and test sets

In [6]:
# Encode the report label as an integer: 1 for "Positive", 0 for anything else
df['y'] = (df.y_report == "Positive").astype(int)

### Determine the target number of positive and negative reports in the test set (20% of each class)

In [7]:
# Target test set size, number of positive and negative reports.
# The per-class targets are looked up by label (0 = negative, 1 = positive)
# rather than unpacked positionally: value_counts() orders by frequency, so
# positional unpacking would silently swap the two targets if positive reports
# ever outnumbered negative ones, and would fail if one class were absent.
TEST_FRACTION = 0.2
target_counts = (
    (df.y.value_counts() * TEST_FRACTION)
    .round()
    .astype(int)
    .reindex([0, 1], fill_value=0)
)
n_neg_test = int(target_counts.loc[0])
n_pos_test = int(target_counts.loc[1])
n_test = n_neg_test + n_pos_test
print("The target is to have %d reports in the test set: %d negative and %d positive." % 
      (n_test, n_neg_test, n_pos_test))

The target is to have 57 reports in the test set: 49 negative and 8 positive.


### Allocate positive reports

In [8]:
# IDs of patients that have at least one positive report
pos_ids = df[df.y==1].patient_id.unique().tolist()

# Number of positive reports per patient, computed in a single pass instead of
# re-filtering the whole frame once per patient
pos_per_patient = df.groupby('patient_id').y.sum()

# IDs of patients that have only one positive report
one_pos_ids = [id_ for id_ in pos_ids if pos_per_patient.loc[id_] == 1]

print("There are %d patients that have at least one positive report, out of those %d have exactly one." % 
      (len(pos_ids), len(one_pos_ids)))

# Randomly select n_pos_test IDs for the test set.
# Sampling is done over the positive reports of single-positive patients, so
# each sampled row corresponds to a distinct patient. The label condition is an
# explicit comparison rather than `& df.y`, which relied on pandas coercing the
# integer column to a boolean mask.
is_single_positive = df.patient_id.isin(one_pos_ids) & (df.y == 1)
test_ids = df[is_single_positive].sample(n_pos_test, random_state=42).patient_id.tolist()

# Add ALL reports from these patients to the test set
test_data = df[df.patient_id.isin(test_ids)]
test_data.y_report.value_counts()

There are 28 patients that have at least one positive report, out of those 22 have exactly one.


Positive    8
Negative    5
Name: y_report, dtype: int64

In [9]:
# Remaining IDs will be allocated to the development set
test_id_set = set(test_ids)
dev_ids = [id_ for id_ in pos_ids if id_ not in test_id_set]

# Add ALL reports from the remaining patients to the development set, mirroring
# what is done for the test patients. Moving only their positive reports would
# leave their negative reports in the pool that is split afterwards, where they
# could end up in the test set and split one patient across both subsets.
dev_data = df[df.patient_id.isin(dev_ids)]
dev_data.y_report.value_counts()

Positive    32
Name: y_report, dtype: int64

In [10]:
# Remove from the original dataset reports that have already been allocated.
# Rows are filtered by index membership instead of drop(..., inplace=True):
# dropping labels that are already gone raises KeyError, so the original cell
# could not be re-run, whereas this filter simply finds nothing left to remove.
allocated_idx = dev_data.index.union(test_data.index)
df = df[~df.index.isin(allocated_idx)]
print( "After allocating %d reports to the development set and %d reports to the test set, there are %d reports left." % 
      (dev_data.shape[0], test_data.shape[0], df.shape[0]))

# Verify that all the remaining reports are negative
assert (df.y_report == "Negative").all()

After allocating 32 reports to the development set and 13 reports to the test set, there are 238 reports left.


### Allocate negative reports

In [11]:
# Negative reports already in the test set (brought in together with the
# positive reports of the selected test patients)
n_neg_current = (test_data.y==0).sum()

# How many negative reports should be added to the test set to meet the target?
n_neg_add = n_neg_test - n_neg_current

# The number of splits for GroupKFold generator: one fold out of n_splits
# should hold roughly n_neg_add of the remaining reports
n_remaining = df.shape[0]
n_splits = ceil(n_remaining / n_neg_add)

print("There are currently %d negative reports in the test set." % n_neg_current)
print("Need to add %d more negative reports to meet the target." % n_neg_add)
print("This will be best achieved by splitting off 1/%d of the remaning reports." % n_splits)

There are currently 5 negative reports in the test set.
Need to add 44 more negative reports to meet the target.
This will be best achieved by splitting off 1/6 of the remaning reports.


In [12]:
# Patient-grouped K-fold splitter
cv = GroupKFold(n_splits=n_splits)

# Split the remaining reports, using the patient ID as the grouping key
cv_generator = cv.split(X=df.report_text, y=df.y, groups=df.patient_id)

# Only the first fold is used: its two index arrays become the development
# and test portions of the remaining reports
dev_idx, test_idx = next(cv_generator)
n_dev_added, n_test_added = len(dev_idx), len(test_idx)
print("Adding %d and %d reports to the development and test sets, respectively." % 
      (n_dev_added, n_test_added))

Adding 198 and 40 reports to the development and test sets, respectively.


In [13]:
# Concatenate subsets of data.
# The merged frames are built under temporary names and only bound to
# dev_data / test_data once the checks below pass. Re-running this cell would
# otherwise append the same rows a second time without any warning.
dev_merged = pd.concat([dev_data, df.iloc[dev_idx]], axis=0)
test_merged = pd.concat([test_data, df.iloc[test_idx]], axis=0)

# Verify that every report appears exactly once across both subsets
assert dev_merged.index.append(test_merged.index).is_unique, \
    "Some reports were allocated more than once (was this cell run twice?)"

# Verify that all reports from the same patient belong to one subset of data
assert set(dev_merged.patient_id.unique()).isdisjoint(test_merged.patient_id.unique()), \
    "Some patients have reports in both the development and the test set"

dev_data, test_data = dev_merged, test_merged

print("The split results in a total of %d and %d reports in the development and test sets, respectively." %
      (dev_data.shape[0], test_data.shape[0]))

The split results in a total of 230 and 53 reports in the development and test sets, respectively.


In [14]:
# Print the same statistics for both subsets, separated by a blank line
subsets = [("DEVELOPMENT SET", dev_data), ("TEST SET", test_data)]
for position, (title, subset) in enumerate(subsets):
    if position:
        print()
    print(title)
    utils.print_stats(subset)

DEVELOPMENT SET
Number of patients: 164
Number of reports: 230

Report-level annotation:
Negative    198
Positive     32
Name: y_report, dtype: int64

Proportion of positive reports: 13.9%

TEST SET
Number of patients: 37
Number of reports: 53

Report-level annotation:
Negative    45
Positive     8
Name: y_report, dtype: int64

Proportion of positive reports: 15.1%


In [15]:
# Write both subsets to disk without the index column
outputs = (
    (dev_data, "../datasets/reports_dev.csv"),
    (test_data, "../datasets/reports_test.csv"),
)
for subset, csv_path in outputs:
    subset.to_csv(csv_path, index=False)