# RSNA - Create Validation Dataset

This is a script to split the dataset into a training dataset and a validation dataset. As this is a multi-labeled dataset, it is not trivial to maintain the same label distribution (e.g. same frequency for the combination of 'epidural' and 'intraventricular'). Nevertheless, I tried to make the label frequency as similar as I can between the training and validation dataset.

In [1]:
import numpy as np
import pandas as pd

In [2]:
data_table = pd.read_pickle('rsna_data_table.pkl') 

## Create a "balanced" validation set manually

In [3]:
data_table.columns

Index(['epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid',
       'subdural', 'any', 'count'],
      dtype='object')

In [4]:
epidural_index = data_table[data_table["epidural"] == 1].index
print(len(epidural_index))

msk = np.random.rand(len(epidural_index)) < 0.84
epidural_train = epidural_index[msk]
epidural_valid = epidural_index[~msk]
print(len(epidural_train))
print(len(epidural_valid))

2761
2299
462


In [5]:
intrap_index = data_table[data_table["intraparenchymal"] == 1].index
print(len(intrap_index))

msk = np.random.rand(len(intrap_index)) < 0.84
intrap_train = intrap_index[msk]
intrap_valid = intrap_index[~msk]
print(len(intrap_train))
print(len(intrap_valid))

32564
27426
5138


In [6]:
intrav_index = data_table[data_table["intraventricular"] == 1].index
print(len(intrav_index))

msk = np.random.rand(len(intrav_index)) < 0.84
intrav_train = intrav_index[msk]
intrav_valid = intrav_index[~msk]
print(len(intrav_train))
print(len(intrav_valid))

23766
19963
3803


In [7]:
suba_index = data_table[data_table["subarachnoid"] == 1].index
print(len(suba_index))

msk = np.random.rand(len(suba_index)) < 0.84
suba_train = suba_index[msk]
suba_valid = suba_index[~msk]
print(len(suba_train))
print(len(suba_valid))

32122
26919
5203


In [8]:
subd_index = data_table[data_table["subdural"] == 1].index
print(len(subd_index))

msk = np.random.rand(len(subd_index)) < 0.84
subd_train = subd_index[msk]
subd_valid = subd_index[~msk]
print(len(subd_train))
print(len(subd_valid))

42496
35568
6928


In [9]:
any_index = data_table[data_table["any"] == 1].index
print(len(any_index))

97103


In [10]:
train_index = epidural_train.append(intrap_train)
train_index = train_index.append(intrav_train)
train_index = train_index.append(suba_train)
train_index = train_index.append(subd_train)
print(len(train_index))
train_index = train_index.drop_duplicates()
print(len(train_index))

112175
85451


In [11]:
valid_index = epidural_valid.append(intrap_valid)
valid_index = valid_index.append(intrav_valid)
valid_index = valid_index.append(suba_valid)
valid_index = valid_index.append(subd_valid)
print(len(valid_index))
valid_index = valid_index.drop_duplicates()
print(len(valid_index))

21534
20428


In [12]:
overlapped_index = train_index.intersection(valid_index)
print(len(overlapped_index))

msk = np.random.rand(len(overlapped_index)) < 0.84
overlapped_train = overlapped_index[msk]
overlapped_valid = overlapped_index[~msk]

train_index = train_index.drop(overlapped_valid)
valid_index = valid_index.drop(overlapped_train)
overlapped_index = train_index.intersection(valid_index)
print(len(overlapped_index))

8776
0


In [13]:
table_index = data_table.index
table_index = table_index.drop(train_index)
table_index = table_index.drop(valid_index)

msk = np.random.rand(len(table_index)) < 0.84
table_train = table_index[msk]
table_valid = table_index[~msk]

train_index = train_index.append(table_train)
valid_index = valid_index.append(table_valid)

print(len(train_index))
print(len(valid_index))
print(len(train_index.intersection(valid_index)))

568772
105486
0


In [14]:
train_table = data_table.loc[train_index]
valid_table = data_table.loc[valid_index]

In [15]:
print(len(train_table[train_table["epidural"]==1])/len(train_table))
print(len(train_table[train_table["intraparenchymal"]==1])/len(train_table))
print(len(train_table[train_table["intraventricular"]==1])/len(train_table))
print(len(train_table[train_table["subarachnoid"]==1])/len(train_table))
print(len(train_table[train_table["subdural"]==1])/len(train_table))
print(len(train_table[train_table["any"]==1])/len(train_table))
print(len(train_table))

0.004253022300675842
0.0511962614193385
0.037505362429936774
0.05023454037821834
0.0648133171112502
0.14788175226628597
568772


In [16]:
print(len(valid_table[valid_table["epidural"]==1])/len(valid_table))
print(len(valid_table[valid_table["intraparenchymal"]==1])/len(valid_table))
print(len(valid_table[valid_table["intraventricular"]==1])/len(valid_table))
print(len(valid_table[valid_table["subarachnoid"]==1])/len(valid_table))
print(len(valid_table[valid_table["subdural"]==1])/len(valid_table))
print(len(valid_table[valid_table["any"]==1])/len(valid_table))
print(len(valid_table))

0.003242136397247028
0.03265836224712284
0.023074152020173294
0.033653755000663595
0.05339097131372884
0.12316326337144265
105486


In [17]:
print(len(data_table[data_table["epidural"]==1])/len(train_table))
print(len(data_table[data_table["intraparenchymal"]==1])/len(train_table))
print(len(data_table[data_table["intraventricular"]==1])/len(train_table))
print(len(data_table[data_table["subarachnoid"]==1])/len(train_table))
print(len(data_table[data_table["subdural"]==1])/len(train_table))
print(len(data_table[data_table["any"]==1])/len(train_table))
print(len(data_table))

0.004854317723094667
0.05725316998727082
0.04178475733685906
0.05647605718987573
0.07471535166991343
0.17072394562320226
674258


In [18]:
train_table.to_pickle('rsna_train_table.pkl')
valid_table.to_pickle('rsna_valid_table.pkl')