In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.utils import resample

In [2]:
# Directory holding the CSV metadata files, relative to the working directory.
data_dir = './data/csv'

# Resolve both input paths up front so the load step below reads uniformly.
data_entry_file = os.path.join(data_dir, 'Data_Entry_2017_v2020.csv')
sample_labels_file = os.path.join(data_dir, 'sample_labels.csv')

# Read each CSV into its own DataFrame, in the same order as the paths above.
data_entry_df, sample_labels_df = (
    pd.read_csv(csv_path) for csv_path in (data_entry_file, sample_labels_file)
)

In [3]:
# Take copies of both loaded frames; the later cells rename and extend these
# copies rather than the frames as read from disk.
data_entry_df_copy, sample_labels_df_copy = (
    data_entry_df.copy(),
    sample_labels_df.copy(),
)

In [4]:
# One-hot encode the pipe-separated 'Finding Labels' string: one 0/1 indicator
# column per distinct label found in the data.
finding_labels_df = data_entry_df_copy['Finding Labels'].str.get_dummies(sep='|')

# Append the indicator columns alongside the existing columns.
# `axis` is passed by keyword: pandas deprecated positional arguments to
# `pd.concat` (other than `objs`) and pandas 2.x rejects them outright.
data_entry_df_copy = pd.concat([data_entry_df_copy, finding_labels_df], axis=1)

In [5]:
# Column renames: source CSV header -> snake_case name.
# The metadata headers are spelled out because their source forms are irregular
# (e.g. 'OriginalImage[Width', 'y]'). The finding-label columns all follow one
# rule (lower-case, space -> underscore), so their targets are derived from it.
old_new_columns_map = {
    'Image Index': 'image_index',
    'Finding Labels': 'finding_labels',
    'Follow-up #': 'follow_up_number',
    'Patient ID': 'patient_id',
    'Patient Age': 'patient_age',
    'Patient Gender': 'patient_gender',
    'View Position': 'view_position',
    'OriginalImage[Width': 'original_image_width',
    'Height]': 'original_image_height',
    'OriginalImagePixelSpacing[x': 'original_image_pixel_spacing_x',
    'y]': 'original_image_pixel_spacing_y',
    **{
        label: label.lower().replace(' ', '_')
        for label in (
            'Atelectasis',
            'Cardiomegaly',
            'Consolidation',
            'Edema',
            'Effusion',
            'Emphysema',
            'Fibrosis',
            'Hernia',
            'Infiltration',
            'Mass',
            'No Finding',
            'Nodule',
            'Pleural_Thickening',
            'Pneumonia',
            'Pneumothorax',
        )
    },
}

# Rename in place on both working copies, using the same mapping for each.
# With `rename`'s default error handling, keys that are not columns of a frame
# are skipped rather than raising.
sample_labels_df_copy.rename(columns=old_new_columns_map, inplace=True)
data_entry_df_copy.rename(columns=old_new_columns_map, inplace=True)

In [6]:
# Rebind the mapping to cover only the finding-label columns. Each target name
# is the source label lower-cased with spaces replaced by underscores
# (e.g. 'No Finding' -> 'no_finding').
old_new_columns_map = {
    label: label.lower().replace(' ', '_')
    for label in (
        'Atelectasis',
        'Cardiomegaly',
        'Consolidation',
        'Edema',
        'Effusion',
        'Emphysema',
        'Fibrosis',
        'Hernia',
        'Infiltration',
        'Mass',
        'No Finding',
        'Nodule',
        'Pleural_Thickening',
        'Pneumonia',
        'Pneumothorax',
    )
}

# Apply the same snake_case names to the standalone indicator frame, in place.
finding_labels_df.rename(columns=old_new_columns_map, inplace=True)

In [7]:
# Total number of rows in the working frame.
sample_size = len(data_entry_df_copy)

# Partition the rows on the pneumonia indicator column.
pneumonia_cases = data_entry_df_copy.query('pneumonia == 1')
nonpneumonia_cases = data_entry_df_copy.query('pneumonia == 0')

# Row count per class.
num_pneumonia_cases, num_nonpneumonia_cases = (
    len(pneumonia_cases),
    len(nonpneumonia_cases),
)

# Each class's share of all rows, kept as a fraction (not scaled by 100).
pct_pneumonia_cases, pct_nonpneumonia_cases = (
    num_pneumonia_cases / sample_size,
    num_nonpneumonia_cases / sample_size,
)

In [8]:
# Report the class balance, one line per figure.
# Note: the two 'Percentage' lines print the raw fraction with two decimals
# (e.g. 0.01), not a value scaled to 0-100.
print(
    'Total number of records: {}'.format(sample_size),
    'Total number of Pneumonia cases: {}'.format(num_pneumonia_cases),
    'Total number of non Pneumonia cases: {}'.format(num_nonpneumonia_cases),
    'Percentage of Pneumonia: {:0.2f}'.format(pct_pneumonia_cases),
    'Percentage of non Pneumonia: {:0.2f}'.format(pct_nonpneumonia_cases),
    sep='\n',
)

Total number of records: 112120
Total number of Pneumonia cases: 1431
Total number of non Pneumonia cases: 110689
Percentage of Pneumonia: 0.01
Percentage of non Pneumonia: 0.99


---
---
---

In [9]:
# Size bookkeeping for the balanced set: the negative class is cut down to the
# same number of rows as the pneumonia class, so the total is twice that count.
p_size = len(pneumonia_cases)   # pneumonia rows
n_size = p_size                 # non-pneumonia rows to keep
final_size = p_size + n_size    # rows in the balanced frame

In [10]:
# Minority class: every pneumonia row, taken as a deep copy of `pneumonia_cases`.
df_minority = pneumonia_cases.copy(deep=True)

In [11]:
# Majority-class pool: non-pneumonia rows whose image also appears in the
# sample-labels frame (matched on `image_index`).
df_majority = nonpneumonia_cases.loc[
    nonpneumonia_cases['image_index'].isin(sample_labels_df_copy['image_index'])
]

In [12]:
# Down-sample the majority pool to `n_size` rows (without replacement).
# `random_state` is fixed so the draw is repeatable: unseeded, every run picks
# different rows and the exported train/test CSVs change on each execution.
df_majority_sample = df_majority.sample(n=n_size, random_state=42)

In [13]:
# Stack the down-sampled majority rows on top of the minority rows.
df_balanced = pd.concat([df_majority_sample, df_minority])

# Shuffle all rows (`frac=1` returns every row in random order) and renumber
# them 0..n-1 so the index is unique for the split that follows.
# `random_state` is fixed so the shuffled order is the same on every run.
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

---
---
---

In [14]:
# 80/20 split sizes. The training share is rounded to the nearest integer and
# the remainder goes to testing, so the two always sum to `final_size`.
training_size = int(np.rint(0.8 * final_size))
testing_size = final_size - training_size

# Show both sizes as the cell output.
(training_size, testing_size)

(2290, 572)

In [15]:
# Draw the training rows from the balanced frame; the sampled rows keep their
# `df_balanced` index labels, which the test-set selection relies on.
# `random_state` is fixed so the split is identical on every run.
# NOTE(review): rows are sampled per image, not per patient, so images of one
# patient can land on both sides of the split (the saved outputs of this
# notebook show patient 17504 in both frames). Consider splitting on
# `patient_id` if that leakage matters for evaluation.
df_training = df_balanced.sample(n=training_size, random_state=42)

In [16]:
# Test set = every balanced row that was not drawn for training, in its
# original order. This relies on `df_training` still carrying `df_balanced`'s
# index labels, so it has to run before the training index is reset.
df_testing = df_balanced.drop(index=df_training.index)

In [17]:
# Shuffle each split and renumber its rows 0..n-1.
# `random_state` is fixed so the row order written to disk is reproducible.
# Resetting the index discards the link back to `df_balanced`'s row labels.
df_training = df_training.sample(frac=1, random_state=42).reset_index(drop=True)
df_testing = df_testing.sample(frac=1, random_state=42).reset_index(drop=True)

In [18]:
# Display the training frame as the cell output for a visual check.
df_training

Unnamed: 0,image_index,finding_labels,follow_up_number,patient_id,patient_age,patient_gender,view_position,original_image_width,original_image_height,original_image_pixel_spacing_x,...,emphysema,fibrosis,hernia,infiltration,mass,no_finding,nodule,pleural_thickening,pneumonia,pneumothorax
0,00010544_029.png,Pneumonia,29,10544,32,F,AP,2500,2048,0.168,...,0,0,0,0,0,0,0,0,1,0
1,00020209_007.png,Nodule|Pneumonia,7,20209,67,M,AP,2500,2048,0.168,...,0,0,0,0,0,0,1,0,1,0
2,00002919_004.png,Infiltration,1,2919,47,F,PA,2992,2991,0.143,...,0,0,0,1,0,0,0,0,0,0
3,00015594_001.png,No Finding,1,15594,60,M,AP,2500,2048,0.168,...,0,0,0,0,0,1,0,0,0,0
4,00017504_061.png,Edema|Infiltration|Pneumonia,61,17504,11,F,AP,2500,2048,0.168,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2285,00011523_000.png,No Finding,0,11523,42,F,PA,2992,2991,0.143,...,0,0,0,0,0,1,0,0,0,0
2286,00017710_009.png,Consolidation|Pneumonia,8,17710,44,F,AP,2500,2048,0.168,...,0,0,0,0,0,0,0,0,1,0
2287,00009386_003.png,Edema|Effusion|Infiltration|Pneumonia,3,9386,30,F,AP,2500,2048,0.168,...,0,0,0,1,0,0,0,0,1,0
2288,00021806_007.png,Atelectasis|Infiltration|Pneumonia,7,21806,47,F,AP,3056,2544,0.139,...,0,0,0,1,0,0,0,0,1,0


In [19]:
# Display the testing frame as the cell output for a visual check.
df_testing

Unnamed: 0,image_index,finding_labels,follow_up_number,patient_id,patient_age,patient_gender,view_position,original_image_width,original_image_height,original_image_pixel_spacing_x,...,emphysema,fibrosis,hernia,infiltration,mass,no_finding,nodule,pleural_thickening,pneumonia,pneumothorax
0,00026366_001.png,Pneumonia,1,26366,23,M,AP,2544,3056,0.139000,...,0,0,0,0,0,0,0,0,1,0
1,00007007_004.png,Atelectasis|Consolidation|Pneumonia,4,7007,44,F,AP,2500,2048,0.171000,...,0,0,0,0,0,0,0,0,1,0
2,00017504_059.png,Edema|Pneumonia,59,17504,11,F,AP,2500,2048,0.168000,...,0,0,0,0,0,0,0,0,1,0
3,00011328_004.png,No Finding,3,11328,39,M,PA,2992,2991,0.143000,...,0,0,0,0,0,1,0,0,0,0
4,00000080_005.png,No Finding,5,80,67,F,PA,1884,2021,0.194311,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
567,00014675_023.png,No Finding,23,14675,67,F,AP,2048,2500,0.168000,...,0,0,0,0,0,1,0,0,0,0
568,00021584_001.png,Pneumonia,8,21584,46,M,PA,2992,2991,0.143000,...,0,0,0,0,0,0,0,0,1,0
569,00013249_037.png,No Finding,37,13249,14,M,AP,2500,2048,0.168000,...,0,0,0,0,0,1,0,0,0,0
570,00004533_020.png,Edema|Effusion|Infiltration|Pneumonia,17,4533,34,M,AP,3004,2544,0.139000,...,0,0,0,1,0,0,0,0,1,0


In [20]:
# Destination paths for the two splits, inside the same directory as the inputs.
training_filepath = os.path.join(data_dir, "train_data.csv")
testing_filepath = os.path.join(data_dir, "test_data.csv")

# Write each split with its header row and with its index as the first column.
df_training.to_csv(training_filepath, index=True, header=True)
df_testing.to_csv(testing_filepath, index=True, header=True)