In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.utils import resample

In [2]:
# Directory holding the CSV metadata files (relative to the working directory).
data_dir = './data/csv'

# Build both input paths from the same base directory.
data_entry_file, sample_labels_file = (
    os.path.join(data_dir, csv_name)
    for csv_name in ('Data_Entry_2017_v2020.csv', 'sample_labels.csv')
)

# Load the data-entry table and the sample-labels table.
data_entry_df = pd.read_csv(data_entry_file)
sample_labels_df = pd.read_csv(sample_labels_file)

In [3]:
# Work on copies so the frames loaded from disk are never modified.
data_entry_df_copy, sample_labels_df_copy = (
    loaded_frame.copy() for loaded_frame in (data_entry_df, sample_labels_df)
)

In [4]:
# One-hot encode the pipe-separated 'Finding Labels' strings: one 0/1 column
# per distinct label.
finding_labels_df = data_entry_df_copy['Finding Labels'].str.get_dummies(sep='|')

# Append the indicator columns to the right of the existing columns.
# `axis` is passed by keyword: pandas 2.0 made every `concat` argument after
# `objs` keyword-only, so the positional form `concat(objs, 1)` raises there.
data_entry_df_copy = pd.concat([data_entry_df_copy, finding_labels_df], axis=1)

In [5]:
# Metadata columns: original CSV header -> snake_case name.
metadata_columns_map = {
    'Image Index': 'image_index',
    'Finding Labels': 'finding_labels',
    'Follow-up #': 'follow_up_number',
    'Patient ID': 'patient_id',
    'Patient Age': 'patient_age',
    'Patient Gender': 'patient_gender',
    'View Position': 'view_position',
    'OriginalImage[Width': 'original_image_width',
    'Height]': 'original_image_height',
    'OriginalImagePixelSpacing[x': 'original_image_pixel_spacing_x',
    'y]': 'original_image_pixel_spacing_y',
}

# One-hot finding columns: label as it appears in the data -> snake_case name.
finding_columns_map = {
    'Atelectasis': 'atelectasis',
    'Cardiomegaly': 'cardiomegaly',
    'Consolidation': 'consolidation',
    'Edema': 'edema',
    'Effusion': 'effusion',
    'Emphysema': 'emphysema',
    'Fibrosis': 'fibrosis',
    'Hernia': 'hernia',
    'Infiltration': 'infiltration',
    'Mass': 'mass',
    'No Finding': 'no_finding',
    'Nodule': 'nodule',
    'Pleural_Thickening': 'pleural_thickening',
    'Pneumonia': 'pneumonia',
    'Pneumothorax': 'pneumothorax',
}

# Single lookup covering both groups, metadata entries first.
old_new_columns_map = {**metadata_columns_map, **finding_columns_map}

# `rename` skips map keys that are not columns of the frame (default
# errors='ignore'), so the same map is applied to both frames.
data_entry_df_copy = data_entry_df_copy.rename(columns=old_new_columns_map)
sample_labels_df_copy = sample_labels_df_copy.rename(columns=old_new_columns_map)

In [6]:
# (label as produced by get_dummies, snake_case column name) pairs.
finding_label_pairs = (
    ('Atelectasis', 'atelectasis'),
    ('Cardiomegaly', 'cardiomegaly'),
    ('Consolidation', 'consolidation'),
    ('Edema', 'edema'),
    ('Effusion', 'effusion'),
    ('Emphysema', 'emphysema'),
    ('Fibrosis', 'fibrosis'),
    ('Hernia', 'hernia'),
    ('Infiltration', 'infiltration'),
    ('Mass', 'mass'),
    ('No Finding', 'no_finding'),
    ('Nodule', 'nodule'),
    ('Pleural_Thickening', 'pleural_thickening'),
    ('Pneumonia', 'pneumonia'),
    ('Pneumothorax', 'pneumothorax'),
)

# Note: this rebinds `old_new_columns_map` to the finding-label entries only.
old_new_columns_map = dict(finding_label_pairs)

# Give the stand-alone one-hot frame the same snake_case column names.
finding_labels_df = finding_labels_df.rename(columns=old_new_columns_map)

In [7]:
# Total number of rows in the data-entry table.
sample_size = len(data_entry_df_copy)

# Split the rows on the `pneumonia` indicator column.
pneumonia_flag = data_entry_df_copy['pneumonia']
pneumonia_cases = data_entry_df_copy[pneumonia_flag == 1]
nonpneumonia_cases = data_entry_df_copy[pneumonia_flag == 0]

num_pneumonia_cases = len(pneumonia_cases)
num_nonpneumonia_cases = len(nonpneumonia_cases)

# Class shares as fractions of all rows (count / total, not multiplied by 100).
pct_pneumonia_cases = num_pneumonia_cases / sample_size
pct_nonpneumonia_cases = num_nonpneumonia_cases / sample_size

In [8]:
# Report the class counts and shares, one line per (template, value) pair.
# The two "Percentage" lines print the count / total fractions with two
# decimals (e.g. 0.01); they are not scaled by 100.
summary_lines = (
    ('Total number of records: {}', sample_size),
    ('Total number of Pneumonia cases: {}', num_pneumonia_cases),
    ('Total number of non Pneumonia cases: {}', num_nonpneumonia_cases),
    ('Percentage of Pneumonia: {:0.2f}', pct_pneumonia_cases),
    ('Percentage of non Pneumonia: {:0.2f}', pct_nonpneumonia_cases),
)

for template, value in summary_lines:
    print(template.format(value))

Total number of records: 112120
Total number of Pneumonia cases: 1431
Total number of non Pneumonia cases: 110689
Percentage of Pneumonia: 0.01
Percentage of non Pneumonia: 0.99


---
---
---

In [9]:
# Target size of the combined dataset: the row count of the sample-labels table.
final_size = len(sample_labels_df_copy)

# p_size counts the pneumonia rows; n_size is what remains of the target size.
p_size = len(pneumonia_cases)
n_size = final_size - p_size

In [10]:
# Minority class: an independent (deep) copy of the pneumonia rows.
df_minority = pneumonia_cases.copy(deep=True)

In [11]:
# Majority class: non-pneumonia rows whose image_index also occurs in the
# sample-labels table.
in_sample_labels = nonpneumonia_cases['image_index'].isin(sample_labels_df_copy['image_index'])
df_majority = nonpneumonia_cases[in_sample_labels]

In [12]:
# Draw `n_size` majority rows without replacement. The fixed `random_state`
# makes the draw repeatable, so re-running the notebook selects the same rows
# (given the same input frame) instead of a different subset on every run.
df_majority_sample = df_majority.sample(n=n_size, random_state=42)

In [13]:
# Stack the sampled majority rows on top of the minority rows.
df_balanced = pd.concat([df_majority_sample, df_minority])

# Shuffle the rows (frac=1 returns every row in random order) and replace the
# index with a fresh 0..n-1 range. The fixed `random_state` keeps the shuffle
# repeatable across runs for the same input.
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

---
---
---

In [14]:
# Share of the target dataset size assigned to training; the rest is testing.
train_fraction = 0.8

# Round the training count to the nearest integer, then give the remainder
# to the test split so the two always add up to final_size.
training_size = int(np.rint(final_size * train_fraction))
testing_size = final_size - training_size

training_size, testing_size

(4485, 1121)

In [15]:
# Draw the training rows without replacement. The fixed `random_state` makes
# the selection repeatable for the same input frame, so the train/test split
# does not change from one run of the notebook to the next.
df_training = df_balanced.sample(n=training_size, random_state=42)

In [16]:
# Test set: every row of the combined frame whose index label was not drawn
# for training.
held_out = ~df_balanced.index.isin(df_training.index)
df_testing = df_balanced.loc[held_out]

In [17]:
# Shuffle each split and replace its index with a fresh 0..n-1 range.
# The fixed `random_state` keeps the row order repeatable across runs, so the
# CSV files written from these frames are identical for identical inputs.
df_training = df_training.sample(frac=1, random_state=42).reset_index(drop=True)
df_testing = df_testing.sample(frac=1, random_state=42).reset_index(drop=True)

In [18]:
# Preview the training split; as the cell's last expression it is rendered as the cell output.
df_training

Unnamed: 0,image_index,finding_labels,follow_up_number,patient_id,patient_age,patient_gender,view_position,original_image_width,original_image_height,original_image_pixel_spacing_x,...,emphysema,fibrosis,hernia,infiltration,mass,no_finding,nodule,pleural_thickening,pneumonia,pneumothorax
0,00000211_018.png,Cardiomegaly|Effusion|Pneumonia,18,211,59,F,AP,2500,2048,0.168000,...,0,0,0,0,0,0,0,0,1,0
1,00010352_046.png,No Finding,46,10352,32,M,PA,2850,2991,0.143000,...,0,0,0,0,0,1,0,0,0,0
2,00017643_003.png,Atelectasis,3,17643,88,M,AP,2500,2048,0.168000,...,0,0,0,0,0,0,0,0,0,0
3,00028882_006.png,No Finding,6,28882,30,M,PA,2020,2021,0.194311,...,0,0,0,0,0,1,0,0,0,0
4,00015375_007.png,No Finding,7,15375,74,M,AP,2500,2048,0.168000,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4480,00008716_000.png,Atelectasis|Cardiomegaly,0,8716,33,F,PA,2500,2048,0.168000,...,0,0,0,0,0,0,0,0,0,0
4481,00012428_003.png,No Finding,3,12428,62,F,AP,2500,2048,0.168000,...,0,0,0,0,0,1,0,0,0,0
4482,00006468_003.png,No Finding,3,6468,21,M,AP,2048,2500,0.171000,...,0,0,0,0,0,1,0,0,0,0
4483,00016765_000.png,No Finding,0,16765,51,M,PA,2962,2991,0.143000,...,0,0,0,0,0,1,0,0,0,0


In [19]:
# Preview the testing split; as the cell's last expression it is rendered as the cell output.
df_testing

Unnamed: 0,image_index,finding_labels,follow_up_number,patient_id,patient_age,patient_gender,view_position,original_image_width,original_image_height,original_image_pixel_spacing_x,...,emphysema,fibrosis,hernia,infiltration,mass,no_finding,nodule,pleural_thickening,pneumonia,pneumothorax
0,00013615_053.png,Pneumonia,53,13615,10,F,AP,2500,2048,0.168,...,0,0,0,0,0,0,0,0,1,0
1,00011237_067.png,Infiltration|Pneumonia,66,11237,56,F,AP,2048,2500,0.168,...,0,0,0,1,0,0,0,0,1,0
2,00000652_005.png,No Finding,5,652,49,F,AP,2500,2048,0.168,...,0,0,0,0,0,1,0,0,0,0
3,00012834_112.png,Edema|Infiltration|Pleural_Thickening|Pneumonia,112,12834,33,M,AP,2500,2048,0.168,...,0,0,0,1,0,0,0,1,1,0
4,00001661_000.png,No Finding,0,1661,16,M,PA,2500,2048,0.168,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1116,00002343_003.png,Effusion|Pneumonia,3,2343,31,F,PA,2682,2318,0.143,...,0,0,0,0,0,0,0,0,1,0
1117,00021711_015.png,Atelectasis|Effusion,15,21711,71,M,PA,2992,2991,0.143,...,0,0,0,0,0,0,0,0,0,0
1118,00025861_000.png,Infiltration,0,25861,26,M,PA,2892,2948,0.143,...,0,0,0,1,0,0,0,0,0,0
1119,00011424_006.png,No Finding,0,11424,63,F,PA,2562,2991,0.143,...,0,0,0,0,0,1,0,0,0,0


In [20]:
# Output locations, next to the input CSV files.
training_filepath = os.path.join(data_dir, "train_data.csv")
testing_filepath = os.path.join(data_dir, "test_data.csv")

# Write both splits with a header row. index=True also writes the row index
# as an unnamed first column, so readers need index_col=0 to recover it.
for split_frame, split_path in (
    (df_training, training_filepath),
    (df_testing, testing_filepath),
):
    split_frame.to_csv(split_path, index=True, header=True)