In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.utils import resample

In [2]:
# Directory holding the CSV inputs; the export cell at the end writes here too.
data_dir = '../archived/data/csv'

# Full data-entry table.
data_entry_file = os.path.join(data_dir, 'Data_Entry_2017_v2020.csv')
data_entry_df = pd.read_csv(data_entry_file)

# Sample-labels table.
sample_labels_file = os.path.join(data_dir, 'sample_labels.csv')
sample_labels_df = pd.read_csv(sample_labels_file)

In [3]:
# Work on copies so the frames loaded from disk are never modified.
data_entry_df_copy, sample_labels_df_copy = (
    loaded_frame.copy() for loaded_frame in (data_entry_df, sample_labels_df)
)

In [4]:
# One-hot encode the '|'-separated 'Finding Labels' strings: each distinct
# label becomes its own 0/1 indicator column.
finding_labels_df = data_entry_df_copy['Finding Labels'].str.get_dummies(sep='|')

# Append the indicator columns next to the existing columns. `axis` is passed
# by keyword: pandas 2.0 no longer accepts it positionally in `pd.concat`.
data_entry_df_copy = pd.concat([data_entry_df_copy, finding_labels_df], axis=1)

In [5]:
# Mapping from the original CSV headers to snake_case column names.
old_new_columns_map = {
    # Per-image metadata columns.
    **{
        'Image Index': 'image_index',
        'Finding Labels': 'finding_labels',
        'Follow-up #': 'follow_up_number',
        'Patient ID': 'patient_id',
        'Patient Age': 'patient_age',
        'Patient Gender': 'patient_gender',
        'View Position': 'view_position',
        'OriginalImage[Width': 'original_image_width',
        'Height]': 'original_image_height',
        'OriginalImagePixelSpacing[x': 'original_image_pixel_spacing_x',
        'y]': 'original_image_pixel_spacing_y',
    },
    # One-hot finding indicator columns.
    **{
        'Atelectasis': 'atelectasis',
        'Cardiomegaly': 'cardiomegaly',
        'Consolidation': 'consolidation',
        'Edema': 'edema',
        'Effusion': 'effusion',
        'Emphysema': 'emphysema',
        'Fibrosis': 'fibrosis',
        'Hernia': 'hernia',
        'Infiltration': 'infiltration',
        'Mass': 'mass',
        'No Finding': 'no_finding',
        'Nodule': 'nodule',
        'Pleural_Thickening': 'pleural_thickening',
        'Pneumonia': 'pneumonia',
        'Pneumothorax': 'pneumothorax',
    },
}

# Rename both working frames in place; keys that are not present in a frame
# are ignored by `rename`.
for frame in (data_entry_df_copy, sample_labels_df_copy):
    frame.rename(columns=old_new_columns_map, inplace=True)

In [6]:
# Rebind the map to the finding-label columns only (original header -> snake_case).
old_new_columns_map = dict([
    ('Atelectasis', 'atelectasis'),
    ('Cardiomegaly', 'cardiomegaly'),
    ('Consolidation', 'consolidation'),
    ('Edema', 'edema'),
    ('Effusion', 'effusion'),
    ('Emphysema', 'emphysema'),
    ('Fibrosis', 'fibrosis'),
    ('Hernia', 'hernia'),
    ('Infiltration', 'infiltration'),
    ('Mass', 'mass'),
    ('No Finding', 'no_finding'),
    ('Nodule', 'nodule'),
    ('Pleural_Thickening', 'pleural_thickening'),
    ('Pneumonia', 'pneumonia'),
    ('Pneumothorax', 'pneumothorax'),
])

# Give the standalone one-hot frame the same snake_case column names, in place.
finding_labels_df.rename(columns=old_new_columns_map, inplace=True)

In [7]:
# Total number of rows in the data-entry table.
sample_size = len(data_entry_df_copy)

# Partition the rows on the one-hot `pneumonia` indicator.
pneumonia_cases = data_entry_df_copy.loc[lambda rows: rows['pneumonia'] == 1]
nonpneumonia_cases = data_entry_df_copy.loc[lambda rows: rows['pneumonia'] == 0]

# Absolute class counts.
num_pneumonia_cases = len(pneumonia_cases)
num_nonpneumonia_cases = len(nonpneumonia_cases)

# Class shares as fractions of all rows (values between 0 and 1, not 0-100).
pct_pneumonia_cases = num_pneumonia_cases / sample_size
pct_nonpneumonia_cases = num_nonpneumonia_cases / sample_size

In [8]:
# Class-balance report: (format template, value) pairs printed in order.
summary_rows = [
    ('Total number of records: {}', sample_size),
    ('Total number of Pneumonia cases: {}', num_pneumonia_cases),
    ('Total number of non Pneumonia cases: {}', num_nonpneumonia_cases),
    # The two shares are fractions of the total, shown with two decimals.
    ('Percentage of Pneumonia: {:0.2f}', pct_pneumonia_cases),
    ('Percentage of non Pneumonia: {:0.2f}', pct_nonpneumonia_cases),
]

for template, value in summary_rows:
    print(template.format(value))

Total number of records: 112120
Total number of Pneumonia cases: 1431
Total number of non Pneumonia cases: 110689
Percentage of Pneumonia: 0.01
Percentage of non Pneumonia: 0.99


---
---
---

In [9]:
# Row budget for the balanced set: as many rows as the sample-labels table.
final_size = len(sample_labels_df_copy)

# Each class contributes half of that budget, rounded down.
size = final_size // 2

In [10]:
# Pneumonia rows whose image also appears in the sample-labels table.
df_minority = pneumonia_cases.loc[
    pneumonia_cases['image_index'].isin(sample_labels_df_copy['image_index'])
]

In [11]:
# Non-pneumonia rows whose image also appears in the sample-labels table.
df_majority = nonpneumonia_cases.loc[
    nonpneumonia_cases['image_index'].isin(sample_labels_df_copy['image_index'])
]

In [12]:
# Draw `size` pneumonia rows WITH replacement, so individual rows can repeat.
# `size` is half the sample-labels row count; the fixed seed keeps the draw
# reproducible.
df_minority_upsampled = resample(
    df_minority,
    n_samples=size,
    replace=True,
    random_state=123,
)

In [13]:
# Draw `size` non-pneumonia rows WITHOUT replacement, so df_majority must hold
# at least `size` rows. `size` is half the sample-labels row count; the fixed
# seed keeps the draw reproducible.
df_majority_downsampled = resample(
    df_majority,
    n_samples=size,
    replace=False,
    random_state=123,
)

In [14]:
# Stack the two equally sized class samples into a single frame.
df_balanced = pd.concat([df_majority_downsampled, df_minority_upsampled])

# Shuffle the rows so the classes are interleaved, then renumber the index
# 0..n-1. The shuffle is seeded (same seed as the `resample` calls) so that
# Restart & Run All reproduces the same row order.
df_balanced = df_balanced.sample(frac=1, random_state=123).reset_index(drop=True)

---
---
---

In [15]:
# 80% of the row budget goes to training (rounded to the nearest integer);
# whatever is left over is the test size.
training_size = int(np.rint(0.8 * final_size))
testing_size = final_size - training_size

# Show both sizes as the cell output.
(training_size, testing_size)

(4485, 1121)

In [16]:
# Draw `training_size` rows at random, without replacement, as the training
# split. The draw is seeded so the train/test split is identical on every run.
# NOTE(review): df_balanced contains pneumonia rows that were upsampled WITH
# replacement before this split, so copies of the same image can end up in
# both the training rows and the held-out rows. Consider splitting first and
# upsampling only the training part.
df_training = df_balanced.sample(n=training_size, random_state=123)

In [17]:
# Test split = every row of df_balanced whose index label was not drawn for training.
df_testing = df_balanced.drop(index=df_training.index)

In [18]:
# Shuffle each split and renumber its index 0..n-1. Both shuffles are seeded
# (same seed as the `resample` calls) so the exported row order is the same on
# every run.
df_training = df_training.sample(frac=1, random_state=123).reset_index(drop=True)
df_testing = df_testing.sample(frac=1, random_state=123).reset_index(drop=True)

In [19]:
# Display the shuffled training split as the cell output.
df_training

Unnamed: 0,image_index,finding_labels,follow_up_number,patient_id,patient_age,patient_gender,view_position,original_image_width,original_image_height,original_image_pixel_spacing_x,...,emphysema,fibrosis,hernia,infiltration,mass,no_finding,nodule,pleural_thickening,pneumonia,pneumothorax
0,00020482_025.png,Infiltration|Pneumonia,25,20482,29,F,AP,2500,2048,0.168,...,0,0,0,1,0,0,0,0,1,0
1,00012667_000.png,Pneumonia,0,12667,7,F,PA,1804,1685,0.143,...,0,0,0,0,0,0,0,0,1,0
2,00015171_020.png,Infiltration,20,15171,67,M,PA,3056,2544,0.139,...,0,0,0,1,0,0,0,0,0,0
3,00005391_002.png,No Finding,2,5391,72,F,AP,3012,2544,0.139,...,0,0,0,0,0,1,0,0,0,0
4,00008470_008.png,Pneumonia,8,8470,29,F,AP,2500,2048,0.171,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4480,00001373_010.png,Cardiomegaly|Effusion|Pneumonia,10,1373,58,M,AP,3056,2544,0.139,...,0,0,0,0,0,0,0,0,1,0
4481,00013424_004.png,Effusion,4,13424,56,M,AP,2500,2048,0.168,...,0,0,0,0,0,0,0,0,0,0
4482,00027757_002.png,Effusion|Infiltration|Pneumonia,2,27757,21,M,AP,3056,2544,0.139,...,0,0,0,1,0,0,0,0,1,0
4483,00015069_000.png,Infiltration|Pneumonia,0,15069,25,M,PA,2048,2500,0.168,...,0,0,0,1,0,0,0,0,1,0


In [20]:
# Display the shuffled test split as the cell output.
df_testing

Unnamed: 0,image_index,finding_labels,follow_up_number,patient_id,patient_age,patient_gender,view_position,original_image_width,original_image_height,original_image_pixel_spacing_x,...,emphysema,fibrosis,hernia,infiltration,mass,no_finding,nodule,pleural_thickening,pneumonia,pneumothorax
0,00013952_000.png,Atelectasis|Pneumonia,0,13952,59,M,PA,2500,2048,0.168,...,0,0,0,0,0,0,0,0,1,0
1,00012741_005.png,Atelectasis|Consolidation|Infiltration,5,12741,42,M,PA,2842,2991,0.143,...,0,0,0,1,0,0,0,0,0,0
2,00005721_003.png,Mass,3,5721,65,M,AP,2500,2048,0.171,...,0,0,0,0,1,0,0,0,0,0
3,00020524_008.png,No Finding,8,20524,47,F,PA,2482,2653,0.143,...,0,0,0,0,0,1,0,0,0,0
4,00005876_000.png,Infiltration|Pleural_Thickening,0,5876,34,M,PA,2500,2048,0.171,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1116,00024891_000.png,No Finding,0,24891,30,M,PA,2610,2991,0.143,...,0,0,0,0,0,1,0,0,0,0
1117,00012158_025.png,Pneumonia,21,12158,47,F,PA,2992,2991,0.143,...,0,0,0,0,0,0,0,0,1,0
1118,00013249_033.png,Cardiomegaly|Edema|Infiltration|Pneumonia,33,13249,14,M,AP,2500,2048,0.168,...,0,0,0,1,0,0,0,0,1,0
1119,00000524_001.png,Fibrosis|Pleural_Thickening,1,524,50,M,PA,2500,2048,0.168,...,0,1,0,0,0,0,0,1,0,0


In [21]:
# Output locations, next to the input CSVs.
training_filepath = os.path.join(data_dir, "train_data_model_7.csv")
testing_filepath = os.path.join(data_dir, "test_data_model_7.csv")

# Write the training split first, then the test split; the row index and the
# header row are both included in each file.
for split_frame, split_path in (
    (df_training, training_filepath),
    (df_testing, testing_filepath),
):
    split_frame.to_csv(split_path, index=True, header=True)