In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Directory that holds the CSV metadata, relative to the working directory.
data_dir = './data/csv'

# Build the path to the data-entry table and read it into a DataFrame.
data_entry_file = os.path.join(data_dir, 'Data_Entry_2017_v2020.csv')
data_entry_df = pd.read_csv(filepath_or_buffer=data_entry_file)

In [3]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched.
data_entry_df_copy = data_entry_df.copy(deep=True)

In [4]:
# Expand the '|'-separated 'Finding Labels' string into one 0/1 indicator
# column per distinct label.
finding_labels_df = data_entry_df_copy['Finding Labels'].str.get_dummies(sep='|')

# Append the indicator columns side by side with the original columns.
# `axis` must be passed by keyword: pandas 2.0 made every `concat` argument
# after `objs` keyword-only, so the positional form `pd.concat([...], 1)`
# raises a TypeError there.
data_entry_df_copy = pd.concat([data_entry_df_copy, finding_labels_df], axis=1)

In [5]:
# Mapping from the original column headers to snake_case names.
# The metadata headers are irregular, so they are spelled out one by one.
old_new_columns_map = {
    'Image Index': 'image_index',
    'Finding Labels': 'finding_labels',
    'Follow-up #': 'follow_up_number',
    'Patient ID': 'patient_id',
    'Patient Age': 'patient_age',
    'Patient Gender': 'patient_gender',
    'View Position': 'view_position',
    'OriginalImage[Width': 'original_image_width',
    'Height]': 'original_image_height',
    'OriginalImagePixelSpacing[x': 'original_image_pixel_spacing_x',
    'y]': 'original_image_pixel_spacing_y',
}

# The indicator columns follow one rule: lower-case, spaces -> underscores.
old_new_columns_map.update({
    label: label.lower().replace(' ', '_')
    for label in (
        'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion',
        'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass',
        'No Finding', 'Nodule', 'Pleural_Thickening', 'Pneumonia',
        'Pneumothorax',
    )
})

# Apply the snake_case names; rebinding the name replaces the in-place rename.
data_entry_df_copy = data_entry_df_copy.rename(columns=old_new_columns_map)

In [6]:
# Snake_case names for the indicator columns only. Every label follows the
# same rule: lower-case it and turn spaces into underscores.
old_new_columns_map = {
    label: label.lower().replace(' ', '_')
    for label in (
        'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion',
        'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass',
        'No Finding', 'Nodule', 'Pleural_Thickening', 'Pneumonia',
        'Pneumothorax',
    )
}

# Give the stand-alone indicator frame the same snake_case column names.
finding_labels_df = finding_labels_df.rename(columns=old_new_columns_map)

In [7]:
# Total number of rows in the table.
sample_size = len(data_entry_df_copy)

# Split the rows on the 0/1 `pneumonia` indicator column.
is_pneumonia = data_entry_df_copy['pneumonia'] == 1
is_not_pneumonia = data_entry_df_copy['pneumonia'] == 0
pneumonia_cases = data_entry_df_copy[is_pneumonia]
nonpneumonia_cases = data_entry_df_copy[is_not_pneumonia]

# Row counts per class.
num_pneumonia_cases = len(pneumonia_cases)
num_nonpneumonia_cases = len(nonpneumonia_cases)

# Share of each class, expressed as a fraction of all rows (0..1).
pct_pneumonia_cases = num_pneumonia_cases / sample_size
pct_nonpneumonia_cases = num_nonpneumonia_cases / sample_size

In [8]:
# Report the class balance, one line per statistic.
# NOTE: the pct_* values are fractions of the total row count, so the
# "Percentage" lines show e.g. 0.01 for roughly 1%.
summary_lines = (
    'Total number of records: {}'.format(sample_size),
    'Total number of Pneumonia cases: {}'.format(num_pneumonia_cases),
    'Total number of non Pneumonia cases: {}'.format(num_nonpneumonia_cases),
    'Percentage of Pneumonia: {:0.2f}'.format(pct_pneumonia_cases),
    'Percentage of non Pneumonia: {:0.2f}'.format(pct_nonpneumonia_cases),
)
print(*summary_lines, sep='\n')

Total number of records: 112120
Total number of Pneumonia cases: 1431
Total number of non Pneumonia cases: 110689
Percentage of Pneumonia: 0.01
Percentage of non Pneumonia: 0.99


---
---
---

In [9]:
# Fixed seed so that a fresh "Restart & Run All" reproduces the same shuffle
# (and therefore the same train/test files further down).
SHUFFLE_SEED = 42

# Shuffle each class (frac=1 returns every row in random order) and renumber
# the rows 0..n-1, discarding the old index.
pneumonia_cases = (
    pneumonia_cases
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
nonpneumonia_cases = (
    nonpneumonia_cases
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

In [10]:
# Total number of rows (pneumonia + non-pneumonia) to keep across the
# training and testing sets combined.
# NOTE(review): 5606 equals 5% of the 112120 records reported above;
# presumably that is where the value comes from -- confirm.
TARGET_SAMPLE_SIZE = 5606

In [11]:
# 80% of the target sample goes to training (rounded to the nearest integer);
# whatever remains goes to testing, so the two sizes always add up exactly.
train_fraction = 0.8
training_size = int(np.rint(train_fraction * TARGET_SAMPLE_SIZE))
testing_size = TARGET_SAMPLE_SIZE - training_size

In [12]:
# Fixed seed so the same non-pneumonia rows are drawn on every run.
SUBSAMPLE_SEED = 42

# Every pneumonia row is kept; the rest of the target sample is filled with
# non-pneumonia rows.
nonpneumonia_cases_sample_size = TARGET_SAMPLE_SIZE - num_pneumonia_cases

# Draw that many non-pneumonia rows without replacement.
nonpneumonia_cases_sample = nonpneumonia_cases.sample(
    n=nonpneumonia_cases_sample_size,
    random_state=SUBSAMPLE_SEED,
)

---
---

In [13]:
# 80% of the pneumonia rows (rounded to the nearest integer) go to training.
pneumonia_train_fraction = 0.8
pneumonia_in_training_size = int(np.rint(pneumonia_train_fraction * num_pneumonia_cases))

# The remaining training slots are filled with non-pneumonia rows.
nonpneumonia_in_training_size = training_size - pneumonia_in_training_size

In [14]:
# Fixed seed so the train/test assignment is identical on every run.
SPLIT_SEED = 42

# Draw the training rows of each class without replacement.
# NOTE(review): rows are sampled per image, not per patient, so images that
# share a `patient_id` may land on both sides of the split -- confirm that
# this is acceptable for the downstream evaluation.
pneumonia_in_training = pneumonia_cases.sample(
    n=pneumonia_in_training_size,
    random_state=SPLIT_SEED,
)
nonpneumonia_in_training = nonpneumonia_cases_sample.sample(
    n=nonpneumonia_in_training_size,
    random_state=SPLIT_SEED,
)

In [15]:
# Testing rows are the rows of each class whose `image_index` was NOT drawn
# for training.
pneumonia_held_out = ~pneumonia_cases['image_index'].isin(
    pneumonia_in_training['image_index']
)
pneumonia_in_testing = pneumonia_cases[pneumonia_held_out]

nonpneumonia_held_out = ~nonpneumonia_cases_sample['image_index'].isin(
    nonpneumonia_in_training['image_index']
)
nonpneumonia_in_testing = nonpneumonia_cases_sample[nonpneumonia_held_out]

In [16]:
# Display the (rows, columns) shape of each of the four partitions.
(
    pneumonia_in_training.shape,
    nonpneumonia_in_training.shape,
    pneumonia_in_testing.shape,
    nonpneumonia_in_testing.shape,
)

((1145, 26), (3340, 26), (286, 26), (835, 26))

In [17]:
# Stack the pneumonia and non-pneumonia rows of each split into one frame.
# `drop=True` discards the pre-concat index instead of keeping it as an extra
# `index` column: those labels are leftover row positions from the per-class
# frames (they collide between the two classes) and carry no information.
# This also matches the `reset_index(drop=True)` used when shuffling.
training_df = pd.concat(
    [pneumonia_in_training, nonpneumonia_in_training]
).reset_index(drop=True)
testing_df = pd.concat(
    [pneumonia_in_testing, nonpneumonia_in_testing]
).reset_index(drop=True)

In [18]:
# Output locations for the two splits, next to the source CSV.
training_filepath = os.path.join(data_dir, "train_data.csv")
testing_filepath = os.path.join(data_dir, "test_data.csv")

# Write the training split first, then the testing split; both keep the
# header row and the row index.
for split_df, split_path in (
    (training_df, training_filepath),
    (testing_df, testing_filepath),
):
    split_df.to_csv(split_path, index=True, header=True)