In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Directory layout: everything lives under root_dir, split into a 'csv'
# sub-directory and a 'txt' sub-directory.
root_dir = './data'
csv_dir, txt_dir = (
    os.path.join(root_dir, sub_dir) for sub_dir in ('csv', 'txt')
)

# Load the data-entry table from the csv sub-directory.
data_entry_file = os.path.join(csv_dir, 'Data_Entry_2017_v2020.csv')
data_entry_df = pd.read_csv(data_entry_file)

In [3]:
# Work on an independent (deep) copy so the frame read from disk stays untouched.
data_entry_df_copy = data_entry_df.copy(deep=True)

In [4]:
# Split each '|'-separated 'Finding Labels' string into 0/1 indicator
# columns, one column per distinct label.
finding_labels_df = data_entry_df_copy['Finding Labels'].str.get_dummies(sep='|')

# Append the indicator columns side by side with the existing columns.
# `axis` must be passed by keyword: positional arguments other than `objs`
# were deprecated in pandas 1.3 and are rejected by pandas 2.x.
data_entry_df_copy = pd.concat([data_entry_df_copy, finding_labels_df], axis=1)

In [5]:
# Original column header -> snake_case name, covering the CSV columns first
# and the per-finding indicator columns after them.
old_new_columns_map = dict([
    ('Image Index', 'image_index'),
    ('Finding Labels', 'finding_labels'),
    ('Follow-up #', 'follow_up_number'),
    ('Patient ID', 'patient_id'),
    ('Patient Age', 'patient_age'),
    ('Patient Gender', 'patient_gender'),
    ('View Position', 'view_position'),
    ('OriginalImage[Width', 'original_image_width'),
    ('Height]', 'original_image_height'),
    ('OriginalImagePixelSpacing[x', 'original_image_pixel_spacing_x'),
    ('y]', 'original_image_pixel_spacing_y'),
    ('Atelectasis', 'atelectasis'),
    ('Cardiomegaly', 'cardiomegaly'),
    ('Consolidation', 'consolidation'),
    ('Edema', 'edema'),
    ('Effusion', 'effusion'),
    ('Emphysema', 'emphysema'),
    ('Fibrosis', 'fibrosis'),
    ('Hernia', 'hernia'),
    ('Infiltration', 'infiltration'),
    ('Mass', 'mass'),
    ('No Finding', 'no_finding'),
    ('Nodule', 'nodule'),
    ('Pleural_Thickening', 'pleural_thickening'),
    ('Pneumonia', 'pneumonia'),
    ('Pneumothorax', 'pneumothorax'),
])

# Rename the columns of the working frame in place; headers missing from the
# map are left as they are.
data_entry_df_copy.rename(old_new_columns_map, axis='columns', inplace=True)

In [6]:
# Finding label -> snake_case name. Rebinding old_new_columns_map here
# replaces whatever mapping the name held before this cell.
old_new_columns_map = dict([
    ('Atelectasis', 'atelectasis'),
    ('Cardiomegaly', 'cardiomegaly'),
    ('Consolidation', 'consolidation'),
    ('Edema', 'edema'),
    ('Effusion', 'effusion'),
    ('Emphysema', 'emphysema'),
    ('Fibrosis', 'fibrosis'),
    ('Hernia', 'hernia'),
    ('Infiltration', 'infiltration'),
    ('Mass', 'mass'),
    ('No Finding', 'no_finding'),
    ('Nodule', 'nodule'),
    ('Pleural_Thickening', 'pleural_thickening'),
    ('Pneumonia', 'pneumonia'),
    ('Pneumothorax', 'pneumothorax'),
])

# Rename the indicator frame's columns in place with the label-only mapping.
finding_labels_df.rename(old_new_columns_map, axis='columns', inplace=True)

In [7]:
# Take an independent (deep) copy of the working frame under the short name `df`.
df = data_entry_df_copy.copy(deep=True)

---
---
---

In [8]:
# Paths of the two list files inside txt_dir.
train_txt, test_txt = (
    os.path.join(txt_dir, list_name)
    for list_name in ('train_val_list.txt', 'test_list.txt')
)

# Show both paths as the cell output.
(train_txt, test_txt)

('./data/txt/train_val_list.txt', './data/txt/test_list.txt')

In [9]:
def get_target_list(txt_file):
    """Read a text file of entries, one per line, into a list.

    Parameters
    ----------
    txt_file : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of str
        Every non-blank line with surrounding whitespace removed, in file
        order. An empty file yields an empty list.
    """
    with open(txt_file) as f:
        # Strip each line, then drop blank ones so the result never
        # contains '' entries.
        return [entry for entry in (line.strip() for line in f) if entry]

In [10]:
# Read both list files (train/val first, then test) into Python lists.
train_list, test_list = map(get_target_list, (train_txt, test_txt))

In [11]:
# Keep only the rows whose image_index appears in train_list.
in_train_list = df['image_index'].isin(train_list)
train_df = df.loc[in_train_list]

In [12]:
# Keep only the rows whose image_index appears in test_list.
in_test_list = df['image_index'].isin(test_list)
test_df = df.loc[in_test_list]

In [13]:
# Write the train subset into csv_dir, keeping the row index and header row.
train_filepath = os.path.join(csv_dir, "train_data.csv")
train_df.to_csv(path_or_buf=train_filepath, header=True, index=True)

In [14]:
# Write the test subset into csv_dir, keeping the row index and header row.
test_filepath = os.path.join(csv_dir, "test_data.csv")
test_df.to_csv(path_or_buf=test_filepath, header=True, index=True)