# Preprocessing of the Chest X-ray dataset
Dataset download: [ChestXray-NIHCC on Box](https://nihcc.app.box.com/v/ChestXray-NIHCC/)

In [1]:
import os 
import shutil
import numpy as np
from tqdm import tqdm
import pandas as pd 

In [2]:
# Root folder of the downloaded dataset. Set the CHEST_XRAY_DIR environment
# variable to point somewhere else; the original location stays the default.
path_raw = os.environ.get('CHEST_XRAY_DIR', '/home/imagedpt/.cache/torch/mmf/data/chest_x_ray')

### Split Train/Test

In [3]:
# Load the train/val split listing: one image filename per line.
# The `with` block closes the file once it has been read.
with open(os.path.join(path_raw, 'train_val_list.txt')) as file:
    # Blank lines (e.g. the one produced by a trailing newline) are dropped:
    # an empty name would make the copy loop point at the image folder itself.
    train_list = np.asarray([line.strip() for line in file if line.strip()], dtype=str)

In [4]:
# Copy every image named in train_list from the raw image folder into
# single_label/train.
source_dir = os.path.join(path_raw, 'images/images')
destination_dir = os.path.join(path_raw, 'single_label/train')
# Loop-invariant: create the destination once instead of once per image.
os.makedirs(destination_dir, exist_ok=True)
for f in tqdm(train_list):
    path_to_src = os.path.join(source_dir, f)
    path_to_destination = os.path.join(destination_dir, f)
    shutil.copy(path_to_src, path_to_destination)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 86524/86524 [01:35<00:00, 907.42it/s]


In [5]:
# Load the test split listing: one image filename per line.
# The `with` block closes the file once it has been read.
with open(os.path.join(path_raw, 'test_list.txt')) as file:
    # Blank lines (e.g. the one produced by a trailing newline) are dropped:
    # an empty name would make the copy loop point at the image folder itself.
    test_list = np.asarray([line.strip() for line in file if line.strip()], dtype=str)

In [6]:
# Copy every image named in test_list from the raw image folder into
# single_label/test.
source_dir = os.path.join(path_raw, 'images/images')
destination_dir = os.path.join(path_raw, 'single_label/test')
# Loop-invariant: create the destination once instead of once per image.
os.makedirs(destination_dir, exist_ok=True)
for f in tqdm(test_list):
    path_to_copy = os.path.join(source_dir, f)
    path_to_destination = os.path.join(destination_dir, f)
    shutil.copy(path_to_copy, path_to_destination)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25596/25596 [00:29<00:00, 869.36it/s]


### Split Classes

In [7]:
# Metadata table shipped with the dataset; the class-sorting loop further down
# reads its first two columns (image filename, then label string).
label_csv_path = os.path.join(path_raw, 'Data_Entry_2017_v2020.csv')
label = pd.read_csv(label_csv_path)

# One folder per entry is created under each split. The names are presumably
# the exact label strings found in the CSV -- verify against the data.
classes = [
    'Atelectasis',
    'Cardiomegaly',
    'Consolidation',
    'Edema',
    'Effusion',
    'Emphysema',
    'Fibrosis',
    'Hernia',
    'Infiltration',
    'Mass',
    'No Finding',
    'Nodule',
    'Pleural_Thickening',
    'Pneumonia',
    'Pneumothorax',
]

In [8]:
# Create one sub-folder per class inside both the train and the test split.
for c in classes:
    for split in ('train', 'test'):
        path_file = os.path.join(path_raw, 'single_label', split, c)
        # exist_ok=True keeps the cell re-runnable: os.mkdir raises
        # FileExistsError as soon as the folder is already there.
        os.makedirs(path_file, exist_ok=True)

In [9]:
# Move every single-label image from the root of its split folder into the
# sub-folder named after its label. Images carrying several labels (joined
# with '|') are left untouched in the split root.

# Set membership keeps each lookup O(1) instead of scanning the whole array
# for every CSV row.
train_names = set(train_list)
test_names = set(test_list)

# Explicit positional access: column 0 holds the image filename, column 1 the
# label string. Indexing a row Series with a bare integer relies on a
# deprecated positional fallback in recent pandas versions.
image_names = label.iloc[:, 0]
finding_labels = label.iloc[:, 1]

# total= lets tqdm show a real progress bar; a bare iterator has no length.
for name, finding in tqdm(zip(image_names, finding_labels), total=len(label)):
    if name in train_names:
        split = 'train'
    elif name in test_names:
        split = 'test'
    else:
        print('not found')
        continue
    if '|' in finding:
        # Multi-label image: not part of the single-label dataset.
        continue
    split_dir = os.path.join(path_raw, 'single_label', split)
    path_to_copy = os.path.join(split_dir, name)
    path_to_destination = os.path.join(split_dir, finding, name)
    # One move replaces the previous copy-then-delete pair.
    shutil.move(path_to_copy, path_to_destination)

112120it [03:02, 612.92it/s]


## Balance dataset

In [10]:
def _read_keep_list(list_path):
    """Return the set of filenames listed one per line in ``list_path``.

    Blank lines are ignored and surrounding whitespace is stripped. The file
    is closed as soon as it has been read.
    """
    with open(list_path, "r") as handle:
        return {line.strip() for line in handle if line.strip()}


def _prune_no_finding(split, keep_files):
    """Delete the "No Finding" images of ``split`` that are not in ``keep_files``.

    Args:
        split: Split folder name under ``single_label`` ("train" or "test").
        keep_files: Collection of filenames that must be kept.

    Prints how many files were deleted and how many were kept.
    """
    base_path = os.path.join(path_raw, "single_label", split, "No Finding")
    deleted, kept = 0, 0
    for f in tqdm(os.listdir(base_path)):
        if f not in keep_files:
            os.remove(os.path.join(base_path, f))
            deleted += 1
        else:
            kept += 1
    print(f"Deleted {deleted} files, kept {kept} ones.")


# The keep lists are resolved relative to the current working directory.
test_files = _read_keep_list("keep_test_files.txt")
train_files = _read_keep_list("keep_train_files.txt")

# Shrink the "No Finding" class in each split down to the listed files.
_prune_no_finding("test", test_files)
_prune_no_finding("train", train_files)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9861/9861 [00:00<00:00, 44819.03it/s]


Deleted 7261 files, kept 2600 ones.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50500/50500 [00:00<00:00, 51212.43it/s]

Deleted 43000 files, kept 7500 ones.



