# Datasplits EDA

In [4]:
import pandas as pd
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import ast

# Loading the data-split files

In [24]:
# Read every data-split CSV into its own DataFrame.
# chestX_ray14 = pd.read_csv("../../../purrlab_students/ChestX-ray14/Data_Entry_2017.csv")
def _read_split(csv_path):
    """Load one split file with ``pd.read_csv``, using its first column as the index."""
    return pd.read_csv(csv_path, index_col=0)


# Tables read from GCS16l.csv, Bbox.csv, GCS4l.csv and RSNA.csv.
GCS16l = _read_split("../Data/Data_splits/GCS16l.csv")
Bbox = _read_split("../Data/Data_splits/Bbox.csv")
GCS4l = _read_split("../Data/Data_splits/GCS4l.csv")
RSNA = _read_split("../Data/Data_splits/RSNA.csv")

# Pathology-detection test split.
path_test = _read_split("../Data/Data_splits/pathology_detection-test.csv")

# Pathology-detection CXR14 fine-tuning split and its validation counterpart.
path_fine_CHX14 = _read_split("../Data/Data_splits/pathology_detection-CXR14-finetuning.csv")
path_val_CHX14 = _read_split("../Data/Data_splits/pathology_detection-CXR14-finetuning_val.csv")

# Tube-detection fine-tuning split and its validation counterpart.
tube_fine = _read_split("../Data/Data_splits/tube_detection-finetuning.csv")
tube_val = _read_split("../Data/Data_splits/tube_detection-finetuning_val.csv")

# Tube-detection annotation tables.
annotations = _read_split("../Data/Data_splits/tube_detection-Annotations.csv")
# NOTE(review): unlike every sibling path, this one has no ".csv" extension -- confirm the file name on disk.
CHX14_Ann = _read_split("../Data/Data_splits/tube_detection-CXR14_test")

# Combined CXR14 pathology-detection test table.
combined = _read_split("../Data/Data_splits/pathology_detection-CXR14_test_combined.csv")

# Obtaining the label distributions

In [19]:
def df_stats(df, first_label_col=2):
    """Print the row count of ``df`` and, per reported column, the share of rows equal to 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarise.
    first_label_col : int, default 2
        Position of the first column to report; every column before it is
        skipped. The default reproduces the original behaviour of skipping the
        first two columns (presumably non-label metadata -- confirm against the
        CSV files).

    Returns
    -------
    None
        Everything is written to stdout: ``Size: <rows>``, then one
        ``<column> : <pct>`` line per reported column, then a blank line.
        ``pct`` is ``100 * (number of cells equal to 1) / rows`` with two decimals.

    Notes
    -----
    * The denominator is always the total row count, so cells that are missing
      (NaN) simply count as "not 1".
    * A column that never holds the value 1 (for example a text column) is
      reported as ``0.00``.
    * An empty frame reports ``0.00`` for every column instead of raising
      ``ZeroDivisionError``.
    """
    n_rows = len(df)
    print("Size:", n_rows)

    for label in df.columns[first_label_col:]:
        # Vectorised count of cells equal to 1; int() keeps the arithmetic below in plain Python floats.
        n_positive = int((df[label] == 1).sum())
        # Guard the division so an empty frame yields 0.00 rather than crashing.
        pct = n_positive / n_rows * 100 if n_rows else 0.0
        print(label, ":", "{:.2f}".format(pct))
    print()

In [20]:
# Label prevalence in the tube-detection fine-tuning split (tube_detection-finetuning.csv).
df_stats(tube_fine)

Size: 2117
Labels : 0.00
Effusion : 21.63
Pneumothorax : 2.31
Atelectasis : 10.01
Cardiomegaly : 3.35
Pneumonia : 5.29
Chest_drain_tube : 8.31
NSG_tube : 74.82
Endotracheal_tube : 29.00
Tracheostomy_tube : 15.73



In [21]:
# Label prevalence in the tube-detection fine-tuning validation split (tube_detection-finetuning_val.csv).
df_stats(tube_val)

Size: 1456
Labels : 0.00
Effusion : 21.15
Pneumothorax : 2.47
Atelectasis : 9.34
Cardiomegaly : 4.46
Pneumonia : 5.77
Chest_drain_tube : 8.86
NSG_tube : 76.58
Endotracheal_tube : 31.46
Tracheostomy_tube : 13.26



In [22]:
# Label prevalence in the tube-detection annotations table (tube_detection-Annotations.csv).
df_stats(annotations)

Size: 1011
Labels : 0.00
Chest_drain_Ann : 3.66
NSG_tube_Ann : 44.41
Endotracheal_tube_Ann : 16.02
Tracheostomy_tube_Ann : 24.93
Effusion : 24.13
Pneumothorax : 2.18
Atelectasis : 8.80
Cardiomegaly : 3.46
Pneumonia : 8.51
Chest_drain_tube : 7.02
NSG_tube : 73.69
Endotracheal_tube : 35.41
Tracheostomy_tube : 23.24



In [25]:
# Label prevalence in the table read from tube_detection-CXR14_test.
# NOTE(review): the recorded output reports Pneumothorax at 100.00, i.e. every row has
# Pneumothorax == 1 -- presumably by construction of this set; confirm.
df_stats(CHX14_Ann)

Size: 2835
Finding Labels : 0.00
Chest_drain_Ann : 46.24
Effusion : 19.79
Pneumothorax : 100.00
Atelectasis : 14.22
Cardiomegaly : 0.99
Pneumonia : 0.78



In [26]:
# Label prevalence in the table read from GCS16l.csv.
df_stats(GCS16l)

Size: 810
Hernia : 0.62
Pneumonia : 0.25
Nodule : 16.05
Edema : 2.84
Other : 7.16
Infiltration : 7.53
Pneumothorax : 16.79
Abnormal : 71.36
Nodule or mass : 22.22
Consolidation : 10.25
Fibrosis : 1.60
Mass : 12.22
Emphysema : 0.86
Atelectasis : 37.41
Effusion : 27.90
Cardiomegaly : 10.00
Pleural_Thickening : 6.05



In [27]:
# Label prevalence in the table read from Bbox.csv.
df_stats(Bbox)

Size: 880
Pneumonia : 13.64
Nodule : 8.98
Infiltration : 13.98
Pneumothorax : 11.14
Nodule or mass : 18.64
Mass : 9.66
Atelectasis : 20.45
Effusion : 17.39
Cardiomegaly : 16.59



In [28]:
# Label prevalence in the table read from GCS4l.csv.
df_stats(GCS4l)

Size: 4376
Fracture : 4.25
Pneumothorax : 5.44
Nodule or mass : 13.83
Airspace opacity : 49.50



In [29]:
# Label prevalence in the table read from RSNA.csv.
df_stats(RSNA)

Size: 26684
Pneumonia : 22.53



In [30]:
# Label prevalence in the combined CXR14 test table (pathology_detection-CXR14_test_combined.csv).
df_stats(combined)

Size: 1664
Effusion : 22.60
Pneumothorax : 14.30
Atelectasis : 28.73
Cardiomegaly : 13.46
Pneumonia : 13.70



### Combining the pathology-detection and tube-detection DataFrames to obtain the label distributions for the pathology-detection sets

In [2]:
# Pathology-detection (PD) splits: train / val / test, first CSV column used as the index.
path_train, path_val, path_test = (
    pd.read_csv(split_csv, index_col=0)
    for split_csv in (
        "../Data/Data_splits/pathology_detection-train.csv",
        "../Data/Data_splits/pathology_detection-val.csv",
        "../Data/Data_splits/pathology_detection-test.csv",
    )
)

# Tube-detection (TD) splits: fine-tuning / fine-tuning validation / annotations.
tube_fine, tube_val, annotations = (
    pd.read_csv(split_csv, index_col=0)
    for split_csv in (
        "../Data/Data_splits/tube_detection-finetuning.csv",
        "../Data/Data_splits/tube_detection-finetuning_val.csv",
        "../Data/Data_splits/tube_detection-Annotations.csv",
    )
)

# Stack each PD split on top of its TD counterpart; rows are appended and keep their index labels.
fine = pd.concat((path_train, tube_fine))
val = pd.concat((path_val, tube_val))
test = pd.concat((path_test, annotations))


In [8]:
# Label prevalence in `fine` (path_train stacked with tube_fine).
# Percentages are taken over all concatenated rows, not over each source table separately.
df_stats(fine)

Size: 79063
Labels : 0.00
Effusion : 4.69
Pneumothorax : 0.27
Atelectasis : 1.48
Cardiomegaly : 9.26
Pneumonia : 4.32
Chest_drain_tube : 0.22
NSG_tube : 2.00
Endotracheal_tube : 0.78
Tracheostomy_tube : 0.42



In [9]:
# Label prevalence in `val` (path_val stacked with tube_val).
# Percentages are taken over all concatenated rows, not over each source table separately.
df_stats(val)

Size: 11148
Labels : 0.00
Effusion : 6.73
Pneumothorax : 0.51
Atelectasis : 2.31
Cardiomegaly : 8.61
Pneumonia : 4.33
Chest_drain_tube : 1.16
NSG_tube : 10.00
Endotracheal_tube : 4.11
Tracheostomy_tube : 1.73



In [10]:
# Label prevalence in `test` (path_test stacked with annotations).
# Percentages are taken over all concatenated rows, not over each source table separately.
df_stats(test)

Size: 10800
Labels : 0.00
Effusion : 5.97
Pneumothorax : 0.33
Atelectasis : 1.99
Cardiomegaly : 8.65
Pneumonia : 4.28
Chest_drain_tube : 0.66
NSG_tube : 6.90
Endotracheal_tube : 3.31
Tracheostomy_tube : 2.18
Chest_drain_Ann : 0.34
NSG_tube_Ann : 4.16
Endotracheal_tube_Ann : 1.50
Tracheostomy_tube_Ann : 2.33



In [3]:
# Uncomment to write the combined test split (`test`) to disk:
# test.to_csv("../Data/Data_splits/pathology_detection-test_ALL.csv")