In [None]:
import sys
import os
from pathlib import Path

# Make the project's packages importable from this notebook.
# The root is taken to be two directory levels above the current working
# directory (os.path.abspath('') resolves to the CWD).
project_root = Path(os.path.abspath('')).parent.parent
# Only add the path once so re-running this cell does not pile up duplicates.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from dotenv import load_dotenv
# Load environment variables (e.g. from a .env file) before reading them below.
load_dotenv()

# Location of the NIH dataset; must be provided via the environment.
nih_dataset_root_dir = os.getenv("NIH_CXR14_DATASET_DIR")
if not nih_dataset_root_dir:
    # Fail here with a clear message instead of passing None to later cells.
    raise RuntimeError(
        "Environment variable NIH_CXR14_DATASET_DIR is not set; "
        "define it in the environment or in a .env file."
    )

# Directory (relative to the CWD) where the processed label files are written.
main_output_dir = "../data"
os.makedirs(main_output_dir, exist_ok=True)

In [2]:
## Load the NIH data entries

from src.datasets import NIHFindingLabels
from src.utils import print_dict_as_table


# Build the label dataset from the dataset root configured in the setup cell.
nih_finding_labels = NIHFindingLabels(nih_dataset_root_dir)


label_counts = nih_finding_labels.label_counts
# print_dict_as_table prints the table itself and returns None, so wrapping it
# in print() would emit an extra "None" line after the table.
print_dict_as_table(label_counts, ["Label", "Count"])

print("NIH-CXR14 Data Entries: ", len(nih_finding_labels))


# Show one entry to illustrate the per-item structure.
# NOTE(review): an item appears to be (label tensor, image file name, label names) - confirm.
print("NIH-CXR14 Data Entries Sample: ", nih_finding_labels[0])


Label                | Count     
--------------------------------
No Finding           | 60361     
Infiltration         | 19894     
Effusion             | 13317     
Atelectasis          | 11559     
Nodule               | 6331      
Mass                 | 5782      
Pneumothorax         | 5302      
Consolidation        | 4667      
Pleural_Thickening   | 3385      
Cardiomegaly         | 2776      
Emphysema            | 2516      
Edema                | 2303      
Fibrosis             | 1686      
Pneumonia            | 1431      
Hernia               | 227       
None
NIH-CXR14 Data Entries:  112120
NIH-CXR14 Data Entries Sample:  (tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       dtype=torch.float64), '00000001_000.png', ['No Finding', 'Infiltration', 'Effusion', 'Atelectasis', 'Nodule', 'Mass', 'Pneumothorax', 'Consolidation', 'Pleural_Thickening', 'Cardiomegaly', 'Emphysema', 'Edema', 'Fibrosis', 'Pneumonia', 'Hernia'])


In [3]:
## Collapse related findings into a single combined label

new_label = "Nodule/Mass"
labels_to_merge = ["Nodule", "Mass"]

# Rebind the dataset name to the result returned by merge_labels.
nih_finding_labels = nih_finding_labels.merge_labels(
    labels_to_merge,
    new_label,
)


In [4]:
## Filter by top k labels
top_k = 5

# Derive a dataset from the top_k setting.
# NOTE(review): presumably keeps the k most frequent labels - confirm in create_top_k_dataset.
top_k_labels = nih_finding_labels.create_top_k_dataset(top_k)

label_counts = top_k_labels.label_counts

# print_dict_as_table prints the table itself and returns None, so wrapping it
# in print() would emit an extra "None" line after the table.
print_dict_as_table(label_counts, ["Label", "Count"])
print("Top K Labels: ", label_counts.keys())
print("Top K Data Entries: ", len(top_k_labels))




Label                | Count     
--------------------------------
No Finding           | 60361     
Infiltration         | 19894     
Effusion             | 13317     
Atelectasis          | 11559     
Nodule/Mass          | 11207     
None
Top K Labels:  dict_keys(['No Finding', 'Infiltration', 'Effusion', 'Atelectasis', 'Nodule/Mass'])
Top K Data Entries:  102566


In [5]:
## Balance the label distribution

# Value handed to balance_labels.
# NOTE(review): presumably an upper bound on entries per label - confirm in balance_labels.
# NOTE(review): if balancing samples randomly, a fixed seed is needed for reproducibility - confirm.
balance_target = 13000

balanced_labels = top_k_labels.balance_labels(balance_target)

label_counts = balanced_labels.label_counts

# print_dict_as_table prints the table itself and returns None, so wrapping it
# in print() would emit an extra "None" line after the table.
print_dict_as_table(label_counts, ["Label", "Count"])
print("Top K Labels: ", label_counts.keys())
print("Top K Data Entries: ", len(balanced_labels))





Label                | Count     
--------------------------------
Infiltration         | 13000     
No Finding           | 13000     
Effusion             | 9651      
Nodule/Mass          | 8947      
Atelectasis          | 8591      
None
Top K Labels:  dict_keys(['Infiltration', 'No Finding', 'Effusion', 'Nodule/Mass', 'Atelectasis'])
Top K Data Entries:  48311


In [6]:
## Save the balanced data entries
# Writes the balanced dataset to main_output_dir via its save() method.
# NOTE(review): output file name(s) and format are decided inside save() - confirm.

balanced_labels.save(main_output_dir)