In [1]:
import sys
import os
sys.path.append(os.path.abspath("..")) 

from dotenv import load_dotenv
load_dotenv()

IMAGE_SIZE = 224

# DATA ANALYZE
This section aims to analyze the NIH-CXR14 dataset through these operations:

* **NIHDataset**: Loads tabular data and image paths from NIH-CXR14 dataset 

* **Field Filtering**: Selects essential columns `["Image Index", "Finding Labels", "Image Path", "embeddings"]`

* **Balance Adjustment**: Handles class imbalance using the `Finding Labels` field

* **Label Exclusion**: Sets aside specific labels for validation using unseen data

In [2]:
from src.datasets import NIHDataset
nih_dataset = NIHDataset(
    root_dir=os.getenv("NIH_CXR14_DATASET_DIR"),
    img_size=IMAGE_SIZE)

In [3]:
len(nih_dataset)

112120

In [4]:
fields = nih_dataset.get_fields()
fields

['Embeddings',
 'Image Index',
 'Finding Labels',
 'Follow-up #',
 'Patient ID',
 'Patient Age',
 'Patient Gender',
 'View Position',
 'OriginalImage[Width',
 'Height]',
 'OriginalImagePixelSpacing[x',
 'y]',
 'Unnamed: 11',
 'Image Path']

In [5]:
selected_fields = ["Image Index", "Finding Labels", "Image Path", "Embeddings"]

filtered_dataset = nih_dataset.select_columns(selected_fields)

fields = filtered_dataset.get_fields()
fields

['Embeddings', 'Image Index', 'Finding Labels', 'Image Path']

In [6]:
label_field = filtered_dataset.get_label_counts()
label_field

{'Atelectasis': 11559,
 'No Finding': 60361,
 'Consolidation': 4667,
 'Effusion': 13317,
 'Pleural_Thickening': 3385,
 'Infiltration': 19894,
 'Emphysema': 2516,
 'Pneumothorax': 5302,
 'Cardiomegaly': 2776,
 'Fibrosis': 1686,
 'Nodule': 6331,
 'Mass': 5782,
 'Edema': 2303,
 'Pneumonia': 1431,
 'Hernia': 227}

In [7]:
# Hedef etiketleri belirle
include_labels = ["No Finding", "Infiltration", "Effusion", "Atelectasis"]

# Filtreleme yap
filtered_dataset = filtered_dataset.filter_by_labels(include_labels, mode='include')

# Etiket sayılarını kontrol et
label_counts = filtered_dataset.get_label_counts()
print("\nFinal label counts:")
for label in sorted(label_counts.keys()):
    print(f"{label}: {label_counts[label]}")


Filtering for labels: ['No Finding', 'Infiltration', 'Effusion', 'Atelectasis'], mode=include
DataFrame shape before filtering: (112120, 3)

Final label counts:
Atelectasis: 7471
Effusion: 7465
Infiltration: 13240
No Finding: 60361


In [8]:
## limit the number of samples per label
filtered_dataset = filtered_dataset.limit_samples("No Finding", 10000)
filtered_dataset = filtered_dataset.limit_samples("Infiltration", 10000)



In [9]:
label_counts = filtered_dataset.get_label_counts()
print("\nFinal label counts:")
for label in sorted(label_counts.keys()):
    print(f"{label}: {label_counts[label]}")




Final label counts:
Atelectasis: 6988
Effusion: 6918
Infiltration: 10000
No Finding: 10000


In [29]:
from src.pipelines import VaeProcessor
from torch.utils.data import DataLoader
from torchvision import transforms
from PIL import Image
import torch
import numpy as np

# Image transformations
image_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

class VaeDataset():
    def __init__(self, dataset, transform=None):
        self.dataset = dataset
        self.transform = transform
    
    def __getitem__(self, idx):
        row, embed = self.dataset[idx]
        image_path = row["Image Path"]
        labels = row["Finding Labels"]
        
        # Load and transform image
        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        
        # Convert labels to tensor
        if isinstance(labels, (list, np.ndarray)):
            labels = torch.tensor([1 if label in labels else 0 for label in self.dataset.get_fields()])
        else:
            labels = torch.tensor([1 if label in str(labels).split('|') else 0 for label in self.dataset.get_fields()])
        
        # Convert embedding to tensor if it's not already
        if isinstance(embed, np.ndarray):
            embed = torch.from_numpy(embed)
            
        return image, embed, labels
    
    def __len__(self):
        return len(self.dataset)

In [30]:
# Setup
device = "cuda:2"
batch_size = 16

# Create dataset and dataloader
vae_dataset = VaeDataset(filtered_dataset, transform=image_transform)
vae_loader = DataLoader(vae_dataset, batch_size=batch_size, shuffle=False)
vae_processor = VaeProcessor(device=device)

# Setup storage

In [31]:
# Setup storage
import pickle
from pathlib import Path

save_dir = Path("data")
save_dir.mkdir(exist_ok=True)

storage_dictionary = {
    'latents': {},
    'embeddings': {},
    'labels': {},
}

# Process data
current_idx = 0
for images, embeddings, labels in vae_loader:
    # Move to device
    images = images.to(device)
    embeddings = embeddings.to(device)
    
    # Get latents
    latents = vae_processor.prepare_latent(images)
    
    # Store in dictionary
    for latent, embed, label in zip(latents, embeddings, labels):
        storage_dictionary['latents'][current_idx] = latent.cpu().detach().numpy()
        storage_dictionary['embeddings'][current_idx] = embed.cpu().detach().numpy()
        storage_dictionary['labels'][current_idx] = label.cpu().numpy()
        current_idx += 1
    
    # Periodic saving
    if current_idx % 1000 == 0:
        print(f"Processed {current_idx} images")
        with open(save_dir / "nih-cxr14-latent-embed.pkl", "wb") as f:
            pickle.dump(storage_dictionary, f)

# Final save
with open(save_dir / "nih-cxr14-latent-embed.pkl", "wb") as f:
    pickle.dump(storage_dictionary, f)

print("Finished processing all images")
print(f"Total images processed: {current_idx}")

Processed 2000 images
Processed 4000 images
Processed 6000 images
Processed 8000 images
Processed 10000 images
Processed 12000 images
Processed 14000 images
Processed 16000 images
Processed 18000 images
Processed 20000 images
Processed 22000 images
Processed 24000 images
Processed 26000 images
Processed 28000 images
Finished processing all images
Total images processed: 29337
