In [1]:

import os
import sys
import logging

from pathlib import Path
from datetime import datetime
from tqdm.notebook import tqdm

import h5py
import numpy as np
from PIL import Image

import tensorflow as tf

# Make the project importable: the project root is taken to be two levels
# above the current working directory (os.path.abspath('') resolves to the cwd).
project_root = Path(os.path.abspath('')).parent.parent
sys.path.append(str(project_root))

# Load variables from a local .env file (if any) into the process environment
# before they are read below.
from dotenv import load_dotenv
load_dotenv()


# Expose only CUDA device index 2 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "2" 

# Root of the NIH dataset; every other path in this notebook is derived from it.
nih_dataset_root_dir = os.getenv("NIH_CXR14_DATASET_DIR")
if not nih_dataset_root_dir:
    # os.getenv returns None when the variable is missing; without this check
    # the concatenation below fails with an unhelpful TypeError.
    raise RuntimeError(
        "Environment variable NIH_CXR14_DATASET_DIR is not set. "
        "Define it in the environment or in a .env file before running this notebook."
    )
elixr_dataset_root_dir = nih_dataset_root_dir + "/elixr"
elixr_c_14_dataset_root_dir = elixr_dataset_root_dir + "/elixrc"



2025-02-26 23:05:34.009566: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740600334.037736 2554781 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740600334.046542 2554781 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-26 23:05:34.076813: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from src.models import  ELIXR
from src.datasets import NIHImageDataset

## EXTRACT IMAGE EMBEDDINGS FROM NIH-CXR-DATASET

In [3]:
# Output directory for the ELIXR-C embedding chunks.
elixrc_path = elixr_dataset_root_dir + "/elixrc"

# Refuse to run when the directory already holds files; create it when missing.
if not os.path.exists(elixrc_path):
    os.makedirs(elixrc_path, exist_ok=True)
elif len(os.listdir(elixrc_path)) > 0:
    raise ValueError("ELIXR-C14 dataset already exists. Exiting to prevent overwriting. If you want to re-generate the dataset, delete the existing dataset directory and re-run this script.")

# Image source and the model (ELIXR-C enabled, ELIXR-B disabled).
dataset = NIHImageDataset(nih_dataset_root_dir + "/original")
elixr_model = ELIXR(use_elixrb=False, use_elixrc=True)

# Longest image id; stored later as metadata alongside the saved embeddings.
image_ids = dataset.get_image_ids()
max_len = max(map(len, image_ids))

print("Max len", max_len)



I0000 00:00:1740600344.035478 2554781 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22456 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:88:00.0, compute capability: 8.6


Max len 16


In [4]:
def save_embed(image_id, elixrc_embedding, chunk_number, output_dir):
    """Write one chunk of embeddings and their image ids to an HDF5 file.

    The file is created at ``{output_dir}/elixrc_embedding_chunk_{chunk_number}.h5``
    and overwritten if it already exists (mode ``"w"``).

    Args:
        image_id: Sequence of image ids for this chunk; stored as byte strings.
        elixrc_embedding: Non-empty sequence of per-image embeddings. Each item
            must expose ``.shape`` with at least two axes; axis 1 is recorded
            as ``embedding_dim``.
        chunk_number: Index of the chunk, used in the file name and metadata.
        output_dir: Directory in which the chunk file is written.

    Notes:
        Reads the notebook-level ``max_len`` (longest image id) for the
        ``image_id_max_len`` attribute, so that name must be defined first.
    """
    output_path = f"{output_dir}/elixrc_embedding_chunk_{chunk_number}.h5"
    with h5py.File(output_path, "w") as f:
        f.create_dataset("elixrc_embedding", 
                         data=np.array(elixrc_embedding), 
                         dtype="float32", 
                         compression="gzip", 
                         compression_opts=9)
        
        f.create_dataset("image_ids", 
                         data=np.array(image_id, dtype="S"))
        
        f.attrs["chunk_number"] = chunk_number
        f.attrs["len"] = len(elixrc_embedding)
        # `datetime` here is the class (imported via `from datetime import datetime`),
        # so the constructor is `datetime.now()`, not `datetime.datetime.now()`.
        f.attrs["creation_date"] = str(datetime.now())
        f.attrs["embedding_dim"] = elixrc_embedding[0].shape[1]
        f.attrs["image_id_max_len"] = max_len

    print(f"Saved chunk {chunk_number} to {output_path}")

In [5]:
def resize(image, min_size= 1024):
    """Rescale ``image`` so its shorter side equals ``min_size``.

    The longer side is scaled by the same factor and truncated to an integer,
    so the aspect ratio is preserved up to rounding. Square images take the
    height-based branch. Resampling uses ``Image.LANCZOS``.

    Args:
        image: Object exposing ``.size`` as ``(width, height)`` and
            ``.resize(size, resample)``.
        min_size: Target length of the shorter side, in pixels.

    Returns:
        Whatever ``image.resize`` returns for the computed ``(width, height)``.
    """
    w, h = image.size
    if w < h:
        # Portrait: pin the width, scale the height.
        target = (min_size, int(h * (min_size / w)))
    else:
        # Landscape or square: pin the height, scale the width.
        target = (int(w * (min_size / h)), min_size)
    return image.resize(target, Image.LANCZOS)

In [None]:
from tqdm.notebook import tqdm
import numpy as np
import queue
import threading

# Bounded hand-off queue between the background loader thread and the main loop.
# `maxsize` caps how many items may wait in the queue; `put` blocks once it is full.
buffer_size = 1000  # Adjust as needed
image_buffer = queue.Queue(maxsize=buffer_size)
# NOTE(review): buffer_lock is not referenced in the visible cells; queue.Queue
# already synchronizes put/get internally.
buffer_lock = threading.Lock()

# Function to fill the buffer in background
def fill_buffer(dataset, buffer):
    """Resize every image in ``dataset`` and push it onto ``buffer``.

    Each queued item is a ``(resized_image, image_id)`` tuple. A final ``None``
    is always queued as an end-of-stream sentinel, even when iteration or
    resizing raises, so a consumer blocked on ``buffer.get()`` wakes up instead
    of hanging forever. The exception itself still propagates to the caller.

    Args:
        dataset: Iterable yielding ``(image, image_id)`` pairs.
        buffer: Queue-like object with a blocking ``put`` method.
    """
    try:
        for image, image_id in dataset:
            # Preprocess image
            resized_img = resize(image, min_size=1024)
            # Add to buffer (will block if buffer is full)
            buffer.put((resized_img, image_id))
    finally:
        # Add sentinel to mark the end (also on failure, to unblock the consumer)
        buffer.put(None)

# Launch the producer: a daemon thread that resizes images and feeds image_buffer.
buffer_thread = threading.Thread(
    target=fill_buffer,
    args=(dataset, image_buffer),
    daemon=True,
)
buffer_thread.start()

# Accumulators for the chunk currently being built, plus chunking parameters.
elixrc_embedding, image_ids = [], []
chunk_size = 10000
chunk_count = 0

with tqdm(total=len(dataset)) as pbar:
    # iter(callable, sentinel) keeps calling image_buffer.get() (which waits
    # while the buffer is empty) and stops once it returns the None sentinel.
    for resized_img, image_id in iter(image_buffer.get, None):
        # Run the model on the preprocessed image and record the result.
        output = elixr_model(image=resized_img)
        elixrc_embedding.append(output)
        image_ids.append(image_id)

        pbar.update(1)

        # Keep accumulating until a full chunk is available.
        if len(elixrc_embedding) < chunk_size:
            continue

        # Flush the full chunk to disk and start a new one.
        save_embed(image_ids, elixrc_embedding, chunk_count, elixrc_path)
        elixrc_embedding, image_ids = [], []
        chunk_count += 1

# Flush whatever is left over after the last full chunk.
if elixrc_embedding:
    save_embed(image_ids, elixrc_embedding, chunk_count, elixrc_path)

  0%|          | 0/112120 [00:00<?, ?it/s]

I0000 00:00:1740600367.456283 2555143 cuda_dnn.cc:529] Loaded cuDNN version 90300


KeyboardInterrupt: 

In [None]:
from tqdm import tqdm
import numpy as np

# Destination directory for the two embedding files. It is resolved (and
# created) BEFORE the long extraction loop so that a bad path fails immediately
# rather than after every image has been processed.
# NOTE(review): `output_dir` was never defined in this notebook; the ELIXR
# dataset root is used here as a placeholder - confirm the intended location.
output_dir = elixr_dataset_root_dir
os.makedirs(output_dir, exist_ok=True)


def _save_embeddings_h5(path, embeddings, ids):
    """Write an embedding array and its image ids to a gzip-compressed HDF5 file."""
    with h5py.File(path, 'w') as f:
        # Create datasets with compression
        f.create_dataset("embeddings", data=embeddings, dtype=np.float32,
                         compression="gzip", compression_opts=9)
        f.create_dataset("Image Index", data=ids)

        # Add metadata
        f.attrs['creation_date'] = str(datetime.now())
        f.attrs['embedding_dim'] = embeddings.shape[1]
        f.attrs['num_images'] = len(ids)
        f.attrs['max_len'] = max_len  # notebook-level value computed from the image ids


# Initialize empty lists to store data
img_ids = []
general_img_embeddings = []
qform_img_embeddings = []

# Process dataset with tqdm progress bar
total_items = len(dataset)  # Replace with actual length if dataset doesn't support len()

for image, image_id in tqdm(dataset, total=total_items, desc="Processing Images", 
                           unit="img", ncols=100, colour="green"):
    img_ids.append(image_id)
    
    # Get embeddings from model
    # NOTE(review): assumes the model returns a mapping with both keys below - confirm.
    output = elixr_model(image)
    
    # Store the embeddings
    general_img_embeddings.append(output['general_img_embedding'])
    qform_img_embeddings.append(output['qformer_embedding'])
    

# Convert lists to numpy arrays for saving to h5py
img_ids = np.array(img_ids, dtype='S')
general_img_embeddings = np.array(general_img_embeddings)
qform_img_embeddings = np.array(qform_img_embeddings)

print(f"Processed {len(img_ids)} images")
print(f"General embeddings shape: {general_img_embeddings.shape}")
print(f"Q-former embeddings shape: {qform_img_embeddings.shape}")

# Save general embeddings
general_img_embedding_file = output_dir + '/elixr_general_embeddings.h5'
_save_embeddings_h5(general_img_embedding_file, general_img_embeddings, img_ids)
print(f"Saved general embeddings to {general_img_embedding_file}")

# Save Q-former embeddings
qform_img_embedding_file = output_dir + '/elixr_qformer_embeddings.h5'
_save_embeddings_h5(qform_img_embedding_file, qform_img_embeddings, img_ids)
print(f"Saved Q-former embeddings to {qform_img_embedding_file}")


