In [1]:

import os
import sys
import logging

from pathlib import Path
from PIL import Image
import numpy as np

import h5py

import tensorflow as tf
os.environ["CUDA_VISIBLE_DEVICES"] = "0" 
os.environ['HF_HOME'] = '~/.cache/huggingface'


# Suppress TensorFlow logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # 0=DEBUG, 1=INFO, 2=WARNING, 3=ERROR
tf.get_logger().setLevel(logging.ERROR)

# Suppress warnings from other libraries
logging.getLogger('tensorflow').setLevel(logging.ERROR)
logging.getLogger('absl').setLevel(logging.ERROR)

# Set XLA flags to reduce spill warnings
os.environ['XLA_FLAGS'] = '--xla_gpu_enable_fast_min_max'


# Get absolute path to project root
project_root = Path(os.path.abspath('')).parent.parent
sys.path.append(str(project_root))

from dotenv import load_dotenv
load_dotenv()

nih_dataset_root_dir = os.getenv("NIH_CXR14_DATASET_DIR")


2025-02-26 03:15:19.318451: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740528919.331222  332122 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740528919.335104  332122 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-26 03:15:19.347773: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from src.models import  ELIXR
from src.datasets import NIHImageDataset



dataset = NIHImageDataset(nih_dataset_root_dir + "/original")

elixr_model = ELIXR()


output_dir = nih_dataset_root_dir + "/elixr"
os.makedirs(output_dir, exist_ok=True)


image_ids = dataset.get_image_ids()

max_len = max([len(image_id) for image_id in image_ids])

print("Max len", max_len)



  from .autonotebook import tqdm as notebook_tqdm
I0000 00:00:1740528924.062392  332122 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5730 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:01:00.0, compute capability: 8.6


Max len 16


In [None]:
from tqdm import tqdm
import numpy as np

# Initialize empty lists to store data
img_ids = []
general_img_embeddings = []
qform_img_embeddings = []

# Process dataset with tqdm progress bar
total_items = len(dataset)  # Replace with actual length if dataset doesn't support len()

for image, image_id in tqdm(dataset, total=total_items, desc="Processing Images", 
                           unit="img", ncols=100, colour="green"):
    img_ids.append(image_id)
    
    # Get embeddings from model
    output = elixr_model(image)
    
    # Store the embeddings
    general_img_embeddings.append(output['general_img_embedding'])
    qform_img_embeddings.append(output['qformer_embedding'])
    

# Convert lists to numpy arrays for saving to h5py
img_ids = np.array(img_ids, dtype='S')
general_img_embeddings = np.array(general_img_embeddings)
qform_img_embeddings = np.array(qform_img_embeddings)

print(f"Processed {len(img_ids)} images")
print(f"General embeddings shape: {general_img_embeddings.shape}")
print(f"Q-former embeddings shape: {qform_img_embeddings.shape}")



import h5py
from datetime import datetime

# Save general embeddings
general_img_embedding_file = output_dir + '/elixr_general_embeddings.h5'
with h5py.File(general_img_embedding_file, 'w') as f:
    # Create datasets with compression
    f.create_dataset("embeddings", data=general_img_embeddings, dtype=np.float32, 
                    compression="gzip", compression_opts=9)
    f.create_dataset("Image Index", data=img_ids)
    
    # Add metadata
    f.attrs['creation_date'] = str(datetime.now())
    f.attrs['embedding_dim'] = general_img_embeddings.shape[1]
    f.attrs['num_images'] = len(img_ids)
    f.attrs['max_len'] = max_len  # Assuming max_len is defined

print(f"Saved general embeddings to {general_img_embedding_file}")

# Save Q-former embeddings
qform_img_embedding_file = output_dir + '/elixr_qformer_embeddings.h5'
with h5py.File(qform_img_embedding_file, 'w') as f:
    # Create datasets with compression
    f.create_dataset("embeddings", data=qform_img_embeddings, dtype=np.float32, 
                    compression="gzip", compression_opts=9)
    f.create_dataset("Image Index", data=img_ids)
    
    # Add metadata
    f.attrs['creation_date'] = str(datetime.now())
    f.attrs['embedding_dim'] = qform_img_embeddings.shape[1]
    f.attrs['num_images'] = len(img_ids)
    f.attrs['max_len'] = max_len  # Assuming max_len is defined

print(f"Saved Q-former embeddings to {qform_img_embedding_file}")




Processing Images:   0%|[32m                                                [0m| 0/112120 [00:00<?, ?img/s][0mI0000 00:00:1740528937.526997  332199 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1740528939.464399  332199 service.cc:148] XLA service 0x79d07abef620 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1740528939.464411  332199 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 3070, Compute Capability 8.6
2025-02-26 03:15:39.769662: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-02-26 03:15:40.649632: W external/local_xla/xla/service/gpu/nvptx_compiler.cc:930] The NVIDIA driver's CUDA version is 12.4 which is older than the PTX compiler version 12.5.82. Because the driver is older than the PTX compiler version, XLA is disabling parallel compilation, which may slow down compilation. You sh