<a href="https://www.kaggle.com/code/sumaniitm/complete-train-metadata-using-duckdb-and-polars?scriptVersionId=264278996" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Installing required dependencies

In [None]:
!pip install duckdb --no-index --find-links=/kaggle/input/polars-and-duckdb/kaggle/working/mysitepackages/duck_pkg
!pip install python-gdcm
!pip install pylibjpeg
!pip install pylibjpeg-libjpeg==2.2.0
!pip install pylibjpeg-openjpeg==2.3.0
!pip install matplotlib==3.10.3
!pip install scikit-learn==1.7.0
!pip install polars --no-index --find-links=/kaggle/input/polars-and-duckdb/kaggle/working/mysitepackages/polars_pkg
!pip install pydicom

## Importing required libraries

In [1]:
from pydicom import dcmread
from pydicom.dataset import FileDataset, FileMetaDataset
from pydicom.uid import generate_uid, ImplicitVRLittleEndian

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import polars as pl
import duckdb as dd
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2
import pickle
import gc
import ctypes
from pathlib import Path
import logging
import json
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor
import datetime
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
import tensorflow as tf
import tensorflow_io as tfio
from tensorflow import keras

2025-09-27 06:34:33.960074: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758954874.259030      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758954874.338107      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
print(tf.__version__)
print(tfio.__version__)

2.18.0
0.37.1


## Initializing the TPU

In [None]:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='local')
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.TPUStrategy(tpu)

print("Number of accelerators: ", tpu_strategy.num_replicas_in_sync)

## Setting polars configs to view the dataframes better

In [3]:
pl.Config(fmt_str_lengths=1000)
pl.Config.set_tbl_rows(1000)

polars.config.Config

# Load the metadata of the training images
## Also separate out the localizer coordinates into individual columns

In [None]:
train_meta_data = pl.read_csv('/kaggle/input/rsna-intracranial-aneurysm-detection/train.csv'\
                              , low_memory=True)

train_locale_meta_data = pl.read_csv('/kaggle/input/rsna-intracranial-aneurysm-detection/train_localizers.csv'\
                              , low_memory=True)

def parse_coordinates(coord_str):
    """
    Parse a localizer coordinate string into an ``(x, y, f)`` tuple.

    The string is expected to look like a dict literal, e.g.
    ``"{'x': 256.3, 'y': 146.1}"`` with an optional ``'f'`` key. Single quotes
    are swapped for double quotes so the text can be decoded as JSON.

    Args:
        coord_str: The raw coordinate string, or None.

    Returns:
        ``(float x, float y, int f)``; keys that are absent default to 0.
        ``(None, None, None)`` when the input is None or cannot be parsed.
        The result always has three items because callers index positions
        0, 1 and 2.
    """
    unparsed = (None, None, None)
    if coord_str is None:
        return unparsed
    try:
        coord_dict = json.loads(coord_str.replace("'", '"'))
        return (
            float(coord_dict.get('x', 0.0)),
            float(coord_dict.get('y', 0.0)),
            int(coord_dict.get('f', 0.0)),
        )
    except (json.JSONDecodeError, KeyError, ValueError, AttributeError, TypeError):
        # TypeError covers values such as JSON null that float()/int() reject
        return unparsed

def _coordinate_component(position, dtype, column_name):
    """Expression that parses `coordinates` and keeps one component of the result."""
    return (
        pl.col("coordinates")
        .map_elements(lambda raw: parse_coordinates(raw)[position], return_dtype=dtype)
        .cast(dtype)
        .alias(column_name)
    )


# Split the raw coordinate string into separate x / y / frame columns
train_locale_meta_data = train_locale_meta_data.with_columns([
    _coordinate_component(0, pl.Float64, "coordinates_x"),
    _coordinate_component(1, pl.Float64, "coordinates_y"),
    _coordinate_component(2, pl.Int32, "coordinates_f"),
])

print("Train CSV shape : ", train_meta_data.shape)
print("Train Localizers CSV shape : ", train_locale_meta_data.shape)

# Preview a few rows whose frame component is non-zero
rows_with_frame = (
    train_locale_meta_data
    .filter(pl.col('coordinates_f') != 0.0)
    .select(["coordinates", "coordinates_x", "coordinates_y", "coordinates_f"])
)
print(rows_with_frame.head(5))

## Glancing at the datasets so far

In [None]:
train_meta_data.head(10)

In [None]:
train_locale_meta_data.head(10)

### Get summary statistics of the new columns

In [None]:
print(train_locale_meta_data.select(["coordinates_x", "coordinates_y", "coordinates_f"]).describe())

# Get the image metadata from each training series and create a dataframe out of them

In [None]:
allowed_tags = ['BitsAllocated', 'BitsStored', 'Rows', 'Columns', 'FrameOfReferenceUID', 'HighBit', 'ImageOrientationPatient'
                , 'ImagePositionPatient', 'InstanceNumber', 'Modality', 'PhotometricInterpretation'
                , 'PixelRepresentation', 'PixelSpacing', 'PlanarConfiguration', 'RescaleIntercept', 'RescaleSlope'
                , 'RescaleType', 'SamplesPerPixel', 'SliceThickness', 'SpacingBetweenSlices']

## Functions to collect metadata without the image arrays
### We use Python `__slots__` to reduce the memory footprint
### Also use multiprocessing (a process pool) to speed up processing

In [None]:
class DicomRecord:
    """
    Lightweight container for the metadata of one DICOM file.

    ``__slots__`` fixes the set of attributes up front, so instances do not
    carry a per-object ``__dict__``. The first four slots identify the file;
    the remaining slots hold DICOM tag values and start out as None.
    """
    __slots__ = [
        # identifying fields
        'folder_name', 'file_name', 'file_path', 'image_shape',
        # DICOM tags
        'BitsAllocated', 'BitsStored', 'Rows', 'Columns', 'FrameOfReferenceUID',
        'HighBit', 'ImageOrientationPatient', 'ImagePositionPatient', 'InstanceNumber',
        'Modality', 'PhotometricInterpretation', 'PixelRepresentation', 'PixelSpacing',
        'PlanarConfiguration', 'RescaleIntercept', 'RescaleSlope', 'RescaleType',
        'SamplesPerPixel', 'SliceThickness', 'SpacingBetweenSlices',
    ]

    def __init__(self, folder_name, file_name, file_path, image_shape):
        # Every tag slot starts empty; the caller fills in what the file provides
        for tag_name in self.__slots__[4:]:
            setattr(self, tag_name, None)
        self.folder_name, self.file_name = folder_name, file_name
        self.file_path, self.image_shape = file_path, image_shape

    def to_dict(self):
        """Return the record as a plain dict keyed by slot name, in slot order."""
        as_dict = {}
        for slot_name in self.__slots__:
            as_dict[slot_name] = getattr(self, slot_name)
        return as_dict

In [None]:
def process_single_folder(folder_path, allowed_tags):
    """
    Collect metadata for every ``*.dcm`` file directly inside one folder.

    Args:
        folder_path: Folder to scan; its name is stored as ``folder_name``.
        allowed_tags: DICOM attribute names to copy onto each record.

    Returns:
        A list with one dict (``DicomRecord.to_dict()``) per file that could
        be read. A file that fails is reported with ``print`` and skipped; a
        failure at folder level is reported and yields an empty list.
    """
    try:
        folder = Path(folder_path)
        dicom_paths = list(folder.glob("*.dcm"))
        series_name = folder.name
        records = []

        for dicom_path in dicom_paths:
            try:
                dataset = dcmread(str(dicom_path))
                # The pixel array is accessed only for its shape; it is not stored
                shape_text = str(dataset.pixel_array.shape)

                record = DicomRecord(series_name, dicom_path.name, str(dicom_path), shape_text)

                for tag in allowed_tags:
                    try:
                        raw_value = getattr(dataset, tag)
                        is_multi_valued = hasattr(raw_value, '__iter__') and not isinstance(raw_value, str)
                        # Multi-valued tags become the string form of a list of strings
                        text = str(list(map(str, raw_value))) if is_multi_valued else str(raw_value)
                        setattr(record, tag, text)
                    except (AttributeError, TypeError):
                        # Tag missing or not convertible: the slot stays None
                        continue

                records.append(record.to_dict())

            except Exception as e:
                print(f"Error processing file {dicom_path}: {e}")
                continue

        return records

    except Exception as e:
        print(f"Error processing folder {folder_path}: {e}")
        return []

In [None]:
def create_dicom_dataset(root_folder, allowed_tags, num_processes=None, chunk_size=100):
    """
    Build one polars DataFrame with the metadata of every DICOM file found in
    the sub-folders of ``root_folder``.

    Each immediate sub-folder is handed to ``process_single_folder`` in a
    worker process. Results are gathered ``chunk_size`` folders at a time and
    each batch is written to a parquet file in a private temporary directory,
    so only one batch of Python records is held at a time. The parquet files
    are concatenated at the end; the temporary directory is removed even when
    an error occurs.

    Args:
        root_folder: Directory whose sub-folders contain ``*.dcm`` files.
        allowed_tags: DICOM attribute names to collect (one Utf8 column each).
        num_processes: Number of worker processes; defaults to ``mp.cpu_count()``.
        chunk_size: Number of folders per batch / parquet file.

    Returns:
        pl.DataFrame with the columns folder_name, file_name, file_path,
        image_shape and one column per allowed tag. When no record was
        collected the frame is empty but still has that schema.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    import tempfile  # local import: only this function needs it

    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    root_path = Path(root_folder)
    # Sorted so that the row order of the result is reproducible between runs
    folders = sorted(f for f in root_path.iterdir() if f.is_dir())

    if not num_processes:
        num_processes = mp.cpu_count()

    # Every column is stored as text
    schema = {
        'folder_name': pl.Utf8,
        'file_name': pl.Utf8,
        'file_path': pl.String,
        'image_shape': pl.String
    }
    schema.update({tag: pl.Utf8 for tag in allowed_tags})

    total_chunks = (len(folders) - 1) // chunk_size + 1

    # A fresh directory per call: chunk files left behind by an earlier or
    # failed run can no longer leak into this result.
    with tempfile.TemporaryDirectory(prefix="temp_chunks_", dir=".") as tmp_name:
        temp_dir = Path(tmp_name)
        chunk_files = []  # kept in write order, which is also the concat order

        with ProcessPoolExecutor(max_workers=num_processes) as executor:
            for i in range(0, len(folders), chunk_size):
                chunk_folders = folders[i:i+chunk_size]
                chunk_data = []

                futures = [
                    executor.submit(process_single_folder, str(folder), allowed_tags)
                    for folder in chunk_folders
                ]

                for future in tqdm(futures,
                                 desc=f"Processing chunk {i//chunk_size + 1}/{total_chunks}"):
                    chunk_data.extend(future.result())

                if chunk_data:
                    chunk_df = pl.DataFrame(
                        chunk_data,
                        schema=schema,
                        infer_schema_length=None
                    )
                    chunk_file = temp_dir / f"dicom_metadata_chunk_{i//chunk_size}.parquet"
                    chunk_df.write_parquet(chunk_file, compression="snappy")
                    chunk_files.append(chunk_file)
                    del chunk_df

                # Drop the batch before the next one is gathered
                del chunk_data

        if not chunk_files:
            # Nothing was collected; pl.concat would fail on an empty list
            return pl.DataFrame(schema=schema)

        print("\nCombining chunks...")
        # collect() materialises the data, so the files can be deleted afterwards
        final_df = pl.concat([
            pl.scan_parquet(str(chunk_file))
            for chunk_file in chunk_files
        ]).collect()

    return final_df

## Starting the collection of metadata

In [None]:
# Metadata extraction is file I/O in worker processes and uses no TensorFlow
# operations, so it is not wrapped in the TPU strategy scope.
root_folder = "/kaggle/input/rsna-intracranial-aneurysm-detection/series"

try:
    metadata_df = create_dicom_dataset(
        root_folder,
        allowed_tags,
        num_processes=mp.cpu_count(),
        chunk_size=192
    )
except Exception as e:
    # Report the failure, then re-raise: the following cells need metadata_df,
    # and swallowing the error would only surface later as a NameError.
    print(f"Error: {e}")
    raise

In [None]:
metadata_df.columns

# Create the full training data
* Bring in the localizer coordinates for the series where aneurysm is present
* Create another column to signify whether aneurysm is shown in a specific image within a series
* Within a series where an aneurysm is present, some images may not show the aneurysm at all
* All rows are kept at image-file granularity: a file that has coordinates is treated as showing an aneurysm, any other file as not showing one

In [None]:
# Attach the localizer coordinates to every DICOM file row.
# The LEFT JOIN keeps all rows of metadata_df; files without a matching
# localizer row get NULL coordinates. A match requires
#   folder_name == SeriesInstanceUID  and  file_name without '.dcm' == SOPInstanceUID.
# NOTE(review): a file matched by several localizer rows would appear once per
# match - confirm whether such duplicate file rows are intended downstream.
df_all_coordinates = dd.sql( \
    "select t2.coordinates_x, t2.coordinates_y, t2.coordinates_f, t1.* \
    from metadata_df t1 \
    left join train_locale_meta_data t2 \
    on t1.folder_name = t2.SeriesInstanceUID \
    and replace(t1.file_name, '.dcm','') = t2.SOPInstanceUID "\
).pl()

print(df_all_coordinates.shape)
print(df_all_coordinates.columns)

In [None]:
new_columns = [col.lower().replace(" ", "_") for col in train_meta_data.columns]
train_meta_data.columns = new_columns
print(train_meta_data.columns)

In [None]:
# Series-level location flags from train.csv, in the order they appear in the
# output table.
location_columns = [
    'left_infraclinoid_internal_carotid_artery',
    'right_infraclinoid_internal_carotid_artery',
    'left_supraclinoid_internal_carotid_artery',
    'right_supraclinoid_internal_carotid_artery',
    'left_middle_cerebral_artery',
    'right_middle_cerebral_artery',
    'anterior_communicating_artery',
    'left_anterior_cerebral_artery',
    'right_anterior_cerebral_artery',
    'left_posterior_communicating_artery',
    'right_posterior_communicating_artery',
    'basilar_tip',
    'other_posterior_circulation',
]

# One CASE expression per location: the series-level flag is kept for images
# that have a localizer coordinate and forced to 0 for all other images.
# NOTE(review): an annotated image therefore inherits every location flag of
# its series, not only the location of its own annotation - confirm that this
# is the intended labelling.
location_select = "\n    , ".join(
    f"case when t2.coordinates_x is not null then t1.{col} else 0 end as {col}"
    for col in location_columns
)

# Inner join: one output row per row of df_all_coordinates whose folder name
# matches a series in train_meta_data.
df_all_data = dd.sql(f"""
    select t2.file_name, t2.image_shape, t2.coordinates_x, t2.coordinates_y, t2.coordinates_f
    , t1.aneurysm_present as aneurysm_present_in_series
    , case when t2.coordinates_x is not null then 1 else 0 end as aneurysm_present_in_image
    , t1.seriesinstanceuid, t1.patientage, t1.patientsex, t1.modality
    , {location_select}
    from train_meta_data t1
    join df_all_coordinates t2
    on t1.SeriesInstanceUID = t2.folder_name
""").pl()

print("Full training data: ", df_all_data.shape)
print("Full training data columns: ", df_all_data.columns)

# Rows are individual images, so series are counted as distinct
# (seriesinstanceuid, series flag) pairs rather than as rows.
series_flags = df_all_data.select(["seriesinstanceuid", "aneurysm_present_in_series"]).unique()

print("Aneurysm not present in {0} series".format(
    series_flags.filter(pl.col("aneurysm_present_in_series") == 0).shape[0]))

print("Aneurysm present in {0} series".format(
    series_flags.filter(pl.col("aneurysm_present_in_series") == 1).shape[0]))

print("Aneurysm not shown in {0} images".format(df_all_data.filter(pl.col("aneurysm_present_in_image")==0).shape[0]))

print("Aneurysm shown in {0} images".format(df_all_data.filter(pl.col("aneurysm_present_in_image")==1).shape[0]))

print(df_all_data.select(["coordinates_x", "coordinates_y"]).describe())

In [None]:
df_all_data.write_parquet('full_training_data.parquet')

# Looking at a specific CTA image that shows aneurysm
## Taking a multi-frame image by converting it to single-frame image

In [30]:
df_all_data = pl.read_parquet('/kaggle/input/rsna-aneurysm-train-metadata-suman/full_training_data.parquet')
print("Shape of training metadata", df_all_data.shape)
df_all_data.columns

Shape of training metadata (1001346, 24)


['file_name',
 'image_shape',
 'coordinates_x',
 'coordinates_y',
 'coordinates_f',
 'aneurysm_present_in_series',
 'aneurysm_present_in_image',
 'seriesinstanceuid',
 'patientage',
 'patientsex',
 'modality',
 'left_infraclinoid_internal_carotid_artery',
 'right_infraclinoid_internal_carotid_artery',
 'left_supraclinoid_internal_carotid_artery',
 'right_supraclinoid_internal_carotid_artery',
 'left_middle_cerebral_artery',
 'right_middle_cerebral_artery',
 'anterior_communicating_artery',
 'left_anterior_cerebral_artery',
 'right_anterior_cerebral_artery',
 'left_posterior_communicating_artery',
 'right_posterior_communicating_artery',
 'basilar_tip',
 'other_posterior_circulation']

In [31]:
#with tf.device('/device:TPU:0'):
root_folder = "/kaggle/input/rsna-intracranial-aneurysm-detection/series"

def create_full_image_path(row_data):
    """Join a row's root folder, series UID and file name into one '/'-separated path."""
    root = row_data['root_folder']
    series_uid = row_data['seriesinstanceuid']
    file_name = row_data['file_name']
    return root + '/' + series_uid + '/' + file_name

# The root folder is kept as its own column, as before.
df_all_data = df_all_data.with_columns(pl.lit(root_folder).alias("root_folder"))

# Build "<root_folder>/<seriesinstanceuid>/<file_name>" with a native polars
# expression; a per-row Python callback is needlessly slow on a table of
# about one million rows.
df_all_data = df_all_data.with_columns(
    pl.concat_str(
        [pl.col("root_folder"), pl.col("seriesinstanceuid"), pl.col("file_name")],
        separator="/",
    ).alias("full_image_path")
)

In [9]:
# Image counts per (modality, aneurysm_present_in_image) pair.
# t1 holds the count of each pair, t2 the total number of rows; the
# "on 1=1" join attaches that single total to every pair.
# modality_pct is the pair's share of all rows as a fraction rounded to
# 3 decimals (0.725 means 72.5 %), not a percentage.
dd.sql(" \
select t1.modality, t1.aneurysm_present_in_image, t1.per_mod_count, \
round(t1.per_mod_count/t2.total_count ,3) as modality_pct \
from \
( \
select modality, aneurysm_present_in_image, cast(count(1) as float) as per_mod_count from df_all_data \
group by modality, aneurysm_present_in_image \
)t1 \
join \
(select cast(count(1) as float) as total_count from df_all_data)t2 \
on 1=1 \
order by 1 \
").pl()

modality,aneurysm_present_in_image,per_mod_count,modality_pct
str,i32,f32,f32
"""CTA""",0,725991.0,0.725
"""CTA""",1,1221.0,0.001
"""MRA""",1,657.0,0.001
"""MRA""",0,195928.0,0.196
"""MRI T1post""",1,92.0,0.0
"""MRI T1post""",0,47105.0,0.047
"""MRI T2""",1,279.0,0.0
"""MRI T2""",0,30073.0,0.03


In [32]:
# Collapse the 13 location flags into a single text label, aneurysm_position,
# placed in front of all existing columns.
# The CASE expression returns the first branch that matches, so a row with
# several flags set to 1 is labelled with whichever location is listed first
# below; rows with no flag set become 'no_aneurysm'.
# NOTE(review): rows with more than one flag set do occur (the first row of
# the head() output has two) - confirm that keeping only one label is intended.
df_all_data = dd.sql(" select \
case when left_infraclinoid_internal_carotid_artery = 1 then \'left_infraclinoid_internal_carotid_artery\' \
when right_infraclinoid_internal_carotid_artery = 1 then \'right_infraclinoid_internal_carotid_artery\' \
when left_supraclinoid_internal_carotid_artery = 1 then \'left_supraclinoid_internal_carotid_artery\' \
when right_supraclinoid_internal_carotid_artery = 1 then \'right_supraclinoid_internal_carotid_artery\' \
when left_middle_cerebral_artery = 1 then \'left_middle_cerebral_artery\' \
when right_middle_cerebral_artery = 1 then \'right_middle_cerebral_artery\' \
when anterior_communicating_artery = 1 then \'anterior_communicating_artery\' \
when left_anterior_cerebral_artery = 1 then \'left_anterior_cerebral_artery\' \
when right_anterior_cerebral_artery = 1 then \'right_anterior_cerebral_artery\' \
when left_posterior_communicating_artery = 1 then \'left_posterior_communicating_artery\' \
when right_posterior_communicating_artery = 1 then \'right_posterior_communicating_artery\' \
when basilar_tip = 1 then \'basilar_tip\' \
when other_posterior_circulation = 1 then \'other_posterior_circulation\' \
else \'no_aneurysm\' end as aneurysm_position \
, * \
from df_all_data" \
).pl()

In [33]:
df_all_data.head(5)

aneurysm_position,file_name,image_shape,coordinates_x,coordinates_y,coordinates_f,aneurysm_present_in_series,aneurysm_present_in_image,seriesinstanceuid,patientage,patientsex,modality,left_infraclinoid_internal_carotid_artery,right_infraclinoid_internal_carotid_artery,left_supraclinoid_internal_carotid_artery,right_supraclinoid_internal_carotid_artery,left_middle_cerebral_artery,right_middle_cerebral_artery,anterior_communicating_artery,left_anterior_cerebral_artery,right_anterior_cerebral_artery,left_posterior_communicating_artery,right_posterior_communicating_artery,basilar_tip,other_posterior_circulation,root_folder,full_image_path
str,str,str,f64,f64,i32,i64,i32,str,i64,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str
"""right_middle_cerebral_artery""","""1.2.826.0.1.3680043.8.498.50109132199445951854133683565774892169.dcm""","""(512, 512)""",256.300637,146.099363,0.0,1,1,"""1.2.826.0.1.3680043.8.498.10602156717395509282545203380100998253""",40,"""Female""","""CTA""",0,0,0,0,0,1,0,1,0,0,0,0,0,"""/kaggle/input/rsna-intracranial-aneurysm-detection/series""","""/kaggle/input/rsna-intracranial-aneurysm-detection/series/1.2.826.0.1.3680043.8.498.10602156717395509282545203380100998253/1.2.826.0.1.3680043.8.498.50109132199445951854133683565774892169.dcm"""
"""anterior_communicating_artery""","""1.2.826.0.1.3680043.8.498.46546299230498892201063080673910592618.dcm""","""(512, 512)""",258.326172,197.485714,0.0,1,1,"""1.2.826.0.1.3680043.8.498.10603321067992496978932502160661673268""",35,"""Male""","""CTA""",0,0,0,0,0,0,1,0,0,0,0,0,0,"""/kaggle/input/rsna-intracranial-aneurysm-detection/series""","""/kaggle/input/rsna-intracranial-aneurysm-detection/series/1.2.826.0.1.3680043.8.498.10603321067992496978932502160661673268/1.2.826.0.1.3680043.8.498.46546299230498892201063080673910592618.dcm"""
"""other_posterior_circulation""","""1.2.826.0.1.3680043.8.498.22013535645164965059058737013866577944.dcm""","""(320, 260)""",145.686159,157.433479,0.0,1,1,"""1.2.826.0.1.3680043.8.498.10607580708371334840797048741181101985""",68,"""Male""","""MRI T2""",0,0,0,0,0,0,0,0,0,0,0,0,1,"""/kaggle/input/rsna-intracranial-aneurysm-detection/series""","""/kaggle/input/rsna-intracranial-aneurysm-detection/series/1.2.826.0.1.3680043.8.498.10607580708371334840797048741181101985/1.2.826.0.1.3680043.8.498.22013535645164965059058737013866577944.dcm"""
"""no_aneurysm""","""1.2.826.0.1.3680043.8.498.10665853855746359070265541252480337151.dcm""","""(150, 480, 480)""",,,,0,0,"""1.2.826.0.1.3680043.8.498.10618752182981309163840057940806925305""",52,"""Female""","""MRA""",0,0,0,0,0,0,0,0,0,0,0,0,0,"""/kaggle/input/rsna-intracranial-aneurysm-detection/series""","""/kaggle/input/rsna-intracranial-aneurysm-detection/series/1.2.826.0.1.3680043.8.498.10618752182981309163840057940806925305/1.2.826.0.1.3680043.8.498.10665853855746359070265541252480337151.dcm"""
"""no_aneurysm""","""1.2.826.0.1.3680043.8.498.73808789234996291374936109664786479598.dcm""","""(768, 696)""",,,,0,0,"""1.2.826.0.1.3680043.8.498.10623075849681650687943932638488393349""",60,"""Female""","""MRA""",0,0,0,0,0,0,0,0,0,0,0,0,0,"""/kaggle/input/rsna-intracranial-aneurysm-detection/series""","""/kaggle/input/rsna-intracranial-aneurysm-detection/series/1.2.826.0.1.3680043.8.498.10623075849681650687943932638488393349/1.2.826.0.1.3680043.8.498.73808789234996291374936109664786479598.dcm"""


In [34]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df_all_data = df_all_data.with_columns(
        pl.Series(
            "aneurysm_position_encoded",
            le.fit_transform(df_all_data["aneurysm_position"].to_numpy())
        )
    )

# Getting the biases

In [35]:
# Number of image rows per aneurysm position
df_agg = (
    df_all_data
    .group_by(["aneurysm_position_encoded", "aneurysm_position"])
    .agg(pl.col("file_name").count())
)
list_of_positions = df_agg.get_column("aneurysm_position").to_list()
position_counts = df_agg.get_column("file_name").to_list()

# Map each position label to its row count, echoing both along the way
bias_arr_prep_dict = {}
for pos, n_images in zip(list_of_positions, position_counts):
    print(pos)
    print(n_images)
    bias_arr_prep_dict[pos] = n_images

print(bias_arr_prep_dict)

left_infraclinoid_internal_carotid_artery
117
no_aneurysm
999097
other_posterior_circulation
90
right_supraclinoid_internal_carotid_artery
272
left_posterior_communicating_artery
88
right_anterior_cerebral_artery
44
anterior_communicating_artery
359
left_supraclinoid_internal_carotid_artery
419
left_anterior_cerebral_artery
37
left_middle_cerebral_artery
275
right_infraclinoid_internal_carotid_artery
126
basilar_tip
82
right_middle_cerebral_artery
268
right_posterior_communicating_artery
72
{'left_infraclinoid_internal_carotid_artery': 117, 'no_aneurysm': 999097, 'other_posterior_circulation': 90, 'right_supraclinoid_internal_carotid_artery': 272, 'left_posterior_communicating_artery': 88, 'right_anterior_cerebral_artery': 44, 'anterior_communicating_artery': 359, 'left_supraclinoid_internal_carotid_artery': 419, 'left_anterior_cerebral_artery': 37, 'left_middle_cerebral_artery': 275, 'right_infraclinoid_internal_carotid_artery': 126, 'basilar_tip': 82, 'right_middle_cerebral_artery': 

In [36]:
target_variables = ['left_infraclinoid_internal_carotid_artery',  'right_infraclinoid_internal_carotid_artery'
                    ,  'left_supraclinoid_internal_carotid_artery',  'right_supraclinoid_internal_carotid_artery'
                    ,  'left_middle_cerebral_artery',  'right_middle_cerebral_artery'
                    ,  'anterior_communicating_artery',  'left_anterior_cerebral_artery'
                    ,  'right_anterior_cerebral_artery',  'left_posterior_communicating_artery'
                    ,  'right_posterior_communicating_artery'
                    ,  'basilar_tip',  'other_posterior_circulation'
                    ,  'no_aneurysm'
                   ]

In [37]:
total_pos = 0
for pos in target_variables:
    total_pos = total_pos + bias_arr_prep_dict[pos]

print(total_pos)

1001346


In [38]:
# For every class: log(rows with the label / rows without it), rounded to 5
# decimals, kept both as a dict and as a list in target_variables order.
# NOTE(review): bias_list is ordered like target_variables, whereas
# aneurysm_position_encoded comes from LabelEncoder - if these biases are later
# indexed by the encoded label, verify that the two orders agree.
bias_dict = {}
for pos in target_variables:
    n_with_label = bias_arr_prep_dict[pos]
    local_pos = total_pos - n_with_label
    bias_dict[pos] = round(np.log(n_with_label/local_pos),5)

bias_list = [bias_dict[name] for name in target_variables]

print(bias_dict)
print(bias_list)

{'left_infraclinoid_internal_carotid_artery': -9.05456, 'right_infraclinoid_internal_carotid_artery': -8.98045, 'left_supraclinoid_internal_carotid_artery': -7.77857, 'right_supraclinoid_internal_carotid_artery': -8.21078, 'left_middle_cerebral_artery': -8.19981, 'right_middle_cerebral_artery': -8.2256, 'anterior_communicating_artery': -7.93317, 'left_anterior_cerebral_artery': -10.2059, 'right_anterior_cerebral_artery': -10.03262, 'left_posterior_communicating_artery': -9.33943, 'right_posterior_communicating_artery': -9.54012, 'basilar_tip': -9.41005, 'other_posterior_circulation': -9.31696, 'no_aneurysm': 6.09637}
[-9.05456, -8.98045, -7.77857, -8.21078, -8.19981, -8.2256, -7.93317, -10.2059, -10.03262, -9.33943, -9.54012, -9.41005, -9.31696, 6.09637]


In [39]:
df_all_data.shape

(1001346, 28)

In [40]:
df_all_data.write_parquet('full_training_data.parquet')

In [None]:
def extract_single_frame(multiframe_path, slice_number, output_path=None):
    """
    Extract a single frame from a multi-frame DICOM

    A new dataset is created that carries the attributes named in the
    notebook-level ``allowed_tags`` list, a freshly generated SOPInstanceUID,
    the pixel data of the requested frame and, when the source provides
    per-frame functional groups, that frame's position and orientation.

    Args:
        multiframe_path: Path to multi-frame DICOM file
        slice_number: The slice number to extract (0-based index)
        output_path: Path to save the single-frame DICOM. If None, returns the dataset

    Returns:
        The new single-frame dataset when ``output_path`` is None; otherwise
        None, after the dataset has been written to ``output_path``.

    Raises:
        ValueError: If the file has no NumberOfFrames attribute, or if
            ``slice_number`` is negative or not smaller than NumberOfFrames.
        Any error raised while reading, decoding or writing propagates
        unchanged to the caller.
    """
    # Read the multi-frame DICOM with force=True to handle potentially corrupted files
    multi_ds = dcmread(multiframe_path, force=True)

    # Verify it's a multi-frame image
    if not hasattr(multi_ds, 'NumberOfFrames'):
        raise ValueError("Input DICOM is not a multi-frame image")

    # Negative values are rejected as well: indexing would silently wrap them
    # to a frame counted from the end while InstanceNumber became <= 0.
    if slice_number < 0 or slice_number >= multi_ds.NumberOfFrames:
        raise ValueError(f"Slice number {slice_number} is out of range. "
                       f"Image has {multi_ds.NumberOfFrames} frames")

    # Create new dataset for single frame
    single_ds = FileDataset(output_path or "temp.dcm", {},
                          file_meta=FileMetaDataset(),
                          preamble=b"\0" * 128)

    # Copy the selected attributes that the source actually has
    for attr in allowed_tags:
        if hasattr(multi_ds, attr):
            setattr(single_ds, attr, getattr(multi_ds, attr))

    # Generate new UIDs
    single_ds.SOPInstanceUID = generate_uid()
    single_ds.file_meta.MediaStorageSOPInstanceUID = single_ds.SOPInstanceUID

    # The extracted frame is stored as raw bytes, so declare an uncompressed syntax
    single_ds.file_meta.TransferSyntaxUID = ImplicitVRLittleEndian
    single_ds.file_meta.MediaStorageSOPClassUID = multi_ds.file_meta.MediaStorageSOPClassUID
    if hasattr(multi_ds.file_meta, 'ImplementationClassUID'):
        single_ds.file_meta.ImplementationClassUID = multi_ds.file_meta.ImplementationClassUID

    # InstanceNumber is 1-based, slice_number is 0-based
    single_ds.InstanceNumber = slice_number + 1

    try:
        # Try to get pixel array directly
        pixel_array = multi_ds.pixel_array[slice_number]
    except Exception:
        # Second attempt: decompress the dataset first, then index again
        if hasattr(multi_ds, 'decompress'):
            multi_ds.decompress()
        pixel_array = multi_ds.pixel_array[slice_number]

    # Set pixel data
    single_ds.PixelData = pixel_array.tobytes()

    # The new dataset holds exactly one frame
    single_ds.NumberOfFrames = 1

    # Per-frame position and orientation, when the source provides them
    if hasattr(multi_ds, 'PerFrameFunctionalGroupsSequence'):
        frame_content = multi_ds.PerFrameFunctionalGroupsSequence[slice_number]

        if hasattr(frame_content, 'PlanePositionSequence'):
            position = frame_content.PlanePositionSequence[0].ImagePositionPatient
            single_ds.ImagePositionPatient = position

        if hasattr(frame_content, 'PlaneOrientationSequence'):
            orientation = frame_content.PlaneOrientationSequence[0].ImageOrientationPatient
            single_ds.ImageOrientationPatient = orientation

    # Add creation timestamp
    dt = datetime.datetime.now()
    single_ds.ContentDate = dt.strftime('%Y%m%d')
    single_ds.ContentTime = dt.strftime('%H%M%S.%f')

    # Save or return the dataset
    if output_path:
        single_ds.save_as(output_path, write_like_original=False)
        return None
    return single_ds

# Alternative version using different approach for compressed files
def extract_single_frame_alternative(multiframe_path, slice_number, output_path=None):
    """
    Alternative version for handling problematic files

    Compared with ``extract_single_frame`` this variant
      * assumes Implicit VR Little Endian when the file declares no transfer
        syntax,
      * decompresses compressed pixel data before decoding it,
      * accepts files without NumberOfFrames (treated as one frame), and
      * tolerates a missing MediaStorageSOPClassUID in the file meta.

    The pixel data is taken from the parsed dataset. The raw bytes of the
    whole file must not be assigned to PixelData, because they start with the
    preamble and header rather than with the pixels.

    Args:
        multiframe_path: Path to the source DICOM file
        slice_number: The slice number to extract (0-based index)
        output_path: Path to save the single-frame DICOM. If None, returns the dataset

    Returns:
        The new single-frame dataset when ``output_path`` is None; otherwise
        None, after the dataset has been written to ``output_path``.

    Raises:
        ValueError: If ``slice_number`` is negative or not smaller than the
            number of frames.
        Any error raised while reading, decoding or writing propagates
        unchanged to the caller.
    """
    # force=True: same tolerant read as the primary extractor
    multi_ds = dcmread(multiframe_path, force=True)

    # Without a transfer syntax the pixel data cannot be decoded; fall back to
    # the DICOM default encoding.
    if 'TransferSyntaxUID' not in multi_ds.file_meta:
        multi_ds.file_meta.TransferSyntaxUID = ImplicitVRLittleEndian

    # Only compressed data needs (and supports) decompression
    if multi_ds.file_meta.TransferSyntaxUID.is_compressed:
        multi_ds.decompress()

    n_frames = int(getattr(multi_ds, 'NumberOfFrames', 1) or 1)
    if slice_number < 0 or slice_number >= n_frames:
        raise ValueError(f"Slice number {slice_number} is out of range. "
                       f"Image has {n_frames} frames")

    # Decode once. NOTE(review): this assumes pixel_array has a leading frame
    # axis only when NumberOfFrames > 1 - confirm for the pydicom version in use.
    frames = multi_ds.pixel_array
    pixel_array = frames[slice_number] if n_frames > 1 else frames

    # Create new dataset
    single_ds = FileDataset(output_path or "temp.dcm", {},
                          file_meta=FileMetaDataset(),
                          preamble=b"\0" * 128)

    # Copy the selected attributes that the source actually has
    for attr in allowed_tags:
        if hasattr(multi_ds, attr):
            setattr(single_ds, attr, getattr(multi_ds, attr))

    # Generate new UIDs
    single_ds.SOPInstanceUID = generate_uid()
    single_ds.file_meta.MediaStorageSOPInstanceUID = single_ds.SOPInstanceUID

    # The extracted frame is stored as raw bytes, so declare an uncompressed syntax
    single_ds.file_meta.TransferSyntaxUID = ImplicitVRLittleEndian

    # Prefer the file-meta SOP class, fall back to the dataset's own SOPClassUID
    sop_class_uid = (multi_ds.file_meta.get('MediaStorageSOPClassUID', None)
                     or multi_ds.get('SOPClassUID', None))
    if sop_class_uid is not None:
        single_ds.file_meta.MediaStorageSOPClassUID = sop_class_uid
    if hasattr(multi_ds.file_meta, 'ImplementationClassUID'):
        single_ds.file_meta.ImplementationClassUID = multi_ds.file_meta.ImplementationClassUID

    # InstanceNumber is 1-based, slice_number is 0-based
    single_ds.InstanceNumber = slice_number + 1

    # Set pixel data
    single_ds.PixelData = pixel_array.tobytes()

    # The new dataset holds exactly one frame
    single_ds.NumberOfFrames = 1

    # Per-frame position and orientation, when the source provides them
    if hasattr(multi_ds, 'PerFrameFunctionalGroupsSequence'):
        frame_content = multi_ds.PerFrameFunctionalGroupsSequence[slice_number]

        if hasattr(frame_content, 'PlanePositionSequence'):
            position = frame_content.PlanePositionSequence[0].ImagePositionPatient
            single_ds.ImagePositionPatient = position

        if hasattr(frame_content, 'PlaneOrientationSequence'):
            orientation = frame_content.PlaneOrientationSequence[0].ImageOrientationPatient
            single_ds.ImageOrientationPatient = orientation

    # Add creation timestamp
    dt = datetime.datetime.now()
    single_ds.ContentDate = dt.strftime('%Y%m%d')
    single_ds.ContentTime = dt.strftime('%H%M%S.%f')

    # Save or return the dataset
    if output_path:
        single_ds.save_as(output_path, write_like_original=False)
        return None
    return single_ds

# Function to try both methods
def safe_extract_single_frame(multiframe_path, slice_number, output_path=None):
    """
    Extract one frame, falling back to a second extractor on failure.

    ``extract_single_frame`` is attempted first. If it raises any
    ``Exception``, ``extract_single_frame_alternative`` is called with the
    same arguments; an exception raised by the fallback propagates to the
    caller.

    Args:
        multiframe_path: Forwarded unchanged to both extractors.
        slice_number: Forwarded unchanged to both extractors.
        output_path: Forwarded unchanged to both extractors.

    Returns:
        Whatever the extractor that succeeded returns.
    """
    extraction_args = (multiframe_path, slice_number, output_path)
    try:
        return extract_single_frame(*extraction_args)
    except Exception:
        # Primary extractor failed; errors from the fallback are not caught here.
        return extract_single_frame_alternative(*extraction_args)

# Version with zoom functionality
def load_and_view_single_slice_with_zoom(dcm_path, x_coord, y_coord, f_coord=None, zoom_size=100):
    """
    Load and display a single DICOM slice with crosshair and zoomed inset

    Args:
        dcm_path: Path to the DICOM file
        x_coord: x coordinate for the crosshair
        y_coord: y coordinate for the crosshair
        f_coord: Zero-based frame index passed to safe_extract_single_frame,
            or None (default) to read the file as-is with dcmread
        zoom_size: Size of the zoom window in pixels
    """
    # Read DICOM file. Compare against None rather than truthiness so that
    # frame index 0 still selects the first frame.
    if f_coord is not None:
        ds = safe_extract_single_frame(dcm_path, f_coord)
    else:
        ds = dcmread(dcm_path)
    img = ds.pixel_array

    # Create figure and axes: full image on the left, zoomed crop on the right
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7))

    # Main image with crosshair
    ax1.imshow(img, cmap='gray')
    ax1.axvline(x=x_coord, color='red', alpha=0.5)
    ax1.axhline(y=y_coord, color='red', alpha=0.5)
    ax1.plot(x_coord, y_coord, 'r+', markersize=10, markeredgewidth=2)

    # Zoomed region: a zoom_size window around the point, clipped to the image
    half_window = zoom_size / 2
    x_start = int(max(0, x_coord - half_window))
    x_end = int(min(img.shape[1], x_coord + half_window))
    y_start = int(max(0, y_coord - half_window))
    y_end = int(min(img.shape[0], y_coord + half_window))

    zoomed = img[y_start:y_end, x_start:x_end]
    ax2.imshow(zoomed, cmap='gray')

    # Add crosshair to zoomed region (coordinates relative to the crop origin)
    center_x = x_coord - x_start
    center_y = y_coord - y_start
    ax2.axvline(x=center_x, color='red', alpha=0.5)
    ax2.axhline(y=center_y, color='red', alpha=0.5)
    ax2.plot(center_x, center_y, 'r+', markersize=10, markeredgewidth=2)

    ax1.axis('off')
    ax2.axis('off')
    ax1.set_title('Full Image')
    ax2.set_title('Zoomed Region')

    plt.tight_layout()
    plt.show()

In [None]:
# Example 1: view one DICOM file with a crosshair at (x_coord, y_coord).
# dcm_path, x_coord, y_coord and f_coord are module-level names that later
# cells read as well.
dcm_path = '/kaggle/input/rsna-intracranial-aneurysm-detection/series/1.2.826.0.1.3680043.8.498.10656705618563493995266564048457485210/1.2.826.0.1.3680043.8.498.42869495026349479137655237867466396964.dcm'
x_coord = 297.728962
y_coord = 209.570827
# Frame index; the viewer forwards it to safe_extract_single_frame
f_coord = 65
load_and_view_single_slice_with_zoom(dcm_path, x_coord, y_coord, f_coord)

In [None]:
# Example 2: a different DICOM file and crosshair position.
# NOTE(review): f_coord is not reassigned in this cell, so the value set in the
# previous cell (65) is reused for this file — confirm that is intended.
dcm_path = '/kaggle/input/rsna-intracranial-aneurysm-detection/series/1.2.826.0.1.3680043.8.498.10602156717395509282545203380100998253/1.2.826.0.1.3680043.8.498.50109132199445951854133683565774892169.dcm'
x_coord = 256.300637
y_coord = 146.099363
load_and_view_single_slice_with_zoom(dcm_path, x_coord, y_coord, f_coord)

In [None]:
# Turn the DICOM selected in the cells above (module-level dcm_path / f_coord)
# into a 128x128x3 tensor named train_img.
# Compare against None rather than truthiness so frame index 0 is honoured.
if f_coord is not None:
    ds = safe_extract_single_frame(dcm_path, f_coord)
else:
    ds = dcmread(dcm_path)

arr = ds.pixel_array.astype(np.float32)

# Apply rescale if present
slope = float(getattr(ds, "RescaleSlope", 1) or 1)
intercept = float(getattr(ds, "RescaleIntercept", 0) or 0)
arr = arr * slope + intercept

# Handle MONOCHROME1 (invert)
if getattr(ds, "PhotometricInterpretation", "") == "MONOCHROME1":
    arr = arr.max() - arr

image = tf.convert_to_tensor(arr)

# Add a channel axis, replicate it to RGB and min-max scale
expanded_image = tf.expand_dims(image, -1)
m, M = tf.math.reduce_min(expanded_image), tf.math.reduce_max(expanded_image)
# divide_no_nan yields 0 instead of NaN when the image is constant (M == m)
expanded_image = tf.math.divide_no_nan(tf.image.grayscale_to_rgb(expanded_image) - m, M - m)
expanded_image = tf.image.resize(expanded_image, (128,128))
sqzd_image = tf.squeeze(expanded_image)

train_img = tf.reshape(sqzd_image, shape=(128, 128, 3))

In [None]:
# Convert the tensor to a NumPy array so matplotlib can draw it
image_np = train_img.numpy()
plt.imshow(image_np)
plt.gca().set_title("TensorFlow Image Visualization")
plt.gca().axis('off')  # no ticks or frame around the image
plt.show()

In [None]:
def read_and_parse_dicom_files_tensorflow_train(dcm_path, f_coord):
    """
    Read one DICOM image and return it as a min-max scaled 128x128 RGB tensor.

    Args:
        dcm_path: Path to the DICOM file.
        f_coord: Zero-based frame index handed to ``safe_extract_single_frame``,
            or None to read the file as-is with ``dcmread``.

    Returns:
        A float tensor scaled to [0, 1]; its shape is (128, 128, 3) when the
        dataset's pixel array is 2-D.
    """
    # Compare against None rather than truthiness so frame index 0 is honoured.
    if f_coord is not None:
        ds = safe_extract_single_frame(dcm_path, f_coord)
    else:
        ds = dcmread(dcm_path)

    arr = ds.pixel_array.astype(np.float32)

    # Apply rescale if present (missing or empty attributes fall back to 1 / 0)
    slope = float(getattr(ds, "RescaleSlope", 1) or 1)
    intercept = float(getattr(ds, "RescaleIntercept", 0) or 0)
    arr = arr * slope + intercept

    # Handle MONOCHROME1 (invert)
    if getattr(ds, "PhotometricInterpretation", "") == "MONOCHROME1":
        arr = arr.max() - arr

    image = tf.convert_to_tensor(arr)

    # Add a channel axis, replicate it to RGB and min-max scale
    expanded_image = tf.expand_dims(image, -1)
    m, M = tf.math.reduce_min(expanded_image), tf.math.reduce_max(expanded_image)
    # divide_no_nan yields 0 instead of NaN when the image is constant (M == m)
    expanded_image = tf.math.divide_no_nan(tf.image.grayscale_to_rgb(expanded_image) - m, M - m)
    expanded_image = tf.image.resize(expanded_image, (128,128))
    sqzd_image = tf.squeeze(expanded_image)

    return sqzd_image

def preprocessing(dcm_path, f_coord):
    """
    Load one image and pin its shape to (128, 128, 3).

    Both arguments are forwarded unchanged to
    ``read_and_parse_dicom_files_tensorflow_train``; its result is reshaped
    with ``tf.reshape`` and returned.
    """
    return tf.reshape(
        read_and_parse_dicom_files_tensorflow_train(dcm_path, f_coord),
        shape=(128, 128, 3),
    )

def load_dataset_tensorflow_train(dcm_path, f_coord, labels):
    """
    Build one training sample as a dict of float32 tensors.

    Args:
        dcm_path: Forwarded to ``preprocessing``.
        f_coord: Forwarded to ``preprocessing``.
        labels: Label values for this image; cast to ``tf.float32``.

    Returns:
        ``{"images": <preprocessed image>, "labels": <labels>}``, both cast
        to ``tf.float32``.
    """
    sample = {}
    sample["images"] = tf.cast(preprocessing(dcm_path, f_coord), tf.float32)
    sample["labels"] = tf.cast(labels, tf.float32)
    return sample

def dict_to_tuple(inputs):
    """Unpack a sample dict into an ``(images, labels)`` tuple."""
    images = inputs["images"]
    labels = inputs["labels"]
    return images, labels

In [None]:
# Display the column names of df_all_data as the cell output
df_all_data.columns

In [None]:
# Columns kept for the baseline model: the image path followed by the
# location label columns.
cols_to_select = [
    'full_image_path',
    'left_infraclinoid_internal_carotid_artery',
    'right_infraclinoid_internal_carotid_artery',
    'left_supraclinoid_internal_carotid_artery',
    'right_supraclinoid_internal_carotid_artery',
    'left_middle_cerebral_artery',
    'right_middle_cerebral_artery',
    'anterior_communicating_artery',
    'left_anterior_cerebral_artery',
    'right_anterior_cerebral_artery',
    'left_posterior_communicating_artery',
    'right_posterior_communicating_artery',
    'basilar_tip',
    'other_posterior_circulation',
]

df_for_train_baseline = df_all_data.select(cols_to_select)

In [None]:
# First split: 60% of the rows go to x_train, 40% are held out.
x_train, x_test_val = train_test_split(
    df_for_train_baseline,
    test_size=0.4,
    random_state=42,
)
# Second split of the held-out rows: x_test receives 80% of them, x_valid 20%.
x_test, x_valid = train_test_split(
    x_test_val,
    test_size=0.2,
    random_state=42,
)

print(f"Training data shape : {x_train.shape}")
print(f"Test data shape : {x_test.shape}")
print(f"Validation data shape : {x_valid.shape}")

In [None]:
# Names of the 13 location label columns used as model targets
label_cols = [
    'left_infraclinoid_internal_carotid_artery',
    'right_infraclinoid_internal_carotid_artery',
    'left_supraclinoid_internal_carotid_artery',
    'right_supraclinoid_internal_carotid_artery',
    'left_middle_cerebral_artery',
    'right_middle_cerebral_artery',
    'anterior_communicating_artery',
    'left_anterior_cerebral_artery',
    'right_anterior_cerebral_artery',
    'left_posterior_communicating_artery',
    'right_posterior_communicating_artery',
    'basilar_tip',
    'other_posterior_circulation',
]

#### This is a multi-label classification problem where each instance (i.e. each DICOM image) can be labelled with at most 13 labels (i.e. brain locations).
#### The resulting predictions will then need to be aggregated at the level of each scan series.
#### Finally, the main target variable has to be calculated as the max of all 13 labels, i.e. if at least one of the 13 labels is 1, then the final target variable is 1.

In [None]:
# Display the replica count of tpu_strategy; generate_tf_datasets multiplies
# the per-replica batch size by this value.
tpu_strategy.num_replicas_in_sync

In [None]:
def generate_tf_datasets(p_df, p_BATCH_SIZE_PER_REPLICA, p_frame_col=None):
    """
    Build a batched, prefetched ``tf.data.Dataset`` of ``(images, labels)`` pairs.

    Args:
        p_df: polars DataFrame containing a 'full_image_path' column and every
            column named in the module-level ``label_cols``.
        p_BATCH_SIZE_PER_REPLICA: Per-replica batch size; the global batch size
            is this value times ``tpu_strategy.num_replicas_in_sync``.
        p_frame_col: Optional name of a column in ``p_df`` holding the frame
            index to extract for each file. Rows whose value is null/NaN, and
            every row when this is None (the default), are loaded with
            ``f_coord=None``.

    Returns:
        A dataset yielding float32 ``(images, labels)`` batches. The last
        partial batch is dropped.
    """
    BATCH_SIZE = p_BATCH_SIZE_PER_REPLICA * tpu_strategy.num_replicas_in_sync

    image_filenames = p_df.get_column('full_image_path').to_list()
    # Row-major matrix (one row per image, one column per label) so that
    # from_tensor_slices yields one label vector per file.
    image_labels = p_df.select(label_cols).to_numpy().astype(np.float32)

    # A dataset component cannot hold None, so -1 stands for "no frame index".
    if p_frame_col is None:
        frame_values = [-1] * len(image_filenames)
    else:
        frame_values = [
            -1 if (value is None or value != value) else int(value)
            for value in p_df.get_column(p_frame_col).to_list()
        ]
    frame_indices = np.asarray(frame_values, dtype=np.int64)

    image_dataset = tf.data.Dataset.from_tensor_slices(
        (image_filenames, frame_indices, image_labels)
    )

    def _load_eagerly(path, frame, labels):
        # Runs as ordinary Python inside tf.py_function, so .numpy() is
        # available and the DICOM reader receives a real str path.
        frame_index = int(frame.numpy())
        sample = load_dataset_tensorflow_train(
            path.numpy().decode('utf-8'),
            frame_index if frame_index >= 0 else None,
            labels,
        )
        return dict_to_tuple(sample)

    def _load_example(path, frame, labels):
        # The DICOM reading code cannot operate on the symbolic tensors that
        # Dataset.map passes in, so it is wrapped in tf.py_function.
        images, label_vector = tf.py_function(
            _load_eagerly,
            inp=[path, frame, labels],
            Tout=[tf.float32, tf.float32],
        )
        # py_function drops static shapes; restore them so batching and the
        # model see fixed dimensions.
        images.set_shape((128, 128, 3))
        label_vector.set_shape((len(label_cols),))
        return images, label_vector

    image_ds = image_dataset.map(_load_example, num_parallel_calls=tf.data.AUTOTUNE)
    image_ds = image_ds.batch(batch_size=BATCH_SIZE, drop_remainder=True)
    image_ds = image_ds.prefetch(tf.data.AUTOTUNE)

    return image_ds

In [None]:
# Build the training dataset from x_train with a per-replica batch size of 10
train_ds = generate_tf_datasets(p_df=x_train, p_BATCH_SIZE_PER_REPLICA = 10)

In [None]:
# Display the Python type of the object returned by generate_tf_datasets
type(train_ds)