## Installing relevant packages

In [1]:
!pip install duckdb --no-index --find-links=/kaggle/input/polars-and-duckdb/kaggle/working/mysitepackages/duck_pkg
!pip install python-gdcm
!pip install pylibjpeg
!pip install pylibjpeg-libjpeg==2.2.0
!pip install pylibjpeg-openjpeg==2.3.0
!pip install matplotlib==3.10.3
!pip install scikit-learn==1.7.0
!pip install polars --no-index --find-links=/kaggle/input/polars-and-duckdb/kaggle/working/mysitepackages/polars_pkg
!pip install pydicom

Looking in links: /kaggle/input/polars-and-duckdb/kaggle/working/mysitepackages/duck_pkg
Collecting python-gdcm
  Downloading python_gdcm-3.0.26-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.8 kB)
Downloading python_gdcm-3.0.26-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m85.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hInstalling collected packages: python-gdcm
Successfully installed python-gdcm-3.0.26
Collecting pylibjpeg
  Downloading pylibjpeg-2.0.1-py3-none-any.whl.metadata (7.8 kB)
Downloading pylibjpeg-2.0.1-py3-none-any.whl (24 kB)
Installing collected packages: pylibjpeg
Successfully installed pylibjpeg-2.0.1
Collecting pylibjpeg-libjpeg==2.2.0
  Downloading pylibjpeg_libjpeg-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.7 kB)
Downloading pylibjpeg_libjpeg-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux20

In [2]:
from pydicom import dcmread
from pydicom.dataset import FileDataset, FileMetaDataset
from pydicom.uid import generate_uid, ImplicitVRLittleEndian

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import polars as pl
import duckdb as dd
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2
import pickle
import gc
import ctypes
from pathlib import Path
import logging
import json
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor
import datetime
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
import tensorflow as tf
import tensorflow_io as tfio
from tensorflow import keras

2025-09-01 10:41:58.803622: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756723318.989853      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756723319.042423      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Setting polars configs to view the dataframes better

In [3]:
pl.Config(fmt_str_lengths=1000)
pl.Config.set_tbl_rows(1000)

polars.config.Config

## Load the previously prepared training metadata

In [4]:
df_all_data = pl.read_parquet('/kaggle/input/rsna-aneurysm-train-metadata-suman/full_training_data.parquet')
print("Shape of training metadata", df_all_data.shape)
df_all_data.columns

Shape of training metadata (1001346, 24)


['file_name',
 'image_shape',
 'coordinates_x',
 'coordinates_y',
 'coordinates_f',
 'aneurysm_present_in_series',
 'aneurysm_present_in_image',
 'seriesinstanceuid',
 'patientage',
 'patientsex',
 'modality',
 'left_infraclinoid_internal_carotid_artery',
 'right_infraclinoid_internal_carotid_artery',
 'left_supraclinoid_internal_carotid_artery',
 'right_supraclinoid_internal_carotid_artery',
 'left_middle_cerebral_artery',
 'right_middle_cerebral_artery',
 'anterior_communicating_artery',
 'left_anterior_cerebral_artery',
 'right_anterior_cerebral_artery',
 'left_posterior_communicating_artery',
 'right_posterior_communicating_artery',
 'basilar_tip',
 'other_posterior_circulation']

## Checking the segmentation data

In [26]:
import nibabel as nib

In [39]:
image_path = '/kaggle/input/rsna-intracranial-aneurysm-detection/segmentations/1.2.826.0.1.3680043.8.498.62169558538817009391695314359016512306.nii'
img = nib.load(image_path)

In [40]:
image_array_data = img.get_fdata()
print(img.get_fdata().shape)
print(image_array_data.shape)

(296, 512, 512)
(296, 512, 512)


In [5]:
seg_files_root_path = '/kaggle/input/rsna-intracranial-aneurysm-detection/segmentations'
all_seg_files = list(Path(seg_files_root_path).glob("*.nii*"))
print("total number of segmentation files: ", len(all_seg_files))

total number of segmentation files:  356


In [30]:
temp_seg_files = all_seg_files[0:10]

for sf in temp_seg_files:
    print(sf)

/kaggle/input/rsna-intracranial-aneurysm-detection/segmentations/1.2.826.0.1.3680043.8.498.62169558538817009391695314359016512306.nii
/kaggle/input/rsna-intracranial-aneurysm-detection/segmentations/1.2.826.0.1.3680043.8.498.15111820005882064793593034423469604305_cowseg.nii
/kaggle/input/rsna-intracranial-aneurysm-detection/segmentations/1.2.826.0.1.3680043.8.498.17415277997649872560329721717694101082_cowseg.nii
/kaggle/input/rsna-intracranial-aneurysm-detection/segmentations/1.2.826.0.1.3680043.8.498.56479623144539472445940519727300319231_cowseg.nii
/kaggle/input/rsna-intracranial-aneurysm-detection/segmentations/1.2.826.0.1.3680043.8.498.79221197357738210862579456170058377494.nii
/kaggle/input/rsna-intracranial-aneurysm-detection/segmentations/1.2.826.0.1.3680043.8.498.24941924992372724575490063788348447936.nii
/kaggle/input/rsna-intracranial-aneurysm-detection/segmentations/1.2.826.0.1.3680043.8.498.42092450058597943280470345107435382425_cowseg.nii
/kaggle/input/rsna-intracranial-an

In [6]:
class NiiRecord:
    """
    Memory-efficient class for storing NII metadata using __slots__
    """
    __slots__ = ['file_name', 'image_shape']
    
    def __init__(self, file_name, image_shape):
        self.file_name = file_name
        self.image_shape = image_shape
    
    def to_dict(self):
        return {slot: getattr(self, slot) for slot in self.__slots__}

In [23]:
def get_nii_shape_name(seg_file):
    """
    Process all the nii files and get the file name and shape of the image array
    """
    try:
        data = []
        #print("inside the function call", type(seg_files))
        #for seg_file in seg_files:
        try:
            # Read NII file
            img = nib.load(seg_file)
            original_shape = str(img.get_fdata().shape)
            
            # Create record
            record = NiiRecord(seg_file.name, original_shape)
            
            data.append(record.to_dict())
            
        except Exception as e:
            print(f"Error processing file {seg_file}: {e}")
            #continue
                
        return data
        
    except Exception as e:
        print(f"Error processing files : {e}")
        return []

In [8]:
mp.cpu_count()

4

In [None]:
schema = {
    'file_name': pl.Utf8,
    'image_shape': pl.String
}

nii_df = pl.DataFrame(
    tqdm(get_all_nii_files(seg_files_root_path)),
    schema=schema,
    infer_schema_length=None
)

In [31]:
def create_nii_dataset(root_folder, num_processes=None, chunk_size=60):
    """
    Create dataset with nii files
    """
    schema = {
        'file_name': pl.Utf8,
        'image_shape': pl.String
    }
    
    temp_dir = Path("temp_nii_chunks")
    temp_dir.mkdir(exist_ok=True)

    if not num_processes:
        num_processes = mp.cpu_count()

    seg_files = list(Path(root_folder).glob("*.nii*"))
    
    # Process folders in parallel
    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        for i in range(0, len(seg_files), chunk_size):
            chunk_seg_files = seg_files[i:i+chunk_size]
            chunk_data = []
            print("before the function call")
            futures = [
                executor.submit(
                    get_nii_shape_name, 
                    Path(seg_file)
                )
                for seg_file in chunk_seg_files
            ]
            
            for future in tqdm(futures, 
                             desc=f"Processing chunk {i//chunk_size + 1}/{(len(seg_files)-1)//chunk_size + 1}"):
                chunk_data.extend(future.result())
            
            if chunk_data:
                chunk_df = pl.DataFrame(
                    chunk_data,
                    schema=schema,
                    infer_schema_length=None
                )
                
                chunk_df.write_parquet(
                    temp_dir / f"nii_metadata_chunk_{i//chunk_size}.parquet",
                    compression="snappy"
                )
                
                del chunk_data
                del chunk_df
    
    # Combine chunks
    print("\nCombining chunks...")
    chunk_files = list(temp_dir.glob("nii_metadata_chunk_*.parquet"))
    final_nii_df = pl.concat([
        pl.scan_parquet(str(chunk_file))
        for chunk_file in chunk_files
    ]).collect()
    
    # Clean up temporary files
    for f in chunk_files:
        f.unlink()
    temp_dir.rmdir()
    
    return final_nii_df

In [32]:
try:
    full_nii_df = create_nii_dataset(
        seg_files_root_path
    )
except Exception as e:
    print(f"Error: {e}")

before the function call


Processing chunk 1/6: 100%|██████████| 60/60 [00:04<00:00, 13.05it/s]


before the function call


Processing chunk 2/6: 100%|██████████| 60/60 [00:21<00:00,  2.74it/s]


before the function call


Processing chunk 3/6: 100%|██████████| 60/60 [00:23<00:00,  2.52it/s]


before the function call


Processing chunk 4/6: 100%|██████████| 60/60 [00:27<00:00,  2.18it/s]


before the function call


Processing chunk 5/6: 100%|██████████| 60/60 [00:28<00:00,  2.14it/s]


before the function call


Processing chunk 6/6: 100%|██████████| 56/56 [00:20<00:00,  2.76it/s]


Combining chunks...





In [33]:
full_nii_df.shape

(356, 2)

In [34]:
full_nii_df.head(5)

file_name,image_shape
str,str
"""1.2.826.0.1.3680043.8.498.72679260079421518845786364620483278827.nii""","""(512, 512, 305)"""
"""1.2.826.0.1.3680043.8.498.11936548827981649628619858103408216131_cowseg.nii""","""(512, 512, 25)"""
"""1.2.826.0.1.3680043.8.498.86822530556046989269633487715061058236.nii""","""(512, 512, 414)"""
"""1.2.826.0.1.3680043.8.498.10557880026294057874761753231388788828.nii""","""(512, 512, 28)"""
"""1.2.826.0.1.3680043.8.498.58207377463057728877763676083525829095.nii""","""(512, 512, 27)"""
