# Import required libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import polars as pl
#import duckdb as dd
#from tqdm import tqdm
import matplotlib.pyplot as plt
#import cv2
#from pydicom import dcmread
import warnings
#from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import pickle
#import gc
import ctypes
import tensorflow as tf
#tf.compat.v1.disable_eager_execution()
#tf.keras.backend.clear_session()

"""for gpu in tf.config.experimental.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(gpu, True)"""
    
import tensorflow_io as tfio
from tensorflow import keras

#tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [2]:
try: # detect TPUs
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError: # detect GPUs
    strategy = tf.distribute.MirroredStrategy() # for GPU or multi-GPU machines
    #strategy = tf.distribute.get_strategy() # default strategy that works on CPU and single GPU
    #strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() # for clusters of multi-GPU machines

print("Number of accelerators: ", strategy.num_replicas_in_sync)

Number of accelerators:  2


In [3]:
# strategy = tf.distribute.MirroredStrategy()
print('DEVICES AVAILABLE: {}'.format(strategy.num_replicas_in_sync))

BATCH_SIZE_PER_REPLICA = 48

#We obtain the BATCH_SIZE dividing by the number of devices. 
#BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
BATCH_SIZE = BATCH_SIZE_PER_REPLICA * 2

DEVICES AVAILABLE: 2


# Feature extraction from image files
## Only if not using tensorflow
### kept here as backward compatibility with earlier work

In [None]:
metadata_root_path = '/kaggle/input/spinal-canal-stenosis-metadata'

for file in os.listdir(metadata_root_path):
    labels = []
    features = []
    data = {}
    metadata_file_path = os.path.join(metadata_root_path, file)
    print(metadata_file_path)
    metadata_df = pl.read_csv(metadata_file_path, low_memory=True)
    for j in tqdm(range(metadata_df.shape[0])):
        dcm_image_path = metadata_df.item(j,0)
        dicom_ds = dcmread(dcm_image_path)
        img_array = dicom_ds.pixel_array
        features.append(np.mean(img_array.T, axis=0))
        # features.append(dcmread(dcm_image_path).pixel_array)
        labels.append(metadata_df.item(j,2))

    print("feature list length --> ", len(features))
    print("label list length --> ", len(labels))

    #extracted_training_features = np.array([np.resize(img,(128,128)) for img in features])
    #training_labels = np.array(labels)

    #del labels
    #del features
    #gc.collect()

    extracted_training_features_file_name = "{0}_training_features".format(file.replace('_feature_metadata.csv',''))
    labels_file_name = "{0}_labels".format(file.replace('.csv',''))

    with open(extracted_training_features_file_name, "wb") as file:
        pickle.dump(features, file)
    with open(labels_file_name, "wb") as file:
        pickle.dump(labels, file)

    del labels
    del features
    gc.collect()
    libc = ctypes.CDLL("libc.so.6") # clearing cache 
    libc.malloc_trim(0)

    #data["image_array"] = features
    #data["encoded_severity"] = labels
    #extracted_training_data = pd.DataFrame(data)
    #extracted_training_data.to_csv("{0}.csv.gz".format(file.replace('.csv','')), index=False, compression='gzip')

    print('finished dumping features & labels for {0}'.format(file))

# Review pipeline with one sample image

In [None]:
image_bytes \
= tf.io.read_file('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_images/100206310/1012284084/1.dcm')

image = tfio.image.decode_dicom_image(image_bytes, scale='auto', dtype=tf.float32)

m, M=tf.math.reduce_min(image), tf.math.reduce_max(image)
image = (tf.image.grayscale_to_rgb(image)-m)/(M-m)
image = tf.image.resize(image, (128,128))

# Preprocessing functions

In [4]:
def read_and_parse_dicom_files(full_file_path):
    tf.config.run_functions_eagerly(True)
    raw_image = tf.io.read_file(full_file_path)
    sp = tf.strings.split(tf.gather(tf.strings.split(full_file_path, 'images/'), 1), '/')
    N = tf.size(sp)
    LEN = tf.strings.length(tf.gather(sp, 0))+tf.strings.length(tf.gather(sp, 2))
    
    # Add missing file metadata to avoid warnnigs flooding
    if   LEN==12: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x92\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==13: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x92\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==14: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x94\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==15: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x94\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==16: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x96\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==17: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x96\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==18: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x98\x00\x00\x00\x02\x00\x01\x00')
    
    #image_bytes = tf.io.read_file(full_file_path)
    #image = tfio.image.decode_dicom_image(image_bytes, scale='auto', dtype=tf.float32)
    image = tfio.image.decode_dicom_image(raw_image, scale='auto', dtype=tf.float32)
    m, M=tf.math.reduce_min(image), tf.math.reduce_max(image)
    image = (tf.image.grayscale_to_rgb(image)-m)/(M-m)
    image = tf.image.resize(image, (128,128))
    return tf.squeeze(image)

In [5]:
def load_dataset(image_path, labels):
    image = read_and_parse_dicom_files(image_path)
    return {"images": tf.cast(image, tf.float32), "labels": tf.cast(labels, tf.float32)}

def dict_to_tuple(inputs):
    return inputs["images"], inputs["labels"]

# Train, Test, Validation & holdout splits
## holdout set to be used for CV

In [6]:
metadata_file_path = '/kaggle/input/spinal-canal-stenosis-metadata/spinal_canal_stenosis_l1_l2_feature_metadata.csv'
metadata_df = pl.read_csv(metadata_file_path, low_memory=True)

In [7]:
metadata_df.columns

['full_img_path', 'severity', 'encoded_severity']

In [18]:
"""def attach_weights(label):
    if label == 'Normal/Mild':
        return 1
    elif label == 'Severe':
        return 4
    else:
        return 2
    
metadata_df = metadata_df.with_columns([
    pl.col("severity").map_elements(attach_weights, return_dtype=pl.Int32).alias("sample_weight")
])"""

In [7]:
for_train, holdout = train_test_split(metadata_df, test_size=0.4, random_state=42)

x_train, x_test_val = train_test_split(for_train, test_size=0.3, random_state=42)
x_test, x_valid = train_test_split(x_test_val, test_size=0.2, random_state=42)

print("Training data shape : {0}".format(x_train.shape))
print("Test data shape : {0}".format(x_test.shape))
print("Validation data shape : {0}".format(x_valid.shape))
print("Holdout data shape : {0}".format(holdout.shape))

Training data shape : (61794, 3)
Test data shape : (21187, 3)
Validation data shape : (5297, 3)
Holdout data shape : (58853, 3)


In [8]:
holdout_image_filenames = pl.Series(holdout.select(pl.col('full_img_path'))).to_list()
holdout_image_labels = pl.Series(holdout.select(pl.col('encoded_severity'))).to_list()
#holdout_weights = pl.Series(holdout.select(pl.col('sample_weight'))).to_list()

train_image_filenames = pl.Series(x_train.select(pl.col('full_img_path'))).to_list()
train_image_labels = pl.Series(x_train.select(pl.col('encoded_severity'))).to_list()
#train_weights = pl.Series(x_train.select(pl.col('sample_weight'))).to_list()

test_image_filenames = pl.Series(x_test.select(pl.col('full_img_path'))).to_list()
test_image_labels = pl.Series(x_test.select(pl.col('encoded_severity'))).to_list()
#test_weights = pl.Series(x_test.select(pl.col('sample_weight'))).to_list()

valid_image_filenames = pl.Series(x_valid.select(pl.col('full_img_path'))).to_list()
valid_image_labels = pl.Series(x_valid.select(pl.col('encoded_severity'))).to_list()
#valid_weights = pl.Series(x_valid.select(pl.col('sample_weight'))).to_list()

In [9]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_image_filenames, train_image_labels))

test_dataset = tf.data.Dataset.from_tensor_slices((test_image_filenames, test_image_labels))

valid_dataset = tf.data.Dataset.from_tensor_slices((valid_image_filenames, valid_image_labels))

holdout_dataset = tf.data.Dataset.from_tensor_slices((holdout_image_filenames, holdout_image_labels))

In [10]:
train_ds = train_dataset.map(load_dataset, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.map(dict_to_tuple, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.batch(batch_size=BATCH_SIZE, drop_remainder=True)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

test_ds = test_dataset.map(load_dataset, num_parallel_calls=tf.data.AUTOTUNE)
test_ds = test_ds.map(dict_to_tuple, num_parallel_calls=tf.data.AUTOTUNE)
test_ds = test_ds.batch(batch_size=BATCH_SIZE, drop_remainder=True)
test_ds = test_ds.prefetch(tf.data.AUTOTUNE)

valid_ds = valid_dataset.map(load_dataset, num_parallel_calls=tf.data.AUTOTUNE)
valid_ds = valid_ds.map(dict_to_tuple, num_parallel_calls=tf.data.AUTOTUNE)
valid_ds = valid_ds.batch(batch_size=BATCH_SIZE, drop_remainder=True)
valid_ds = valid_ds.prefetch(tf.data.AUTOTUNE)

holdout_ds = holdout_dataset.map(load_dataset, num_parallel_calls=tf.data.AUTOTUNE)
holdout_ds = holdout_ds.map(dict_to_tuple, num_parallel_calls=tf.data.AUTOTUNE)
holdout_ds = holdout_ds.batch(batch_size=BATCH_SIZE, drop_remainder=True)
holdout_ds = holdout_ds.prefetch(tf.data.AUTOTUNE)



In [12]:
elem = next(iter(train_ds))
type(elem)

tuple

In [13]:
elem[1]

<tf.Tensor: shape=(96,), dtype=float32, numpy=
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)>

In [22]:
tf.constant(np.ones(48))

<tf.Tensor: shape=(48,), dtype=float64, numpy=
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])>

#### Create a solution dataframe with predictions as 1 or 0 depending on the labels
#### using this solution dataframe and the actual predictions and sample weights (given in the problem statement)
#### build a custom keras metric

In [None]:
pl.Config(fmt_str_lengths=1000)
x_train.head(10)

In [None]:
def get_study_id(full_image_path):
    return full_image_path.split('/')[-3]

x_train_studies = x_train.with_columns(
    pl.col("full_img_path")
    .map_elements(get_study_id, return_dtype=pl.String)
    .alias("study_id")
)
x_train_studies.head(10)

In [None]:
x_train.select([pl.col('severity'), pl.col('encoded_severity')]).unique()

In [None]:
x_train.select(pl.col('encoded_severity')).unique().to_series().to_list()

### Code to create custom metric aligned with Competition evaluation criteria

In [11]:
train_studies_metadata_file_path = '/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train.csv'
train_studies_metadata_df = pl.read_csv(train_studies_metadata_file_path, low_memory=True)
print("before dropping nulls :", train_studies_metadata_df.shape)
train_studies_metadata_df = train_studies_metadata_df.drop_nulls()
print("after dropping nulls :", train_studies_metadata_df.shape)

studies_full = train_studies_metadata_df.select(pl.col('study_id')).unique().to_series().to_list()
print(len(studies_full))
#train_studies_metadata_df.select([pl.col('study_id'),pl.col('spinal_canal_stenosis_l1_l2')]).head(10)

before dropping nulls : (1975, 26)
after dropping nulls : (1790, 26)
1790


In [12]:
config = {}
config['root_file_path'] = '/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_images'
config['start'] = 10
config['end'] = 110
#studies = os.listdir(config['root_file_path'])[config['start']:config['end']]
studies = studies_full[config['start']:config['end']]
print(len(studies))

test_dict = {}
for study in studies:
    image_files = []
    for dirname, _, filenames in os.walk(config['root_file_path']+'/'+str(study)):
        for filename in filenames:
            test_dict[os.path.join(dirname, filename).split('/')[-3]] = image_files
            image_files.append(os.path.join(dirname, filename))
            
print(len(test_dict))

100
100


In [13]:
def label_encoder(label):
    if label == 'Normal/Mild':
        return 2
    elif label == 'Severe':
        return 3
    else:
        return 1
    
def attach_weights(label):
    if label == 'Normal/Mild':
        return 1
    elif label == 'Severe':
        return 4
    else:
        return 2

train_studies_metadata_df_up = train_studies_metadata_df.unpivot(index="study_id")
train_studies_metadata_df_up.columns = ['study_id', 'condition', 'severity']

train_studies_metadata_df_up = train_studies_metadata_df_up.with_columns([
    pl.col("severity").map_elements(label_encoder, return_dtype=pl.Int32).alias("encoded_severity"),
    pl.col("severity").map_elements(attach_weights, return_dtype=pl.Int32).alias("sample_weight"),
    (pl.col("study_id").cast(pl.String)+'_'+pl.col("condition")).alias("row_id")
])

print(train_studies_metadata_df_up.shape)
train_studies_metadata_df_up.head(10)

(44750, 6)


study_id,condition,severity,encoded_severity,sample_weight,row_id
i64,str,str,i32,i32,str
4003253,"""spinal_canal_stenosis_l1_l2""","""Normal/Mild""",2,1,"""4003253_spinal_canal_stenosis_…"
4646740,"""spinal_canal_stenosis_l1_l2""","""Normal/Mild""",2,1,"""4646740_spinal_canal_stenosis_…"
7143189,"""spinal_canal_stenosis_l1_l2""","""Normal/Mild""",2,1,"""7143189_spinal_canal_stenosis_…"
8785691,"""spinal_canal_stenosis_l1_l2""","""Normal/Mild""",2,1,"""8785691_spinal_canal_stenosis_…"
10728036,"""spinal_canal_stenosis_l1_l2""","""Normal/Mild""",2,1,"""10728036_spinal_canal_stenosis…"
11340341,"""spinal_canal_stenosis_l1_l2""","""Normal/Mild""",2,1,"""11340341_spinal_canal_stenosis…"
11943292,"""spinal_canal_stenosis_l1_l2""","""Normal/Mild""",2,1,"""11943292_spinal_canal_stenosis…"
13317052,"""spinal_canal_stenosis_l1_l2""","""Normal/Mild""",2,1,"""13317052_spinal_canal_stenosis…"
22191399,"""spinal_canal_stenosis_l1_l2""","""Normal/Mild""",2,1,"""22191399_spinal_canal_stenosis…"
26342422,"""spinal_canal_stenosis_l1_l2""","""Normal/Mild""",2,1,"""26342422_spinal_canal_stenosis…"


In [14]:
temp = train_studies_metadata_df_up\
.select([pl.col('study_id'), pl.col('row_id'), pl.col('encoded_severity'), pl.col('severity'), pl.col('sample_weight')])
temp.head(10)

study_id,row_id,encoded_severity,severity,sample_weight
i64,str,i32,str,i32
4003253,"""4003253_spinal_canal_stenosis_…",2,"""Normal/Mild""",1
4646740,"""4646740_spinal_canal_stenosis_…",2,"""Normal/Mild""",1
7143189,"""7143189_spinal_canal_stenosis_…",2,"""Normal/Mild""",1
8785691,"""8785691_spinal_canal_stenosis_…",2,"""Normal/Mild""",1
10728036,"""10728036_spinal_canal_stenosis…",2,"""Normal/Mild""",1
11340341,"""11340341_spinal_canal_stenosis…",2,"""Normal/Mild""",1
11943292,"""11943292_spinal_canal_stenosis…",2,"""Normal/Mild""",1
13317052,"""13317052_spinal_canal_stenosis…",2,"""Normal/Mild""",1
22191399,"""22191399_spinal_canal_stenosis…",2,"""Normal/Mild""",1
26342422,"""26342422_spinal_canal_stenosis…",2,"""Normal/Mild""",1


In [15]:
train_studies_metadata_df_final = temp.pivot("severity", index=["study_id","row_id"], values="encoded_severity")
#.select([pl.col('study_id'), pl.col('row_id'), pl.col('encoded_severity'), pl.col('severity')])\
#.pivot("severity", index=["study_id","row_id"], values="encoded_severity")

train_studies_metadata_df_final.columns = ['study_id', 'row_id', 'normal_mild', 'moderate', 'severe']

train_studies_metadata_df_final.head(10)

study_id,row_id,normal_mild,moderate,severe
i64,str,i32,i32,i32
4003253,"""4003253_spinal_canal_stenosis_…",2,,
4646740,"""4646740_spinal_canal_stenosis_…",2,,
7143189,"""7143189_spinal_canal_stenosis_…",2,,
8785691,"""8785691_spinal_canal_stenosis_…",2,,
10728036,"""10728036_spinal_canal_stenosis…",2,,
11340341,"""11340341_spinal_canal_stenosis…",2,,
11943292,"""11943292_spinal_canal_stenosis…",2,,
13317052,"""13317052_spinal_canal_stenosis…",2,,
22191399,"""22191399_spinal_canal_stenosis…",2,,
26342422,"""26342422_spinal_canal_stenosis…",2,,


In [16]:
train_studies_metadata_df_final_2 = train_studies_metadata_df_final.join(temp, on=["study_id","row_id"], how="inner")
#train_studies_metadata_df_final.columns = ['study_id', 'row_id', 'normal_mild', 'moderate', 'severe', 'sample_weight']
train_studies_metadata_df_final_2.head(10)

study_id,row_id,normal_mild,moderate,severe,encoded_severity,severity,sample_weight
i64,str,i32,i32,i32,i32,str,i32
4003253,"""4003253_spinal_canal_stenosis_…",2,,,2,"""Normal/Mild""",1
4646740,"""4646740_spinal_canal_stenosis_…",2,,,2,"""Normal/Mild""",1
7143189,"""7143189_spinal_canal_stenosis_…",2,,,2,"""Normal/Mild""",1
8785691,"""8785691_spinal_canal_stenosis_…",2,,,2,"""Normal/Mild""",1
10728036,"""10728036_spinal_canal_stenosis…",2,,,2,"""Normal/Mild""",1
11340341,"""11340341_spinal_canal_stenosis…",2,,,2,"""Normal/Mild""",1
11943292,"""11943292_spinal_canal_stenosis…",2,,,2,"""Normal/Mild""",1
13317052,"""13317052_spinal_canal_stenosis…",2,,,2,"""Normal/Mild""",1
22191399,"""22191399_spinal_canal_stenosis…",2,,,2,"""Normal/Mild""",1
26342422,"""26342422_spinal_canal_stenosis…",2,,,2,"""Normal/Mild""",1


In [17]:
train_studies_metadata_df_final_2 = train_studies_metadata_df_final_2.drop(['encoded_severity', 'severity'])

In [18]:
train_studies_metadata_df_final_2 = train_studies_metadata_df_final_2.with_columns([
    pl.when(pl.col('normal_mild').is_not_null()).then(1).otherwise(0).alias('true_normal_mild'),
    pl.when(pl.col('moderate').is_not_null()).then(1).otherwise(0).alias('true_moderate'),
    pl.when(pl.col('severe').is_not_null()).then(1).otherwise(0).alias('true_severe'),
])

train_studies_metadata_df_final_2.head(10)

study_id,row_id,normal_mild,moderate,severe,sample_weight,true_normal_mild,true_moderate,true_severe
i64,str,i32,i32,i32,i32,i32,i32,i32
4003253,"""4003253_spinal_canal_stenosis_…",2,,,1,1,0,0
4646740,"""4646740_spinal_canal_stenosis_…",2,,,1,1,0,0
7143189,"""7143189_spinal_canal_stenosis_…",2,,,1,1,0,0
8785691,"""8785691_spinal_canal_stenosis_…",2,,,1,1,0,0
10728036,"""10728036_spinal_canal_stenosis…",2,,,1,1,0,0
11340341,"""11340341_spinal_canal_stenosis…",2,,,1,1,0,0
11943292,"""11943292_spinal_canal_stenosis…",2,,,1,1,0,0
13317052,"""13317052_spinal_canal_stenosis…",2,,,1,1,0,0
22191399,"""22191399_spinal_canal_stenosis…",2,,,1,1,0,0
26342422,"""26342422_spinal_canal_stenosis…",2,,,1,1,0,0


In [19]:
train_studies_metadata_df_final_2 = train_studies_metadata_df_final_2.drop(['normal_mild', 'moderate', 'severe'])
train_studies_metadata_df_final_2.columns = ['study_id', 'row_id', 'sample_weight', 'normal_mild', 'moderate', 'severe']
train_studies_metadata_df_final_2.head(10)

study_id,row_id,sample_weight,normal_mild,moderate,severe
i64,str,i32,i32,i32,i32
4003253,"""4003253_spinal_canal_stenosis_…",1,1,0,0
4646740,"""4646740_spinal_canal_stenosis_…",1,1,0,0
7143189,"""7143189_spinal_canal_stenosis_…",1,1,0,0
8785691,"""8785691_spinal_canal_stenosis_…",1,1,0,0
10728036,"""10728036_spinal_canal_stenosis…",1,1,0,0
11340341,"""11340341_spinal_canal_stenosis…",1,1,0,0
11943292,"""11943292_spinal_canal_stenosis…",1,1,0,0
13317052,"""13317052_spinal_canal_stenosis…",1,1,0,0
22191399,"""22191399_spinal_canal_stenosis…",1,1,0,0
26342422,"""26342422_spinal_canal_stenosis…",1,1,0,0


In [20]:
train_studies_metadata_df_final_2.shape

(44750, 6)

In [21]:
solutions = train_studies_metadata_df_final_2.filter(pl.col('study_id').is_in(studies))
solutions = solutions.drop(['study_id'])
print(solutions.shape)
#solutions.head(10)

(2500, 5)


### Now generate the predictions

In [22]:
model = keras.models.\
load_model("/kaggle/input/keras_base_scs_l1_l2/tensorflow2/default/1/keras_base_spinal_canal_stenosis_l1_l2.h5")

In [23]:
def read_and_parse_dicom_files_for_inf(full_file_path):
    tf.config.run_functions_eagerly(True)
    raw_image = tf.io.read_file(full_file_path)
    sp = tf.strings.split(tf.gather(tf.strings.split(full_file_path, 'images/'), 1), '/')
    N = tf.size(sp)
    LEN = tf.strings.length(tf.gather(sp, 0))+tf.strings.length(tf.gather(sp, 2))
    
    # Add missing file metadata to avoid warnnigs flooding
    if   LEN==12: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x92\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==13: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x92\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==14: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x94\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==15: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x94\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==16: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x96\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==17: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x96\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==18: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x98\x00\x00\x00\x02\x00\x01\x00')
    
    #image_bytes = tf.io.read_file(full_file_path)
    #image = tfio.image.decode_dicom_image(image_bytes, scale='auto', dtype=tf.float32)
    image = tfio.image.decode_dicom_image(raw_image, scale='auto', dtype=tf.float32)
    m, M=tf.math.reduce_min(image), tf.math.reduce_max(image)
    image = (tf.image.grayscale_to_rgb(image)-m)/(M-m)
    image = tf.image.resize(image, (128,128))
    return tf.squeeze(image)

In [24]:
vfunc = np.vectorize(read_and_parse_dicom_files_for_inf, otypes=[object])

def get_predictions(key, model_to_use):
    final_feature_list = vfunc(test_dict[key]).tolist()
    final = np.array(final_feature_list)
    return model_to_use.predict(final)

In [25]:
from multiprocessing import cpu_count
n_cores = cpu_count()
print(f'Number of Logical CPU cores: {n_cores}')

Number of Logical CPU cores: 4


In [26]:
#from joblib import Parallel, delayed
from tqdm import tqdm

#y_proba = (Parallel(n_jobs=90)(delayed(get_predictions)(st, model) for st in tqdm(test_dict.keys())))
y_proba = [get_predictions(st, model) for st in tqdm(test_dict.keys())]

  outputs = ufunc(*inputs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 277ms/step


  1%|          | 1/100 [00:04<08:03,  4.89s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 249ms/step


  2%|▏         | 2/100 [00:08<06:50,  4.18s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 250ms/step


  3%|▎         | 3/100 [00:11<05:54,  3.66s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 256ms/step


  4%|▍         | 4/100 [00:17<07:00,  4.38s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 270ms/step


  5%|▌         | 5/100 [00:19<05:58,  3.77s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 255ms/step


  6%|▌         | 6/100 [00:24<06:21,  4.06s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 251ms/step


  7%|▋         | 7/100 [00:27<05:45,  3.72s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 260ms/step


  8%|▊         | 8/100 [00:29<04:55,  3.21s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 239ms/step


  9%|▉         | 9/100 [00:31<04:30,  2.97s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 256ms/step


 10%|█         | 10/100 [00:35<04:43,  3.15s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 230ms/step


 11%|█         | 11/100 [00:39<05:10,  3.49s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 232ms/step


 12%|█▏        | 12/100 [00:43<05:07,  3.49s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 251ms/step


 13%|█▎        | 13/100 [00:47<05:19,  3.68s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 242ms/step


 14%|█▍        | 14/100 [00:50<05:00,  3.49s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238ms/step


 15%|█▌        | 15/100 [00:52<04:29,  3.17s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 267ms/step


 16%|█▌        | 16/100 [00:56<04:27,  3.18s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 276ms/step


 17%|█▋        | 17/100 [00:58<04:05,  2.96s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 254ms/step


 18%|█▊        | 18/100 [01:00<03:48,  2.79s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 248ms/step


 19%|█▉        | 19/100 [01:03<03:40,  2.72s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 235ms/step


 20%|██        | 20/100 [01:07<03:59,  3.00s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 267ms/step


 21%|██        | 21/100 [01:10<03:55,  2.98s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 239ms/step


 22%|██▏       | 22/100 [01:13<03:50,  2.96s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 260ms/step


 23%|██▎       | 23/100 [01:17<04:26,  3.46s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 251ms/step


 24%|██▍       | 24/100 [01:21<04:26,  3.51s/it]

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 232ms/step


 25%|██▌       | 25/100 [01:27<05:24,  4.33s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 252ms/step


 26%|██▌       | 26/100 [01:30<04:51,  3.94s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 231ms/step


 27%|██▋       | 27/100 [01:36<05:22,  4.42s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 234ms/step


 28%|██▊       | 28/100 [01:40<05:14,  4.36s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 236ms/step


 29%|██▉       | 29/100 [01:44<05:06,  4.31s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 237ms/step


 30%|███       | 30/100 [01:49<05:16,  4.52s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 233ms/step


 31%|███       | 31/100 [01:52<04:50,  4.21s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 233ms/step


 32%|███▏      | 32/100 [01:55<04:11,  3.69s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 241ms/step


 33%|███▎      | 33/100 [01:58<03:45,  3.36s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 274ms/step


 34%|███▍      | 34/100 [02:00<03:30,  3.20s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238ms/step


 35%|███▌      | 35/100 [02:03<03:17,  3.04s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 243ms/step


 36%|███▌      | 36/100 [02:06<03:10,  2.97s/it]

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 233ms/step


 37%|███▋      | 37/100 [02:11<03:40,  3.50s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 239ms/step


 38%|███▊      | 38/100 [02:14<03:32,  3.42s/it]

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 238ms/step


 39%|███▉      | 39/100 [02:21<04:35,  4.51s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 240ms/step


 40%|████      | 40/100 [02:27<04:53,  4.89s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 238ms/step


 41%|████      | 41/100 [02:30<04:27,  4.54s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 240ms/step


 42%|████▏     | 42/100 [02:34<04:05,  4.23s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 264ms/step


 43%|████▎     | 43/100 [02:37<03:41,  3.88s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 246ms/step


 44%|████▍     | 44/100 [02:41<03:41,  3.96s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 264ms/step


 45%|████▌     | 45/100 [02:44<03:17,  3.58s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 232ms/step


 46%|████▌     | 46/100 [02:47<03:12,  3.56s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 233ms/step


 47%|████▋     | 47/100 [02:50<02:51,  3.24s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 245ms/step


 48%|████▊     | 48/100 [02:53<02:41,  3.10s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 227ms/step


 49%|████▉     | 49/100 [02:56<02:45,  3.25s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 234ms/step


 50%|█████     | 50/100 [03:00<02:57,  3.55s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 236ms/step


 51%|█████     | 51/100 [03:05<03:15,  3.99s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 234ms/step


 52%|█████▏    | 52/100 [03:09<03:10,  3.97s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 241ms/step


 53%|█████▎    | 53/100 [03:14<03:16,  4.19s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 232ms/step


 54%|█████▍    | 54/100 [03:17<02:54,  3.79s/it]

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 231ms/step


 55%|█████▌    | 55/100 [03:21<02:56,  3.92s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 243ms/step


 56%|█████▌    | 56/100 [03:24<02:38,  3.61s/it]

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 235ms/step


 57%|█████▋    | 57/100 [03:29<02:51,  3.99s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 232ms/step


 58%|█████▊    | 58/100 [03:32<02:33,  3.65s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 243ms/step


 59%|█████▉    | 59/100 [03:38<03:00,  4.41s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 248ms/step


 60%|██████    | 60/100 [03:41<02:37,  3.93s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 239ms/step


 61%|██████    | 61/100 [03:45<02:32,  3.92s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 229ms/step


 62%|██████▏   | 62/100 [03:47<02:08,  3.38s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 234ms/step


 63%|██████▎   | 63/100 [03:52<02:26,  3.95s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 236ms/step


 64%|██████▍   | 64/100 [03:55<02:12,  3.67s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 236ms/step


 65%|██████▌   | 65/100 [03:59<02:13,  3.81s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 235ms/step


 66%|██████▌   | 66/100 [04:02<02:02,  3.61s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 236ms/step


 67%|██████▋   | 67/100 [04:05<01:45,  3.21s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 245ms/step


 68%|██████▊   | 68/100 [04:08<01:46,  3.32s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 256ms/step


 69%|██████▉   | 69/100 [04:12<01:44,  3.36s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 234ms/step


 70%|███████   | 70/100 [04:16<01:47,  3.59s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 241ms/step


 71%|███████   | 71/100 [04:19<01:44,  3.61s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 234ms/step


 72%|███████▏  | 72/100 [04:23<01:41,  3.63s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 229ms/step


 73%|███████▎  | 73/100 [04:26<01:32,  3.43s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 239ms/step


 74%|███████▍  | 74/100 [04:30<01:31,  3.50s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 265ms/step


 75%|███████▌  | 75/100 [04:33<01:23,  3.33s/it]

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 245ms/step


 76%|███████▌  | 76/100 [04:45<02:25,  6.05s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 233ms/step


 77%|███████▋  | 77/100 [04:49<02:02,  5.32s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 232ms/step


 78%|███████▊  | 78/100 [04:53<01:49,  4.99s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 230ms/step


 79%|███████▉  | 79/100 [04:55<01:28,  4.20s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 242ms/step


 80%|████████  | 80/100 [04:58<01:13,  3.66s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 234ms/step


 81%|████████  | 81/100 [05:02<01:11,  3.74s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 255ms/step


 82%|████████▏ | 82/100 [05:04<01:00,  3.39s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 230ms/step


 83%|████████▎ | 83/100 [05:08<01:02,  3.66s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 227ms/step


 84%|████████▍ | 84/100 [05:12<00:58,  3.65s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 252ms/step


 85%|████████▌ | 85/100 [05:14<00:49,  3.28s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 231ms/step


 86%|████████▌ | 86/100 [05:17<00:44,  3.16s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 231ms/step


 87%|████████▋ | 87/100 [05:20<00:38,  2.93s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 227ms/step


 88%|████████▊ | 88/100 [05:22<00:32,  2.73s/it]

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 234ms/step


 89%|████████▉ | 89/100 [05:26<00:34,  3.17s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 236ms/step


 90%|█████████ | 90/100 [05:30<00:32,  3.21s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 247ms/step


 91%|█████████ | 91/100 [05:33<00:29,  3.26s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 232ms/step


 92%|█████████▏| 92/100 [05:36<00:26,  3.30s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 235ms/step


 93%|█████████▎| 93/100 [05:41<00:26,  3.81s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 246ms/step


 94%|█████████▍| 94/100 [05:45<00:22,  3.72s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 246ms/step


 95%|█████████▌| 95/100 [05:48<00:17,  3.52s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 232ms/step


 96%|█████████▌| 96/100 [05:50<00:12,  3.16s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 231ms/step


 97%|█████████▋| 97/100 [05:55<00:11,  3.75s/it]

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 240ms/step


 98%|█████████▊| 98/100 [06:00<00:07,  3.91s/it]

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 231ms/step


 99%|█████████▉| 99/100 [06:02<00:03,  3.46s/it]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 241ms/step


100%|██████████| 100/100 [06:05<00:00,  3.66s/it]


In [27]:
rows = {}

for i in range(len(y_proba)):
    rows[list(test_dict.keys())[i]+'_spinal_canal_stenosis_l1_l2'] = np.mean(y_proba[i], axis=0)
    
submission = pd.read_csv('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/sample_submission.csv')
submission['row_id'] = 'samples'

conditions = ['spinal_canal_stenosis', 'neural_foraminal_narrowing', 'subarticular_stenosis']
sides = ['left', 'right']
vertebrae_levels = ['l1_l2', 'l2_l3', 'l3_l4', 'l4_l5', 'l5_s1']
severity_levels = ['normal_mild', 'moderate', 'severe']

for c in conditions:
    for v in vertebrae_levels:
        if c != 'spinal_canal_stenosis':
            for s in sides:
                for st in test_dict.keys():
                    rows[st+'_'+s+'_'+c+'_'+v] = np.array([0.333333, 0.333333, 0.333333])
        else:
            for st in test_dict.keys():
                if c+'_'+v == 'spinal_canal_stenosis_l1_l2':
                    pass
                else:
                    rows[st+'_'+c+'_'+v] = np.array([0.333333, 0.333333, 0.333333])
                    
for row_id, feature in tqdm(rows.items()):
    feature_set_reshaped = feature.reshape(1, -1)
    predictions = np.ascontiguousarray(feature_set_reshaped)
    df = pd.DataFrame(predictions, columns=severity_levels)
    df.insert(loc=0, column='row_id', value=row_id)
    submission = pd.concat([submission,df]).reset_index(drop=True)

i = submission[(submission.row_id == 'samples')].index
submission = submission.drop(i).reset_index(drop=True)

100%|██████████| 2500/2500 [00:01<00:00, 1335.09it/s]


In [28]:
solutions_pd = solutions.to_pandas()
print(submission.shape)
print(solutions_pd.shape)

(2500, 4)
(2500, 5)


In [29]:
def get_condition(full_location: str) -> str:
    # Given an input like spinal_canal_stenosis_l1_l2 extracts 'spinal'
    for injury_condition in ['spinal', 'foraminal', 'subarticular']:
        if injury_condition in full_location:
            return injury_condition
    raise ValueError(f'condition not found in {full_location}')

In [30]:
target_levels = ['normal_mild', 'moderate', 'severe']

if not pd.api.types.is_numeric_dtype(submission[target_levels].values):
        raise ParticipantVisibleError('All submission values must be numeric')

if not np.isfinite(submission[target_levels].values).all():
    raise ParticipantVisibleError('All submission values must be finite')

if solutions_pd[target_levels].min().min() < 0:
    raise ParticipantVisibleError('All labels must be at least zero')
if submission[target_levels].min().min() < 0:
    raise ParticipantVisibleError('All predictions must be at least zero')

In [31]:
solutions_pd['study_id'] = solutions_pd['row_id'].apply(lambda x: x.split('_')[0])
solutions_pd['location'] = solutions_pd['row_id'].apply(lambda x: '_'.join(x.split('_')[1:]))
solutions_pd['condition'] = solutions_pd['row_id'].apply(get_condition)

In [32]:
row_id_column_name = 'row_id'

del solutions_pd[row_id_column_name]
del submission[row_id_column_name]
assert sorted(submission.columns) == sorted(target_levels)

submission['study_id'] = solutions_pd['study_id']
submission['location'] = solutions_pd['location']
submission['condition'] = solutions_pd['condition']

#### Testing for one condition to understand log loss

In [33]:
condition_losses = []
condition_weights = []

condition_indices = solutions_pd.loc[solutions_pd['condition'] == 'spinal'].index.values

In [73]:
y_true=solutions_pd.loc[condition_indices, target_levels].values
y_true

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       ...,
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0]], dtype=int32)

In [72]:
y_pred=submission.loc[condition_indices, target_levels].values
y_pred

array([[0.06724037, 0.87429702, 0.05846261],
       [0.06496824, 0.87956232, 0.05546964],
       [0.06813553, 0.87197536, 0.059889  ],
       ...,
       [0.333333  , 0.333333  , 0.333333  ],
       [0.333333  , 0.333333  , 0.333333  ],
       [0.333333  , 0.333333  , 0.333333  ]])

In [59]:
sample_weight=solutions_pd.loc[condition_indices, 'sample_weight'].values
sample_weight

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 4,
       1, 1, 1, 1, 1, 1, 1, 4, 4, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 4, 1,
       4, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1,

In [74]:
print(sample_weight[163:165])
print(y_true[163:165])
print(y_pred[163:165])

[1 4]
[[1 0 0]
 [0 0 1]]
[[0.333333 0.333333 0.333333]
 [0.333333 0.333333 0.333333]]


In [None]:
condition_losses = []
condition_weights = []
for condition in ['spinal', 'foraminal', 'subarticular']:
    condition_indices = solutions_pd.loc[solutions_pd['condition'] == condition].index.values
    condition_loss = log_loss(
        y_true=solutions_pd.loc[condition_indices, target_levels].values,
        y_pred=submission.loc[condition_indices, target_levels].values,
        sample_weight=solutions_pd.loc[condition_indices, 'sample_weight'].values
    )
    condition_losses.append(condition_loss)
    condition_weights.append(1)

In [None]:
condition_losses

In [None]:
any_severe_spinal_labels = pd.Series(solutions_pd.loc[solutions_pd['condition'] == 'spinal'].groupby('study_id')['severe'].max())
any_severe_spinal_weights = pd.Series(solutions_pd.loc[solutions_pd['condition'] == 'spinal'].groupby('study_id')['sample_weight'].max())
any_severe_spinal_predictions = pd.Series(submission.loc[submission['condition'] == 'spinal'].groupby('study_id')['severe'].max())

In [None]:
any_severe_scalar = 1.0

any_severe_spinal_loss = log_loss(
    y_true=any_severe_spinal_labels,
    y_pred=any_severe_spinal_predictions,
    sample_weight=any_severe_spinal_weights
)
condition_losses.append(any_severe_spinal_loss)
condition_weights.append(any_severe_scalar)

print("final score during training : ", np.average(condition_losses, weights=condition_weights))

# Training

### Custom Loss function with weights

In [31]:
def create_wtd_log_loss(sample_weights):
    def wtd_log_loss(y_true, y_pred):
        return log_loss(y_true, y_pred, sample_weight=sample_weights)
    return wtd_log_loss

In [34]:
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras import layers

with strategy.scope():
    
    rsna_input = layers.Input(shape=(128,128,3), name="rsna_input")
    #weights = layers.Input(shape=(1,3), name="weights")
    
    conv_base = EfficientNetB0(include_top=False, weights="imagenet", input_tensor=rsna_input)
    conv_base.trainable = False
    
    x = layers.GlobalAveragePooling2D(name="avg_pool")(conv_base.output)
    x = layers.BatchNormalization()(x)
    
    hidden_layer1 = layers.Dense(200, activation="relu", kernel_initializer=keras.initializers.LecunNormal(seed=None))(x)
    hidden_layer2 = layers.Dense(100, activation="selu")(hidden_layer1)
    hidden_layer3 = layers.Dense(50, activation="selu")(hidden_layer2)
    rsna_output = layers.Dense(3, activation="softmax")(hidden_layer3)
    #model = tf.keras.models.Model(inputs=rsna_input, outputs=rsna_output)
    model = tf.keras.Model(rsna_input, rsna_output)
    
    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("keras_effnet_spinal_canal_stenosis_l1_l2.keras")
    early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

    #model.compile(loss="sparse_categorical_crossentropy", optimizer="adamax", metrics=["accuracy"])
    model.compile(loss=create_wtd_log_loss(z), optimizer="adamax", metrics=["accuracy"])

In [None]:
config = model.get_config()
print(config["layers"][0]["config"])

In [None]:
model.summary()

In [37]:
history = model.fit(train_ds, epochs=5, validation_data=valid_ds, callbacks=[checkpoint_cb, early_stopping_cb])

Epoch 1/5




TypeError: Axis must be specified when shapes of a and weights differ.

In [None]:
pd.DataFrame(history.history).plot(figsize=(10,6))
plt.grid(True)
plt.gca().set_ylim(0.60,1)
plt.show()

In [None]:
model.save("keras_base_spinal_canal_stenosis_l1_l2.h5")

In [None]:
cropped_image = tf.image.resize_with_crop_or_pad(image, 100, 100)

In [None]:
fig, axes = plt.subplots(1,1, figsize=(5,5))
axes.imshow(np.squeeze(cropped_image.numpy()), cmap='gray')
axes.set_title('image')

## Testing inference with the trained model

In [5]:
model = keras.models.\
load_model("/kaggle/input/keras_base_scs_l1_l2/tensorflow2/default/1/keras_base_spinal_canal_stenosis_l1_l2.h5")

### Remove the squeeze from image file preprocessing

In [75]:
def read_and_parse_dicom_files_for_inf(full_file_path):
    tf.config.run_functions_eagerly(True)
    raw_image = tf.io.read_file(full_file_path)
    sp = tf.strings.split(tf.gather(tf.strings.split(full_file_path, 'images/'), 1), '/')
    N = tf.size(sp)
    LEN = tf.strings.length(tf.gather(sp, 0))+tf.strings.length(tf.gather(sp, 2))
    
    # Add missing file metadata to avoid warnnigs flooding
    if   LEN==12: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x92\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==13: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x92\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==14: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x94\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==15: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x94\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==16: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x96\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==17: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x96\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==18: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x98\x00\x00\x00\x02\x00\x01\x00')
    
    #image_bytes = tf.io.read_file(full_file_path)
    #image = tfio.image.decode_dicom_image(image_bytes, scale='auto', dtype=tf.float32)
    image = tfio.image.decode_dicom_image(raw_image, scale='auto', dtype=tf.float32)
    m, M=tf.math.reduce_min(image), tf.math.reduce_max(image)
    image = (tf.image.grayscale_to_rgb(image)-m)/(M-m)
    image = tf.image.resize(image, (128,128))
    return image

In [76]:
"""test_image = \
read_and_parse_dicom_files_for_inf('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/test_images/44036939/3481971518/1.dcm')
"""
test_image = \
read_and_parse_dicom_files_for_inf('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_images/100206310/1012284084/20.dcm')

In [77]:
test_image.shape

TensorShape([1, 128, 128, 3])

In [78]:
y_proba = model.predict(test_image)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 252ms/step


In [79]:
y_proba

array([[0.0680268 , 0.87121695, 0.06075624]], dtype=float32)

In [80]:
train_studies_metadata_df.filter(pl.col('study_id')==100206310)

study_id,spinal_canal_stenosis_l1_l2,spinal_canal_stenosis_l2_l3,spinal_canal_stenosis_l3_l4,spinal_canal_stenosis_l4_l5,spinal_canal_stenosis_l5_s1,left_neural_foraminal_narrowing_l1_l2,left_neural_foraminal_narrowing_l2_l3,left_neural_foraminal_narrowing_l3_l4,left_neural_foraminal_narrowing_l4_l5,left_neural_foraminal_narrowing_l5_s1,right_neural_foraminal_narrowing_l1_l2,right_neural_foraminal_narrowing_l2_l3,right_neural_foraminal_narrowing_l3_l4,right_neural_foraminal_narrowing_l4_l5,right_neural_foraminal_narrowing_l5_s1,left_subarticular_stenosis_l1_l2,left_subarticular_stenosis_l2_l3,left_subarticular_stenosis_l3_l4,left_subarticular_stenosis_l4_l5,left_subarticular_stenosis_l5_s1,right_subarticular_stenosis_l1_l2,right_subarticular_stenosis_l2_l3,right_subarticular_stenosis_l3_l4,right_subarticular_stenosis_l4_l5,right_subarticular_stenosis_l5_s1
i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
100206310,"""Normal/Mild""","""Normal/Mild""","""Moderate""","""Severe""","""Normal/Mild""","""Normal/Mild""","""Moderate""","""Moderate""","""Severe""","""Moderate""","""Moderate""","""Moderate""","""Severe""","""Moderate""","""Normal/Mild""","""Normal/Mild""","""Normal/Mild""","""Moderate""","""Severe""","""Moderate""","""Normal/Mild""","""Normal/Mild""","""Moderate""","""Moderate""","""Moderate"""


In [84]:
train_studies_metadata_df_final_2.filter((pl.col('study_id')==100206310) & (pl.col('row_id')=='100206310_spinal_canal_stenosis_l1_l2'))

study_id,row_id,sample_weight,normal_mild,moderate,severe
i64,str,i32,i32,i32,i32
100206310,"""100206310_spinal_canal_stenosi…",1,1,0,0


In [89]:
print(np.array([[1,0,0]]).shape)
print(y_proba.shape)
print(np.array([1]).shape)

(1, 3)
(1, 3)
(1,)


In [86]:
print(log_loss(np.array([[1,0,0]]), y_proba, sample_weight=np.array([1])))

2.68785348332437


## How weighted log loss works for N samples
### Using the example below
#### Two samples, each sample has 3 classes with true labels (y_true) & predicted probabilities (y_pred)
#### the 1st class has a weight of 1 and 2nd class has a weight 2. The number of weights correspond to the true labels of the two samples, 
#### the weight associated with the 1st class is 1 and with the 2nd class is 2, since the first sample belongs to class 1, hence 1 will be multiplied with the predicted probability

In [54]:
y_true = np.array([[1,0,0],[0,1,0]])
y_pred = np.array([[0.0680268 , 0.87121695, 0.06075624],[0.0680268 , 0.87121695, 0.06075624]])
sample_weight = np.array([1,2])

print(log_loss(y_true, y_pred, sample_weight=sample_weight))
print(log_loss(y_true, y_pred))

0.9878606690208339
1.4128588827163582


In [91]:
# Loss of 1st sample
loss_1 = -(1 * np.log(0.0680268) + 0 * np.log(0.87121695) + 0 * np.log(0.06075624))
# Loss of 2nd sample
loss_2 = -(0 * np.log(0.0680268) + 1 * np.log(0.87121695) + 0 * np.log(0.06075624))

loss_without_wt = np.average([loss_1, loss_2])
loss_with_wt = np.average([loss_1, loss_2], weights=[1,2])

print("loss_without_wt : ", loss_without_wt)
print("loss_with_wt : ", loss_with_wt)

loss_without_wt :  1.4128588927163583
loss_with_wt :  0.9878606790208341


In [None]:
model.evaluate(test_ds)