In [1]:
!pip install duckdb --no-index --find-links=file:///kaggle/input/polars-and-duckdb/kaggle/working/mysitepackages/duck_pkg
!pip install polars --no-index --find-links=file:///kaggle/input/polars-and-duckdb/kaggle/working/mysitepackages/polars_pkg

Looking in links: file:///kaggle/input/polars-and-duckdb/kaggle/working/mysitepackages/duck_pkg
Processing /kaggle/input/polars-and-duckdb/kaggle/working/mysitepackages/duck_pkg/duckdb-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: duckdb
Successfully installed duckdb-0.8.1
Looking in links: file:///kaggle/input/polars-and-duckdb/kaggle/working/mysitepackages/polars_pkg


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import polars as pl
import duckdb as dd
#from tqdm import tqdm
import matplotlib.pyplot as plt
#import cv2
#from pydicom import dcmread
import warnings
#from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import pickle
#import gc
import ctypes
import tensorflow as tf
import tensorflow_io as tfio
from tensorflow import keras

In [3]:
try: # detect TPUs
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError: # detect GPUs
    strategy = tf.distribute.MirroredStrategy() # for GPU or multi-GPU machines
    #strategy = tf.distribute.get_strategy() # default strategy that works on CPU and single GPU
    #strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() # for clusters of multi-GPU machines

print("Number of accelerators: ", strategy.num_replicas_in_sync)

Number of accelerators:  2


In [4]:
print('DEVICES AVAILABLE: {}'.format(strategy.num_replicas_in_sync))

BATCH_SIZE_PER_REPLICA = 48

BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

DEVICES AVAILABLE: 2


## Preprocessing functions

In [5]:
def read_and_parse_dicom_files(full_file_path):
    tf.config.run_functions_eagerly(True)
    raw_image = tf.io.read_file(full_file_path)
    sp = tf.strings.split(tf.gather(tf.strings.split(full_file_path, 'images/'), 1), '/')
    N = tf.size(sp)
    LEN = tf.strings.length(tf.gather(sp, 0))+tf.strings.length(tf.gather(sp, 2))
    
    # Add missing file metadata to avoid warnnigs flooding
    if   LEN==12: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x92\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==13: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x92\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==14: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x94\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==15: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x94\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==16: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x96\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==17: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x96\x00\x00\x00\x02\x00\x01\x00')
    elif LEN==18: raw_image = tf.strings.regex_replace(raw_image, pattern=b'DICM\x02\x00\x01\x00', rewrite=b'DICM\x02\x00\x00\x00UL\x04\x00\x98\x00\x00\x00\x02\x00\x01\x00')
    
    #image_bytes = tf.io.read_file(full_file_path)
    #image = tfio.image.decode_dicom_image(image_bytes, scale='auto', dtype=tf.float32)
    image = tfio.image.decode_dicom_image(raw_image, scale='auto', dtype=tf.float32)
    m, M=tf.math.reduce_min(image), tf.math.reduce_max(image)
    image = (tf.image.grayscale_to_rgb(image)-m)/(M-m)
    image = tf.image.resize(image, (128,128))
    return tf.squeeze(image)

def load_dataset(image_path, labels):
    image = read_and_parse_dicom_files(image_path)
    return {"images": tf.cast(image, tf.float32), "labels": tf.cast(labels, tf.float32)}

def dict_to_tuple(inputs):
    return inputs["images"], inputs["labels"]

## Train, Test, Validation & holdout splits
### holdout set to be used for CV

In [16]:
condition_for_training = 'spinal_canal_stenosis'
vertebrae_position = 'l2_l3'

In [8]:
metadata_file_path = '/kaggle/input/spinal-canal-stenosis-metadata/{0}_{1}_feature_metadata.csv'.format(condition_for_training, vertebrae_position)
metadata_df = pl.read_csv(metadata_file_path, low_memory=True)

In [9]:
for_train, holdout = train_test_split(metadata_df, test_size=0.4, random_state=42)

x_train, x_test_val = train_test_split(for_train, test_size=0.3, random_state=42)
x_test, x_valid = train_test_split(x_test_val, test_size=0.2, random_state=42)

print("Training data shape : {0}".format(x_train.shape))
print("Test data shape : {0}".format(x_test.shape))
print("Validation data shape : {0}".format(x_valid.shape))
print("Holdout data shape : {0}".format(holdout.shape))

Training data shape : (61794, 3)
Test data shape : (21187, 3)
Validation data shape : (5297, 3)
Holdout data shape : (58853, 3)


In [10]:
holdout_image_filenames = pl.Series(holdout.select(pl.col('full_img_path'))).to_list()
holdout_image_labels = pl.Series(holdout.select(pl.col('encoded_severity'))).to_list()

train_image_filenames = pl.Series(x_train.select(pl.col('full_img_path'))).to_list()
train_image_labels = pl.Series(x_train.select(pl.col('encoded_severity'))).to_list()

test_image_filenames = pl.Series(x_test.select(pl.col('full_img_path'))).to_list()
test_image_labels = pl.Series(x_test.select(pl.col('encoded_severity'))).to_list()

valid_image_filenames = pl.Series(x_valid.select(pl.col('full_img_path'))).to_list()
valid_image_labels = pl.Series(x_valid.select(pl.col('encoded_severity'))).to_list()

In [11]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_image_filenames, train_image_labels))

test_dataset = tf.data.Dataset.from_tensor_slices((test_image_filenames, test_image_labels))

valid_dataset = tf.data.Dataset.from_tensor_slices((valid_image_filenames, valid_image_labels))

holdout_dataset = tf.data.Dataset.from_tensor_slices((holdout_image_filenames, holdout_image_labels))

In [12]:
train_ds = train_dataset.map(load_dataset, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.map(dict_to_tuple, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.batch(batch_size=BATCH_SIZE, drop_remainder=True)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

test_ds = test_dataset.map(load_dataset, num_parallel_calls=tf.data.AUTOTUNE)
test_ds = test_ds.map(dict_to_tuple, num_parallel_calls=tf.data.AUTOTUNE)
test_ds = test_ds.batch(batch_size=BATCH_SIZE, drop_remainder=True)
test_ds = test_ds.prefetch(tf.data.AUTOTUNE)

valid_ds = valid_dataset.map(load_dataset, num_parallel_calls=tf.data.AUTOTUNE)
valid_ds = valid_ds.map(dict_to_tuple, num_parallel_calls=tf.data.AUTOTUNE)
valid_ds = valid_ds.batch(batch_size=BATCH_SIZE, drop_remainder=True)
valid_ds = valid_ds.prefetch(tf.data.AUTOTUNE)

holdout_ds = holdout_dataset.map(load_dataset, num_parallel_calls=tf.data.AUTOTUNE)
holdout_ds = holdout_ds.map(dict_to_tuple, num_parallel_calls=tf.data.AUTOTUNE)
holdout_ds = holdout_ds.batch(batch_size=BATCH_SIZE, drop_remainder=True)
holdout_ds = holdout_ds.prefetch(tf.data.AUTOTUNE)



In [13]:
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras import layers

with strategy.scope():
    
    rsna_input = layers.Input(shape=(128,128,3), name="rsna_input")
    
    conv_base = EfficientNetB0(include_top=False, weights="imagenet", input_tensor=rsna_input)
    conv_base.trainable = False
    
    x = layers.GlobalAveragePooling2D(name="avg_pool")(conv_base.output)
    x = layers.BatchNormalization()(x)
    
    hidden_layer1 = layers.Dense(200, activation="relu", kernel_initializer=keras.initializers.LecunNormal(seed=None))(x)
    hidden_layer2 = layers.Dense(100, activation="selu")(hidden_layer1)
    hidden_layer3 = layers.Dense(50, activation="selu")(hidden_layer2)
    rsna_output = layers.Dense(3, activation="softmax")(hidden_layer3)
    model = tf.keras.Model(rsna_input, rsna_output)
    
    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("keras_effnet_{0}_{1}.keras".format(condition_for_training, vertebrae_position))
    early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

    model.compile(loss="sparse_categorical_crossentropy", optimizer="adamax", metrics=["accuracy"])

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [14]:
history = model.fit(train_ds, class_weight={0:2, 1:1, 2:4}, epochs=7, validation_data=valid_ds, callbacks=[checkpoint_cb, early_stopping_cb])

Epoch 1/7
[1m643/643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m961s[0m 1s/step - accuracy: 0.8617 - loss: 0.9834 - val_accuracy: 0.8867 - val_loss: 0.7618
Epoch 2/7
[1m643/643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m959s[0m 1s/step - accuracy: 0.8836 - loss: 0.8943 - val_accuracy: 0.8867 - val_loss: 0.6995
Epoch 3/7
[1m643/643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m951s[0m 1s/step - accuracy: 0.8829 - loss: 0.8863 - val_accuracy: 0.8867 - val_loss: 0.5944
Epoch 4/7
[1m643/643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m942s[0m 1s/step - accuracy: 0.8848 - loss: 0.8798 - val_accuracy: 0.8867 - val_loss: 0.5867
Epoch 5/7
[1m643/643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m937s[0m 1s/step - accuracy: 0.8853 - loss: 0.8749 - val_accuracy: 0.8867 - val_loss: 0.5925
Epoch 6/7
[1m643/643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m942s[0m 1s/step - accuracy: 0.8858 - loss: 0.8732 - val_accuracy: 0.8867 - val_loss: 0.5131
Epoch 7/7
[1m643/643[0m [

In [17]:
model.save("keras_base_{0}_{1}.h5".format(condition_for_training, vertebrae_position))

In [18]:
model.evaluate(test_ds)



[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 856ms/step - accuracy: 0.8892 - loss: 0.5085


[0.5126920938491821, 0.8865530490875244]