# Load libraries

In [1]:
import os
import tensorflow.keras
import h5py
import io

import numpy as np
import pandas as pd
import tensorflow as tf
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, classification_report, confusion_matrix, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder, KBinsDiscretizer
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Activation, LeakyReLU, ReLU, Flatten, BatchNormalization, Input, Reshape, Conv2D, MaxPooling2D
from tensorflow.keras.activations import relu,sigmoid,softmax

2024-08-10 15:07:57.271323: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-10 15:07:57.271461: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-10 15:07:57.441892: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Global variable initialization

In [None]:
# Root directory of the competition data. Change this single value to run the
# notebook against a copy of the dataset stored somewhere else.
DATA_DIR = '/kaggle/input/isic-2024-challenge'

# HDF5 archives with the image bytes (one dataset per image).
TRAIN_IMAGES_PATH = f'{DATA_DIR}/train-image.hdf5'
TEST_IMAGES_PATH = f'{DATA_DIR}/test-image.hdf5'

# Tabular metadata, one row per image.
TRAIN_METADATA_PATH = f'{DATA_DIR}/train-metadata.csv'
TEST_METADATA_PATH = f'{DATA_DIR}/test-metadata.csv'

## Images processing function

In [None]:
def process_image(hdf5_file_path: str) -> pd.DataFrame:
    """Load every image of an HDF5 archive as one flattened 32x32 grayscale row.

    Each top-level dataset in the file is read as the encoded bytes of one
    image. The image is decoded as grayscale, divided by 255, resized to
    32x32 and flattened to 1024 values.

    Parameters
    ----------
    hdf5_file_path : str
        Path of the HDF5 file to read.

    Returns
    -------
    pd.DataFrame
        One row per dataset: integer-labelled pixel columns ``0..1023``
        followed by an ``image_code`` column holding the dataset name. An
        archive without datasets yields an empty frame with those columns.
    """
    side = 32
    image_codes = []
    pixel_rows = []

    with h5py.File(hdf5_file_path, 'r') as f:

        for dataset_name in f.keys():

            # 'f[dataset_name][()]' retrieves the binary image data and
            # 'io.BytesIO' exposes it as a file-like object for the decoder.
            image_grayscale = load_img(io.BytesIO(f[dataset_name][()]), color_mode='grayscale')

            image_array = img_to_array(image_grayscale) / 255.0

            pixel_rows.append(tf.image.resize(image_array, (side, side)).numpy().flatten())

            # The dataset name is the unique identifier of the image.
            image_codes.append(str(dataset_name))

    if not pixel_rows:
        return pd.DataFrame(columns=[*range(side * side), "image_code"])

    # Build the table once from a single 2-D array; creating one DataFrame per
    # image and concatenating them is orders of magnitude slower.
    df_image_pixels = pd.DataFrame(np.stack(pixel_rows))
    df_image_pixels["image_code"] = image_codes

    return df_image_pixels

## Metadata processing function

In [None]:
def process_metadata(csv_file_path: str) -> pd.DataFrame:
    """Read a metadata CSV with explicit column dtypes.

    Parameters
    ----------
    csv_file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        The parsed table. Identifier and categorical columns are read as
        ``string``; measurements are read as ``float64`` / ``int64`` as listed
        in the dtype mapping below.
    """
    df_meta = pd.read_csv(
        csv_file_path,
        dtype = {
            'isic_id': 'string',
            'target': 'int64',
            'patient_id': 'string',
            'age_approx': 'float64',
            'sex': 'string',
            'anatom_site_general': 'string',
            'clin_size_long_diam_mm': 'float64',
            'image_type': 'string',
            'tbp_tile_type': 'string',
            'tbp_lv_A': 'float64',
            'tbp_lv_Aext': 'float64',
            'tbp_lv_B': 'float64',
            'tbp_lv_Bext': 'float64',
            'tbp_lv_C': 'float64',
            'tbp_lv_Cext': 'float64',
            'tbp_lv_H': 'float64',
            'tbp_lv_Hext': 'float64',
            'tbp_lv_L': 'float64',
            'tbp_lv_Lext': 'float64',
            'tbp_lv_areaMM2': 'float64',
            'tbp_lv_area_perim_ratio': 'float64',
            'tbp_lv_color_std_mean': 'float64',
            'tbp_lv_deltaA': 'float64',
            'tbp_lv_deltaB': 'float64',
            'tbp_lv_deltaL': 'float64',
            'tbp_lv_deltaLB': 'float64',
            'tbp_lv_deltaLBnorm': 'float64',
            'tbp_lv_eccentricity': 'float64',
            'tbp_lv_location': 'string',
            'tbp_lv_location_simple': 'string',
            'tbp_lv_minorAxisMM': 'float64',
            'tbp_lv_nevi_confidence': 'float64',
            'tbp_lv_norm_border': 'float64',
            'tbp_lv_norm_color': 'float64',
            'tbp_lv_perimeterMM': 'float64',
            'tbp_lv_radial_color_std_max': 'float64',
            'tbp_lv_stdL': 'float64',
            'tbp_lv_stdLExt': 'float64',
            'tbp_lv_symm_2axis': 'float64',
            'tbp_lv_symm_2axis_angle': 'int64',
            'tbp_lv_x': 'float64',
            'tbp_lv_y': 'float64',
            'tbp_lv_z': 'float64',
            'attribution': 'string',
            'copyright_license': 'string',
            'lesion_id': 'string',
            'iddx_full': 'string',
            'iddx_1': 'string',
            'iddx_2': 'string',
            'iddx_3': 'string',
            'iddx_4': 'string',
            'iddx_5': 'string',
            'mel_mitotic_index': 'string',
            'mel_thick_mm': 'float64',
            'tbp_lv_dnn_lesion_confidence': 'float64'
        }
    )

    # Without this return the function yields None and every caller breaks.
    return df_meta

    

# Load data

In [None]:
# Decode the two image archives first, then read the two metadata tables.
df_train_images, df_test_images = (
    process_image(archive_path)
    for archive_path in (TRAIN_IMAGES_PATH, TEST_IMAGES_PATH)
)

df_train_metadata, df_test_metadata = (
    process_metadata(table_path)
    for table_path in (TRAIN_METADATA_PATH, TEST_METADATA_PATH)
)

# Undersample the "benign" data for balancing purposes

In [None]:
# `df` and `df_meta` were never defined on a fresh kernel. Bind them to the
# training tables loaded above. The rename returns a new frame, so
# df_train_metadata stays untouched and this cell can be re-run safely.
df = df_train_images
df_meta = df_train_metadata.rename(columns={"isic_id": "image_code"})

# Join pixels and metadata on the image identifier (inner join).
merged_df = pd.merge(df, df_meta, on='image_code')

In [None]:
# Split the merged table by label value: 0 -> benign rows, 1 -> malign rows.
benign_df = merged_df.loc[merged_df['target'].eq(0)]
malign_df = merged_df.loc[merged_df['target'].eq(1)]

In [None]:
# Undersample the benign class; every malign row is kept.
# `sampled_benign_df` was previously never defined, so the cell failed on a
# fresh kernel. 3000 is the benign count shown in the distribution plot of
# this notebook; the fixed seed makes the sample reproducible.
N_BENIGN_SAMPLES = 3000
sampled_benign_df = benign_df.sample(
    n=min(N_BENIGN_SAMPLES, len(benign_df)),
    random_state=42,
)

final_df = pd.concat([sampled_benign_df, malign_df])

In [None]:
# Column names of the pixel table (includes "image_code") and of the metadata
# table (includes "image_code" and "target").
image_columns = df.columns.tolist()
meta_columns = df_meta.columns.tolist()

image_columns_with_target = image_columns + ['target']

# Take explicit copies: later cells add and drop columns on both frames, and
# doing that on a selection of final_df raises SettingWithCopyWarning and can
# leave the data silently unchanged.
image_df = final_df[image_columns_with_target].copy()
meta_df = final_df[meta_columns].copy()

In [None]:
# Number of rows per label in the resampled image table.
image_df["target"].value_counts()

In [None]:
# Class counts are read from the resampled data instead of being typed in by
# hand, so the chart cannot drift from the frame it describes.
target_counts = image_df["target"].value_counts()
melanoma_counts = pd.Series({
    'benign': int(target_counts.get(0, 0)),
    'malign': int(target_counts.get(1, 0)),
})

colors = ['lavender', 'pink']
bars = plt.bar(melanoma_counts.index, melanoma_counts, color=colors)

# Write each count just above its bar. Iterate over the values directly:
# positional lookups such as melanoma_counts[i] on a string-labelled Series
# are deprecated in pandas.
for bar, count in zip(bars, melanoma_counts):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
             str(count), ha='center', va='bottom')

plt.legend(bars, ['benign', 'malign'])
plt.title("Melanoma distribution after resampling")

plt.show()

# Drop columns with a lot of nulls

In [None]:
# Missing-value count for every metadata column.
meta_df.isnull().sum()

In [None]:
# Drop the columns with many nulls. Reassigning (instead of inplace=True)
# avoids mutating a frame derived from final_df, and errors="ignore" lets the
# cell be re-run without a KeyError once the columns are gone.
meta_df = meta_df.drop(
    columns=["lesion_id", "iddx_2", "iddx_3", "iddx_4", "iddx_5", "mel_mitotic_index", "mel_thick_mm"],
    errors="ignore",
)

# Label encoding

In [None]:
# One LabelEncoder instance, re-fitted for each categorical column below.
enc = LabelEncoder()

#### Sex

In [None]:
# Fill missing values with the same 'unknown' label used for the anatomical
# site column (was misspelled 'unkown'). Column assignment replaces the
# chained fillna(inplace=True), which does not reliably modify the frame.
meta_df['sex'] = meta_df['sex'].fillna('unknown')

# Encode on meta_df: the original wrote `sex_enc` to df_meta, so the frame
# used for modelling never received the column.
meta_df['sex_enc'] = enc.fit_transform(meta_df['sex'].astype('str'))

In [None]:
# Label distribution per sex, drawn on an explicitly created 5x3 axes.
fig, ax = plt.subplots(figsize=(5, 3))
sns.countplot(data=meta_df, x='sex', hue='target', palette="pastel", ax=ax)

#### Anatom_site_general

In [None]:
# Replace missing anatomical sites with 'unknown', then integer-encode.
anatom_site_filled = meta_df['anatom_site_general'].fillna('unknown')
meta_df['anatom_site_general'] = anatom_site_filled
meta_df['anatom_enc'] = enc.fit_transform(anatom_site_filled.astype('str'))

#### Age

In [None]:
# Impute missing ages with the most frequent age.
most_common_age = meta_df['age_approx'].mode().iloc[0]
meta_df['age_approx'] = meta_df['age_approx'].fillna(most_common_age)

# Encode the numeric values directly. Casting to str first made LabelEncoder
# sort the ages lexicographically ('5.0' after '45.0'), so the resulting codes
# did not follow the age order.
meta_df['age_enc'] = enc.fit_transform(meta_df['age_approx'])

In [None]:
# Label distribution per approximate age, drawn on an explicit 10x5 axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=meta_df, x='age_approx', hue='target', palette="pastel", ax=ax)

#### Images per patient

In [None]:
# Count the images recorded for each patient, then broadcast that count back
# onto every row of the same patient.
images_per_patient = meta_df.groupby(['patient_id'])['image_code'].count()
meta_df['n_images'] = meta_df['patient_id'].map(images_per_patient)

#### Categorize number of images per patient

In [None]:
# Bucket the per-patient image count into 10 equal-width ordinal bins.
categorize = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')

n_images_column = meta_df['n_images'].values.reshape(-1, 1)  # 2-D input expected by the discretizer
n_images_bins = categorize.fit_transform(n_images_column)
meta_df['n_images_enc'] = n_images_bins.astype(int).squeeze()

In [None]:
# Label distribution per image-count bin, drawn on an explicit 6x3 axes.
fig, ax = plt.subplots(figsize=(6, 3))
sns.countplot(data=meta_df, x='n_images_enc', hue='target', palette="pastel", ax=ax)

#### tbp tile type

In [None]:
# Integer-encode the tile type (values are cast to str before encoding).
meta_df['tbp_tile_type_enc'] = enc.fit_transform(meta_df['tbp_tile_type'].astype('str'))

#### iddx_full

In [None]:
# Integer-encode the full diagnosis string (values are cast to str first).
# NOTE(review): iddx_* looks like diagnosis information — confirm it is not
# used as a model input, since that would leak the label.
meta_df['iddx_full_enc'] = enc.fit_transform(meta_df['iddx_full'].astype('str'))

#### iddx_1

In [None]:
# Integer-encode the first-level diagnosis (values are cast to str first).
# NOTE(review): iddx_* looks like diagnosis information — confirm it is not
# used as a model input, since that would leak the label.
meta_df['iddx_1_enc'] = enc.fit_transform(meta_df['iddx_1'].astype('str'))

#### tbp_lv_location

In [None]:
# Integer-encode the detailed lesion location (values are cast to str first).
meta_df['tbp_lv_location_enc'] = enc.fit_transform(meta_df['tbp_lv_location'].astype('str'))

#### tbp_lv_location_simple

In [None]:
# Integer-encode the simplified lesion location (values are cast to str first).
meta_df['tbp_lv_location_simple_enc'] = enc.fit_transform(meta_df['tbp_lv_location_simple'].astype('str'))

# metadata feature selection

#### Drop the raw, non-encoded columns

In [None]:
# Raw text/identifier columns that are either replaced by an *_enc column or
# not used as features. Each label is listed once (the original list repeated
# the two location columns).
RAW_COLUMNS_TO_DROP = [
    "patient_id", "age_approx", "sex", "anatom_site_general",
    "image_type", "tbp_tile_type",
    "tbp_lv_location", "tbp_lv_location_simple",
    "attribution", "copyright_license",
    "iddx_full", "iddx_1",
]

# Reassign instead of inplace=True; errors="ignore" keeps the cell re-runnable.
meta_df = meta_df.drop(columns=RAW_COLUMNS_TO_DROP, errors="ignore")

#### make a copy for concat models later

In [None]:
# Independent snapshot of the encoded metadata taken while it still contains
# "image_code"; that key is used later to join metadata with the image pixels.
sprmdl_meta = meta_df.copy()
sprmdl_meta.head()

#### drop image code column

In [None]:
# The identifier is not a feature; remove it from the modelling frame.
meta_df = meta_df.drop(columns=["image_code"])

#### correlation matrix

In [None]:
# Absolute Pearson correlation between all columns, shown as a heat-mapped table.
corr = meta_df.corr(method='pearson').abs()
corr.style.background_gradient(cmap='inferno')

#### metadata features and target

In [None]:
# Feature columns for the metadata model.
# "iddx_full_enc" and "iddx_1_enc" are deliberately NOT included: the iddx_*
# columns hold the lesion diagnosis, i.e. the information the target is
# derived from, so using them as inputs leaks the label into the model.
META_FEATURE_COLUMNS = [
    "clin_size_long_diam_mm", "tbp_lv_A",
    "tbp_lv_Aext", "tbp_lv_B", "tbp_lv_Bext", "tbp_lv_C", "tbp_lv_Cext",
    "tbp_lv_H", "tbp_lv_Hext", "tbp_lv_L", "tbp_lv_Lext", "tbp_lv_areaMM2",
    "tbp_lv_color_std_mean", "tbp_lv_deltaA", "tbp_lv_deltaB", "tbp_lv_deltaL",
    "tbp_lv_deltaLB", "tbp_lv_deltaLBnorm", "tbp_lv_eccentricity",
]

meta_features = meta_df[META_FEATURE_COLUMNS]

meta_target = meta_df["target"]

# images features and target

In [None]:
# Label vector and pixel matrix for the image model; the identifier and the
# label itself are removed from the inputs.
image_target = image_df["target"]

image_features = image_df.drop(columns=["image_code", "target"])

# Train/test split

In [None]:
# Hold out 20% of the rows. The classes are imbalanced, so the split is
# stratified on the label to keep the malign share equal in both parts.
# Both calls use the same seed and labels in the same row order, so image
# and metadata rows are partitioned identically.
img_train, img_test, target_img_train, target_img_test = train_test_split(
    image_features, image_target,
    test_size=0.20, random_state=0, stratify=image_target,
)

meta_train, meta_test, target_meta_train, target_meta_test = train_test_split(
    meta_features, meta_target,
    test_size=0.20, random_state=0, stratify=meta_target,
)

# Normalize metadata

In [None]:
# Scaler for the metadata features; it is fitted on the training split only.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split.
scaler.fit(meta_train)

In [None]:
# Apply the training-split statistics to both parts of the split.
meta_train_norm, meta_test_norm = (
    scaler.transform(split_part) for split_part in (meta_train, meta_test)
)

# ML models building

## image model: CNN

In [None]:
# Small CNN on the flattened 32x32 grayscale pixels: four conv/pool stages with
# a doubling filter count, then a dense head that outputs two logits.
cnn_layers = [Reshape((32, 32, 1), input_shape=(1024,))]

for n_filters in (16, 32, 64, 128):
    cnn_layers.append(Conv2D(n_filters, 3, padding='same', activation='relu'))
    cnn_layers.append(MaxPooling2D())

cnn_layers += [
    Flatten(),
    Dense(128, activation='relu'),  # Dense layer with 128 neurons
    Dense(2),                       # no activation: raw logits
]

img_model = Sequential(cnn_layers)

# from_logits=True matches the activation-free output layer.
img_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy', 'mse'],
)

In [None]:
# Show the layers of the network defined above.
img_model.summary()

In [None]:
# Train for 10 epochs; the held-out split is passed as validation data.
img_model.fit(img_train, target_img_train, epochs=10, validation_data=(img_test, target_img_test))

In [None]:
# Model outputs for the held-out images. These are logits, not probabilities:
# the final Dense layer has no activation.
img_predictions = img_model.predict(img_test)

In [None]:
# `tumour` and `score` were never defined. Class index 0 is benign and 1 is
# malign, matching the `target` values; softmax turns the two logits of the
# first held-out image into probabilities.
tumour = ['benign', 'malign']
score = tf.nn.softmax(img_predictions[0]).numpy()

print(
    "This image most likely belongs to {} with a {:.2f} percent confidence."
    .format(tumour[np.argmax(score)], 100 * np.max(score))
)

## Metadata model: LightGBM

In [None]:
# LightGBM views of the scaled train / held-out metadata.
meta_d_train = lgb.Dataset(meta_train_norm, label=target_meta_train)
meta_d_test = lgb.Dataset(meta_test_norm, label=target_meta_test)
watchlist = [meta_d_train, meta_d_test]

# Booster configuration for the binary metadata classifier.
lgbm_params = dict(
    learning_rate=0.3,
    boosting_type="dart",  # dart has been shown to prevent overfitting (see documentation)
    objective="binary",
    metric=["auc", "binary_logloss"],
    num_leaves=100,
    max_depth=10,
    verbosity=-1,
)

### Stratified K-fold

In [None]:
N_FOLDS = 10
folds = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# oof: out-of-fold prediction for every training row.
# sub: fold-averaged prediction for every held-out row.
oof = np.zeros(len(meta_train_norm))
sub = np.zeros(len(meta_test_norm))

scores = [0] * folds.n_splits

# Cross-validation loop: one booster per fold, scored on the rows it did not see.
for fold_, (train_idx, val_idx) in enumerate(folds.split(meta_train_norm, target_meta_train)):
    X_train, X_val = meta_train_norm[train_idx], meta_train_norm[val_idx]
    y_train, y_val = target_meta_train.iloc[train_idx], target_meta_train.iloc[val_idx]

    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)
    watchlist = [train_data, val_data]

    clf = lgb.train(lgbm_params, train_set=train_data, valid_sets=watchlist, num_boost_round=50)

    val_pred = clf.predict(X_val)
    oof[val_idx] = val_pred
    sub += clf.predict(meta_test_norm) / folds.n_splits

    scores[fold_] = roc_auc_score(y_val, val_pred)
    print("Fold {}: {}".format(fold_ + 1, round(scores[fold_], 5)))


cv_auc = roc_auc_score(target_meta_train, oof)
print("CV score (auc): {:<8.5f}, (std: {:<8.5f})".format(cv_auc, np.std(scores)))

In [None]:
# Use the fold-averaged held-out predictions accumulated in `sub` during
# cross-validation. `clf` is only the booster of the last fold, so predicting
# with it discarded the other nine models. The copy keeps `sub` intact when
# the predictions are thresholded in place afterwards.
meta_predictions = sub.copy()

#### Convert into binary values (0/1) for classification

In [None]:
# Binarise in place: scores of at least 0.8 become 1, everything else 0.
meta_predictions[:] = np.where(meta_predictions >= 0.8, 1, 0)

#### Check model performance

In [None]:
# scikit-learn metrics take the ground truth first and the predictions second.
# With the arguments swapped, precision and recall are exchanged in the report
# and the ROC AUC treats the predictions as the truth.
# Note: meta_predictions holds thresholded 0/1 labels at this point, so the
# AUC below is the AUC of the hard decisions, not of the raw scores.
print(f'Accuracy score: {metrics.accuracy_score(target_meta_test, meta_predictions)}')
print(f' ROC AUC score: {roc_auc_score(target_meta_test, meta_predictions)}')
print(classification_report(target_meta_test, meta_predictions))


In [None]:
# Confusion matrix of the held-out labels against the binarised predictions
# (rows: true class, columns: predicted class), drawn as an annotated heatmap.
cm_lgbm = confusion_matrix(y_true=target_meta_test, y_pred=meta_predictions)
sns.heatmap(data=cm_lgbm, annot=True)

# Concat CNN and LGBM models

In [None]:
# Encodings of the diagnosis columns. They describe the outcome itself, so
# they must not be model inputs (target leakage). errors="ignore" keeps the
# cell working if they are absent.
LEAKY_COLUMNS = ["iddx_full_enc", "iddx_1_enc"]

sprmdl_meta_features = (
    sprmdl_meta
    .drop(columns=["image_code", "target"])
    .drop(columns=LEAKY_COLUMNS, errors="ignore")
)
sprmdl_meta_target = sprmdl_meta["target"]
sprmdl_meta_key = sprmdl_meta["image_code"]

In [None]:
# Re-fit the shared `scaler` object on the full feature table of the combined
# model; this replaces the statistics it learned from meta_train earlier.
scaler.fit(sprmdl_meta_features)

In [None]:
# Keep the original row index on the scaled frame. scaler.transform returns a
# bare array, and wrapping it without `index=` produces a fresh 0..n-1 index.
# The image_code Series assigned afterwards still carries final_df's
# non-contiguous index, so pandas would align on mismatched labels and attach
# wrong or missing codes to the rows.
sprmdl_meta_features_norm = pd.DataFrame(
    scaler.transform(sprmdl_meta_features),
    columns=sprmdl_meta_features.columns,
    index=sprmdl_meta_features.index,
)

In [None]:
# Re-attach the image identifier (aligned on the row index) so the scaled
# metadata can be joined with the pixel table.
sprmdl_meta_features_norm = sprmdl_meta_features_norm.assign(image_code=sprmdl_meta_key)

sprmdl_meta_features_norm.head()

In [None]:
# Inner-join scaled metadata with pixels + label on the image identifier and
# persist the combined table for the next step.
supermodel_df = sprmdl_meta_features_norm.merge(image_df, on='image_code')
supermodel_df.to_csv('supermodel_df.csv', index=False)

In [None]:
# The in-memory pixel columns are labelled with integers 0..1023; only a CSV
# round trip turns them into strings. Normalise the labels to str so the
# string-based selection below works on both forms instead of raising a
# KeyError on the in-memory frame.
supermodel_df = supermodel_df.rename(columns=str)

image_columns = list(map(str, range(1024)))
non_metadata_columns = set(image_columns) | {'image_code', 'target'}
metadata_columns = [col for col in supermodel_df.columns if col not in non_metadata_columns]

image_data = supermodel_df[image_columns].values
metadata_features = supermodel_df[metadata_columns].values
labels = supermodel_df['target'].values

In [None]:
# --- Load the combined table written by the previous step --------------------
supermodel_df = pd.read_csv('supermodel_df.csv')

image_columns = list(map(str, range(1024)))
metadata_columns = [col for col in supermodel_df.columns if col not in image_columns + ['image_code', 'target']]

image_data = supermodel_df[image_columns].values
metadata_features = supermodel_df[metadata_columns].values
labels = supermodel_df['target'].values

# --- Define CNN model using the functional API -------------------------------
inputs = Input(shape=(1024,))
x = Reshape((32, 32, 1))(inputs)
x = Conv2D(16, 3, padding='same', activation='relu')(x)
x = MaxPooling2D()(x)
x = Conv2D(32, 3, padding='same', activation='relu')(x)
x = MaxPooling2D()(x)
x = Conv2D(64, 3, padding='same', activation='relu')(x)
x = MaxPooling2D()(x)
x = Conv2D(128, 3, padding='same', activation='relu')(x)
x = MaxPooling2D()(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
outputs = Dense(2)(x)

img_model = Model(inputs=inputs, outputs=outputs)
img_model.compile(optimizer='adam',
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy', 'mse'])

# Train the network before using it as a feature extractor. It was previously
# rebuilt here and never fitted, so the "CNN features" came from randomly
# initialised weights.
# NOTE(review): the CNN sees the labels of every row, including rows that are
# later used as validation folds, so the CV AUC below is optimistic. Fit the
# CNN inside each fold if an unbiased estimate is needed.
img_model.fit(image_data, labels, epochs=10)

# Expose the 128-unit dense layer (second to last) as the image embedding.
feature_extractor = Model(inputs=img_model.input, outputs=img_model.layers[-2].output)

cnn_features = feature_extractor.predict(image_data) #Extract features

combined_features = np.hstack((cnn_features, metadata_features)) # Combine CNN features with metadata

# --- LightGBM on the combined features ---------------------------------------
lgbm_params = {
    "learning_rate": 0.05,
    "boosting_type": "dart", #dart has been shown to prevent overfitting (see documentation)
    "objective": "binary",
    "metric": ["auc", "binary_logloss"],
    "num_leaves": 100,
    "max_depth": 10,
    'verbosity': -1
    }

N_FOLDS = 10
folds = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

oof = np.zeros(len(combined_features))
# Fold-averaged prediction over ALL rows; this is in-sample for most folds and
# is not a held-out estimate.
sub = np.zeros(len(combined_features))

scores = [0 for _ in range(folds.n_splits)]

for fold_, (train_idx, val_idx) in enumerate(folds.split(combined_features, labels)):
    X_train, y_train = combined_features[train_idx], labels[train_idx]
    X_val, y_val = combined_features[val_idx], labels[val_idx]

    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)
    watchlist = [train_data, val_data]

    clf = lgb.train(lgbm_params, train_set=train_data, valid_sets=watchlist, num_boost_round=50)

    oof[val_idx] = clf.predict(X_val)
    sub += clf.predict(combined_features) / folds.n_splits

    scores[fold_] = roc_auc_score(y_val, oof[val_idx])
    print("Fold {}: {}".format(fold_ + 1, round(scores[fold_], 5)))

print("CV AUC: {:.5f}".format(roc_auc_score(labels, oof)))
