# Load

In [1]:
import sys
# Put the competition scripts directory on the import path so the
# `scripts` package imported below can be resolved.
sys.path.append('/kaggle/input/isic2024-tomo20180402-scripts/isic2024_tomo20180402_scripts')
import os
# Restrict the calling process (pid 0) to CPU ids 0-23.
# NOTE(review): the 24 is hard-coded; presumably chosen for the original
# training machine - confirm it is intended for the runtime used here.
os.sched_setaffinity(0, range(0, 24))
import pandas as pd
import polars as pl
import warnings
# Ignore all warnings raised from this point on.
warnings.simplefilter('ignore')
from scripts.config import Config as cfg
from scripts.logger import create_logger
from scripts.data_loader import load_metadata, load_images
from scripts import feature_generation, image_dataset, image_training, tabular_training, common_training, prediction

In [2]:
# Create the logger used by every later cell.
logger = create_logger()

# Read the train / test metadata and remember the train columns as loaded,
# so that columns added later can be told apart from the original ones.
logger.info('start: load_metadata')
train_metadata_df, test_metadata_df = load_metadata(data_dir=cfg.DATA_DIR)
original_train_metadata_columns = train_metadata_df.columns

# Train images are read only on training runs; test images are always read.
if cfg.IS_TRAIN:
    logger.info('start: load_images train')
    train_imgs, train_isic_ids = load_images(
        image_hdf5_path=cfg.DATA_DIR / 'train-image.hdf5',
        metadata_df=train_metadata_df,
    )
logger.info('start: load_images test')
test_imgs, test_isic_ids = load_images(
    image_hdf5_path=cfg.DATA_DIR / 'test-image.hdf5',
    metadata_df=test_metadata_df,
)

# Record which compute device the config selected.
logger.info(f'device = {cfg.DEVICE}')

[INFO] 2024-12-26 13:34:56,396 >>	start: load_metadata
[INFO] 2024-12-26 13:34:57,534 >>	start: load_images test
100%|██████████| 3/3 [00:00<00:00, 229.80it/s]
[INFO] 2024-12-26 13:34:57,559 >>	device = cuda


# Feature Engineering

In [3]:
# Run feature generation on both splits; the returned frames replace the
# raw metadata frames under the same names.
train_metadata_df, test_metadata_df = feature_generation.run(
    train_metadata_df=train_metadata_df,
    test_metadata_df=test_metadata_df,
)

# Columns present now but absent from the metadata as originally loaded.
# Membership is tested against a set (O(1) per lookup) instead of a list;
# iterating the frame's own column list keeps the original column order.
original_column_set = set(original_train_metadata_columns)
added_metadata_feature_col = [
    col for col in train_metadata_df.columns if col not in original_column_set
]
logger.info(f'n_added_metadata_feature_col = {len(added_metadata_feature_col)}')

# Fold assignment is only needed when training.
if cfg.IS_TRAIN:
    logger.info('start: add_fold')
    train_metadata_df = common_training.add_fold(
        train_metadata_df=train_metadata_df,
        seed=cfg.SEED,
        n_fold=cfg.N_FOLD,
    )

[INFO] 2024-12-26 13:34:57,574 >>	start: metadata_preprocess
[INFO] 2024-12-26 13:34:57,919 >>	start: add_metadata_feature
[INFO] 2024-12-26 13:35:03,326 >>	start: convert_categorical_dtype
[INFO] 2024-12-26 13:35:03,756 >>	start: generate_categories_dict
[INFO] 2024-12-26 13:35:03,761 >>	start: get_dummies
[INFO] 2024-12-26 13:35:03,829 >>	start: align_test_columns_to_train
[INFO] 2024-12-26 13:35:03,833 >>	start: clean_column_names
[INFO] 2024-12-26 13:35:06,408 >>	start: align_test_dtypes_to_train
[INFO] 2024-12-26 13:35:06,412 >>	start: fill_inf_with_median
[INFO] 2024-12-26 13:35:06,425 >>	n_added_metadata_feature_col = 270


# 1st Stage Training

In [4]:
# 1st stage: run image training on the train metadata and train images.
# Skipped entirely when the config is not in training mode.
if cfg.IS_TRAIN:
    image_training.run(
        train_metadata_df=train_metadata_df,
        train_imgs=train_imgs,
    )

# 2nd Stage Training

In [5]:
# 2nd stage: left-join the saved image results onto the train metadata by
# `isic_id`, then run tabular training on the combined frame.
# Skipped entirely when the config is not in training mode.
if cfg.IS_TRAIN:
    # NOTE(review): presumably this CSV is written by the 1st-stage image
    # training above - confirm against image_training.
    result_all_img_path = cfg.OUTPUT_DIR / 'pt' / 'result_all_img.csv'
    result_all_img_df = pd.read_csv(result_all_img_path)

    # The metadata frame is polars, so convert the pandas frame before joining.
    result_all_img_pl = pl.from_pandas(result_all_img_df)
    train_metadata_df = train_metadata_df.join(
        result_all_img_pl,
        on='isic_id',
        how='left',
    )

    tabular_training.run(train_metadata_df=train_metadata_df)

# Prediction

In [6]:
# ---- load models -----------------------------------------------------------
# Two model "patterns" are ensembled:
#   pattern1: 1st stage = exp035, 2nd stage = exp079
#   pattern2: 1st stage = exp012, 2nd stage = exp083
logger.info('start: load models')

# Resolve every model path first (image models, then the three GBDT families
# for each pattern), keeping the lookup order fixed.
img_model_paths_p1 = prediction.get_img_model_paths(exp='exp035')
img_model_paths_p2 = prediction.get_img_model_paths(exp='exp012')
lgb_model_paths_p1 = prediction.get_gbdt_model_paths(exp='exp079', gbdt='lgb')
xgb_model_paths_p1 = prediction.get_gbdt_model_paths(exp='exp079', gbdt='xgb')
catb_model_paths_p1 = prediction.get_gbdt_model_paths(exp='exp079', gbdt='catb')
lgb_model_paths_p2 = prediction.get_gbdt_model_paths(exp='exp083', gbdt='lgb')
xgb_model_paths_p2 = prediction.get_gbdt_model_paths(exp='exp083', gbdt='xgb')
catb_model_paths_p2 = prediction.get_gbdt_model_paths(exp='exp083', gbdt='catb')

# Then load the models themselves, one loader call per path for the GBDTs.
img_models_p1 = prediction.load_image_models(img_model_paths_p1)
img_models_p2 = prediction.load_image_models(img_model_paths_p2)
lgb_models_p1 = list(map(prediction.load_lgb, lgb_model_paths_p1))
xgb_models_p1 = list(map(prediction.load_xgb, xgb_model_paths_p1))
catb_models_p1 = list(map(prediction.load_catb, catb_model_paths_p1))
lgb_models_p2 = list(map(prediction.load_lgb, lgb_model_paths_p2))
xgb_models_p2 = list(map(prediction.load_xgb, xgb_model_paths_p2))
catb_models_p2 = list(map(prediction.load_catb, catb_model_paths_p2))

# ---- validation-time transforms (one per pattern) ---------------------------
valid_transform_p1, valid_transform_p2 = image_dataset.generate_valid_transforms(
    image_size=cfg.IMAGE_SIZE,
)

# Arguments that are the same for both patterns' predict_batch calls.
shared_predict_kwargs = dict(
    test_metadata_df=test_metadata_df,
    test_imgs=test_imgs,
    y_pred_img_name='y_pred_img',
)

# ---- pattern1 prediction ----------------------------------------------------
logger.info('start: pattern1 prediction')
y_pred_p1 = prediction.predict_batch(
    img_models=img_models_p1,
    lgb_models=lgb_models_p1,
    xgb_models=xgb_models_p1,
    catb_models=catb_models_p1,
    val_transform=valid_transform_p1,
    dataset_func=image_dataset.CustomDatasetP1,
    feature_cols=cfg.FEATURE_COLS_P1,
    **shared_predict_kwargs,
)

# ---- pattern2 prediction ----------------------------------------------------
logger.info('start: pattern2 prediction')
y_pred_p2 = prediction.predict_batch(
    img_models=img_models_p2,
    lgb_models=lgb_models_p2,
    xgb_models=xgb_models_p2,
    catb_models=catb_models_p2,
    val_transform=valid_transform_p2,
    dataset_func=image_dataset.CustomDatasetP2,
    feature_cols=cfg.FEATURE_COLS_P2,
    **shared_predict_kwargs,
)

# ---- write submission -------------------------------------------------------
# The final target is the plain average of the two patterns' predictions.
logger.info('output submission')
ensemble_pred = (y_pred_p1 + y_pred_p2) / 2
submission = test_metadata_df.select('isic_id').to_pandas()
submission['target'] = ensemble_pred
submission.to_csv('submission.csv', index=False)
display(submission)

logger.info('end')

[INFO] 2024-12-26 13:35:06,480 >>	start: load models
[INFO] 2024-12-26 13:35:06,596 >>	get_img_model_paths: exp = exp035, n_model_paths = 25
[INFO] 2024-12-26 13:35:06,773 >>	get_img_model_paths: exp = exp012, n_model_paths = 25
[INFO] 2024-12-26 13:35:07,320 >>	get_gbdt_model_paths: exp = exp079, gbdt = lgb, n_model_paths = 250
[INFO] 2024-12-26 13:35:07,856 >>	get_gbdt_model_paths: exp = exp079, gbdt = xgb, n_model_paths = 250
[INFO] 2024-12-26 13:35:08,402 >>	get_gbdt_model_paths: exp = exp079, gbdt = catb, n_model_paths = 250
[INFO] 2024-12-26 13:35:08,999 >>	get_gbdt_model_paths: exp = exp083, gbdt = lgb, n_model_paths = 250
[INFO] 2024-12-26 13:35:09,647 >>	get_gbdt_model_paths: exp = exp083, gbdt = xgb, n_model_paths = 250
[INFO] 2024-12-26 13:35:10,198 >>	get_gbdt_model_paths: exp = exp083, gbdt = catb, n_model_paths = 250
[INFO] 2024-12-26 13:36:24,995 >>	start: pattern1 prediction


batch: 1 / 1


100%|██████████| 1/1 [00:03<00:00,  3.46s/it]
[INFO] 2024-12-26 13:36:30,609 >>	start: pattern2 prediction


batch: 1 / 1


100%|██████████| 1/1 [00:02<00:00,  2.33s/it]
[INFO] 2024-12-26 13:36:34,944 >>	output submission


Unnamed: 0,isic_id,target
0,ISIC_0015657,0.598865
1,ISIC_0015729,0.365756
2,ISIC_0015740,0.563732


[INFO] 2024-12-26 13:36:34,960 >>	end
