In [1]:
import os
import pandas as pd
import numpy as np
import h5py

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

import dproc, sgutil, sgpp, sgnn, sgml
import warnings

# Silence every FutureWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

2025-05-06 00:53:11.300021: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-06 00:53:11.310088: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746492791.321818     297 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746492791.325351     297 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-06 00:53:11.339010: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

# Data Processing

In [2]:
def _pad_to_canvas(image, canvas_size):
    """Edge-pad the trailing side of the first two axes of a 3-D image up to canvas_size."""
    pad_rows = canvas_size - image.shape[0]
    pad_cols = canvas_size - image.shape[1]
    if pad_rows < 0 or pad_cols < 0:
        # np.pad would reject the negative width anyway; fail with a message that names the cause.
        raise ValueError(
            'image of shape {} does not fit into a {}x{} canvas'.format(
                image.shape, canvas_size, canvas_size
            )
        )
    return np.pad(image, [(0, pad_rows), (0, pad_cols), (0, 0)], 'edge')


def load_data(filename, canvas_size=2000, test_sample='S_7'):
    """
    Load the train/test slide images and their spot tables from an h5 file.

    Every image is edge-padded (on the trailing side of its first two axes)
    to canvas_size x canvas_size so that the slides can be stacked into a
    single array.

    Parameters:
        filename: str
            Path of the h5 file. It is read through the groups
            "images/Train", "spots/Train", "images/Test" and "spots/Test".
        canvas_size: int
            Side length every image is padded to (default 2000).
        test_sample: str
            Key of the test slide to load (default 'S_7').
    Returns:
        np.ndarray, pd.DataFrame, np.ndarray, pd.DataFrame
        train images, train spot table, test images, test spot table.
        Both spot tables get a `slide` column holding the position of the
        slide inside the matching image stack.
    Raises:
        ValueError: if an image is larger than canvas_size along either of
            its first two axes.
    """
    images, images_test = list(), list()
    spots, spots_test = list(), list()
    with h5py.File(filename, "r") as h5file:
        train_images = h5file["images/Train"]
        train_spots = h5file["spots/Train"]

        # Train slides: `slide` is the enumeration index, i.e. the position in the stack.
        for i, slide_name in enumerate(train_images.keys()):
            images.append(_pad_to_canvas(np.array(train_images[slide_name]), canvas_size))
            slide_spots = pd.DataFrame(np.array(train_spots[slide_name])).assign(slide = i)
            if slide_name == 'S_2':
                # Slide S_2 has its spot coordinates shifted by -60 in x and y
                # (the reason for this offset is not recorded in the notebook).
                slide_spots = slide_spots.assign(
                    x = lambda x: x['x'] - 60,
                    y = lambda x: x['y'] - 60,
                )
            spots.append(slide_spots)

        # Test slide: only `test_sample` is loaded, as slide 0 of its own stack.
        test_images = h5file["images/Test"]
        test_spots = h5file["spots/Test"]
        images_test.append(_pad_to_canvas(np.array(test_images[test_sample]), canvas_size))
        spots_test.append(pd.DataFrame(np.array(test_spots[test_sample])).assign(slide = 0))

    df_spots = pd.concat(spots).reset_index(drop = True)
    df_spots_test = pd.concat(spots_test).reset_index(drop = True)
    return np.stack(images), df_spots, np.stack(images_test), df_spots_test

def make_img_proc_info(df, img_width, img_height, canvas_size=2000):
    """
    Compute, for every spot, the crop window around it and the padding needed
    where that window sticks out of the image canvas.

    Parameters:
        df: pd.DataFrame
            Spot table with `x` and `y` columns.
        img_width: int
            Window size along x (`left`..`right`).
        img_height: int
            Window size along y (`top`..`bottom`).
        canvas_size: int
            Side length of the square canvas the windows are clipped to
            (default 2000, the size the notebook pads its images to).
    Returns:
        pd.DataFrame
        A copy of df with these columns added:
        left/right/top/bottom - window bounds clipped to [0, canvas_size];
        lpad/rpad/tpad/bpad - number of pixels the unclipped window extended
        past the respective canvas border (0 when it did not).
    """
    return df.assign(
        # Unclipped window: starts half a window before the spot, has the full size.
        left = lambda x: (x['x'] - img_width // 2).astype('int'),
        right = lambda x: (x['left'] + img_width).astype('int'),
        top = lambda x: (x['y'] - img_height // 2).astype('int'),
        bottom = lambda x: (x['top'] + img_height).astype('int'),
        # Overhang past each border, computed before the bounds are clipped below.
        lpad = lambda x: -(x['left'].where(x['left'] < 0, 0)),
        rpad = lambda x: (x['right'] - canvas_size).where(x['right'] > canvas_size, 0),
        tpad = lambda x: -(x['top'].where(x['top'] < 0, 0)),
        bpad = lambda x: (x['bottom'] - canvas_size).where(x['bottom'] > canvas_size, 0)
    ).assign(
        left = lambda x: x['left'].clip(0, canvas_size),
        right = lambda x: x['right'].clip(0, canvas_size),
        top = lambda x: x['top'].clip(0, canvas_size),
        bottom = lambda x: x['bottom'].clip(0, canvas_size),
    )

def create_df(df, img_width, img_height, image_stack=None):
    """
    Attach the flattened pixel patch around every spot to the spot table.

    Parameters:
        df: pd.DataFrame
            Spot table with `x`, `y` and `slide` columns; its index is
            assumed to be unique because the pixel columns are joined on it.
        img_width, img_height: int
            Size of the patch cut around each spot.
        image_stack: np.ndarray, optional
            Image stack that `slide` indexes into. Defaults to the
            notebook-level `images` (the training stack), which is what this
            function always used before the parameter existed. Pass
            `images_test` to build features for the test spots.
    Returns:
        pd.DataFrame, pd.Index
        df with the window columns from make_img_proc_info and one
        `pixel_<i>` column per flattened patch value, plus the names of the
        pixel columns.
    """
    if image_stack is None:
        # Backward-compatible default: fall back to the global training stack.
        image_stack = images
    df = make_img_proc_info(df, img_width, img_height)
    window_cols = ['left', 'right', 'top', 'bottom', 'slide', 'lpad', 'rpad', 'tpad', 'bpad']
    # One flat patch per spot; collecting plain arrays avoids building a pd.Series per row.
    patches = [
        proc_images_np(window, image_stack)
        for window in df[window_cols].to_dict(orient = 'records')
    ]
    if patches:
        pixels = np.stack(patches)
    else:
        # Empty spot table: keep the expected number of pixel columns.
        pixels = np.empty(
            (0, img_width * img_height * image_stack.shape[-1]), dtype = image_stack.dtype
        )
    # Keep df's own index so the join below is aligned row-for-row even when
    # df does not carry a default 0..n-1 index.
    df_pixel = pd.DataFrame(
        pixels, index = df.index,
        columns = ['pixel_{}'.format(i) for i in range(pixels.shape[1])]
    )
    X_pixel = df_pixel.columns
    return df.join(df_pixel), X_pixel

def proc_images_np(X, images):
    """
    Cut one window out of an image stack, pad it back to full size and flatten it.

    Parameters:
        X: mapping (e.g. a DataFrame row)
            Provides `slide`, the clipped bounds `left`/`right`/`top`/`bottom`
            and the pad widths `lpad`/`rpad`/`tpad`/`bpad`.
        images: np.ndarray
            4-D stack indexed as images[slide, left:right, top:bottom, :].
    Returns:
        np.ndarray
        1-D array holding the edge-padded window.
    """
    # NOTE(review): left/right (derived from x) index axis 1 of the stack and
    # top/bottom (from y) index axis 2 - confirm this matches the image layout.
    window = images[X['slide'], X['left']:X['right'], X['top']:X['bottom'], :]
    pad_widths = [
        (X['lpad'], X['rpad']),
        (X['tpad'], X['bpad']),
        (0, 0),  # the last axis is never padded
    ]
    padded = np.pad(window, pad_widths, 'edge')
    return padded.flatten()


# Load the padded image stacks and the spot tables for train and test.
images, df_spots, images_test, df_spots_test = load_data("data/elucidata_ai_challenge_data.h5")

# Target columns are the ones whose name starts with 'C'.
targets = [col for col in df_spots.columns if col.startswith('C')]
# Derive the count and the log-target names from `targets` itself, so the two
# lists always have the same length and order (scoring compares them by position).
n_components = len(targets)
targets2 = [col + '_l' for col in targets]
# Add the natural log of every target as '<target>_l'.
df_spots = df_spots.join(
    np.log(df_spots[targets]).rename(columns = lambda x: x + '_l')
)

In [3]:
from scipy.stats import spearmanr
import xgboost as xgb
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GroupKFold, GroupShuffleSplit, train_test_split

# Group-aware 6-fold splitter used by the cross-validation cells below.
gkf = GroupKFold(n_splits = 6)
# Single group-aware shuffle split.
# NOTE(review): no random_state is given, so this split differs from run to run.
ss = GroupShuffleSplit(n_splits = 1)
# sgutil cache object the experiments are run through.
# NOTE(review): the meaning of the three positional arguments is not visible here.
sc = sgutil.SGCache('img', 'result', 'model')

def get_validation_splitter(validation_fraction):
    """
    Build a one-argument splitter that holds out `validation_fraction` of its input.

    The returned callable forwards its argument to train_test_split with
    test_size set to validation_fraction and returns that call's result.
    """
    def split(x):
        return train_test_split(x, test_size = validation_fraction)

    return split

def spearman(df, df_prds):
    """
    Mean per-row Spearman correlation between predictions and true targets.

    Parameters:
        df: pd.DataFrame
            True values; must contain the columns listed in the global `targets`.
        df_prds: pd.DataFrame
            Predictions; every row is compared with the row of df that has the
            same index label, column by column in positional order.
    Returns:
        float
        Average of the row-wise correlation coefficients.
    """
    row_scores = []
    for row_label, predicted in df_prds.iterrows():
        observed = df.loc[row_label, targets]
        # spearmanr returns (correlation, p-value); only the correlation is kept.
        row_scores.append(spearmanr(predicted, observed)[0])
    return pd.Series(row_scores, dtype = float).mean()

def _predict_targets(m, df, X):
    """Predict with m on df[X] and exponentiate the result; columns are labelled with targets2."""
    return pd.DataFrame(np.exp(m.predict(df[X])), index = df.index, columns = targets2)


def _score_spearman(df, prds):
    """Score predictions against the target columns of df with the row-wise Spearman mean."""
    return spearman(df[targets], prds)


# Shared settings handed to every sc.cv_result / sc.train_cv call.
config = {
    'predict_func': _predict_targets,
    'score_func': _score_spearman,
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(),
    'return_train_scores': True,
    # The models are fit on the log-transformed targets; folds are grouped by slide.
    'y': targets2,
    'groups': 'slide',
}


# sgml adapters wrapping the estimator classes used in the experiments below.
# Linear models
lr_adapter = sgml.SklearnAdapter(LinearRegression)
ridge_adapter = sgml.SklearnAdapter(Ridge)
lasso_adapter = sgml.SklearnAdapter(Lasso)
# Neural network
mlp_adapter = sgml.SklearnAdapter(MLPRegressor)
# Gradient boosting
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor, progress = 50)

In [4]:
# Pixel features from 8x8 and 9x9 patches around every training spot.
df_spots8, X_pixel8 = create_df(df_spots, img_width = 8, img_height = 8)
df_spots9, X_pixel9 = create_df(df_spots, img_width = 9, img_height = 9)

# Linear Regression 1

In [5]:
# Baseline: plain linear regression on the raw 8x8 patch pixels.
result_lr = sc.cv_result(
    'LR',
    df_spots8,
    gkf,
    {'X_num': list(X_pixel8)},
    config,
    lr_adapter,
    result_proc = [sgml.lr_learning_result],
)
# Show the mean validation score followed by the per-fold scores.
(np.mean(result_lr['valid_scores']), result_lr['valid_scores'])

(0.45159830844446774,
 [0.5297622132553462,
  0.35523506124946186,
  0.285001394714117,
  0.4690515127702302,
  0.5735355011569846,
  0.4970041675206669])

# XGBoost 1

In [6]:
# Shallow, slowly-learning boosted trees on the raw 8x8 patch pixels.
hparams = dict(
    model_params = dict(max_depth = 2, learning_rate = 0.01, n_estimators = 100),
    X_num = list(X_pixel8),
)
result_xgb = sc.cv_result(
    'xgb',
    df_spots8,
    gkf,
    hparams,
    config,
    xgb_adapter,
)
# Show the mean validation score followed by the per-fold scores.
(np.mean(result_xgb['valid_scores']), result_xgb['valid_scores'])

(0.45333574818398664,
 [0.5581417589062058,
  0.3278171532609403,
  0.2877363706365074,
  0.44514948355079187,
  0.5775053789631795,
  0.523664343786295])

# Linear Regression 2

- Linear Regression + PCA

In [7]:
# Linear regression on PCA-reduced 8x8 patch pixels (n_components = 0.7).
hparams = dict(
    pca = dict(
        X_num = list(X_pixel8),
        hparams = dict(n_components = 0.7),
    ),
)
result_lr2 = sc.cv_result(
    'LR2',
    df_spots8,
    gkf,
    hparams,
    config,
    lr_adapter,
    result_proc = [sgml.lr_learning_result],
)
# Show the mean validation score followed by the per-fold scores.
(np.mean(result_lr2['valid_scores']), result_lr2['valid_scores'])

(0.4785741677274913,
 [0.585518577475043,
  0.3823903667829625,
  0.2811122302230374,
  0.47779563383904533,
  0.6224199244915357,
  0.5222082735533239])

# Linear Regression 3

- Linear Regression + PCA + pixel9

In [8]:
# Same PCA + linear regression setup, but on the 9x9 patches and with n_components = 0.5.
hparams = dict(
    pca = dict(
        X_num = list(X_pixel9),
        hparams = dict(n_components = 0.5),
    ),
)
result_lr3 = sc.cv_result(
    'LR3',
    df_spots9,
    gkf,
    hparams,
    config,
    lr_adapter,
    result_proc = [sgml.lr_learning_result],
    rerun = 0,
)
# Show the mean validation score followed by the per-fold scores.
(np.mean(result_lr3['valid_scores']), result_lr3['valid_scores'])

(0.4788232912064534,
 [0.5861936845222671,
  0.38231418193130695,
  0.283400730596353,
  0.47704874704726036,
  0.6214675435391548,
  0.5225148596023776])

# Ridge1

In [9]:
# Ridge regression on the raw 8x8 patch pixels with a very strong penalty (alpha = 1e7).
hparams = dict(
    X_num = list(X_pixel8),
    model_params = dict(alpha = 1e7),
)
result_rd1 = sc.cv_result(
    'Ridge1',
    df_spots8,
    gkf,
    hparams,
    config,
    ridge_adapter,
    result_proc = [sgml.lr_learning_result],
    rerun = 0,
)
# Show the mean validation score followed by the per-fold scores.
(np.mean(result_rd1['valid_scores']), result_rd1['valid_scores'])

(0.4953370298393036,
 [0.5607180493342712,
  0.44186317221472116,
  0.27515070428887123,
  0.4832064923694837,
  0.6716981285267731,
  0.5393856323017012])

# Lasso1

In [10]:
# Lasso regression on the raw 8x8 patch pixels (alpha = 10).
# NOTE(review): in the recorded output the first four fold scores are equal to
# Ridge1's to full precision - worth checking whether both models collapse to
# (nearly) the same predictions on those folds.
hparams = dict(
    X_num = list(X_pixel8),
    model_params = dict(alpha = 10),
)
result_lasso1 = sc.cv_result(
    'lasso1',
    df_spots8,
    gkf,
    hparams,
    config,
    lasso_adapter,
    result_proc = [sgml.lr_learning_result],
    rerun = 0,
)
# Show the mean validation score followed by the per-fold scores.
(np.mean(result_lasso1['valid_scores']), result_lasso1['valid_scores'])

(0.49562809082597,
 [0.5607180493342712,
  0.44186317221472116,
  0.27515070428887123,
  0.4832064923694837,
  0.6734624284496407,
  0.5393676982988318])

# MLP1

In [11]:
# Small MLP (one hidden layer of 16 units) on the raw 8x8 patch pixels.
# NOTE(review): no random_state is passed in model_params, so a re-fit may not
# reproduce the recorded scores exactly.
hparams = dict(
    X_num = list(X_pixel8),
    model_params = dict(alpha = 1e4, max_iter = 1000, hidden_layer_sizes = (16, )),
)
result_mlp1 = sc.cv_result(
    'mlp1',
    df_spots8,
    gkf,
    hparams,
    config,
    mlp_adapter,
    rerun = 0,
)
# Show the mean validation score followed by the per-fold scores.
(np.mean(result_mlp1['valid_scores']), result_mlp1['valid_scores'])

(0.49370322399374444,
 [0.5401794197855354,
  0.44183474026843333,
  0.27287523238275635,
  0.48265547162420147,
  0.6751305159745059,
  0.5495439639270343])

# Ensemble

In [31]:
# Rank ensemble: every member's predictions are ranked across the targets of
# each spot, and the ranks are summed over the members.
ensemble_members = [
    'lasso1',
    # Other experiments run above, by the name given to sc.cv_result: 'Ridge1', 'mlp1'
]

# Float accumulator, independent of the dtype of the target columns.
prd = np.zeros((len(df_spots), len(targets)), dtype = float)
for member in ensemble_members:
    member_prd = sc.read_prd(member, df_spots.index, columns = targets)
    # Add by position, as a plain array.
    prd += member_prd[targets].rank(axis = 1).to_numpy()

# Score the summed ranks against the true targets.
spearman(
    df_spots[targets],
    pd.DataFrame(prd, index = df_spots.index, columns = targets)
)

0.46947160268238924

# Train

In [35]:
# Call sc.train_cv for the 'lasso1' experiment on the 8x8 patch frame with the shared config.
# NOTE(review): presumably this fits the model(s) using the hyper-parameters stored
# under 'lasso1' by the earlier cv_result call - confirm against sgutil.SGCache.
result = sc.train_cv('lasso1', df_spots8, config)