In [1]:
import os
import warnings

import h5py
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import spearmanr
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, StandardScaler
from tqdm.notebook import tqdm

# 모든 FutureWarning 무시
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def load_data(filename):
    """
    Load the training spot tables from an HDF5 file into a single DataFrame.

    No image data is read; only the slide names of the "images/Train" group are
    used, to decide which slides to load and in which order.

    Parameters:
        filename: str
            Path to the h5 file. It must contain the groups "images/Train" and
            "spots/Train", and every slide name under "images/Train" must also
            exist under "spots/Train".
    Returns:
        pd.DataFrame
            The spot records of all training slides stacked row-wise, with a
            fresh 0..n-1 index and an extra integer column `slide` holding the
            position of the slide in the iteration order of "images/Train".
    Raises:
        KeyError
            Presumably raised by h5py when a group or slide name is missing.
        ValueError
            Raised by `pd.concat` when there are no training slides at all.
    """
    with h5py.File(filename, "r") as h5file:
        train_images = h5file["images/Train"]
        train_spots = h5file["spots/Train"]

        # Slide order (and therefore the `slide` index) follows the image group,
        # so the numbering stays consistent with code that walks the images.
        # np.array(...) materialises each dataset while the file is still open.
        per_slide = [
            pd.DataFrame(np.array(train_spots[slide_name])).assign(slide=slide_idx)
            for slide_idx, slide_name in enumerate(train_images.keys())
        ]
    return pd.concat(per_slide).reset_index(drop=True)
# Spot table for all training slides (one row per spot, plus a `slide` column).
df_spots = load_data("data/elucidata_ai_challenge_data.h5")

# Target columns are the ones whose name starts with 'C'.
targets = [column for column in df_spots.columns if column.startswith('C')]

In [27]:
# Load saved cross-validation results with joblib. Each file is unpacked as a
# 2-tuple: the first item is kept (used below as a DataFrame with the `targets`
# columns, indexed like df_spots; presumably out-of-fold predictions, TODO confirm)
# and the second item is discarded.
# The number trailing each line is that model's mean per-row Spearman score; for
# the two active lines it matches the output of the evaluation cells below.
# NOTE(review): joblib.load unpickles the file, so only load trusted artifacts.
#df_log_std_pca_5_eff_b0_S2_mse, _ = joblib.load('result/cv_log_std_pca_5_eff_b0_S2_mse.joblib') # 0.5582929638498111
df_log_std_pca_5_eff_b0_pre_fine_S2_mse, _ = joblib.load('result/cv_log_std_pca_5_eff_b0_pre_fine_S2_mse.joblib') # 0.5628191101569386
df_log_std_pca_5_eff_b1_pre_fine_S2_mse, _ = joblib.load('result/cv_log_std_pca_5_eff_b1_pre_fine_S2_mse.joblib') # 0.5606549434961433

In [28]:
# Rank ensemble: for every row, rank each model's predictions across the target
# columns (axis=1) and sum those ranks over the models. The b1 model seeds the
# sum; add further prediction frames to the list to extend the ensemble.
prd = df_log_std_pca_5_eff_b1_pre_fine_S2_mse[targets].rank(axis=1)
for member in [
    df_log_std_pca_5_eff_b0_pre_fine_S2_mse,
]:
    # Frames are aligned on index and columns; assumes every member shares the
    # index of the seed frame (otherwise NaNs appear), verify if members change.
    prd = prd + member[targets].rank(axis=1)

# Score: Spearman correlation between the true target values of a row and its
# summed ranks, averaged over all rows. `row.name` is the row's index label, so
# this relies on the prediction frames being indexed like df_spots.
prd.apply(
    lambda row: spearmanr(df_spots.loc[row.name, targets], row)[0], axis=1
).mean()

0.5650678209579263

In [29]:
# Single-model reference score for the b0 model: Spearman correlation between the
# true target values of a row and the model's predictions, averaged over all rows.
# `row.name` is the row's index label, so the frame must be indexed like df_spots.
df_log_std_pca_5_eff_b0_pre_fine_S2_mse[targets].apply(
    lambda row: spearmanr(df_spots.loc[row.name, targets], row)[0], axis=1
).mean()

0.5628191101569386

In [30]:
# Single-model reference score for the b1 model: Spearman correlation between the
# true target values of a row and the model's predictions, averaged over all rows.
# `row.name` is the row's index label, so the frame must be indexed like df_spots.
df_log_std_pca_5_eff_b1_pre_fine_S2_mse[targets].apply(
    lambda row: spearmanr(df_spots.loc[row.name, targets], row)[0], axis=1
).mean()

0.5606549434961433