In [13]:
import os
import warnings

import h5py
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import spearmanr
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, StandardScaler
from tqdm.notebook import tqdm

# Silence every FutureWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [14]:
def load_data(filename):
    """
    Load the per-spot training table from the challenge HDF5 file.

    Only the spot tables are read; no image data is loaded, resized or returned.

    Parameters:
        filename: str
            Path to the h5 file. It must contain the groups "images/Train" and
            "spots/Train", each holding one entry per slide under the same name.
    Returns:
        pd.DataFrame
            The spot tables of all training slides stacked row-wise with a fresh
            0..n-1 index, plus an integer column ``slide`` giving the position of
            the slide in the iteration order of "images/Train".
    Raises:
        ValueError: raised by ``pd.concat`` when the file lists no training slides.
    """
    with h5py.File(filename, "r") as h5file:
        # Slides are numbered in the iteration order of the image group, so the
        # ``slide`` ids stay aligned with anything else that enumerates the images.
        slide_names = h5file["images/Train"].keys()
        train_spots = h5file["spots/Train"]
        # np.array(...) copies each table into memory while the file is still open.
        spots = [
            pd.DataFrame(np.array(train_spots[slide_name])).assign(slide=slide_idx)
            for slide_idx, slide_name in enumerate(slide_names)
        ]
    return pd.concat(spots).reset_index(drop=True)
# Training spot table; the prediction frames loaded later are scored against it.
df_spots = load_data("data/elucidata_ai_challenge_data.h5")
# Target columns are the ones whose name begins with "C".
targets = [column for column in df_spots.columns if column.startswith('C')]

In [15]:
# Load the saved cross-validation results, one joblib file per run. Each file unpacks
# into two items; only the first is kept (it is later indexed by `targets` and ranked
# like a DataFrame), the second is discarded into `_`.
# The number trailing each line is the mean per-spot Spearman score recorded for that
# run; it matches the output of the corresponding scoring cell further down.
# presumably these are out-of-fold predictions indexed like df_spots — TODO confirm
# NOTE(review): joblib.load unpickles its input, so only load files produced by this project.
df_log_std_pca_5_eff_b0, _ = joblib.load('result/cv_log_std_pca_5_eff_b0.joblib') # 0.5383309629996448
df_log_std_pca_10_eff_b0, _ = joblib.load('result/cv_log_std_pca_10_eff_b0.joblib') # 0.536148846890535
#df_log_std_pca_5_eff_b0_rt, _ = joblib.load('result/cv_log_std_pca_5_eff_b0_rt.joblib') # 0.5283482179552861
df_log_std_pca_5_eff_b1, _ = joblib.load('result/cv_log_std_pca_5_eff_b1.joblib') # 0.5505204501251932
df_log_std_pca_5_eff_b2, _ = joblib.load('result/cv_log_std_pca_5_eff_b2.joblib') # 0.5442914212037672
df_log_std_pca_5_eff_b3, _ = joblib.load('result/cv_log_std_pca_5_eff_b3.joblib') # 0.5427672949644584
df_log_std_pca_5_eff_b0_S2_mse, _ = joblib.load('result/cv_log_std_pca_5_eff_b0_S2_mse.joblib') # 0.5582929638498111
df_log_std_pca_5_eff_b0_S2_mae, _ = joblib.load('result/cv_log_std_pca_5_eff_b0_S2_mae.joblib') # 0.5341752798855798
df_log_std_pca_5_eff_b1_S2_mae, _ = joblib.load('result/cv_log_std_pca_5_eff_b1_S2_mae.joblib') # 0.5496152946745833
df_log_std_pca_5_eff_b1_S2_mse, _ = joblib.load('result/cv_log_std_pca_5_eff_b1_S2_mse.joblib') # 0.5440003046373658

In [22]:
# Rank-sum ensemble. For every spot (row) each member's predictions are converted to
# ranks across the target columns, and the ranks are summed over all members.
# Spearman correlation depends only on ordering, so the summed ranks are scored
# directly without rescaling.
# df_log_std_pca_10_eff_b0 is loaded above but is not part of this ensemble.
ensemble_members = [
    df_log_std_pca_5_eff_b0,
    df_log_std_pca_5_eff_b1, df_log_std_pca_5_eff_b2, df_log_std_pca_5_eff_b3,
    df_log_std_pca_5_eff_b0_S2_mse, df_log_std_pca_5_eff_b0_S2_mae,
    df_log_std_pca_5_eff_b1_S2_mae, df_log_std_pca_5_eff_b1_S2_mse,
]

prd = ensemble_members[0][targets].rank(axis=1)
for member in ensemble_members[1:]:
    # Addition aligns on index and columns, so every member must cover the same spots.
    prd = prd + member[targets].rank(axis=1)

# Mean per-spot Spearman correlation between the summed ranks and the values of the
# same spot in df_spots (rows are matched by index label). `prd` already holds
# exactly the target columns, so no re-selection is needed.
prd.apply(
    lambda rank_row: spearmanr(df_spots.loc[rank_row.name, targets], rank_row)[0],
    axis=1,
).mean()

0.5534741598946824

In [5]:
# Mean per-spot Spearman correlation of the cv_log_std_pca_5_eff_b0 predictions:
# each prediction row is compared with the target values of the same spot in
# df_spots (matched by index label), then the row scores are averaged.
df_log_std_pca_5_eff_b0[targets].apply(
    lambda pred_row: spearmanr(df_spots.loc[pred_row.name, targets], pred_row)[0],
    axis=1,
).mean()

0.5383309629996448

In [6]:
# Mean per-spot Spearman correlation of the cv_log_std_pca_10_eff_b0 predictions:
# each prediction row is compared with the target values of the same spot in
# df_spots (matched by index label), then the row scores are averaged.
df_log_std_pca_10_eff_b0[targets].apply(
    lambda pred_row: spearmanr(df_spots.loc[pred_row.name, targets], pred_row)[0],
    axis=1,
).mean()

0.536148846890535

In [7]:
# Mean per-spot Spearman correlation of the cv_log_std_pca_5_eff_b1 predictions:
# each prediction row is compared with the target values of the same spot in
# df_spots (matched by index label), then the row scores are averaged.
df_log_std_pca_5_eff_b1[targets].apply(
    lambda pred_row: spearmanr(df_spots.loc[pred_row.name, targets], pred_row)[0],
    axis=1,
).mean()

0.5505204501251932

In [8]:
# Mean per-spot Spearman correlation of the cv_log_std_pca_5_eff_b3 predictions:
# each prediction row is compared with the target values of the same spot in
# df_spots (matched by index label), then the row scores are averaged.
df_log_std_pca_5_eff_b3[targets].apply(
    lambda pred_row: spearmanr(df_spots.loc[pred_row.name, targets], pred_row)[0],
    axis=1,
).mean()

0.5427672949644584

In [9]:
# Mean per-spot Spearman correlation of the cv_log_std_pca_5_eff_b2 predictions:
# each prediction row is compared with the target values of the same spot in
# df_spots (matched by index label), then the row scores are averaged.
df_log_std_pca_5_eff_b2[targets].apply(
    lambda pred_row: spearmanr(df_spots.loc[pred_row.name, targets], pred_row)[0],
    axis=1,
).mean()

0.5442914212037672

In [10]:
# Mean per-spot Spearman correlation of the cv_log_std_pca_5_eff_b0_S2_mse predictions:
# each prediction row is compared with the target values of the same spot in
# df_spots (matched by index label), then the row scores are averaged.
df_log_std_pca_5_eff_b0_S2_mse[targets].apply(
    lambda pred_row: spearmanr(df_spots.loc[pred_row.name, targets], pred_row)[0],
    axis=1,
).mean()

0.5582929638498111

In [11]:
# Mean per-spot Spearman correlation of the cv_log_std_pca_5_eff_b0_S2_mae predictions:
# each prediction row is compared with the target values of the same spot in
# df_spots (matched by index label), then the row scores are averaged.
df_log_std_pca_5_eff_b0_S2_mae[targets].apply(
    lambda pred_row: spearmanr(df_spots.loc[pred_row.name, targets], pred_row)[0],
    axis=1,
).mean()

0.5341752798855798

In [12]:
# Mean per-spot Spearman correlation of the cv_log_std_pca_5_eff_b1_S2_mae predictions:
# each prediction row is compared with the target values of the same spot in
# df_spots (matched by index label), then the row scores are averaged.
df_log_std_pca_5_eff_b1_S2_mae[targets].apply(
    lambda pred_row: spearmanr(df_spots.loc[pred_row.name, targets], pred_row)[0],
    axis=1,
).mean()

0.5496152946745833

In [16]:
# Mean per-spot Spearman correlation of the cv_log_std_pca_5_eff_b1_S2_mse predictions:
# each prediction row is compared with the target values of the same spot in
# df_spots (matched by index label), then the row scores are averaged.
df_log_std_pca_5_eff_b1_S2_mse[targets].apply(
    lambda pred_row: spearmanr(df_spots.loc[pred_row.name, targets], pred_row)[0],
    axis=1,
).mean()

0.5440003046373658