In [10]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
import pandas as pd
import numpy as np
import h5py
import joblib

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
import warnings

# 모든 FutureWarning 무시
warnings.simplefilter(action='ignore', category=FutureWarning)

# Data Processing

In [11]:
def load_data(filename):
    """
    이미지를 불러옵니다.
    Parameters:
        filename: str
            h5 파일에서 데이터를 불러옵니다.
    Returns:
        np.ndarray, pd.DataFrame, np.ndarray, 
        train 이미지, train spot 정보, test 이미지, test spot 정보
    """
    images, images_test = list(), list()
    spots, spots_test = list(), list()
    with h5py.File(filename, "r") as h5file:
        train_images = h5file["images/Train"]
        train_spots = h5file["spots/Train"]
    
        num_train_slides = len(train_images)
        # Train 이미지를 불러옵니다.
        # 하나의 텐서로 만들기 위해 이미지의 크기를 2000x2000으로 균일하게 만듭니다.
        for i, slide_name in enumerate(train_images.keys()):
            image = np.array(train_images[slide_name])
            p1 = 2000 - image.shape[0]
            p2 = 2000 - image.shape[1]
            images.append(
                np.pad(image, [(0, p1), (0, p2), (0, 0)], 'edge')
            )
            spots.append(pd.DataFrame(np.array(train_spots[slide_name])).assign(slide = i))
            if slide_name == 'S_1':
                spots[-1] = spots[-1].assign(
                    x = lambda x: x['x'] - 50,
                    y = lambda x: x['y'] - 50,
                )
            elif slide_name == 'S_2':
                spots[-1] = spots[-1].assign(
                    x = lambda x: x['x'] - 60,
                    y = lambda x: x['y'] - 60,
                )
        # Test 이미지를 불러옵니다.
        test_images = h5file["images/Test"]
        test_spots = h5file["spots/Test"]
        sample = 'S_7'
        image = np.array(test_images[sample])
        p1 = 2000 - image.shape[0]
        p2 = 2000 - image.shape[1]
        images_test.append(np.pad(image, [(0, p1), (0, p2), (0, 0)], 'edge'))
        spots_test.append(pd.DataFrame(np.array(test_spots[sample])).assign(slide = 0))
    # EfficientNet의 형식으로 바꿉니다.
    with tf.device('/CPU:0'):
        images = tf.constant(tf.keras.applications.efficientnet.preprocess_input(images))
    df_spots = pd.concat(spots).reset_index(drop = True)
    with tf.device('/CPU:0'):
        images_test = tf.constant(tf.keras.applications.efficientnet.preprocess_input(images_test))
    df_spots_test = pd.concat(spots_test).reset_index(drop = True)
    return images, df_spots, images_test, df_spots_test

def make_img_proc_info(df, img_with, img_height):
    """
    
    """
    return df.assign(
        left = lambda x: x['x'] - img_width // 2,
        right = lambda x: x['x'] + img_width // 2,
        top = lambda x: x['y'] - img_height // 2,
        bottom = lambda x: x['y'] + img_height // 2,
        lpad = lambda x: -(x['left'].where(x['left'] < 0, 0)),
        rpad = lambda x: -(2000 - x['right']).where(x['right'] > 2000, 0),
        tpad = lambda x: -(x['top'].where(x['top'] < 0, 0)),
        bpad = lambda x: -(2000 - x['bottom']).where(x['bottom'] > 2000, 0)
    ).assign(
        left = lambda x: x['left'].clip(0, 2000),
        right = lambda x: x['right'].clip(0, 2000),
        top = lambda x: x['top'].clip(0, 2000),
        bottom = lambda x: x['bottom'].clip(0, 2000),
    )

def create_tf_ds(df):
    if (pd.Series(targets).isin(df.columns)).all():
        return tf.data.Dataset.from_tensor_slices(
            ({
                i: df[i] for i in ['left', 'right', 'top', 'bottom', 'slide', 'lpad', 'rpad', 'tpad', 'bpad']
            }, df[targets2])
        )
    else:
        return tf.data.Dataset.from_tensor_slices({
            i: df[i] for i in ['left', 'right', 'top', 'bottom', 'slide', 'lpad', 'rpad', 'tpad', 'bpad']
        })

def proc_images(X, images):
    return tf.pad(
        images[X['slide'], X['left']:X['right'], X['top']:X['bottom'], :], 
        paddings = [(X['lpad'], X['rpad']), (X['tpad'], X['bpad']), (0, 0)],
        constant_values=1
    )

augmentation_layers = [
    tf.keras.layers.RandomFlip("horizontal_and_vertical"),
    tf.keras.layers.RandomRotation(0.5),
    tf.keras.layers.RandomZoom(0.1),
    tf.keras.layers.RandomContrast(0.1),
]

def data_augmentation(x):
    for layer in augmentation_layers:
        x = layer(x)
    return x

images, df_spots, images_test, df_spots_test = load_data("data/elucidata_ai_challenge_data.h5")
targets = [i for i in df_spots.columns if i.startswith('C')]
n_components = 5
target_proc = make_pipeline(
    FunctionTransformer(np.log, np.exp), StandardScaler(), PCA(n_components=n_components)
).fit(df_spots[targets])
target_proc.fit(df_spots[targets])
targets2 = ['pca_{}'.format(i) for i in range(n_components)]
df_spots= df_spots.join(
    pd.DataFrame(target_proc.transform(df_spots[targets]), index = df_spots.index, columns = targets2)
)

KeyboardInterrupt: 

In [None]:
fig, ax = plt.subplots(1, 6, figsize=(14, 3))
for i, slide_name in enumerate(range(6)):
    image = images[i]
    x, y = df_spots.loc[df_spots['slide'] == i, "x"], df_spots.loc[df_spots['slide'] == i, "y"]

    ax[i].imshow(image, aspect="auto")
    ax[i].scatter(x, y, color="red", s=1, alpha=0.4)  # Overlay spot locations
    ax[i].set_title(slide_name)
    ax[i].axis('off')

plt.tight_layout()
plt.show()

In [None]:
df_spots['slide'].unique()

In [None]:
from scipy.stats import spearmanr
img_width = 224
img_height = 224

def create_model(img_width, img_height):
    input_shape = (img_width, img_height, 3)
    enet = tf.keras.applications.EfficientNetB0(
        include_top = False, 
        weights = 'imagenet',
        input_shape = input_shape,
        pooling = 'avg'
    )
    inputs = tf.keras.Input(shape = input_shape)
    x = enet(inputs, training = False)
    x = tf.keras.layers.Dropout(0.5)(x)
    d1 = tf.keras.layers.Dense(256, activation = 'relu', kernel_initializer = 'HeUniform')
    x = d1(x)
    d2 = tf.keras.layers.Dense(len(targets2))
    outputs = d2(x)
    m = tf.keras.models.Model(inputs, outputs)
    return m, (enet, d1, d2)

def reconstruct_model(layers):
    input_shape = (img_width, img_height, 3)
    inputs = tf.keras.Input(shape = input_shape)
    x = layers[0](inputs, training = True)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = layers[1](x)
    outputs = layers[2](x)
    m = tf.keras.models.Model(inputs, outputs)
    return m

def train_model(
        m, train_idx, valid_idx, learning_rate, 
        target_proc = FunctionTransformer(lambda x: x, lambda x: x), 
        batch_size = 32, epochs = 20, step = ''
    ):

    ds_cv_train = create_tf_ds(
        df_spots.iloc[train_idx].pipe(
            lambda x: pd.concat([
                x, x.sample(frac=0.5).assign(
                    left = lambda x: x['left'] + np.random.normal(scale = 10, size = len(x)).astype('int') + np.random.choice([-5, 5], size = len(x)),
                    right = lambda x: x['left'] + img_width,
                    top = lambda x: x['top'] + np.random.normal(scale = 10, size = len(x)).astype('int') + np.random.choice([-5, 5], size = len(x)),
                    bottom = lambda x: x['top'] + img_height,
                ).query('left >= 0 and right <= 2000 and top >= 0 and bottom <= 2000')
            ])
        ).pipe(
            lambda x: pd.concat([
                x, x.sample(n = batch_size - (len(x) % batch_size))
            ])
        )
    ).shuffle(5000).map(
        lambda X, Y: (proc_images(X, images), Y)
    ).map(
        lambda X, Y: (data_augmentation(X), Y)
    ).batch(batch_size).prefetch(tf.data.AUTOTUNE).cache()

    ds_cv_prd = create_tf_ds(
        df_spots.iloc[train_idx]
    ).map(
        lambda X, Y: (proc_images(X, images), Y)
    ).batch(batch_size).prefetch(tf.data.AUTOTUNE).cache()
    
    if valid_idx is not None:
        ds_valid = create_tf_ds(df_spots.iloc[valid_idx]).map(
            lambda X, Y: (proc_images(X, images), Y)
        ).batch(batch_size).prefetch(tf.data.AUTOTUNE).cache()
    else:
        ds_valid = None
    lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
        initial_learning_rate=learning_rate,
        decay_steps=3000,
        alpha=0.1
    )
#    lr_schedule = learning_rate
    m.compile(
        loss = tf.keras.losses.MeanAbsoluteError(),
        optimizer = tf.keras.optimizers.Adam(learning_rate = lr_schedule),
        metrics = [tf.keras.metrics.MeanAbsoluteError()]
    )

    df_true_train = df_spots.iloc[train_idx][targets]
    if valid_idx is not None:
        df_true = df_spots.iloc[valid_idx][targets]
    else:
        df_true = None
    progress_bar = tqdm(total = epochs, desc=step)
    scores_train, scores_valid = list(), list()
    df_prd = None
    for i in range(epochs):
        hist = m.fit(ds_cv_train, epochs = 1, verbose = 0)
        df_prd = pd.DataFrame(
            target_proc.inverse_transform(m.predict(ds_cv_prd, verbose = 0)), 
            index = df_true_train.index, columns = targets
        )
        scores_train.append(
            df_true_train.apply(lambda x: spearmanr(x, df_prd.loc[x.name])[0], axis=1).mean()
        )
        metric = "train coef: {:.4f}".format(scores_train[-1])
        if valid_idx is not None:
            df_prd = pd.DataFrame(
                target_proc.inverse_transform(m.predict(ds_valid, verbose = 0)), 
                index = df_true.index, columns = targets
            )
            scores_valid.append(
                df_true.apply(lambda x: spearmanr(x, df_prd.loc[x.name])[0], axis=1).mean()
            )
            metric = metric + ", valid coef: {:.4f}".format(scores_valid[-1])
        progress_bar.set_postfix_str(metric)
        progress_bar.update(1)
    progress_bar.close()
    return (scores_train, scores_valid), df_prd

# Validation

In [None]:
df_spots = make_img_proc_info(df_spots, img_width, img_height)
df_spots_test = make_img_proc_info(df_spots_test, img_width, img_height)

In [None]:
from sklearn.model_selection import GroupKFold
gkf = GroupKFold(n_splits = 6)
scores = list()
oofs = list()
for i, (train_idx, valid_idx) in enumerate(
    gkf.split(df_spots[['x', 'y']], df_spots[targets], groups = df_spots['slide'])
):
    m, layers = create_model(img_width, img_height)
    score_1, df_prd = train_model(
        m, train_idx, valid_idx, learning_rate = 1e-7, 
        target_proc = target_proc, batch_size = 32, epochs = 10, step = 'train {}'.format(i)
    )
    """
    m = reconstruct_model(layers)
    score_2, df_prd = train_model(
        m, train_idx, valid_idx, learning_rate = 1e-6, 
        target_proc = target_proc, batch_size = 32, epochs = 10, step = 'fine tuning {}'.format(i)
    )
    """
    scores.append(score_1)
    oofs.append(df_prd)
df_oof = pd.concat(oofs, axis = 0)

In [None]:
from scipy.stats import spearmanr
df_oof[targets].groupby(df_spots['slide']).apply(
    lambda x: x.apply(lambda x: spearmanr(df_spots.loc[x.name, targets], x)[0], axis = 1).mean()
)

In [9]:
from scipy.stats import spearmanr
df_oof[targets].apply(
    lambda x: spearmanr(df_spots.loc[x.name, targets], x)[0], axis=1
).mean()

np.float64(0.550668474360639)

In [7]:
joblib.dump((df_oof, scores), 'result/cv_log_std_pca_5_eff_b0_rt.joblib')

['result/cv_log_std_pca_5_eff_b0_rt.joblib']

# Train

In [5]:
img_width = 224
img_height = 224
df_spots = make_img_proc_info(df_spots, img_width, img_height)
df_spots_test = make_img_proc_info(df_spots_test, img_width, img_height)

In [6]:
m, layers = create_model(img_width, img_height)
score_1, df_prd = train_model(
    m, df_spots.index, None, learning_rate = 5e-7, 
    target_proc = target_proc, batch_size = 32, epochs = 10, step = 'train'
)

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


train:   0%|          | 0/10 [00:00<?, ?it/s]

In [7]:
"""
inputs = tf.keras.Input(shape = input_shape)
x = enet(inputs, training = True)
x = tf.keras.layers.Dropout(0.2)(x)
x = d1(x)
outputs = d2(x)
m = tf.keras.models.Model(inputs, outputs)
m.compile(
    optimizer=tf.keras.optimizers.Adam(1e-6),  # Low learning rate
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.MeanSquaredError()],
)
hist = m.fit(ds_train, epochs=10)
"""
''

Epoch 1/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 75ms/step - loss: 0.8789 - mean_squared_error: 0.8789
Epoch 2/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 84ms/step - loss: 0.8748 - mean_squared_error: 0.8748
Epoch 3/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 76ms/step - loss: 0.8705 - mean_squared_error: 0.8705
Epoch 4/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 73ms/step - loss: 0.8666 - mean_squared_error: 0.8666
Epoch 5/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 74ms/step - loss: 0.8645 - mean_squared_error: 0.8645
Epoch 6/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 83ms/step - loss: 0.8619 - mean_squared_error: 0.8619
Epoch 7/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 73ms/step - loss: 0.8578 - mean_squared_error: 0.8578
Epoch 8/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0

In [9]:
joblib.dump(m.get_weights(), 'model/log_std_pca_5_eff_b0.joblib')

['model/log_std_pca_5_eff_b0.joblib']

In [10]:
joblib.dump(target_proc, 'model/target_proc_log_std_pca_5_eff_b0.joblib') 

['model/target_proc_log_std_pca_5_eff_b0.joblib']