In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
import pandas as pd
import numpy as np
import h5py
import joblib

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, StandardScaler
from sklearn.pipeline import make_pipeline

# Data Processing

In [2]:
def _pad_to_canvas(image, canvas_size):
    """Edge-pad an (H, W, C) image on its bottom/right sides up to canvas_size x canvas_size."""
    pad_rows = canvas_size - image.shape[0]
    pad_cols = canvas_size - image.shape[1]
    if pad_rows < 0 or pad_cols < 0:
        # np.pad would also raise ValueError here, but with a message that hides the cause.
        raise ValueError(
            f"image of shape {image.shape} does not fit in a {canvas_size}x{canvas_size} canvas"
        )
    return np.pad(image, [(0, pad_rows), (0, pad_cols), (0, 0)], 'edge')


def load_data(filename, test_sample = 'S_7', canvas_size = 2000):
    """
    Load the slide images and spot tables from an h5 file.

    Parameters:
        filename: str
            Path of the h5 file. It is read through the groups "images/Train",
            "spots/Train", "images/Test" and "spots/Test".
        test_sample: str
            Key of the single test slide to load from the Test groups.
        canvas_size: int
            Every image is edge-padded (bottom/right) to canvas_size x canvas_size
            so that all slides can be stacked into one tensor.
    Returns:
        tf.Tensor, pd.DataFrame, tf.Tensor, pd.DataFrame
        train images, train spot table, test images, test spot table.
        Each spot table carries a `slide` column: the enumeration index of the
        slide for train, and 0 for the single test slide.
    Raises:
        ValueError: if an image is larger than canvas_size along either axis.
    """
    images, spots = list(), list()
    with h5py.File(filename, "r") as h5file:
        # Train slides: one padded image and one spot table per key.
        train_images = h5file["images/Train"]
        train_spots = h5file["spots/Train"]
        for i, slide_name in enumerate(train_images.keys()):
            images.append(_pad_to_canvas(np.array(train_images[slide_name]), canvas_size))
            spots.append(pd.DataFrame(np.array(train_spots[slide_name])).assign(slide = i))

        # Test slide: only `test_sample` is loaded.
        images_test = [
            _pad_to_canvas(np.array(h5file["images/Test"][test_sample]), canvas_size)
        ]
        spots_test = [
            pd.DataFrame(np.array(h5file["spots/Test"][test_sample])).assign(slide = 0)
        ]

    # Run the EfficientNet preprocess_input on both stacks, building the tensors on the CPU device.
    with tf.device('/CPU:0'):
        images = tf.constant(tf.keras.applications.efficientnet.preprocess_input(images))
        images_test = tf.constant(tf.keras.applications.efficientnet.preprocess_input(images_test))

    df_spots = pd.concat(spots)
    df_spots_test = pd.concat(spots_test)
    return images, df_spots, images_test, df_spots_test

def make_img_proc_info(df, img_with, img_height, canvas_size = 2000):
    """
    Add the crop window and padding columns needed to cut a patch around each spot.

    The window is centred on the spot's ('x', 'y') position. The part of the
    window that falls outside the [0, canvas_size] range is recorded as padding
    and the window itself is clipped to that range.

    Parameters:
        df: pd.DataFrame
            Spot table with integer-like 'x' and 'y' columns.
        img_with: int
            Patch width. The (misspelled) name is kept so existing callers keep
            working; the body previously ignored this argument and read a global
            `img_width` instead.
        img_height: int
            Patch height.
        canvas_size: int
            Side length of the padded slide images the windows refer to.
    Returns:
        pd.DataFrame
            Copy of df with the columns left/right (from 'x'), top/bottom
            (from 'y'), clipped to [0, canvas_size], and lpad/rpad/tpad/bpad,
            the amount by which the un-clipped window overshoots each side.
    """
    half_w = img_with // 2
    half_h = img_height // 2
    return df.assign(
        left = lambda x: x['x'] - half_w,
        right = lambda x: x['x'] + half_w,
        top = lambda x: x['y'] - half_h,
        bottom = lambda x: x['y'] + half_h,
        # Overshoot past each border; 0 where the window stays inside the canvas.
        lpad = lambda x: -(x['left'].where(x['left'] < 0, 0)),
        rpad = lambda x: -(canvas_size - x['right']).where(x['right'] > canvas_size, 0),
        tpad = lambda x: -(x['top'].where(x['top'] < 0, 0)),
        bpad = lambda x: -(canvas_size - x['bottom']).where(x['bottom'] > canvas_size, 0)
    ).assign(
        # Clip only after the paddings were derived from the un-clipped bounds.
        left = lambda x: x['left'].clip(0, canvas_size),
        right = lambda x: x['right'].clip(0, canvas_size),
        top = lambda x: x['top'].clip(0, canvas_size),
        bottom = lambda x: x['bottom'].clip(0, canvas_size),
    )

def create_tf_ds(df):
    """
    Wrap the crop-geometry columns of df in a tf.data.Dataset.

    When df contains every column named in the module-level `targets` list the
    dataset yields (features, targets) pairs; otherwise it yields the features
    dict alone.
    """
    has_targets = set(targets) <= set(df.columns)
    geometry_cols = ('left', 'right', 'top', 'bottom', 'slide', 'lpad', 'rpad', 'tpad', 'bpad')
    features = {col: df[col] for col in geometry_cols}
    if not has_targets:
        return tf.data.Dataset.from_tensor_slices(features)
    return tf.data.Dataset.from_tensor_slices((features, df[targets]))

def proc_images(X, images):
    """
    Cut one spot's window out of the image stack and pad it back out.

    X supplies the slide index, the clipped window bounds (left/right on the
    first sliced axis, top/bottom on the second) and the per-side paddings;
    padded cells are filled with the constant 1.
    """
    window = images[X['slide'], X['left']:X['right'], X['top']:X['bottom'], :]
    pad_spec = [
        (X['lpad'], X['rpad']),
        (X['tpad'], X['bpad']),
        (0, 0),  # channel axis is never padded
    ]
    return tf.pad(window, paddings = pad_spec, constant_values=1)

# Random augmentation layers; data_augmentation() applies them in list order.
augmentation_layers = list((
    tf.keras.layers.RandomFlip(mode = "horizontal"),
    tf.keras.layers.RandomRotation(factor = 0.1),
))

def data_augmentation(x):
    """Pass x through every layer of the module-level `augmentation_layers`, in order."""
    augmented = x
    for augment in augmentation_layers:
        augmented = augment(augmented)
    return augmented

# Load the slides and spot tables; every spot column whose name starts with 'C' is a target.
images, df_spots, images_test, df_spots_test = load_data("data/elucidata_ai_challenge_data.h5")
targets = [col for col in df_spots.columns if col.startswith('C')]

# Targets are log-transformed, then standardised. inverse_transform undoes both steps.
target_proc = make_pipeline(
    FunctionTransformer(func = np.log, inverse_func = np.exp),
    StandardScaler(),
)
df_spots[targets] = target_proc.fit(df_spots[targets]).transform(df_spots[targets])

In [3]:
# Patch size in pixels; the model cells build their input_shape from these two values.
img_width, img_height = 224, 224

# Attach the crop window / padding columns to both spot tables.
df_spots, df_spots_test = [
    make_img_proc_info(frame, img_width, img_height)
    for frame in (df_spots, df_spots_test)
]

In [5]:
# Sanity check: list the distinct slide ids present in the training spot table.
df_spots['slide'].unique()

array([0, 1, 2, 3, 4, 5])

In [6]:
from tqdm.notebook import tqdm
class TqdmEpochProgress(tf.keras.callbacks.Callback):
    """
    Keras callback that shows a single tqdm progress bar over epochs.

    The bar advances once per epoch and its postfix shows that epoch's `loss`
    (and `val_loss` when the logs contain it).

    Parameters:
        epochs: int
            Total number of epochs; used as the bar's `total`.
    """
    def __init__(self, epochs):
        super().__init__()
        self.epochs = epochs
        self.progress_bar = None

    def on_train_begin(self, logs=None):
        # Close a bar left over from an earlier fit() call before opening a new one.
        if self.progress_bar is not None:
            self.progress_bar.close()
        self.progress_bar = tqdm(total=self.epochs, desc="Epochs")

    def on_epoch_end(self, epoch, logs=None):
        if self.progress_bar is None:
            return
        # `logs` defaults to None and may lack either key; only format what is there.
        logs = logs or {}
        shown = [
            f"{key}: {logs[key]:.4f}"
            for key in ('loss', 'val_loss')
            if logs.get(key) is not None
        ]
        if shown:
            self.progress_bar.set_postfix_str(", ".join(shown))
        self.progress_bar.update(1)

    def on_train_end(self, logs=None):
        if self.progress_bar is not None:
            self.progress_bar.close()

# Validation

In [39]:
batch_size = 32

# Cross-validation split: slide 5 is held out, all other slides are used for training.
# The training table is topped up with randomly re-sampled rows so its length is a
# multiple of batch_size (presumably so every batch is full); nothing is added when
# it already is a multiple.
df_cv_train = df_spots.loc[df_spots['slide'] != 5].pipe(
    lambda x: pd.concat([
        x, x.sample(n = (-len(x)) % batch_size)
    ])
)

# Shuffle the lightweight spot records (re-drawn every epoch), then crop and augment.
# No cache() after shuffle/augmentation: a cache there would replay the first epoch's
# order and random augmentations on every later epoch. prefetch() comes last.
ds_cv_train = create_tf_ds(df_cv_train).shuffle(len(df_cv_train)).map(
    lambda X, Y: (proc_images(X, images), Y),
    num_parallel_calls = tf.data.AUTOTUNE
).map(
    lambda X, Y: (data_augmentation(X), Y)
).batch(batch_size).prefetch(tf.data.AUTOTUNE)

# The validation pipeline is deterministic, so its batches can safely be cached.
ds_valid = create_tf_ds(df_spots.loc[df_spots['slide'] == 5]).map(
    lambda X, Y: (proc_images(X, images), Y)
).batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)

In [55]:
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error

# Stage 1 (validation run): EfficientNetB0 backbone + small regression head,
# trained on ds_cv_train and scored on the held-out slide after every epoch.
input_shape = (img_width, img_height, 3)
# ImageNet-initialised backbone without its classification top; 'avg' pooling
# makes it output one feature vector per image.
enet = tf.keras.applications.EfficientNetB0(
    include_top = False, 
    weights = 'imagenet',
    input_shape = input_shape,
    pooling = 'avg'
)
inputs = tf.keras.Input(shape = input_shape)
# NOTE(review): the backbone is called with training=False, but enet.trainable is
# never set to False in this notebook, so its weights are presumably still updated
# in this stage -- confirm whether the backbone was meant to be frozen here.
x = enet(inputs, training = False)
x = tf.keras.layers.Dropout(0.2)(x)
# d1 / d2 are kept as named layers because the fine-tuning cell reuses them.
d1 = tf.keras.layers.Dense(64, activation = 'relu', kernel_initializer = 'HeUniform')
x = d1(x)
# One linear output per target column.
d2 = tf.keras.layers.Dense(len(targets))
outputs = d2(x)
m = tf.keras.models.Model(inputs, outputs)

# Cosine decay from 3e-6 down to alpha * 3e-6 over 5000 optimizer steps.
lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=3e-6,
    decay_steps=5000,
    alpha=0.1
)

m.compile(
    loss = tf.keras.losses.MeanSquaredError(),
    optimizer = tf.keras.optimizers.Adam(learning_rate = lr_schedule),
    metrics = [tf.keras.metrics.MeanSquaredError()]
)
# Ground truth of the held-out slide, mapped back through target_proc's inverse
# transform so metrics are computed on the original target scale.
df_true = pd.DataFrame(
    target_proc.inverse_transform(
        df_spots.loc[df_spots['slide'] == 5, targets]
    ), index = df_spots[df_spots['slide'] == 5].index, columns = targets
)
# Train one epoch at a time so the validation metrics can be printed after each epoch.
# `hist` is overwritten on every iteration and only holds the last epoch's history.
for i in range(30):
    hist = m.fit(ds_cv_train, epochs = 1)
    df_prd = pd.DataFrame(
        target_proc.inverse_transform(m.predict(ds_valid)), index = df_spots[df_spots['slide'] == 5].index, columns = targets
    )
    # Printed values: mean over spots of the per-row Spearman correlation (across
    # targets), then the MSE over all spot/target values.
    print(
        df_true.apply(lambda x: spearmanr(x, df_prd.loc[x.name])[0], axis=1).mean(),
        mean_squared_error(df_true.stack(), df_prd.stack())
    )

[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 76ms/step - loss: 1.1092 - mean_squared_error: 1.1092
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 342ms/step
0.518999081294782 0.33009170559704354
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 76ms/step - loss: 1.0521 - mean_squared_error: 1.0521
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
0.5207991733278677 0.3290757018812078
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 75ms/step - loss: 1.0219 - mean_squared_error: 1.0219
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
0.5601591856254697 0.32236360870606423
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 84ms/step - loss: 0.9986 - mean_squared_error: 0.9986
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
0.5699605451936872 0.3200590939559671
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m

# Fine Tuning


In [58]:
# Stage 2 (validation run): rebuild the model around the SAME enet, d1 and d2
# objects from the previous cell, so training continues from their current weights.
# This cell therefore requires the previous model cell to have been run first.
inputs = tf.keras.Input(shape = input_shape)
# NOTE(review): training=True presumably also switches the backbone's
# BatchNormalization layers to training mode; the Keras transfer-learning guide
# recommends training=False while fine-tuning -- confirm this is intended.
x = enet(inputs, training = True)
x = tf.keras.layers.Dropout(0.2)(x)
x = d1(x)
outputs = d2(x)
m = tf.keras.models.Model(inputs, outputs)
# Fresh Adam optimizer with a constant learning rate (no schedule in this stage).
m.compile(
    optimizer=tf.keras.optimizers.Adam(3e-6),  # Low learning rate
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.MeanSquaredError()],
)
# Same per-epoch evaluation as the previous cell; relies on df_true defined there.
for i in range(10):
    hist = m.fit(ds_cv_train, epochs=1)
    df_prd = pd.DataFrame(
        target_proc.inverse_transform(m.predict(ds_valid)), index = df_spots[df_spots['slide'] == 5].index, columns = targets
    )
    # Printed values: mean per-row Spearman correlation, then overall MSE (original scale).
    print(
        df_true.apply(lambda x: spearmanr(x, df_prd.loc[x.name])[0], axis=1).mean(),
        mean_squared_error(df_true.stack(), df_prd.stack())
    )

[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 78ms/step - loss: 0.8075 - mean_squared_error: 0.8075
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 301ms/step
0.5723167315706771 0.310563293689337
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 76ms/step - loss: 0.8002 - mean_squared_error: 0.8002
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
0.5730656896905104 0.31008050635853646
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 84ms/step - loss: 0.7902 - mean_squared_error: 0.7902
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
0.5714456514313042 0.3101638124620468
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 76ms/step - loss: 0.7807 - mean_squared_error: 0.7807
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
0.5698879551820728 0.3100369421991883
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 

# Train

In [59]:
batch_size = 32

# Final training set: all slides, topped up with randomly re-sampled rows so the
# length is a multiple of batch_size (nothing is added when it already is one).
df_train_all = df_spots.pipe(
    lambda x: pd.concat([
        x, x.sample(n = (-len(x)) % batch_size)
    ])
)

# Shuffle the lightweight spot records (re-drawn every epoch), then crop and augment.
# No cache() after shuffle/augmentation: a cache there would replay the first epoch's
# order and random augmentations on every later epoch. prefetch() comes last.
ds_train = create_tf_ds(df_train_all).shuffle(len(df_train_all)).map(
    lambda X, Y: (proc_images(X, images), Y),
    num_parallel_calls = tf.data.AUTOTUNE
).map(
    lambda X, Y: (data_augmentation(X), Y)
).batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Final model, stage 1: same architecture and optimizer settings as the validation
# run, but built from a fresh ImageNet-initialised backbone and trained on ds_train.
input_shape = (img_width, img_height, 3)
enet = tf.keras.applications.EfficientNetB0(
    include_top = False, 
    weights = 'imagenet',
    input_shape = input_shape,
    pooling = 'avg'
)
inputs = tf.keras.Input(shape = input_shape)
# NOTE(review): enet.trainable is never set to False, so the backbone weights are
# presumably updated in this stage too -- confirm whether freezing was intended.
x = enet(inputs, training = False)
x = tf.keras.layers.Dropout(0.2)(x)
# New head layers; these rebind d1 / d2, which the next cell reuses.
d1 = tf.keras.layers.Dense(64, activation = 'relu', kernel_initializer = 'HeUniform')
x = d1(x)
d2 = tf.keras.layers.Dense(len(targets))
outputs = d2(x)
m = tf.keras.models.Model(inputs, outputs)

# Cosine decay from 3e-6 down to alpha * 3e-6 over 5000 optimizer steps.
lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=3e-6,
    decay_steps=5000,
    alpha=0.1
)

m.compile(
    loss = tf.keras.losses.MeanSquaredError(),
    optimizer = tf.keras.optimizers.Adam(learning_rate = lr_schedule),
    metrics = [tf.keras.metrics.MeanSquaredError()]
)
# No validation data here: every slide is part of ds_train.
hist = m.fit(ds_train, epochs = 30)

Epoch 1/30
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 186ms/step - loss: 1.1314 - mean_squared_error: 1.1314
Epoch 2/30
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 78ms/step - loss: 1.0609 - mean_squared_error: 1.0609
Epoch 3/30
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 77ms/step - loss: 1.0237 - mean_squared_error: 1.0237
Epoch 4/30
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 77ms/step - loss: 1.0044 - mean_squared_error: 1.0044
Epoch 5/30
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 85ms/step - loss: 0.9836 - mean_squared_error: 0.9836
Epoch 6/30
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 76ms/step - loss: 0.9676 - mean_squared_error: 0.9676
Epoch 7/30
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 76ms/step - loss: 0.9550 - mean_squared_error: 0.9550
Epoch 8/30
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[

In [None]:
# Final model, stage 2: rebuild around the same enet, d1 and d2 objects from the
# previous cell so training continues from their current weights.
inputs = tf.keras.Input(shape = input_shape)
# NOTE(review): training=True presumably also puts the backbone's
# BatchNormalization layers in training mode -- confirm this is intended.
x = enet(inputs, training = True)
x = tf.keras.layers.Dropout(0.2)(x)
x = d1(x)
outputs = d2(x)
m = tf.keras.models.Model(inputs, outputs)
# NOTE(review): this stage uses 1e-6, whereas the validation fine-tuning cell used
# 3e-6 -- confirm the difference is deliberate.
m.compile(
    optimizer=tf.keras.optimizers.Adam(1e-6),  # Low learning rate
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.MeanSquaredError()],
)
hist = m.fit(ds_train, epochs=10)

In [None]:
# Persist the trained weights. get_weights must be *called*: passing the bound
# method itself would pickle the method object instead of the weight arrays.
os.makedirs('model', exist_ok=True)  # the dump fails if the target directory is missing
joblib.dump(m.get_weights(), 'model/eff_b0_2.joblib')

In [None]:
# Persist the fitted target pipeline; it is needed to map predictions back to the
# original target scale.
os.makedirs('model', exist_ok=True)  # the dump fails if the target directory is missing
joblib.dump(target_proc, 'model/target_proc_2.joblib')

In [10]:
# Inference on the test slide. The map lambda takes a single argument, i.e. the
# dataset is expected to yield feature dicts only (presumably df_spots_test has no
# target columns).
ds_test = create_tf_ds(df_spots_test)
test_batches = ds_test.map(lambda X: proc_images(X, images_test)).batch(32)

# Predict in the scaled target space, then map back to the original scale.
pred_scaled = m.predict(test_batches)
pred_original = target_proc.inverse_transform(pred_scaled)

# The positional row index becomes the submission 'ID' column.
df_submission = pd.DataFrame(pred_original, columns = targets)
df_submission = df_submission.reset_index().rename(columns = {'index': 'ID'})

[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 124ms/step


In [11]:
# Display the submission table: one row per test spot, an 'ID' column plus one column per target.
df_submission

Unnamed: 0,ID,C1,C2,C3,C4,C5,C6,C7,C8,C9,...,C26,C27,C28,C29,C30,C31,C32,C33,C34,C35
0,0,0.077941,0.043652,0.175470,0.011950,0.098215,0.003235,0.012892,0.001753,0.003514,...,0.003245,0.001521,0.000470,0.000453,0.000523,0.034370,0.043636,0.000712,0.001227,0.018761
1,1,0.031405,0.039082,0.051170,0.006747,0.089328,0.002896,0.013572,0.005587,0.002254,...,0.002828,0.000331,0.000757,0.000414,0.000312,0.031395,0.042351,0.003108,0.000844,0.023202
2,2,0.073687,0.042339,0.057946,0.003385,0.036626,0.001122,0.014627,0.003375,0.003865,...,0.002439,0.000503,0.000630,0.000439,0.000582,0.038751,0.035186,0.000895,0.002870,0.016234
3,3,0.080863,0.043887,0.313371,0.016495,0.083784,0.003541,0.015294,0.001456,0.001234,...,0.003949,0.000897,0.000696,0.001236,0.000171,0.037267,0.056375,0.000779,0.002152,0.011728
4,4,0.019779,0.047874,0.048455,0.007821,0.024612,0.001668,0.003994,0.009856,0.000440,...,0.003381,0.000609,0.000508,0.000324,0.001303,0.026807,0.032900,0.001654,0.000868,0.016599
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2083,2083,0.049777,0.055029,0.062455,0.016545,0.292763,0.002689,0.019779,0.001736,0.003108,...,0.002948,0.004000,0.000985,0.001082,0.000443,0.026539,0.054108,0.001340,0.000897,0.038822
2084,2084,0.035788,0.043603,0.088948,0.006203,0.371941,0.006795,0.045683,0.000713,0.056270,...,0.002693,0.005886,0.001566,0.002723,0.000165,0.047275,0.072403,0.000397,0.002130,0.019584
2085,2085,0.062852,0.045850,0.115488,0.027797,0.073778,0.001217,0.021061,0.002774,0.002420,...,0.000932,0.000643,0.000667,0.001292,0.000235,0.038626,0.017400,0.001952,0.001073,0.014109
2086,2086,0.016944,0.058467,0.063099,0.077487,0.144921,0.001477,0.017023,0.004665,0.003143,...,0.001426,0.000226,0.000780,0.001270,0.000599,0.033640,0.044233,0.001922,0.000357,0.018170
