In [29]:
import pandas as pd
import numpy as np
import h5py

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.decomposition import PCA

In [30]:
# Load training data: one table per slide, each tagged with the slide it came from.
# np.array(...) reads every dataset while the file is still open, so the tables
# stay usable after the `with` block has closed the HDF5 file.
with h5py.File("data/elucidata_ai_challenge_data.h5", "r") as f:
    train_spots = f["spots/Train"]
    train_spot_tables = {
        slide_name: pd.DataFrame(np.array(train_spots[slide_name])).assign(slide=slide_name)
        for slide_name in train_spots.keys()
    }

# Combine all training slides into a single frame with a fresh 0..n-1 index
df_spots = pd.concat(train_spot_tables.values(), ignore_index=True)

# Extract features and target labels
X = ['x', 'y']  # Use spatial coordinates
targets = [col for col in df_spots.columns if col.startswith('C')]

# Target pre-processing: log -> standardise -> PCA down to `n_components` scores.
# np.exp is registered as the inverse of np.log so that
# target_proc.inverse_transform() maps PCA scores back to the original target scale.
# NOTE(review): np.log assumes every target value is strictly positive — confirm.
n_components = 5
target_proc = make_pipeline(
    FunctionTransformer(func=np.log, inverse_func=np.exp),
    StandardScaler(),
    # PCA's default 'auto' solver may choose a randomized SVD; seed it so the
    # components (and everything derived from them) are reproducible.
    PCA(n_components=n_components, random_state=0),
)
# Fit exactly once (the pipeline used to be fitted twice on the same data).
# NOTE(review): this is fitted on all slides, including any slide later held
# out for validation — confirm that is intended.
target_proc.fit(df_spots[targets])

# Append the PCA scores of the targets as columns pca_0 .. pca_{n_components-1}
targets2 = ['pca_{}'.format(i) for i in range(n_components)]
df_spots = df_spots.join(
    pd.DataFrame(
        target_proc.transform(df_spots[targets]),
        index=df_spots.index,
        columns=targets2,
    )
)

In [31]:
from sklearn.linear_model import RANSACRegressor

# Hold out slide S_6 for validation and train on every other slide.
is_holdout = df_spots['slide'] == 'S_6'
df_cv_train = df_spots.loc[~is_holdout]
df_valid = df_spots.loc[is_holdout]

# RANSAC fits on randomly drawn subsets of the rows; without a fixed seed every
# run yields a different model, so the scores computed from it cannot be reproduced.
r_reg = RANSACRegressor(random_state=0)
r_reg.fit(df_cv_train[X], df_cv_train[targets2])

In [32]:
from scipy.stats import spearmanr

# Training rows: map the predicted PCA scores back to the original target space,
# keeping the row index so each prediction can be matched to its observed row.
pred_train = pd.DataFrame(
    target_proc.inverse_transform(r_reg.predict(df_cv_train[X])),
    index=df_cv_train.index,
    columns=targets,
)
# Spearman correlation between predicted and observed targets within each row,
# averaged over all rows.
rho_train = pred_train.apply(
    lambda row: spearmanr(row, df_cv_train.loc[row.name, targets])[0],
    axis=1,
)
print(rho_train.mean())

# Same score for the rows of df_valid; left as the cell's last expression so the
# notebook displays it.
pred_valid = pd.DataFrame(
    target_proc.inverse_transform(r_reg.predict(df_valid[X])),
    index=df_valid.index,
    columns=targets,
)
rho_valid = pred_valid.apply(
    lambda row: spearmanr(row, df_valid.loc[row.name, targets])[0],
    axis=1,
)
rho_valid.mean()

0.4376560198945555


0.44669758147161304

In [33]:
def _rescale_to_unit_range(coords):
    """Linearly map each column so its minimum becomes -1 and its maximum +1.

    A column whose values are all equal has max == min, so the division is
    0/0 and that column comes out as NaN.
    """
    lo, hi = coords.min(), coords.max()
    return (coords - lo) * 2 / (hi - lo) - 1


# Rescale x / y separately within every slide and store them as x_mm / y_mm.
coords_mm = (
    df_spots.groupby('slide')[['x', 'y']]
    .transform(_rescale_to_unit_range)
    .rename(columns=lambda col: col + '_mm')
)

# Drop any x_mm / y_mm left over from an earlier run of this cell before joining:
# join() raises "columns overlap but no suffix specified" when the columns
# already exist, which made the cell fail on its second execution.
df_spots = df_spots.drop(columns=coords_mm.columns, errors='ignore').join(coords_mm)

In [34]:
from sklearn.linear_model import RANSACRegressor
from scipy.stats import spearmanr


def mean_spotwise_spearman(df, feature_cols, target_cols, model, target_pipeline):
    """Mean per-row Spearman correlation between predicted and observed targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to score; must contain ``feature_cols`` and ``target_cols``.
    feature_cols : list of str
        Columns passed to ``model.predict``.
    target_cols : list of str
        Observed target columns; also used to label the back-transformed predictions.
    model : fitted estimator
        Predicts in the transformed target space.
    target_pipeline : fitted transformer
        Its ``inverse_transform`` maps predictions back to the space of ``target_cols``.

    Returns
    -------
    float
        Average over the rows of ``df`` of the Spearman correlation between the
        predicted and the observed target vector of that row.
    """
    predicted = pd.DataFrame(
        target_pipeline.inverse_transform(model.predict(df[feature_cols])),
        index=df.index,
        columns=target_cols,
    )
    per_row = predicted.apply(
        lambda row: spearmanr(row, df.loc[row.name, target_cols])[0],
        axis=1,
    )
    return per_row.mean()


X = ['x_mm', 'y_mm']
# RANSAC fits on randomly drawn subsets of the rows; seed it so the per-fold
# scores printed below are reproducible from one run to the next.
r_reg = RANSACRegressor(random_state=0)
coef_train, coef_valid = [], []

# Leave-one-slide-out cross-validation: each slide is the validation set once.
for i in df_spots['slide'].unique():
    is_valid = df_spots['slide'] == i
    df_cv_train = df_spots.loc[~is_valid]
    df_valid = df_spots.loc[is_valid]
    r_reg.fit(df_cv_train[X], df_cv_train[targets2])

    coef_train.append(mean_spotwise_spearman(df_cv_train, X, targets, r_reg, target_proc))
    coef_valid.append(mean_spotwise_spearman(df_valid, X, targets, r_reg, target_proc))
    print(i, coef_train[-1], coef_valid[-1])

S_1 0.49113212254993033 0.4052636074912441
S_2 0.3166313473900829 0.40583348758005405
S_3 0.4045952053624688 0.4145029026103196
S_4 0.48745812203686284 0.21108649019843828
S_5 0.5043200313698621 0.019683842529259764
S_6 0.3604943186600161 0.6606092436974791
