In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
import time
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow_addons as tfa
from tensorflow.keras import losses, backend
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
# Report which TensorFlow build this kernel is running.
print('tensorflow ver:', tf.__version__)

# Expose only CUDA device 0 to this process; set before the GPU list is queried below.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
gpu_devices = tf.config.experimental.list_physical_devices('GPU')

# Iterating an empty list prints nothing, so no separate emptiness check is needed.
for gpu_device in gpu_devices:
    print('device available:', gpu_device)

In [None]:
# ---- run configuration -------------------------------------------------------
KAGGLE = False          # True -> read from the Kaggle input directories, False -> local paths
VERS = ['v1', 'v2']     # model versions whose checkpoints are ensembled
PIPES = [1, 2]          # `pipe` argument of get_model used for the matching entry of VERS
CUT = 1e-3              # final predictions are clipped to [CUT, 1 - CUT]

DATA_PATH = '../input/lish-moa' if KAGGLE else './data'
MODELS_PATHS = (
    [f'../input/moa-models-{x}' for x in VERS]
    if KAGGLE
    else [f'./models_{x}' for x in VERS]
)

# Hyper-parameters collected in a single dict (same keys and insertion order as before).
PARAMS = {
    'SEED': 2022,          # random_state handed to the PCA transformers
    'SEEDS': 8,
    'FOLDS': 5,
    'EPOCHS': 200,
    'BATCH_SIZE': 32,      # batch size used for model.predict
    'DECAY': True,
    'PATIENCE': 20,
    'UNITS': 1024,         # width of the first hidden layer
    'DROPOUT': .5,
    'FEAT_IMP': 1,
    'PSEUDO_LBL': False,
    'LBL_SMOOTH': 5e-4,    # label smoothing of the training loss
    'N_COMPS': 48,         # PCA components for g- features (c- features use N_COMPS / 8)
    'THRESHOLD': 1e-2,     # quantile of feature variances used as the variance cutoff
}

# Wall-clock reference for the "time elapsed" reports further down.
start_time = time.time()

In [None]:
# Read the four competition tables from DATA_PATH (same files, same order as before).
train_features, train_targets, test_features, ssubm = (
    pd.read_csv(f'{DATA_PATH}/{table}.csv')
    for table in (
        'train_features',
        'train_targets_scored',
        'test_features',
        'sample_submission',
    )
)
print(
    'train features loaded:', train_features.shape,
    '\ntrain targets loaded:', train_targets.shape,
    '\ntest features loaded:', test_features.shape,
    '\nsubmission loaded:', ssubm.shape,
)

# Feature columns are recognised by their name prefix: 'g-' and 'c-'.
g_cols = train_features.columns[train_features.columns.str.startswith('g-')]
c_cols = train_features.columns[train_features.columns.str.startswith('c-')]
feat_cols = [*g_cols, *c_cols]
print('g-cols:', len(g_cols), '\nc-cols:', len(c_cols))

In [None]:
class VarianceThreshold:
    """Drop feature columns whose variance does not exceed a fixed threshold.

    The columns to drop are decided once in ``fit_transform`` and then
    re-applied unchanged to other frames through ``transform``.
    """

    def __init__(self, threshold):
        # Columns are kept only when their variance is strictly greater than this value.
        self.threshold = threshold

    def fit_transform(self, df, feat_cols):
        """Select the columns to keep from ``feat_cols`` and drop the rest from ``df``.

        Args:
            df: DataFrame containing at least the columns in ``feat_cols``.
            feat_cols: list of candidate feature column names.

        Returns:
            Tuple ``(reduced_df, valid_cols)``: ``df`` without the rejected
            columns, and the surviving names in their ``feat_cols`` order.
        """
        self.df = df  # reference kept so existing code reading `.df` keeps working
        self.var = df[feat_cols].var()
        # Build the keep-set once; the previous version rebuilt the filtered
        # index list for every single feature inside both comprehensions.
        keep = set(self.var.index[self.var > self.threshold])
        self.drop_cols = [col for col in feat_cols if col not in keep]
        self.valid_cols = [col for col in feat_cols if col in keep]
        return df.drop(self.drop_cols, axis=1), self.valid_cols

    def transform(self, df):
        """Drop from ``df`` the columns rejected by the last ``fit_transform`` call."""
        return df.drop(self.drop_cols, axis=1)

def preprocess(df):
    """Return a numerically encoded copy of a raw feature frame.

    * ``cp_type`` is mapped ``trt_cp -> 0`` and ``ctl_vehicle -> 1``;
    * ``cp_dose`` is mapped ``D1 -> 0`` and ``D2 -> 1``;
    * ``sig_id`` is removed;
    * ``cp_time`` is replaced by one ``cp_time_<value>`` indicator column per
      distinct value, appended after the existing columns.

    Values missing from the two mappings become NaN. The input frame is not
    modified.

    Args:
        df: DataFrame with ``sig_id``, ``cp_type``, ``cp_dose`` and ``cp_time`` columns.

    Returns:
        A new DataFrame with the transformations above applied.
    """
    df = df.copy()
    # Plain column assignment replaces the column, so it takes the integer dtype
    # of the mapped values. `df.loc[:, col] = ...` writes into the existing
    # object column on recent pandas and would leave the dtype as object.
    df['cp_type'] = df['cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    df['cp_dose'] = df['cp_dose'].map({'D1': 0, 'D2': 1})
    df = df.drop(columns='sig_id')
    # Pin the indicator dtype: newer pandas defaults to bool, which would turn
    # the frame's `.values` into an object array when mixed with float features.
    time_dummies = pd.get_dummies(
        df['cp_time'], drop_first=False, prefix='cp_time', dtype='uint8'
    )
    return df.join(time_dummies).drop('cp_time', axis=1)

def get_model(num_columns, units=2048, drop=.4, lbl_smooth=.001, pipe=1):
    """Build and compile one of the two feed-forward architectures.

    Both variants are a stack of ``BatchNormalization -> Dropout ->
    WeightNormalization(Dense)`` blocks with ELU hidden activations and a
    206-unit sigmoid output block:

    * ``pipe == 1``: hidden widths ``units`` and ``units / 2``;
    * ``pipe == 2``: hidden widths ``units``, ``units / 2`` and ``units / 4``.

    Args:
        num_columns: number of input features.
        units: width of the first hidden layer.
        drop: dropout rate applied before every dense layer.
        lbl_smooth: label smoothing of the binary cross-entropy loss.
        pipe: architecture selector, 1 or 2.

    Returns:
        A compiled ``tf.keras.Sequential`` model.

    Raises:
        AttributeError: if ``pipe`` is neither 1 nor 2.
    """
    # Resolve the architecture first so an unsupported `pipe` fails before any
    # layer object is created.
    if pipe == 1:
        hidden_widths = [units, int(units / 2)]
    elif pipe == 2:
        hidden_widths = [units, int(units / 2), int(units / 4)]
    else:
        raise AttributeError('Cannot recover attribute for model pipe')

    def _dense_block(width, activation):
        # One repeated unit: BatchNorm -> Dropout -> weight-normalised Dense.
        return [
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(drop),
            tfa.layers.WeightNormalization(
                tf.keras.layers.Dense(width, activation=activation)
            ),
        ]

    # `shape` is given as a tuple, the documented form, instead of a bare int.
    layers = [tf.keras.layers.Input(shape=(num_columns,))]
    for width in hidden_widths:
        layers += _dense_block(width, 'elu')
    # 206 output units -- presumably one per scored target column; confirm
    # against the target table if that ever changes.
    layers += _dense_block(206, 'sigmoid')

    model = tf.keras.Sequential(layers)
    model.compile(
        optimizer=tfa.optimizers.Lookahead(
            tf.keras.optimizers.Adam(learning_rate=0.001),
            sync_period=10
        ),
        loss=losses.BinaryCrossentropy(label_smoothing=lbl_smooth),
        # Unsmoothed binary cross-entropy reported under the name 'score'.
        metrics=[tf.keras.losses.BinaryCrossentropy(name='score')]
    )
    return model

def metric(y_true, y_pred):
    """Mean of the per-target log losses over all columns of ``train_targets``.

    No clipping is applied to the predictions here.

    Args:
        y_true: DataFrame of true labels, indexed by target column name.
        y_pred: DataFrame of predictions with the same columns; values are
            cast to float before scoring.

    Returns:
        The arithmetic mean of the per-column log losses.
    """
    # Imported here because the notebook's import cell never brings `log_loss`
    # into scope; without it this function raised NameError on first use.
    from sklearn.metrics import log_loss

    per_target = [
        log_loss(
            y_true.loc[:, _target],
            y_pred.loc[:, _target].astype(float),
            labels=[0, 1]  # both classes are declared so all-zero columns can be scored
        )
        for _target in train_targets.columns
    ]
    return np.mean(per_target)

def score(y_true, y_pred):
    """Mean binary cross-entropy computed on clipped predictions.

    Predictions are clipped to ``[LBL_SMOOTH, 1 - LBL_SMOOTH]`` (taken from
    ``PARAMS``) before the logarithms are evaluated.
    """
    eps = PARAMS['LBL_SMOOTH']
    clipped = tf.clip_by_value(y_pred, eps, 1 - eps)
    positive_term = y_true * backend.log(clipped)
    negative_term = (1 - y_true) * backend.log(1 - clipped)
    return -backend.mean(positive_term + negative_term)

In [None]:
# basic preprocessing
train = preprocess(train_features)
test = preprocess(test_features)
del train_targets['sig_id']

# Keep only rows with cp_type == 0; the mask is built before `train` itself is
# filtered so that targets and features are reduced with the same rows.
is_treated = train['cp_type'] == 0
train_targets = train_targets.loc[is_treated].reset_index(drop=True)
train = train.loc[is_treated].reset_index(drop=True)

# PCA features: one transformer for the g- columns and one for the c- columns.
# Each is fitted on the train rows only and then applied to test.
n_comps_g = PARAMS['N_COMPS']
n_comps_c = int(PARAMS['N_COMPS'] / 8)
pca_g = PCA(n_components=n_comps_g, random_state=PARAMS['SEED'])
pca_c = PCA(n_components=n_comps_c, random_state=PARAMS['SEED'])
pca_jobs = (
    (pca_g, g_cols, [f'pca_g-{i}' for i in range(n_comps_g)], 'added PCA features:'),
    (pca_c, c_cols, [f'pca_c-{i}' for i in range(n_comps_c)], '\nadded PCA features:'),
)
for pca, source_cols, pca_names, message in pca_jobs:
    train_pca = pd.DataFrame(
        pca.fit_transform(train[source_cols]),
        columns=pca_names,
        index=train.index,
    )
    train = pd.concat((train, train_pca), axis=1)
    test_pca = pd.DataFrame(
        pca.transform(test[source_cols]),
        columns=pca_names,
        index=test.index,
    )
    test = pd.concat((test, test_pca), axis=1)
    # The new component columns become regular features for the later steps.
    feat_cols += pca_names
    print(message, pca_names)

In [None]:
# Fit the min-max scaler on the training features only, then apply the same
# mapping to both frames. (A StandardScaler alternative is left disabled.)
scaler = MinMaxScaler().fit(train[feat_cols])
train[feat_cols] = scaler.transform(train[feat_cols])
test[feat_cols] = scaler.transform(test[feat_cols])

# Summary statistics of the scaled training frame, one row per column.
tmp = train.describe().T
tmp.head()

In [None]:
# The variance cutoff is the PARAMS['THRESHOLD'] quantile of the per-feature variances.
feature_variances = train[feat_cols].var()
threshold = feature_variances.sort_values().quantile(PARAMS['THRESHOLD'])
print('threshold {:.4f}'.format(threshold))
print('features total:', len(feat_cols))

# Decide the surviving columns on train, then drop the same columns from test.
var_thresh = VarianceThreshold(threshold)
train, feat_cols = var_thresh.fit_transform(train, feat_cols)
test = var_thresh.transform(test)
print(f'features total with variance threshold {threshold:.4f}:', len(feat_cols))

elapsed_time = time.time() - start_time
print(f'time elapsed: {elapsed_time // 60:.0f} min {elapsed_time % 60:.0f} sec')

In [None]:
# Ensemble inference: sum the predictions of every checkpoint of every model
# version, average them, clip, and write the submission file.

# Materialise the test matrix once as floats instead of rebuilding `test.values`
# for every checkpoint.
test_values = test.to_numpy(dtype=np.float64)
pred_sum = np.zeros((len(test), len(train_targets.columns)), dtype=np.float64)
n_models = 0  # number of checkpoints that actually contributed to pred_sum

for i_ver, version in enumerate(VERS):
    print('=' * 10, f'version {version}', '=' * 10)
    models_path = MODELS_PATHS[i_ver]
    # Positional column indices (into the test matrix) this version was trained on.
    top_feats = np.load(f'{models_path}/top_feats.npy')
    print('features loaded:', len(top_feats))
    # Sorted so the accumulation order does not depend on directory listing order.
    all_models = sorted(x for x in os.listdir(models_path) if 'seed_' in x)
    version_inputs = test_values[:, top_feats]

    # All checkpoints of one version share the architecture, so the network is
    # built once and only its weights are swapped.
    model = get_model(
        len(top_feats),
        units=PARAMS['UNITS'],
        drop=PARAMS['DROPOUT'],
        lbl_smooth=PARAMS['LBL_SMOOTH'],
        pipe=PIPES[i_ver]
    )
    for model_file in all_models:
        checkpoint_path = f'{models_path}/{model_file}'
        model.load_weights(checkpoint_path)
        pred_sum += model.predict(version_inputs, batch_size=PARAMS['BATCH_SIZE'])
        n_models += 1
        print(f'predict for model file  done: {checkpoint_path}')

if n_models == 0:
    raise RuntimeError(f'no checkpoint files containing "seed_" found in {MODELS_PATHS}')

# Divide by the number of checkpoints actually loaded. The configured product
# is only the expectation and would silently mis-scale the average whenever a
# file is missing or extra.
expected_models = len(VERS) * PARAMS['FOLDS'] * PARAMS['SEEDS']
if n_models != expected_models:
    warnings.warn(
        f'averaging {n_models} checkpoints, but the configuration implies {expected_models}'
    )
ssubm.loc[:, train_targets.columns] = pred_sum / n_models

ssubm.loc[:, ssubm.columns[1:]] = np.clip(ssubm.loc[:, ssubm.columns[1:]], CUT, 1 - CUT)
print('clipped from', np.min(ssubm.min(numeric_only=True)), 'to', max(ssubm.max(numeric_only=True)))
# Rows with cp_type == 1 are set to exactly zero after clipping.
ssubm.loc[test['cp_type'] == 1, train_targets.columns] = 0
ssubm.to_csv('submission.csv', index=False)

# Re-measure here; the earlier value only covered the preprocessing cells.
elapsed_time = time.time() - start_time
print(f'time elapsed: {elapsed_time // 60:.0f} min {elapsed_time % 60:.0f} sec')

In [None]:
import matplotlib.pyplot as plt
def _value_range(frame):
    """Return (max, min) over every column of ``frame`` except the first."""
    values = frame.loc[:, frame.columns[1:]].values
    return np.max(values), np.min(values)


# Load each saved submission and print its largest and smallest prediction.
s = pd.read_csv('submission.csv')
print('s:', *_value_range(s))
s_ = pd.read_csv('submission_.csv')
print('s_:', *_value_range(s_))
s_sc = pd.read_csv('submission_sc.csv')
print('s_sc:', *_value_range(s_sc))
s_sc2 = pd.read_csv('submission_sc2.csv')
print('s_sc2:', *_value_range(s_sc2))
s_sc3 = pd.read_csv('submission_sc3.csv')
print('s_sc3:', *_value_range(s_sc3))
sk = pd.read_csv('submission_kgl.csv')
print('sk:', *_value_range(sk))

In [None]:
# Per-column sum of (sk - other) for each saved submission, one line per comparison.
sk_values = sk.loc[:, sk.columns[1:]].values
comparisons = (
    ('sk-s_', s_),
    ('sk-s_sc', s_sc),
    ('sk-s_sc2', s_sc2),
    ('sk-s_sc3', s_sc3),
    ('sk-s', s),
)

plt.figure(figsize=(16, 4))
for curve_label, other in comparisons:
    other_values = other.loc[:, other.columns[1:]].values
    plt.plot(np.sum(sk_values - other_values, axis=0), label=curve_label)
plt.legend()
plt.show()