In [None]:
import json
import os
import time
import warnings

# Install the filter before the heavy third-party imports so their FutureWarnings are silenced.
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow_addons as tfa
from tensorflow.keras import losses, backend
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss
# Report the TensorFlow build and the GPUs it can see.
print('tensorflow ver:', tf.__version__)
# NOTE(review): this is set after `import tensorflow`; presumably it still takes
# effect because no GPU has been queried yet -- confirm on the target runtime.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
# Looping over an empty list is a no-op, so no separate emptiness check is needed.
for gpu_device in gpu_devices:
    print('device available:', gpu_device)

In [None]:
# Run configuration: where the data and the per-version model folders live.
KAGGLE = False
VERS = ['v40', 'v41', 'v42', 'v43',
        'v50', 'v51', 'v52', 'v53']
# Predictions are clipped to [CUT, 1 - CUT] before the submission is written.
CUT = 1e-3
DATA_PATH = '../input/lish-moa' if KAGGLE else './data'
MODELS_PATHS = [
    f'../input/moa-models-{x}' if KAGGLE else f'./models_{x}'
    for x in VERS
]
# Parameters of the first version; the inference loop reloads them per version.
with open(f'{MODELS_PATHS[0]}/params.json') as file:
    PARAMS = json.load(file)
print('params loaded:', PARAMS)

# Reference point for the "time elapsed" messages printed later.
start_time = time.time()

In [None]:
# Load the raw competition tables from DATA_PATH.
train_features_raw = pd.read_csv(f'{DATA_PATH}/train_features.csv')
# The targets frame keeps only the target columns; the id column is dropped.
train_targets_raw = pd.read_csv(f'{DATA_PATH}/train_targets_scored.csv').drop(columns='sig_id')
test_features_raw = pd.read_csv(f'{DATA_PATH}/test_features.csv')
ssubm = pd.read_csv(f'{DATA_PATH}/sample_submission.csv')
loaded_report = [
    'train features loaded:', train_features_raw.shape,
    '\ntrain targets loaded:', train_targets_raw.shape,
    '\ntest features loaded:', test_features_raw.shape,
    '\nsubmission loaded:', ssubm.shape,
]
print(*loaded_report)

In [None]:
class VarianceThreshold:
    """Drop feature columns whose variance does not exceed a fixed threshold.

    The selection is learned once in `fit_transform` and replayed on other
    frames with `transform`.

    Attributes set by `fit_transform`:
        df: the frame passed to `fit_transform` (kept by reference).
        var: per-column variance of the candidate features.
        drop_cols: candidate features with variance <= threshold (or NaN).
        valid_cols: candidate features with variance > threshold.
    """

    def __init__(self, threshold):
        """Store the variance cut-off; a column is kept only if var > threshold."""
        self.threshold = threshold

    def fit_transform(self, df, feat_cols):
        """Select features by variance on `df` and return the reduced frame.

        Args:
            df: DataFrame containing at least the columns in `feat_cols`.
            feat_cols: list of candidate feature column names.

        Returns:
            Tuple `(df_without_dropped_columns, valid_cols)`; `valid_cols`
            preserves the order of `feat_cols`.
        """
        self.df = df
        self.var = self.df[feat_cols].var()
        # Build the lookup once: the original rebuilt this list for every
        # feature inside both comprehensions (quadratic in the feature count).
        keep = set(self.var[self.var > self.threshold].index)
        self.drop_cols = [x for x in feat_cols if x not in keep]
        self.valid_cols = [x for x in feat_cols if x in keep]
        return self.df.drop(self.drop_cols, axis=1), self.valid_cols

    def transform(self, df):
        """Drop from `df` the columns rejected by the last `fit_transform` call."""
        return df.drop(self.drop_cols, axis=1)

def preprocess(df):
    """Encode the categorical columns of a features frame numerically.

    Works on a copy, so the caller's frame is not modified.

    * `cp_type` is mapped to 0 (`trt_cp`) / 1 (`ctl_vehicle`).
    * `cp_dose` is mapped to 0 (`D1`) / 1 (`D2`).
    * `sig_id` is removed.
    * `cp_time` is replaced by one-hot `cp_time_<value>` columns appended
      after the existing columns.

    Values missing from the mappings become NaN (behaviour of `Series.map`).

    Args:
        df: DataFrame with `sig_id`, `cp_type`, `cp_time` and `cp_dose` columns.

    Returns:
        A new DataFrame with the encoded columns.
    """
    df = df.copy()
    # Plain column assignment replaces the column, so the mapped integers get
    # a numeric dtype. `df.loc[:, col] = ...` writes in place on recent pandas
    # and would leave the column as `object`, which makes `df.values` an
    # object array downstream.
    df['cp_type'] = df['cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    df['cp_dose'] = df['cp_dose'].map({'D1': 0, 'D2': 1})
    del df['sig_id']
    # Pin the dummy dtype: newer pandas defaults to bool, and bool mixed with
    # float columns also turns `df.values` into an object array.
    df = df.join(pd.get_dummies(df['cp_time'], drop_first=False, prefix='cp_time', dtype=np.uint8))
    df = df.drop('cp_time', axis=1)
    return df
    
def _append_pca_features(train, test, cols, prefix, r_comps, seed):
    """Fit PCA on `train[cols]` and append its components to both frames.

    Returns `(train, test, names)` where `names` are the new column names
    `<prefix>-0 ... <prefix>-(k-1)` and `k = int(len(cols) / r_comps)`.
    """
    n_comps = int(len(cols) / r_comps)
    names = [f'{prefix}-{i}' for i in range(n_comps)]
    pca = PCA(n_components=n_comps, random_state=seed)
    # Fit on train only; test is projected with the same components.
    train_pca = pd.DataFrame(pca.fit_transform(train[cols]), columns=names, index=train.index)
    test_pca = pd.DataFrame(pca.transform(test[cols]), columns=names, index=test.index)
    train = pd.concat((train, train_pca), axis=1)
    test = pd.concat((test, test_pca), axis=1)
    return train, test, names


def pca_train_test(train, test, feat_cols, g_cols, c_cols, r_comps, seed):
    """Append PCA components of the g- and c- feature groups to train and test.

    Each group gets its own PCA with `int(len(group) / r_comps)` components,
    fitted on `train` and applied to `test`. New columns are named
    `pca_g-<i>` and `pca_c-<i>`.

    Args:
        train, test: DataFrames containing the `g_cols` and `c_cols` columns.
        feat_cols: list of feature names; it is extended IN PLACE with the new
            column names (and also returned).
        g_cols, c_cols: column labels of the two feature groups.
        r_comps: divisor applied to the group size to get the component count.
        seed: `random_state` passed to both PCA instances.

    Returns:
        Tuple `(train, test, feat_cols)` with the PCA columns appended.
    """
    # pca-g feature PCA
    train, test, g_names = _append_pca_features(train, test, g_cols, 'pca_g', r_comps, seed)
    feat_cols += g_names
    print('added PCA features:', g_names)

    # pca-c feature PCA
    train, test, c_names = _append_pca_features(train, test, c_cols, 'pca_c', r_comps, seed)
    feat_cols += c_names
    print('\nadded PCA features:', c_names)
    return train, test, feat_cols

def get_feats_stats(df, g_cols, c_cols):
    """Add row-wise summary statistics of the g-, c- and combined features.

    For each group (`g`, `c`, `gc` = g followed by c) the columns
    `stats_<group>_{sum,mean,std,kurt,skew}` are written into `df` itself,
    which is also returned.

    Args:
        df: DataFrame holding the `g_cols` and `c_cols` columns (modified in place).
        g_cols, c_cols: column labels of the two groups; both must provide
            `.to_list()` (e.g. a pandas Index).

    Returns:
        The same `df` object with the 15 statistics columns added.
    """
    column_groups = (
        ('g', g_cols),
        ('c', c_cols),
        ('gc', g_cols.to_list() + c_cols.to_list()),
    )
    # (column suffix, DataFrame method computing the statistic)
    reducers = (
        ('sum', 'sum'),
        ('mean', 'mean'),
        ('std', 'std'),
        ('kurt', 'kurtosis'),
        ('skew', 'skew'),
    )
    for tag, cols in column_groups:
        for suffix, method_name in reducers:
            df[f'stats_{tag}_{suffix}'] = getattr(df[cols], method_name)(axis=1)
    return df

In [None]:
def get_train_test(train_features, train_targets, test_features,  
                   r_comps, q_flag, pipe_scaler, 
                   seed):
    """Build the model-ready train/test frames for one model version.

    Pipeline, in order:
      1. optional quantile transform of the `g-`/`c-` columns (fit on train);
      2. `preprocess` on both frames;
      3. train rows (and targets) restricted to `cp_type == 0`;
      4. PCA features (`pca_train_test`) and row statistics (`get_feats_stats`);
      5. scaling of all feature columns (fit on train);
      6. variance-threshold selection, with the threshold taken as the
         `PARAMS['THRESHOLD']` quantile of the train feature variances.

    Reads the module-level `PARAMS` (key `THRESHOLD`) and `start_time`
    (for the elapsed-time message).

    Args:
        train_features, test_features: raw feature frames.
        train_targets: targets frame, row-aligned by position with `train_features`.
        r_comps: divisor for the number of PCA components per feature group.
        q_flag: truthy to apply the quantile transform.
        pipe_scaler: 1 = RobustScaler, 2 = MinMaxScaler, 3 = StandardScaler.
        seed: random state for the quantile transform and PCA.

    Returns:
        Tuple `(train, train_targets, test, feat_cols)`.

    Raises:
        ValueError: if `pipe_scaler` is not 1, 2 or 3.
    """
    # Validate up front: previously an unknown code left `scaler` unbound and
    # only failed after the quantile transform and PCA had already run.
    scaler_classes = {1: RobustScaler, 2: MinMaxScaler, 3: StandardScaler}
    if pipe_scaler not in scaler_classes:
        raise ValueError(
            f'unknown pipe_scaler {pipe_scaler!r}; expected one of {sorted(scaler_classes)}'
        )

    g_cols = train_features.columns[train_features.columns.str.startswith('g-')]
    c_cols = train_features.columns[train_features.columns.str.startswith('c-')]
    feat_cols = g_cols.to_list() + c_cols.to_list()
    print('g-cols:', len(g_cols), '| c-cols:', len(c_cols))
    
    if q_flag:
        qtrans = QuantileTransformer(n_quantiles=100, random_state=seed, output_distribution='normal')
        # Keep the source index on the transformed block: concat(axis=1) aligns
        # on index, so a default RangeIndex would misalign non-default indexes.
        train_features = pd.concat([
            train_features.drop(columns=feat_cols), 
            pd.DataFrame(qtrans.fit_transform(train_features[feat_cols]),
                         columns=feat_cols, index=train_features.index)], axis=1)
        test_features = pd.concat([
            test_features.drop(columns=feat_cols),
            pd.DataFrame(qtrans.transform(test_features[feat_cols]),
                         columns=feat_cols, index=test_features.index)], axis=1)

    train = preprocess(train_features)
    test = preprocess(test_features)
    # Positional mask: targets are matched to features by row position, so the
    # filter must not depend on the two frames sharing index labels.
    treated = (train['cp_type'] == 0).values
    train_targets = train_targets.loc[treated].reset_index(drop=True)
    train = train.loc[treated].reset_index(drop=True)
    
    train, test, feat_cols = pca_train_test(train, test, feat_cols, 
                                            g_cols, c_cols,
                                            r_comps, seed)
    train = get_feats_stats(train, g_cols, c_cols)
    test = get_feats_stats(test, g_cols, c_cols)
    feat_cols.extend([x for x in train.columns if 'stats_' in x])
    print('features:', len(feat_cols))
    
    scaler = scaler_classes[pipe_scaler]()
    train[feat_cols] = scaler.fit_transform(train[feat_cols])
    test[feat_cols] = scaler.transform(test[feat_cols])
    
    # A quantile does not depend on element order, so no sort is needed here.
    threshold = train[feat_cols].var().quantile(PARAMS['THRESHOLD'])
    print('threshold {:.4f}'.format(threshold))
    print('features total:', len(feat_cols))
    var_thresh = VarianceThreshold(threshold)
    train, feat_cols = var_thresh.fit_transform(train, feat_cols)
    test = var_thresh.transform(test)
    print(f'features total with variance threshold {threshold:.4f}:', len(feat_cols))
    elapsed_time = time.time() - start_time
    print(f'time elapsed: {elapsed_time // 60:.0f} min {elapsed_time % 60:.0f} sec')
    
    return train, train_targets, test, feat_cols

In [None]:
def get_model(num_columns, num_columns_rs=0, units=1024, drop=.4, lbl_smooth=.001, pipe=1,
              num_targets=206):
    """Build and compile one of the four feed-forward architectures.

    Every architecture is a `Sequential` stack of
    `BatchNormalization -> Dropout -> WeightNormalization(Dense)` blocks:
    hidden blocks use `elu`, the final block has `num_targets` sigmoid units.

    Hidden widths / dropout rate per `pipe`:
        0: [units]                              / drop
        1: [units, units/2]                     / drop
        2: [units, units/2, units/4]            / drop
        3: [units/2, units/2, units/2, units/2] / drop/2

    Args:
        num_columns: number of input features.
        num_columns_rs: unused; kept for signature compatibility.
        units: base hidden width.
        drop: base dropout rate.
        lbl_smooth: label smoothing for the binary cross-entropy loss.
        pipe: architecture selector (0-3).
        num_targets: width of the sigmoid output layer (default 206).

    Returns:
        A compiled `tf.keras.Sequential` model (Lookahead-wrapped Adam,
        label-smoothed binary cross-entropy, plain binary cross-entropy
        reported as `score`).

    Raises:
        AttributeError: if `pipe` is not one of 0, 1, 2, 3.
    """
    half_units = int(units / 2)
    architectures = {
        0: ([units], drop),
        1: ([units, half_units], drop),
        2: ([units, half_units, int(units / 4)], drop),
        3: ([half_units] * 4, drop / 2),
    }
    if pipe not in architectures:
        raise AttributeError('Cannot recover attribute for model pipe')
    hidden_units, rate = architectures[pipe]

    # Layers are created in the same order as the hand-written stacks, so
    # weights saved from those stacks still line up when loaded.
    dense_specs = [(n_units, 'elu') for n_units in hidden_units]
    dense_specs.append((num_targets, 'sigmoid'))
    stack = [tf.keras.layers.Input(num_columns)]
    for n_units, activation in dense_specs:
        stack.append(tf.keras.layers.BatchNormalization())
        stack.append(tf.keras.layers.Dropout(rate))
        stack.append(
            tfa.layers.WeightNormalization(
                tf.keras.layers.Dense(n_units, activation=activation)
            )
        )
    model = tf.keras.Sequential(stack)

    model.compile(
        optimizer=tfa.optimizers.Lookahead(
            tf.keras.optimizers.Adam(learning_rate=0.001),
            sync_period=10
        ),
        loss=losses.BinaryCrossentropy(label_smoothing=lbl_smooth),
        metrics=tf.keras.losses.BinaryCrossentropy(name='score')
    )
    return model

def metric(y_true, y_pred, smooth=.001):
    """Mean of the per-target log losses.

    Predictions are clipped to `[smooth, 1 - smooth]` first, then
    `sklearn.metrics.log_loss` (imported at the top of the notebook) is
    evaluated column by column with `labels=[0, 1]` and the results averaged.

    Args:
        y_true: DataFrame of true labels, one column per target.
        y_pred: DataFrame of predictions with the same column labels.
        smooth: clipping margin applied to the predictions.

    Returns:
        The mean log loss over all columns of `y_true`.
    """
    y_pred = np.clip(y_pred, smooth, 1 - smooth)
    per_target = [
        log_loss(
            y_true.loc[:, _target],
            y_pred.loc[:, _target].astype(float),
            labels=[0, 1]
        )
        for _target in y_true.columns
    ]
    return np.mean(per_target)

In [None]:
# Ensemble inference: average the test predictions of every saved model of
# every version, clip them, zero the control rows and write the submission.
target_cols = train_targets_raw.columns.to_list()
# Accumulate in a plain float buffer instead of updating the submission frame
# once per model.
pred_sum = np.zeros((len(test_features_raw), len(target_cols)), dtype=np.float64)
n_models = 0

for i_ver, version in enumerate(VERS):
    print('=' * 10, f'version {version}', '=' * 10)
    with open(f'{MODELS_PATHS[i_ver]}/params.json') as file:
        PARAMS = json.load(file)
    print('params loaded:', PARAMS)
    train, train_targets, test, _ = get_train_test(
        train_features_raw, train_targets_raw, test_features_raw, 
        r_comps=PARAMS['REDUCE_COMPS'],
        q_flag=PARAMS['QTRANS'], pipe_scaler=PARAMS['PIPE_SCALER'],
        seed=PARAMS['SEED']
    )
    print('train done:', train.shape, '| test done:', test.shape)
    with open(f'{MODELS_PATHS[i_ver]}/top_feats.npy', 'rb') as file:
        top_feats = np.load(file)
    print('features loaded:', len(top_feats))
    # Sorted so the run order does not depend on the file system's listing order.
    all_models = sorted(x for x in os.listdir(MODELS_PATHS[i_ver]) if 'seed_' in x)
    
    for model_file in all_models:
        checkpoint_path = f'{MODELS_PATHS[i_ver]}/{model_file}'
        model = get_model(
            len(top_feats), 
            units=PARAMS['UNITS'], 
            drop=PARAMS['DROPOUT'], 
            lbl_smooth=PARAMS['LBL_SMOOTH'],
            pipe=PARAMS['PIPE']
        )
        model.load_weights(checkpoint_path)
        test_predict = model.predict(test.values[:, top_feats], batch_size=PARAMS['BATCH_SIZE'])
        pred_sum += test_predict
        n_models += 1
        print(f'predict for model file done: {checkpoint_path}')
        # Many models are built in this loop; release Keras' global state so
        # memory does not grow with each one.
        del model
        K.clear_session()

if n_models == 0:
    raise RuntimeError('no model files containing "seed_" were found in MODELS_PATHS')

# Divide by the number of models actually predicted. The previous divisor used
# FOLDS/SEEDS from the last version's params, which is only right when every
# version stores exactly that many weight files.
ssubm[target_cols] = pred_sum / n_models
clip_cols = ssubm.columns[1:]
ssubm[clip_cols] = ssubm[clip_cols].clip(CUT, 1 - CUT)
print('clipped from', np.min(ssubm.min(numeric_only=True)), 'to', max(ssubm.max(numeric_only=True)))
# Control rows get all-zero predictions. The mask comes from the raw test
# table rather than from whatever `test` the last loop iteration left behind.
ctl_rows = (test_features_raw['cp_type'] == 'ctl_vehicle').values
ssubm.loc[ctl_rows, target_cols] = 0
ssubm.to_csv('submission.csv', index=False)
elapsed_time = time.time() - start_time
print(f'time elapsed: {elapsed_time // 60:.0f} min {elapsed_time % 60:.0f} sec')