- additional pca
- import variance threshold
- change valdiation method

In [1]:
import os
import sys
import warnings
import numpy as np
import pandas as pd 
import lightgbm as lgb
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from category_encoders import CountEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, roc_auc_score
from sklearn import preprocessing
from sklearn.multioutput import MultiOutputClassifier
sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold
warnings.filterwarnings('ignore')
from sklearn.feature_selection import VarianceThreshold

In [2]:
SEED = 42
NFOLDS = 5
np.random.seed(SEED)
DATA_DIR = '/kaggle/input/lish-moa/'
p_min = 0.001
p_max = 1 - p_min

In [3]:
train = pd.read_csv(DATA_DIR + 'train_features.csv')
targets = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')

test = pd.read_csv(DATA_DIR + 'test_features.csv')
sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')
drug = pd.read_csv(DATA_DIR + 'train_drug.csv')

In [4]:
target_feats = [ i for i in targets.columns if i != "sig_id"]
g_feats = [i for i in train.columns if "g-" in i]
c_feats = [i for i in train.columns if "c-" in i]

In [5]:
noncons_train_index = train[train.cp_type=="ctl_vehicle"].index
cons_train_index = train[train.cp_type!="ctl_vehicle"].index
noncons_test_index = test[test.cp_type=="ctl_vehicle"].index
cons_test_index = test[test.cp_type!="ctl_vehicle"].index

# Preprocess

In [6]:
test = test[test.index.isin(cons_test_index)].reset_index(drop=True)
train = train[train.index.isin(cons_train_index)].reset_index(drop=True)
fn_targets = targets[targets.index.isin(cons_train_index)].copy().reset_index(drop=True)
y = targets.drop("sig_id", axis=1).copy()
drug = drug[drug.index.isin(cons_train_index)].copy().reset_index(drop=True)

In [7]:
# https://www.kaggle.com/c/lish-moa/discussion/195195
seed = 34

folds = []
    
# LOAD FILES
train_score = fn_targets.merge(drug, on='sig_id', how='left') 

# LOCATE DRUGS
vc = train_score.drug_id.value_counts()
vc1 = vc.loc[vc <= 18].index.sort_values()
vc2 = vc.loc[vc > 18].index.sort_values()
    
# STRATIFY DRUGS 18X OR LESS
dct1 = {}; dct2 = {}
skf = MultilabelStratifiedKFold(n_splits = NFOLDS, shuffle = True, random_state = seed)
tmp = train_score.groupby('drug_id')[target_feats].mean().loc[vc1]
for fold,(idxT,idxV) in enumerate(skf.split(tmp,tmp[target_feats])):
    dd = {k:fold for k in tmp.index[idxV].values}
    dct1.update(dd)

# STRATIFY DRUGS MORE THAN 18X
skf = MultilabelStratifiedKFold(n_splits = NFOLDS, shuffle = True, random_state = seed)
tmp = train_score.loc[train_score.drug_id.isin(vc2)].reset_index(drop = True)
for fold,(idxT,idxV) in enumerate(skf.split(tmp,tmp[target_feats])):
    dd = {k:fold for k in tmp.sig_id[idxV].values}
    dct2.update(dd)

# ASSIGN FOLDS
train_score['fold'] = train_score.drug_id.map(dct1)
train_score.loc[train_score.fold.isna(),'fold'] = train_score.loc[train_score.fold.isna(),'sig_id'].map(dct2)
train_score.fold = train_score.fold.astype('int8')
folds.append(train_score.fold.values)
    
np.array(folds)

array([[1, 1, 4, ..., 1, 0, 1]], dtype=int8)

# Feature engineering

In [8]:
X = train[g_feats+c_feats].copy().values
select = VarianceThreshold(threshold=0.8)
X_new = select.fit_transform(X)
drop_feats = list(np.array(train[g_feats+c_feats].columns)[select.get_support()==False])
len(drop_feats)

train.drop(drop_feats, axis=1, inplace=True)
test.drop(drop_feats, axis=1, inplace=True)

g_feats = [i for i in train.columns if "g-" in i]
c_feats = [i for i in train.columns if "c-" in i]

In [9]:
c_num = 10
pca_c_cols = ["pca-c"+str(i+1) for i in range(c_num)]
pca = PCA(n_components=c_num,random_state=42)
c_train = pca.fit_transform(train[c_feats])
c_test = pca.transform(test[c_feats])
c_train = pd.DataFrame(c_train, columns=pca_c_cols)
c_test = pd.DataFrame(c_test, columns=pca_c_cols)

g_num = 80
pca_g_cols = ["pca-g"+str(i+1) for i in range(g_num)]
pca = PCA(n_components=g_num, random_state=42)
g_train = pca.fit_transform(train[g_feats])
g_test = pca.transform(test[g_feats])
g_train = pd.DataFrame(g_train, columns=pca_g_cols)
g_test = pd.DataFrame(g_test, columns=pca_g_cols)

train = pd.concat([train, c_train],axis=1)
test = pd.concat([test, c_test],axis=1)
train = pd.concat([train, g_train],axis=1)
test = pd.concat([test, g_test],axis=1)

In [10]:
def fe(df, remove_features):
    df['g_sum'] = df[g_feats].sum(axis = 1)
    df['g_mean'] = df[g_feats].mean(axis = 1)
    df['g_std'] = df[g_feats].std(axis = 1)
    df['g_kurt'] = df[g_feats].kurtosis(axis = 1)
    df['g_skew'] = df[g_feats].skew(axis = 1)
    df['c_sum'] = df[c_feats].sum(axis = 1)
    df['c_mean'] = df[c_feats].mean(axis = 1)
    df['c_std'] = df[c_feats].std(axis = 1)
    df['c_kurt'] = df[c_feats].kurtosis(axis = 1)
    df['c_skew'] = df[c_feats].skew(axis = 1)
    df['gc_sum'] = df[c_feats + g_feats].sum(axis = 1)
    df['gc_mean'] = df[c_feats + g_feats].mean(axis = 1)
    df['gc_std'] = df[c_feats + g_feats].std(axis = 1)
    df['gc_kurt'] = df[c_feats + g_feats].kurtosis(axis = 1)
    df['gc_skew'] = df[c_feats + g_feats].skew(axis = 1)
    df.drop(remove_features, axis=1, inplace=True)
    df.loc[:, 'cp_dose'] = df.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})
    return df

remove_features = ["cp_type" , "sig_id"]
        
train = fe(train, remove_features)
test = fe(test, remove_features)
    
print(train.shape, test.shape)

(21948, 914) (3624, 914)


In [11]:
train["fold"] = np.array(folds).reshape(-1,1)

# modelling

In [12]:
classifier = MultiOutputClassifier(XGBClassifier(tree_method='gpu_hist')) 

clf = Pipeline([('classify', classifier)
               ])

params = {'classify__estimator__gamma': 3.6975,
          'classify__estimator__learning_rate': 0.0803,
          'classify__estimator__max_delta_step': 2.0706,
          'classify__estimator__max_depth': 10,
          'classify__estimator__min_child_weight': 25.5800,
          'classify__estimator__n_estimators': 100,
         }

clf.set_params(**params)

Pipeline(steps=[('classify',
                 MultiOutputClassifier(estimator=XGBClassifier(base_score=None,
                                                               booster=None,
                                                               colsample_bylevel=None,
                                                               colsample_bynode=None,
                                                               colsample_bytree=None,
                                                               gamma=3.6975,
                                                               gpu_id=None,
                                                               importance_type='gain',
                                                               interaction_constraints=None,
                                                               learning_rate=0.0803,
                                                               max_delta_step=2.0706,
                                                   

In [13]:
def modelling_xgb(X, y, X_test, seed):
    oof_preds = np.zeros(y.shape)
    test_preds = np.zeros((X_test.shape[0], y.shape[1]))
    oof_losses = []
    for fn in range(NFOLDS):
        print('Starting fold: ', fn)
        val_idx = X.iloc[:,-1] == fn
        trn_idx = X.iloc[:,-1] != fn
        X_train, X_val = X.loc[trn_idx,:].to_numpy(), X.loc[val_idx,:].to_numpy()
        y_train, y_val = y.loc[trn_idx].to_numpy(), y.loc[val_idx].to_numpy()
        X_train = np.delete(X_train, -1, 1)
        X_val = np.delete(X_val, -1, 1)
    
        clf.fit(X_train, y_train)
        val_preds = clf.predict_proba(X_val) # list of preds per class
        val_preds = np.array(val_preds)[:,:,1].T # take the positive class
        oof_preds[val_idx] = val_preds
    
        loss = log_loss(np.ravel(y_val), np.ravel(val_preds))
        print(loss)
        oof_losses.append(loss)
        preds = clf.predict_proba(X_test)
        preds = np.array(preds)[:,:,1].T # take the positive class
        test_preds += preds / NFOLDS
    
    print(oof_losses)
    print('Mean OOF loss across folds', np.mean(oof_losses))
    print('STD OOF loss across folds', np.std(oof_losses))
    return oof_preds, test_preds

In [14]:
X = train.copy()
y_xgb = fn_targets.drop("sig_id", axis=1).copy()
X_test = test.copy()

seeds = [42]
oof_preds = np.zeros(y_xgb.shape)
test_preds = np.zeros((test.shape[0], y_xgb.shape[1]))
for seed_ in seeds:
    ind_preds, ind_test_preds = modelling_xgb(X, y_xgb, X_test, seed_)
    oof_preds += ind_preds / len(seeds)
    test_preds += ind_test_preds / len(seeds)

Starting fold:  0
0.01802298004472023
Starting fold:  1
0.018461632031383953
Starting fold:  2
0.01771447134739482
Starting fold:  3
0.0180343302189189
Starting fold:  4
0.01803448574623002
[0.01802298004472023, 0.018461632031383953, 0.01771447134739482, 0.0180343302189189, 0.01803448574623002]
Mean OOF loss across folds 0.018053579877729584
STD OOF loss across folds 0.00023798010019748418


In [15]:
#aucs = []
#for task_id in range(targets.shape[1]-1):
#    aucs.append(roc_auc_score(y_true=targets.iloc[:, task_id+1].values,
#                              y_score=oof_preds[:, task_id]))
#print(f"Overall AUC : {np.mean(aucs)}")

In [16]:
check = np.zeros([targets.shape[0], targets.shape[1]-1])
check[cons_train_index,:] = oof_preds
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check)))

OOF log loss:  0.01663886078344363


In [17]:
#aucs = []
#for task_id in range(targets.shape[1]-1):
#    aucs.append(roc_auc_score(y_true=targets.iloc[:, task_id+1].values,
#                              y_score=np.clip(oof_preds[:, task_id], p_min,p_max)))
#print(f"Overall AUC : {np.mean(aucs)}")

In [18]:
check = np.zeros([targets.shape[0], targets.shape[1]-1])
check[cons_train_index,:] = oof_preds
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(np.clip(check,p_min,p_max))))

OOF log loss:  0.01671837272518806


In [19]:
sub.loc[cons_test_index,target_feats] = np.clip(test_preds,p_min,p_max)
sub.loc[noncons_test_index,target_feats] = 0
sub.to_csv('submission.csv', index=False)