- Removed the difference-from-control-group (ctl group mean) features
- Changed the model parameters

In [1]:
import os
import sys
import warnings
import numpy as np
import pandas as pd 
import lightgbm as lgb
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from category_encoders import CountEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, mean_squared_error
from sklearn import preprocessing
from sklearn.multioutput import MultiOutputClassifier
sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold
warnings.filterwarnings('ignore')

In [2]:
# Notebook-wide settings: random seed, number of CV folds, input location.
SEED = 42
NFOLDS = 5
DATA_DIR = '/kaggle/input/lish-moa/'

# Seed NumPy's legacy global RNG.
np.random.seed(SEED)

In [3]:
# Read the input tables from DATA_DIR.
# DATA_DIR already ends with a separator, so os.path.join yields the same
# paths as plain string concatenation.
train = pd.read_csv(os.path.join(DATA_DIR, 'train_features.csv'))
targets = pd.read_csv(os.path.join(DATA_DIR, 'train_targets_scored.csv'))

test = pd.read_csv(os.path.join(DATA_DIR, 'test_features.csv'))
sub = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

In [4]:
# Every column of `targets` except the row identifier is a label to predict.
target_feats = [col for col in targets.columns if col != "sig_id"]

# Select feature columns by *prefix* rather than substring, so derived columns
# such as "mean-g-0" or "diff-c-0" are never mistaken for raw "g-"/"c-" features.
g_feats = [col for col in train.columns if col.startswith("g-")]
c_feats = [col for col in train.columns if col.startswith("c-")]

In [5]:
# Split row labels into control rows (cp_type == "ctl_vehicle", the
# "noncons_*" names) and all remaining rows (the "cons_*" names).
is_ctl_train = (train["cp_type"] == "ctl_vehicle").to_numpy()
is_ctl_test = (test["cp_type"] == "ctl_vehicle").to_numpy()

noncons_train_index = train.index[is_ctl_train]
cons_train_index = train.index[~is_ctl_train]
noncons_test_index = test.index[is_ctl_test]
cons_test_index = test.index[~is_ctl_test]

# Preprocess

In [6]:
# Normalization by control (ctl) group -- disabled in this version; the code below is kept commented out for reference.
#train_ctl = train[train.index.isin(noncons_train_index)].copy().reset_index(drop=True)
#test_ctl = test[test.index.isin(noncons_test_index)].copy().reset_index(drop=True)
#ctl_df = pd.concat([train_ctl, test_ctl])

#ctl_group_data = ctl_df.groupby(["cp_dose", "cp_time"]).agg({"mean"}).reset_index()
#mean_g_feats = ["mean-" + i for i in g_feats]
#mean_c_feats = ["mean-" + i for i in c_feats]
#columns = ["cp_dose", "cp_time"] + mean_g_feats + mean_c_feats
#ctl_group_data.columns = columns

#train_cons = train_feat[train_feat.index.isin(cons_train_index)].copy().reset_index(drop=True)
#test_cons = test_feat[test_feat.index.isin(cons_test_index)].copy().reset_index(drop=True)
#n_train_score = train_score[train_score.index.isin(cons_train_index)].copy().reset_index(drop=True)
#n_train_nonscore = train_nonscore[train_nonscore.index.isin(cons_train_index)].copy().reset_index(drop=True)

In [7]:
#train = pd.merge(train, ctl_group_data, on=["cp_time", "cp_dose"], how="left")
#test = pd.merge(test, ctl_group_data, on=["cp_time", "cp_dose"], how="left")

#for i in range(len(g_feats)):
#    train["diff-g-"+str(i)] = train["g-"+str(i)] - train["mean-g-"+str(i)]
#    test["diff-g-"+str(i)] = test["g-"+str(i)] - test["mean-g-"+str(i)]
    
#for i in range(len(c_feats)):
#    train["diff-c-"+str(i)] = train["c-"+str(i)] - train["mean-c-"+str(i)]
#    test["diff-c-"+str(i)] = test["c-"+str(i)] - test["mean-c-"+str(i)]

In [8]:
# Columns that are label-encoded before modelling.
categoricals = ["cp_dose"]


def encoding(tr, te):
    """Label-encode each column in ``categoricals`` on both frames.

    The encoder for a column is fitted on the values of ``tr`` only and then
    applied to ``tr`` and ``te``. Both frames are modified in place and also
    returned.

    Parameters
    ----------
    tr, te : pandas.DataFrame
        Frames that both contain every column named in ``categoricals``.

    Returns
    -------
    tuple
        ``(tr, te)`` -- the same objects that were passed in.
    """
    for col in categoricals:
        encoder = preprocessing.LabelEncoder().fit(list(tr[col]))
        tr[col] = encoder.transform(list(tr[col]))
        te[col] = encoder.transform(list(te[col]))

    return tr, te


train, test = encoding(train, test)

# Feature engineering

In [9]:
def fe(df, remove_features):
    """Return ``df`` without the columns listed in ``remove_features``.

    The input frame is left untouched; a new DataFrame is returned, so the
    caller must use the return value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to strip columns from.
    remove_features : list of str
        Column labels to drop.

    Returns
    -------
    pandas.DataFrame
        A new frame containing every column of ``df`` except the dropped ones.

    Raises
    ------
    KeyError
        If any label in ``remove_features`` is not a column of ``df``.
    """
    return df.drop(columns=remove_features)

# Columns removed from both feature tables before modelling.
remove_features = ["cp_type", "sig_id"]

train, test = (fe(frame, remove_features) for frame in (train, test))

print(train.shape, test.shape)

(23814, 874) (3982, 874)


# Modelling

In [10]:
# One XGBoost classifier per target column, wrapped so that the whole set is
# fitted and queried through a single estimator.
xgb_estimator = XGBClassifier(tree_method='gpu_hist')
classifier = MultiOutputClassifier(xgb_estimator)

# Single-step pipeline; the step name 'classify' is the prefix used in the
# parameter keys below.
clf = Pipeline(steps=[('classify', classifier)])

# Hyperparameters for the inner XGBClassifier, addressed through the
# pipeline step ('classify') and the wrapper's 'estimator' attribute.
params = {
    'classify__estimator__gamma': 3.6975,
    'classify__estimator__learning_rate': 0.0703,
    'classify__estimator__max_delta_step': 2.0706,
    'classify__estimator__max_depth': 10,
    'classify__estimator__min_child_weight': 31.5800,
    'classify__estimator__n_estimators': 166,
}

clf.set_params(**params)

Pipeline(steps=[('classify',
                 MultiOutputClassifier(estimator=XGBClassifier(base_score=None,
                                                               booster=None,
                                                               colsample_bylevel=None,
                                                               colsample_bynode=None,
                                                               colsample_bytree=None,
                                                               gamma=3.6975,
                                                               gpu_id=None,
                                                               importance_type='gain',
                                                               interaction_constraints=None,
                                                               learning_rate=0.0703,
                                                               max_delta_step=2.0706,
                                                   

In [11]:
def modelling_xgb(X, y, X_test, seed):
    """Cross-validate the global ``clf`` and average its test predictions.

    The rows are split into ``NFOLDS`` folds with ``MultilabelStratifiedKFold``
    (shuffled, seeded with ``seed``). In every fold the model is fitted only on
    the training rows whose index label is in ``cons_train_index``, then used
    to predict the full validation fold and ``X_test``.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features.
    y : pandas.DataFrame
        Training targets, one column per label, row-aligned with ``X``.
    X_test : pandas.DataFrame
        Test features.
    seed : int
        Random state for the fold splitter.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof_preds, test_preds)``: out-of-fold positive-class probabilities
        with the shape of ``y``, and test probabilities of shape
        ``(len(X_test), y.shape[1])`` averaged over the folds.
    """
    n_targets = y.shape[1]
    oof_preds = np.zeros(y.shape)
    test_preds = np.zeros((X_test.shape[0], n_targets))
    fold_losses = []

    splitter = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed)
    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(X, y)):
        print('Starting fold: ', fold)

        # Validation fold: kept whole.
        X_holdout = X.iloc[holdout_rows, :].to_numpy()
        y_holdout = y.iloc[holdout_rows].to_numpy()

        # Training fold: keep only rows listed in cons_train_index.
        X_fit = X.iloc[fit_rows, :]
        y_fit = y.iloc[fit_rows]
        X_fit = X_fit[X_fit.index.isin(cons_train_index)].to_numpy()
        y_fit = y_fit[y_fit.index.isin(cons_train_index)].to_numpy()

        clf.fit(X_fit, y_fit)

        # predict_proba yields one (n_rows, 2) array per target; stack them,
        # keep column 1 (positive class) and transpose to (n_rows, n_targets).
        holdout_proba = np.array(clf.predict_proba(X_holdout))[:, :, 1].T
        oof_preds[holdout_rows] = holdout_proba

        fold_loss = log_loss(np.ravel(y_holdout), np.ravel(holdout_proba))
        print(fold_loss)
        fold_losses.append(fold_loss)

        test_proba = np.array(clf.predict_proba(X_test))[:, :, 1].T
        test_preds += test_proba / NFOLDS

    print(fold_losses)
    print('Mean OOF loss across folds', np.mean(fold_losses))
    print('STD OOF loss across folds', np.std(fold_losses))
    return oof_preds, test_preds

In [12]:
# Work on copies so the prepared frames stay available unchanged.
X = train.copy()
y = targets.drop(columns="sig_id").copy()
X_test = test.copy()

# Seeds to average over. Derived from the notebook-wide SEED constant so that
# changing SEED actually changes the cross-validation splits.
seeds = [SEED]

# Accumulators for the seed-averaged out-of-fold and test predictions.
oof_preds = np.zeros(y.shape)
test_preds = np.zeros((X_test.shape[0], y.shape[1]))
for seed_ in seeds:
    ind_preds, ind_test_preds = modelling_xgb(X, y, X_test, seed_)
    oof_preds += ind_preds / len(seeds)
    test_preds += ind_test_preds / len(seeds)

Starting fold:  0
0.016738832513000318
Starting fold:  1
0.016908957230028557
Starting fold:  2
0.016773734659711288
Starting fold:  3
0.016497753560878757
Starting fold:  4
0.01667905765085504
[0.016738832513000318, 0.016908957230028557, 0.016773734659711288, 0.016497753560878757, 0.01667905765085504]
Mean OOF loss across folds 0.01671966712289479
STD OOF loss across folds 0.00013417608848077545


In [13]:
# Build the OOF frame directly from the float predictions instead of writing
# them into a copy of `targets` with `.iloc`: the label columns there are
# presumably integer-typed (0/1 read from CSV -- verify), and assigning floats
# into them depends on silent dtype upcasting.
train_checkscore = pd.concat(
    [
        targets[["sig_id"]],
        pd.DataFrame(oof_preds, columns=target_feats, index=targets.index),
    ],
    axis=1,
)

# Rows listed in noncons_train_index get a fixed prediction of 0 for every target.
train_checkscore.loc[train_checkscore.index.isin(noncons_train_index), target_feats] = 0
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(train_checkscore[target_feats].to_numpy())))

OOF log loss:  0.016449612618380825


In [14]:
# Fill every column after the first one with the averaged test predictions.
sub.iloc[:, 1:] = test_preds

# Rows listed in noncons_test_index get a fixed prediction of 0 for every target.
ctl_rows_test = sub.index.isin(noncons_test_index)
sub.loc[ctl_rows_test, target_feats] = 0

sub.to_csv('submission.csv', index=False)