- reduce n_estimator (change other parameters)

In [1]:
import os
import sys
import warnings
import numpy as np
import pandas as pd 
import lightgbm as lgb
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from category_encoders import CountEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, mean_squared_error
from sklearn import preprocessing
from sklearn.multioutput import MultiOutputClassifier
sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold
warnings.filterwarnings('ignore')
from sklearn.feature_selection import VarianceThreshold

In [2]:
SEED = 42
NFOLDS = 5
np.random.seed(SEED)
DATA_DIR = '/kaggle/input/lish-moa/'

In [3]:
train = pd.read_csv(DATA_DIR + 'train_features.csv')
targets = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')

test = pd.read_csv(DATA_DIR + 'test_features.csv')
sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')

In [4]:
target_feats = [ i for i in targets.columns if i != "sig_id"]
g_feats = [i for i in train.columns if "g-" in i]
c_feats = [i for i in train.columns if "c-" in i]

In [5]:
noncons_train_index = train[train.cp_type=="ctl_vehicle"].index
cons_train_index = train[train.cp_type!="ctl_vehicle"].index
noncons_test_index = test[test.cp_type=="ctl_vehicle"].index
cons_test_index = test[test.cp_type!="ctl_vehicle"].index

# Preprocess

In [6]:
test = test[test.index.isin(cons_test_index)].reset_index(drop=True)
train = train[train.index.isin(cons_train_index)].reset_index(drop=True)
fn_targets = targets[targets.index.isin(cons_train_index)].copy().reset_index(drop=True)
y = targets.drop("sig_id", axis=1).copy()

# Feature engineering

In [7]:
def fe(df, remove_features):
    df['g_sum'] = df[g_feats].sum(axis = 1)
    df['g_mean'] = df[g_feats].mean(axis = 1)
    df['g_std'] = df[g_feats].std(axis = 1)
    df['g_kurt'] = df[g_feats].kurtosis(axis = 1)
    df['g_skew'] = df[g_feats].skew(axis = 1)
    df['c_sum'] = df[c_feats].sum(axis = 1)
    df['c_mean'] = df[c_feats].mean(axis = 1)
    df['c_std'] = df[c_feats].std(axis = 1)
    df['c_kurt'] = df[c_feats].kurtosis(axis = 1)
    df['c_skew'] = df[c_feats].skew(axis = 1)
    df['gc_sum'] = df[c_feats + g_feats].sum(axis = 1)
    df['gc_mean'] = df[c_feats + g_feats].mean(axis = 1)
    df['gc_std'] = df[c_feats + g_feats].std(axis = 1)
    df['gc_kurt'] = df[c_feats + g_feats].kurtosis(axis = 1)
    df['gc_skew'] = df[c_feats + g_feats].skew(axis = 1)
    df.drop(remove_features, axis=1, inplace=True)
    df.loc[:, 'cp_dose'] = df.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})
    return df

remove_features = ["cp_type" , "sig_id"]
        
train = fe(train, remove_features)
test = fe(test, remove_features)
    
print(train.shape, test.shape)

(21948, 889) (3624, 889)


# modelling

In [8]:
classifier = MultiOutputClassifier(XGBClassifier(tree_method='gpu_hist')) 

clf = Pipeline([('classify', classifier)
               ])

params = {'classify__estimator__gamma': 3.6975,
          'classify__estimator__learning_rate': 0.0803,
          'classify__estimator__max_delta_step': 2.0706,
          'classify__estimator__max_depth': 10,
          'classify__estimator__min_child_weight': 25.5800,
          'classify__estimator__n_estimators': 100,
         }

clf.set_params(**params)

Pipeline(steps=[('classify',
                 MultiOutputClassifier(estimator=XGBClassifier(base_score=None,
                                                               booster=None,
                                                               colsample_bylevel=None,
                                                               colsample_bynode=None,
                                                               colsample_bytree=None,
                                                               gamma=3.6975,
                                                               gpu_id=None,
                                                               importance_type='gain',
                                                               interaction_constraints=None,
                                                               learning_rate=0.0803,
                                                               max_delta_step=2.0706,
                                                   

In [9]:
def modelling_xgb(X, y, X_test, seed):
    oof_preds = np.zeros(y.shape)
    test_preds = np.zeros((X_test.shape[0], y.shape[1]))
    oof_losses = []
    mskf = MultilabelStratifiedKFold(n_splits=NFOLDS, random_state=seed, shuffle=True)
    for fn, (trn_idx, val_idx) in enumerate(mskf.split(X, y)):
        print('Starting fold: ', fn)
        X_train, X_val = X.iloc[trn_idx,:].to_numpy(), X.iloc[val_idx,:].to_numpy()
        y_train, y_val = y.iloc[trn_idx].to_numpy(), y.iloc[val_idx].to_numpy()
    
        clf.fit(X_train, y_train)
        val_preds = clf.predict_proba(X_val) # list of preds per class
        val_preds = np.array(val_preds)[:,:,1].T # take the positive class
        oof_preds[val_idx] = val_preds
    
        loss = log_loss(np.ravel(y_val), np.ravel(val_preds))
        print(loss)
        oof_losses.append(loss)
        preds = clf.predict_proba(X_test)
        preds = np.array(preds)[:,:,1].T # take the positive class
        test_preds += preds / NFOLDS
    
    print(oof_losses)
    print('Mean OOF loss across folds', np.mean(oof_losses))
    print('STD OOF loss across folds', np.std(oof_losses))
    return oof_preds, test_preds

In [10]:
X = train.copy()
y_xgb = fn_targets.drop("sig_id", axis=1).copy()
X_test = test.copy()

seeds = [42]
oof_preds = np.zeros(y_xgb.shape)
test_preds = np.zeros((test.shape[0], y_xgb.shape[1]))
for seed_ in seeds:
    ind_preds, ind_test_preds = modelling_xgb(X, y_xgb, X_test, seed_)
    oof_preds += ind_preds / len(seeds)
    test_preds += ind_test_preds / len(seeds)

Starting fold:  0
0.017738735475654337
Starting fold:  1
0.017728636817706374
Starting fold:  2
0.0174600812110519
Starting fold:  3
0.017768631367041644
Starting fold:  4
0.017781262518877964
[0.017738735475654337, 0.017728636817706374, 0.0174600812110519, 0.017768631367041644, 0.017781262518877964]
Mean OOF loss across folds 0.017695469478066442
STD OOF loss across folds 0.00011924157250244787


In [11]:
aucs = []
for task_id in range(targets.shape[1]-1):
    aucs.append(roc_auc_score(y_true=targets.iloc[:, task_id+1].values,
                              y_score=oof_preds[:, task_id]))
print(f"Overall AUC : {np.mean(aucs)}")

NameError: name 'roc_auc_score' is not defined

In [12]:
check = np.zeros([targets.shape[0], targets.shape[1]-1])
check[cons_train_index,:] = oof_preds
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check)))

OOF log loss:  0.016308906175128927


In [13]:
sub.loc[cons_test_index,target_feats] = test_preds
sub.loc[noncons_test_index,target_feats] = 0
sub.to_csv('submission.csv', index=False)