- try multioutputclassifier without gpu
- cancel 2nd layer structures
- add regression function

In [1]:
import os
import sys
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn import preprocessing
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.decomposition import PCA
from tqdm._tqdm_notebook import tqdm_notebook
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import StratifiedKFold, KFold
sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold
warnings.filterwarnings('ignore')

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  del sys.path[0]


In [2]:
SEED = 42
NFOLDS = 5
np.random.seed(SEED)

In [3]:
DIR = "/kaggle/input/lish-moa/"
train = pd.read_csv(DIR+"train_features.csv")
test = pd.read_csv(DIR+"test_features.csv")
nontargets = pd.read_csv(DIR+"train_targets_nonscored.csv")
targets = pd.read_csv(DIR+"train_targets_scored.csv")
sub = pd.read_csv(DIR+"sample_submission.csv")

In [4]:
target_feats = [ i for i in targets.columns if i != "sig_id"]
g_feats = [i for i in train.columns if "g-" in i]
c_feats = [i for i in train.columns if "c-" in i]

In [5]:
noncons_train_index = train[train.cp_type=="ctl_vehicle"].index
cons_train_index = train[train.cp_type!="ctl_vehicle"].index
noncons_test_index = test[test.cp_type=="ctl_vehicle"].index
cons_test_index = test[test.cp_type!="ctl_vehicle"].index

# preprocess

In [6]:
# normalization by ctl group
train_ctl = train[train.index.isin(noncons_train_index)].copy().reset_index(drop=True)
test_ctl = test[test.index.isin(noncons_test_index)].copy().reset_index(drop=True)
ctl_df = pd.concat([train_ctl, test_ctl])

ctl_group_data = ctl_df.groupby(["cp_dose", "cp_time"]).agg({"mean"}).reset_index()
mean_g_feats = ["mean-" + i for i in g_feats]
mean_c_feats = ["mean-" + i for i in c_feats]
columns = ["cp_dose", "cp_time"] + mean_g_feats + mean_c_feats
ctl_group_data.columns = columns

#train_cons = train_feat[train_feat.index.isin(cons_train_index)].copy().reset_index(drop=True)
#test_cons = test_feat[test_feat.index.isin(cons_test_index)].copy().reset_index(drop=True)
#n_train_score = train_score[train_score.index.isin(cons_train_index)].copy().reset_index(drop=True)
#n_train_nonscore = train_nonscore[train_nonscore.index.isin(cons_train_index)].copy().reset_index(drop=True)

n_train = pd.merge(train, ctl_group_data, on=["cp_time", "cp_dose"], how="left")
n_test = pd.merge(test, ctl_group_data, on=["cp_time", "cp_dose"], how="left")

for i in range(len(g_feats)):
    n_train["diff-g-"+str(i)] = n_train["g-"+str(i)] - n_train["mean-g-"+str(i)]
    n_test["diff-g-"+str(i)] = n_test["g-"+str(i)] - n_test["mean-g-"+str(i)]
    
for i in range(len(c_feats)):
    n_train["diff-c-"+str(i)] = n_train["c-"+str(i)] - n_train["mean-c-"+str(i)]
    n_test["diff-c-"+str(i)] = n_test["c-"+str(i)] - n_test["mean-c-"+str(i)]

In [7]:
categoricals = ["cp_dose"]

def encoding(tr, te):
    for f in categoricals:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(tr[f]))
        tr[f] = lbl.transform(list(tr[f]))
        te[f] = lbl.transform(list(te[f])) 
        
    return tr, te

n_train, n_test = encoding(n_train, n_test)

# feature engineering

In [8]:
def fe(df, remove_features):
    df.drop(remove_features, axis=1, inplace=True)
    return df

remove_features = ["cp_type", "sig_id"] + mean_g_feats + mean_c_feats + g_feats + c_feats 
for i in [i for i in n_train.columns if i != "sig_id"]:
    if i not in remove_features and (n_train[i].std() == 0):
        remove_features.append(i)
        
n_train = fe(n_train, remove_features)
n_test = fe(n_test, remove_features)
    
print(n_train.shape, n_test.shape)

(23814, 874) (3982, 874)


In [9]:
def regression(tr, ta, te):
    y_train = ta.sum(axis=1).copy()
    X_train = tr.copy()
    X_test = te.copy()

    params = {'objective': 'regression', 'metric': 'rmse', 'boosting_type': 'gbdt', 'tree_learner': 'serial', 'learning_rate': 0.01, 
               "num_leaves": 10, 'random_seed':44, 'max_depth': 4} 
        
    n_folds=4
    skf=StratifiedKFold(n_splits = n_folds, shuffle=True, random_state=0)

    train_score_pred = np.zeros([X_train.shape[0]])
    test_score_pred = np.zeros(X_test.shape[0])
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]

        X_valid2 = X_train.iloc[test_index,:]
        y_valid2 = y_train.iloc[test_index]
        
        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_valid2, y_valid2, reference=lgb_train)
        
        clf = lgb.train(params, lgb_train,valid_sets=[lgb_train, lgb_eval], 
               num_boost_round=10000,early_stopping_rounds=50,verbose_eval = 100) 

        valid_predict = clf.predict(X_valid2, num_iteration = clf.best_iteration)
        train_score_pred[test_index] = valid_predict
        test_score_pred += clf.predict(X_test, num_iteration = clf.best_iteration) / n_folds
            
    print("score:", mean_squared_error(y_train, valid))
    return train_score_pred, test_score_pred

# 1st model (multi output)

In [10]:
clf = MultiOutputClassifier(LGBMClassifier(tree_method='hist')) #gpu_hist

params = {#'estimator__colsample_bytree': 0.9522,
          'estimator__learning_rate': 0.0503,
          'estimator__max_depth': 10,
          'estimator__min_child_weight': 31.5800,
          'estimator__n_estimators': 160,
          #'estimator__subsample': 0.8639,
         }

print(clf.get_params().keys())
clf.set_params(**params)

dict_keys(['estimator__boosting_type', 'estimator__class_weight', 'estimator__colsample_bytree', 'estimator__importance_type', 'estimator__learning_rate', 'estimator__max_depth', 'estimator__min_child_samples', 'estimator__min_child_weight', 'estimator__min_split_gain', 'estimator__n_estimators', 'estimator__n_jobs', 'estimator__num_leaves', 'estimator__objective', 'estimator__random_state', 'estimator__reg_alpha', 'estimator__reg_lambda', 'estimator__silent', 'estimator__subsample', 'estimator__subsample_for_bin', 'estimator__subsample_freq', 'estimator__tree_method', 'estimator', 'n_jobs'])


MultiOutputClassifier(estimator=LGBMClassifier(learning_rate=0.0503,
                                               max_depth=10,
                                               min_child_weight=31.58,
                                               n_estimators=160,
                                               tree_method='hist'))

In [11]:
def multi_modelling(tr, ta, te):
    X = tr.copy()
    y = ta.drop("sig_id", axis=1).copy()
    X_test = te.copy()

    oof_preds = np.zeros(y.shape)
    test_preds = np.zeros((test.shape[0], y.shape[1]))
    oof_losses = []
    mskf = MultilabelStratifiedKFold(n_splits=NFOLDS, random_state=SEED, shuffle=True)
    for fn, (trn_idx, val_idx) in enumerate(mskf.split(X, y)):
        print('Starting fold: ', fn)
        X_train, X_val = X.iloc[trn_idx,:], X.iloc[val_idx,:].to_numpy()
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx].to_numpy()
    
        X_train = X_train[X_train.index.isin(cons_train_index)].to_numpy()
        y_train = y_train[y_train.index.isin(cons_train_index)].to_numpy()
    
        clf.fit(X_train, y_train)
        val_preds = clf.predict_proba(X_val) # list of preds per class
        val_preds = np.array(val_preds)[:,:,1].T # take the positive class
        oof_preds[val_idx] = val_preds
    
        loss = log_loss(np.ravel(y_val), np.ravel(val_preds))
        print(loss)
        oof_losses.append(loss)
        preds = clf.predict_proba(X_test)
        preds = np.array(preds)[:,:,1].T # take the positive class
        test_preds += preds / NFOLDS
    
    print(oof_losses)
    print('Mean OOF loss across folds', np.mean(oof_losses))
    print('STD OOF loss across folds', np.std(oof_losses))
    return oof_preds, test_preds
    
oof_preds, test_preds = multi_modelling(n_train, targets, n_test)

Starting fold:  0
0.016850499174407747
Starting fold:  1
0.01698703268588588
Starting fold:  2
0.01691767019520539
Starting fold:  3
0.016634349310579655
Starting fold:  4
0.016809659993064215
[0.016850499174407747, 0.01698703268588588, 0.01691767019520539, 0.016634349310579655, 0.016809659993064215]
Mean OOF loss across folds 0.016839842271828578
STD OOF loss across folds 0.00011914201634549347


In [12]:
train_checkscore = targets.copy()
train_checkscore.iloc[:,1:] = oof_preds
train_checkscore.loc[train_checkscore.index.isin(noncons_train_index),target_feats] = 0
print('OOF log loss: ', log_loss(np.ravel(targets.drop("sig_id", axis=1)), np.ravel(np.array(train_checkscore.iloc[:,1:]))))

OOF log loss:  0.016599404799040197


In [13]:
sub.iloc[:,1:] = test_preds
sub.loc[sub.index.isin(noncons_test_index),target_feats] = 0
sub.to_csv('submission.csv', index=False)

# 2nd model

In [14]:
def modelling_lgb(new_train, target_train, new_test, target):
    
    X_train = new_train.copy()
    X_test = new_test.copy()
    y_train = target_train[target].copy()
    
    params = {'objective': 'binary', 'boosting_type': 'gbdt', 'tree_learner': 'serial', 'learning_rate': 0.01, 
               "num_leaves": 10, 'random_seed':44, 'max_depth': 5} 
        
    n_folds=4
    if target not in ["erbb2_inhibitor", "atp-sensitive_potassium_channel_antagonist"]:
        skf=StratifiedKFold(n_splits = n_folds, shuffle=True, random_state=0)
    else:
        skf=KFold(n_splits = n_folds, shuffle=True, random_state=0)

    valid = np.zeros([X_train.shape[0]])
    pred_value = np.zeros(X_test.shape[0])
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]

        X_valid2 = X_train.iloc[test_index,:]
        y_valid2 = y_train.iloc[test_index]
        
        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_valid2, y_valid2, reference=lgb_train)
        
        clf = lgb.train(params, lgb_train,valid_sets=[lgb_train, lgb_eval], 
               num_boost_round=10000,early_stopping_rounds=25,verbose_eval = 0) 

        valid_predict = clf.predict(X_valid2, num_iteration = clf.best_iteration)
        valid[test_index] = valid_predict
        pred_value += clf.predict(X_test, num_iteration = clf.best_iteration) / n_folds
            
    score = log_loss(y_train, valid)
            
    return valid, pred_value, score

In [15]:
#layer1_train = train_checkscore[train_checkscore.index.isin(cons_train_index)].drop("sig_id", axis=1).copy()
#layer1_test = sub[sub.index.isin(cons_test_index)].drop("sig_id", axis=1).copy()
#layer1_targets = targets[targets.index.isin(cons_train_index)].drop("sig_id", axis=1).copy()

#n_train2 = pd.merge(n_train, layer1_train, on="sig_id", how="outer") 
#n_test2 = pd.merge(n_test, layer1_test, on="sig_id", how="outer")

In [16]:
#final = pd.read_csv(DIR+"sample_submission.csv")
#train_checkscore2 = targets.copy()
#target_list2 = []
#log_loss_list2 = []

#for ind, target in enumerate(target_feats):
#    print(ind, target)
#    valid, pred_value, score = modelling_lgb(layer1_train, layer1_targets, layer1_test, target)
#    train_checkscore2.loc[cons_train_index, target] = valid
#    train_checkscore2.loc[noncons_train_index, target] = 0
#    print("oof log_loss= {}, all log_loss= {}".format(score, log_loss(targets[target], train_checkscore2[target])))
#    target_list2.append(target)
#    log_loss_list2.append(score)
#    final.loc[cons_test_index, target] = pred_value
#    final.loc[noncons_test_index, target] = 0

In [17]:
# local score
#scores = [log_loss(targets[target_col], train_checkscore2[target_col]) for target_col in target_feats]
#print(np.mean(scores))

In [18]:
#score_df = pd.DataFrame(target_list2, columns=["Target"])
#score_df["score"] = log_loss_list2

#plt.figure(figsize=(20,10))
#sns.barplot(x="Target", y="score", data=score_df)
#plt.show()

#np.transpose(score_df)

# submission

In [19]:
#final.to_csv("submission.csv", index=False) 