In [2]:
import multiprocessing
multiprocessing.cpu_count()

8

In [3]:
import psutil
psutil.cpu_count()

8

## Imports

In [1]:
import os
import gc
import sys
import glob
import json
import random
import joblib
import itertools

import numpy as np
import pandas as pd

import matplotlib.pylab as plt
import seaborn as sns

from functools import reduce
from tqdm.auto import tqdm
from itertools import cycle

from scipy import stats

from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import ensemble
from sklearn import decomposition
from sklearn import tree
from sklearn import feature_selection

import warnings
warnings.filterwarnings("ignore")

import lightgbm as lgb
import xgboost as xgb

pd.set_option("display.max_columns", None)

plt.style.use("ggplot")
color_pal = plt.rcParams["axes.prop_cycle"].by_key()["color"]
color_cycle = cycle(plt.rcParams["axes.prop_cycle"].by_key()["color"])

In [3]:
def amex_metric(y_true, y_pred):
    labels = np.transpose(np.array([y_true, y_pred]))
    labels = labels[labels[:, 1].argsort()[::-1]]
    weights = np.where(labels[:,0]==0, 20, 1)
    cut_vals = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])
    gini = [0,0]
    for i in [1,0]:
        labels = np.transpose(np.array([y_true, y_pred]))
        labels = labels[labels[:, i].argsort()[::-1]]
        weight = np.where(labels[:,0]==0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(labels[:, 0] *  weight)
        cum_pos_found = np.cumsum(labels[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)
    return 0.5 * (gini[1]/gini[0] + top_four)

In [4]:
def lgb_amex_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'amex_metric', amex_metric(y_true, y_pred), True

In [5]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [6]:
def get_P_2_buckets(df, nfolds=5):
    """
    CV stabilization trick 2.
    Create buckets to stratify train set by P_2_LST | TARGET values.
    Help to reduce noise on hold-out CV.
    """
    df = df[['P_2_last', 'target']].copy()
    df = df.sort_values(by='P_2_last', ascending=False)\
    .reset_index().rename({'index':'row_id'}, axis=1)

    buckets = np.zeros(df.shape[0])

    p0, p1, ind = 0, 0, 0
    for i in range(df.shape[0]):
        buckets[i] = ind
        p0 += np.int8(df.loc[i, 'target'] == 0)
        p1 += np.int8(df.loc[i, 'target'] == 1)
        if p0 >= nfolds and p1 >= nfolds:
            ind += 1
            p0, p1 = 0, 0

    df.loc[:, 'bucket_id'] = buckets

    df.loc[df.loc[:, 'P_2_last'].isnull(), 'bucket_id'] = -1
    df.loc[df.loc[:, 'bucket_id'] == 0, 'bucket_id'] = 1
    df.loc[df.loc[:, 'bucket_id'] == np.max(df.loc[:, 'bucket_id']), 'bucket_id'] = np.max(df.loc[:, 'bucket_id']) - 1

    df = df.sort_values(by='row_id', ascending=True).reset_index(drop=True)
    
    return np.int64(df.loc[:, 'bucket_id'] + df.loc[:, 'target']*10**6) 

## Config

In [7]:
class cfg:
    exp_name = "new_feats"
    oof_dir = "OOFs"
    pred_dir = "PREDs"
    seed = 4242
    n_folds = 5
    target = 'target'
    save_oof = os.path.join(oof_dir, exp_name)
    save_pred = os.path.join(pred_dir, exp_name)

os.makedirs(cfg.save_oof, exist_ok = True)
os.makedirs(cfg.save_pred, exist_ok = True)

## Loading Data

In [8]:
train_df = pd.read_feather("input/feature_engg_v2/sammy_top_pp_sq.f")

## Model

In [9]:
cat_features = [
    "B_30",
    "B_38",
    "D_114",
    "D_116",
    "D_117",
    "D_120",
    "D_126",
    "D_63",
    "D_64",
    "D_66",
    "D_68",
]

cat_features = [f"{cf}_last" for cf in cat_features]
features = [col for col in train_df.columns if col not in ['customer_ID', cfg.target]]
cat_features = [col for col in cat_features if col in features]
features.__len__(), cat_features.__len__()

(3858, 11)

In [10]:
# params = {
#     'objective': 'binary',
#     #'metric': "binary_logloss",
#     'boosting': 'dart',
#     'seed': cfg.seed,
#     'num_leaves': 100,
#     'learning_rate': 0.01,
#     'feature_fraction': 0.20,
#     'bagging_freq': 10,
#     'bagging_fraction': 0.50,
#     'n_jobs': -1,
#     'lambda_l2': 2,
#     'min_data_in_leaf': 40,
#     'metric' : 'amex_metric'
# }

In [11]:
params = {
    "drop_rate": 0.035677,
    "skip_drop": 0.5,
    "max_drop": 50,
    "uniform_drop": False,
    "colsample_bytree": 0.01935038,
    "reg_alpha": 0.08525348,
    "reg_lambda": 56.13959031,
    "learning_rate": 0.01,
    "max_depth": -1,
    "min_child_samples": 40,
    "min_split_gain": 0,
    "min_child_weight": 0.01,
    "num_leaves": 63,
    "max_bin": 1000,
    "subsample": 0.85107051,
    "pos_bagging_fraction": 1,
    "neg_bagging_fraction": 1,
    "subsample_freq": 3,
    "save_binary": True,
    "seed": 42,
    "feature_fraction_seed": 42,
    "bagging_seed": 42,
    "drop_seed": 42,
    "data_random_seed": 42,
    "objective": 'binary',
    "boosting_type": 'dart',
    "verbose": -1,
    "metric": 'amex_metric',
    "is_unbalance": True,
    "boost_from_average": True,
    "n_jobs": -1
}

In [12]:
bucket_id = get_P_2_buckets(train_df, nfolds=cfg.n_folds)

In [13]:
kfold = model_selection.StratifiedKFold(n_splits = cfg.n_folds, shuffle = True, random_state = cfg.seed)
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train_df, train_df[cfg.target], groups=bucket_id)):
    if fold in [0, 1]:
        print(' ')
        print('-'*50)
        print(f'Training fold {fold} with {len(features)} features...')
        x_train, x_val = train_df[features].iloc[trn_ind], train_df[features].iloc[val_ind]
        y_train, y_val = train_df[cfg.target].iloc[trn_ind], train_df[cfg.target].iloc[val_ind]
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature = cat_features)
        lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature = cat_features)

        del x_train, y_train
        gc.collect()

        global max_score 
        max_score = 0.75
        def save_model():
            def callback(env):
                global max_score
                iteration = env.iteration
                score = env.evaluation_result_list[0][2]
                if iteration % 500 == 0:
                    print('iteration {}, score= {:.05f}, max_score= {:.05f}'.format(iteration,score,max_score))
                if score > max_score:
                    max_score = score
                    path = f'{cfg.save_dir}/fold_{fold}'
                    os.makedirs(path, exist_ok=True)
                    for fname in os.listdir(path):
                        if fname.endswith(".pkl"):
                            os.remove(os.path.join(path, fname))
                    # print('High Score: iteration {}, score={:.05f}'.format(iteration, score))
                    joblib.dump(env.model,os.path.join(path,f"{score}.pkl"))
            callback.order = 0
            return callback

        model = lgb.train(
            params = params,
            train_set = lgb_train,
            num_boost_round = 30000,
            valid_sets = [lgb_valid],
            early_stopping_rounds = 2000,
            verbose_eval = 500,
            feval = lgb_amex_metric,
            callbacks=[save_model()]
        )
        feat_imp=pd.DataFrame({'Variables':features,'Importance':model.feature_importance()})
        feat_imp=feat_imp.sort_values(by='Importance',ascending=False)
        print(feat_imp.head(20))

        print(f"Fold {fold} Best score : {max_score}")
        model = joblib.load(f"{cfg.save_dir}/fold_{fold}/{max_score}.pkl")

        val_pred = model.predict(x_val)
        score = amex_metric(y_val, val_pred)
        oof_df = y_val.reset_index()
        oof_df['pred'] = val_pred
        oof_df.to_csv(f"{cfg.save_oof}/oof_fold{fold}_seed{cfg.seed}.csv", index=False)

        print(f'Our fold {fold} CV score is {score}')
        del x_val, y_val, lgb_train, lgb_valid
        gc.collect()

 
--------------------------------------------------
Training fold 0 with 3858 features...
iteration 0, score= 0.61908, max_score= 0.75000
[500]	valid_0's amex_metric: 0.752423
iteration 500, score= 0.75248, max_score= 0.75242
[1000]	valid_0's amex_metric: 0.765194
iteration 1000, score= 0.76519, max_score= 0.76519
[1500]	valid_0's amex_metric: 0.771493
iteration 1500, score= 0.77151, max_score= 0.77166
[2000]	valid_0's amex_metric: 0.775901
iteration 2000, score= 0.77618, max_score= 0.77620
[2500]	valid_0's amex_metric: 0.779879
iteration 2500, score= 0.78001, max_score= 0.78012
[3000]	valid_0's amex_metric: 0.784738
iteration 3000, score= 0.78474, max_score= 0.78475
[3500]	valid_0's amex_metric: 0.786014
iteration 3500, score= 0.78599, max_score= 0.78606
[4000]	valid_0's amex_metric: 0.787386
iteration 4000, score= 0.78739, max_score= 0.78748
[4500]	valid_0's amex_metric: 0.788234
iteration 4500, score= 0.78813, max_score= 0.78835
[5000]	valid_0's amex_metric: 0.788981
iteration 5000

In [14]:
oof_dfs = []
for fold in range(0, 5):
    try:
        tmp_df = pd.read_csv(f"{cfg.save_oof}/oof_fold{fold}_seed{cfg.seed}.csv")
        oof_dfs.append(tmp_df)
    except:
        pass
oof_dfs = pd.concat(oof_dfs)
oof_dfs = oof_dfs.sort_values('index')
oof_dfs.to_csv(f"{cfg.save_oof}/oof_fold{cfg.seed}.csv")
oof_score = amex_metric(y_true=oof_dfs['target'], y_pred=oof_dfs['pred'])
print(oof_score)

0.7982081642589892


In [15]:
del train_df
gc.collect()

172

In [16]:
test_df = pd.read_feather("input/feature_engg_v2/sammy_top_pp_sq_test.f")

In [17]:
features.__len__()

3858

In [18]:
test_df.shape[0]//3

308207

In [19]:
308207 + 308207

616414

In [20]:
test_df1 = test_df.iloc[:308207]
test_df2 = test_df.iloc[308207:616414]
test_df3 = test_df.iloc[616414:]

test_df1.shape, test_df2.shape, test_df3.shape

((308207, 3859), (308207, 3859), (308207, 3859))

In [21]:
ids = test_df['customer_ID']

In [22]:
%%time
y_pred_list = []
model_list = glob.glob(f"{cfg.save_dir}/fold*/*.pkl")
print(len(model_list))
for path in tqdm(model_list):
    model = joblib.load(path)
    y_pred = model.predict( test_df1[features])
    y_pred_list.append(y_pred)
y_pred_mean1 = np.mean(y_pred_list, axis=0)
y_pred_median1 = np.median(y_pred_list, axis=0)

5


  0%|          | 0/5 [00:00<?, ?it/s]

CPU times: user 4h 8min 45s, sys: 60 s, total: 4h 9min 45s
Wall time: 35min 49s


In [23]:
%%time
y_pred_list = []
model_list = glob.glob(f"{cfg.save_dir}/fold*/*.pkl")
print(len(model_list))
for path in tqdm(model_list):
    model = joblib.load(path)
    y_pred = model.predict( test_df2[features])
    y_pred_list.append(y_pred)
y_pred_mean2 = np.mean(y_pred_list, axis=0)
y_pred_median2 = np.median(y_pred_list, axis=0)

5


  0%|          | 0/5 [00:00<?, ?it/s]

CPU times: user 4h 6min 59s, sys: 29.4 s, total: 4h 7min 28s
Wall time: 34min 46s


In [24]:
%%time
y_pred_list = []
model_list = glob.glob(f"{cfg.save_dir}/fold*/*.pkl")
print(len(model_list))
for path in tqdm(model_list):
    model = joblib.load(path)
    y_pred = model.predict( test_df3[features])
    y_pred_list.append(y_pred)
y_pred_mean3 = np.mean(y_pred_list, axis=0)
y_pred_median3 = np.median(y_pred_list, axis=0)

5


  0%|          | 0/5 [00:00<?, ?it/s]

CPU times: user 4h 5min 2s, sys: 27.7 s, total: 4h 5min 29s
Wall time: 34min 33s


In [25]:
y_pred_mean = np.concatenate([y_pred_mean1, y_pred_mean2, y_pred_mean3])

In [26]:
y_pred_median = np.concatenate([y_pred_median1, y_pred_median2, y_pred_median3])

In [27]:
ss_df = pd.read_csv("input/sample_submission.csv")

In [28]:
sub_df = pd.DataFrame({
    "customer_ID" : ss_df['customer_ID'],
    "prediction" : y_pred_mean
})

sub_df.to_csv(f"{cfg.save_dir}/submission_{cfg.exp_name}.csv", index=False)

In [29]:
sub_df = pd.DataFrame({
    "customer_ID" : ss_df['customer_ID'],
    "prediction" : y_pred_median
})

sub_df.to_csv(f"{cfg.save_dir}/submission_{cfg.exp_name}_median.csv", index=False)