In [1]:
import pandas as pd
import numpy as np
from typing import Tuple
import optuna
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import pickle
import time
import gc
import warnings
warnings.filterwarnings("ignore")

In [17]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Average of the normalized weighted Gini and the top-4% capture rate.

    Args:
        y_true: frame holding a ``target`` column.
        y_pred: frame holding a ``prediction`` column; it is concatenated
            column-wise with ``y_true``, so both frames must share an index.

    Returns:
        ``0.5 * (G + D)`` where ``G`` is the weighted Gini of the predictions
        divided by the weighted Gini of a perfect ranking, and ``D`` is the
        share of all ``target == 1`` rows found inside the top 4% of the
        cumulative weight.
    """
    def _ranked(labels: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Highest score first; rows with target 0 weigh 20, every other row weighs 1.
        ranked = pd.concat([labels, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].apply(lambda t: 20 if t == 0 else 1)
        return ranked

    def _gini(labels: pd.DataFrame, scores: pd.DataFrame) -> float:
        ranked = _ranked(labels, scores)
        weights = ranked['weight']
        random_curve = (weights / weights.sum()).cumsum()
        weighted_pos = ranked['target'] * weights
        lorentz_curve = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz_curve - random_curve) * weights).sum()

    def _capture_rate(labels: pd.DataFrame, scores: pd.DataFrame) -> float:
        ranked = _ranked(labels, scores)
        cutoff = int(0.04 * ranked['weight'].sum())
        inside_cutoff = ranked.loc[ranked['weight'].cumsum() <= cutoff]
        return (inside_cutoff['target'] == 1).sum() / (ranked['target'] == 1).sum()

    # A perfect ranking is obtained by scoring every row with its own label.
    perfect_scores = y_true.rename(columns={'target': 'prediction'})
    g = _gini(y_true, y_pred) / _gini(y_true, perfect_scores)
    d = _capture_rate(y_true, y_pred)

    return 0.5 * (g + d)

In [3]:
def get_score(model, test_data, y_true):
    """Print and return ROC AUC, F1 and the AmEx metric for a fitted classifier.

    Args:
        model: fitted classifier exposing ``predict`` and ``predict_proba``.
        test_data: feature matrix to score.
        y_true: ground-truth labels; wrapped into a one-column ``target`` frame.

    Returns:
        Tuple ``(roc_auc, f1, amex)``.
    """
    y_true = pd.DataFrame(y_true, columns=['target'])
    labels = y_true['target']

    hard_preds = model.predict(test_data)
    positive_proba = model.predict_proba(test_data)[:, 1]
    preds_proba = pd.DataFrame(positive_proba, columns=['prediction'], index=y_true.index)

    # ROC AUC is a ranking metric: it must be fed the positive-class scores.
    # Feeding thresholded 0/1 labels collapses the curve to a single operating point.
    roc_auc = roc_auc_score(labels, positive_proba)
    f1 = f1_score(labels, hard_preds)
    amex = amex_metric(y_true, preds_proba)

    print(f'roc_auc: {roc_auc}')
    print(f'f1: {f1}')
    print(f'amex_score : {amex}')

    return roc_auc, f1, amex

In [4]:
def get_train_test(X, y, test_size: float = 0.3, random_state: int = 11) -> Tuple:
    """Stratified train/test split.

    Args:
        X: feature table.
        y: labels; also used as the stratification key.
        test_size: share of rows placed in the test part (default 0.3).
        random_state: seed passed to ``train_test_split`` (default 11).

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

def get_train_val_test(X, y, holdout_size: float = 0.3, test_share: float = 0.5,
                       random_state: int = 11) -> Tuple:
    """Stratified train/validation/test split.

    The data is first split into train and hold-out parts; the hold-out part is
    then split again into validation and test parts.

    Args:
        X: feature table.
        y: labels; also used as the stratification key for both splits.
        holdout_size: share of all rows placed in the hold-out part (default 0.3).
        test_share: share of the hold-out part that becomes the test part (default 0.5).
        random_state: seed passed to both ``train_test_split`` calls (default 11).

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=holdout_size, stratify=y, random_state=random_state
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=test_share, stratify=y_holdout, random_state=random_state
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

# Train meta-models for test predictions

In [6]:
# Load the prepared training table; it carries the feature columns plus the 'target' column.
df = pd.read_parquet('../input/amex-finally/finally_dataset.parquet')

In [7]:
# Candidate categorical columns: the raw columns plus their per-customer first/last values.
cat_features = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
"B_30_first", "B_38_first", "D_114_first", "D_116_first", "D_117_first", "D_120_first", "D_126_first",
"D_63_first", "D_64_first", "D_66_first", "D_68_first", "B_30_last", "B_38_last", "D_114_last", "D_116_last", "D_117_last", 
"D_120_last", "D_126_last","D_63_last", "D_64_last", "D_66_last", "D_68_last" ]
# Columns removed from the candidate list above.
l = ['D_114', 'D_117', 'D_120', 'D_64', 'D_66', 'B_30_first', 'D_114_first', 'D_116_first', 'D_120_first', 'D_126_first', 
 'D_63_first', 'D_64_first', 'D_66_first', 'D_68_first', 'B_30_last', 'B_38_last', 'D_116_last', 'D_126_last', 'D_63_last', 
 'D_68_last']

# Filter in declaration order. A plain set difference yields an order that changes
# between kernel restarts (string hashing is randomised), which hurts reproducibility.
_excluded = set(l)
cat_features = [col for col in cat_features if col not in _excluded]

In [8]:
# Shift the categorical columns by +1 because they contain negative values.
# NOTE(review): the columns keep their numeric dtype here. An earlier un-assigned
# `.astype("category")` call had no effect and was removed; if a categorical dtype
# is really wanted, convert after the shift and assign the result back.
df[cat_features] = df[cat_features] + 1

In [9]:
# Run a garbage-collection pass to release objects that are no longer referenced.
gc.collect()

## XGBoost

In [10]:
# Optuna params
# NOTE: the dict used to list 'learning_rate' twice (0.0294... and 0.01). Python keeps
# only the last value for a repeated key, so 0.01 was the rate actually used; the
# shadowed entry is removed to make that explicit.
xgb_params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            'n_estimators': 10000,
            'learning_rate': 0.01,
            'booster': 'gbtree',
            'max_depth': 5,
            'grow_policy': 'depthwise',
            'sampling_method': 'gradient_based',
            'colsample_bytree': 0.66,
            'subsample': 0.25,
            'min_child_weight': 130,
            'gamma': 0.36990716718927885,
            'lambda': 0.020979804579924967,
            'random_state': 17,
            'tree_method':'gpu_hist',
            'predictor' : "gpu_predictor",
            'enable_categorical': True,
            'max_cat_to_onehot':3
            }

In [11]:
# Hold out 20% of the rows (stratified on the target) for early stopping.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target'),
    df['target'],
    test_size=0.20,
    stratify=df['target'],
    random_state=11,
)

# Fit XGBoost, reporting the loss on both the training and the held-out part.
xgb = XGBClassifier(**xgb_params)
xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=1000,
    verbose=250,
)

In [13]:
# Persist the fitted XGBoost model; the context manager guarantees the file is
# flushed and closed even if pickling fails.
file_name = "xgb.pkl"
with open(file_name, "wb") as model_file:
    pickle.dump(xgb, model_file)

## LightGBM

In [11]:
# Optuna params
# LightGBM settings, later unpacked into LGBMClassifier(**lgbm_params).
lgbm_params = dict(
    objective="binary",
    metric="binary_logloss",
    device_type='gpu',
    n_estimators=10000,
    learning_rate=0.01,
    boosting_type='gbdt',
    num_leaves=202,
    max_depth=6,
    min_data_in_leaf=5650,
    lambda_l1=0.0005649399426788199,
    lambda_l2=0.3919995828102032,
    min_gain_to_split=0.5151712833075788,
    feature_fraction=0.10564610997790758,
    bagging_fraction=0.6343447733973661,
    bagging_freq=4,
    seed=42,
)


In [16]:
# Hold out 20% of the rows (stratified on the target) for early stopping.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target'),
    df['target'],
    test_size=0.20,
    stratify=df['target'],
    random_state=0,
)

# Fit LightGBM, reporting the loss on both the training and the held-out part.
lgbm = LGBMClassifier(**lgbm_params)
lgbm.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=1000,
    categorical_feature=cat_features,
    verbose=250,
)

In [18]:
# Persist the fitted LightGBM model; the context manager guarantees the file is
# flushed and closed even if pickling fails.
file_name = "lgbm.pkl"
with open(file_name, "wb") as model_file:
    pickle.dump(lgbm, model_file)

## CatBoost

In [12]:
# Optuna params
# CatBoost settings, later unpacked into CatBoostClassifier(**cb_params).
cb_params = dict(
    task_type="GPU",
    iterations=10000,
    border_count=254,
    random_seed=17,
    objective='CrossEntropy',
    learning_rate=0.01,
    grow_policy='Depthwise',
    l2_leaf_reg=0.4988032266322125,
    random_strength=0,
    depth=8,
    max_ctr_complexity=0,
    bootstrap_type='Poisson',
    min_data_in_leaf=760,
)

In [13]:
# Hold out 20% of the rows (stratified on the target) for early stopping.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target'),
    df['target'],
    test_size=0.20,
    stratify=df['target'],
    random_state=17,
)

# Fit CatBoost with the held-out part as its evaluation set.
cb = CatBoostClassifier(**cb_params)
cb.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    early_stopping_rounds=1000,
    cat_features=cat_features,
    verbose=500,
)

In [14]:
# Persist the fitted CatBoost model with CatBoost's own serializer (file 'cb_total').
cb.save_model('cb_total') 

# K-fold training of meta-features

In [13]:
# Separate the feature matrix from the label column.
X, y = df.drop(columns='target'), df['target']

In [14]:
# Run a garbage-collection pass before the memory-heavy K-fold training.
gc.collect()

In [16]:
N_SPLITS = 3

# Out-of-fold predictions of the three base models, one row per training row.
# Float-initialised so probabilities are stored without a dtype change.
meta_train = pd.DataFrame(data=0.0, index=X.index, columns=['xgb', 'lgbm', 'cb'])

cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=11)

for CUR_SPLIT, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
    print(f"##### FOLD {CUR_SPLIT} #####")
    X_tr, X_v = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_v = y.iloc[train_idx], y.iloc[val_idx]

    # `val_idx` holds row POSITIONS, so the predictions are written with `.iloc`.
    # The former `meta_train[col].update(pd.Series(..., index=val_idx))` matched those
    # positions against index LABELS and mutated a column view, which silently does
    # nothing when pandas Copy-on-Write is active.

    print('train xgb')
    xgb = XGBClassifier(**xgb_params)
    xgb.fit(X_tr, y_tr,
            eval_set=[(X_tr, y_tr)],
            verbose=250)
    meta_train.iloc[val_idx, meta_train.columns.get_loc('xgb')] = xgb.predict_proba(X_v)[:, 1]

    print('train lgbm')
    lgbm = LGBMClassifier(**lgbm_params)
    lgbm.fit(X_tr, y_tr,
             eval_set=[(X_tr, y_tr)],
             categorical_feature=cat_features,
             verbose=250)
    meta_train.iloc[val_idx, meta_train.columns.get_loc('lgbm')] = lgbm.predict_proba(X_v)[:, 1]

    print('train cb')
    cb = CatBoostClassifier(**cb_params)
    cb.fit(X_tr, y_tr,
           cat_features=cat_features,
           verbose=500)
    meta_train.iloc[val_idx, meta_train.columns.get_loc('cb')] = cb.predict_proba(X_v)[:, 1]

In [20]:
# Attach the ground-truth labels next to the out-of-fold predictions
# (pandas aligns the assigned Series to meta_train by index label).
meta_train['target'] = df['target']

In [21]:
# Write the meta-features to disk; index=False leaves the row index out of the file.
meta_train.to_csv('meta_train.csv', index=False)

# Test handling

In [6]:
# Load the previously fitted base models.
# SECURITY NOTE: pickle.load executes arbitrary code embedded in the file; only load
# model files that come from a trusted source.
# Context managers close the file handles that the bare `open(...)` calls used to leak.
with open('../input/models/xgb_diff.pkl', "rb") as model_file:
    xgb_total = pickle.load(model_file)
with open('../input/models/lgbm_diff.pkl', "rb") as model_file:
    lgbm_total = pickle.load(model_file)
cb_total = CatBoostClassifier().load_model('../input/models/cb_diff')

In [7]:
# Load the raw, row-level test data (split into chunks and aggregated further below).
test_df = pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/test.parquet')

In [8]:
# Zero-filled holder for the test meta-features: 924621 rows, one column per base model.
meta_test_df = pd.DataFrame(0, index=pd.RangeIndex(924621), columns=['xgb', 'lgbm', 'cb'])

In [9]:
# Order matters: get_meta_test stores predictions of model i in the i-th of the
# 'xgb', 'lgbm', 'cb' columns.
all_models = [xgb_total, lgbm_total, cb_total]

### Split the test data into chunks

In [10]:
# Inclusive label bounds of each chunk for `.loc` slicing; `None` means "to the end".
# NOTE(review): the bounds are hard-coded — presumably chosen so no customer's rows
# are split across two chunks; confirm against the test data.
chunk_bounds = [
    (0, 1386893),
    (1386894, 2773784),
    (2773785, 4160676),
    (4160677, 5547567),
    (5547568, 6934473),
    (6934474, 8321375),
    (8321376, 9708267),
    (9708268, None),
]

test_chunks = [test_df.loc[first:last].copy() for first, last in chunk_bounds]

(test_df_1, test_df_2, test_df_3, test_df_4,
 test_df_5, test_df_6, test_df_7, test_df_8) = test_chunks

In [11]:
# Sanity check: the chunks together must hold exactly as many rows as the full test frame.
sum(chunk.shape[0] for chunk in test_chunks) == test_df.shape[0]

In [12]:
# The chunks are independent copies, so the full test frame can be dropped to free memory.
del test_df
gc.collect()

### Feature engineering for test data

In [13]:
# Read the training dataset only to obtain the column names the models were trained on.
req_features = pd.read_parquet('../input/amex-finally/finally_dataset.parquet')
required_features = list(req_features.columns)

In [14]:
# Only the column names are needed from here on; release the frame itself.
del req_features
gc.collect()

In [15]:
def feature_engineering(df):
    """Aggregate row-level records into one feature row per ``customer_ID``.

    Per customer the function builds: summary statistics of the numeric columns,
    differences between the last row and the 2nd/3rd-to-last rows, max-min
    spreads, last-minus-mean differences, aggregates of the categorical columns,
    and the raw last-row values of the columns listed in ``rfe_dropped_cols``.

    Args:
        df: row-level frame with a ``customer_ID`` column. ``customer_ID`` and
            ``S_2`` are excluded from the feature columns; every remaining column
            that is not in the local ``cat_features`` list is treated as numeric.

    Returns:
        The combined feature frame. Its index is replaced by the 0..n-1 index of
        the last-row frame, so ``customer_ID`` is not kept as the index.

    NOTE(review): the ``nth(-1) - nth(-k)`` subtractions presumably rely on ``nth``
    returning frames indexed by ``customer_ID`` (older pandas behaviour) — TODO
    confirm the pandas version this runs on.
    NOTE(review): the final ``concat`` pairs rows by position; this assumes the row
    order of ``groupby(...).tail(1)`` equals the sorted group-key order of the
    aggregated frames, i.e. that ``df`` is ordered by ``customer_ID`` — TODO confirm.
    """
    cols = [col for col in list(df.columns) if col not in ['customer_ID', 'S_2']]
    # Local list of raw categorical columns (shadows any module-level `cat_features`).
    cat_features = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
    num_features = [col for col in cols if col not in cat_features]

    #LAGS
    # lag1_* = last row minus second-to-last row; lag2_* = last row minus third-to-last row.
    lag = pd.concat([(df.groupby('customer_ID')[num_features].nth(-1) - df.groupby('customer_ID')[num_features].nth(-2)).add_prefix('lag1_'), 
                  (df.groupby('customer_ID')[num_features].nth(-1) - df.groupby('customer_ID')[num_features].nth(-3)).add_prefix('lag2_')], axis=1)#.reset_index()

    #Min-max diff
    # For each feature the (min, max) column pair is differenced along the columns,
    # so the 'max' slot ends up holding max - min; only those slots are kept.
    ptp = df.groupby('customer_ID')[num_features].agg(['min', 'max'])
    ptp = ptp.groupby(axis=1, level=0).diff()
    ptp = ptp.iloc[:, 1::2].rename(columns={'max': 'max_min_diff'})
    ptp.columns = ['_'.join(x) for x in ptp.columns]

    #STATS
    print('stats')
    stats_agg = df.groupby("customer_ID")[num_features].agg(['first', 'mean','median', 'std', 'min', 'max', 'last'])
    # Flatten the (feature, statistic) column MultiIndex into 'feature_statistic' names.
    stats_agg.columns = ['_'.join(x) for x in stats_agg.columns]
    print('last')
    last_minus_mean = (df.groupby('customer_ID')[num_features].last() - df.groupby('customer_ID')[num_features].mean()).add_prefix('sub_mean_')#.reset_index(inplace = True)

    
    #CAT
    cat_agg = df.groupby("customer_ID")[cat_features].agg(['count', 'first', 'last', 'nunique'])
    cat_agg.columns = ['_'.join(x) for x in cat_agg.columns]

    # Raw columns whose last-row value is appended unchanged to the aggregated features.
    rfe_dropped_cols = ['B_10', 'B_12', 'B_13', 'B_15', 'B_16', 'B_19', 'B_21', 'B_22', 'B_25', 'B_26', 'B_27', 'B_28', 
            'B_30', 'B_32', 'B_36', 'B_38', 'B_40', 'B_4', 'B_8', 'D_102', 'D_103', 'D_108', 'D_109', 'D_111', 
            'D_113', 'D_115', 'D_116', 'D_118', 'D_122', 'D_123', 'D_124', 'D_125', 'D_126', 'D_127', 'D_129', 
            'D_130', 'D_131', 'D_136', 'D_137', 'D_138', 'D_139', 'D_141', 'D_143', 'D_144', 'D_145', 'D_55', 
            'D_58', 'D_59', 'D_63', 'D_65', 'D_68', 'D_69', 'D_70', 'D_71', 'D_72', 'D_74', 'D_75', 'D_78', 
            'D_80', 'D_83', 'D_84', 'D_86', 'D_87', 'D_89', 'D_91', 'D_92', 'D_93', 'D_94', 'D_96', 'P_4', 
            'R_10', 'R_12', 'R_13', 'R_14', 'R_15', 'R_16', 'R_17', 'R_18', 'R_19', 'R_21', 'R_22', 'R_23', 
            'R_24', 'R_25', 'R_28', 'R_6', 'R_8', 'R_9', 'S_11', 'S_12', 'S_16', 'S_17', 'S_18', 'S_19', 'S_20', 
            'S_22', 'S_25', 'S_27', 'S_5', 'S_6', 'S_7']
    
    # Last row of every customer, re-indexed 0..n-1 (the customer_ID link is dropped here).
    add_dropped_cols_df = df.groupby('customer_ID').tail(1).reset_index(drop=True)[rfe_dropped_cols]
    
    dfs = [stats_agg, lag, ptp, last_minus_mean, cat_agg]
    # Join all per-customer frames onto the statistics frame (index-based join).
    train = dfs[0].join(dfs[1:])
    # Replace the joined frame's index with 0..n-1 so the last-row columns line up by position.
    train = pd.concat([train.set_index(add_dropped_cols_df.index), add_dropped_cols_df], axis=1)

    print('Train shape: ', train.shape)    
    
    return train 

### Predict each chunk of the test data

In [16]:
import itertools
def get_meta_test(chunks, models, meta_df, features):
    """Fill ``meta_df`` with positive-class probabilities of each model over all chunks.

    Args:
        chunks: iterable of raw row-level frames; each is passed through
            ``feature_engineering`` and reduced to the ``features`` columns.
        models: fitted classifiers exposing ``predict_proba``; the i-th model
            feeds the i-th of the ``'xgb'``, ``'lgbm'``, ``'cb'`` columns.
        meta_df: frame that receives the three columns (modified in place).
        features: column names selected from the engineered features.

    Returns:
        The same ``meta_df`` object, with the three columns overwritten.
    """
    # One bucket of per-chunk prediction arrays for each meta column.
    collected = [[], [], []]

    for part_no, part in enumerate(chunks, start=1):
        print(f"##### part: {part_no} #####")
        part_features = feature_engineering(part)[features]
        for slot, estimator in enumerate(models):
            print(f"##### model: {str(estimator)[:5]} #####")
            collected[slot].append(estimator.predict_proba(part_features)[:, 1])
        gc.collect()

    # Concatenate the chunk predictions in chunk order and store them column by column.
    for column, per_chunk in zip(('xgb', 'lgbm', 'cb'), collected):
        meta_df[column] = list(itertools.chain.from_iterable(per_chunk))

    return meta_df

In [19]:
# required_features[1:] skips the first column name of the training dataset —
# presumably a non-feature column; TODO confirm which column that is.
meta_test = get_meta_test(test_chunks, all_models, meta_test_df, required_features[1:])

In [20]:
# Write the test meta-features to disk; index=False leaves the row index out of the file.
meta_test.to_csv('meta_test.csv', index=False)

# Training models on meta features

In [5]:
# Reload the saved meta-features (base-model predictions) for train and test.
meta_train = pd.read_csv('../input/metastack/meta_train.csv')
meta_test = pd.read_csv('../input/metastack/meta_test.csv')

In [4]:
# Optuna params
# CatBoost settings for the model trained on the meta-features.
cb_params = dict(
    task_type="GPU",
    iterations=500,
    border_count=254,
    random_seed=42,
    objective='CrossEntropy',
    learning_rate=0.011401459455245273,
    grow_policy='Depthwise',
    l2_leaf_reg=0.4988032266322125,
    random_strength=0,
    depth=8,
    max_ctr_complexity=0,
    bootstrap_type='Poisson',
    min_data_in_leaf=760,
)

In [5]:
# Hold out 10% of the meta-features (stratified on the target) as the evaluation set.
# The frame loaded above is named `meta_train`; the former `train_meta` was never
# defined and raised NameError on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    meta_train.drop('target', axis=1),
    meta_train['target'],
    test_size=0.10,
    stratify=meta_train['target'],
    random_state=42,
)

cb = CatBoostClassifier(**cb_params)
cb.fit(X_train, y_train,
       eval_set=(X_test, y_test),
       verbose=500)

# Stacking + blending

In [2]:
# Meta-features (base-model predictions) saved earlier for train and test.
meta_train = pd.read_csv('../input/metastack/meta_train.csv')
meta_test = pd.read_csv('../input/metastack/meta_test.csv')

# A second set of meta-features with the same column names, read from the 'meta-diff' input.
meta_train_diff = pd.read_csv('../input/meta-diff/meta_train_diff.csv')
meta_test_diff  = pd.read_csv('../input/meta-diff/meta_test_diff.csv')

In [3]:
# Optuna params
# CatBoost settings for the stacking model.
cb_params = {
            'task_type':"GPU",
            'iterations' : 3000,
            'border_count' : 254,
            'random_seed': 17,
            'objective': 'CrossEntropy',
            'learning_rate': 0.01,
            'grow_policy': 'Depthwise',
            'l2_leaf_reg': 0.4988032266322125,
            'random_strength': 0,
            'depth': 8,
            'max_ctr_complexity': 0,
            'bootstrap_type': 'Poisson',
            'min_data_in_leaf': 760
            }

# Optuna params
# XGBoost settings for the stacking model.
# NOTE: the dict used to list 'learning_rate' twice (0.0294... and 0.01). Python keeps
# only the last value for a repeated key, so 0.01 was the rate actually used; the
# shadowed entry is removed to make that explicit.
xgb_params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            'n_estimators': 3000,
            'learning_rate': 0.01,
            'booster': 'gbtree',
            'max_depth': 5,
            'grow_policy': 'depthwise',
            'sampling_method': 'gradient_based',
            'colsample_bytree': 0.66,
            'subsample': 0.25,
            'min_child_weight': 130,
            'gamma': 0.36990716718927885,
            'lambda': 0.020979804579924967,
            'random_state': 17,
            'tree_method':'gpu_hist',
            'predictor' : "gpu_predictor",
            'enable_categorical': True,
            'max_cat_to_onehot':3
            }

In [4]:
# Blend the two sets of train meta-features with equal weights, one column per base model.
total_meta_train = pd.DataFrame({
    model_name: 0.5 * meta_train[model_name] + 0.5 * meta_train_diff[model_name]
    for model_name in ('xgb', 'lgbm', 'cb')
})
total_meta_train['target'] = meta_train['target']

In [5]:
# Separate the blended meta-features from the label column.
X, y = total_meta_train.drop(columns='target'), total_meta_train['target']

In [8]:
N_SPLITS = 3

# Out-of-fold predictions of the two stacking models, one row per training row.
# Float-initialised so probabilities are stored without a dtype change.
meta_train_preds = pd.DataFrame(data=0.0, index=X.index, columns=['xgb', 'cb'])

cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=11)

for CUR_SPLIT, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
    print(f"##### FOLD {CUR_SPLIT} #####")
    X_tr, X_v = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_v = y.iloc[train_idx], y.iloc[val_idx]

    # `val_idx` holds row POSITIONS, so the predictions are written with `.iloc`.
    # The former `frame[col].update(pd.Series(..., index=val_idx))` matched those
    # positions against index LABELS and mutated a column view, which silently does
    # nothing when pandas Copy-on-Write is active.

    print('train xgb')
    xgb = XGBClassifier(**xgb_params)
    xgb.fit(X_tr, y_tr,
            eval_set=[(X_tr, y_tr), (X_v, y_v)],
            early_stopping_rounds=500,
            verbose=250)
    meta_train_preds.iloc[val_idx, meta_train_preds.columns.get_loc('xgb')] = xgb.predict_proba(X_v)[:, 1]

    print('train cb')
    cb = CatBoostClassifier(**cb_params)
    cb.fit(X_tr, y_tr,
           eval_set=(X_v, y_v),
           early_stopping_rounds=500,
           verbose=500)
    meta_train_preds.iloc[val_idx, meta_train_preds.columns.get_loc('cb')] = cb.predict_proba(X_v)[:, 1]

# NOTE: after the loop `xgb` and `cb` refer to the models fitted on the last fold only.

In [15]:
# Attach the ground-truth labels next to the out-of-fold stacking predictions.
meta_train_preds['target'] = meta_train['target']

In [10]:
# Blend the two sets of test meta-features with equal weights, one column per base model.
total_meta_test = pd.DataFrame({
    model_name: 0.5 * meta_test[model_name] + 0.5 * meta_test_diff[model_name]
    for model_name in ('xgb', 'lgbm', 'cb')
})

In [12]:
# Positive-class probabilities of the stacking models on the blended test meta-features.
# NOTE(review): `xgb` and `cb` are whatever these names were last bound to — when run
# right after the K-fold cell, that is the pair of models fitted on the final fold.
xgb_preds = xgb.predict_proba(total_meta_test)[:,1]
cb_preds = cb.predict_proba(total_meta_test)[:,1]

In [25]:
# Final blend: 65% XGBoost and 35% CatBoost stacking predictions.
preds = 0.65 * xgb_preds + 0.35 * cb_preds

# Submit

In [75]:
# Blend weights for the xgb / lgbm / cb columns of `meta_test_diff` (they sum to 1.0).
val1, val2, val3 = 0.33, 0.33, 0.34

In [76]:
# Weighted average of the three `meta_test_diff` columns.
# NOTE(review): this result is not referenced by any other visible cell.
mad_diff_submit = meta_test_diff['xgb'] * val1 + meta_test_diff['lgbm'] * val2 + meta_test_diff['cb'] * val3

In [26]:
# Fill the sample submission with the blended predictions and write the result.
# NOTE(review): `preds` is assigned by position — this assumes its row order matches
# the row order of sample_submission.csv; TODO confirm.
submission = pd.read_csv('../input/submission-csv/sample_submission.csv')
submission['prediction'] = preds
submission.to_csv('stack_xgb_cb.csv', index=False)

In [29]:
import os
# './' is the current directory, so this call leaves the working directory where it is.
os.chdir('./')
from IPython.display import FileLink
# Display a link to the written submission file as the cell output.
FileLink(r'stack_xgb_cb.csv')