In [14]:
import os
import gc
import pickle

# Import required packages
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, RobustScaler, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.feature_selection import SelectFromModel, SelectKBest, SequentialFeatureSelector, VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold

import xgboost as xgb

In [15]:
# Change working directory to project root
if os.getcwd().split("/")[-1] == "notebooks":
    os.chdir("../")

# Enable garbage collection
gc.enable()

pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [16]:
'''
# Lag Features
    for col in train_num_agg:
        if 'last' in col and col.replace('last', 'first') in train_num_agg:
            train_num_agg[col + '_lag_sub'] = train_num_agg[col] - train_num_agg[col.replace('last', 'first')]
            train_num_agg[col + '_lag_div'] = train_num_agg[col] / train_num_agg[col.replace('last', 'first')]
'''

"\n# Lag Features\n    for col in train_num_agg:\n        if 'last' in col and col.replace('last', 'first') in train_num_agg:\n            train_num_agg[col + '_lag_sub'] = train_num_agg[col] - train_num_agg[col.replace('last', 'first')]\n            train_num_agg[col + '_lag_div'] = train_num_agg[col] / train_num_agg[col.replace('last', 'first')]\n"

In [17]:
def aggregate_features(X_amex):
    numeric__agg = (X_amex
                    .select_dtypes(include='number')
                    .groupby('customer_ID')
                    .agg(['first', 'last', 'mean', 'min', 'max', 'std', 'sem']))
    numeric__agg.columns = ['__'.join(col) for col in numeric__agg.columns]
    
    for col in numeric__agg:
        if 'last' in col and col.replace('last', 'first') in numeric__agg:
            numeric__agg[col + '_lag_delta'] = numeric__agg[col] - numeric__agg[col.replace('last', 'first')]
            numeric__agg[col + '_lag_div'] = numeric__agg[col] / numeric__agg[col.replace('last', 'first')]
    
    categorical__agg = (X_amex
                        .select_dtypes(include='category')
                        .groupby('customer_ID')
                        .agg(['first', 'last', 'count', 'nunique']))
    categorical__agg.columns = ['__'.join(col) for col in categorical__agg.columns]

    X_amex__agg = pd.concat([categorical__agg, numeric__agg], axis=1)

    return X_amex__agg

In [18]:
#X_amex = load_amex('train').pipe(add_features)

In [19]:
# Source: https://www.kaggle.com/code/munumbutt/simple-lgbm-starter/notebook

def amex_metric(y_true, y_pred) -> float:
    if isinstance(y_true, (pd.Series, np.ndarray)):
        y_true = pd.DataFrame(y_true)
    if isinstance(y_pred, (pd.Series, np.ndarray)):
        y_pred = pd.DataFrame(y_pred, columns=['prediction'])
    
    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [20]:
# Source: https://www.kaggle.com/code/werus23/amex-feature-engineering

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [21]:
def xgb_amex(y_pred, y_true):
    return 'amex', amex_metric_np(y_pred,y_true.get_label())

# Created by https://www.kaggle.com/yunchonggan
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020
def amex_metric_np(preds: np.ndarray, target: np.ndarray) -> float:
    indices = np.argsort(preds)[::-1]
    preds, target = preds[indices], target[indices]

    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_mask = cum_norm_weight <= 0.04
    d = np.sum(target[four_pct_mask]) / np.sum(target)

    weighted_target = target * weight
    lorentz = (weighted_target / weighted_target.sum()).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    n_pos = np.sum(target)
    n_neg = target.shape[0] - n_pos
    gini_max = 10 * n_neg * (n_pos + 20 * n_neg - 19) / (n_pos + 20 * n_neg)

    g = gini / gini_max
    return 0.5 * (g + d)

# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_score(y_true, y_pred):

    labels     = np.transpose(np.array([y_true, y_pred]))
    labels     = labels[labels[:, 1].argsort()[::-1]]
    weights    = np.where(labels[:,0]==0, 20, 1)
    cut_vals   = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four   = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])

    gini = [0,0]
    for i in [1,0]:
        labels         = np.transpose(np.array([y_true, y_pred]))
        labels         = labels[labels[:, i].argsort()[::-1]]
        weight         = np.where(labels[:,0]==0, 20, 1)
        weight_random  = np.cumsum(weight / np.sum(weight))
        total_pos      = np.sum(labels[:, 0] *  weight)
        cum_pos_found  = np.cumsum(labels[:, 0] * weight)
        lorentz        = cum_pos_found / total_pos
        gini[i]        = np.sum((lorentz - weight_random) * weight)

    return 0.5 * (gini[1]/gini[0] + top_four)


def xgb__amex_metric(labels, predt):
    score = 1 - amex_score(labels, predt)
    return score

In [22]:
def load_amex(dataset, use_feather=True):
    if use_feather:
        return {
        'train': pd.read_feather('./data/external/compressed/train_data.ftr'),
        'test': pd.read_feather('./data/external/compressed/test_data.ftr'),
        'train_agg': pd.read_feather('./data/interim/train_agg.ftr'), 
        'test_agg': pd.read_feather('./data/interim/test_agg.ftr')
        }.get(dataset)
    elif dataset == 'train':
        return pd.read_csv('./data/raw/train_data.csv')
    elif dataset == 'test':
        return pd.read_csv('./data/raw/test_data.csv')
    else:
        raise ValueError

def add_features(dataset):
    to_drop = ['target', 'S_2', 'B_17', 'B_29', 'B_39', 'B_42', 'D_105', 'D_106', 'D_108', 'D_110', 'D_111', 'D_132', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142', 'D_42', 'D_43', 'D_46', 'D_49', 'D_50', 'D_53', 'D_56', 'D_66', 'D_73', 'D_76', 'D_77', 'D_82', 'D_87', 'D_88', 'R_26', 'R_9', 'S_27', 'S_9']

    cols_to_drop = dataset.columns[dataset.columns.isin(to_drop)].tolist()
    print(f'Dropped Columns: {cols_to_drop}')

    amex_agg = (dataset
                .set_index('customer_ID')
                .drop(cols_to_drop, axis=1)
                .pipe(aggregate_features)
                .replace([np.inf, -np.inf], np.nan))
    
    if 'target' in cols_to_drop:
        amex_agg['target'] = (dataset
                              .set_index('customer_ID')
                              .groupby('customer_ID')
                              .tail(1)
                              .target)
    
    return amex_agg

In [23]:
'''

                .assign(
                    S_2_Day = dataset['S_2'].dt.day.astype('category'),
                    S_2_Month = dataset['S_2'].dt.month.astype('category'),
                    S_2_Year = dataset['S_2'].dt.year.astype('category'))
'''

"\n\n                .assign(\n                    S_2_Day = dataset['S_2'].dt.day.astype('category'),\n                    S_2_Month = dataset['S_2'].dt.month.astype('category'),\n                    S_2_Year = dataset['S_2'].dt.year.astype('category'))\n"

In [24]:
def amex_features(X_train, verbose=True):
    numeric_features = (X_train
                        .select_dtypes(include='number')
                        .columns
                        .tolist())

    categorical_features = (X_train
                            .select_dtypes(include='category')
                            .columns
                            .tolist())

    ordinal_features = []
    
    if verbose:
        print(f'Numeric Features - Count(#): {len(numeric_features)}')
        print(f'Categorical Features - Count(#): {len(categorical_features)}')
        print(f'Ordinal Features - Count(#): {len(ordinal_features)}')
    
    return {
        'numeric': numeric_features,
        'categorical': categorical_features,
        'ordinal': ordinal_features
    }

In [25]:
'''
# Load compressed datasets
# Source: https://www.kaggle.com/datasets/munumbutt/amexfeather

amex_train = load_amex('train', use_feather=True)
amex_test = load_amex('test', use_feather=True)

amex_train__agg = add_features(amex_train)
amex_test__agg = add_features(amex_test)

amex_train__agg.reset_index().to_feather('./data/interim/train_agg.ftr')
amex_test__agg.reset_index().to_feather('./data/interim/test_agg.ftr')
'''

Dropped Columns: ['S_2', 'D_42', 'D_43', 'D_46', 'D_49', 'D_50', 'D_53', 'D_56', 'S_9', 'B_17', 'D_66', 'D_73', 'D_76', 'D_77', 'R_9', 'D_82', 'B_29', 'D_87', 'D_88', 'D_105', 'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'S_27', 'B_42', 'D_132', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142', 'target']


  numeric__agg[col + '_lag_delta'] = numeric__agg[col] - numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_div'] = numeric__agg[col] / numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_delta'] = numeric__agg[col] - numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_div'] = numeric__agg[col] / numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_delta'] = numeric__agg[col] - numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_div'] = numeric__agg[col] / numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_delta'] = numeric__agg[col] - numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_div'] = numeric__agg[col] / numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_delta'] = numeric__agg[col] - numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_div'] = numeric__agg[col] / numeric__agg[col.replace('last', 'first')]
  numeric_

Dropped Columns: ['S_2', 'D_42', 'D_43', 'D_46', 'D_49', 'D_50', 'D_53', 'D_56', 'S_9', 'B_17', 'D_66', 'D_73', 'D_76', 'D_77', 'R_9', 'D_82', 'B_29', 'D_87', 'D_88', 'D_105', 'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'S_27', 'B_42', 'D_132', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142']


  numeric__agg[col + '_lag_delta'] = numeric__agg[col] - numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_div'] = numeric__agg[col] / numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_delta'] = numeric__agg[col] - numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_div'] = numeric__agg[col] / numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_delta'] = numeric__agg[col] - numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_div'] = numeric__agg[col] / numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_delta'] = numeric__agg[col] - numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_div'] = numeric__agg[col] / numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_delta'] = numeric__agg[col] - numeric__agg[col.replace('last', 'first')]
  numeric__agg[col + '_lag_div'] = numeric__agg[col] / numeric__agg[col.replace('last', 'first')]
  numeric_

In [26]:
amex_train__agg = load_amex('train_agg').set_index('customer_ID')

X_train, X_test, y_train, y_test = train_test_split(
    amex_train__agg.drop('target', axis=1), 
    amex_train__agg.target,
    stratify=amex_train__agg.target,
    test_size=0.20,
    random_state=1123)

del amex_train__agg, amex_test__agg
gc.collect()

0

In [35]:
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

Memory usage after optimization is: 1184.82 MB
Decreased by 24.9%
Memory usage after optimization is: 283.60 MB
Decreased by 28.1%


In [36]:
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

In [28]:
'''
IterativeImputer(
        initial_strategy='median',
        sample_posterior=True,
        max_iter=2,
        add_indicator=True, 
        random_state=1123)

SimpleImputer(strategy='mean', add_indicator=True)
'''

"\nIterativeImputer(\n        initial_strategy='median',\n        sample_posterior=True,\n        max_iter=2,\n        add_indicator=True, \n        random_state=1123)\n\nSimpleImputer(strategy='mean', add_indicator=True)\n"

In [29]:
'''
train_df = load_amex('train_agg', use_feather=True)

y_train = train_df.target
X_train = train_df.drop('target', axis=1)

del train_df
gc.collect()
'''

"\ntrain_df = load_amex('train_agg', use_feather=True)\n\ny_train = train_df.target\nX_train = train_df.drop('target', axis=1)\n\ndel train_df\ngc.collect()\n"

In [30]:
features = amex_features(X_train)

numeric_preprocessor = make_pipeline(
    SimpleImputer(strategy='median', add_indicator=True))

categorical_preprocessor = make_pipeline(
    SimpleImputer(strategy='most_frequent', add_indicator=True),
    OneHotEncoder(handle_unknown='ignore', sparse=False))

ordinal_preprocessor = make_pipeline(
    SimpleImputer(strategy='constant', fill_value=-1, add_indicator=True),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-11))

feature_preprocessor = ColumnTransformer([
        ('numeric', numeric_preprocessor, features['numeric']),
        ('categorical', categorical_preprocessor, features['categorical'])
    ], verbose_feature_names_out=True)

feature_selector = SelectFromModel(
    RandomForestClassifier(
        n_estimators=25,
        random_state=1123), 
    max_features=750)

preprocessor_pipeline = make_pipeline(
    feature_preprocessor)

Numeric Features - Count(#): 1316
Categorical Features - Count(#): 20
Ordinal Features - Count(#): 0


In [31]:
'''
log_reg = make_pipeline(
    preprocessor_pipeline, 
    SelectKBest(k=128), 
    LogisticRegression())

log_reg.fit(X_train, y_train)

train_score = amex_metric(y_train, log_reg.predict_proba(X_train)[:, 0])
test_score = amex_metric(y_test, log_reg.predict_proba(X_test)[:, 0])

print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')
'''

"\nlog_reg = make_pipeline(\n    preprocessor_pipeline, \n    SelectKBest(k=128), \n    LogisticRegression())\n\nlog_reg.fit(X_train, y_train)\n\ntrain_score = amex_metric(y_train, log_reg.predict_proba(X_train)[:, 0])\ntest_score = amex_metric(y_test, log_reg.predict_proba(X_test)[:, 0])\n\nprint(f'Train Score: {train_score}')\nprint(f'Test Score: {test_score}')\n"

In [32]:
'''
rf_clf = make_pipeline(
    preprocessor_pipeline,
    RandomForestClassifier(
        n_estimators=25, 
        random_state=1123,
        n_jobs=5))

# Fit the classifier to the training set
rf_clf.fit(X_train, y_train)

# Predict the labels of the test set: preds
train_preds = rf_clf.predict(X_train)
test_preds = rf_clf.predict(X_test)

train_score = amex_score(y_train.values, train_preds)
test_score = amex_score(y_test.values, test_preds)

print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

with open('models/rf_clf.pkl','wb') as f:
    pickle.dump(rf_clf, f)
'''

"\nrf_clf = make_pipeline(\n    preprocessor_pipeline,\n    RandomForestClassifier(\n        n_estimators=25, \n        random_state=1123,\n        n_jobs=5))\n\n# Fit the classifier to the training set\nrf_clf.fit(X_train, y_train)\n\n# Predict the labels of the test set: preds\ntrain_preds = rf_clf.predict(X_train)\ntest_preds = rf_clf.predict(X_test)\n\ntrain_score = amex_score(y_train.values, train_preds)\ntest_score = amex_score(y_test.values, test_preds)\n\nprint(f'Train Score: {train_score}')\nprint(f'Test Score: {test_score}')\n\nwith open('models/rf_clf.pkl','wb') as f:\n    pickle.dump(rf_clf, f)\n"

SelectFromModel(
        rf_clf,
        prefit=True,
        max_features=512),

In [44]:

# Print version of XGBoost used
print(f'XGB Version: {xgb.__version__}')

# Create the DMatrix: housing_dmatrix
#train_dmatrix = xgb.DMatrix(data=X_train, label=y_train)

# Create the parameter dictionary for each tree: params 
xgb_params = {"objective": "binary:logistic", 
              "booster": "dart",
              "use_label_encoder": False,
              "max_depth": 4,
              "learning_rate": 0.032,
              "subsample": 0.80,
              "colsample_bytree": 0.64,
              "custom_metric": amex_score,
              "early_stopping_rounds": 5,
              "eval_metric": xgb__amex_metric,
              "feval": xgb_amex,
              "gamma": 1.12,
              "verbosity": 3,
              "seed": 1123}

# Instantiate the XGBClassifier: xg_cl
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    booster='dart',
    use_label_encoder=False,
    max_depth=7,
    early_stopping_rounds=5,
    learning_rate=0.1,
    feval=xgb_amex,
    eval_metric=xgb__amex_metric,
    verbosity=3,
    seed=1123, n_jobs=-1)
# colsample_bytree=0.75,subsample=0.72,
# save

XGB Version: 1.6.1


In [37]:
preprocessor_pipeline.fit(X_train, y_train)

X_train__preprocessed = preprocessor_pipeline.transform(X_train)
X_test__preprocessed = preprocessor_pipeline.transform(X_test)
# eval_set=[(X_test__preprocessed, y_test)]

In [45]:
# Fit the classifier to the training set
xgb_clf.fit(X_train__preprocessed, y_train, 
            eval_set=[(X_test__preprocessed, y_test)])

[18:40:54] DEBUG: /home/conda/feedstock_root/build_artifacts/xgboost-split_1660208814268/work/src/gbm/gbtree.cc:155: Using tree method: 2
[18:41:48] INFO: /home/conda/feedstock_root/build_artifacts/xgboost-split_1660208814268/work/src/tree/updater_prune.cc:101: tree pruning end, 244 extra nodes, 0 pruned nodes, max_depth=7
[18:41:48] INFO: /home/conda/feedstock_root/build_artifacts/xgboost-split_1660208814268/work/src/gbm/gbtree.cc:909: drop 0 trees, weight = 1
[0]	validation_0-logloss:0.62983	validation_0-xgb__amex_metric:0.28736
[18:42:17] INFO: /home/conda/feedstock_root/build_artifacts/xgboost-split_1660208814268/work/src/tree/updater_prune.cc:101: tree pruning end, 240 extra nodes, 0 pruned nodes, max_depth=7
[18:42:17] INFO: /home/conda/feedstock_root/build_artifacts/xgboost-split_1660208814268/work/src/gbm/gbtree.cc:909: drop 0 trees, weight = 1
[1]	validation_0-logloss:0.57793	validation_0-xgb__amex_metric:0.27757
[18:42:48] INFO: /home/conda/feedstock_root/build_artifacts/xgbo

In [None]:
xgb_clf.get_params()

In [39]:
# Predict the labels of the test set: preds
train_preds = xgb_clf.predict(X_train__preprocessed)
test_preds = xgb_clf.predict(X_test__preprocessed)

train_score = amex_score(y_train.values, train_preds)
test_score = amex_score(y_test.values, test_preds)

print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: 0.5813243818286744
Test Score: 0.5548575600288617


In [None]:

'''
with open('models/xgb_clf.pkl','wb') as f:
    pickle.dump(xgb_clf, f)
def amex_scorer_func(estimator, X, y):
    y_pred = estimator.predict(X)
    return amex_score(y, y_pred)

amex_scorer = make_scorer(amex_score)

cv_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1123)

score = cross_val_score(
    xgb_clf, 
    X_train, 
    y_train, 
    scoring=amex_scorer, 
    cv=cv_kfold)
print(f'AMEX Score (Cross Validated): {score}')

'''

In [None]:
'''
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1123)

skf_scores = []
skf_amex_scores = []
for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, y_train_fold = X_train.iloc[train_index], y_train.iloc[train_index]
    X_test_fold, y_test_fold = X_train.iloc[test_index], y_train.iloc[test_index]
    
    xgb_clf.fit(X_train_fold, y_train_fold)
    
    score = xgb_clf.score(X_test_fold, y_test_fold)
    print(score)
    skf_scores.append(score)
    skf_amex_scores.append((
        amex_score(y_train_fold, xgb_clf.predict(X_train_fold)),
        amex_score(y_test_fold, xgb_clf.predict(X_test_fold))
    ))
print(skf_scores)
'''

In [None]:
# Save test predictions to file
def make_submission(estimator, save_csv=True):
    X_test = load_amex('test_agg')
    y_pred = estimator.predict(X_test)
    
    submission = pd.DataFrame({
        'customer_ID': X_test.index,
        'target': y_pred})
    
    if save_csv:
        submission.to_csv('.data/processed/kaggle_submission.csv', index=False)
    if gc:
        del X_test, y_pred
        gc.collect()

    return submission