In [None]:
# Source: https://www.kaggle.com/code/munumbutt/simple-lgbm-starter/notebook

def amex_metric(y_true, y_pred) -> float:
    """Return the AMEX score: the mean of normalized weighted Gini and top-4% capture.

    Rows whose ``target`` equals 0 carry weight 20; every other row carries
    weight 1. Rows are ranked by ``prediction`` in descending order.

    Parameters
    ----------
    y_true : pd.DataFrame, pd.Series or array-like
        Ground-truth labels. A DataFrame must contain a ``'target'`` column;
        a Series or plain array is converted to a one-column ``'target'`` frame.
    y_pred : pd.DataFrame, pd.Series or array-like
        Predicted scores. A DataFrame must contain a ``'prediction'`` column;
        a Series or plain array is converted to a one-column ``'prediction'`` frame.

    Returns
    -------
    float
        ``0.5 * (g + d)`` where ``g`` is the normalized weighted Gini and ``d``
        the share of positives found within the top 4% of cumulative weight.

    Notes
    -----
    Labelled inputs (Series / DataFrame) are still joined on their index, as
    before. An unlabelled input (ndarray, list) has no index of its own, so it
    is paired positionally with the other operand by borrowing that operand's
    index. Previously such an array received a fresh RangeIndex and was
    mis-joined against, e.g., a customer-indexed ``y_true``.
    """
    labelled = (pd.Series, pd.DataFrame)

    # Series keep their own index; only the column name is normalised.
    if isinstance(y_true, pd.Series):
        y_true = y_true.rename('target').to_frame()
    if isinstance(y_pred, pd.Series):
        y_pred = y_pred.rename('prediction').to_frame()

    # Unlabelled inputs are aligned by position with the other operand.
    if not isinstance(y_true, labelled):
        borrowed_index = y_pred.index if isinstance(y_pred, labelled) else None
        y_true = pd.DataFrame(np.asarray(y_true), columns=['target'], index=borrowed_index)
    if not isinstance(y_pred, labelled):
        y_pred = pd.DataFrame(np.asarray(y_pred), columns=['prediction'], index=y_true.index)

    def _ranked(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join labels with scores, rank by score (best first) and attach row weights.
        df = (pd.concat([truth, scores], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = np.where(df['target'] == 0, 20, 1)
        return df

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = _ranked(y_true, y_pred)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        within_cutoff = df['weight'].cumsum() <= four_pct_cutoff
        return (df.loc[within_cutoff, 'target'] == 1).sum() / (df['target'] == 1).sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = _ranked(y_true, y_pred)
        random = (df['weight'] / df['weight'].sum()).cumsum()
        weighted_pos = df['target'] * df['weight']
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * df['weight']).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # The perfect model ranks rows by the label itself.
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [None]:
# Source: https://www.kaggle.com/code/werus23/amex-feature-engineering

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    The frame is modified in place (columns are reassigned) and also returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are inspected. Only columns whose dtype compares
        equal to one of int16/int32/int64/float16/float32/float64 are touched.
    verbose : bool, default True
        When true, print the memory footprint after conversion and the
        percentage saved. When false, nothing is printed.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Range checks are strict (``min < value < max``), so a column touching a
    dtype's extreme value is given the next wider dtype. Float columns may be
    narrowed to float16, which lowers their precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # No candidate fits only when the data touch the int64 limits;
            # the column is then left as it is.
            new_type = next(
                (t for t in int_candidates
                 if c_min > np.iinfo(t).min and c_max < np.iinfo(t).max),
                None)
        else:
            # Anything that does not fit float16/float32 (including all-NaN
            # columns, whose comparisons are False) ends up as float64.
            new_type = next(
                (t for t in float_candidates
                 if c_min > np.finfo(t).min and c_max < np.finfo(t).max),
                np.float64)

        if new_type is not None:
            df[col] = df[col].astype(new_type)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
def xgb_amex(y_pred, y_true):
    """Return ``('amex', score)`` for predictions against a labelled container.

    ``y_true`` must expose ``get_label()``; presumably it is an xgboost
    ``DMatrix`` — confirm against the caller. The score is computed by
    ``amex_metric_np(y_pred, labels)``.
    """
    labels = y_true.get_label()
    score = amex_metric_np(y_pred, labels)
    return 'amex', score

# Created by https://www.kaggle.com/yunchonggan
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020
def amex_metric_np(preds: np.ndarray, target: np.ndarray) -> float:
    """NumPy implementation of the AMEX score, ``0.5 * (g + d)``.

    Parameters
    ----------
    preds : np.ndarray
        Predicted scores; higher values are ranked first.
    target : np.ndarray
        Labels, positionally matched with ``preds``. The weight formula
        ``20 - 19 * target`` gives 20 for a 0 label and 1 for a 1 label.

    Returns
    -------
    float
        Mean of ``g`` (Gini divided by a closed-form maximum) and ``d`` (share
        of positives whose normalized cumulative weight is at most 0.04).
    """
    # Rank both arrays from the highest prediction to the lowest.
    order = np.argsort(preds)[::-1]
    preds, target = preds[order], target[order]

    row_weight = 20.0 - target * 19.0
    weight_share = np.cumsum(row_weight / row_weight.sum())

    # d: positives captured inside the first 4% of cumulative weight.
    in_top_slice = weight_share <= 0.04
    d = np.sum(target[in_top_slice]) / np.sum(target)

    # Gini: weighted area between the Lorentz curve and the random baseline.
    positive_mass = target * row_weight
    lorentz = np.cumsum(positive_mass / positive_mass.sum())
    gini = np.sum((lorentz - weight_share) * row_weight)

    # Closed-form Gini of a perfect ranking, from the class counts alone.
    n_pos = np.sum(target)
    n_neg = target.shape[0] - n_pos
    total_weight = n_pos + 20 * n_neg
    gini_max = 10 * n_neg * (total_weight - 19) / total_weight

    return 0.5 * (gini / gini_max + d)

# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_score(y_true, y_pred):
    """Compute the AMEX score ``0.5 * (normalized_gini + top_four)``.

    ``y_true`` and ``y_pred`` are stacked into a two-column array (label,
    prediction). Rows with label 0 weigh 20, all others weigh 1.

    * ``top_four``: share of the label sum found in the rows, ranked by
      prediction, whose cumulative weight does not exceed 4% of total weight.
    * ``normalized_gini``: weighted Gini of the prediction ranking divided by
      the weighted Gini obtained when ranking by the label itself.
    """
    pairs = np.transpose(np.array([y_true, y_pred]))

    def _ranked_by(column):
        # Rows ordered from the largest to the smallest value of `column`.
        return pairs[pairs[:, column].argsort()[::-1]]

    def _weights(rows):
        return np.where(rows[:, 0] == 0, 20, 1)

    def _gini(rows):
        w = _weights(rows)
        random_curve = np.cumsum(w / np.sum(w))
        positive_mass = rows[:, 0] * w
        lorentz = np.cumsum(positive_mass) / np.sum(positive_mass)
        return np.sum((lorentz - random_curve) * w)

    by_prediction = _ranked_by(1)
    w_pred = _weights(by_prediction)
    inside_cut = np.cumsum(w_pred) <= int(0.04 * np.sum(w_pred))
    top_four = np.sum(by_prediction[inside_cut][:, 0]) / np.sum(by_prediction[:, 0])

    model_gini = _gini(by_prediction)
    perfect_gini = _gini(_ranked_by(0))

    return 0.5 * (model_gini / perfect_gini + top_four)


def xgb__amex_metric(labels, predt):
    """Return ``1 - amex_score(labels, predt)``, so that smaller values are better."""
    return 1 - amex_score(labels, predt)

In [None]:
'''

                .assign(
                    S_2_Day = dataset['S_2'].dt.day.astype('category'),
                    S_2_Month = dataset['S_2'].dt.month.astype('category'),
                    S_2_Year = dataset['S_2'].dt.year.astype('category'))
'''

In [None]:
'''
# Load compressed datasets
# Source: https://www.kaggle.com/datasets/munumbutt/amexfeather

'''

In [None]:
# Load the aggregated training table and index it by customer.
amex_train__agg = load_amex('train_agg').set_index('customer_ID')

# Stratified 80/20 hold-out split; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    amex_train__agg.drop('target', axis=1),
    amex_train__agg.target,
    stratify=amex_train__agg.target,
    test_size=0.20,
    random_state=1123)

# Drop the reference to the full table before collecting garbage.
del amex_train__agg
# `amex_test__agg` is not created in this cell. Remove it only if some other
# cell defined it, so Restart & Run All does not fail with NameError.
globals().pop('amex_test__agg', None)
gc.collect()

In [None]:
# Downcast the numeric columns of both feature splits (train first, then test).
X_train, X_test = (reduce_mem_usage(split) for split in (X_train, X_test))

In [None]:
'''
IterativeImputer(
        initial_strategy='median',
        sample_posterior=True,
        max_iter=2,
        add_indicator=True, 
        random_state=1123)

SimpleImputer(strategy='mean', add_indicator=True)
'''

In [None]:
'''
train_df = load_amex('train_agg', use_feather=True)

y_train = train_df.target
X_train = train_df.drop('target', axis=1)

del train_df
gc.collect()
'''

In [None]:
# Disabled experiment: the cell body is a bare string literal, so none of it runs.
# It describes a logistic-regression pipeline (preprocessing -> SelectKBest ->
# LogisticRegression) scored with amex_metric on the train and hold-out splits.
# NOTE(review): `predict_proba(...)[:, 0]` takes the first probability column,
# presumably the negative class for a binary scikit-learn classifier — confirm
# before re-enabling, since amex_metric ranks rows by descending prediction.
'''
log_reg = make_pipeline(
    preprocessor_pipeline, 
    SelectKBest(k=128), 
    LogisticRegression())

log_reg.fit(X_train, y_train)

train_score = amex_metric(y_train, log_reg.predict_proba(X_train)[:, 0])
test_score = amex_metric(y_test, log_reg.predict_proba(X_test)[:, 0])

print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')
'''

In [None]:
# Disabled experiment: the quoted block below is a bare string literal and does
# not run. It describes fitting a random-forest pipeline, scoring its hard
# `predict` outputs with amex_score, and pickling the model.
'''
rf_clf = make_pipeline(
    preprocessor_pipeline,
    RandomForestClassifier(
        n_estimators=25, 
        random_state=1123,
        n_jobs=5))

# Fit the classifier to the training set
rf_clf.fit(X_train, y_train)

# Predict the labels of the test set: preds
train_preds = rf_clf.predict(X_train)
test_preds = rf_clf.predict(X_test)

train_score = amex_score(y_train.values, train_preds)
test_score = amex_score(y_test.values, test_preds)

print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

with open('models/rf_clf.pkl','wb') as f:
    pickle.dump(rf_clf, f)
'''

# NOTE(review): this expression sits outside the quoted block above, so it is
# still executed. It references `rf_clf`, which in the visible notebook is only
# created inside the disabled string — confirm it is defined elsewhere,
# otherwise this raises NameError on a fresh kernel. The trailing comma turns
# the value into a 1-tuple, which suggests a fragment lifted from a pipeline.
SelectFromModel(
        rf_clf,
        prefit=True,
        max_features=512),

In [None]:
# Optional DMatrix construction, currently disabled:
#train_dmatrix = xgb.DMatrix(data=X_train, label=y_train)

# Hyper-parameters and metric hooks for the XGBoost model.
# NOTE(review): three different metric callables are registered below
# (`custom_metric`, `eval_metric`, `feval`); which of these keys is honoured
# depends on how this dict is consumed, which is not visible here — confirm.
xgb_params = dict(
    objective="binary:logistic",
    booster="dart",
    use_label_encoder=False,
    max_depth=4,
    learning_rate=0.032,
    subsample=0.80,
    colsample_bytree=0.64,
    custom_metric=amex_score,
    early_stopping_rounds=5,
    eval_metric=xgb__amex_metric,
    feval=xgb_amex,
    gamma=1.12,
    verbosity=3,
    seed=1123,
)

In [None]:

# Disabled cell: a bare string literal, so nothing below runs. It describes
# pickling `xgb_clf` and then a 3-fold stratified cross-validation of it.
# NOTE(review): `amex_scorer_func` is defined in the text, but the scorer that
# is passed on is `make_scorer(amex_score)` — confirm which one was intended
# before re-enabling.
'''
with open('models/xgb_clf.pkl','wb') as f:
    pickle.dump(xgb_clf, f)
def amex_scorer_func(estimator, X, y):
    y_pred = estimator.predict(X)
    return amex_score(y, y_pred)

amex_scorer = make_scorer(amex_score)

cv_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1123)

score = cross_val_score(
    xgb_clf, 
    X_train, 
    y_train, 
    scoring=amex_scorer, 
    cv=cv_kfold)
print(f'AMEX Score (Cross Validated): {score}')

'''

In [None]:
# Disabled cell: a bare string literal, so nothing below runs. It describes a
# manual 10-fold stratified loop over X_train / y_train that refits `xgb_clf`
# on each fold and records its `.score` plus amex_score on both fold halves.
# NOTE(review): only `skf_scores` is printed at the end; `skf_amex_scores` is
# collected but never shown.
'''
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1123)

skf_scores = []
skf_amex_scores = []
for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, y_train_fold = X_train.iloc[train_index], y_train.iloc[train_index]
    X_test_fold, y_test_fold = X_train.iloc[test_index], y_train.iloc[test_index]
    
    xgb_clf.fit(X_train_fold, y_train_fold)
    
    score = xgb_clf.score(X_test_fold, y_test_fold)
    print(score)
    skf_scores.append(score)
    skf_amex_scores.append((
        amex_score(y_train_fold, xgb_clf.predict(X_train_fold)),
        amex_score(y_test_fold, xgb_clf.predict(X_test_fold))
    ))
print(skf_scores)
'''

In [None]:
# Save test predictions to file
def make_submission(estimator, save_csv=True):
    """Predict on the aggregated test set and build the submission frame.

    Parameters
    ----------
    estimator : object
        Fitted model exposing ``predict(X)``.
    save_csv : bool, default True
        When true, also write the frame to
        ``.data/processed/kaggle_submission.csv`` without the index.

    Returns
    -------
    pd.DataFrame
        Two columns: ``customer_ID`` (taken from the test frame's index) and
        ``target`` (the estimator's predictions).

    Notes
    -----
    NOTE(review): ``estimator.predict`` is used as-is; if the estimator is a
    classifier this presumably yields class labels rather than probabilities —
    confirm that is what the submission should contain.
    NOTE(review): the output path starts with ``.data/`` (a hidden directory);
    confirm this is not a typo for ``./data/``.
    """
    X_test = load_amex('test_agg')

    # The training cell indexes its frame by customer_ID before fitting. Do the
    # same here so the ID is not fed to the model as a feature and so that
    # `X_test.index` really holds the customer IDs. Frames that are already
    # indexed pass through untouched.
    if 'customer_ID' in X_test.columns:
        X_test = X_test.set_index('customer_ID')

    y_pred = estimator.predict(X_test)

    submission = pd.DataFrame({
        'customer_ID': X_test.index,
        'target': y_pred})

    if save_csv:
        submission.to_csv('.data/processed/kaggle_submission.csv', index=False)

    # The former `if gc:` guard tested the module object, which is always
    # truthy, so this cleanup has always been unconditional.
    del X_test, y_pred
    gc.collect()

    return submission