In [None]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from matplotlib import pyplot
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Training parquet file, resolved against the current working directory
path = os.getcwd() + "/dataset/train.parquet"
# Read the whole training table into memory
df = pd.read_parquet(path)

In [None]:
# Shrink customer_ID: keep only the last 16 hex characters of each ID string
# and parse them as a base-16 integer.
df['customer_ID'] = df['customer_ID'].str[-16:].apply(lambda hex_id: int(hex_id, base=16))

# The S_2 column is not used as a feature, so remove it
df.drop(columns=['S_2'], inplace=True)
# df = df.fillna(-127)
# A customer appears on several rows; keep only the final row of each customer
# and make customer_ID the index.
# NOTE(review): tail(1) returns the last row in file order - presumably rows are
# stored oldest-to-newest per customer; confirm, since S_2 is dropped above.
df = df.groupby('customer_ID').tail(1).set_index(['customer_ID'])

print('shape of data:', df.shape)

In [None]:
# Preview the first rows of the one-row-per-customer frame
df.head()

In [None]:
# Feature Engineering
# all_cols = [c for c in list(df.columns) if c not in ['customer_ID','S_2']]
# cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
# num_features = [col for col in all_cols if col not in cat_features]
#
# test_num_agg = df.groupby("customer_ID")[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
# test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]
#
# test_cat_agg = df.groupby("customer_ID")[cat_features].agg(['count', 'last', 'nunique'])
# test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]
#
# df = pd.concat([test_num_agg, test_cat_agg], axis=1)
# del test_num_agg, test_cat_agg
# print('shape after engineering', df.shape )

In [None]:
# Load the training labels and key them by the same integer customer_ID
# encoding (last 16 hex characters parsed as base 16) used for the features
targets = pd.read_csv(os.getcwd() + "/dataset/train_labels.csv")
targets['customer_ID'] = targets['customer_ID'].str[-16:].apply(lambda hex_id: int(hex_id, base=16))
targets = targets.set_index('customer_ID')

# Left-join the labels onto the feature rows through the customer_ID index
train_data = df.merge(targets, how='left', left_index=True, right_index=True)
train_data['target'] = train_data['target'].astype('int8')
del targets, df

# Sort by customer_ID so the row order (and therefore the CV splits) is
# deterministic, then turn customer_ID back into an ordinary column
train_data = train_data.sort_index().reset_index()

# FEATURES: every column except the first (customer_ID, restored by reset_index)
# and the last (expected to be target, appended by the merge)
FEATURES = train_data.columns[1:-1]
print(f'There are {len(FEATURES)} features!')
print("Train data Shape", train_data.shape)

In [None]:
# Model inputs: every column except the label and the customer identifier
x_train = train_data.drop(columns=['target', 'customer_ID'])
# Model output: the label column
y_train = train_data['target']

In [None]:
# Hold out 20% of the rows for evaluation; stratify on y_train so both parts
# keep the same class ratio, and fix the seed so the split is repeatable
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(
    x_train,
    y_train,
    test_size=0.20,
    stratify=y_train,
    random_state=0,
)

# Fit an XGBoost classifier with default hyper-parameters on the training part
xgb_cal = xgb.XGBClassifier()
xgb_cal.fit(x_train_split, y_train_split)

In [None]:
# Stratified k-fold cross-validation of a default XGBoost classifier.
# Per-fold accuracy on the held-out part and on the training part is collected
# so the two can be compared for over-/under-fitting.
kfold = 5
skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=0)
xgb_cal = xgb.XGBClassifier()
lst_accu_stratified_test = []
lst_accu_stratified_train = []
for i, (train_index, test_index) in enumerate(skf.split(x_train, y_train), start=1):
    print('[Fold %d/%d]' % (i, kfold))
    # skf.split yields POSITIONAL row indices, so rows must be selected with
    # .iloc. Plain x_train[train_index] on a DataFrame would look the integers
    # up as column labels and raise KeyError.
    x_train_split, x_test_split = x_train.iloc[train_index], x_train.iloc[test_index]
    y_train_split, y_test_split = y_train.iloc[train_index], y_train.iloc[test_index]
    # fit() retrains the classifier on this fold's training rows
    xgb_cal.fit(x_train_split, y_train_split)
    y_predict_test = xgb_cal.predict(x_test_split)
    lst_accu_stratified_test.append(accuracy_score(y_test_split, y_predict_test))
    y_predict_train = xgb_cal.predict(x_train_split)
    lst_accu_stratified_train.append(accuracy_score(y_train_split, y_predict_train))
# After the loop, xgb_cal and the *_split variables hold the model and data of
# the final fold.

In [None]:
# Per-fold accuracies; the fold count in the label is taken from the recorded
# scores so it stays correct if the number of folds changes
print(f"Value of {len(lst_accu_stratified_test)} Folds with Test Data:", lst_accu_stratified_test)
print(f"Value of {len(lst_accu_stratified_train)} Folds with Train Data:", lst_accu_stratified_train)

# Average accuracy across folds
print("Mean of Test data Accuracy:",np.mean(lst_accu_stratified_test))
print("Mean of Train data Accuracy:",np.mean(lst_accu_stratified_train))

In [None]:
# x-axis: fold number, derived from the recorded scores so the plot keeps
# working if the number of folds changes
values = list(range(1, len(lst_accu_stratified_test) + 1))
# Train vs. test accuracy for each cross-validation fold
pyplot.plot(values, lst_accu_stratified_train, '-o', label='Train')
pyplot.plot(values, lst_accu_stratified_test, '-o', label='Test')
pyplot.xticks(values)
pyplot.xlabel('Fold')
pyplot.ylabel('Accuracy')
pyplot.title('Accuracy per cross-validation fold')
pyplot.legend()
pyplot.show()

In [None]:
# Hard class predictions of the current model for the held-out rows
y_predict = xgb_cal.predict(x_test_split)
test_scores = accuracy_score(y_test_split, y_predict)

# Accuracy, precision and recall on the held-out rows
print('XGBoost Classifier Accuracy: {:.3f}'.format(test_scores))
print('\nXGBoost Classifier Precision: {:.3f}'.format(
    precision_score(y_test_split, y_predict)))
print('\nXGBoost Classifier Recall: {:.3f}'.format(
    recall_score(y_test_split, y_predict)))

In [None]:
# Hard class predictions of the current model for the rows it was trained on
y_predict_train = xgb_cal.predict(x_train_split)
train_scores = accuracy_score(y_train_split, y_predict_train)

# Same three scores on the training rows, to compare against the held-out
# scores when checking for over- or under-fitting
print('XGBoost Classifier Accuracy: {:.3f}'.format(train_scores))
print('\nXGBoost Classifier Precision: {:.3f}'.format(
    precision_score(y_train_split, y_predict_train)))
print('\nXGBoost Classifier Recall: {:.3f}'.format(
    recall_score(y_train_split, y_predict_train)))

In [None]:
# Show the single train accuracy and the single test accuracy as two markers,
# both drawn at x = 1
pyplot.plot(1, train_scores, linestyle='-', marker='o', label='Train')
pyplot.plot(1, test_scores, linestyle='-', marker='o', label='Test')
pyplot.legend()
pyplot.show()

In [None]:
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):
    """Return the mean of a weighted normalized Gini and a top-4% capture rate.

    Rows with label 0 get weight 20, all other rows weight 1.

    * Capture rate: rows are ordered by ``y_pred`` descending; the rows whose
      cumulative weight stays within 4% of the total weight are kept, and the
      sum of their labels is divided by the sum of all labels.
    * Normalized Gini: the weighted Gini obtained when ordering by ``y_pred``
      divided by the weighted Gini obtained when ordering by ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted scores, same length as ``y_true``.

    Returns
    -------
    float
        ``0.5 * (normalized_gini + capture_rate)``.
    """
    # Column 0 = label, column 1 = prediction
    pairs = np.transpose(np.array([y_true, y_pred]))

    def _weighted_gini(sort_col):
        # Weighted Gini of the labels when rows are ordered (descending) by sort_col
        ordered = pairs[pairs[:, sort_col].argsort()[::-1]]
        row_weight = np.where(ordered[:, 0] == 0, 20, 1)
        cum_weight_share = np.cumsum(row_weight / np.sum(row_weight))
        weighted_pos = ordered[:, 0] * row_weight
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - cum_weight_share) * row_weight)

    # Capture rate within the top 4% of cumulative weight, ordered by prediction
    by_pred = pairs[pairs[:, 1].argsort()[::-1]]
    pred_weight = np.where(by_pred[:, 0] == 0, 20, 1)
    within_cut = np.cumsum(pred_weight) <= int(0.04 * np.sum(pred_weight))
    top_four = np.sum(by_pred[within_cut][:, 0]) / np.sum(by_pred[:, 0])

    gini_by_pred = _weighted_gini(1)
    gini_by_true = _weighted_gini(0)
    return 0.5 * (gini_by_pred / gini_by_true + top_four)

# The metric ranks rows by predicted score, so it needs the predicted
# probability of the positive class. Hard 0/1 predictions collapse the ranking
# into two tied groups and make the top-4% cut and the Gini arbitrary.
amex_metric_score_test = amex_metric_mod(y_test_split, xgb_cal.predict_proba(x_test_split)[:, 1])
amex_metric_score_train = amex_metric_mod(y_train_split, xgb_cal.predict_proba(x_train_split)[:, 1])
print("Amex Metric Score on test data: ", amex_metric_score_test)
print("Amex Metric Score on train data: ", amex_metric_score_train)

In [None]:
# Test parquet file, resolved against the current working directory
path = os.getcwd() + "/dataset/test.parquet"
# Read the whole test table into memory
df_test = pd.read_parquet(path)

In [None]:
# S_2 is not used as a feature, so drop it
df_test.drop(['S_2'],axis=1,inplace=True)

# A customer appears on several rows; keep only the final row of each customer.
# The group key must be the real column name 'customer_ID' - the previous
# misspelled key raised KeyError.
df_test= df_test.groupby('customer_ID').tail(1)
# customer_ID becomes the index, leaving only feature columns in the frame
df_test= df_test.set_index(['customer_ID'])

df_test.shape

In [None]:
# Class probabilities for every test customer from the current xgb_cal model
y_test_predict=xgb_cal.predict_proba(df_test)

In [None]:
# Keep the second predict_proba column (presumably the positive class for a 0/1 target)
y_predict_final = y_test_predict[:, 1]
# One row per test customer: the ID from the frame index plus its predicted score
submission = pd.DataFrame(
    {
        "customer_ID": df_test.index,
        "prediction": y_predict_final,
    }
)

In [None]:
# Write the submission to submission.csv in the working directory, without the DataFrame index
submission.to_csv('submission.csv', index=False)