In [1]:
import sys
sys.path.append("../")
# import cupy, cudf

In [2]:
import gc
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import seaborn as sns
from kneed import KneeLocator
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, silhouette_score
from sklearn.cluster import KMeans
from skopt import BayesSearchCV
from statsmodels.tsa.stattools import adfuller, kpss, acf, pacf
from tqdm import tqdm

In [3]:
from sklearn.decomposition import PCA

In [4]:
from utils.eval_helpers import plot_roc_curves, plot_feature_importance, amex_metric
from utils.eda_helpers import plot_missing_proportion_barchart

In [5]:
RAW_DATA_PATH = "../raw_data"
os.listdir(RAW_DATA_PATH)

['train_labels.csv',
 '.DS_Store',
 'train_data.parquet',
 'test_data.parquet',
 'test_data.ftr',
 'train_data.csv',
 'train_data.ftr',
 'test_data.csv',
 'sample_submission.csv']

In [6]:
PROCESSED_DATA_PATH = "../processed_data"
EVALUATION_DATA_PATH = "../evaluation_data"
SUBMISSION_DATA_PATH = "../submissions"
MODELS_PATH = "../models"
CATEGORY_COLUMNS = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

In [7]:
%load_ext autoreload
%autoreload

### Preprocessing Train

In [8]:
train_data = pd.read_parquet(f"{RAW_DATA_PATH}/train_data.parquet")
train_data["S_2"] = pd.to_datetime(train_data["S_2"])

In [9]:
train_data.shape

(5531451, 190)

In [10]:
train_data.columns

Index(['customer_ID', 'S_2', 'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41',
       'B_3',
       ...
       'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143',
       'D_144', 'D_145'],
      dtype='object', length=190)

In [11]:
train_labels = pd.read_csv(f"{RAW_DATA_PATH}/train_labels.csv")

In [12]:
train_labels.shape

(458913, 2)

In [13]:
train_labels.columns

Index(['customer_ID', 'target'], dtype='object')

In [14]:
train_data = train_data.sort_values(by=["customer_ID", "S_2"])

In [15]:
train_data = train_data.merge(train_labels, on="customer_ID", how="left")

### Preparation

In [16]:
def get_cols(df, key):
    return [col for col in df.columns if key in col]

In [17]:
special_columns = ['customer_ID', 'S_2', 'target'] + get_cols(train_data, "has") + get_cols(train_data, "sign")

In [18]:
all_cols = [c for c in train_data.columns if c not in special_columns]
cat_features = CATEGORY_COLUMNS
num_features = [col for col in all_cols if col not in CATEGORY_COLUMNS]

In [19]:
len(all_cols), len(cat_features), len(num_features)

(188, 11, 177)

In [20]:
ground_truth_df = train_data.groupby("customer_ID")["target"].mean().reset_index()

#### Check Variances

In [21]:
num_feature_summary_stats = train_data.loc[:, num_features].describe()

In [22]:
num_feature_coef_of_vars = num_feature_summary_stats.loc["std"] / num_feature_summary_stats.loc["mean"]
num_feature_coef_of_vars = num_feature_coef_of_vars.sort_values(ascending=False)

In [23]:
high_var_features = num_feature_coef_of_vars[num_feature_coef_of_vars > 1].index.tolist()
high_var_features = sorted(high_var_features)

#### Check Missing Value

In [24]:
num_feature_missing_df = train_data.loc[:, num_features].isnull().sum()
num_feature_missing_df = num_feature_missing_df / train_data.shape[0]

In [25]:
low_missing_features = num_feature_missing_df[num_feature_missing_df < 0.1].index.tolist()
low_missing_features = sorted(low_missing_features)

In [26]:
high_var_low_missing_features = list(set(high_var_features).intersection(set(low_missing_features)))

In [27]:
len(high_var_low_missing_features)

108

### Read Clustering + Aggregation Result

In [None]:
train_num_agg = pd.read_parquet(f"{PROCESSED_DATA_PATH}/train_agg_data.parquet")
train_cluster_agg = pd.read_parquet(f"{PROCESSED_DATA_PATH}/train_cluster_agg.parquet")

In [None]:
train_num_agg.shape

In [None]:
train_cluster_agg.shape

In [None]:
train_data_all_agg = pd.concat([train_cluster_agg.set_index("customer_ID"), train_num_agg], axis=1)

In [None]:
train_data_all_agg = train_data_all_agg.reset_index()

In [None]:
train_data_all_agg[["D_63_last", "D_64_last"]] = train_data_all_agg[["D_63_last", "D_64_last"]].astype("category")

In [None]:
del train_cluster_agg, train_num_agg

In [None]:
cluster_cols = get_cols(train_data_all_agg, key="cluster")

In [None]:
train_data_all_agg[cluster_cols] = train_data_all_agg[cluster_cols].astype("category")

In [None]:
for column in tqdm(cluster_cols):
    temp_dict = train_data_all_agg.groupby(column)["target"].mean().to_dict()
    train_data_all_agg.loc[:, f"{column}_"] = train_data_all_agg.loc[:, column].map(temp_dict).astype(float)
    train_data_all_agg = train_data_all_agg.drop(columns=column)

### Clustering

In [29]:
train_data["Date"] = train_data["S_2"].dt.to_period("M")

In [33]:
train_data_agg = train_data.groupby("customer_ID")["Date"].count().reset_index()

In [37]:
train_data_agg["target"] = train_labels["target"].to_list()

In [38]:
for column in high_var_low_missing_features:
    pca_model_filepath = f"{MODELS_PATH}/pca/{column}.pkl"
    kmeans_model_filepath = f"{MODELS_PATH}/kmeans/{column}.pkl"
    kmeans_result_filepath = f"{EVALUATION_DATA_PATH}/kmeans/{column}.csv"
    
    if all([os.path.exists(pca_model_filepath),
            os.path.exists(kmeans_model_filepath),
            os.path.exists(kmeans_result_filepath)]):
        print(f"Models for column {column} has already exists, skip the clustering")
        continue
    else:
        # Convert time series (column) to row
        unstack_df = train_data.groupby(['customer_ID', 'Date'])[column].mean().unstack().reset_index()
        unstack_df.columns = ["customer_ID"] + [f"pre_pca_{i}" for i in range(1, 14)]
        unstack_df_bfilled = unstack_df.bfill(axis=1)
        unstack_df_bfilled = unstack_df_bfilled.fillna(0)
        print(f"Column {column}:")

        # Fit PCA
        X = unstack_df_bfilled.iloc[:, 1:].to_numpy()
        pca = PCA(n_components=13)
        pca.fit(X)
        current_length = 1
        explained_var = 0
        while explained_var < 0.85:
            current_length += 1
            explained_var = pca.explained_variance_ratio_[:current_length].sum()
            
        print(f"By using {current_length} of PCA features, we can explain {explained_var:.3f} of the variation")

        # Transform using PCA
        pca_columns = ["pca{i}" for i in range(current_length)]
        unstack_df_bfilled[pca_columns] = pca.transform(X)[:, :current_length]
        unstack_df_bfilled = unstack_df_bfilled.merge(ground_truth_df, on="customer_ID", how="left")

        # KMeans inertia find solution
        kmeans_kwargs = dict(init="random", n_init=10, max_iter=100, random_state=1020)
        sse = []
        experiment_cluster_numbers = range(2, 9)
        scaled_features = unstack_df_bfilled[pca_columns]

        for k in experiment_cluster_numbers:
            kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
            kmeans.fit(scaled_features)
            sse.append(kmeans.inertia_)
        kl = KneeLocator(experiment_cluster_numbers, sse, curve="convex", direction="decreasing")
        print(f"It has been divided into {kl.elbow:.0f} clusters")

        # Refit with the best cluster number
        kmeans = KMeans(n_clusters=kl.elbow, **kmeans_kwargs)
        kmeans.fit(unstack_df_bfilled[pca_columns])
        unstack_df_bfilled["cluster"] = kmeans.labels_
        train_data_agg[f"{column}_kmeans_cluster"] = kmeans.labels_

        # Print the cluster preditive ability
        max_pos_proportion = unstack_df_bfilled.groupby("cluster")["target"].mean().max()
        min_pos_proportion = unstack_df_bfilled.groupby("cluster")["target"].mean().min()
        print(f"Its Kmeans cluster achieves min {min_pos_proportion:.2f} and max {max_pos_proportion:.2f} of positive proportion")
    
    
        # Calculate default probability for each cluster
        target_dist_by_cluster = train_data_agg.groupby(f"{column}_kmeans_cluster").agg(default_prop=("target", "mean"), 
                                                                                        size=("target", "count")).reset_index()
        # Export the model & data out
        pickle.dump(pca, open(pca_model_filepath, 'wb'))
        pickle.dump(kmeans, open(kmeans_model_filepath, 'wb'))
        target_dist_by_cluster.to_csv(kmeans_result_filepath, index=False)

        print("Successfully export both models & result", end="\n\n")

Models for column B_23 has already exists, skip the clustering
Models for column R_28 has already exists, skip the clustering
Models for column B_40 has already exists, skip the clustering
Models for column D_143 has already exists, skip the clustering
Models for column S_18 has already exists, skip the clustering
Models for column D_80 has already exists, skip the clustering
Models for column B_7 has already exists, skip the clustering
Models for column D_75 has already exists, skip the clustering
Models for column B_12 has already exists, skip the clustering
Models for column D_70 has already exists, skip the clustering
Models for column R_11 has already exists, skip the clustering
Models for column R_23 has already exists, skip the clustering
Models for column S_16 has already exists, skip the clustering
Models for column B_25 has already exists, skip the clustering
Column D_58:
By using 2 of PCA features, we can explain 0.951 of the variation
It has been divided into 4 clusters
Its

In [39]:
include_kmeans_balance_columns = ["B_1", "B_3", "B_4", "B_7", "B_9", "B_11", "B_16", "B_19", "B_20", "B_22", "B_23", "B_37"]

In [42]:
include_kmeans_delinquency_columns = ["D_39", "D_41", "D_44", "D_51", "D_58", "D_70", "D_74", "D_75", "D_78", "D_92", 
                                      "D_113", "D_127", "D_133"]

In [43]:
include_kmeans_payment_risk_columns = ["P_4", "R_1", "R_2", "R_3", "R_4", "R_10", "R_15"]

In [44]:
include_kmeans_spend_columns = ["S_6", "S_13"]

In [None]:
train_data_agg.head()

In [None]:
plt.style.use("fivethirtyeight")
plt.plot(experiment_cluster_numbers, sse)
plt.xticks(experiment_cluster_numbers)
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

In [None]:
b1_unstack_bfill_["pca1"] = b1_unstack_bfill["pca1"].apply(lambda x: min(x, 1))
b1_unstack_bfill_["pca2"] = b1_unstack_bfill["pca2"].apply(lambda x: min(x, 1))

In [None]:
b1_unstack_bfill_ = b1_unstack_bfill_.sort_values(by="target", ascending=False)

In [None]:
plt.figure(figsize=(21, 10))
sns.scatterplot(data=b1_unstack_bfill_, 
                x=b1_unstack_bfill_["pca1"], 
                y=b1_unstack_bfill_["pca2"],
                hue=b1_unstack_bfill_["target"],
                s=5)
plt.show()

In [None]:
# train_data_agg.to_parquet(f"{PROCESSED_DATA_PATH}/train_cluster_agg.parquet")

### Try Training

In [None]:
train, val = train_test_split(train_data_all_agg, 
                              test_size=0.15, 
                              random_state=1020, 
                              stratify=train_data_all_agg["target"])

In [None]:
train["target"].mean(), val["target"].mean()

In [None]:
X_train = train.drop(columns=["customer_ID", "Date", "target", "max_date", "min_date"])
X_val = val.drop(columns=["customer_ID", "Date", "target", "max_date", "min_date"])

In [None]:
y_train = train["target"]
y_val = val["target"]

In [None]:
best_params = {'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.88,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': 15,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.04,
 'n_estimators': 120,
 'n_jobs': -1,
 'num_leaves': 68,
 'objective': None,
 'random_state': 1020,
 'reg_alpha': 0,
 'reg_lambda': 6,
 'silent': True,
 'subsample': 0.85,
 'subsample_for_bin': 200000,
 'subsample_freq': 5,
 'scale_pos_weight': 6}

In [None]:
final_lgbm_clf = LGBMClassifier(**best_params)

In [None]:
final_lgbm_clf.fit(X_train, y_train)

In [None]:
imp_df = plot_feature_importance(final_lgbm_clf.feature_name_, 
                                 final_lgbm_clf.feature_importances_, 
                                 title="Feature Importance",
                                 limit=40)

In [None]:
y_train_pred = final_lgbm_clf.predict_proba(X_train)[:, 1]
y_val_pred = final_lgbm_clf.predict_proba(X_val)[:, 1]

In [None]:
plot_roc_curves([y_train, y_val], 
                [y_train_pred, y_val_pred], 
                labels=["Train", "Validation"], 
                title="Train Validation ROC AUC")

In [None]:
y_train_df = pd.DataFrame(y_train).reset_index(drop=True)
y_train_pred_df = pd.DataFrame(y_train_pred).rename(columns={0: "prediction"})

In [None]:
amex_metric(y_train_df, y_train_pred_df)

In [None]:
y_val_df = pd.DataFrame(y_val).reset_index(drop=True)
y_val_pred_df = pd.DataFrame(y_val_pred).rename(columns={0: "prediction"})

In [None]:
amex_metric(y_val_df, y_val_pred_df)

### Retrain

In [None]:
selected_features = imp_df.loc[imp_df["feature_importance"] > 2]["feature"].tolist()

In [None]:
X_train = train.loc[:, selected_features]
X_val = val.loc[:, selected_features]

In [None]:
final_lgbm_clf = LGBMClassifier(**best_params)

In [None]:
final_lgbm_clf.fit(X_train, y_train)

In [None]:
y_train_pred = final_lgbm_clf.predict_proba(X_train)[:, 1]
y_val_pred = final_lgbm_clf.predict_proba(X_val)[:, 1]

In [None]:
plot_roc_curves([y_train, y_val], 
                [y_train_pred, y_val_pred], 
                labels=["Train", "Validation"], 
                title="Train Validation ROC AUC")

In [None]:
y_train_df = pd.DataFrame(y_train).reset_index(drop=True)
y_train_pred_df = pd.DataFrame(y_train_pred).rename(columns={0: "prediction"})

In [None]:
amex_metric(y_train_df, y_train_pred_df)

In [None]:
y_val_df = pd.DataFrame(y_val).reset_index(drop=True)
y_val_pred_df = pd.DataFrame(y_val_pred).rename(columns={0: "prediction"})

In [None]:
amex_metric(y_val_df, y_val_pred_df)

### Revised Score

In [None]:
y_val_pred_df["new_scores"] = [max(min(1.06411*(x**1.05289) - 0.0219941, 1), 0) for x in y_val_pred]

In [None]:
plt.figure(figsize=(21, 10))
sns.scatterplot(data=y_val_pred_df, 
                x=y_val_pred_df["prediction"], 
                y=y_val_pred_df["new_scores"],
                s=5)
plt.show()

In [None]:
y_val_pred_df