In [None]:
import os
import sys

import featuretools as ft
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import phik
import seaborn as sns
import shap
import xgboost as xgb
from scipy.stats import randint, uniform
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.model_selection import (KFold, RandomizedSearchCV,
                                     StratifiedKFold, cross_val_score,
                                     train_test_split)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.utils.class_weight import compute_sample_weight

# Absolute path of the working directory's parent; it is put on sys.path
# (once) so that modules located there can be imported.
ROOT = os.path.abspath(os.pardir)
if sys.path.count(ROOT) == 0:
    sys.path.append(ROOT)

# Project-local modules (presumably resolved through the ROOT entry that was
# appended to sys.path just above -- verify the notebook's working directory).
from src.config import config
from src.E2EPipeline import E2EPipeline

# Render every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)


In [None]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)


task = pd.read_excel("/Users/kewenyang/Documents/GitHub/Maybank_Classification/data/Assessment.xlsx",
                       engine='openpyxl',
                       sheet_name=0)
task

In [None]:
pd.set_option('display.max_rows', None)

meta = pd.read_excel("/Users/kewenyang/Documents/GitHub/Maybank_Classification/data/Assessment.xlsx",
                       engine='openpyxl',
                       sheet_name=2)
meta

In [None]:
df_raw = pd.read_excel("/Users/kewenyang/Documents/GitHub/Maybank_Classification/data/Assessment.xlsx",
                       engine='openpyxl',
                       sheet_name=1)
df_raw.head()

In [None]:
# check target var distribution --> imbalance
df_raw.C_seg.value_counts()

In [None]:
# C_ID is not unique
df_raw.loc[df_raw.C_ID == 59688]

In [None]:
# drop original C_ID and replace it by row index
df_raw.drop("C_ID", axis=1, inplace=True)
df_raw.reset_index(drop=True, inplace=True)
df_raw = df_raw.reset_index()
df_raw.rename(columns={"index": "C_ID"}, inplace=True)
df_raw.head()

In [None]:
# check col with more missing values
missing = df_raw.isnull().any(axis=0)
missing = missing.loc[missing == True].index
missing

In [None]:
df_raw.info()

In [None]:
for col in missing:
    print(f"for col - {col}:")
    print(df_raw[col].value_counts())
    print("")


In [None]:
# Impute the missing values of HL_tag / AL_tag with 0.
# The filled columns are assigned back to the frame: calling
# fillna(inplace=True) on the temporary object returned by df_raw[col] is a
# chained in-place operation, which leaves df_raw untouched when pandas
# Copy-on-Write is active (and triggers a warning on recent pandas versions).
tag_cols = ["HL_tag", "AL_tag"]
df_raw[tag_cols] = df_raw[tag_cols].fillna(value=0)

In [None]:
# remove this col as it's a dummy
df_raw["PC"].value_counts()

In [None]:
df_raw.drop("PC", axis=1, inplace=True)

In [None]:
df_raw.HL_tag.value_counts()

In [None]:
# define columns' categories
nominal = ["C_EDU", "C_HSE", "gn_occ", "HL_tag", "AL_tag", "C_seg"]
ordinal = ["INCM_TYP"]
target_name = "C_seg"
index_col = "C_ID"

In [None]:
interval_cols = [col for col in df_raw.columns if col not in (nominal + ordinal)]
interval_cols

In [None]:
# # explore overall correlations
# df_corr = df_raw.loc[:, [c for c in df_raw.columns if c not in ["AL_tag", "pur_price_avg", "UT_AVE", "MAXUT", "N_FUNDS"]]]
# phik_overview = df_corr.phik_matrix(interval_cols=interval_cols)
# f = plt.figure(figsize=(15, 15))
# plt.matshow(phik_overview, fignum=f.number, cmap="Reds")
# plt.xticks(range(df_corr.shape[1]), df_corr.columns, fontsize=8, rotation=90)
# plt.yticks(range(df_corr.shape[1]), df_corr.columns, fontsize=8)
# cb = plt.colorbar()
# cb.ax.tick_params(labelsize=8)
# plt.title('Correlation matrix', fontsize=10)

In [None]:
# C_seg is relatively more correlated with C_AGE, gn_occ, NUM_PRD, CASATD_CNT

In [None]:
# convert categorical to string
for col in (nominal + ordinal):
    df_raw[col] = df_raw[col].astype(str)

df_raw.head(2)

In [None]:
# After converting, np.nan has been converted to string as well
df_raw.loc[(df_raw.isin(["nan", "NaN", "NA", "Nan", "Nill", "NAN"])).any(axis=1)]


In [None]:
# map string back to np.nan
df_raw.replace({"nan": np.nan, "NaN": np.nan, "NAN": np.nan}, inplace=True)

In [None]:
df_raw.head()

In [None]:
# prepare list of features for feature engineering
counts = [
    "NUM_PRD",
    "CASATD_CNT",
    "N_FUNDS",
    "ANN_N_TRX",
    ]

values = [
    "MTHCASA",
    "MAXCASA",
    "MINCASA",
    "pur_price_avg",
    "UT_AVE",
    "MAXUT",
    "CC_AVE",
    "MAX_MTH_TRN_AMT",
    "MIN_MTH_TRN_AMT",
    "MTHTD",
    "MAXTD",
    "Asset value",
    "AVG_TRN_AMT",
    "ANN_TRN_AMT",
    "CC_LMT",
    ]

# show the numeric columns not included in feature engineering list
[col for col in df_raw.columns if ((df_raw[col].dtype != "O") and (col not in counts + values))]

In [None]:
df_raw.head()

In [None]:
# prepare df for standardization
stand_cols = counts + values + ['C_AGE', 'DRvCR']
df_stand = df_raw.loc[:, stand_cols]
df_raw.drop(stand_cols, axis=1, inplace=True)
df_stand.head()

In [None]:
# standardize features
scaler = StandardScaler()
feature_arr = scaler.fit_transform(df_stand.values)
df_stand = pd.DataFrame(feature_arr, index=df_stand.index, columns=df_stand.columns)

df_raw = pd.concat([df_raw, df_stand], axis=1)
display("after scaling", df_raw.head())

In [None]:
# pd.set_option('display.max_rows', None)
# # check avail transform
# ft.list_primitives()[ft.list_primitives().type == "transform"]

In [None]:
# feature engineering 1
es = ft.EntitySet(id='ft')
es = es.add_dataframe(dataframe_name="ft", dataframe=df_raw.loc[:, values + [index_col]], index=index_col)

features_matrix,feature_names = ft.dfs(
    entityset=es,
    target_dataframe_name='ft',
    trans_primitives=["add_numeric", "subtract_numeric", "divide_numeric",'multiply_numeric'],
    max_depth=2,
    verbose=True)

features_matrix.reset_index(inplace=True)
features_matrix = features_matrix.drop(values, axis=1)

df_raw = df_raw.merge(features_matrix, how='left', on=index_col)

df_raw.head()

In [None]:
df_raw.shape

In [None]:
# feature engineering 2
es = ft.EntitySet(id='ft')
es = es.add_dataframe(dataframe_name="ft", dataframe=df_raw.loc[:, counts + [index_col]], index=index_col)

features_matrix,feature_names = ft.dfs(
    entityset=es,
    target_dataframe_name='ft',
    trans_primitives=["add_numeric", "subtract_numeric", "divide_numeric",'multiply_numeric'],
    max_depth=2,
    verbose=True)

features_matrix.reset_index(inplace=True)
features_matrix = features_matrix.drop(counts, axis=1)

df_raw = df_raw.merge(features_matrix, how='left', on=index_col)

df_raw.head()

In [None]:
df_raw.replace([np.inf, -np.inf], np.nan, inplace=True)

In [None]:
# drop the customer_id
df_X = df_raw.drop([index_col, target_name], axis=1)
df_y = df_raw.loc[:, [target_name]]

In [None]:
# train, val, test split
X, X_test, y, y_test = train_test_split(df_X, df_y, test_size=0.1, random_state=1, shuffle=True, stratify=df_y)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=1, shuffle=True, stratify=y)

display(f"train set size: {X_train.shape}, val set size: {X_val.shape}, test set size: {X_test.shape}")

In [None]:
# get cols requiring onehotencoding
categorical = [col for col in (ordinal + nominal) if col != target_name]
display(f"the categorical variables that need one hot encoding are: {categorical}")

# Fit the encoder on the complete predictor frame so that every category that
# occurs in any split is known to it (handle_unknown='error' raises otherwise).
# Only fitting is needed here; the transformed array was previously computed
# and thrown away.
enc = OneHotEncoder(handle_unknown='error', sparse_output=False, drop=None)
enc.fit(df_X.loc[:, categorical])
feature_labels = enc.get_feature_names_out()


def onehot_encode(frame: pd.DataFrame) -> pd.DataFrame:
    """Return `frame` with the categorical columns replaced by one-hot columns.

    Uses the module-level `enc`, `categorical` and `feature_labels`. The
    remaining columns keep their order, the one-hot columns are appended on
    the right, and the result gets a fresh 0..n-1 index.
    """
    encoded = pd.DataFrame(enc.transform(frame[categorical]), columns=feature_labels)
    remaining = frame.drop(categorical, axis=1).reset_index(drop=True)
    return pd.concat([remaining, encoded], axis=1)


# train + val
X = onehot_encode(X)

# for training data
X_train = onehot_encode(X_train)
display("training data after onehot encoding:", X_train.head())

X_val = onehot_encode(X_val)
display("val data after onehot encoding:", X_val.head())

X_test = onehot_encode(X_test)
display("test data after onehot encoding:", X_test.head())

In [None]:
# Missing predictor values have to stay np.nan so that XGBoost handles them.
# `DataFrame.replace(np.nan)` without a replacement value does not do that:
# depending on the pandas version it forward-fills each gap with the value of
# the previous row, or raises. `fillna(np.nan)` states the intent explicitly --
# every missing marker becomes np.nan and nothing is imputed.
X = X.fillna(np.nan)
X_train = X_train.fillna(np.nan)
X_val = X_val.fillna(np.nan)
X_test = X_test.fillna(np.nan)

In [None]:
# The one-hot encoder turns np.nan into a category of its own, i.e. an extra
# "<feature>_nan" column. Match on the suffix rather than on a substring: the
# feature name is recovered further down by cutting off the last four
# characters, which is only correct when "_nan" really is the suffix.
nan_cols = [col for col in X.columns if col.endswith("_nan")]
nan_cols

In [None]:
# one-hot encoding adds an extra "<feature>_nan" column for missing categories; remove it
def impute_encoded_col(df: pd.DataFrame, col_lst: list) -> pd.DataFrame:
    """Drop "<feature>_nan" indicator columns and mark the affected rows as missing.

    For every name in `col_lst` (expected to end in "_nan"), the rows in which
    the indicator equals 1 receive np.nan in the remaining one-hot columns of
    the same feature, i.e. the columns named "<feature>_<category>". The
    indicator column itself is removed.

    Args:
        df: frame holding the one-hot encoded columns. It is modified in place.
        col_lst: names of the "_nan" indicator columns to process.

    Returns:
        The same `df` object that was passed in.

    Raises:
        KeyError: if a name in `col_lst` is not a column of `df`.
    """
    suffix = "_nan"
    for col_n in col_lst:
        # "<feature>_nan" -> "<feature>_", the prefix shared by its one-hot columns
        prefix = col_n[:-len(suffix)] + "_"
        rows = df.index[df[col_n] == 1]

        # drop the indicator column
        df.drop(col_n, axis=1, inplace=True)

        # Blank only the sibling one-hot columns of this feature. A substring
        # test would also hit unrelated columns whose name merely contains the
        # feature name (e.g. an engineered column).
        siblings = [col for col in df.columns if isinstance(col, str) and col.startswith(prefix)]
        if siblings:
            df.loc[rows, siblings] = np.nan

    return df

# remove the "_nan" indicator columns from every split
X = impute_encoded_col(X, nan_cols)
X_train = impute_encoded_col(X_train, nan_cols)
X_val = impute_encoded_col(X_val, nan_cols)
X_test = impute_encoded_col(X_test, nan_cols)

# spot check: one random row per split
for split_name, split_frame in (("train", X_train), ("val", X_val), ("test", X_test)):
    display(f"{split_name} set after imputing:", split_frame.sample(1))

In [None]:
# columns judged useless by feature selection; removed from every split
useless = [
    'C_HSE_OFFICE',
    'ANN_TRN_AMT / AVG_TRN_AMT',
    'C_HSE_COMMERICAL BUILDING',
    'AVG_TRN_AMT / ANN_TRN_AMT',
    'C_HSE_INDUSTRIAL BUILDING',
    'C_HSE_HOTEL/ SERVICE APARTMENT',
]

X = X.drop(columns=useless)
X_train = X_train.drop(columns=useless)
X_val = X_val.drop(columns=useless)
X_test = X_test.drop(columns=useless)

In [None]:
# label encoding for target variable
le = preprocessing.LabelEncoder()

y = pd.DataFrame(le.fit_transform(y[target_name]), columns=["Y"])

y_train = pd.DataFrame(le.transform(y_train[target_name]), columns=["Y"])
display("training after label encoding:", y_train.head())

y_val = pd.DataFrame(le.transform(y_val[target_name]), columns=["Y"])
display("y_val after label encoding:", y_val.head())

y_test = pd.DataFrame(le.transform(y_test[target_name]), columns=["Y"])
display("y_test after label encoding:", y_test.head())

In [None]:
y_train.values == 0

In [None]:
# settings shared by the XGBoost models below
booster = "gbtree"
objective = 'binary:logistic'
eval_metric = "logloss"
early_stopping_rounds = 5

# per-row weights that balance the classes of the training labels
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

In [None]:
# %%time

# # Finetune Hyperparameters
# params = {
#     "colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9],
#     "gamma": uniform(0.3, 0.7),
#     "max_depth": [8, 9, 10, 11, 12],
#     "n_estimators": randint(30, 80),
#     "subsample": [0.6, 0.7, 0.8, 0.9],
#     "min_child_weight": [1, 1.5, 2, 2.5],
#     "eta": [0.3, 0.1, 0.05],
# }

# k = StratifiedKFold(n_splits=3, shuffle=False)
# scoring="f1"

# cv_model = xgb.XGBClassifier(
#     objective=objective,
#     tree_method= "auto",
#     eval_metric=eval_metric,
#     booster = booster)

# search = RandomizedSearchCV(
#     cv_model,
#     param_distributions=params,
#     scoring=scoring,
#     random_state=1,
#     n_iter=100,  # No. of combinations / fold
#     cv=k,
#     verbose=1,
#     n_jobs=1,
#     return_train_score=True,
#     refit=False,  # refit by manual as we need to plot the train-val curve to see overfitting problem
#     )

# search.fit(X_train, y_train, sample_weight=sample_weights)
# cv_results =  pd.DataFrame(search.cv_results_).loc[:, ["rank_test_score", "mean_test_score", "params"]].sort_values(by=["rank_test_score"])
# cv_results.head(5)

# # show the best set of hyperparams
# search.best_params_

In [None]:
# Fit on the training split while recording log loss on train and validation,
# then plot both curves to check for overfitting.
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# fixed settings
tree_method = "auto"
n_estimators = 69
colsample_bytree = 0.6

# tuning parameters
eta = 0.3
max_depth = 10
max_leaves = 2 ** max_depth

# prevent overfitting
min_child_weight = 1.5
gamma = 0.45095519663195377
subsample = 0.9


# classifier with early stopping enabled
model = xgb.XGBClassifier(
    objective=objective,
    booster=booster,
    tree_method=tree_method,
    eval_metric=eval_metric,
    early_stopping_rounds=early_stopping_rounds,
    missing=np.nan,
    n_estimators=n_estimators,
    eta=eta,
    max_depth=max_depth,
    max_leaves=max_leaves,
    min_child_weight=min_child_weight,
    gamma=gamma,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
)

# eval_set order matters: 'validation_0' is train, 'validation_1' is val;
# the val set is the one used for early stopping.
result = model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    sample_weight=sample_weights,
    verbose=True,
)

# check overfitting
results = result.evals_result()
train_loss = results['validation_0']['logloss']
val_loss = results['validation_1']['logloss']
epochs = len(train_loss)
x_axis = range(0, epochs)

# plot log loss
fig, ax = plt.subplots()
ax.plot(x_axis, train_loss, label='Train')
ax.plot(x_axis, val_loss, label='Validation')
ax.legend()
ax.set_ylabel('Log Loss')
ax.set_title('XGBoost Log Loss')
plt.show()

In [None]:
# Final model: refit on training + validation data (X, y) with the same
# hyperparameters. No early stopping here -- there is no eval_set, so exactly
# n_estimators boosting rounds are requested.
sample_weights = compute_sample_weight(class_weight='balanced', y=y)

# fixed settings
tree_method = "auto"
n_estimators = 69
colsample_bytree = 0.6
# booster = "dart"  # dart is left disabled; gbtree is used
booster = "gbtree"

# tuning parameters
eta = 0.3
max_depth = 10
max_leaves = 2 ** max_depth

# prevent overfitting
min_child_weight = 1.5
gamma = 0.45095519663195377
subsample = 0.9


model = xgb.XGBClassifier(
    objective=objective,
    booster=booster,
    tree_method=tree_method,
    eval_metric=eval_metric,
    missing=np.nan,
    n_estimators=n_estimators,
    eta=eta,
    max_depth=max_depth,
    max_leaves=max_leaves,
    min_child_weight=min_child_weight,
    gamma=gamma,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
)

model.fit(X, y, sample_weight=sample_weights, verbose=True)

In [None]:
# train
preds = model.predict(X_train)

# 0.9130316495873766
f1_score(y_train, preds)

In [None]:
# val
preds = model.predict(X_val)

# 0.9130316495873766
f1_score(y_val, preds)

In [None]:
# predict on test set
preds = model.predict(X_test)

# 0.9196223751544026
f1_score(y_test, preds)

In [None]:
# Threshold Moving/Tuning: pick the probability cut-off that maximises F1 on
# the validation split.
# NOTE(review): `model` was refitted on data that includes the validation
# split, so the selected threshold may be optimistic -- confirm.
from numpy import arange, argmax


def to_labels(pos_probs, threshold):
    """Return 1 where `pos_probs` is at least `threshold`, otherwise 0 (int array)."""
    return (pos_probs >= threshold).astype('int')


# probabilities of the positive class only
yhat = model.predict_proba(X_val)
probs = yhat[:, 1]

# candidate thresholds: 0.000, 0.001, ..., 0.999
thresholds = arange(0, 1, 0.001)

# F1 for each candidate
scores = []
for t in thresholds:
    scores.append(f1_score(y_val, to_labels(probs, t)))

# first candidate reaching the best score
ix = argmax(scores)
best_thresh = thresholds[ix]

print('Best performing threshold=%.3f with best F-Score=%.5f' % (best_thresh, scores[ix]))

In [None]:
# predict on test set with the tuned threshold
prob = model.predict_proba(X_test)[:, 1]

# Use the same inclusive comparison (>=) with which best_thresh was selected
# during threshold tuning; a strict ">" labels a probability that equals the
# threshold differently from the tuning step.
preds = (prob >= best_thresh).astype("int")

# previously recorded score: 0.9228875406664908
f1_score(y_test, preds)

In [None]:
# Plot the 50 largest feature importances of the final model.
# `feature_importances_` follows the model's `importance_type`; it is not set on
# this classifier, and for tree boosters the default is "gain", not "weight" --
# hence the title.
feature_imp = pd.DataFrame(
    sorted(zip(model.feature_names_in_, model.feature_importances_), key=lambda pair: pair[0], reverse=True),
    columns=['Feature', 'Value'],
)
data = feature_imp.sort_values(by="Value", ascending=False).head(50)

fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)

# draw on the axes created above instead of relying on the implicit current axes
sns.barplot(x="Value", y="Feature", data=data, ax=ax)
ax.set_title('Feature Importance By Gain')
ax.set_xlabel('importance score')
ax.set_ylabel('features')
ax.yaxis.set_visible(True)
ax.xaxis.set_visible(True)
fig.patch.set_facecolor('white')
# plt.savefig(f'../data/feature_importance_by_weight.png', bbox_inches='tight')

In [None]:
# features whose importance is exactly 0, in ranked order
ranked = feature_imp.sort_values(by="Value", ascending=False)
useless_cols = ranked.loc[ranked["Value"] == 0, "Feature"].tolist()
useless_cols

In [None]:
explainer = shap.TreeExplainer(model)
shap_values = explainer(X_test)

In [None]:
shap.summary_plot(shap_values.values, X_test, max_display=10, show=False, plot_type="dot", plot_size=[10,10])


In [None]:
"""
After all the analysis, we have built the end to end pipeline for production
"""

In [None]:
# run pipeline
# Read the workbook relative to the project root (ROOT, defined in the first
# cell) rather than from a user-specific absolute path, so this cell runs on
# any checkout that keeps the file under <project root>/data/.
df_raw = pd.read_excel(os.path.join(ROOT, "data", "Assessment.xlsx"),
                       engine='openpyxl',
                       sheet_name=1)

# preprocess the raw sheet, train, and report the F1 score returned by train()
pipe = E2EPipeline()
data = pipe.preprocess(df_raw)
f1 = pipe.train(data)
print(f"the f1 score is: {f1}")

In [None]:
!ls -lh