In [None]:
# def read_data():
#     # Read data from the csv
#     pass

# def clean_data():
#     # Clean data
#     pass

# def create_features():
#     # Feature Engineering
#     pass

# def split_data():
#     # Split the data
#     pass

# def save_data():
#     # Export data to csv file
#     pass

# def preprocess_data():
#     # Call all the preprocess functions as required
#     pass

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets, ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve

### Reading the data

In [None]:
# Load the raw credit-card default data from the project's data folder.
df = pd.read_csv("../data/raw/cc_default.csv")

### Data Wrangling

In [None]:
# Remove the ID column, which is treated as unrelated to the prediction task.
df = df.drop(columns=["ID"])

In [None]:
# Shorten the target name and renumber PAY_0 so the repayment-status
# columns run PAY_1..PAY_6.
df = df.rename(
    columns={"default payment next month": "is_default", "PAY_0": "PAY_1"}
)

In [None]:
# Peek at the first rows after the column clean-up.
df.head()

### Data Splitting

In [None]:
# Hold out 20% of the rows as the test split; the seed keeps the split fixed.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Column dtypes and non-null counts for the training split.
train_df.info()

### Feature Engineering


#### Approach (TODO):

- ~~Create new columns for percent of bill amount paid~~
- Group the 5 & 6 categories of education into one "unknown" category
- Assumptions for Education and Marriage
- Deal with class imbalance
- ~~Create new columns for credit utilization~~
- Repayment status



In [None]:
def preprocess(df):
    """
    Add engineered features and recode category levels on a copy of `df`.

    The input frame is left untouched; callers must use the returned frame.

    Parameters
    ----------
    df : pandas DataFrame
        Must contain LIMIT_BAL, EDUCATION, PAY_1..PAY_6, BILL_AMT1..BILL_AMT6
        and PAY_AMT1..PAY_AMT6.

    Returns
    ----------
        pandas DataFrame with the original columns (EDUCATION and PAY_X
        recoded) plus percent_paid1..5, percent_credit_utilised1..6,
        std_dev_bill and std_dev_pay
    """
    # Work on a copy: the splits handed in are slices of the full frame, and
    # writing into them in place triggers chained-assignment problems.
    df = df.copy()

    # Percent Bill Paid: PAY_AMT{i} is compared with the following bill
    # column, BILL_AMT{i + 1}.
    for i in range(1, 6):
        paid = df[f"PAY_AMT{i}"]
        billed = df[f"BILL_AMT{i + 1}"]

        percent = paid / billed * 100
        # A negative ratio (negative bill amount) is mapped above 100.
        percent = percent.mask(percent < 0, 100 - percent)
        # A zero bill makes the ratio inf/NaN; use 100 plus 1% of the payment.
        percent = percent.mask(billed == 0, 100 + paid * 0.01)

        df[f"percent_paid{i}"] = percent

    # Percent Credit Utilized
    for i in range(1, 7):
        df[f"percent_credit_utilised{i}"] = (
            df[f"BILL_AMT{i}"] / df["LIMIT_BAL"] * 100
        )

    # Standard Deviations across the six bill and six payment columns
    bill_amt_col_names = [f"BILL_AMT{i}" for i in range(1, 7)]
    pay_amt_col_names = [f"PAY_AMT{i}" for i in range(1, 7)]

    df["std_dev_bill"] = df[bill_amt_col_names].std(axis=1)
    df["std_dev_pay"] = df[pay_amt_col_names].std(axis=1)

    # Change Education categories 0, 5 and 6 to 4
    df["EDUCATION"] = df["EDUCATION"].replace({5: 4, 6: 4, 0: 4})

    # Change PAY_X values from -2 to 0
    for i in range(1, 7):
        df[f"PAY_{i}"] = df[f"PAY_{i}"].replace({-2: 0})

    return df

In [None]:
# Show the training split before feature engineering is applied.
train_df

In [None]:
# Add the engineered features to the training split.
train_df = preprocess(train_df)

In [None]:
# Show the training split again, now with the engineered columns.
train_df

In [None]:
# Apply the same feature engineering to the test split.
test_df = preprocess(test_df)

### Feature Types

In [None]:
# Separate the predictors from the "is_default" target for both splits.
X_train = train_df.drop(columns=["is_default"])
y_train = train_df["is_default"]

X_test = test_df.drop(columns=["is_default"])
y_test = test_df["is_default"]

In [None]:
# Column dtypes and non-null counts for the training features.
X_train.info()

In [None]:
# Columns to one-hot encode.
categorical_features = [
    "SEX",
    "EDUCATION",
    "MARRIAGE",
]

# Repayment-status columns, kept separate from the scaled/encoded groups.
pass_through_features = [
    "PAY_1",
    "PAY_2",
    "PAY_3",
    "PAY_4",
    "PAY_5",
    "PAY_6",
]

# Every remaining column is numeric. Walk X_train.columns rather than using
# set arithmetic so the feature order is the same on every run (set iteration
# order for strings changes with the interpreter's hash seed).
numerical_features = [
    column
    for column in X_train.columns
    if column not in categorical_features
    and column not in pass_through_features
]


In [None]:
# Sanity check: the three feature groups together account for every column.
assert X_train.shape[1] == sum(
    len(group)
    for group in (numerical_features, categorical_features, pass_through_features)
)

### Model

In [None]:
# Transformers used by the column transformer: a standard scaler and a
# dense one-hot encoder.
scalar = StandardScaler()
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output` — confirm against the installed version.
ohe = OneHotEncoder(handle_unknown = 'ignore', sparse = False)


In [None]:
preprocessor = make_column_transformer(
    (scalar, numerical_features),
    (ohe, categorical_features)
)

In [None]:
preprocessor.fit(X_train);

In [None]:
new_columns = numerical_features + preprocessor.named_transformers_[
    "onehotencoder"
].get_feature_names_out().tolist()           

In [None]:
# Credits to Varada K.

def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        forwarded unchanged to cross_validate (e.g. cv, scoring,
        return_train_score)

    Returns
    ----------
        pandas Series of strings formatted as "mean (+/- std)", one entry
        per key returned by cross_validate
    """

    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    mean_scores = scores.mean()
    std_scores = scores.std()

    # Pair the values directly instead of indexing by integer position: the
    # Series are labelled with strings, and positional [] lookups on such a
    # Series are deprecated in pandas.
    out_col = [
        f"{mean:0.3f} (+/- {std:0.3f})"
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [None]:
# Metrics computed for every model during cross-validation.
scoring_metrics = ["roc_auc", "f1", "recall", "precision"]

In [None]:
# Baseline pipeline: the shared preprocessor followed by a DummyClassifier
# with default settings.
dummy = DummyClassifier()
pipe_dummy = make_pipeline(preprocessor, dummy)

In [None]:
# Collects one formatted cross-validation summary per model name.
results = dict()

In [None]:
# 10-fold cross-validation of the baseline, with train scores included.
results["dummy"] = mean_std_cross_val_scores(
    pipe_dummy,
    X_train,
    y_train,
    scoring=scoring_metrics,
    return_train_score=True,
    cv=10,
)

In [None]:
# Tabulate the scores collected so far (one column per model).
pd.DataFrame(results)

__Models to test__
1) Logistic Regression
2) SVC
3) Decision Tree
4) Random Forest
5) XGBoost
6) LGBM
7) CatBoost
8) Naive Bayes

In [None]:
# Base models with default hyper-parameters

# models = {
#     "Decision Tree": make_pipeline(preprocessor, DecisionTreeClassifier()),
#     "SVC": make_pipeline(preprocessor, SVC()),
#     "Logistic Regression": make_pipeline(preprocessor, LogisticRegression()),
#     "Random Forest": make_pipeline(preprocessor, RandomForestClassifier()),
#     "XGBoost": make_pipeline(preprocessor, XGBClassifier(verbosity=0)),
#     "LGBM": make_pipeline(preprocessor, LGBMClassifier()),
#     "CatBoost": make_pipeline(preprocessor, CatBoostClassifier(verbose=0)),
#     "Naive Bayes": make_pipeline(preprocessor, GaussianNB())
# }

In [None]:
# for model_name, model in models.items():
    
#     results[model_name] = mean_std_cross_val_scores(
#         model, 
#         X_train,
#         y_train,
#         cv=10,
#         return_train_score=True,
#         scoring=scoring_metrics
#     )
    
#     print(model_name, "done!")
    

In [None]:
# pd.DataFrame(results)

__Dealing with class imbalance__

In [None]:
# Logistic Regression: search over class weighting and regularisation strength
param_lr = {
    "logisticregression__class_weight": ["balanced", None],
    "logisticregression__C": 10.0 ** np.arange(-2, 4),
}

pipe_lr = make_pipeline(preprocessor, LogisticRegression(max_iter=10000))

# The grid has 12 combinations but the search samples only its default 10,
# so the seed decides which candidates are tried. Fix it (same seed as the
# rest of the notebook) so reruns evaluate the same candidates.
random_search = RandomizedSearchCV(
    pipe_lr,
    param_lr,
    n_jobs=-1,
    return_train_score=True,
    scoring=scoring_metrics,
    refit="recall",
    random_state=123,
)

In [None]:
# Run the randomized search on the training data.
random_search.fit(X_train, y_train)

In [None]:
# Summarise the search: after the transpose there is one column per
# candidate, ordered from best to worst mean test recall.
(
    pd.DataFrame(random_search.cv_results_)
    .loc[
        :,
        [
            "mean_fit_time",
            "param_logisticregression__class_weight",
            "param_logisticregression__C",
            "mean_train_recall",
            "mean_test_recall",
            "mean_train_precision",
            "mean_test_precision",
            "mean_train_f1",
            "mean_test_f1",
        ],
    ]
    .sort_values(by="mean_test_recall", ascending=False)
    .set_index("mean_test_recall")
    .T
)

In [None]:
# Parameters of the best candidate under the refit metric (recall).
random_search.best_params_

In [None]:
# Fresh, unfitted logistic regression pipeline configured with the
# parameters selected by the randomized search.
pipe_lr_balanced = make_pipeline(
    preprocessor,
    LogisticRegression(
        max_iter=10000,
        C=random_search.best_params_["logisticregression__C"],
        class_weight=random_search.best_params_[
            "logisticregression__class_weight"
        ],
    ),
)

In [None]:
# Fit the tuned logistic regression pipeline on the full training set.
pipe_lr_balanced.fit(X_train, y_train)

In [None]:
# One row per transformed feature holding its logistic regression
# coefficient (column 0) and the coefficient's magnitude.
importances = pd.DataFrame(
    data=pipe_lr_balanced.named_steps["logisticregression"].coef_,
    columns=new_columns,
).transpose()

importances["abs_coef"] = importances[0].abs()

# Largest-magnitude coefficients first.
importances.sort_values("abs_coef", ascending=False)

In [None]:
# Gaussian Naive Bayes on top of the shared preprocessor, fitted on the
# training set.
pipe_nb = make_pipeline(preprocessor, GaussianNB())

pipe_nb.fit(X_train, y_train)

In [None]:
# Tree-based models, both seeded and configured to balance class weights.

pipe_catboost = make_pipeline(
    preprocessor,
    CatBoostClassifier(
        auto_class_weights="Balanced",
        random_state=123,
        verbose=0,
    ),
)

pipe_lgbm = make_pipeline(
    preprocessor,
    LGBMClassifier(class_weight="balanced", random_state=123),
)

In [None]:
# Candidate pipelines keyed by display name; reused for cross-validation
# and as the members of the stacking and voting ensembles.
models_bal= {
    "Logistic": pipe_lr_balanced,
    "Catboost": pipe_catboost,
    "LGBM": pipe_lgbm,
    "Naive Bayes": pipe_nb
}

In [None]:
# 10-fold cross-validation of every candidate, printing progress as each
# one finishes.
for model_name, pipeline in models_bal.items():
    results[model_name] = mean_std_cross_val_scores(
        pipeline,
        X_train,
        y_train,
        scoring=scoring_metrics,
        return_train_score=True,
        cv=10,
    )

    print(model_name, "done!")

In [None]:
# Cross-validation summary so far, one row per model.
pd.DataFrame(results).T

In [None]:
# Stacking Classifier: the candidate pipelines feed a logistic regression
# final estimator.

stacking_model = StackingClassifier(
    final_estimator=LogisticRegression(),
    estimators=list(models_bal.items()),
)

results["stacking"] = mean_std_cross_val_scores(
    stacking_model,
    X_train,
    y_train,
    scoring=scoring_metrics,
    return_train_score=True,
    cv=10,
)

In [None]:
# Voting Classifier: soft voting over the candidate pipelines.

pipe_averaging = VotingClassifier(
    estimators=list(models_bal.items()),
    voting="soft",
)

results["pipe_averaging"] = mean_std_cross_val_scores(
    pipe_averaging,
    X_train,
    y_train,
    scoring=scoring_metrics,
    return_train_score=True,
    cv=10,
)

In [None]:
# Cross-validation summary including the ensembles, one row per model.
pd.DataFrame(results).T

In [None]:
# Credits to Varada K.

def plot_roc_curve(model, X, y):
    """
    Plot the ROC curve of a fitted classifier and mark the 0.5 threshold.

    Parameters
    ----------
    model :
        fitted classifier exposing predict_proba
    X : numpy array or pandas DataFrame
        features to score
    y :
        true labels matching X

    Returns
    ----------
        None; draws on the current matplotlib figure
    """
    # Score the data that was passed in, not the notebook-level training
    # split, so the function also works for validation or test data.
    fpr, tpr, thresholds = roc_curve(y, model.predict_proba(X)[:, 1])
    plt.plot(fpr, tpr, label="ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR (recall)")

    # Index of the curve point whose threshold is closest to 0.5.
    default_threshold = np.argmin(np.abs(thresholds - 0.5))

    plt.plot(
        fpr[default_threshold],
        tpr[default_threshold],
        "or",
        markersize=10,
        label="threshold 0.5",
    )
    plt.legend(loc="best")

In [None]:
# Credits to Varada K.

def plot_PR_curve(
    precision,
    recall,
    close_default,
    label="PR curve",
    marker_colour="r",
    marker_label="Default threshold",
):
    """
    Plot recall against precision and highlight one point on the curve.

    Parameters
    ----------
    precision, recall :
        arrays of precision and recall values, indexable by close_default
    close_default :
        index of the point to highlight
    label :
        legend label for the curve
    marker_colour :
        colour of the highlighted point
    marker_label :
        legend label for the highlighted point

    Returns
    ----------
        None; draws on the current matplotlib figure
    """
    # The curve itself: precision on the x-axis, recall on the y-axis.
    plt.plot(precision, recall, label=label)

    # The highlighted operating point.
    marker_x = precision[close_default]
    marker_y = recall[close_default]
    plt.plot(
        marker_x,
        marker_y,
        "o",
        c=marker_colour,
        label=marker_label,
        markersize=12,
    )

    plt.xlabel("Precision")
    plt.ylabel("Recall")
    plt.legend(loc="best")

In [None]:
# Fit the soft-voting ensemble on the full training set.
pipe_averaging.fit(X_train, y_train)

In [None]:
# ROC curve of the soft-voting ensemble on the training data.
plot_roc_curve(pipe_averaging, X_train, y_train)

In [None]:
# Precision/recall trade-off of the voting ensemble on the training data.
precision_avg, recall_avg, thresholds_avg = precision_recall_curve(
    y_train,
    pipe_averaging.predict_proba(X_train)[:, 1],
)

# Position of the threshold nearest to 0.5.
close_default_avg = np.abs(thresholds_avg - 0.5).argmin()

plot_PR_curve(precision_avg, recall_avg, close_default_avg)

In [None]:
# Refit the tuned logistic regression pipeline on the training set.
pipe_lr_balanced.fit(X_train, y_train)

In [None]:
# ROC curve of the logistic regression pipeline on the training data.
plot_roc_curve(pipe_lr_balanced, X_train, y_train)

In [None]:
# Precision/recall trade-off of the logistic regression pipeline on the
# training data.
precision_lr, recall_lr, thresholds_lr = precision_recall_curve(
    y_train,
    pipe_lr_balanced.predict_proba(X_train)[:, 1],
)

# Position of the threshold nearest to 0.5.
close_default_lr = np.abs(thresholds_lr - 0.5).argmin()

plot_PR_curve(precision_lr, recall_lr, close_default_lr)

In [None]:
def get_scores(model, X, y, threshold):
    """
    Score a classifier at a custom probability cut-off.

    Parameters
    ----------
    model :
        fitted classifier exposing predict_proba
    X : numpy array or pandas DataFrame
        features to score
    y :
        true labels matching X
    threshold :
        rows whose second-column predict_proba value is strictly greater
        than this are predicted positive

    Returns
    ----------
        dict with keys "Precision", "Recall" and "f1"
    """
    positive_proba = model.predict_proba(X)[:, 1]
    predicted = positive_proba > threshold

    return {
        "Precision": precision_score(y, predicted),
        "Recall": recall_score(y, predicted),
        "f1": f1_score(y, predicted),
    }

__Best threshold after trial and error: 0.63__

In [None]:
# Training-set scores of the soft-voting ensemble at the 0.63 cut-off.
get_scores(pipe_averaging, X_train, y_train, 0.63)

In [None]:
# Display the test features.
X_test

In [None]:
# Test-set scores of the soft-voting ensemble at the same 0.63 cut-off.
get_scores(pipe_averaging, X_test, y_test, 0.63)