# CreditCardApproval MachineLearning Diana Max

## Setup notebook & fetch data

In [1]:
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
import random

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer, TargetEncoder, label_binarize
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    precision_recall_curve,
    confusion_matrix,
    classification_report,
    make_scorer,
    RocCurveDisplay,
    PrecisionRecallDisplay,
)

from sklearn.model_selection import (
    cross_val_score,
    GridSearchCV, 
    KFold, 
    RandomizedSearchCV, 
    train_test_split, 
    StratifiedKFold, 
    cross_validate,
    RepeatedStratifiedKFold
)

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

In [40]:
credit_approval = fetch_ucirepo(id=27)

X = credit_approval.data.features
y = credit_approval.data.targets
df = credit_approval.data.original

In [41]:
y

Unnamed: 0,A16
0,+
1,+
2,+
3,+
4,+
...,...
685,-
686,-
687,-
688,-


## preprocessing

In [4]:
"""
Maybe add some more advanced techniques from here later: https://scikit-learn.org/stable/modules/preprocessing.html

"""

'\nMaybe add some more advanced techniques from here later: https://scikit-learn.org/stable/modules/preprocessing.html\n\n'

### feature engineering: mixed numerical categorical feature space (this is old code, use fully numerical feature space for now)

In [5]:
make_column_transformer(
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(drop='first')
        ),
        ['A1']
        
    ),
    (
        make_pipeline(
            SimpleImputer(strategy='median'),
            MinMaxScaler()
        ),
        ['A2']
    ),
    (
        make_pipeline(
            FunctionTransformer(lambda col: col.mask(col != 'u', 'non-u')),
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(drop='first')
           ),
        ['A4']
    ),
    (
        make_pipeline(
            FunctionTransformer(lambda col: col.replace({'v':'v', 'h':'h','bb':'other','ff':'other','j':'other','z':'other','dd':'other','n':'other','o':'other'})),
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(drop='first')
        ),
        ['A7']
    ),
    (
        make_pipeline(
            FunctionTransformer(lambda col: col.replace({'g':'g', 's':'non-g','p':'non-g'})),
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(drop='first')
        ),
        ['A13']
    ),
    (
        'test',
        make_pipeline(
            FunctionTransformer(np.log1p),
            SimpleImputer(strategy='median'),
            MinMaxScaler()
        ),
        ['A11','A14','A15']
    ),
    # continuous default
    (
        make_pipeline(
            MinMaxScaler(),
        ),
        ['A3','A8']
    ),
    # categorical default
    (
        make_pipeline(
            OneHotEncoder(drop='first'),
        ),
        ['A9','A10', 'A12']
    ),
    
    # remove: A5, A6
    remainder='drop'
)

### feature engineering: fully numerical feature space

#### variant 1: include A7

In [6]:
column_tweaker_include_a7 = make_column_transformer(
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(sparse_output=False, drop='first')
        ),
        ['A1','A9','A10','A12']
    ),
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            FunctionTransformer(lambda col: label_binarize(col, classes=['u']))
        ),
        ['A4']
    ),
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            FunctionTransformer(lambda col: label_binarize(col, classes=['g']))
        ),
        ['A5']
    ),
    (
        make_pipeline(
            TargetEncoder(),
            SimpleImputer(strategy='median')
        ),
        ['A6', 'A7']
    ),
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            FunctionTransformer(lambda col: label_binarize(col, classes=['g']))
        ),
        ['A13']
    ),
    (
        make_pipeline(
            FunctionTransformer(np.log1p),
            SimpleImputer(strategy='median')
        ),
        ['A2','A3','A8','A11','A14','A15']
    ),

    # leave everything else untouched
    remainder='drop'
)

#### variant 2: exclude A7

In [7]:
column_tweaker_exclude_a7 = make_column_transformer(
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(sparse_output=False, drop='first')
        ),
        ['A1','A9','A10','A12']
    ),
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            FunctionTransformer(lambda col: label_binarize(col, classes=['u']))
        ),
        ['A4']
    ),
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            FunctionTransformer(lambda col: label_binarize(col, classes=['g']))
        ),
        ['A5']
    ),
    (
        make_pipeline(
            TargetEncoder(),
            SimpleImputer(strategy='median')
        ),
        ['A6']
    ),
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            FunctionTransformer(lambda col: label_binarize(col, classes=['g']))
        ),
        ['A13']
    ),
    (
        make_pipeline(
            FunctionTransformer(np.log1p),
            SimpleImputer(strategy='median')
        ),
        ['A2','A3','A8','A11','A14','A15']
    ),

    # leave everything else untouched
    remainder='drop'
)

### feature scaling

In [8]:
column_scaler = MinMaxScaler()

### PCA

In [9]:
column_pca = PCA(n_components=.99, svd_solver='full')

### put together pipelines

#### feature engineering pipeline 1

In [10]:
preprocessing_pipe = make_pipeline(
    column_tweaker_exclude_a7,
    column_scaler,
    column_pca
)
preprocessing_pipe

#### feature engineering pipeline 2

In [11]:
"""

In which different formats do we need the data for different classifiers ?

"""

'\n\nIn which different formats do we need the data for different classifiers ?\n\n'

## train test validation split

In [12]:
# Set random seed for reproducibility
seed = 42
np.random.seed(seed)
random.seed(seed)

In [13]:
X_preprocessed = preprocessing_pipe.fit_transform(X=X, y=y.values.ravel())

In [14]:
X_train, X_test_validate, y_train, y_test_validate = train_test_split(X_preprocessed, y.values.ravel(), test_size=0.1, random_state=seed)
X_train.shape, y_train.shape

((621, 12), (621,))

In [15]:
X_test, X_validate, y_test, y_validate = train_test_split(X_test_validate, y_test_validate, test_size=.5)
(X_test.shape, y_test.shape), (X_validate.shape, y_validate.shape)

(((34, 12), (34,)), ((35, 12), (35,)))

In [16]:
X_train_test, y_train_test = np.concatenate((X_train, X_test)), np.concatenate((y_train, y_test)) 

In [17]:
np.concatenate((X_train))

array([ 0.26601904, -0.55324516, -0.41293061, ...,  0.4801647 ,
       -0.10331035, -0.0011813 ])

## train & optimise different models

On what to optimize ?

The task is o predict, if, given the different parameters, access to a credit card is denied or granted.

- The worst case would be to give access to a position that is obvious fraud (False Positive). Cost: The owner looses money.
- Rejecting access to a legit position (False Negative) would be inconvenient, since someone wouldn't be able to draw his money, but we consider it less of a problem than a False Positive.

- Obviously, a True Positive would be giving someone his money while a True Negative would be denying fraud.



In summary, we primarily optimize for the False Positive rate.

Additional metrics:
- The dataset is balanced therefore classificatino accuracy is a meaningfull metrix as well.

How to compare classifiers ?
- Plot learning curves.
- Compare missclassified examples.
- Compare robustness in different CV-splits.

Next, we compare the performance of a variety of common ML classifiers. Since we are relativey naive to which classifiers could work, we try a diverse set.

#### Linear Discriminant Analysis

Parameters:
'priors' : The default value estimates the class proportions from the training set. Since our dataset with n<1000 is relatively small, class proportions might be slightly skewed leading to a suboptimal estimation from the training data .

In [None]:
lda = LinearDiscriminantAnalysis()

param_grid = {
    'priors':[[.25,.75], [.3,.7], [.35,.65], [.4,.6],
              [.45,.55], [.5,.5], [.55,.45], [.6,.4],
              [.65,.35], [.7,.3], [.75,.25], ]
    }


lda_gscv = GridSearchCV(estimator=lda, param_grid=param_grid, cv=10)

lda_gscv.fit(X_train_test, y_train_test)

lda_gscv.best_estimator_.score(X_validate, y_validate)

#### Quadratic Discriminant Analysis

In [None]:
qda = QuadraticDiscriminantAnalysis()

param_grid = {
    'priors':[[.25,.75], [.3,.7], [.35,.65], [.4,.6],
              [.45,.55], [.5,.5], [.55,.45], [.6,.4],
              [.65,.35], [.7,.3], [.75,.25], ]
    }

qda_gscv = GridSearchCV(estimator=qda, param_grid=param_grid, cv=10)

qda_gscv.fit(X_train_test, y_train_test)

qda_gscv.best_estimator_.score(X_validate, y_validate)

#### Compare LDA and QDA

In [None]:
lda_gscv.best_estimator_.predict(X_validate) == qda_gscv.best_estimator_.predict(X_validate)

### Gaussian Process Classifier

#### Baseline (no GridSearchCV)

In [52]:
gpc = GaussianProcessClassifier()

scoring = {"Accuracy": make_scorer(accuracy_score), "Precision": make_scorer(precision_score, pos_label = '+')}
gpc_cv_result = cross_validate(gpc, X_preprocessed, y.values.ravel(), cv=10, scoring=scoring)
gpc_cv_result['test_Accuracy'].mean(), gpc_cv_result['test_Precision'].mean()

(0.8405797101449275, 0.8664805214633129)

### Random Forest

#### Baseline

In [18]:
clf = RandomForestClassifier(max_depth=6, random_state=seed)

clf.fit(X_train, y_train)

In [19]:
clf.score(X_validate, y_validate)

0.8571428571428571

#### GridSearchCV

In [20]:
#param_grid = [{'n_estimators': [50, 100, 200], 'min_samples_split': [3, 5], 'min_samples_leaf': [3, 6]}]

param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

In [42]:
scoring = {"Accuracy": make_scorer(accuracy_score), "Precision": make_scorer(precision_score, pos_label = '+')}

In [43]:
rf = RandomForestClassifier(random_state=seed)

In [44]:
cv = StratifiedKFold(n_splits=10)

In [45]:
search_rf = GridSearchCV(
    estimator=rf,
    scoring=scoring,
    param_grid=param_grid, 
    verbose=1,
    cv=cv,
    refit='Accuracy',
    n_jobs=-1)

In [46]:
search_rf.fit(X_train_test, y_train_test)

Fitting 10 folds for each of 10800 candidates, totalling 108000 fits


KeyboardInterrupt: 

In [None]:
search_rf.cv_results_

In [None]:
search_rf.best_params_ , search_rf.best_estimator_

In [None]:
# Auswerten auf den validierungsdaten
accuracy_score(y_validate, search_rf.best_estimator_.predict(X_validate))

### KNeighborsClassifier

#### Baseline

In [None]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn.score(X_validate, y_validate)

#### GridSearch

In [None]:
param_grid = [{'n_neighbors':[5, 10, 15], 'weights':['uniform', 'distance'], 'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'], 'p':[1, 2]}]


In [None]:
scoring = {"Accuracy": make_scorer(accuracy_score), "Precision": make_scorer(precision_score, pos_label = '+')}

In [None]:
neighbor = KNeighborsClassifier()
cv = StratifiedKFold(n_splits=10)

In [None]:
search_knn = GridSearchCV(
    estimator=neighbor,
    scoring=scoring,
    param_grid=param_grid, 
    verbose=1,
    cv=cv,
    refit='AUC',
    n_jobs=-1)

In [None]:
search_knn.fit(X_train_test, y_train_test)
search_knn.cv_results_

In [None]:
search_knn.best_params_ , search_knn.best_estimator_

In [None]:
# Auswerten auf den validierungsdaten
# accuracy_score(y_validate, search_knn.best_estimator_.predict(X_validate))

### 

### Adaboost
A ensemble method

#### Baseline

In [None]:
ada = AdaBoostClassifier(LinearDiscriminantAnalysis())
ada_cv_result = cross_val_score(ada, X_preprocessed, y.values.ravel(), cv=10)

#### GridSearchCV

In [None]:
"""
Check if all these methods always categorize the same, or different samples wrong. If the later is the case, this would motivate using a ensemble method to balance strengths and weaknesses of different classifiers.

Maybe building some ensemble method that combines a (gradient boosted) dewcicion tree for the categorical variables with something like SVM for the continuous data.

This would obviate the need for target encoding of A6 and A7 and poosible better handle the many binary variables in the dataset. 
""";

## (Sequential Feature Selection)

In [None]:
"""
Further reduce dimensionality by SFS. This may give different results for different classifiers.

Compare results of SFS to results of EDA
"""

## model comparison

In [None]:

print('Best RandomForestClassifier Accuracy:', accuracy_score(y_validate, search_rf.best_estimator_.predict(X_validate)))
print('Best KNeighborsClassifier Accuracy:', accuracy_score(y_validate, search_knn.best_estimator_.predict(X_validate)))

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10, 4))

RocCurveDisplay.from_estimator(search_rf.best_estimator_, X_preprocessed, y, ax=ax1)
RocCurveDisplay.from_estimator(search_knn.best_estimator_, X_preprocessed, y, ax=ax1)
ax1.set_title("ROC curve")

PrecisionRecallDisplay.from_estimator(search_rf.best_estimator_, X_preprocessed, y, ax=ax2)
PrecisionRecallDisplay.from_estimator(search_knn.best_estimator_, X_preprocessed, y, ax=ax2)
ax2.set_title("Precision-Recall curve")



In [None]:
y_num = np.where(y_validate == '+', 1, 0)

In [None]:
#DetCurveDisplay.from_predictions(y_validate, rf_best.predict)
# Get prediction probabilities
probabilities = rf_best.predict_proba(X_validate)
print("Prediction Probabilities:\n", probabilities)

# Get class predictions (optional)
predictions = rf_best.predict(X_validate)
print("Class Predictions:\n", predictions)

DetCurveDisplay.from_predictions(y_num, predictions)

In [None]:
# Model evauation

# Since classes are balanced, accuracy is the correct evaluation metric.

# Plot ROC for different hyperparameters

# above: plot full results of grid search to showcase effects of different hyperparameters