# CreditCardApproval MachineLearning Diana Max

## Setup notebook & fetch data

In [None]:
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
#import xgboost as xgb

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer, TargetEncoder, label_binarize
from sklearn.metrics import (
    auc, 
    accuracy_score, 
    confusion_matrix, 
    mean_squared_error, 
    make_scorer
)

from sklearn.model_selection import (
    cross_val_score,
    GridSearchCV, 
    KFold, 
    RandomizedSearchCV, 
    train_test_split, 
    StratifiedKFold, 
    cross_validate,
    RepeatedStratifiedKFold
)

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
from sklearn.metrics import (
    accuracy_score,
    roc_curve,
    roc_auc_score,
    precision_recall_curve,
    auc,
    confusion_matrix,
    classification_report,
    RocCurveDisplay,
    PrecisionRecallDisplay,
)

In [None]:
# Download dataset no. 27 from the UCI repository.
credit_approval = fetch_ucirepo(id=27)

# Expose the feature table, the target table and the combined original table
# under short names used throughout the notebook.
X, y, df = (
    credit_approval.data.features,
    credit_approval.data.targets,
    credit_approval.data.original,
)

## preprocessing

In [None]:
"""
Maybe add some more advanced techniques from here later: https://scikit-learn.org/stable/modules/preprocessing.html

"""

### feature engineering: mixed numerical categorical feature space (this is old code, use fully numerical feature space for now)

In [None]:
# Legacy preprocessing for a mixed numerical/categorical feature space.
# Kept for reference only: the result is not assigned, so the cell merely
# displays the transformer.
#
# Every entry is a (transformer, columns) pair -- the only tuple shape that
# make_column_transformer accepts (step names are generated automatically).
make_column_transformer(
    # A1: fill gaps with the mode, then one-hot encode.
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(drop='first')
        ),
        ['A1']
    ),
    # A2: median imputation followed by min-max scaling.
    (
        make_pipeline(
            SimpleImputer(strategy='median'),
            MinMaxScaler()
        ),
        ['A2']
    ),
    # A4: collapse every observed level except 'u' into 'non-u'.
    # Missing entries are left untouched on purpose so that the imputer below
    # still sees them (NaN != 'u' is True, so an unguarded mask would silently
    # relabel them as 'non-u').
    (
        make_pipeline(
            FunctionTransformer(lambda col: col.mask(col.notna() & (col != 'u'), 'non-u')),
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(drop='first')
        ),
        ['A4']
    ),
    # A7: keep 'v' and 'h', pool all remaining levels into 'other'.
    (
        make_pipeline(
            FunctionTransformer(lambda col: col.replace({
                'bb': 'other', 'ff': 'other', 'j': 'other', 'z': 'other',
                'dd': 'other', 'n': 'other', 'o': 'other',
            })),
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(drop='first')
        ),
        ['A7']
    ),
    # A13: keep 'g', pool 's' and 'p' into 'non-g'.
    (
        make_pipeline(
            FunctionTransformer(lambda col: col.replace({'s': 'non-g', 'p': 'non-g'})),
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(drop='first')
        ),
        ['A13']
    ),
    # A11, A14, A15: log1p transform, median imputation, min-max scaling.
    (
        make_pipeline(
            FunctionTransformer(np.log1p),
            SimpleImputer(strategy='median'),
            MinMaxScaler()
        ),
        ['A11','A14','A15']
    ),
    # continuous default
    (
        make_pipeline(
            MinMaxScaler(),
        ),
        ['A3','A8']
    ),
    # categorical default
    (
        make_pipeline(
            OneHotEncoder(drop='first'),
        ),
        ['A9','A10', 'A12']
    ),

    # remove: A5, A6
    remainder='drop'
)

### feature engineering: fully numerical feature space

#### variant 1: include A7

In [None]:
# Variant 1 of the fully numerical feature space: A7 is kept and
# target-encoded together with A6.
#
# Binary indicator columns (A4, A5, A13) are built with a plain equality test
# instead of label_binarize(col, classes=[...]): label_binarize infers the
# target type from the batch it receives and returns an all-zero column when
# that batch holds at most two distinct values (e.g. a single row at
# prediction time), regardless of the actual values.
column_tweaker_include_a7 = make_column_transformer(
    # Low-cardinality categoricals: impute the mode, then one-hot encode.
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(sparse_output=False, drop='first')
        ),
        ['A1','A9','A10','A12']
    ),
    # A4 -> 1 where the (imputed) value is 'u', else 0.
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            FunctionTransformer(lambda col: (np.asarray(col) == 'u').astype(int))
        ),
        ['A4']
    ),
    # A5 -> 1 where the (imputed) value is 'g', else 0.
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            FunctionTransformer(lambda col: (np.asarray(col) == 'g').astype(int))
        ),
        ['A5']
    ),
    # High-cardinality categoricals: target encoding. The seed makes the
    # encoder's internal cross-fitting shuffle reproducible between runs.
    (
        make_pipeline(
            TargetEncoder(random_state=42),
            SimpleImputer(strategy='median')
        ),
        ['A6', 'A7']
    ),
    # A13 -> 1 where the (imputed) value is 'g', else 0.
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            FunctionTransformer(lambda col: (np.asarray(col) == 'g').astype(int))
        ),
        ['A13']
    ),
    # Continuous columns: log1p transform, then median imputation.
    (
        make_pipeline(
            FunctionTransformer(np.log1p),
            SimpleImputer(strategy='median')
        ),
        ['A2','A3','A8','A11','A14','A15']
    ),

    # any column not listed above is discarded
    remainder='drop'
)

#### variant 2: exclude A7

In [None]:
# Variant 2 of the fully numerical feature space: A7 is not listed in any
# entry and is therefore discarded by remainder='drop'.
#
# Binary indicator columns (A4, A5, A13) are built with a plain equality test
# instead of label_binarize(col, classes=[...]): label_binarize infers the
# target type from the batch it receives and returns an all-zero column when
# that batch holds at most two distinct values (e.g. a single row at
# prediction time), regardless of the actual values.
column_tweaker_exclude_a7 = make_column_transformer(
    # Low-cardinality categoricals: impute the mode, then one-hot encode.
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(sparse_output=False, drop='first')
        ),
        ['A1','A9','A10','A12']
    ),
    # A4 -> 1 where the (imputed) value is 'u', else 0.
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            FunctionTransformer(lambda col: (np.asarray(col) == 'u').astype(int))
        ),
        ['A4']
    ),
    # A5 -> 1 where the (imputed) value is 'g', else 0.
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            FunctionTransformer(lambda col: (np.asarray(col) == 'g').astype(int))
        ),
        ['A5']
    ),
    # A6: target encoding. The seed makes the encoder's internal
    # cross-fitting shuffle reproducible between runs.
    (
        make_pipeline(
            TargetEncoder(random_state=42),
            SimpleImputer(strategy='median')
        ),
        ['A6']
    ),
    # A13 -> 1 where the (imputed) value is 'g', else 0.
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            FunctionTransformer(lambda col: (np.asarray(col) == 'g').astype(int))
        ),
        ['A13']
    ),
    # Continuous columns: log1p transform, then median imputation.
    (
        make_pipeline(
            FunctionTransformer(np.log1p),
            SimpleImputer(strategy='median')
        ),
        ['A2','A3','A8','A11','A14','A15']
    ),

    # any column not listed above (here: A7) is discarded
    remainder='drop'
)

### feature scaling

In [None]:
# Scaler applied to the complete engineered feature matrix (default settings).
column_scaler = MinMaxScaler()

### PCA

In [None]:
# PCA with a fractional n_components: the number of components is chosen from
# the requested share (0.99) of explained variance; this mode needs the
# 'full' SVD solver.
column_pca = PCA(n_components=.99, svd_solver='full')

### put together pipelines

#### feature engineering pipeline 1

In [None]:
# Preprocessing chain used by the rest of the notebook:
# feature engineering without A7 -> min-max scaling -> PCA.
preprocessing_pipe = make_pipeline(
    column_tweaker_exclude_a7,
    column_scaler,
    column_pca
)
# Last expression of the cell: display the assembled pipeline.
preprocessing_pipe

#### feature engineering pipeline 2

In [None]:
"""

In which different formats do we need the data for different classifiers ?

"""

## train test validation split

In [None]:
# Fit the whole preprocessing pipeline (including the target encoder, which
# needs y) on the complete dataset and transform it in one go.
# NOTE(review): this runs before the train/test/validation split below, so
# everything the pipeline learns here also comes from the held-out samples.
# Consider fitting on the training portion only.
X_preprocessed = preprocessing_pipe.fit_transform(X=X, y=y.values.ravel())

In [None]:
# Hold out 10 % of the samples for later testing/validation.
# The split is seeded (like the other splitters in this notebook) so reruns
# give the same partition, and stratified so both parts keep the class ratio.
X_train, X_test_validate, y_train, y_test_validate = train_test_split(
    X_preprocessed,
    y.values.ravel(),
    test_size=0.1,
    stratify=y.values.ravel(),
    random_state=42,
)
X_train.shape, y_train.shape

In [None]:
# Cut the held-out portion in half: one part for testing, one for validation.
# Seeded and stratified for a reproducible, class-balanced partition.
X_test, X_validate, y_test, y_validate = train_test_split(
    X_test_validate,
    y_test_validate,
    test_size=.5,
    stratify=y_test_validate,
    random_state=42,
)
(X_test.shape, y_test.shape), (X_validate.shape, y_validate.shape)

In [None]:
# Stack the train and test portions (features and labels separately) for the
# models that are tuned with cross-validation and scored on the validation set.
X_train_test = np.concatenate((X_train, X_test))
y_train_test = np.concatenate((y_train, y_test))
X_train_test.shape, y_train_test.shape

## train & optimise different models

### Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

#### Linear Discriminant Analysis

In [None]:
# Class frequencies of the target (context for the prior grids below).
y.value_counts()

In [None]:
# Tune the class priors of a linear discriminant classifier with a
# 10-fold cross-validated grid search on the combined train+test portion.
param_grid = dict(
    priors=[[.3, .7], [.4, .6], [.5, .5], [.6, .4], [.7, .3]],
)

lda = LinearDiscriminantAnalysis()
lda_gs = GridSearchCV(lda, param_grid, cv=10)
lda_gs.fit(X_train_test, y_train_test)

# Score of the refitted best candidate on the untouched validation portion.
lda_gs.best_estimator_.score(X_validate, y_validate)

#### Quadratic Discriminant Analysis

In [None]:
# Same prior search as for the linear variant, now with a quadratic
# discriminant classifier (10-fold cross-validated grid search).
param_grid = dict(
    priors=[[.3, .7], [.4, .6], [.5, .5], [.6, .4], [.7, .3]],
)

qda = QuadraticDiscriminantAnalysis()
qda_gs = GridSearchCV(qda, param_grid, cv=10)
qda_gs.fit(X_train_test, y_train_test)

# Score of the refitted best candidate on the untouched validation portion.
qda_gs.best_estimator_.score(X_validate, y_validate)

### Gaussian Process Classifier

In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier

In [None]:
# Gaussian process classifier with default settings.
gpc = GaussianProcessClassifier()

In [None]:
# 10-fold cross-validation of the classifier on the full preprocessed dataset.
gpc_cv_result = cross_val_score(gpc, X_preprocessed, y.values.ravel(), cv=10)

In [None]:
# Average score over the folds.
gpc_cv_result.mean()

### LogisticRegression (Linear Model)

In [None]:
# Quick look at the test labels as a flat array.
y_test.ravel()


In [None]:
# Baseline: default logistic regression, accuracy estimated with a shuffled,
# seeded, stratified 7-fold cross-validation on the full preprocessed dataset.
res = cross_validate(
    LogisticRegression(),
    X_preprocessed,
    y.values.ravel(),
    scoring="accuracy",
    cv=StratifiedKFold(n_splits=7, shuffle=True, random_state=42),
)

In [None]:
# Mean accuracy over the seven folds.
res['test_score'].mean()

In [None]:
# Baseline logistic regression fitted on the training portion only.
lr = LogisticRegression().fit(X_train, y_train)

In [None]:
# Accuracy of the baseline model on the test portion.
accuracy_score(y_test, lr.predict(X_test))

In [None]:
# Per-class precision/recall/F1 of the baseline model on the test portion.
print(classification_report(y_test, lr.predict(X_test)))

In [None]:
# ROC and precision-recall curves of the baseline logistic regression, side
# by side. They are drawn on the held-out test portion -- the same data used
# for the accuracy and the classification report -- rather than on the full
# dataset, which contains the samples `lr` was trained on and would therefore
# look overly optimistic.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
RocCurveDisplay.from_estimator(lr, X_test, y_test, ax=ax1)
ax1.set_title("ROC curve")
PrecisionRecallDisplay.from_estimator(lr, X_test, y_test, ax=ax2)
ax2.set_title("Precision-Recall curve")

#### GridSearchCV

In [None]:
import warnings

# Silence all warnings from here on to keep the grid-search output readable.
# NOTE(review): the filter is global and unconditional, so it also hides
# anything the solvers report during the searches below.
warnings.filterwarnings("ignore")

In [None]:
# Candidates for the logistic regression: with/without L2 penalty, three
# values of C (2 x 3 = 6 combinations).
param_grid = [{'penalty': ['l2', None], 'C': [1., 2., 3.]}]

In [None]:
# Track two metrics per candidate; the keys are the names used in the results
# and in `refit=`.
scoring = {"AUC": "roc_auc", "Accuracy": make_scorer(accuracy_score)}

In [None]:
# Logistic regression configured for the grid search: saga solver, fixed seed.
lg = LogisticRegression(random_state=42,solver='saga')

In [None]:
# Stratified 10-fold splitter (no shuffling) for the grid search.
cv = StratifiedKFold(n_splits=10)

In [None]:
# Grid search over `param_grid` using the estimator configured for this
# purpose (`lg`: saga solver, fixed seed) rather than the already fitted
# baseline model `lr`. Both metrics in `scoring` are recorded; the final
# refit uses the candidate with the best AUC.
search = GridSearchCV(estimator=lg, scoring=scoring, param_grid=param_grid, cv=cv, refit='AUC')

In [None]:
# Run the search on the full preprocessed dataset.
# NOTE(review): this includes the samples later used as the test portion
# when `lr_best` is scored.
search.fit(X_preprocessed, y.values.ravel())

In [None]:
# Names of the columns available in the search results.
search.cv_results_.keys()

In [None]:
# Raw cross-validation results of the grid search.
search.cv_results_

In [None]:
# Winning hyper-parameter combination (selected by AUC, see `refit`).
search.best_params_ 

In [None]:
# Fresh, unfitted copy of the estimator that was searched over, with the
# winning hyper-parameters applied. Cloning keeps every other setting of that
# estimator, and unpacking `best_params_` keeps this line valid when the
# parameter grid gains or loses entries.
lr_best = clone(search.estimator).set_params(**search.best_params_)

In [None]:
## fit with the best parameters, then report accuracy on the test portion
lr_best.fit(X_train, y_train)
accuracy_score(y_test, lr_best.predict(X_test))

In [None]:
# Keep a handle on the logistic-regression search results; `search` is
# reassigned by the random-forest section below.
results = search.cv_results_

### Random Forest

In [None]:
# Baseline random forest (depth-limited, seeded) trained on the training
# portion; the fitted estimator is shown as the cell output.
clf = RandomForestClassifier(random_state=0, max_depth=6).fit(X_train, y_train)
clf

In [None]:
# Score of the baseline forest on the validation portion.
clf.score(X_validate, y_validate)

#### GridSearchCV

In [None]:
# Small random-forest grid: 3 x 2 x 2 = 12 candidates.
param_grid = [{'n_estimators': [50, 100, 200], 'min_samples_split': [3, 5], 'min_samples_leaf': [3, 6]}]

# Larger grid kept for reference (not used).
# param_grid = {
#     'n_estimators': [100, 200, 300, 400, 500],
#     'max_depth': [None, 10, 20, 30, 40, 50],
#     'min_samples_split': [2, 5, 10, 15, 20],
#     'min_samples_leaf': [1, 2, 4, 6, 8, 10],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'bootstrap': [True, False],
#     'criterion': ['gini', 'entropy']
# }

In [None]:
# Track two metrics per candidate; the keys are the names used in the results
# and in `refit=`.
scoring = {"AUC": "roc_auc", "Accuracy": make_scorer(accuracy_score)}

In [None]:
# Seeded random forest used as the base estimator of the grid search.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Stratified 10-fold splitter (no shuffling) for the grid search.
cv = StratifiedKFold(n_splits=10)

In [None]:
# Grid search for the random forest: both metrics are recorded per candidate,
# the final refit picks the best AUC; progress is printed (verbose=1).
search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring=scoring,
    refit='AUC',
    cv=cv,
    verbose=1,
)

In [None]:
# Run the random-forest search on the training portion only.
search.fit(X_train, y_train)

In [None]:
# Raw cross-validation results of the random-forest search.
search.cv_results_

In [None]:
# Winning hyper-parameter combination (selected by AUC, see `refit`).
search.best_params_ 

In [None]:
# Fresh, unfitted copy of the estimator that was searched over, with the
# winning hyper-parameters applied. Cloning keeps the searched estimator's
# other settings -- notably its random seed, so the refit is reproducible and
# matches what the search evaluated -- and unpacking `best_params_` keeps
# this line valid when the parameter grid gains or loses entries.
rf_best = clone(search.estimator).set_params(**search.best_params_)

In [None]:
## fit with the best parameters, then report accuracy on the test portion
rf_best.fit(X_train, y_train)
accuracy_score(y_test, rf_best.predict(X_test))

In [None]:
# Max was at the hairdresser's

### SupportVectorClassifier

### Adaboost

### Ensemble method

In [None]:
"""
Check whether all these methods misclassify the same samples or different ones. If the latter is the case, this would motivate using an ensemble method to balance the strengths and weaknesses of the different classifiers.

Maybe build an ensemble method that combines a (gradient-boosted) decision tree for the categorical variables with something like an SVM for the continuous data.

This would obviate the need for target encoding of A6 and A7 and possibly handle the many binary variables in the dataset better.
""";

## (Sequential Feature Selection)

In [None]:
"""
Further reduce dimensionality by SFS. This may give different results for different classifiers.
"""

## model comparison