In [1]:
# Our code imports
from Models import Model
from Metrics import accuracy, fmeasure, roc_auc, confusion_matrix
from Processing import Processor
from Pipelines import ModelGridBuilder, AnalysisPipeline

# Third-party imports
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

import altair as alt

In [2]:
# Load the cleaned training data, then separate the 'TARGET' label column
# from the remaining feature columns.
credit = pd.read_csv('cleaned_training_data.csv')
credit_Y = credit['TARGET']
credit_X = credit.drop(columns=['TARGET'])

In [3]:
def plot_parameter_tuning(dev_set_analysis):
    """Display a line chart of the dev-set score against the tuned parameter.

    Args:
        dev_set_analysis: sequence of entries where ``entry[0][1]`` is the
            value plotted on the 'lambda' axis and ``entry[1]`` is the value
            plotted on the 'ROC-AUC' axis.

    Returns:
        None. The Altair chart is rendered via ``display``.
    """
    lambda_values = []
    auc_scores = []
    for entry in dev_set_analysis:
        lambda_values.append(entry[0][1])
        auc_scores.append(entry[1])

    tuning_frame = pd.DataFrame({
        'lambda': lambda_values,
        'ROC-AUC': auc_scores
    })

    tuning_chart = alt.Chart(tuning_frame).mark_line().encode(
        x='lambda',
        y='ROC-AUC'
    )
    display(tuning_chart)

# Random Split

## Tune Logistic Regression w/ Lasso lambda

In [None]:
# 26 evenly spaced candidate values on [0, 100] (step 4).
lambdas = np.linspace(0, 100, 26)

# Build the 'Logistic Lasso' candidate models from the grid values.
logisticSearchBuilder = ModelGridBuilder('Logistic Lasso', parameters=lambdas)
models = logisticSearchBuilder.get_models()

# Random split with PCA enabled; roc_auc is the metric handed to the pipeline.
# train_prop/dev_prop sum to 0.9 -- presumably the remainder is the test set; confirm.
random_logistic_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
random_logistic_analysis.process_data(
    pca=True,
    split_type='random',
    train_prop=0.8,
    dev_prop=0.1,
)
random_logistic_analysis.fit_models(max_iterations=8000)

# test_models() returns the selected model's specs and its confusion matrix.
# NOTE(review): the selection criterion lives inside AnalysisPipeline -- confirm there.
best_model_specs, best_model_conf_matrix = random_logistic_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

In [None]:
# Dev-set ROC-AUC across the lambda grid for the random-split logistic models.
plot_parameter_tuning(dev_set_analysis=random_logistic_analysis.dev_set_analysis)

## Tune SVC

In [None]:
# 21 evenly spaced candidate values on [0, 10] (step 0.5).
lambdas = np.linspace(0, 10, 21)

# Build the 'SVC' candidate models from the grid values.
svcSearchBuilder = ModelGridBuilder('SVC', parameters=lambdas)
models = svcSearchBuilder.get_models()

# Random split with PCA enabled; roc_auc is the metric handed to the pipeline.
# train_prop/dev_prop sum to 0.9 -- presumably the remainder is the test set; confirm.
random_svc_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
random_svc_analysis.process_data(
    pca=True,
    split_type='random',
    train_prop=0.8,
    dev_prop=0.1,
)
random_svc_analysis.fit_models(max_iterations=8000)

# test_models() returns the selected model's specs and its confusion matrix.
# NOTE(review): the selection criterion lives inside AnalysisPipeline -- confirm there.
best_model_specs, best_model_conf_matrix = random_svc_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

In [None]:
# Dev-set ROC-AUC across the parameter grid for the random-split SVC models.
plot_parameter_tuning(dev_set_analysis=random_svc_analysis.dev_set_analysis)

## Test LDA

In [None]:
# LDA is built without a parameter grid.
ldaSearchBuilder = ModelGridBuilder('LDA')
models = ldaSearchBuilder.get_models()

# Random split with PCA enabled; roc_auc is the metric handed to the pipeline.
# train_prop/dev_prop sum to 0.9 -- presumably the remainder is the test set; confirm.
random_lda_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
random_lda_analysis.process_data(
    pca=True,
    split_type='random',
    train_prop=0.8,
    dev_prop=0.1,
)
random_lda_analysis.fit_models()

# test_models() returns the selected model's specs and its confusion matrix.
# NOTE(review): the selection criterion lives inside AnalysisPipeline -- confirm there.
best_model_specs, best_model_conf_matrix = random_lda_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

In [None]:
# Dev-set results for the random-split LDA run.
# NOTE(review): the LDA grid is built without parameters -- confirm each
# dev_set_analysis entry still exposes a value at [0][1] for the x-axis.
plot_parameter_tuning(dev_set_analysis=random_lda_analysis.dev_set_analysis)

# Stratified Split

## Tune Logistic Regression w/ Lasso lambda

In [None]:
# 26 evenly spaced candidate values on [0, 50] (step 2).
lambdas = np.linspace(0, 50, 26)

# Build the 'Logistic Lasso' candidate models from the grid values.
logisticSearchBuilder = ModelGridBuilder('Logistic Lasso', parameters=lambdas)
models = logisticSearchBuilder.get_models()

# 'stratified_class' split with PCA enabled; roc_auc is the metric handed to the pipeline.
# train_prop/dev_prop sum to 0.9 -- presumably the remainder is the test set; confirm.
stratified_logistic_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
stratified_logistic_analysis.process_data(
    pca=True,
    split_type='stratified_class',
    train_prop=0.8,
    dev_prop=0.1,
)
stratified_logistic_analysis.fit_models(max_iterations=8000)

# test_models() returns the selected model's specs and its confusion matrix.
# NOTE(review): the selection criterion lives inside AnalysisPipeline -- confirm there.
best_model_specs, best_model_conf_matrix = stratified_logistic_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

In [None]:
# Dev-set ROC-AUC across the lambda grid for the stratified-split logistic models.
plot_parameter_tuning(dev_set_analysis=stratified_logistic_analysis.dev_set_analysis)

## Tune SVC

In [None]:
# 21 evenly spaced candidate values on [0, 10] (step 0.5).
lambdas = np.linspace(0, 10, 21)

# Build the 'SVC' candidate models from the grid values.
svcSearchBuilder = ModelGridBuilder('SVC', parameters=lambdas)
models = svcSearchBuilder.get_models()

# 'stratified_class' split with PCA enabled; roc_auc is the metric handed to the pipeline.
# train_prop/dev_prop sum to 0.9 -- presumably the remainder is the test set; confirm.
stratified_svc_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
stratified_svc_analysis.process_data(
    pca=True,
    split_type='stratified_class',
    train_prop=0.8,
    dev_prop=0.1,
)
stratified_svc_analysis.fit_models(max_iterations=8000)

# test_models() returns the selected model's specs and its confusion matrix.
# NOTE(review): the selection criterion lives inside AnalysisPipeline -- confirm there.
best_model_specs, best_model_conf_matrix = stratified_svc_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

In [None]:
# Dev-set ROC-AUC across the parameter grid for the stratified-split SVC models.
plot_parameter_tuning(dev_set_analysis=stratified_svc_analysis.dev_set_analysis)

## Test LDA

In [None]:
# LDA is built without a parameter grid.
ldaSearchBuilder = ModelGridBuilder('LDA')
models = ldaSearchBuilder.get_models()

# 'stratified_class' split with PCA enabled; roc_auc is the metric handed to the pipeline.
# train_prop/dev_prop sum to 0.9 -- presumably the remainder is the test set; confirm.
stratified_lda_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
stratified_lda_analysis.process_data(
    pca=True,
    split_type='stratified_class',
    train_prop=0.8,
    dev_prop=0.1,
)
stratified_lda_analysis.fit_models()

# test_models() returns the selected model's specs and its confusion matrix.
# NOTE(review): the selection criterion lives inside AnalysisPipeline -- confirm there.
best_model_specs, best_model_conf_matrix = stratified_lda_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

In [None]:
# Dev-set results for the stratified-split LDA run. This cell previously
# plotted random_lda_analysis (copy-paste slip), which re-drew the random-split
# chart under the "Stratified Split" heading.
# NOTE(review): the LDA grid is built without parameters -- confirm each
# dev_set_analysis entry still exposes a value at [0][1] for the x-axis.
plot_parameter_tuning(stratified_lda_analysis.dev_set_analysis)

# Fixed Training Class-Ratio Split (Undersampling)

## Tune Logistic Regression w/ Lasso lambda

In [None]:
# 26 evenly spaced candidate values on [0, 50] (step 2).
lambdas = np.linspace(0, 50, 26)

# Build the 'Logistic Lasso' candidate models from the grid values.
logisticSearchBuilder = ModelGridBuilder('Logistic Lasso', parameters=lambdas)
models = logisticSearchBuilder.get_models()

# 'set_class_prop_undersample' split with PCA enabled; roc_auc is the metric
# handed to the pipeline.
# class_prop_1_0=1 presumably requests a 1:1 ratio of class 1 to class 0 in
# the training data -- confirm against AnalysisPipeline.process_data.
ratio_logistic_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
ratio_logistic_analysis.process_data(
    pca=True,
    split_type='set_class_prop_undersample',
    dev_prop=0.05,
    class_prop_1_0=1,
)
ratio_logistic_analysis.fit_models(max_iterations=8000)

# test_models() returns the selected model's specs and its confusion matrix.
# NOTE(review): the selection criterion lives inside AnalysisPipeline -- confirm there.
best_model_specs, best_model_conf_matrix = ratio_logistic_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

In [None]:
# Dev-set ROC-AUC across the lambda grid for the undersampled-ratio logistic models.
plot_parameter_tuning(dev_set_analysis=ratio_logistic_analysis.dev_set_analysis)

## Tune SVC

In [None]:
# 21 evenly spaced candidate values on [0, 10] (step 0.5).
lambdas = np.linspace(0, 10, 21)

# Build the 'SVC' candidate models from the grid values.
svcSearchBuilder = ModelGridBuilder('SVC', parameters=lambdas)
models = svcSearchBuilder.get_models()

# 'set_class_prop_undersample' split with PCA enabled; roc_auc is the metric
# handed to the pipeline.
# class_prop_1_0=1 presumably requests a 1:1 ratio of class 1 to class 0 in
# the training data -- confirm against AnalysisPipeline.process_data.
ratio_svc_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
ratio_svc_analysis.process_data(
    pca=True,
    split_type='set_class_prop_undersample',
    dev_prop=0.05,
    class_prop_1_0=1,
)
ratio_svc_analysis.fit_models(max_iterations=8000)

# test_models() returns the selected model's specs and its confusion matrix.
# NOTE(review): the selection criterion lives inside AnalysisPipeline -- confirm there.
best_model_specs, best_model_conf_matrix = ratio_svc_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

In [None]:
# Dev-set ROC-AUC across the parameter grid for the undersampled-ratio SVC models.
plot_parameter_tuning(dev_set_analysis=ratio_svc_analysis.dev_set_analysis)

## Test LDA

In [None]:
# LDA is built without a parameter grid.
ldaSearchBuilder = ModelGridBuilder('LDA')
models = ldaSearchBuilder.get_models()

# 'set_class_prop_undersample' split with PCA enabled; roc_auc is the metric
# handed to the pipeline.
# class_prop_1_0=1 presumably requests a 1:1 ratio of class 1 to class 0 in
# the training data -- confirm against AnalysisPipeline.process_data.
ratio_lda_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
ratio_lda_analysis.process_data(
    pca=True,
    split_type='set_class_prop_undersample',
    dev_prop=0.05,
    class_prop_1_0=1,
)
ratio_lda_analysis.fit_models()

# test_models() returns the selected model's specs and its confusion matrix.
# NOTE(review): the selection criterion lives inside AnalysisPipeline -- confirm there.
best_model_specs, best_model_conf_matrix = ratio_lda_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

In [None]:
# Dev-set results for the undersampled-ratio LDA run.
# NOTE(review): the LDA grid is built without parameters -- confirm each
# dev_set_analysis entry still exposes a value at [0][1] for the x-axis.
plot_parameter_tuning(dev_set_analysis=ratio_lda_analysis.dev_set_analysis)