In [1]:
# Our code imports
from Models import Model
from Metrics import accuracy, fmeasure, roc_auc, confusion_matrix
from Processing import Processor
from Pipelines import ModelGridBuilder, AnalysisPipeline

# Third-party imports
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

import altair as alt

In [2]:
# Read cleaned_training_data.csv, then separate the TARGET column (labels)
# from every other column (features).
credit = pd.read_csv('cleaned_training_data.csv')
credit_Y = credit['TARGET']
credit_X = credit.drop(columns=['TARGET'])

# Random Split

## Tune Logistic Regression with Lasso penalty (lambda)

In [None]:
# Candidate lambda values: 26 evenly spaced points on [0, 100] (step of 4).
# NOTE(review): the grid includes lambda = 0 — confirm ModelGridBuilder
# accepts a zero penalty for 'Logistic Lasso'.
lambdas = np.linspace(start=0, stop=100, num=26)

# One candidate model per lambda value.
logisticSearchBuilder = ModelGridBuilder('Logistic Lasso', parameters=lambdas)
models = logisticSearchBuilder.get_models()

# Random split with PCA: 80% train, 10% dev (the remainder is presumably the
# test set — confirm in AnalysisPipeline.process_data).
random_logistic_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
random_logistic_analysis.process_data(
    pca=True,
    split_type='random',
    train_prop=0.8,
    dev_prop=0.1,
)
random_logistic_analysis.fit_models(max_iterations=8000)

# test_models() hands back the selected model's specs and its confusion matrix.
# NOTE(review): the pipeline was built with roc_auc, so selection is presumably
# driven by ROC AUC rather than by the confusion matrix — confirm.
best_model_specs, best_model_conf_matrix = random_logistic_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

In [None]:
# Display the dev_set_analysis attribute of the logistic pipeline
# (presumably the per-model dev-set scores — confirm in AnalysisPipeline).
random_logistic_analysis.dev_set_analysis

## Tune SVC

In [None]:
# Candidate parameter values for the SVC grid: 26 evenly spaced points on
# [0, 100] (step of 4).
# NOTE(review): the grid includes 0 — confirm ModelGridBuilder accepts a zero
# value for 'SVC'.
lambdas = np.linspace(start=0, stop=100, num=26)

# One candidate model per grid value.
svcSearchBuilder = ModelGridBuilder('SVC', parameters=lambdas)
models = svcSearchBuilder.get_models()

# Random split with PCA: 80% train, 10% dev (the remainder is presumably the
# test set — confirm in AnalysisPipeline.process_data).
random_svc_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
random_svc_analysis.process_data(
    pca=True,
    split_type='random',
    train_prop=0.8,
    dev_prop=0.1,
)
random_svc_analysis.fit_models(max_iterations=8000)

# test_models() hands back the selected model's specs and its confusion matrix.
# NOTE(review): the pipeline was built with roc_auc, so selection is presumably
# driven by ROC AUC rather than by the confusion matrix — confirm.
best_model_specs, best_model_conf_matrix = random_svc_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

In [None]:
# Display the dev_set_analysis attribute of the SVC pipeline
# (presumably the per-model dev-set scores — confirm in AnalysisPipeline).
random_svc_analysis.dev_set_analysis

## Test LDA

In [None]:
# LDA is built without a parameter grid, so the builder is given only the
# model name.
ldaSearchBuilder = ModelGridBuilder('LDA')
models = ldaSearchBuilder.get_models()

# Random split with PCA: 80% train, 10% dev (the remainder is presumably the
# test set — confirm in AnalysisPipeline.process_data).
random_lda_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
random_lda_analysis.process_data(
    pca=True,
    split_type='random',
    train_prop=0.8,
    dev_prop=0.1,
)
random_lda_analysis.fit_models()

# test_models() hands back the selected model's specs and its confusion matrix.
# NOTE(review): the pipeline was built with roc_auc, so selection is presumably
# driven by ROC AUC rather than by the confusion matrix — confirm.
best_model_specs, best_model_conf_matrix = random_lda_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

In [None]:
# Display the dev_set_analysis attribute of the LDA pipeline, as the logistic
# and SVC sections do for theirs; a bare pipeline object would only show the
# object's repr (presumably this holds the dev-set scores — confirm in
# AnalysisPipeline).
random_lda_analysis.dev_set_analysis

# Stratified Split

In [None]:
# Candidate lambda values: 26 evenly spaced points on [0, 100] (step of 4).
# NOTE(review): the grid includes lambda = 0 — confirm ModelGridBuilder
# accepts a zero penalty for 'Logistic Lasso'.
lambdas = np.linspace(0, 100, num=26)

# One candidate model per lambda value.
logisticSearchBuilder = ModelGridBuilder('Logistic Lasso', parameters=lambdas)
models = logisticSearchBuilder.get_models()

# Store the stratified-split run under its own name so it is not mistaken for
# the random-split pipeline.
stratified_logistic_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)

# Backward-compatible alias: this cell originally bound the stratified pipeline
# to `random_logistic_analysis`, and cells further down may still read it.
# TODO: drop the alias once every later cell uses `stratified_logistic_analysis`,
# so the random-split pipeline is no longer overwritten.
random_logistic_analysis = stratified_logistic_analysis

# Class-stratified split with PCA: 80% train, 10% dev (the remainder is
# presumably the test set — confirm in AnalysisPipeline.process_data).
stratified_logistic_analysis.process_data(pca=True, split_type='stratified_class', train_prop=.8, dev_prop=.1)
stratified_logistic_analysis.fit_models(max_iterations=8000)

# test_models() hands back the selected model's specs and its confusion matrix.
# NOTE(review): the pipeline was built with roc_auc, so selection is presumably
# driven by ROC AUC rather than by the confusion matrix — confirm.
best_model_specs, best_model_conf_matrix = stratified_logistic_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs