In [57]:
# Our code imports
from Models import Model
from Metrics import accuracy, fmeasure, roc_auc, net_revenue, confusion_matrix
from Processing import Processor
from Pipelines import ModelGridBuilder, AnalysisPipeline

# Standard lib imports
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

import altair as alt

In [23]:
# Read cleaned_training_data.csv, then separate the TARGET label column
# from the remaining feature columns.
credit = pd.read_csv('cleaned_training_data.csv')
credit_Y = credit.loc[:, 'TARGET']
credit_X = credit.drop('TARGET', axis=1)

In [136]:
def plot_parameter_tuning(dev_set_analysis, y_domain=(.65, .7)):
    """Render a line chart of dev-set score against the tuning parameter.

    Args:
        dev_set_analysis: sequence of ``((model_name, parameter), score)``
            entries, e.g. ``(('Logistic Lasso', 44.0), 0.67)``. Entry
            ``[0][1]`` is plotted on the x axis and entry ``[1]`` on the y axis.
        y_domain: ``(low, high)`` limits for the y axis. Defaults to
            ``(.65, .7)``, the window that used to be hard-coded. Pass ``None``
            to let the axis fit the data instead (without forcing zero into
            view), which is needed for grids whose scores fall outside the
            default window.

    Returns:
        None. The chart is shown through ``display``.
    """
    data = pd.DataFrame({
        'lambda': [entry[0][1] for entry in dev_set_analysis],
        'ROC-AUC': [entry[1] for entry in dev_set_analysis],
    })
    if y_domain is None:
        # Data-driven axis; zero=False keeps the narrow score band readable.
        y_scale = alt.Scale(zero=False)
    else:
        y_scale = alt.Scale(domain=y_domain)
    chart = alt.Chart(data).mark_line().encode(
        alt.X('lambda'),
        alt.Y('ROC-AUC', scale=y_scale),
    )
    display(chart)
    
def get_original_dataframes_from_splits(processor, orignal_df):
    """Slice a dataframe into the train/dev/test rows chosen by a processor.

    Args:
        processor: object exposing ``train_indices``, ``dev_indices`` and
            ``test_indices``; each is passed to ``.loc`` as a row selector.
        orignal_df: dataframe to slice. The misspelled parameter name is kept
            so that existing keyword callers continue to work.

    Returns:
        Tuple ``(train, dev, test)`` of dataframes selected with ``.loc``.
    """
    # The body previously read `original_df`, a name that was never bound
    # inside the function (NameError, or a silent read of a global).
    original_df = orignal_df
    train = original_df.loc[processor.train_indices]
    dev = original_df.loc[processor.dev_indices]
    test = original_df.loc[processor.test_indices]
    return train, dev, test


# Random Split

## Tune Logistic Regression w/ Lasso lambda

In [4]:
# 26 evenly spaced lambda values from 0 to 100 (step 4).
lambdas = np.linspace(0, 100, num=26)

# Build the 'Logistic Lasso' candidates from the lambda grid
# (presumably one model per lambda value — confirm in ModelGridBuilder).
logisticSearchBuilder = ModelGridBuilder('Logistic Lasso', parameters=lambdas)
models = logisticSearchBuilder.get_models()

# roc_auc is the metric handed to the pipeline. Random split with train_prop=.8
# and dev_prop=.1 (the remainder is presumably the test set — confirm).
random_logistic_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
random_logistic_analysis.process_data(pca=True, split_type='random', train_prop=.8, dev_prop=.1)
random_logistic_analysis.fit_models(max_iterations=8000)
# NOTE(review): the pipeline is built with roc_auc and dev_set_analysis holds one
# score per model, so the best model is presumably picked by that dev-set score;
# the confusion matrix is only displayed here — confirm in test_models().
best_model_specs, best_model_conf_matrix = random_logistic_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

Unnamed: 0,True 1,True 0
Pred 1,692,4887
Pred 0,191,6141


('Logistic Lasso', 44.0)

In [133]:
# Line chart of the dev-set score against lambda for the random-split logistic grid.
plot_parameter_tuning(random_logistic_analysis.dev_set_analysis)

In [35]:
# Index 11 of np.linspace(0, 100, num=26) is lambda = 44.0, the selected model;
# the entry is ((model name, lambda), dev-set score).
random_logistic_analysis.dev_set_analysis[11]

(('Logistic Lasso', 44.0), 0.6702736183527075)

## Tune SVC

In [6]:
# 21 evenly spaced parameter values from 0 to 10 (step 0.5) for the 'SVC' grid.
lambdas = np.linspace(0, 10, num=21)
svcSearchBuilder = ModelGridBuilder('SVC', parameters=lambdas)
models = svcSearchBuilder.get_models()

# Same metric (roc_auc) and random 80% train / 10% dev split settings as the
# logistic search; this pipeline performs its own process_data call.
random_svc_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
random_svc_analysis.process_data(pca=True, split_type='random', train_prop=.8, dev_prop=.1)
random_svc_analysis.fit_models(max_iterations=8000)
# NOTE(review): selection presumably uses the dev-set roc_auc score stored in
# dev_set_analysis; the confusion matrix is only displayed — confirm in test_models().
best_model_specs, best_model_conf_matrix = random_svc_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

Unnamed: 0,True 1,True 0
Pred 1,691,4904
Pred 0,192,6124


('SVC', 0.0)

In [134]:
# Line chart of the dev-set score against the SVC parameter (x axis is labelled 'lambda').
plot_parameter_tuning(random_svc_analysis.dev_set_analysis)

In [36]:
# Index 0 of np.linspace(0, 10, num=21) is 0.0, the selected SVC parameter;
# the entry is ((model name, parameter), dev-set score).
random_svc_analysis.dev_set_analysis[0]

(('SVC', 0.0), 0.6689366016124507)

## Test LDA

In [8]:
# LDA is built without a parameter grid, so there is nothing to tune here.
ldaSearchBuilder = ModelGridBuilder('LDA')
models = ldaSearchBuilder.get_models()

# Same metric (roc_auc) and random 80% train / 10% dev split settings as above;
# fit_models is called with its default arguments.
random_lda_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
random_lda_analysis.process_data(pca=True, split_type='random', train_prop=.8, dev_prop=.1)
random_lda_analysis.fit_models()
# NOTE(review): selection presumably uses the dev-set roc_auc score; the
# confusion matrix is only displayed — confirm in test_models(). In the recorded
# run it shows just 12 positive predictions out of 11,911 rows.
best_model_specs, best_model_conf_matrix = random_lda_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

Unnamed: 0,True 1,True 0
Pred 1,8,4
Pred 0,875,11024


['LDA']

In [39]:
# Dev-set score of the first (presumably only) LDA entry: [0] picks the entry, [1] its score.
random_lda_analysis.dev_set_analysis[0][1]

0.504348654778057

In [78]:
# Score LDA against its own held-out split. This cell previously read test_X,
# test_Y and annuity, which are only assigned further down the notebook and
# belong to the logistic pipeline's split, so it failed on a fresh
# Restart & Run All and could mix splits from two different pipelines.
_, _, _, _, lda_test_X, lda_test_Y = random_lda_analysis.processor.get_train_dev_test_sets()
# Annuity amounts for exactly the rows in the LDA test split.
lda_annuity = credit_X['AMT_ANNUITY'].loc[random_lda_analysis.processor.test_indices]
f'TPPA: {net_revenue(random_lda_analysis.best_model.predict(lda_test_X), lda_test_Y, lda_annuity)}'

'TPPA: 0.9294094315752524'

## Best Model-Random Split: Logistic Lasso w/ lambda = 44
ROC-AUC on the dev set ≈ 0.670 (compared with ≈ 0.669 for the best SVC and ≈ 0.504 for LDA on their random splits)

In [94]:
# Evaluate the selected random-split logistic model on the held-out test split.
random_best_model = random_logistic_analysis.best_model
roc = random_logistic_analysis.testscore_best_model()

# Only the last two items of the 6-tuple (the test features/labels) are needed.
# NOTE: test_X, test_Y and annuity are notebook globals that the later
# "best model" cells overwrite.
_, _, _, _, test_X, test_Y = random_logistic_analysis.processor.get_train_dev_test_sets()
predictions = random_best_model.predict(test_X)
acc = accuracy(predictions, test_Y)
# fmeasure at B = 4, 1 and .5 (presumably F-beta scores; labelled F-4 / F-1 / F-.5 below).
f_4 = fmeasure(predictions, test_Y, B=4)
f_1 = fmeasure(predictions, test_Y, B=1)
f_p5 = fmeasure(predictions, test_Y, B=.5)
# AMT_ANNUITY values for the rows in this pipeline's test split, taken from the feature frame.
annuity = credit_X['AMT_ANNUITY'].loc[random_logistic_analysis.processor.test_indices]
tpaa = net_revenue(predictions, test_Y, annuity)
print('ALL w.r.t. TEST SET')
# NOTE(review): there is no space after the first ';' here, unlike the
# equivalent summary strings in the stratified and class-ratio sections.
f'ROC-AUC: {roc:.4f};Accuracy: {acc:.4f}; F-4: {f_4:.4f}; F-1: {f_1:.4f}; F-.5: {f_p5:.4f}; TPAA: {tpaa:.6f}'

ALL w.r.t. TEST SET


'ROC-AUC: 0.6634;Accuracy: 0.5714; F-4: 0.5677; F-1: 0.1914; F-.5: 0.1319; TPAA: 0.551898'

# Stratified Split

## Tune Logistic w/ Lasso

In [10]:
# 26 evenly spaced lambda values from 0 to 50 (step 2); half the range used
# for the random-split search.
lambdas = np.linspace(0, 50, num=26)

logisticSearchBuilder = ModelGridBuilder('Logistic Lasso', parameters=lambdas)
models = logisticSearchBuilder.get_models()

# roc_auc is the metric handed to the pipeline; split_type='stratified_class'
# with train_prop=.8 and dev_prop=.1 (presumably preserves the class balance
# in each split — confirm in process_data).
stratified_logistic_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
stratified_logistic_analysis.process_data(pca=True, split_type='stratified_class', train_prop=.8, dev_prop=.1)
stratified_logistic_analysis.fit_models(max_iterations=8000)
# NOTE(review): selection presumably uses the dev-set roc_auc score stored in
# dev_set_analysis; the confusion matrix is only displayed — confirm in test_models().
best_model_specs, best_model_conf_matrix = stratified_logistic_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

Unnamed: 0,True 1,True 0
Pred 1,640,4834
Pred 0,181,6256


('Logistic Lasso', 42.0)

In [135]:
# Line chart of the dev-set score against lambda for the stratified-split logistic grid.
plot_parameter_tuning(stratified_logistic_analysis.dev_set_analysis)

In [81]:
# Index 21 of np.linspace(0, 50, num=26) is lambda = 42.0, the selected model;
# the entry is ((model name, lambda), dev-set score).
stratified_logistic_analysis.dev_set_analysis[21]

(('Logistic Lasso', 42.0), 0.6718244811304694)

## Tune SVC

In [12]:
# 21 evenly spaced parameter values from 0 to 10 (step 0.5) for the 'SVC' grid.
lambdas = np.linspace(0, 10, num=21)
svcSearchBuilder = ModelGridBuilder('SVC', parameters=lambdas)
models = svcSearchBuilder.get_models()

# Same metric (roc_auc) and stratified_class split settings as the stratified
# logistic search; this pipeline performs its own process_data call.
stratified_svc_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
stratified_svc_analysis.process_data(pca=True, split_type='stratified_class', train_prop=.8, dev_prop=.1)
stratified_svc_analysis.fit_models(max_iterations=8000)
# NOTE(review): selection presumably uses the dev-set roc_auc score stored in
# dev_set_analysis; the confusion matrix is only displayed — confirm in test_models().
best_model_specs, best_model_conf_matrix = stratified_svc_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

Unnamed: 0,True 1,True 0
Pred 1,639,4843
Pred 0,182,6247


('SVC', 0.5)

In [130]:
# Line chart of the dev-set score against the SVC parameter for the stratified split.
plot_parameter_tuning(stratified_svc_analysis.dev_set_analysis)

In [83]:
# Index 1 of np.linspace(0, 10, num=21) is 0.5, the selected SVC parameter;
# the entry is ((model name, parameter), dev-set score).
stratified_svc_analysis.dev_set_analysis[1]

(('SVC', 0.5), 0.6708096967673416)

## LDA

In [14]:
# LDA is built without a parameter grid, so there is nothing to tune here.
ldaSearchBuilder = ModelGridBuilder('LDA')
models = ldaSearchBuilder.get_models()

# Same metric (roc_auc) and stratified_class split settings as above;
# fit_models is called with its default arguments.
stratified_lda_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
stratified_lda_analysis.process_data(pca=True, split_type='stratified_class', train_prop=.8, dev_prop=.1)
stratified_lda_analysis.fit_models()
# NOTE(review): selection presumably uses the dev-set roc_auc score; the
# confusion matrix is only displayed — confirm in test_models(). In the recorded
# run it shows just 15 positive predictions out of 11,911 rows.
best_model_specs, best_model_conf_matrix = stratified_lda_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

Unnamed: 0,True 1,True 0
Pred 1,7,8
Pred 0,814,11082


['LDA']

In [84]:
# Full dev-set result list for LDA; the recorded output holds a single (specs, score) entry.
stratified_lda_analysis.dev_set_analysis

[(['LDA'], 0.5039024084859894)]

## Best Model-Stratified Split: Logistic Lasso w/ lambda = 42

In [95]:
# Evaluate the selected stratified-split logistic model on the held-out test split.
stratified_best_model = stratified_logistic_analysis.best_model
roc = stratified_logistic_analysis.testscore_best_model()

# Only the last two items of the 6-tuple (the test features/labels) are needed.
# NOTE: this overwrites the notebook-global test_X, test_Y, predictions and
# annuity left behind by the random-split evaluation cell.
_, _, _, _, test_X, test_Y = stratified_logistic_analysis.processor.get_train_dev_test_sets()
predictions = stratified_best_model.predict(test_X)
acc = accuracy(predictions, test_Y)
# fmeasure at B = 4, 1 and .5 (presumably F-beta scores; labelled F-4 / F-1 / F-.5 below).
f_4 = fmeasure(predictions, test_Y, B=4)
f_1 = fmeasure(predictions, test_Y, B=1)
f_p5 = fmeasure(predictions, test_Y, B=.5)
# AMT_ANNUITY values for the rows in this pipeline's test split, taken from the feature frame.
annuity = credit_X['AMT_ANNUITY'].loc[stratified_logistic_analysis.processor.test_indices]
tpaa = net_revenue(predictions, test_Y, annuity)
print('ALL w.r.t. TEST SET')
f'ROC-AUC: {roc:.4f}; Accuracy: {acc:.4f}; F-4: {f_4:.4f}; F-1: {f_1:.4f}; F-.5: {f_p5:.4f}; TPAA: {tpaa:.6f}'

ALL w.r.t. TEST SET


'ROC-AUC: 0.6493; Accuracy: 0.5615; F-4: 0.5584; F-1: 0.1909; F-.5: 0.1318; TPAA: 0.538138'

# Set training class ratio split

## Tune Logistic Lasso

In [15]:
# 26 evenly spaced lambda values from 0 to 50 (step 2).
lambdas = np.linspace(0, 50, num=26)

logisticSearchBuilder = ModelGridBuilder('Logistic Lasso', parameters=lambdas)
models = logisticSearchBuilder.get_models()

# roc_auc is the metric handed to the pipeline. split_type='set_class_prop_undersample'
# with class_prop_1_0=1 presumably undersamples to a 1:1 class ratio in the
# training set (see the section heading) — confirm in process_data. Note that
# dev_prop is .05 here and no train_prop is passed, unlike the other sections.
ratio_logistic_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
ratio_logistic_analysis.process_data(pca=True, split_type='set_class_prop_undersample', dev_prop=.05, class_prop_1_0=1)
# The recorded run printed "GRADIENT DID NOT CONVERGE. RESULTS ARE BAD" three
# times for this cell even with max_iterations=8000.
ratio_logistic_analysis.fit_models(max_iterations=8000)
# NOTE(review): selection presumably uses the dev-set roc_auc score stored in
# dev_set_analysis; the confusion matrix is only displayed — confirm in test_models().
best_model_specs, best_model_conf_matrix = ratio_logistic_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

GRADIENT DID NOT CONVERGE. RESULTS ARE BAD
GRADIENT DID NOT CONVERGE. RESULTS ARE BAD
GRADIENT DID NOT CONVERGE. RESULTS ARE BAD


Unnamed: 0,True 1,True 0
Pred 1,327,2503
Pred 0,83,3042


('Logistic Lasso', 44.0)

In [131]:
# Line chart of the dev-set score against lambda for the class-ratio-split logistic grid.
plot_parameter_tuning(ratio_logistic_analysis.dev_set_analysis)

In [98]:
# Index 22 of np.linspace(0, 50, num=26) is lambda = 44.0, the selected model;
# the entry is ((model name, lambda), dev-set score).
ratio_logistic_analysis.dev_set_analysis[22]

(('Logistic Lasso', 44.0), 0.6730816600321098)

## Tune SVC 

In [17]:
# 21 evenly spaced parameter values from 0 to 10 (step 0.5) for the 'SVC' grid.
lambdas = np.linspace(0, 10, num=21)
svcSearchBuilder = ModelGridBuilder('SVC', parameters=lambdas)
models = svcSearchBuilder.get_models()

# Same metric (roc_auc) and set_class_prop_undersample settings as the
# class-ratio logistic search; this pipeline performs its own process_data call.
ratio_svc_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
ratio_svc_analysis.process_data(pca=True, split_type='set_class_prop_undersample', dev_prop=.05, class_prop_1_0=1)
ratio_svc_analysis.fit_models(max_iterations=8000)
# NOTE(review): selection presumably uses the dev-set roc_auc score stored in
# dev_set_analysis; the confusion matrix is only displayed — confirm in test_models().
best_model_specs, best_model_conf_matrix = ratio_svc_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

Unnamed: 0,True 1,True 0
Pred 1,326,2495
Pred 0,84,3050


('SVC', 3.0)

In [18]:
# Line chart of the dev-set score against the SVC parameter for the class-ratio split.
plot_parameter_tuning(ratio_svc_analysis.dev_set_analysis)

In [100]:
# Index 6 of np.linspace(0, 10, num=21) is 3.0, the selected SVC parameter;
# the entry is ((model name, parameter), dev-set score).
ratio_svc_analysis.dev_set_analysis[6]

(('SVC', 3.0), 0.6725835184411357)

## LDA

In [19]:
# LDA is built without a parameter grid, so there is nothing to tune here.
ldaSearchBuilder = ModelGridBuilder('LDA')
models = ldaSearchBuilder.get_models()

# Same metric (roc_auc) and set_class_prop_undersample settings as the other
# class-ratio searches; fit_models is called with its default arguments.
ratio_lda_analysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
ratio_lda_analysis.process_data(pca=True, split_type='set_class_prop_undersample', dev_prop=.05, class_prop_1_0=1)
ratio_lda_analysis.fit_models()
# NOTE(review): selection presumably uses the dev-set roc_auc score; the
# confusion matrix is only displayed — confirm in test_models().
best_model_specs, best_model_conf_matrix = ratio_lda_analysis.test_models()
display(best_model_conf_matrix)
best_model_specs

Unnamed: 0,True 1,True 0
Pred 1,270,1695
Pred 0,140,3850


['LDA']

In [105]:
# Dev-set score of the first (presumably only) LDA entry: [0] picks the entry, [1] its score.
ratio_lda_analysis.dev_set_analysis[0][1]

0.6764278959290946

## Best Model-Train class ratio split: LDA

In [106]:
# Evaluate the class-ratio-split LDA model on the held-out test split.
ratio_best_model = ratio_lda_analysis.best_model
roc = ratio_lda_analysis.testscore_best_model()

# Only the last two items of the 6-tuple (the test features/labels) are needed.
# NOTE: this overwrites the notebook-global test_X, test_Y, predictions and
# annuity left behind by the earlier evaluation cells.
_, _, _, _, test_X, test_Y = ratio_lda_analysis.processor.get_train_dev_test_sets()
predictions = ratio_best_model.predict(test_X)
acc = accuracy(predictions, test_Y)
# fmeasure at B = 4, 1 and .5 (presumably F-beta scores; labelled F-4 / F-1 / F-.5 below).
f_4 = fmeasure(predictions, test_Y, B=4)
f_1 = fmeasure(predictions, test_Y, B=1)
f_p5 = fmeasure(predictions, test_Y, B=.5)
# AMT_ANNUITY values for the rows in this pipeline's test split, taken from the feature frame.
annuity = credit_X['AMT_ANNUITY'].loc[ratio_lda_analysis.processor.test_indices]
tpaa = net_revenue(predictions, test_Y, annuity)
print('ALL w.r.t. TEST SET')
f'ROC-AUC: {roc:.4f}; Accuracy: {acc:.4f}; F-4: {f_4:.4f}; F-1: {f_1:.4f}; F-.5: {f_p5:.4f}; TPAA: {tpaa:.6f}'

ALL w.r.t. TEST SET


'ROC-AUC: 0.6712; Accuracy: 0.7053; F-4: 0.5227; F-1: 0.2279; F-.5: 0.1647; TPAA: 0.698450'

# Best overall model: Class ratio-split LDA
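LDA trained on the class-ratio split (class 1 : class 0 ratio = 1) was the best model, with a test-set ROC-AUC of 0.6712 (see the cell directly above), compared with 0.6634 for the best random-split model and 0.6493 for the best stratified-split model.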

In [116]:
# These are the misclassifications on the dev set for the best model
# (class-ratio-split LDA); they are written to error_analysis.csv.

ratio_best_model = ratio_lda_analysis.best_model

# Only the dev features/labels (items 3 and 4 of the 6-tuple) are needed here.
_, _, dev_X, dev_Y, _, _ = ratio_lda_analysis.processor.get_train_dev_test_sets()
predictions = ratio_best_model.predict(dev_X)
# The boolean mask keeps the dev indices whose prediction differs from the
# label, and .loc then pulls those rows from the feature frame.
# NOTE(review): assumes dev_indices supports boolean-mask indexing and is
# aligned element-for-element with dev_Y and predictions — confirm.
missclassifications = credit_X.loc[ratio_lda_analysis.processor.dev_indices[predictions != dev_Y]]
# index=False omits the row labels from the CSV, so the file only identifies the
# original rows if an identifier column is among the features — TODO confirm.
missclassifications.to_csv('error_analysis.csv', index=False)

In [120]:
# Load cleaned_test_data.csv into to_predict.
to_predict = pd.read_csv('cleaned_test_data.csv')
# NOTE(review): a bare expression that is not the last statement of a cell is
# not rendered, so this line has no visible effect.
to_predict
# Only this last expression is displayed: application_test.csv is read and
# shown but not bound to a name.
pd.read_csv('application_test.csv')

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,Cash loans,F,N,Y,0,121500.0,412560.0,17473.5,270000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
48740,456222,Cash loans,F,N,N,2,157500.0,622413.0,31909.5,495000.0,...,0,0,0,0,,,,,,
48741,456223,Cash loans,F,Y,Y,1,202500.0,315000.0,33205.5,315000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,3.0,1.0
48742,456224,Cash loans,M,N,N,0,225000.0,450000.0,25128.0,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
