In [1]:
# Our code imports
from Models import Model
from Metrics import accuracy, fmeasure, roc_auc, confusion_matrix
from Processing import Processor
from Pipelines import ModelGridBuilder, AnalysisPipeline

# Standard lib imports
import pandas as pd
import numpy as np

## Import the final dataset

In [2]:
credit = pd.read_csv('cleaned_training_data.csv', index_col=0).reset_index(drop=True).drop(columns=['FLAG_MOBIL', 'FLAG_DOCUMENT_2'])
credit_X = credit.drop(columns='TARGET')
credit_Y = credit['TARGET']

# Build solo models (see later for hyperparameter tuning)

## Preprocess the data
**Use the processor object to handle the data going forward**

In [3]:
processor = Processor(credit_X, credit_Y)
# Standardize dummify, and pad with B0. Numeric data is infered if no column names provided
processor.process_data()

## Choose a split type ('random', 'stratified_class', 'set_class_prop_undersample')

'random' performs random selection. 'stratified_class' ensures that the train, dev, and test set each have the same proportions of each class as the overall dataset (\~93% 0 and 7% 1). 'set_class_prop_undersample' requires the 'class_prop_1_0' parameter to be set (ratio between # of 1s / # of 0s) and forces the training set to have this ratio of class levels. Note that the dev and test sets will have stratified class levels at the overall dataset ratio (\~ 93% 0 and 7% 1).

**random**

In [4]:
processor.calculate_train_dev_test_split(split_type='random', train_prop=.8, dev_prop=.1)

train_X, train_Y, dev_X, dev_Y, test_X, test_Y = processor.get_train_dev_test_sets()

train_X.shape, train_Y.shape, dev_X.shape, dev_Y.shape, test_X.shape, test_Y.shape

((102113, 144), (102113,), (12764, 144), (12764,), (12764, 144), (12764,))

**stratified_class**

Note: the calculated train, dev, and test sets are saved and can be accessed by the .get_train_dev_test_sets function. Re-running .calculate_train_dev_test_split() re-calculates the dataset splits

In [5]:
processor.calculate_train_dev_test_split(split_type='stratified_class', train_prop=.8, dev_prop=.1)

train_X, train_Y, dev_X, dev_Y, test_X, test_Y = processor.get_train_dev_test_sets()

pd.Series(train_Y).value_counts() / len(train_Y), pd.Series(dev_Y).value_counts() / len(dev_Y), pd.Series(test_Y).value_counts() / len(test_Y)

Re-calculating train, dev, and test sets


(0    0.932026
 1    0.067974
 dtype: float64,
 0    0.932002
 1    0.067998
 dtype: float64,
 0    0.932069
 1    0.067931
 dtype: float64)

**set_class_prop_undersample**

Note: train_prop is ignored here (the size of the training set is determined by how large the dev/test sets are and the 1/0 class level ratio).

In [6]:
processor.calculate_train_dev_test_split(split_type='set_class_prop_undersample', dev_prop=.03, class_prop_1_0=1)

train_X, train_Y, dev_X, dev_Y, test_X, test_Y = processor.get_train_dev_test_sets()

train_Y.shape, dev_Y.shape, test_Y.shape, pd.Series(train_Y).value_counts()

Re-calculating train, dev, and test sets


((16312,),
 (3829,),
 (3829,),
 0    8156
 1    8156
 dtype: int64)

## Build solo models w/ preprocessed data
**Available models: 'Logistic', 'Logistic Lasso', 'SVC', 'SVC_C', 'LDA'**

Note: gradient descent defaults to
- max_iterations=75000
- tol = 1e-3
- etas = [.1, .01, .001, .0001, .00001, .000001]

You can modify these by passing in named variables to any model '.fit()' function

If you would like to turn off output as gradient descent trys each eta value, use 'show_iter=False'

In [7]:
logistic = Model('Logistic')
logistic.fit(train_X, train_Y, etas=[.1, .0001], show_iter=True)

Eta: 0.1; Iterations: 2
Gradient converged w/ 3085 iterations and eta = 0.0001


Any models that require a lambda value requires that it is provided when the model is created (lamb=)

In [26]:
logistic = Model('Logistic Lasso', lamb=1)
logistic.fit(train_X, train_Y, show_iter=False)

In [27]:
svc = Model('SVC', lamb=1)
svc.fit(train_X, train_Y, show_iter=False)

Access any modles coefficients via '.coef_'

In [30]:
svc.coef_[:10]

array([ 0.00000000e+00,  9.49645789e-06,  1.30141738e-05, -4.68515224e-05,
       -3.00345191e-05, -6.14864012e-05,  1.49985113e-04, -8.22653154e-05,
        5.92100506e-05,  9.42785972e-05])

LDA '.coef_' returns a 2-dimensional tuple where [0] holds B and [1] holds the cutoff value (c) for which XB >= C is class 1.

In [None]:
lda = Model('LDA')
lda.fit(train_X, train_Y)
lda.coef_

## Make predictions and score

In [10]:
predictions = logistic.predict(dev_X, dev_Y)
display(confusion_matrix(predictions, dev_Y))
'Accuracy: ', accuracy(predictions, dev_Y)

Unnamed: 0,True 1,True 0
Pred 1,171,1108
Pred 0,89,2461


('Accuracy: ', 0.6873857404021938)

# Run some analysis of class level ratios in training set

In [None]:
ratios = np.linspace(0, 2, num=21)
fscores = []
accuracies = []
roc_aucs = []
saved_predictions = []
for class_ratio in ratios:
    # Get splits for the class_ratio
    processor.calculate_train_dev_test_split(split_type='set_class_prop_undersample', dev_prop=.03, 
                                             class_prop_1_0=class_ratio, silence=True)
    train_X, train_Y, dev_X, dev_Y, test_X, test_Y = processor.get_train_dev_test_sets()
    # Train model and get predictions on the dev set
    logisticLasso = Model('Logistic Lasso', lamb=1)
    logisticLasso.fit(train_X, train_Y, max_iterations=4000, etas=[.001, .0001], show_iter=True)
    predictions = logisticLasso.predict(dev_X, dev_Y)
    saved_predictions.append(predictions)
    fscores.append(fmeasure(predictions, dev_Y, B=5))
    accuracies.append(accuracy(predictions, dev_Y))
    roc_aucs.append(roc_auc(predictions, dev_Y))

In [10]:
import altair as alt

data = pd.DataFrame({
    'Ratios': ratios,
    'F-5': fscores,
    'ROC-AUC': roc_aucs,
    'Accuracy': accuracies
})

alt.Chart(data).mark_line().encode(
    x='Ratios',
    y='ROC-AUC'
)

# Run multiple models at once and do hyperparameter tuning

## Use the 'ModelGridBuilder' class to create multiple models at once to test

In [38]:
logisticSearchBuilder = ModelGridBuilder('Logistic Lasso', parameters=[0, .25, .5, 1, 1.5, 2])
svcSearchBuilder = ModelGridBuilder('SVC', parameters=[0, .25, .5, 1, 1.5, 2])
ldaSearchBuilder = ModelGridBuilder('LDA')

**Save all models created into a single list**

In [39]:
models = logisticSearchBuilder.get_models() + svcSearchBuilder.get_models() + ldaSearchBuilder.get_models()

## Use the 'AnalysisPipeline' class to create a grid search over the provided models
**Provide AnalysisPipeline with the unprocessed X and Y data, models, and a scoring function to be used on the dev/test sets**

Run '.process_data()' to process the data via the input split type (see notes on the Processor object above)

Run '.fit_models()' to fit every model provided on the processed data (can pass in gradient descent variables as above)

In [42]:
modelAnalysis = AnalysisPipeline(credit_X, credit_Y, models, roc_auc)
modelAnalysis.process_data(split_type='stratified_class', train_prop=.8, dev_prop=.1)
modelAnalysis.fit_models(max_iterations=1000, etas=[.001])

**Run predictions on the dev set for every model and determine which one is best via '.test_models()'**

**Output all of the computed scores for each model via '.dev_set_analysis'**

In [None]:
best_model_specs, conf_mat = modelAnalysis.test_models()
display(conf_mat)
best_model_specs, modelAnalysis.dev_set_analysis

**Change the scoring function and retest the models**

In [None]:
modelAnalysis.score_func = lambda p, Y: fmeasure(p, Y, B=10)

best_model_specs, conf_mat = modelAnalysis.test_models()
display(conf_mat)
best_model_specs, modelAnalysis.dev_set_analysis

**Get the test set scores for the best model**

In [None]:
modelAnalysis.testscore_best_model()