In [1]:
%%capture
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import math

import seaborn as sns
import matplotlib.pyplot as plt

from py_helper_functions import *

from datetime import datetime

from patsy.highlevel import dmatrices
from sklearn.metrics import mean_squared_error
import sklearn.metrics as metrics

In [2]:
# helper functions
def regression_results(y_true, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))
    
def create_coef_matrix(X, model):
    coef_matrix = pd.concat(
        [pd.DataFrame(X.columns),pd.DataFrame(model.coef_.flatten())], axis = 1
    )
    coef_matrix.columns = ['variable', 'coefficient']
    coef_matrix.iloc[-1] = ['Intercept', model.intercept_.flatten()[0]]
    return coef_matrix

def cv_summary(lambdas, C_values, model):
    d = {'lambdas': lambdas, 'C_values': C_values, 'mean_cv_score': model.scores_[1].mean(axis = 0)}
    return(pd.DataFrame(data=d))

def create_roc_plot(y_true, y_pred):
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    all_coords = pd.DataFrame({
        'fpr': fpr,
        'tpr': tpr,
        'thresholds': thresholds
    })
    
    plot = ggplot(all_coords, aes(x = 'fpr', y = 'tpr')) \
        + geom_line(color=color[0], size = 0.7) \
        + geom_area(position = 'identity', fill = 'mediumaquamarine', alpha = 0.3) \
        + xlab("False Positive Rate (1-Specifity)") \
        + ylab("True Positive Rate (Sensitivity)") \
        + geom_abline(intercept = 0, slope = 1,  linetype = "dotted", color = "black") \
        + scale_y_continuous(limits = (0, 1), breaks = seq(0, 1, .1), expand = (0, 0.01)) \
        + scale_x_continuous(limits = (0, 1), breaks = seq(0, 1, .1), expand = (0.01, 0)) \
        + theme_bw()
    return(plot)

def sigmoid_array(x):
    return(1 / (1 + np.exp(-x)))

def generate_fold_prediction(model, X, fold, param_index):
    fold_coef = model.coefs_paths_[1][fold,param_index,:]
    return(sigmoid_array(np.dot(X, np.transpose(fold_coef)[:-1]) +  np.transpose(fold_coef)[-1]))

def create_loss_plot(all_coords, optimal_threshold, curr_exp_loss):
    all_coords_copy = all_coords.copy()
    all_coords_copy['loss'] = (all_coords_copy.false_pos*FP + all_coords_copy.false_neg*FN)/all_coords_copy.n
    
    t = optimal_threshold
    l = curr_exp_loss
    
    plot = ggplot(all_coords_copy, aes(x = 'thresholds', y = 'loss')) + \
        geom_line(color=color[0], size=0.7) + \
        scale_x_continuous(breaks = seq(0, 1.1, by = 0.1)) + \
        coord_cartesian(xlim=(0,1))+ \
        geom_vline(xintercept = t , color = color[0] ) + \
        annotate(geom = "text", x = t - 0.01, y= max(all_coords_copy.loss) - 0.4,
                 label="best threshold: " + str(round(t,2)),
                 colour=color[1], angle=90, size = 7) +\
        annotate(geom = "text", x = t + 0.06, y= l,\
                 label= str(round(l, 2)), size = 7) +\
        theme_bw()
    return(plot)


def create_roc_plot_with_optimal(all_coords, optimal_threshold):
    all_coords_copy = all_coords.copy()
    all_coords_copy['sp'] = all_coords_copy.true_neg/all_coords_copy.neg
    all_coords_copy['se'] = all_coords_copy.true_pos/all_coords_copy.pos
    
    best_coords = all_coords_copy[all_coords_copy.thresholds == optimal_threshold]
    sp = best_coords.sp.values[0]
    se = best_coords.se.values[0]

    plot = ggplot(all_coords_copy, aes(x = 'sp', y = 'se')) +\
        geom_line(color=color[0], size=0.7) +\
        scale_y_continuous(breaks = seq(0, 1.1, by = 0.1)) +\
        scale_x_reverse(breaks = seq(0, 1.1, by = 0.1)) +\
        geom_point(data = pd.DataFrame({'sp': [sp], 'se': [se]})) +\
        annotate(geom = "text", x = sp, y = se + 0.03,
                 label = str(round(sp, 2)) + ', ' + str(round(se, 2)), size = 7) +\
        theme_bw()
    return(plot)

In [3]:
def get_sme_comp_default(df):
    '''
    This function determine the default status for SME firms
    :param df: raw data
    :return: dataframe with default status for only SME firms
    '''
    # add all missing year and comp_id combinations -
    # originally missing combinations will have NAs in all other columns
    df = (
        df.set_index(["year", "comp_id"])
        .unstack(fill_value="toReplace")
        .stack()
        .reset_index()
    )
    df = df.replace("toReplace", np.nan)  # only way I could define it as NaN
    
    # generate status_alive; if sales larger than zero and not-NA, then firm is alive
    df["status_alive"] = (df["sales"] > 0 & (False == df["sales"].isna())).astype(int)
    
    # defaults in one year if there are sales in this year but no sales one year later
    # Status_in_one_years: data.groupby('comp_id')['status_alive'].shift(-1)
    df["default"] = (
        (df["status_alive"] == 1)
        & (df.groupby("comp_id")["status_alive"].shift(-1) == 0)
    ).astype(int)
    
    # filter for SME firms
    return df[(df.sales >= 1000) & (df.sales <= 10_000_000)]

def get_cleaned_data(local=True):
    '''
    This function reads from csv files, cleans it and returns the cleaned dataframe
    :param local: default True
    :param src: path to file
    :return: dataframe with cleaned data
    '''
    
    if local:
        raw_files = ['cs_bisnode_panel1.csv', 'cs_bisnode_panel2.csv']
    else:
        raw_files = ['https://raw.githubusercontent.com/viethngn/Data_Analysis_3_ECBS5171/main/assignment3/cs_bisnode_panel1.csv', 'https://raw.githubusercontent.com/viethngn/Data_Analysis_3_ECBS5171/main/assignment3/cs_bisnode_panel2.csv']
        
    dfs = []
    for file in raw_files:
        u_df = pd.read_csv(file)
        dfs.append(u_df)
    df = pd.concat(dfs, ignore_index=True)
    
    # drop variables with many NAs
    df = df.drop(columns=["COGS", "finished_prod", "net_dom_sales", "net_exp_sales", "wages", "D", "exit_year", "exit_date", "birth_year", "labor_avg", "founded_year"])

    working_sample = get_sme_comp_default(df[(df.ind2 == 26)])
    
    # engineering new features
    working_sample['n_day_alive'] = working_sample.apply(lambda x: (pd.Timestamp(f'31/12/{x.year}') - pd.Timestamp(x['founded_date'])).days, axis=1)
    
    working_sample.rename(columns={'amort': 'n_amort',
                            'curr_assets': 'n_curr_assets',
                            'curr_liab': 'n_curr_liab',
                            'extra_exp': 'n_extra_exp',
                            'extra_inc': 'n_extra_inc',
                            'extra_profit_loss': 'n_extra_profit_loss',
                            'fixed_assets': 'n_fixed_assets',
                            'inc_bef_tax': 'n_inc_bef_tax',
                            'intang_assets': 'n_intang_assets',
                            'inventories': 'n_inventories',
                            'liq_assets': 'n_liq_assets',
                            'material_exp': 'n_material_exp',
                            'personnel_exp': 'n_personnel_exp',
                            'profit_loss_year': 'n_profit_loss_year',
                            'sales': 'n_sales',
                            'share_eq': 'n_share_eq',
                            'subscribed_cap': 'n_subscribed_cap',
                            'tang_assets': 'n_tang_assets',
                            'balsheet_flag': 'd_balsheet_flag',
                            'balsheet_length': 'n_balsheet_length',
                            'balsheet_notfullyear': 'd_balsheet_notfullyear',
                            'ceo_count': 'n_ceo_count',
                            'foreign': 'n_foreign',
                            'female': 'n_female',
                            'inoffice_days': 'n_inoffice_days',
                            'gender': 'f_gender',
                            'origin': 'f_origin',
                            'urban_m': 'f_urban_m',
                            'region_m': 'f_region_m',
                            'day_alive': 'n_day_alive'}, inplace=True)
    
    # create the list of categorical and numerical column names
    categorical_columns = [col for col in working_sample.columns if col.startswith("f_")]
    numerical_columns = [col for col in working_sample.columns if col.startswith("n_")]
    binary_columns = [col for col in working_sample.columns if col.startswith("d_")]
    
    # # flag observations with missing values to impute
    working_sample['imputed_flag'] = working_sample[categorical_columns + numerical_columns].isna().any(axis=1)

    # handle missing values for categorical features
    working_sample[categorical_columns] = working_sample[categorical_columns].fillna('Missing')

    # handle missing values for numerical features by replacing with median as distribution is not symmetrical
    num_medians = working_sample[numerical_columns].median()
    working_sample[numerical_columns] = working_sample[numerical_columns].fillna(num_medians)
    
    # add log columns for some numerical columns to improve the distribution balance
    log_numerical_columns = []
    for col in numerical_columns:
        if 'n_sales' in col:
            working_sample[f'ln_{"_".join(col.split("_")[1:])}'] = np.log(working_sample[col])
            log_numerical_columns.append(f'ln_{"_".join(col.split("_")[1:])}')
        else:
            log_numerical_columns.append(col)
            
    # cube root transform for some numerical columns to improve the distribution balance
    cube_root_columns = []
    for col in numerical_columns:
        if col in ['n_amort', 'n_curr_assets', 'n_curr_liab', 'n_extra_exp', 'n_extra_inc', 'n_extra_profit_loss', 'n_fixed_assets', 'n_inc_bef_tax', 'n_intang_assets', 'n_inventories', 'n_liq_assets', 'n_material_exp', 'n_personnel_exp', 'n_profit_loss_year', 'n_sales', 'n_share_eq', 'n_subscribed_cap', 'n_tang_assets']:
            working_sample[f'cb_{"_".join(col.split("_")[1:])}'] = np.cbrt(working_sample[col])
            cube_root_columns.append(f'cb_{"_".join(col.split("_")[1:])}')
        else:
            cube_root_columns.append(col)
    
    del df
        
    return working_sample[(working_sample["begin"].notna()) & (working_sample.year < 2016)], categorical_columns, numerical_columns + binary_columns, log_numerical_columns + binary_columns, cube_root_columns + binary_columns

In [4]:
work_df, categorical_columns, numerical_columns, log_numerical_columns, croot_numerical_columns = get_cleaned_data()
# work_df = get_cleaned_data(local=False)
work_df.head(20)

Unnamed: 0,year,comp_id,begin,end,n_amort,n_curr_assets,n_curr_liab,n_extra_exp,n_extra_inc,n_extra_profit_loss,...,cb_intang_assets,cb_inventories,cb_liq_assets,cb_material_exp,cb_personnel_exp,cb_profit_loss_year,cb_sales,cb_share_eq,cb_subscribed_cap,cb_tang_assets
0,2005,6538183.0,01/01/2005,31/12/2005,792.59259,6237.037109,348.148163,0.0,0.0,0.0,...,0.0,0.0,17.623963,29.664296,15.053308,-10.459997,30.824849,19.590436,3.815714,11.943493
1,2005,6934257.0,13/05/2005,31/12/2005,803.703674,4648.147949,9311.111328,0.0,0.0,0.0,...,0.0,0.0,13.463993,21.20386,5.737669,-19.293218,15.420217,8.311052,22.314432,22.816292
2,2005,8416055.0,01/01/2005,31/12/2005,3155.555664,71070.36719,25514.81445,74.074074,0.0,-74.074074,...,3.728963,15.730637,39.718705,51.001534,28.469167,0.0,56.215174,38.890824,22.314432,23.292445
5,2005,12428378.0,01/01/2005,31/12/2005,3137.037109,1740.740723,2192.592529,0.0,0.0,0.0,...,2.456021,0.0,11.821076,24.881414,21.749844,7.956555,31.078717,11.342553,4.199737,12.37757
7,2005,15711554.0,01/01/2005,31/12/2005,1592.592651,4740.740723,4970.370605,0.0,7407.407227,7407.407227,...,0.0,8.67061,11.623423,22.28961,15.107594,6.8557,20.880427,9.093075,22.314432,9.937887
8,2005,16005183.0,01/01/2005,31/12/2005,5659.259277,71714.8125,40607.40625,0.0,0.0,0.0,...,4.860332,34.307896,18.69087,58.164592,17.592107,-16.181574,60.713699,35.500494,22.314432,23.821898
10,2005,17776540.0,01/01/2005,31/12/2005,11722.22266,167988.8906,156914.8125,0.0,1851.851807,1851.851807,...,4.753477,39.088168,15.675565,88.083547,31.929484,22.319389,91.478107,43.872937,36.379763,51.380776
15,2005,50481104.0,01/01/2005,31/12/2005,288.888886,814.814819,2411.111084,0.0,0.0,0.0,...,0.0,0.0,9.340131,10.582674,22.04338,-6.49565,23.148195,-7.857828,6.057069,10.357442
16,2005,55608576.0,01/01/2005,31/12/2005,3196.296387,3781.481445,0.0,0.0,0.0,0.0,...,0.0,0.0,15.466804,9.950371,0.0,8.735805,17.390345,17.777633,5.69992,14.475716
22,2005,82197808.0,01/01/2005,31/12/2005,20574.07422,151692.5938,156037.0313,0.0,0.0,0.0,...,25.264355,29.3323,35.747586,78.74249,32.224645,17.68733,80.677882,29.784464,22.314432,45.96513


In [5]:
holdout_set = work_df[(work_df.year == 2014)]
holdout_set.shape

(1037, 60)

In [6]:
holdout_set.default.sum()

56

In [7]:
training_set = work_df[(work_df.year != 2014)]
training_set.shape

(10727, 60)

In [8]:
training_set[(work_df.year == 2015)].default.sum()

489

In [9]:
training_set[numerical_columns].skew();

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# building preprocessing for pipeline
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessing = ColumnTransformer(
    [
        ("cat", categorical_encoder, categorical_columns)
    ], remainder='passthrough'
)

Logit

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV

# no regularisation needed so setting the parameter to very high value
Cs_value_logit = [1e20]

# define cv search
logit_search = LogisticRegressionCV(
    Cs=Cs_value_logit,
    refit=True, 
    scoring='neg_brier_score', 
    solver="newton-cg", 
    tol=1e-7, 
    random_state=42
)

logit_pipe = Pipeline(
    [("preprocess", preprocessing), ("regressor", logit_search)], verbose=True
)

In [12]:
# start_time = datetime.now()
# logit_results = logit_pipe.fit(training_set[categorical_columns + numerical_columns], training_set.default)
# logit1_time = datetime.now() - start_time
# 
# logit_search.scores_[1].mean()

In [13]:
# start_time = datetime.now()
# logit_results = logit_pipe.fit(training_set[categorical_columns + log_numerical_columns], training_set.default)
# logit2_time = datetime.now() - start_time
# 
# logit_search.scores_[1].mean()

In [14]:
start_time = datetime.now()
logit_results = logit_pipe.fit(training_set[categorical_columns + croot_numerical_columns], training_set.default)
logit3_time = datetime.now() - start_time

scores = logit_search.scores_[1]  # For class label 1
mean_scores = np.mean(scores, axis=0)
mean_scores

[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   0.0s
[Pipeline] ......... (step 2 of 2) Processing regressor, total= 2.3min


array([-0.08287456])

In [15]:
logit_holdout_rmse = mean_squared_error(logit_pipe.predict(holdout_set[categorical_columns + croot_numerical_columns]), holdout_set['default'], squared=False)
logit_holdout_rmse

0.24451554054099697

LASSO Logit

In [16]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
# define cv search
# lasso_search = GridSearchCV(
#     lasso_model,
#     {"alpha":[i/100 for i in range(1, 5, 1)], "l1_ratio": [0.1, 0.2, 0.3],},
#     cv=5,
#     scoring="neg_root_mean_squared_error",
#     verbose=3,
# )
lambdas = list(10**np.arange(-1, -4.01, -1/3))
n_obs = training_set.shape[0]*4/5
Cs_values = [1/(l*n_obs) for l in lambdas]

lasso_search = LogisticRegressionCV(
    Cs = Cs_values, 
    penalty = 'l1', # L1 makes it lasso
    cv = 5, 
    refit = True, 
    scoring = 'neg_brier_score', 
    solver = 'liblinear',
    random_state = 42
)

lasso_pipe = Pipeline(
    [("preprocess", preprocessing), 
     ('interactions', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
     ('scaling', StandardScaler()),
     ("regressor", lasso_search)
     ], verbose=True
)
Cs_values

[0.001165283863149063,
 0.0025105279785027075,
 0.0054087685671818505,
 0.011652838631490626,
 0.025105279785027055,
 0.054087685671818445,
 0.11652838631490617,
 0.25105279785027057,
 0.5408768567181845,
 1.1652838631490607]

In [17]:
# start_time = datetime.now()
# lasso_results = lasso_pipe.fit(training_set[categorical_columns + numerical_columns], training_set.default)
# lasso1_time = datetime.now() - start_time
# lasso_results

In [18]:
# # Getting the best score and corresponding C for binary classification
# scores = lasso_search.scores_[1]  # For class label 1
# mean_scores = np.mean(scores, axis=0)
# best_score = np.max(mean_scores)
# best_C_index = np.argmax(mean_scores)
# best_C = lasso_search.Cs_[best_C_index]
# 
# print(f"Best Score: {best_score}, Best C: {best_C}")

In [19]:
# start_time = datetime.now()
# lasso_results = lasso_pipe.fit(training_set[categorical_columns + log_numerical_columns], training_set.default)
# lasso2_time = datetime.now() - start_time
# lasso_results

In [20]:
# # Getting the best score and corresponding C for binary classification
# scores = lasso_search.scores_[1]  # For class label 1
# mean_scores = np.mean(scores, axis=0)
# best_score = np.max(mean_scores)
# best_C_index = np.argmax(mean_scores)
# best_C = lasso_search.Cs_[best_C_index]
# 
# print(f"Best Score: {best_score}, Best C: {best_C}")

In [21]:
start_time = datetime.now()
lasso_results = lasso_pipe.fit(training_set[categorical_columns + croot_numerical_columns], training_set.default)
lasso3_time = datetime.now() - start_time
lasso_results

[Pipeline] ........ (step 1 of 4) Processing preprocess, total=   0.0s
[Pipeline] ...... (step 2 of 4) Processing interactions, total=   0.1s
[Pipeline] ........... (step 3 of 4) Processing scaling, total=   0.1s
[Pipeline] ......... (step 4 of 4) Processing regressor, total= 3.6min


In [22]:
# Getting the best score and corresponding C for binary classification
scores = lasso_search.scores_[1]  # For class label 1
mean_scores = np.mean(scores, axis=0)
best_score = np.max(mean_scores)
best_C_index = np.argmax(mean_scores)
best_C = lasso_search.Cs_[best_C_index]

print(f"Best Score: {best_score}, Best C: {best_C}")

Best Score: -0.08116624142785218, Best C: 0.054087685671818445


In [23]:
lasso_holdout_rmse = mean_squared_error(lasso_pipe.predict(holdout_set[categorical_columns + croot_numerical_columns]), holdout_set['default'], squared=False)
lasso_holdout_rmse

0.23852649559992756

Random Forest

In [24]:
np.sqrt(len(numerical_columns + categorical_columns))

5.477225575051661

In [25]:
from sklearn.ensemble import RandomForestClassifier

grid = {'max_features': [6, 8, 10],
        'criterion':['gini', 'entropy'],
        'min_samples_split': [8, 10, 12],
        "min_samples_leaf": [5, 8, 10]}

prob_forest = RandomForestClassifier(
    random_state=42, 
    n_estimators=500, 
    oob_score=True)

prob_forest_search = GridSearchCV(
    prob_forest, 
    grid, 
    cv=5, 
    refit='accuracy',
    scoring = ['accuracy', 'roc_auc', 'neg_brier_score'], 
    n_jobs=-1)

prob_forest_pipe = Pipeline(
    [("preprocess", preprocessing), ("regressor", prob_forest_search)], verbose=True
)

In [26]:
start_time = datetime.now()
prob_forest_results = prob_forest_pipe.fit(training_set[categorical_columns + croot_numerical_columns], training_set.default)
prob_forest1_time = datetime.now() - start_time
prob_forest_results

[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   0.0s
[Pipeline] ......... (step 2 of 2) Processing regressor, total=10.6min


In [27]:
prob_forest_search.best_estimator_

In [28]:
prob_forest_search.best_score_

0.900624873728322

In [29]:
df_rf_model_cv_results = pd.DataFrame(prob_forest_search.cv_results_)[[
    'param_max_features', 'param_min_samples_leaf', 
    'param_min_samples_split', 'rank_test_roc_auc', 'mean_test_roc_auc', 
    'rank_test_accuracy', 'mean_test_accuracy', 
    'rank_test_neg_brier_score', 'mean_test_neg_brier_score']]
df_rf_model_cv_results.columns = ['max features', 'min node size', 'min split size', 'rank roc auc', 'roc auc score', 'rank accuracy', 'accuracy score', 'rank brier score', 'brier score']
# df_rf_model_cv_results.pivot(
#     index = 'max features', 
#     columns = 'min node size', 
#     values = 'RMSE').round(2)*-1
df_rf_model_cv_results['root brier score'] = np.sqrt(-1*df_rf_model_cv_results['brier score'])
df_rf_model_cv_results;

In [30]:
prob_rf_holdout_rmse = mean_squared_error(prob_forest_pipe.predict(holdout_set[categorical_columns + croot_numerical_columns]), holdout_set['default'], squared=False)
prob_rf_holdout_rmse

0.2978547535205585

GBM

In [31]:
from sklearn.ensemble import HistGradientBoostingClassifier

# gbm_grid = {"max_depth": [5, 10, 15], "min_samples_leaf": [5, 10, 15], "max_features": [8, 10, 12]}
gbm_grid = {"max_depth": [12, 15, 18, 21], "min_samples_leaf": [5, 6, 7], 'learning_rate': [0.1, 0.01, 0.001]}

gbm = HistGradientBoostingClassifier( 
    random_state = 42)

gbm_search = GridSearchCV(
    gbm, 
    gbm_grid, 
    cv=5, 
    refit='accuracy',
    scoring = ['accuracy', 'roc_auc', 'neg_brier_score'], 
    n_jobs=-1)

gbm_pipe = Pipeline(
    [("preprocess", preprocessing), ("regressor", gbm_search)], verbose=True
)

In [32]:
start_time = datetime.now()
gbm_results = gbm_pipe.fit(training_set[categorical_columns + croot_numerical_columns], training_set.default)
gbm1_time = datetime.now() - start_time
gbm_results

[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   0.0s
[Pipeline] ......... (step 2 of 2) Processing regressor, total=  24.2s


In [33]:
gbm_search.best_estimator_

In [34]:
gbm_search.best_score_

0.9004384804384804

In [35]:
gbm_holdout_rmse = mean_squared_error(gbm_pipe.predict(holdout_set[categorical_columns + croot_numerical_columns]), holdout_set['default'], squared=False)
gbm_holdout_rmse

0.2634977250546353

In [37]:
# best estimator on holdout set
diag_df = pd.DataFrame({'Model': ['Logit', 'LASSO Logit', 'RF', 'GBM'],
                        'Holdout RMSE': [logit_holdout_rmse, lasso_holdout_rmse, prob_rf_holdout_rmse, gbm_holdout_rmse],
                        'Training time': [logit3_time, lasso3_time, prob_forest1_time, gbm1_time]})
diag_df

Unnamed: 0,Model,Holdout RMSE,Training time
0,Logit,0.244516,0 days 00:02:15.338164
1,LASSO Logit,0.238526,0 days 00:03:33.718209
2,RF,0.297855,0 days 00:10:33.603558
3,GBM,0.263498,0 days 00:00:24.203978
