# Bibliotecas

In [1]:
# Análise Exploratória
import pandas as pd
import numpy as np
import locale
import missingno as msno
from unidecode import unidecode

# Configurações do Pandas
pd.options.display.max_columns = 300
pd.options.display.max_rows = 50

# Configuração de localização
locale.setlocale(locale.LC_TIME, 'pt_BR.UTF-8')  # Seta os meses, horas e etc para br

# Bibliotecas Gráficas
import matplotlib.pyplot as plt
import seaborn as sns

# Bibliotecas de ML
from sklearn.preprocessing import OneHotEncoder
from joblib import dump

# Carregando Base & Padronização das Colunas

In [2]:
application_df = pd.read_csv(
    r"G:\Meu Drive\Dissertacao\Data\Processado\StepWise Application Data Processado.csv",
    parse_dates = ['data'],
    dtype = {
        'inadimplencia' : 'Int64',
        'score_fragilidade_social' : 'Int64',
        'score_credito' : 'Int64'
    }
)

application_df.head(2)

Unnamed: 0,data,inadimplencia,amostra,score_fragilidade_social,score_credito,porte,is_mei,faturamento_presumido,natureza_juridica,cnae_secao,tempo_dias_entrada_socios_media,tempo_meses_de_existencia_da_matriz_ate_data_referencia,qtde_total_cnae,qnt_socios_administradores,valor_aluguel_reais_igpm,per_capita_reais,cep_consultado_pct_0_ate_1_quarto_salario_minimo,qtde_enderecos_favorecidos_socialmente_2km,qtde_enderecos_desfavorecidos_socialmente_2km,qtde_hospital_3km,pct_trabalhadores_sem_carteira_assinada,pct_cnpj_ativa_opcao_mei_no_cep,pct_abastecimento_agua_canalizacao_total,pct_domicilio_adequado,pct_esgoto_ceu_aberto,pct_arborizacao,pct_desempregados_em_busca_trabalho,pct_possui_automovel,pct_possui_radio,pct_existencia_moto,pct_existencia_geladeira,pct_regiao_ocupada_sim,pct_tempo_gasto_deslocamento_trabalho_0_5_min,pct_tempo_gasto_deslocamento_trabalho_6_30_min,pct_tempo_gasto_deslocamento_trabalho_31_60_min,pct_tempo_gasto_deslocamento_trabalho_mais_120_min,regiao
0,2021-02-10,0,DES,553,842,DEMAIS,False,11000000.0,COOPERATIVA,"Atividades Financeiras, De Seguros E Serviços ...",982.0,590.0,1.0,0,972.634929,5203.0,4.829,3.0,0.0,54.0,10.606,34.43,97.275,99.53,0.0,74.79,5.45,67.505,82.6,88.727,96.855,96.284,16.017,66.45,15.801,0.0,Sudeste
1,2020-07-31,0,DES,783,142,DEMAIS,False,41500.0,SOCIEDADE EMPRESARIA LIMITADA,Comércio - Reparação De Veículos Automotores E...,4107.0,343.0,15.0,1,436.878933,1724.25,4.817,1.0,0.0,14.0,16.518,62.99,97.021,62.51,17.17,43.41,8.718,48.369,89.787,76.305,97.021,95.267,8.681,60.069,28.356,0.231,Sudeste


In [4]:
cat_cols = ['porte', 'natureza_juridica', 'cnae_secao', 'regiao', 'qnt_socios_administradores']

for col in application_df[cat_cols].columns:
    application_df[col] = application_df[col].str.lower().str.replace(' ', '_').str.replace(',', '').str.replace('-', '_').str.replace('___', '_').apply(unidecode)
    
application_df[cat_cols]

Unnamed: 0,porte,natureza_juridica,cnae_secao,regiao,qnt_socios_administradores
0,demais,cooperativa,atividades_financeiras_de_seguros_e_servicos_r...,sudeste,0
1,demais,sociedade_empresaria_limitada,comercio_reparacao_de_veiculos_automotores_e_m...,sudeste,1
2,micro_empresa,empresa_individual_de_responsabilidade_limitad...,industrias_de_transformacao,sudeste,0
3,demais,associacao_privada,outras_atividades_de_servicos,nordeste,0
4,demais,cooperativa,atividades_financeiras_de_seguros_e_servicos_r...,norte,0
...,...,...,...,...,...
18162,empresa_de_pequeno_porte,empresario_individual,transporte_armazenagem_e_correio,sudeste,0
18163,demais,sociedade_empresaria_limitada,comercio_reparacao_de_veiculos_automotores_e_m...,sudeste,0
18164,micro_empresa,sociedade_empresaria_limitada,atividades_profissionais_cientificas_e_tecnicas,nordeste,1
18165,micro_empresa,empresario_individual,comercio_reparacao_de_veiculos_automotores_e_m...,sudeste,0


In [5]:
# Divisão em treino, teste e validação
X_train = application_df.loc[application_df['amostra'] == 'DES', :].drop(['data', 'amostra', 'inadimplencia'], axis = 1)
X_test = application_df.loc[application_df['amostra'] == 'OOS', :].drop(['data', 'amostra', 'inadimplencia'], axis = 1)
X_val = application_df.loc[application_df['amostra'] == 'OOT', :].drop(['data', 'amostra', 'inadimplencia'], axis = 1)

y_train = application_df.loc[application_df['amostra'] == 'DES', 'inadimplencia']
y_test = application_df.loc[application_df['amostra'] == 'OOS', 'inadimplencia']
y_val = application_df.loc[application_df['amostra'] == 'OOT', 'inadimplencia']

# Reset de índice para tudo ficar mais fácil de ser trabalhado
X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)
X_val = X_val.reset_index(drop = True)

y_train = y_train.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)
y_val = y_val.reset_index(drop = True)


# Criação das covariáveis dummies
ohe = OneHotEncoder(sparse_output = False, drop = 'first')
ohe.fit(X_train[cat_cols])


X_train_dummies = pd.DataFrame(ohe.transform(X_train[cat_cols]), columns = ohe.get_feature_names_out())
X_test_dummies = pd.DataFrame(ohe.transform(X_test[cat_cols]), columns = ohe.get_feature_names_out())
X_val_dummies = pd.DataFrame(ohe.transform(X_val[cat_cols]), columns = ohe.get_feature_names_out())

X_train = pd.concat([X_train.drop(cat_cols, axis = 1), X_train_dummies], axis = 1)
X_test = pd.concat([X_test.drop(cat_cols, axis = 1), X_test_dummies], axis = 1)
X_val = pd.concat([X_val.drop(cat_cols, axis = 1), X_val_dummies], axis = 1)

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

# Simulações & Treinamento dos Modelos

In [6]:
from sklearn.linear_model import LogisticRegression
from imbens.ensemble import EasyEnsembleClassifier, BalancedRandomForestClassifier, RUSBoostClassifier, UnderBaggingClassifier, OverBoostClassifier, SMOTEBoostClassifier, SMOTEBaggingClassifier, OverBaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, roc_curve, precision_recall_curve, precision_recall_curve, auc, cohen_kappa_score, fbeta_score
from imblearn.metrics import geometric_mean_score


import time

In [42]:
def ks_score(y_true, y_pred_proba):
    fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)
    ks = max(tpr - fpr)
    return ks

In [16]:
resultados_performance = pd.DataFrame({
        'KS' : [],
        'G-Mean' : [],
        'Precisão' : [],
        'Recall' : [],
        'F1-Score' : [],
        'ROC AUC' : [],
        'Pr-Re AUC' : [],
        'Tempo execução' : []
})


resultados_performance_validacao = pd.DataFrame({
        'KS' : [],
        'G-Mean' : [],
        'Precisão' : [],
        'Recall' : [],
        'F1-Score' : [],
        'ROC AUC' : [],
        'Pr-Re AUC' : [],
        'Tempo execução' : []
})

modelos_utilizados = {
        'Logistic Regression' : GridSearchCV(
            LogisticRegression(),
            param_grid = {
                'penalty' : [None, 'l1', 'l2'],
                'C' : [0.01, 0.1, 0.25, 1],
                'class_weight' : [None, 'balanced']
            },
            n_jobs = -1,
            scoring = 'average_precision'
        ),
        'Decision Tree' : GridSearchCV(
            DecisionTreeClassifier(),
            param_grid = {
                'max_features' : [None, 'sqrt', 'log2'],
                'class_weight' : [None, 'balanced']
            },
            n_jobs = -1,
            scoring = 'average_precision'
        ),
        'AdaBoostClassifier' : GridSearchCV(
            AdaBoostClassifier(),
            param_grid = {
                'n_estimators' : [30, 50, 70, 100, 200],
                'learning_rate' : [0.01, 0.01, 0.25]
            },
            n_jobs = -1,
            scoring = 'average_precision'
        ),
        'XGBClassifier' : GridSearchCV(
            XGBClassifier(),
            param_grid = {
                'n_estimator' : [30, 50, 70, 100, 200],
                'learning_rate' : [0.01, 0.01, 0.25],
                'max_depth' : [1, 3, 5],
                'subsample' : [0.5, 1]
            },
            n_jobs = -1,
            scoring = 'average_precision'
        ),
        'LGBMClassifier' : GridSearchCV(
            LGBMClassifier(verbose = -1),
            param_grid = {
                'n_estimator' : [30, 50, 70, 100, 200],
                'learning_rate' : [0.01, 0.01, 0.25],
                'max_depth' : [1, 3, 5],
                'subsample' : [0.5, 1]
            },
            n_jobs = -1,
            scoring = 'average_precision'
        ),
        'RUSBoost' : GridSearchCV(
            RUSBoostClassifier(),
            param_grid = {
                'n_estimators' : [10, 30, 50, 70, 100, 200],
                'learning_rate' : [0.01, 0.01, 0.25]
            },
            n_jobs = -1,
            scoring = 'average_precision'
        ),
        'UnderBagging' : GridSearchCV(
            UnderBaggingClassifier(),
            param_grid = {
                'n_estimators' : [10, 30, 50, 70, 100],
                'max_samples' : [0.5, 1]
            },
            n_jobs = -1,
            scoring = 'average_precision'
        ),
        'BalancedRandomForest' : GridSearchCV(
            BalancedRandomForestClassifier(),
            param_grid = {
                'n_estimators' : [30, 50, 70, 100, 200],
                'max_depth' : [3, 5, 7]
            },
            n_jobs = -1,
            scoring = 'average_precision'
        ),
        # 'OverBoost' : OverBoostClassifier(),
        # 'SMOTEBoost' : SMOTEBoostClassifier(),
        # 'OverBagging' : OverBaggingClassifier(),
        # 'SMOTEBagging' : SMOTEBaggingClassifier()
}

for model_name, model in modelos_utilizados.items():
    print(model_name)
    start_time = time.time()
    
    model.fit(X_train, y_train)
    
    end_time = time.time()
    execution_time = end_time - start_time

    y_pred_proba = model.predict_proba(X_test)[:, 1:]
    
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    # Calcule a área sob a curva de precisão-recall
    precision_recall_auc = auc(recall, precision)

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    melhor_ponto_corte = thresholds[np.argmax(tpr - fpr)]
    y_pred = np.array([1 if y_prob >= melhor_ponto_corte else 0 for y_prob in y_pred_proba])

    resultados_performance.loc[model_name, :] = [
        ks_score(y_test, y_pred_proba),
        geometric_mean_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        roc_auc_score(y_test, y_pred_proba),
        precision_recall_auc,
        execution_time
    ]
    
    
    y_pred_proba_val = model.predict_proba(X_val)[:, 1:]
    
    precision, recall, thresholds = precision_recall_curve(y_val, y_pred_proba_val)
    # Calcule a área sob a curva de precisão-recall
    precision_recall_auc = auc(recall, precision)

    fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba_val)
    melhor_ponto_corte = thresholds[np.argmax(tpr - fpr)]
    y_pred_val = np.array([1 if y_prob >= melhor_ponto_corte else 0 for y_prob in y_pred_proba_val])

    resultados_performance_validacao.loc[model_name, :] = [
        ks_score(y_val, y_pred_proba_val),
        geometric_mean_score(y_val, y_pred_val),
        precision_score(y_val, y_pred_val),
        recall_score(y_val, y_pred_val),
        f1_score(y_val, y_pred_val),
        roc_auc_score(y_val, y_pred_proba_val),
        precision_recall_auc,
        execution_time
    ]
    
    dump(model, rf'G:\Meu Drive\Dissertacao\Modelos\StepWise\{model_name}.joblib')

resultados_performance.sort_values('F1-Score', ascending = False)

Logistic Regression
Decision Tree
AdaBoostClassifier
XGBClassifier
LGBMClassifier
EasyEnsemble
RUSBoost
UnderBagging
BalancedRandomForest


Unnamed: 0,G-Mean,Precisão,Recall,F1-Score,ROC AUC,Pr-Re AUC,Tempo execucao
LGBMClassifier,0.636537,0.093541,0.597156,0.161746,0.69284,0.122968,32.293382
Decision Tree,0.31411,0.097345,0.104265,0.100686,0.52339,0.121495,2.072632
XGBClassifier,0.644221,0.092414,0.635071,0.161349,0.696104,0.12042,41.45215
AdaBoostClassifier,0.652048,0.094613,0.649289,0.16516,0.694936,0.119901,47.215824
EasyEnsemble,0.666266,0.100215,0.663507,0.174129,0.698224,0.115079,1947.132893
RUSBoost,0.655563,0.09,0.725118,0.160126,0.696981,0.114534,17.33137
BalancedRandomForest,0.667566,0.099649,0.672986,0.173594,0.703937,0.10709,7.256804
Logistic Regression,0.645664,0.088469,0.687204,0.156757,0.671837,0.10296,4.652004
UnderBagging,0.64455,0.088344,0.682464,0.156437,0.682636,0.093967,7.178249


In [32]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler()

X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

modelo_teste = modelos_utilizados['Logistic Regression']

modelo_teste.fit(X_train_ros, y_train_ros)

In [41]:
resultados_performance.sort_values('Recall', ascending = False)

Unnamed: 0,G-Mean,Precisão,Recall,F1-Score,ROC AUC,Pr-Re AUC,Tempo execucao
RUSBoost,0.655563,0.09,0.725118,0.160126,0.696981,0.114534,17.33137
Logistic Regression,0.645664,0.088469,0.687204,0.156757,0.671837,0.10296,4.652004
UnderBagging,0.64455,0.088344,0.682464,0.156437,0.682636,0.093967,7.178249
BalancedRandomForest,0.667566,0.099649,0.672986,0.173594,0.703937,0.10709,7.256804
EasyEnsemble,0.666266,0.100215,0.663507,0.174129,0.698224,0.115079,1947.132893
AdaBoostClassifier,0.652048,0.094613,0.649289,0.16516,0.694936,0.119901,47.215824
XGBClassifier,0.644221,0.092414,0.635071,0.161349,0.696104,0.12042,41.45215
LGBMClassifier,0.636537,0.093541,0.597156,0.161746,0.69284,0.122968,32.293382
Decision Tree,0.31411,0.097345,0.104265,0.100686,0.52339,0.121495,2.072632


In [40]:
resultados_performance_validacao.sort_values('Recall', ascending = False)

Unnamed: 0,G-Mean,Precisão,Recall,F1-Score,ROC AUC,Pr-Re AUC,Tempo execucao
EasyEnsemble,0.604545,0.063894,0.78972,0.118223,0.665974,0.100584,1947.132893
UnderBagging,0.634528,0.071744,0.682243,0.129835,0.68385,0.092812,7.178249
Logistic Regression,0.598252,0.062693,0.663551,0.114562,0.618631,0.061429,4.652004
BalancedRandomForest,0.647748,0.078081,0.654206,0.139512,0.689501,0.103787,7.256804
AdaBoostClassifier,0.632864,0.083507,0.560748,0.145366,0.687247,0.097366,47.215824
XGBClassifier,0.641511,0.090357,0.556075,0.155454,0.697492,0.101371,41.45215
LGBMClassifier,0.641417,0.090288,0.556075,0.155352,0.69832,0.102846,32.293382
RUSBoost,0.620902,0.080166,0.542056,0.139675,0.663917,0.104756,17.33137
Decision Tree,0.327244,0.103448,0.11215,0.107623,0.532836,0.126151,2.072632


---