In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


<br>
<br>
<br>
<br>

# **Modelo Baseline**

Vamos criar um modelo básico, sem muitas técnicas e métodos, utilizando somente a tabela de **application_train** que já temos disponível, e treinar alguns algoritmos para saber como eles se saem em termos de desempenho e, a partir daí, tentar melhorar essa performance, seja criando novas features, seja utilizando outras técnicas, tratamentos etc.
<br>
<br>
Fazer um modelo baseline é essencial ao treinar um modelo de machine learning, pois fornece uma referência inicial para entender a complexidade do problema, estabelece um benchmark para comparação com modelos mais avançados, identifica problemas nos dados ou formulação do problema e facilita a comunicação com stakeholders ao oferecer uma referência compreensível para discutir desempenho e resultados esperados do modelo.

<br>

## **Pacotes e Bibliotecas**

In [None]:
# Instalação dos Pacotes.
!pip install colorama > /dev/null
!pip install catboost > /dev/null
!pip install category_encoders > /dev/null

In [None]:
# Importando as bibliotecas Pandas e Numpy.
import pandas as pd
import numpy as np

# Importando a função train_test_split para a divisão do nosso dataset em treino e teste.
from sklearn.model_selection import train_test_split

# Importando a classe Pipeline para simplificar o processo de pré-processamento.
from sklearn.pipeline import Pipeline

# Importando as bibliotecas necessárias para o tratamento dos dados.
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Importando os algoritmos de Machine Learning, da biblioteca Scikit-Learn, que iremos utilizar nesse projeto.
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Importando as bibliotecas para avaliação dos modelos.
from sklearn.metrics import confusion_matrix, roc_curve, precision_recall_curve, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, classification_report, auc

# Importando a pickle para serialização.
import pickle

# Importando as constantes definidas na biblioteca colorama que permitem alterar as cores e estilos de texto impresso no terminal.
from colorama import Fore, Style

<br>
<br>
<br>
<br>

## **Funções**

In [None]:
# Função para checar quantidade de linhas e colunas dos conjuntos de treino e teste.
def info_dataframe(treino, teste):
    '''
    Print the number of rows and columns of the train and test sets.

    Each set is printed inside a '=' frame, coloured blue for the train set
    and green for the test set.

    Parameters:
    - treino: DataFrame holding the train set.
    - teste: DataFrame holding the test set.

    Returns:
    None; the information is written to standard output.
    '''

    width = 45
    frame = '=' * width

    # One entry per framed section: colour, title and the DataFrame it describes.
    sections = (
        (Fore.BLUE, 'CONJUNTO DE TREINO', treino),
        (Fore.GREEN, 'CONJUNTO DE TESTE', teste),
    )

    for position, (colour, title, dataset) in enumerate(sections):
        # A blank line separates consecutive sections (none after the last one).
        if position > 0:
            print('')

        print(colour + frame)
        print(title.center(width))
        print('')
        print('Quantidade de linhas:', dataset.shape[0])
        print('Quantidade de colunas:', dataset.shape[1])
        print(frame + Style.RESET_ALL)



# =================================================================================================================================================== #



# Função para o cálculo das métricas.
def _ks_statistic(y_true, y_score):
    '''
    Kolmogorov-Smirnov statistic between the score distributions of the two classes.

    Rows that share the same score are grouped before the cumulative
    distributions are built, so the result does not depend on the arbitrary
    order in which tied predictions happen to be sorted.

    Parameters:
    - y_true: binary labels encoded as 0/1.
    - y_score: predicted score of the positive class, one per label.

    Returns:
    The maximum absolute gap between the cumulative share of positives and
    the cumulative share of negatives, scanning scores from highest to lowest.
    '''
    scored = pd.DataFrame({
        'true_labels': np.asarray(y_true),
        'predicted_probs': np.asarray(y_score)
    })

    # One row per distinct score: number of positives and total rows at that score.
    per_score = (
        scored.groupby('predicted_probs')['true_labels']
        .agg(['sum', 'count'])
        .sort_index(ascending=False)
    )
    positives = per_score['sum']
    negatives = per_score['count'] - per_score['sum']

    cumulative_true = positives.cumsum() / positives.sum()
    cumulative_false = negatives.cumsum() / negatives.sum()

    return (cumulative_true - cumulative_false).abs().max()


def _split_metrics(model, X, y):
    '''
    Compute every reported metric for a single data split.

    Returns a dict keyed by the metric column names used in the final table.
    '''
    predictions = model.predict(X)
    # Probability of the positive class; computed once and reused for AUC, Gini and KS.
    probabilities = model.predict_proba(X)[:, 1]

    auc_roc = roc_auc_score(y, probabilities)

    return {
        'Acuracia': accuracy_score(y, predictions),
        # zero_division=1: precision is reported as 1 when the model predicts no positives.
        'Precisao': precision_score(y, predictions, zero_division=1),
        'Recall': recall_score(y, predictions),
        'AUC_ROC': auc_roc,
        'GINI': 2 * auc_roc - 1,
        'KS': _ks_statistic(y, probabilities)
    }


def calculate_metrics(nm_modelo, model, X_train, y_train, X_test, y_test):
    '''
    Compute performance metrics of a fitted model on the train and test sets.

    Parameters:
    - nm_modelo: Name of the model (used to label the rows).
    - model: Fitted model exposing predict and predict_proba.
    - X_train: Train DataFrame with the explanatory variables.
    - y_train: Train target (binary, 0/1).
    - X_test: Test DataFrame with the explanatory variables.
    - y_test: Test target (binary, 0/1).

    Returns:
    A DataFrame with two rows ('Treino' and 'Teste') and the columns
    Algoritmo, Conjunto, Acuracia, Precisao, Recall, AUC_ROC, GINI and KS.
    '''

    train_metrics = _split_metrics(model, X_train, y_train)
    test_metrics = _split_metrics(model, X_test, y_test)

    # Assemble the table column by column so the column order is explicit.
    table = {
        'Algoritmo': [nm_modelo, nm_modelo],
        'Conjunto': ['Treino', 'Teste']
    }
    for metric in ['Acuracia', 'Precisao', 'Recall', 'AUC_ROC', 'GINI', 'KS']:
        table[metric] = [train_metrics[metric], test_metrics[metric]]

    metrics_df = pd.DataFrame(table)

    return metrics_df

<br>
<br>
<br>
<br>

## **Leitura dos Dados**

In [None]:
# Storing the project directory in a variable.
# NOTE(review): the name `dir` shadows the Python builtin of the same name; renaming it would require updating every cell that uses it.
dir = '/content/drive/MyDrive/Projetos_Big_Data_Analytics/Ciencia_de_Dados/Etapa_Modelagem_Credito/pod-academy-analise-de-credito-para-fintech'

# Reading the 'application_train' table.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a saved index being read back as data — confirm whether it should be kept.
df_00 = pd.read_csv(dir + '/database/application_train.csv', sep=',', encoding='latin-1')

df_00.shape

(215257, 172)

In [None]:
# Show every column when displaying DataFrames (no column truncation).
pd.set_option('display.max_columns', None)

# Preview the first rows of the raw table.
df_00.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_1
8,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49,var_50
0,247330,0,Cash loans,F,N,N,0,157500.0,706410.0,67072.5,679500.0,Unaccompanied,Commercial associate,Higher education,Married,House / apartment,0.032561,-14653,-2062,-8599.0,-2087,,1,1,0,1,1,0,Private service staff,2.0,1,1,WEDNESDAY,13,0,0,0,0,0,0,Services,,0.632424,0.220095,,0.105,,,,,,,,,,,,,,0.109,,,,,,,,,,,,,,0.105,,,,,,,,,,,,,,,0.0702,Panel,No,1.0,0.0,1.0,0.0,-1254.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.38134,0.253773,0.205728,0.808261,0.9177,0.487698,0.955921,0.089342,0.519432,0.667806,0.33332,0.873508,0.293837,0.758751,0.97264,0.813237,0.398762,0.060109,0.432021,0.711729,0.455977,0.532977,0.615955,0.005083,0.465449,0.145924,0.026534,0.562217,0.380997,0.634713,0.322195,0.677877,0.518137,0.284267,0.896499,0.260938,0.030923,0.052023,0.969193,0.984378,0.824762,0.333516,0.29326,0.564878,0.115058,0.655605,0.415562,0.092643,0.723331,0.796523
1,425716,1,Cash loans,F,Y,Y,1,121500.0,545040.0,25407.0,450000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.007114,-13995,-2246,-348.0,-172,12.0,1,1,1,1,1,0,Secretaries,3.0,2,2,MONDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.593456,0.695997,0.633032,0.668,,0.9856,,,,,,,,,0.6817,,,0.6807,,0.9856,,,,,,,,,0.7102,,,0.6745,,0.9856,,,,,,,,,0.6939,,,,block of flats,0.5501,"Stone, brick",No,1.0,0.0,1.0,0.0,-907.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.936515,0.179481,0.843631,0.520029,0.907421,0.442279,0.305319,0.125968,0.925484,0.198714,0.793117,0.920624,0.587697,0.193858,0.720867,0.347189,0.906016,0.329694,0.802493,0.150473,0.418284,0.868025,0.254219,0.956146,0.347596,0.341439,0.744123,0.045891,0.978561,0.961868,0.985735,0.547768,0.822529,0.392172,0.463642,0.5239,0.397622,0.483889,0.599514,0.101305,0.41626,0.404293,0.137944,0.457971,0.303691,0.215059,0.838892,0.608335,0.585643,0.298456
2,331625,0,Cash loans,M,Y,Y,1,225000.0,942300.0,27679.5,675000.0,Unaccompanied,Working,Secondary / secondary special,Married,Municipal apartment,0.022625,-21687,-1335,-6306.0,-4026,1.0,1,1,0,1,0,0,Laborers,3.0,2,2,THURSDAY,10,0,0,0,0,0,0,Self-employed,,0.667686,0.607557,0.6443,0.3483,0.9791,0.7144,0.3331,0.72,0.6207,0.3333,0.375,0.372,0.5253,0.6223,0.0,0.0,0.6565,0.3615,0.9791,0.7256,0.3361,0.725,0.6207,0.3333,0.375,0.3804,0.5739,0.6483,0.0,0.0,0.6506,0.3483,0.9791,0.7182,0.3352,0.72,0.6207,0.3333,0.375,0.3784,0.5344,0.6334,0.0,0.0,reg oper account,block of flats,0.6714,Panel,No,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.499898,0.9843,0.145573,0.763623,0.517144,0.251938,0.778078,0.778633,0.711306,0.606748,0.808992,0.594886,0.861306,0.225132,0.578306,0.007019,0.651399,0.145081,0.724807,0.154568,0.379459,0.901351,0.569352,0.36635,0.004014,0.151749,0.197556,0.512563,0.932741,0.427496,0.737803,0.399106,0.900378,0.348174,0.614347,0.934229,0.006252,0.547868,0.47908,0.600169,0.037711,0.124465,0.09184,0.364601,0.97822,0.520309,0.594523,0.55965,0.361873,0.254804
3,455397,0,Revolving loans,F,N,Y,2,144000.0,180000.0,9000.0,180000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Separated,House / apartment,0.006629,-13071,-2292,-742.0,-1201,,1,1,1,1,1,0,Cooking staff,3.0,2,2,MONDAY,8,0,0,0,0,0,0,Restaurant,,0.314634,0.427657,0.0261,0.0,0.9881,0.864,,0.0,0.0803,0.0692,,0.0085,,0.019,,0.0,0.0189,0.0,0.9871,0.8693,,0.0,0.1034,0.0833,,0.0068,,0.0114,,0.0,0.0281,0.0,0.9871,0.8658,,0.0,0.1034,0.0833,,0.0088,,0.0195,,0.0,reg oper account,block of flats,0.018,Block,No,0.0,0.0,0.0,0.0,-394.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,2.0,0.315107,0.766009,0.065409,0.20537,0.426937,0.669344,0.490902,0.392566,0.346318,0.025155,0.760782,0.530627,0.848179,0.759807,0.754668,0.795626,0.242511,0.802291,0.026778,0.78787,0.355061,0.13229,0.246993,0.506481,0.684924,0.23369,0.804141,0.010132,0.932631,0.09054,0.683468,0.365466,0.280388,0.670943,0.850415,0.759835,0.979863,0.922059,0.950338,0.822062,0.78463,0.831403,0.210872,0.049639,0.814219,0.830179,0.755163,0.216664,0.603002,0.429001
4,449114,0,Cash loans,F,N,Y,0,112500.0,729792.0,37390.5,630000.0,Unaccompanied,Pensioner,Secondary / secondary special,Civil marriage,House / apartment,0.04622,-19666,365243,-169.0,-3112,,1,0,0,1,0,0,,2.0,1,1,FRIDAY,10,0,0,0,0,0,0,XNA,0.599579,0.505944,0.239226,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.448089,0.374995,0.606112,0.488309,0.321528,0.338132,0.866652,0.102163,0.449259,0.604513,0.691355,0.978607,0.978347,0.755776,0.037144,0.20308,0.492793,0.334378,0.44721,0.295394,0.620938,0.322072,0.264834,0.209451,0.654736,0.264538,0.13339,0.91864,0.890932,0.243396,0.108121,0.472345,0.045164,0.746089,0.469676,0.594308,0.961698,0.608098,0.041375,0.767341,0.265381,0.655344,0.668705,0.171391,0.335702,0.585494,0.619551,0.686738,0.540449,0.343632


<br>
<br>
<br>
<br>

## **Validação Cruzada tipo Holdout utilizando modo Out-of-Sample**

A validação cruzada do tipo holdout consiste em dividir o dataset em um conjunto de treino e um conjunto de teste. Ela é de extrema importância para avaliarmos, ajustarmos e selecionarmos os modelos de aprendizado de máquina de forma robusta e confiável, garantindo que eles sejam capazes de generalizar para novos dados e resolver efetivamente o problema em questão. Vamos então separar o nosso dataset em 80% para treino e 20% para teste.

In [None]:
# Separating the target from the explanatory variables.
# NOTE(review): X keeps every other column, including 'Unnamed: 0' and 'SK_ID_CURR', which look like row/client identifiers — confirm they are meant to be used as features.
X = df_00.drop(columns=['TARGET'])
y = df_00['TARGET']

In [None]:
# Splitting the data into train and test sets (80% train, 20% test) with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified on y; if TARGET is imbalanced, consider stratify=y — confirm before changing, since it alters the baseline numbers.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Printing the number of rows and columns of each set.
info_dataframe(X_train, X_test)

              CONJUNTO DE TREINO             

Quantidade de linhas: 172205
Quantidade de colunas: 171

              CONJUNTO DE TESTE              

Quantidade de linhas: 43052
Quantidade de colunas: 171


In [None]:
# Preview the first rows of the train features.
X_train.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_
DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49,var_50
54931,281575,Cash loans,F,Y,Y,0,202500.0,625500.0,34060.5,625500.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.028663,-19489,-1075,-3962.0,-3011,3.0,1,1,0,1,0,0,Managers,2.0,2,2,THURSDAY,9,0,0,0,0,1,1,Trade: type 7,0.758916,0.526224,0.51009,0.0619,0.0841,0.9771,0.6872,0.0079,0.0,0.1379,0.1667,0.2083,0.0458,0.0471,0.053,0.0154,0.0456,0.063,0.0873,0.9772,0.6994,0.0079,0.0,0.1379,0.1667,0.2083,0.0469,0.0514,0.0552,0.0156,0.0483,0.0625,0.0841,0.9771,0.6914,0.0079,0.0,0.1379,0.1667,0.2083,0.0466,0.0479,0.0539,0.0155,0.0466,reg oper account,block of flats,0.0559,"Stone, brick",No,1.0,0.0,1.0,0.0,-1497.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,0.572098,0.895317,0.666734,0.547289,0.131815,0.762465,0.411189,0.94581,0.907047,0.779933,0.28941,0.226077,0.217647,0.566837,0.395197,0.333989,0.006813,0.46167,0.770898,0.876823,0.744295,0.668739,0.685141,0.659498,0.440569,0.004373,0.791284,0.075672,0.59523,0.235064,0.985042,0.576337,0.350662,0.084907,0.64452,0.48717,0.427249,0.031264,0.699787,0.831006,0.471336,0.264834,0.010173,0.53029,0.102242,0.224055,0.630836,0.543184,0.969463,0.7639
105184,253241,Cash loans,M,N,Y,0,112500.0,278811.0,14364.0,189000.0,"Spouse, partner",Working,Secondary / secondary special,Married,House / apartment,0.030755,-21070,-1099,-5917.0,-4598,,1,1,1,1,0,0,Laborers,2.0,2,2,SUNDAY,9,0,0,0,0,0,0,Business Entity Type 3,,0.698457,0.367291,0.1113,0.0653,0.9826,0.762,0.0227,0.12,0.1034,0.3333,0.375,0.0713,0.0883,0.1114,0.0116,0.004,0.1134,0.0678,0.9826,0.7713,0.0229,0.1208,0.1034,0.3333,0.375,0.073,0.0964,0.1161,0.0117,0.0042,0.1124,0.0653,0.9826,0.7652,0.0229,0.12,0.1034,0.3333,0.375,0.0726,0.0898,0.1134,0.0116,0.0041,reg oper account,block of flats,0.1009,Panel,No,0.0,0.0,0.0,0.0,-1087.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,0.35173,0.41307,0.211171,0.537147,0.580967,0.13541,0.326075,0.683109,0.694268,0.102148,0.026424,0.937358,0.671953,0.051573,0.851337,0.80958,0.74458,0.208471,0.100114,0.919941,0.404974,0.222986,0.729671,0.167075,0.129317,0.388579,0.118802,0.940996,0.511282,0.463729,0.95138,0.174999,0.235495,0.65313,0.180304,0.815912,0.258683,0.369354,0.999755,0.58337,0.8118,0.906224,0.507021,0.391292,0.811013,0.988231,0.411279,0.733242,0.438364,0.979158
49525,257591,Cash loans,F,N,N,1,90000.0,193500.0,20529.0,193500.0,Unaccompanied,Working,Incomplete higher,Separated,House / apartment,0.015221,-11513,-2619,-2453.0,-2602,,1,1,0,1,1,0,Managers,2.0,2,2,WEDNESDAY,20,0,0,0,0,0,0,Business Entity Type 3,0.287588,0.681713,0.150085,0.0619,0.0614,0.9831,,,0.0,0.1379,0.1667,,0.0425,,0.0534,,0.0,0.063,0.0637,0.9831,,,0.0,0.1379,0.1667,,0.0435,,0.0557,,0.0,0.0625,0.0614,0.9831,,,0.0,0.1379,0.1667,,0.0432,,0.0544,,0.0,,block of flats,0.0463,Panel,No,0.0,0.0,0.0,0.0,-253.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.47529,0.393834,0.752983,0.896069,0.589042,0.459099,0.471386,0.547479,0.436461,0.377657,0.37566,0.352266,0.303279,0.977967,0.463595,0.616656,0.33816,0.317583,0.451255,0.430385,0.130966,0.989194,0.596087,0.899408,0.320739,0.477462,0.051342,0.892001,0.593667,0.272567,0.653061,0.727226,0.296557,0.977834,0.568195,0.019867,0.843434,0.59182,0.981747,0.664422,0.769413,0.253369,0.630179,0.751577,0.456636,0.226,0.140894,0.181158,0.798929,0.212588
161471,440634,Cash loans,F,N,Y,0,90000.0,135000.0,7668.0,135000.0,Unaccompanied,Pensioner,Secondary / secondary special,Married,House / apartment,0.028663,-21850,365243,-2921.0,-4505,,1,0,0,1,1,0,,2.0,2,2,MONDAY,12,0,0,0,0,0,0,XNA,,0.25231,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-2871.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.695244,0.034982,0.385054,0.569582,0.742382,0.107622,0.941336,0.118326,0.361828,0.352864,0.580449,0.275319,0.926335,0.440895,0.156727,0.034869,0.083981,0.227421,0.514021,0.970859,0.369098,0.566219,0.281436,0.758343,0.091105,0.787326,0.311799,0.131824,0.091539,0.086925,0.015665,0.743101,0.638351,0.421537,0.102328,0.212616,0.770783,0.93646,0.613894,0.94576,0.994137,0.698837,0.958245,0.478623,0.779404,0.071388,0.90932,0.178555,0.955696,0.238835
39438,101232,Cash loans,F,N,Y,1,189000.0,1288350.0,37800.0,1125000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.006671,-13273,-6480,-4053.0,-4053,,1,1,0,1,0,0,Waiters/barmen staff,3.0,2,2,FRIDAY,18,0,0,0,0,0,0,Business Entity Type 3,,0.161661,0.490258,0.0299,0.0465,0.9856,,,0.0,0.1379,0.0833,,,,0.0291,,0.0037,0.0305,0.0482,0.9856,,,0.0,0.1379,0.0833,,,,0.0304,,0.0039,0.0302,0.0465,0.9856,,,0.0,0.1379,0.0833,,,,0.0297,,0.0038,,block of flats,0.0267,"Stone, brick",No,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.915559,0.708479,0.929546,0.020011,0.42227,0.889473,0.659386,0.696708,0.118795,0.113957,0.734756,0.007492,0.157285,0.896614,0.539224,0.796315,0.025607,0.233157,0.043198,0.391333,0.927635,0.649995,0.571242,0.529131,0.043954,0.531916,0.686774,0.727816,0.81053,0.466825,0.169806,0.051798,0.974093,0.160077,0.960239,0.673859,0.563534,0.219257,0.144667,0.912435,0.989871,0.73681,0.675635,0.959182,0.851175,0.407824,0.207485,0.54949,0.618668,0.236607


In [None]:
# Preview the first rows of the test features.
X_test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_
DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49,var_50
196348,243431,Revolving loans,M,N,Y,0,90000.0,180000.0,9000.0,180000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Single / not married,With parents,0.031329,-9579,-489,-9175.0,-1161,,1,1,0,1,0,0,Laborers,1.0,2,2,WEDNESDAY,16,0,0,0,0,0,0,Business Entity Type 3,0.217777,0.634658,0.554947,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-393.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,0.5536,0.735979,0.037344,0.338258,0.545242,0.841952,0.024305,0.615976,0.18746,0.588908,0.184133,0.216715,0.295191,0.088721,0.783669,0.926316,0.825076,0.502182,0.08837,0.89717,0.443596,0.505481,0.760015,0.290377,0.252905,0.993413,0.954151,0.111669,0.216014,0.629482,0.292913,0.325685,0.409491,0.00087,0.100324,0.988315,0.332587,0.23148,0.526877,0.678298,0.593565,0.789703,0.254486,0.969768,0.918087,0.939984,0.941608,0.053827,0.571771,0.186664
147976,127962,Cash loans,F,N,N,0,225000.0,781920.0,42547.5,675000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,0.018801,-20151,-3330,-10255.0,-3468,,1,1,0,1,0,0,Laborers,1.0,2,2,MONDAY,11,0,0,0,0,0,0,Business Entity Type 3,0.804014,0.501598,0.384207,0.0082,0.0,0.9687,0.5716,0.0026,0.0,0.0345,0.0417,0.0833,0.0135,0.0067,0.0075,0.0,0.0,0.0084,0.0,0.9687,0.5884,0.0026,0.0,0.0345,0.0417,0.0833,0.0138,0.0073,0.0079,0.0,0.0,0.0083,0.0,0.9687,0.5773,0.0026,0.0,0.0345,0.0417,0.0833,0.0137,0.0068,0.0077,0.0,0.0,reg oper account,block of flats,0.0074,Wooden,No,2.0,1.0,2.0,0.0,-2005.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0,0.05563,0.969683,0.143565,0.064058,0.807627,0.711164,0.01274,0.556794,0.01322,0.55256,0.426758,0.691886,0.36791,0.560955,0.803954,0.494718,0.307866,0.047729,0.384242,0.988515,0.026243,0.267994,0.231065,0.425559,0.617888,0.28429,0.020444,0.552369,0.461655,0.750744,0.294814,0.826447,0.731267,0.703634,0.405295,0.985253,0.666051,0.595347,0.86578,0.637372,0.34632,0.912834,0.910227,0.784684,0.485868,0.812601,0.425869,0.335348,0.878763,0.792996
52662,244667,Cash loans,M,N,Y,1,112500.0,450000.0,21888.0,450000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.019689,-11641,-370,-218.0,-3796,,1,1,1,1,0,0,Laborers,3.0,2,2,THURSDAY,9,0,0,0,0,0,0,Construction,,0.278945,0.300108,0.0021,,0.9707,,,0.0,0.069,0.0,,0.0206,,0.0015,,0.0,0.0021,,0.9707,,,0.0,0.069,0.0,,0.021,,0.0016,,0.0,0.0021,,0.9707,,,0.0,0.069,0.0,,0.0209,,0.0015,,0.0,,terraced house,0.0012,Wooden,No,1.0,0.0,1.0,0.0,-1022.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,2.0,4.0,0.963148,0.610721,0.910276,0.985248,0.993294,0.979476,0.502643,0.153952,0.100604,0.582648,0.268764,0.264136,0.798616,0.640491,0.581764,0.113877,0.599859,0.384104,0.882926,0.656345,0.409666,0.822635,0.729645,0.389186,0.172874,0.63422,0.060167,0.666173,0.432991,0.32044,0.183687,0.072003,0.058414,0.273381,0.818123,0.553403,0.393062,0.149542,0.06534,0.581393,0.973969,0.671805,0.428278,0.189641,0.765047,0.931675,0.944781,0.270274,0.296258,0.518378
101577,220032,Cash loans,F,N,Y,0,225000.0,760225.5,32337.0,679500.0,Unaccompanied,Working,Secondary / secondary special,Married,With parents,0.00733,-10035,-144,-5885.0,-677,,1,1,0,1,0,0,Laborers,2.0,2,2,TUESDAY,15,0,0,0,1,1,1,Business Entity Type 3,0.279232,0.213085,0.556727,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,0.0,2.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.699803,0.42191,0.417316,0.528927,0.405325,0.816989,0.633609,0.482975,0.711495,0.47159,0.302225,0.073788,0.630507,0.419388,0.652081,0.229409,0.615194,0.231686,0.377791,0.876791,0.300196,0.789843,0.422809,0.209413,0.377615,0.839231,0.662908,0.630662,0.854113,0.623451,0.717409,0.920094,0.191716,0.036136,0.028467,0.833765,0.772909,0.940978,0.171637,0.786283,0.766011,0.533777,0.995612,0.998489,0.31854,0.923508,0.271327,0.558144,0.926976,0.794804
173078,123746,Cash loans,F,N,N,0,225000.0,808650.0,26217.0,675000.0,Family,State servant,Higher education,Married,House / apartment,0.006207,-16462,-8468,-8477.0,0,,1,1,0,1,0,0,Core staff,2.0,2,2,TUESDAY,16,0,0,0,0,0,0,School,0.583032,0.528639,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1322.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,0.733595,0.21442,0.649903,0.75118,0.202763,0.696091,0.745274,0.357072,0.458827,0.731857,0.954352,0.428362,0.881391,0.282845,0.719461,0.592143,0.747439,0.873098,0.132575,0.531037,0.930192,0.47,0.278429,0.886347,0.547333,0.115353,0.362414,0.48431,0.293341,0.334486,0.414042,0.021479,0.300477,0.658067,0.759636,0.228552,0.949061,0.452163,0.243199,0.534236,0.60826,0.155219,0.634449,0.650114,0.198636,0.20231,0.19313,0.437779,0.901853,0.126509


<br>
<br>
<br>
<br>

## **Pipeline de Preparação dos Dados (DataPrep)**

A etapa de preparação dos dados (dataprep) é crucial para o sucesso da modelagem de machine learning, pois os dados precisam estar limpos, organizados e em um formato adequado para serem utilizados pelos algoritmos de aprendizado de máquina. Isso inclui garantir a qualidade dos dados, ajustar o formato para atender aos requisitos dos algoritmos, garantir eficiência computacional, promover interpretabilidade e explicabilidade dos modelos e assegurar generalização e robustez.
<br>
<br>
Vamos utilizar a classe **Pipeline** da biblioteca **Sklearn** para simplificar os processos e treinar os modelos de forma mais rápida, pois nesse momento só queremos visualizar o resultado final, que é o desempenho dos modelos.
<br>
<br>
Vamos identificar as variáveis numéricas e as variáveis categóricas. Para as variáveis categóricas vamos imputar a moda nos valores missing e encodificar os dados com o **Target Encoder**. Para as variáveis numéricas vamos imputar a mediana nos valores missing e padronizar os dados com **Standard Scaler**.

In [None]:
# Splitting the features by dtype: 'object' columns are treated as categorical, all others as numeric.
cat_attributes = X_train.select_dtypes(include='object')
num_attributes = X_train.select_dtypes(exclude='object')

# Pipeline for the categorical variables: impute the mode, then target-encode.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', TargetEncoder())
])

# Pipeline for the numeric variables: impute the median, then standardise.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Combining both pipelines.
preprocessor = ColumnTransformer([
    ('cat', cat_pipe, cat_attributes.columns),
    ('num', num_pipe, num_attributes.columns)
])
processed = Pipeline(steps=[('preprocessor', preprocessor)])

# The ColumnTransformer outputs its blocks in the order the transformers are
# declared (categorical columns first, numeric columns second), NOT in the
# original column order of X_train. The output must therefore be labelled with
# that order; reusing X_train.columns would attach names to the wrong data.
processed_columns = list(cat_attributes.columns) + list(num_attributes.columns)

# Applying the transformations: fit on the train set only, then reuse the fitted steps on the test set.
X_train_processed = pd.DataFrame(processed.fit_transform(X_train, y_train), columns=processed_columns)
X_test_processed = pd.DataFrame(processed.transform(X_test), columns=processed_columns)

<br>
<br>
<br>
<br>

## **Treinamento dos Modelos**

Vamos treinar o modelo baseline com os principais algoritmos de machine learning e no final gerar um DataFrame com as principais métricas de cada um deles, para verificarmos qual obteve o melhor desempenho.

In [None]:
# Candidate algorithms for the baseline. Every estimator uses the same seed and
# has its training log output silenced.
algoritmos = [DecisionTreeClassifier(criterion='gini', random_state=42),
              LogisticRegression(solver='liblinear', random_state=42),
              RandomForestClassifier(random_state=42, verbose=0),
              GradientBoostingClassifier(random_state=42),
              XGBClassifier(random_state=42, verbosity=0),
              # `silent` was deprecated in LightGBM 3.3 and removed in 4.0;
              # verbose=-1 alone is enough to silence the training output.
              lgb.LGBMClassifier(random_state=42, verbose=-1),
              CatBoostClassifier(random_state=42, verbose=False)]

# Per-algorithm metric tables, collected so they can be compared side by side.
metricas_por_algoritmo = []

# Train and evaluate each algorithm on the same processed train/test split.
for algoritmo in algoritmos:
    nome_algoritmo = algoritmo.__class__.__name__

    # Fit the model on the processed training data.
    algoritmo.fit(X_train_processed, y_train)

    # Evaluate on train and test; shown immediately as progress feedback.
    metricas = calculate_metrics(nome_algoritmo, algoritmo, X_train_processed, y_train, X_test_processed, y_test)
    display(metricas)
    metricas_por_algoritmo.append(metricas)

# Single consolidated table with the metrics of every algorithm.
# NOTE(review): assumes calculate_metrics returns a pandas DataFrame, as the
# tabular outputs suggest - confirm against its definition.
metricas_modelos = pd.concat(metricas_por_algoritmo, ignore_index=True)
display(metricas_modelos)

Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,DecisionTreeClassifier,Treino,1.0,1.0,1.0,1.0,1.0,1.0
1,DecisionTreeClassifier,Teste,0.851482,0.141109,0.160511,0.536759,0.073519,0.073676


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,LogisticRegression,Treino,0.919515,0.557971,0.011086,0.747434,0.494868,0.369245
1,LogisticRegression,Teste,0.918215,0.493151,0.010227,0.742754,0.485508,0.358696


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,RandomForestClassifier,Treino,0.999959,1.0,0.999496,1.0,1.0,1.0
1,RandomForestClassifier,Teste,0.918238,1.0,0.0,0.693526,0.387051,0.290994


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,GradientBoostingClassifier,Treino,0.920107,0.677249,0.018428,0.764702,0.529403,0.394877
1,GradientBoostingClassifier,Teste,0.91854,0.580247,0.013352,0.749314,0.498627,0.37504


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,XGBClassifier,Treino,0.930048,0.932521,0.143248,0.917727,0.835454,0.666612
1,XGBClassifier,Teste,0.917704,0.461017,0.038636,0.728227,0.456454,0.344353


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,LGBMClassifier,Treino,0.921152,0.825726,0.02865,0.8267,0.653399,0.496511
1,LGBMClassifier,Teste,0.918494,0.556701,0.015341,0.748999,0.497998,0.369684


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,CatBoostClassifier,Treino,0.928411,0.989362,0.113807,0.873756,0.747511,0.575303
1,CatBoostClassifier,Teste,0.918355,0.515337,0.023864,0.748419,0.496838,0.374209


É importante observar todas as métricas e como elas se comportam para cada modelo treinado. Porém, nesse caso, como estamos tentando resolver um problema de concessão de crédito e risco de inadimplência, iremos focar na AUC_ROC, no Gini e no KS, porque, além de serem as métricas que o time de negócios pediu para priorizar, elas medem bem a capacidade discriminativa do modelo, isto é, sua capacidade de distinguir entre bons e maus pagadores, ajudando assim em uma tomada de decisão precisa e eficaz.
<br>
<br>
Então, colocando um foco maior nessas métricas, podemos chegar à conclusão de que o modelo treinado com o algoritmo **GradientBoostingClassifier** obteve o melhor desempenho nos dados de teste em relação aos demais modelos, além de ter métricas consistentes entre os conjuntos de treino e teste, o que indica pouco overfitting e boa capacidade de generalização para novos dados. Vale notar, porém, que o Recall é muito baixo (cerca de 1,3% no teste): o modelo ordena bem os clientes por risco, mas as classificações geradas quase não identificam inadimplentes, ponto que deverá ser tratado nas próximas etapas.
<br>
<br>
O modelo escolhido para ser o nosso modelo baseline (modelo de entrada) será o modelo treinado com o algoritmo **GradientBoostingClassifier**, e suas métricas servirão de parâmetro quando formos aplicar técnicas e métodos a fim de melhorar essas métricas.

In [None]:
# Refit only the algorithm chosen as the baseline. `algoritmos` and `algoritmo`
# are rebound here so that `algoritmo` now refers to this gradient boosting
# model instead of the last model of the comparison loop.
algoritmos = [GradientBoostingClassifier(random_state=42)]
(algoritmo,) = algoritmos
nome_algoritmo = type(algoritmo).__name__

# Fit the baseline model on the processed training data.
algoritmo.fit(X_train_processed, y_train)

# Compute and show the train/test metrics used as the reference for later experiments.
metricas_modelo_baseline = calculate_metrics(
    nome_algoritmo,
    algoritmo,
    X_train_processed,
    y_train,
    X_test_processed,
    y_test,
)
display(metricas_modelo_baseline)

Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,GradientBoostingClassifier,Treino,0.920107,0.677249,0.018428,0.764702,0.529403,0.394877
1,GradientBoostingClassifier,Teste,0.91854,0.580247,0.013352,0.749314,0.498627,0.37504


<br>
<br>
<br>
<br>

## **Salvamento do Modelo escolhido em um Arquivo pkl**

Vamos salvar o modelo treinado com o algoritmo **GradientBoostingClassifier** em um arquivo serializado do tipo pickle, para que ele possa ser usado depois, caso seja necessário.

In [None]:
import os

# `algoritmo` is a loop variable reused by the training cells: if the baseline
# refit cell was skipped or cells were run out of order, it may hold a different
# model. Fail loudly instead of silently saving the wrong estimator.
if not isinstance(algoritmo, GradientBoostingClassifier):
    raise TypeError(
        "Expected 'algoritmo' to be the fitted GradientBoostingClassifier baseline, "
        f"got {type(algoritmo).__name__}. Re-run the baseline training cell first."
    )

# `dir` is the project base directory defined earlier in the notebook (it
# shadows the Python builtin of the same name).
caminho_modelo = dir + '/artefatos/baseline/baseline_gradient_boosting_v1.pkl'

# Create the destination folder if needed so the save does not fail with
# FileNotFoundError on a fresh environment.
os.makedirs(os.path.dirname(caminho_modelo), exist_ok=True)

# Serialize the fitted estimator. Only the model is saved: it was fitted on the
# preprocessed features, so the fitted `processed` pipeline is also required to
# score raw data later.
with open(caminho_modelo, 'wb') as file:
  pickle.dump(algoritmo, file)