# Bibliotecas

In [1]:
# Pacotes/funções auxiliares
from numpy import linspace

# Técnicas de random sampling utilizadas
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks, EditedNearestNeighbours
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek

# Algoritmos de Machine Learning clássicos
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Algoritmos de Machine Learning modificados para lidar com dados desbalanceados
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier, EasyEnsembleClassifier, RUSBoostClassifier
from imbens.ensemble import OverBoostClassifier, SMOTEBoostClassifier, OverBaggingClassifier, SMOTEBaggingClassifier

# Inicialização dos Modelos a serem utilizados nessa análise

In [2]:
tecnicas_de_random_sampling = {
    'RandomUnderSampler': RandomUnderSampler(),
    'RandomOverSampler': RandomOverSampler(),
    'NearMiss-1': NearMiss(version = 1),
    'NearMiss-2': NearMiss(version = 2),
    'NearMiss-3': NearMiss(version = 3),
    'TomekLinks': TomekLinks(),
    'ENN' : EditedNearestNeighbours(),
    'ADASYN' : ADASYN(),
    'SMOTE' : SMOTE(),
    'BorderlineSMOTE-1' : BorderlineSMOTE(kind = 'borderline-1'),
    'BorderlineSMOTE-2' : BorderlineSMOTE(kind= 'borderline-2'),
    'SMOTEEN' : SMOTEENN(),
    'SMOTETomek' : SMOTETomek()
}


algoritmos_classicos = {
    'DecisionTree' : DecisionTreeClassifier(),
    'AdaBoost' : AdaBoostClassifier(),
    'Bagging' : BaggingClassifier(),
    'RandomForest' : RandomForestClassifier(),
    'XGBoost' : XGBClassifier(),
    'LogisticRegression' : LogisticRegression(penalty = None)
}



algoritmos_desafiantes = {
    'EasyEnsemble' : EasyEnsembleClassifier(),
    'BalancedBaggingClassifier' : BalancedBaggingClassifier(),
    'BalancedRandomForestClassifier' : BalancedRandomForestClassifier(),
    'RUSBoost' : RUSBoostClassifier(),
    'OverBoost' : OverBoostClassifier(),
    'SMOTEBoost' : SMOTEBoostClassifier(),
    'OverBagging' : OverBaggingClassifier(),
    'SMOTEBagging' : SMOTEBaggingClassifier()
}

# Elaboração da Base de Dados

In [3]:
import pandas as pd

df = pd.read_csv(r"H:\Meu Drive\Dissertacao\Bases\StepWise Aplication Data (Modelagem).csv")

df.drop(['data', 'amostra'], axis = 1, inplace = True)
# Selecionar 2000 instâncias com inadimplência igual a 1

X, y = df.drop('inadimplencia', axis = 1), df['inadimplencia']

X = pd.get_dummies(X, drop_first = True)

# Tratar o nome das colunas de X_dummies
X.columns = X.columns.str.lower().str.replace('[^\w\s]', '').str.replace(' ', '_')

X = X.to_numpy()

X

array([[553, 842, False, ..., False, True, False],
       [783, 142, False, ..., False, True, False],
       [647, 647, False, ..., False, True, False],
       ...,
       [528, 343, False, ..., False, False, False],
       [314, 567, False, ..., False, True, False],
       [446, 126, True, ..., False, False, True]], dtype=object)

# Simulação - Algoritmos Clássicos

In [4]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.model_selection import StratifiedKFold

# Função para calcular a métrica KS
def ks_metric(y_true, y_pred_proba):
    fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)
    return max(tpr - fpr)

In [None]:
# DataFrame para armazenar as métricas
metrics_df = pd.DataFrame(columns=['Algoritmo', 'Precisão', 'Recall', 'F1-Score', 'ROC AUC', 'KS'])

# Loop através dos algoritmos clássicos
for name, model in algoritmos_classicos.items():
    
    print(f'{name} com {model}')
    
    # Configuração da validação cruzada
    cv = StratifiedKFold(n_splits=3, shuffle=False)
    
    metrics_list = []  # Initialize metrics_list for each model
    
    for fold, (train_index, test_index) in enumerate(cv.split(X, y), 1):
        # Obter os conjuntos de treinamento e teste para o fold atual
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model.fit(X_train, y_train)

        y_pred_proba = model.predict_proba(X_test)[:, 1]
        # Otimizar o ponto de corte
        fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
        optimal_idx = np.argmax(tpr - fpr)
        optimal_threshold = thresholds[optimal_idx]
        y_pred = (y_pred_proba >= optimal_threshold).astype(int)

        # Calcular métricas
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        ks = ks_metric(y_test, y_pred_proba)

        metrics_list.append({
            'Precisão': precision,
            'Recall': recall,
            'F1-Score': f1,
            'ROC AUC': roc_auc,
            'KS': ks
        })

    # Calcular a média das métricas
    avg_metrics = {
        'Precisão': np.mean([m['Precisão'] for m in metrics_list]),
        'Recall': np.mean([m['Recall'] for m in metrics_list]),
        'F1-Score': np.mean([m['F1-Score'] for m in metrics_list]),
        'ROC AUC': np.mean([m['ROC AUC'] for m in metrics_list]),
        'KS': np.mean([m['KS'] for m in metrics_list])
    }
    
    avg_metrics['Algoritmo'] = name  # Add algorithm name to avg_metrics
    
    metrics_df = pd.concat([metrics_df, pd.DataFrame([avg_metrics])], ignore_index=True)
        
metrics_df.sort_values(['F1-Score', 'ROC AUC', 'KS'], ascending=False).to_csv(r'H:\Meu Drive\Dissertacao\Notebook\Problema Real - StepWise\Resultados\algoritmos_classicos.csv', index=False)

metrics_df.sort_values(['F1-Score', 'ROC AUC', 'KS'], ascending=False)

DecisionTree com DecisionTreeClassifier()


NameError: name 'X' is not defined

In [None]:
metrics_df.sort_values('ROC AUC', ascending = False).to_clipboard(index = False)

# Simulação - Algoritmos Clássicos com Técnicas de Boosting

In [5]:
# DataFrame para armazenar as métricas
metrics_sampling_df = pd.DataFrame(columns=['Algoritmo', 'Técnica de Sampling', 'Precisão', 'Recall', 'F1-Score', 'ROC AUC', 'KS'])

# Loop através dos algoritmos clássicos e técnicas de random sampling
# Obter a proporção de classes desbalanceadas
ratio = sum(y == 1) / sum(y == 0)
imbalance_ratio = linspace(1, ratio, 20)[:-1]

# Configuração da validação cruzada
cv = StratifiedKFold(n_splits=3, shuffle=False)

# Loop pelos folds para capturar os conjuntos de treinamento e teste
# Loop através dos algoritmos clássicos e técnicas de random sampling
for sampling_name, sampling_technique in tecnicas_de_random_sampling.items():
    for model_name, model in algoritmos_classicos.items():
        
        print(f'{model_name} com {sampling_name}')
        
        best_f1 = 0
        best_metrics = None
        
        # Variar a estratégia de sampling
        for strategy in imbalance_ratio:
            metrics_list = []

            for fold, (train_index, test_index) in enumerate(cv.split(X, y), 1):
                # Obter os conjuntos de treinamento e teste para o fold atual
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                
                # Aplicar a técnica de random sampling
                X_resampled, y_resampled = sampling_technique.fit_resample(X_train, y_train)

                model.fit(X_resampled, y_resampled)

                y_pred_proba = model.predict_proba(X_test)[:, 1]
                # Otimizar o ponto de corte
                fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
                optimal_idx = np.argmax(tpr - fpr)
                optimal_threshold = thresholds[optimal_idx]
                y_pred = (y_pred_proba >= optimal_threshold).astype(int)

                # Calcular métricas
                precision = precision_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred)
                roc_auc = roc_auc_score(y_test, y_pred_proba)
                ks = ks_metric(y_test, y_pred_proba)

                metrics_list.append({
                    'Precisão': precision,
                    'Recall': recall,
                    'F1-Score': f1,
                    'ROC AUC': roc_auc,
                    'KS': ks
                })

            # Calcular a média das métricas
            avg_metrics = {
                'Precisão': np.mean([m['Precisão'] for m in metrics_list]),
                'Recall': np.mean([m['Recall'] for m in metrics_list]),
                'F1-Score': np.mean([m['F1-Score'] for m in metrics_list]),
                'ROC AUC': np.mean([m['ROC AUC'] for m in metrics_list]),
                'KS': np.mean([m['KS'] for m in metrics_list])
            }

            if avg_metrics['F1-Score'] > best_f1:
                best_f1 = avg_metrics['F1-Score']
                best_metrics = {
                    'Algoritmo': model_name,
                    'Técnica de Sampling': sampling_name,
                    'Precisão': avg_metrics['Precisão'],
                    'Recall': avg_metrics['Recall'],
                    'F1-Score': avg_metrics['F1-Score'],
                    'ROC AUC': avg_metrics['ROC AUC'],
                    'KS': avg_metrics['KS'],
                    'Imbalance Ratio': strategy
                }

        # Adicionar as melhores métricas ao DataFrame
        if best_metrics:
            metrics_sampling_df = pd.concat([metrics_sampling_df, pd.DataFrame([best_metrics])], ignore_index=True)
            # Salvar o DataFrame em um arquivo CSV
            metrics_sampling_df.to_csv(r'H:\Meu Drive\Dissertacao\Notebook\Problema Real - StepWise\Resultados\metrics_sampling.csv', index=False)

# Exibir o DataFrame com as métricas
metrics_sampling_df.sort_values(['F1-Score', 'ROC AUC', 'KS'], ascending=False)

DecisionTree com RandomUnderSampler
AdaBoost com RandomUnderSampler
Bagging com RandomUnderSampler
RandomForest com RandomUnderSampler
XGBoost com RandomUnderSampler
LogisticRegression com RandomUnderSampler
DecisionTree com RandomOverSampler
AdaBoost com RandomOverSampler
Bagging com RandomOverSampler
RandomForest com RandomOverSampler
XGBoost com RandomOverSampler
LogisticRegression com RandomOverSampler
DecisionTree com NearMiss-1
AdaBoost com NearMiss-1
Bagging com NearMiss-1
RandomForest com NearMiss-1
XGBoost com NearMiss-1
LogisticRegression com NearMiss-1
DecisionTree com NearMiss-2
AdaBoost com NearMiss-2
Bagging com NearMiss-2
RandomForest com NearMiss-2
XGBoost com NearMiss-2
LogisticRegression com NearMiss-2
DecisionTree com NearMiss-3
AdaBoost com NearMiss-3
Bagging com NearMiss-3
RandomForest com NearMiss-3
XGBoost com NearMiss-3
LogisticRegression com NearMiss-3
DecisionTree com TomekLinks
AdaBoost com TomekLinks
Bagging com TomekLinks
RandomForest com TomekLinks
XGBoost

Unnamed: 0,Algoritmo,Técnica de Sampling,Precisão,Recall,F1-Score,ROC AUC,KS,Imbalance Ratio
62,RandomForest,BorderlineSMOTE-2,0.099122,0.576136,0.165261,0.669650,0.272995,0.252205
68,RandomForest,SMOTEEN,0.094618,0.594906,0.162883,0.683246,0.290201,0.750735
44,RandomForest,ADASYN,0.095357,0.556515,0.161839,0.668787,0.269542,1.000000
74,RandomForest,SMOTETomek,0.094265,0.563061,0.160930,0.673713,0.273770,0.750735
50,RandomForest,SMOTE,0.094759,0.566383,0.160719,0.670985,0.268342,0.651029
...,...,...,...,...,...,...,...,...
14,RandomForest,NearMiss-1,0.050458,0.994507,0.096043,0.407860,0.006445,0.501470
16,LogisticRegression,NearMiss-1,0.059820,0.669953,0.089290,0.429962,0.016962,1.000000
15,XGBoost,NearMiss-1,0.073497,0.481381,0.074405,0.446113,0.011806,1.000000
12,AdaBoost,NearMiss-1,0.145095,0.654609,0.066786,0.410895,0.011470,1.000000


In [12]:
metrics_sampling_df.sort_values('F1-Score', ascending = False)[:10].to_clipboard(index = False)

# Simulação - Extensões de Técnicas Ensemble

In [6]:
# DataFrame para armazenar as métricas
metrics_desafiantes_df = pd.DataFrame(columns=['Algoritmo', 'Precisão', 'Recall', 'F1-Score', 'ROC AUC', 'KS'])

# Loop através dos algoritmos clássicos
for name, model in algoritmos_desafiantes.items():
    
    # Configuração da validação cruzada
    cv = StratifiedKFold(n_splits=3, shuffle=False)
    
    metrics_list = []  # Initialize metrics_list for each model
    
    for fold, (train_index, test_index) in enumerate(cv.split(X, y), 1):
        # Obter os conjuntos de treinamento e teste para o fold atual
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model.fit(X_train, y_train)

        y_pred_proba = model.predict_proba(X_test)[:, 1]
        # Otimizar o ponto de corte
        fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
        optimal_idx = np.argmax(tpr - fpr)
        optimal_threshold = thresholds[optimal_idx]
        y_pred = (y_pred_proba >= optimal_threshold).astype(int)

        # Calcular métricas
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        ks = ks_metric(y_test, y_pred_proba)

        metrics_list.append({
            'Precisão': precision,
            'Recall': recall,
            'F1-Score': f1,
            'ROC AUC': roc_auc,
            'KS': ks
        })

    # Calcular a média das métricas
    avg_metrics = {
        'Precisão': np.mean([m['Precisão'] for m in metrics_list]),
        'Recall': np.mean([m['Recall'] for m in metrics_list]),
        'F1-Score': np.mean([m['F1-Score'] for m in metrics_list]),
        'ROC AUC': np.mean([m['ROC AUC'] for m in metrics_list]),
        'KS': np.mean([m['KS'] for m in metrics_list])
    }
    
    avg_metrics['Algoritmo'] = name  # Add algorithm name to avg_metrics
    
    metrics_desafiantes_df = pd.concat([metrics_desafiantes_df, pd.DataFrame([avg_metrics])], ignore_index=True)
        
    metrics_desafiantes_df.sort_values(['F1-Score', 'ROC AUC', 'KS'], ascending=False)

    metrics_desafiantes_df.to_csv(r'H:\Meu Drive\Dissertacao\Notebook\Problema Real - StepWise\Resultados\metrics_extensao_ensemble.csv', index=False)

In [7]:
metrics_desafiantes_df.sort_values('F1-Score', ascending = False).to_clipboard(index = False)