# Bibliotecas

In [1]:
# Pacotes/funções auxiliares
from numpy import linspace

# Técnicas de random sampling utilizadas
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks, EditedNearestNeighbours
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek

# Algoritmos de Machine Learning clássicos
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Algoritmos de Machine Learning modificados para lidar com dados desbalanceados
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier, EasyEnsembleClassifier, RUSBoostClassifier
from imbens.ensemble import OverBoostClassifier, SMOTEBoostClassifier, OverBaggingClassifier, SMOTEBaggingClassifier

# Inicialização dos Modelos a serem utilizados nessa análise

In [2]:
tecnicas_de_random_sampling = {
    'RandomUnderSampler': RandomUnderSampler(),
    'RandomOverSampler': RandomOverSampler(),
    'NearMiss-1': NearMiss(version = 1),
    'NearMiss-2': NearMiss(version = 2),
    'NearMiss-3': NearMiss(version = 3),
    'TomekLinks': TomekLinks(),
    'ENN' : EditedNearestNeighbours(),
    'ADASYN' : ADASYN(),
    'SMOTE' : SMOTE(),
    'BorderlineSMOTE-1' : BorderlineSMOTE(kind = 'borderline-1'),
    'BorderlineSMOTE-2' : BorderlineSMOTE(kind= 'borderline-2'),
    'SMOTEEN' : SMOTEENN(),
    'SMOTETomek' : SMOTETomek()
}


algoritmos_classicos = {
    'DecisionTree' : DecisionTreeClassifier(),
    'AdaBoost' : AdaBoostClassifier(),
    'Bagging' : BaggingClassifier(),
    'RandomForest' : RandomForestClassifier(),
    'XGBoost' : XGBClassifier(),
    'LogisticRegression' : LogisticRegression(penalty = None)
}



algoritmos_desafiantes = {
    'EasyEnsemble' : EasyEnsembleClassifier(),
    'BalancedBaggingClassifier' : BalancedBaggingClassifier(),
    'BalancedRandomForestClassifier' : BalancedRandomForestClassifier(),
    'RUSBoost' : RUSBoostClassifier(),
    'OverBoost' : OverBoostClassifier(),
    'SMOTEBoost' : SMOTEBoostClassifier(),
    'OverBagging' : OverBaggingClassifier(),
    'SMOTEBagging' : SMOTEBaggingClassifier()
}

# Elaboração da Base de Dados

In [3]:
from sklearn.datasets import make_classification
import pandas as pd

# Criar a base de dados
X, y = make_classification(
    n_samples=10000,
    n_features=20, 
    n_informative = 10,
    n_redundant = 10,
    n_repeated = 0,
    n_classes=2, 
    n_clusters_per_class=2,
    weights=[0.99, 0.01], 
    flip_y=0.01, 
    random_state=42
)

# Converter para DataFrame
df = pd.DataFrame(X, columns=[f'X_{i}' for i in range(X.shape[1])])
df['Y'] = y

# Exibir as primeiras linhas do DataFrame
df.head()

Unnamed: 0,X_0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,...,X_11,X_12,X_13,X_14,X_15,X_16,X_17,X_18,X_19,Y
0,-1.891698,-1.098882,2.242979,-1.70367,1.177859,-1.965913,-3.989913,-0.315247,1.763179,0.637821,...,-1.556564,0.163268,0.817096,1.006207,-0.502394,1.764065,0.171272,-1.217057,-0.462137,0
1,0.967128,0.989727,-3.560767,2.641276,1.225926,5.643665,-1.664649,-0.15404,-1.394621,-1.826544,...,-2.343698,1.314175,1.079805,-0.228467,0.955065,1.796111,1.401819,-3.22639,0.493642,0
2,3.791968,3.748859,5.158545,-0.467279,-2.384977,5.032702,2.845661,-1.885064,-1.287809,-0.097028,...,-1.265873,4.003121,0.949578,2.616478,0.182992,2.25421,-2.178719,1.057135,3.507031,0
3,-5.675023,-2.671726,-2.11951,2.310518,2.865188,-0.945811,-4.361941,4.549177,0.954702,-2.667907,...,-0.730544,-4.132145,0.657825,-0.734123,0.009443,-0.795901,0.826155,-2.139846,-6.081064,0
4,6.774418,4.010988,6.581402,-1.035143,-4.486375,2.674443,6.95115,-1.177492,-4.052873,2.148599,...,1.850049,3.537991,-0.528561,0.370144,-0.949455,0.649594,-2.147332,1.910235,4.694199,0


# Simulação - Algoritmos Clássicos

In [5]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.model_selection import StratifiedKFold

# Função para calcular a métrica KS
def ks_metric(y_true, y_pred_proba):
    fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)
    return max(tpr - fpr)

# DataFrame para armazenar as métricas
metrics_df = pd.DataFrame(columns=['Algoritmo', 'Precisão', 'Recall', 'F1-Score', 'ROC AUC', 'KS'])

# Loop através dos algoritmos clássicos
for name, model in algoritmos_classicos.items():
    
    # Configuração da validação cruzada
    cv = StratifiedKFold(n_splits=3, shuffle=False)
    
    metrics_list = []  # Initialize metrics_list for each model
    
    for fold, (train_index, test_index) in enumerate(cv.split(X, y), 1):
        # Obter os conjuntos de treinamento e teste para o fold atual
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model.fit(X_train, y_train)

        y_pred_proba = model.predict_proba(X_test)[:, 1]
        # Otimizar o ponto de corte
        fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
        optimal_idx = np.argmax(tpr - fpr)
        optimal_threshold = thresholds[optimal_idx]
        y_pred = (y_pred_proba >= optimal_threshold).astype(int)

        # Calcular métricas
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        ks = ks_metric(y_test, y_pred_proba)

        metrics_list.append({
            'Precisão': precision,
            'Recall': recall,
            'F1-Score': f1,
            'ROC AUC': roc_auc,
            'KS': ks
        })

    # Calcular a média das métricas
    avg_metrics = {
        'Precisão': np.mean([m['Precisão'] for m in metrics_list]),
        'Recall': np.mean([m['Recall'] for m in metrics_list]),
        'F1-Score': np.mean([m['F1-Score'] for m in metrics_list]),
        'ROC AUC': np.mean([m['ROC AUC'] for m in metrics_list]),
        'KS': np.mean([m['KS'] for m in metrics_list])
    }
    
    avg_metrics['Algoritmo'] = name  # Add algorithm name to avg_metrics
    
    metrics_df = pd.concat([metrics_df, pd.DataFrame([avg_metrics])], ignore_index=True)
        
metrics_df.sort_values(['F1-Score', 'ROC AUC', 'KS'], ascending=False).to_csv(r'H:\Meu Drive\Dissertacao\Notebook\Análise Inicial dos Métodos Estudados\Resultados\algoritmos_classicos.csv', index=False)

metrics_df.sort_values(['ROC AUC', 'KS'], ascending=False)

KeyboardInterrupt: 

In [12]:
metrics_df.sort_values(['ROC AUC', 'KS'], ascending=False).to_clipboard(index=False)

In [13]:
metrics_df = pd.read_csv(
    r"H:\Meu Drive\Dissertacao\Notebook\Análise Inicial dos Métodos Estudados\Resultados\metrics_sampling.csv"
)

metrics_df

Unnamed: 0,Algoritmo,Técnica de Sampling,Precisão,Recall,F1-Score,ROC AUC,KS,Imbalance Ratio
0,DecisionTree,RandomUnderSampler,0.027774,0.668440,0.053332,0.665677,0.331353,0.273772
1,AdaBoost,RandomUnderSampler,0.071139,0.578162,0.120946,0.721266,0.425393,0.377519
2,Bagging,RandomUnderSampler,0.087655,0.555408,0.141538,0.747269,0.440374,0.118151
3,RandomForest,RandomUnderSampler,0.082727,0.576832,0.142944,0.779528,0.477522,0.170025
4,XGBoost,RandomUnderSampler,0.074492,0.612589,0.132685,0.746839,0.501816,0.273772
...,...,...,...,...,...,...,...,...
73,AdaBoost,SMOTETomek,0.080020,0.527630,0.136253,0.725386,0.421726,0.844380
74,Bagging,SMOTETomek,0.134205,0.493056,0.188592,0.746625,0.411700,0.170025
75,RandomForest,SMOTETomek,0.087004,0.654699,0.152590,0.798327,0.548896,0.170025
76,XGBoost,SMOTETomek,0.099482,0.605496,0.166837,0.797374,0.508722,0.896253


In [17]:
metrics_df.sort_values('ROC AUC', ascending = False)[:10].to_clipboard(index = False)

# Simulação - Algoritmos Clássicos com Técnicas de Boosting

In [5]:
# DataFrame para armazenar as métricas
metrics_sampling_df = pd.DataFrame(columns=['Algoritmo', 'Técnica de Sampling', 'Precisão', 'Recall', 'F1-Score', 'ROC AUC', 'KS'])

# Loop através dos algoritmos clássicos e técnicas de random sampling
# Obter a proporção de classes desbalanceadas
ratio = sum(y == 1) / sum(y == 0)
imbalance_ratio = linspace(1, ratio, 20)[:-1]

# Configuração da validação cruzada
cv = StratifiedKFold(n_splits=3, shuffle=False)

# Loop pelos folds para capturar os conjuntos de treinamento e teste
# Loop através dos algoritmos clássicos e técnicas de random sampling
for sampling_name, sampling_technique in tecnicas_de_random_sampling.items():
    for model_name, model in algoritmos_classicos.items():
        
        print(f'{model_name} com {sampling_name}')
        
        best_f1 = 0
        best_metrics = None
        
        # Variar a estratégia de sampling
        for strategy in imbalance_ratio:
            metrics_list = []

            for fold, (train_index, test_index) in enumerate(cv.split(X, y), 1):
                # Obter os conjuntos de treinamento e teste para o fold atual
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                
                # Aplicar a técnica de random sampling
                X_resampled, y_resampled = sampling_technique.fit_resample(X_train, y_train)

                model.fit(X_resampled, y_resampled)

                y_pred_proba = model.predict_proba(X_test)[:, 1]
                # Otimizar o ponto de corte
                fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
                optimal_idx = np.argmax(tpr - fpr)
                optimal_threshold = thresholds[optimal_idx]
                y_pred = (y_pred_proba >= optimal_threshold).astype(int)

                # Calcular métricas
                precision = precision_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred)
                roc_auc = roc_auc_score(y_test, y_pred_proba)
                ks = ks_metric(y_test, y_pred_proba)

                metrics_list.append({
                    'Precisão': precision,
                    'Recall': recall,
                    'F1-Score': f1,
                    'ROC AUC': roc_auc,
                    'KS': ks
                })

            # Calcular a média das métricas
            avg_metrics = {
                'Precisão': np.mean([m['Precisão'] for m in metrics_list]),
                'Recall': np.mean([m['Recall'] for m in metrics_list]),
                'F1-Score': np.mean([m['F1-Score'] for m in metrics_list]),
                'ROC AUC': np.mean([m['ROC AUC'] for m in metrics_list]),
                'KS': np.mean([m['KS'] for m in metrics_list])
            }

            if avg_metrics['F1-Score'] > best_f1:
                best_f1 = avg_metrics['F1-Score']
                best_metrics = {
                    'Algoritmo': model_name,
                    'Técnica de Sampling': sampling_name,
                    'Precisão': avg_metrics['Precisão'],
                    'Recall': avg_metrics['Recall'],
                    'F1-Score': avg_metrics['F1-Score'],
                    'ROC AUC': avg_metrics['ROC AUC'],
                    'KS': avg_metrics['KS'],
                    'Imbalance Ratio': strategy
                }

        # Adicionar as melhores métricas ao DataFrame
        if best_metrics:
            metrics_sampling_df = pd.concat([metrics_sampling_df, pd.DataFrame([best_metrics])], ignore_index=True)
            # Salvar o DataFrame em um arquivo CSV
            metrics_sampling_df.to_csv(r'H:\Meu Drive\Dissertacao\Notebook\Análise Inicial dos Métodos Estudados\Resultados\metrics_sampling.csv', index=False)

# Exibir o DataFrame com as métricas
metrics_sampling_df.sort_values(['F1-Score', 'ROC AUC', 'KS'], ascending=False)

DecisionTree com RandomUnderSampler
AdaBoost com RandomUnderSampler
Bagging com RandomUnderSampler
RandomForest com RandomUnderSampler
XGBoost com RandomUnderSampler
LogisticRegression com RandomUnderSampler
DecisionTree com RandomOverSampler
AdaBoost com RandomOverSampler
Bagging com RandomOverSampler
RandomForest com RandomOverSampler
XGBoost com RandomOverSampler
LogisticRegression com RandomOverSampler
DecisionTree com NearMiss-1
AdaBoost com NearMiss-1
Bagging com NearMiss-1
RandomForest com NearMiss-1
XGBoost com NearMiss-1
LogisticRegression com NearMiss-1
DecisionTree com NearMiss-2
AdaBoost com NearMiss-2
Bagging com NearMiss-2
RandomForest com NearMiss-2
XGBoost com NearMiss-2
LogisticRegression com NearMiss-2
DecisionTree com NearMiss-3
AdaBoost com NearMiss-3
Bagging com NearMiss-3
RandomForest com NearMiss-3
XGBoost com NearMiss-3
LogisticRegression com NearMiss-3
DecisionTree com TomekLinks
AdaBoost com TomekLinks
Bagging com TomekLinks
RandomForest com TomekLinks
XGBoost

Unnamed: 0,Algoritmo,Técnica de Sampling,Precisão,Recall,F1-Score,ROC AUC,KS,Imbalance Ratio
32,Bagging,TomekLinks,0.199206,0.429817,0.272096,0.706343,0.404660,0.429392
58,XGBoost,BorderlineSMOTE-1,0.173558,0.591608,0.256423,0.808747,0.532468,0.533139
63,RandomForest,BorderlineSMOTE-2,0.160235,0.577275,0.249776,0.801561,0.532236,0.118151
6,DecisionTree,RandomOverSampler,0.233238,0.246158,0.238812,0.617246,0.234493,0.325645
30,DecisionTree,TomekLinks,0.245613,0.232270,0.238710,0.611012,0.222024,0.533139
...,...,...,...,...,...,...,...,...
19,AdaBoost,NearMiss-2,0.020592,0.738918,0.039748,0.559714,0.181807,1.000000
20,Bagging,NearMiss-2,0.019493,0.746158,0.037848,0.555948,0.147863,0.948127
24,DecisionTree,NearMiss-3,0.019101,0.619090,0.037055,0.578768,0.157536,0.948127
12,DecisionTree,NearMiss-1,0.017148,0.747045,0.033518,0.560071,0.120143,0.221898


In [20]:
metrics_sampling_df.sort_values('F1-Score', ascending = False)[:10].to_clipboard(index = False)

# Simulação - Extensões de Técnicas Ensemble

In [6]:
# DataFrame para armazenar as métricas
metrics_desafiantes_df = pd.DataFrame(columns=['Algoritmo', 'Precisão', 'Recall', 'F1-Score', 'ROC AUC', 'KS'])

# Loop através dos algoritmos clássicos
for name, model in algoritmos_desafiantes.items():
    
    # Configuração da validação cruzada
    cv = StratifiedKFold(n_splits=3, shuffle=False)
    
    metrics_list = []  # Initialize metrics_list for each model
    
    for fold, (train_index, test_index) in enumerate(cv.split(X, y), 1):
        # Obter os conjuntos de treinamento e teste para o fold atual
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model.fit(X_train, y_train)

        y_pred_proba = model.predict_proba(X_test)[:, 1]
        # Otimizar o ponto de corte
        fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
        optimal_idx = np.argmax(tpr - fpr)
        optimal_threshold = thresholds[optimal_idx]
        y_pred = (y_pred_proba >= optimal_threshold).astype(int)

        # Calcular métricas
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        ks = ks_metric(y_test, y_pred_proba)

        metrics_list.append({
            'Precisão': precision,
            'Recall': recall,
            'F1-Score': f1,
            'ROC AUC': roc_auc,
            'KS': ks
        })

    # Calcular a média das métricas
    avg_metrics = {
        'Precisão': np.mean([m['Precisão'] for m in metrics_list]),
        'Recall': np.mean([m['Recall'] for m in metrics_list]),
        'F1-Score': np.mean([m['F1-Score'] for m in metrics_list]),
        'ROC AUC': np.mean([m['ROC AUC'] for m in metrics_list]),
        'KS': np.mean([m['KS'] for m in metrics_list])
    }
    
    avg_metrics['Algoritmo'] = name  # Add algorithm name to avg_metrics
    
    metrics_desafiantes_df = pd.concat([metrics_desafiantes_df, pd.DataFrame([avg_metrics])], ignore_index=True)
        
    metrics_desafiantes_df.sort_values(['F1-Score', 'ROC AUC', 'KS'], ascending=False)

In [8]:
metrics_desafiantes_df

Unnamed: 0,Algoritmo,Precisão,Recall,F1-Score,ROC AUC,KS
0,EasyEnsemble,0.059189,0.612441,0.107043,0.74535,0.462208
1,BalancedBaggingClassifier,0.044004,0.619385,0.082089,0.753132,0.421475
2,BalancedRandomForestClassifier,0.091665,0.640662,0.160225,0.783091,0.548655
3,RUSBoost,0.036151,0.618942,0.068054,0.70276,0.369094
4,OverBoost,0.076357,0.612884,0.130782,0.749913,0.463868
5,SMOTEBoost,0.05347,0.597961,0.097819,0.743732,0.442859
6,OverBagging,0.087196,0.612441,0.143765,0.789226,0.471743
7,SMOTEBagging,0.07725,0.640957,0.135784,0.785821,0.510302


In [10]:
metrics_desafiantes_df.to_csv(
    r"H:\Meu Drive\Dissertacao\Notebook\Análise Inicial dos Métodos Estudados\Resultados\extensoes_ensemble.csv",
    index = False
)

metrics_desafiantes_df

Unnamed: 0,Algoritmo,Precisão,Recall,F1-Score,ROC AUC,KS
0,EasyEnsemble,0.059189,0.612441,0.107043,0.74535,0.462208
1,BalancedBaggingClassifier,0.044004,0.619385,0.082089,0.753132,0.421475
2,BalancedRandomForestClassifier,0.091665,0.640662,0.160225,0.783091,0.548655
3,RUSBoost,0.036151,0.618942,0.068054,0.70276,0.369094
4,OverBoost,0.076357,0.612884,0.130782,0.749913,0.463868
5,SMOTEBoost,0.05347,0.597961,0.097819,0.743732,0.442859
6,OverBagging,0.087196,0.612441,0.143765,0.789226,0.471743
7,SMOTEBagging,0.07725,0.640957,0.135784,0.785821,0.510302


In [19]:
metrics_desafiantes_df.sort_values('ROC AUC', ascending = False).to_clipboard(index = False)