# Bibliotecas

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.over_sampling import RandomOverSampler, SMOTE

In [75]:
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler

# Criar uma base de dados de classificação
X, y = make_classification(n_samples=10000, n_features=20, n_informative=2, n_redundant=10, n_classes=2, weights=[0.99, 0.01], random_state=42)

# Converter para DataFrame para melhor visualização
df_classification = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(20)])
df_classification['target'] = y

In [89]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from skopt import BayesSearchCV

# Definir diferentes valores de imbalance_ratio para testar, removendo o último elemento
imbalance_ratios = np.linspace(1, min(np.bincount(y) / len(y)), 20)[:-1]

# Lista para armazenar os f1-scores
f1_scores = []

# Loop para testar diferentes valores de imbalance_ratio
# Lista de modelos para treinar
# Definir os hiperparâmetros para otimização bayesiana
param_grid = {
    'DecisionTree': {
        'max_depth': (1, 20),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 20)
    },
    'RandomForest': {
        'n_estimators': (10, 200),
        'max_depth': (1, 20),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 20)
    },
    'XGBoost': {
        'n_estimators': (10, 200),
        'max_depth': (1, 20),
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        'subsample': (0.5, 1.0),
        'colsample_bytree': (0.5, 1.0)
    },
    'AdaBoost': {
        'n_estimators': (10, 200),
        'learning_rate': (0.01, 1.0, 'log-uniform')
    }
}

# Lista de modelos para treinar com otimização bayesiana
models = {
    'DecisionTree': BayesSearchCV(DecisionTreeClassifier(random_state=42), param_grid['DecisionTree'], n_iter=32, cv=3, scoring='f1', random_state=42),
    'RandomForest': BayesSearchCV(RandomForestClassifier(random_state=42), param_grid['RandomForest'], n_iter=32, cv=3, scoring='f1', random_state=42),
    'XGBoost': BayesSearchCV(XGBClassifier(random_state=42), param_grid['XGBoost'], n_iter=32, cv=3, scoring='f1', random_state=42),
    'AdaBoost': BayesSearchCV(AdaBoostClassifier(algorithm='SAMME', random_state=42), param_grid['AdaBoost'], n_iter=32, cv=3, scoring='f1', random_state=42)
}

# Dicionário para armazenar os melhores modelos e seus respectivos imbalance ratios
best_models = {}

for ratio in imbalance_ratios:
    print(f"Testing imbalance_ratio: {ratio}")
    
    # Aplicar o RandomUnderSampler com o ratio atual
    rus = RandomUnderSampler(sampling_strategy=ratio, random_state=42)
    X_res, y_res = rus.fit_resample(X, y)
    
    # Inicializar a validação cruzada estratificada
    skf = StratifiedKFold(n_splits=3)
    
    for model_name, model in models.items():
        print(f"Training model: {model_name}")
        
        # Lista para armazenar os f1-scores de cada fold
        fold_f1_scores = []
        
        for train_index, test_index in skf.split(X_res, y_res):
            X_train, X_test = X_res[train_index], X_res[test_index]
            y_train, y_test = y_res[train_index], y_res[test_index]
            
            # Treinar o modelo
            model.fit(X_train, y_train)
            
            # Fazer previsões no conjunto de teste
            y_pred = model.predict(X_test)
            
            # Calcular o f1-score e armazenar na lista
            f1 = f1_score(y_test, y_pred)
            fold_f1_scores.append(f1)
        
        # Calcular a média dos f1-scores dos folds e armazenar na lista principal
        mean_f1 = np.mean(fold_f1_scores)
        f1_scores.append(mean_f1)
        
        # Atualizar o dicionário de melhores modelos se o f1-score for o melhor até agora
        if model_name not in best_models or mean_f1 > best_models[model_name]['f1_score']:
            best_models[model_name] = {'model': model, 'imbalance_ratio': ratio, 'f1_score': mean_f1}

# Encontrar o índice do melhor f1-score
best_index = f1_scores.index(max(f1_scores))
best_ratio = imbalance_ratios[best_index]

print(f"Melhor imbalance_ratio: {best_ratio} com F1-Score: {f1_scores[best_index]}")
print("Melhores modelos e seus respectivos imbalance ratios:")
print(best_models)


ModuleNotFoundError: No module named 'skopt'

In [88]:
# Criar um DataFrame a partir do dicionário best_models
df_best_models = pd.DataFrame.from_dict(best_models, orient='index')

# Renomear as colunas para melhor entendimento
df_best_models.columns = ['Modelo', 'Imbalance Ratio', 'F1-Score']

# Exibir o DataFrame
df_best_models

Unnamed: 0,Modelo,Imbalance Ratio,F1-Score
DecisionTree,DecisionTreeClassifier(random_state=42),0.948095,0.736664
RandomForest,"(DecisionTreeClassifier(max_features='sqrt', r...",0.844284,0.772722
XGBoost,"XGBClassifier(base_score=None, booster=None, c...",1.0,0.758141
AdaBoost,"(DecisionTreeClassifier(max_depth=1, random_st...",1.0,0.769796
