In [24]:
import sklearn as sk
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb


from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler



from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score


In [25]:
#arquivos

#caminho para rodar google colab
#arq_treinamento = pd.read_csv('treino.csv')
#arq_teste = pd.read_csv('teste.csv')


#caminhos para rodar localmente
arq_treinamento = pd.read_csv('arquivos/brutos/treino.csv')



def arq_treinamento_info():
    
    print(arq_treinamento.head())
    print(arq_treinamento.info())
    print(arq_treinamento.describe())
    print(arq_treinamento.isnull().sum())


#definindo target e características arquivo de treinamento
arq_treinamento_x = arq_treinamento.drop(columns=['target', 'id'])

arq_treinamento_y = arq_treinamento['target']



#train_test_split dos dados de teste

X_train, X_test, y_train, y_test = train_test_split(arq_treinamento_x, arq_treinamento_y, test_size=0.2, random_state=42)

#targets estao desbalanceados
#arq_treinamento_y.value_counts()
y_train.value_counts()

def split_info():
    print("Dados de treinamento (X_train):")
    print(X_train)
    print("Labels de treinamento (y_train):")
    print(y_train)
    print("Dados de teste (X_test):")
    print(X_test)
    print("Labels de teste (y_test):")
    print(y_test)



In [4]:
#treinamento do RandomForest, XGB e NeuralNerwork com dados balanceados com SMOTETomek ADASYN
#usando Pipeline #validacao_cruzada #f1_score

random_state_42 = 42

balanceador = Pipeline([
    ('adasyn', ADASYN(random_state=random_state_42)),
    ('smote_tomek_balanceamento', SMOTETomek(random_state=random_state_42))
])


modelos = {
    "Random Forest": RandomForestClassifier(random_state=random_state_42),
    "XGBoost": xgb.XGBClassifier(objective="multi:softmax", num_class=5, random_state=random_state_42),
    "Neural Network": MLPClassifier(max_iter=1000)
}

#validacao cruzada com stratifieldkfold
validacao_cruzada = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state_42)

resultados = []

for name, modelo in modelos.items():
    f1_macro_scores = []
    f1_micro_scores = []
    f1_balanceada_scores = []
    accuracy_scores = []

    for treino_dados, validacao_dados in validacao_cruzada.split(X_train, y_train):
        X_train_vc, X_val_vc = X_train.iloc[treino_dados], X_train.iloc[validacao_dados]
        y_train_vc, y_val_vc = y_train.iloc[treino_dados], y_train.iloc[validacao_dados]


        X_rebalanceado, y_rebalanceado = balanceador.fit_resample(X_train_vc, y_train_vc)

        modelo.fit(X_rebalanceado, y_rebalanceado)

        y_pred = modelo.predict(X_val_vc)

        ac_score = accuracy_score(y_val_vc, y_pred) #nao será muito útil por que os dados são multiclasse, mas pode ser interessante verificar diferenças gritantes
        f1_balanceada = f1_score(y_val_vc, y_pred, average='weighted')
        f1_macro = f1_score(y_val_vc, y_pred, average='macro')
        f1_micro = f1_score(y_val_vc, y_pred, average='micro')

        accuracy_scores.append(ac_score)
        f1_macro_scores.append(f1_macro)
        f1_micro_scores.append(f1_micro)
        f1_balanceada_scores.append(f1_balanceada)


        """matriz_confusao = confusion_matrix(y_val_vc, y_pred)
        display = ConfusionMatrixDisplay(confusion_matrix=matriz_confusao)
        display.plot(cmap='Blues')
        plt.title(f'{name} - Fold')
        print(name, matriz_confusao)
        plt.show()"""

    #resultados: média das métricas dos folds

    """resultados.append({
        "Model": name,
        "Accuracy": np.mean(accuracy_scores),
        "F1 Macro": np.mean(f1_macro_scores),
        "F1 Micro": np.mean(f1_micro_scores),
        "F1 Balanceada": np.mean(f1_balanceada_scores)
    })"""

    # priorizando a métrica f1-score que será cobrada
    resultados.append({
        "Model": name,
      # "Accuracy": np.mean(accuracy_scores),
       # "F1 Macro": np.mean(f1_macro_scores),
       # "F1 Micro": np.mean(f1_micro_scores),
        "F1 Balanceada": np.mean(f1_balanceada_scores)
    })


print(pd.DataFrame(resultados))



            Model  F1 Balanceada
0   Random Forest       0.757623
1         XGBoost       0.748710
2  Neural Network       0.704308


In [5]:
#Neural Network não deu resultado
#GridSearchCV não foi possível por conta do Hardware. 

In [5]:
#Testando RandomSearchCV com RandomForest

parametros_dicionario = {
    'n_estimators': [100, 300, 500, 800], 
    'max_depth': [None, 10, 20, 30], 
    'min_samples_split': [2, 5, 10], 
    'min_samples_leaf': [1, 2, 4], 
    'max_features': ['sqrt', 'log2']
}


random_modelo = RandomForestClassifier(random_state=42)


random_search_config = RandomizedSearchCV(
    random_modelo, parametros_dicionario, 
    n_iter=30,  #menos testes para conseguir rodar
    scoring='f1_weighted', 
    cv=3,  # Validação cruzada com 3 folds (mais rápido)
    verbose=2, 
    n_jobs=-1,  # Usa todos os núcleos do processador
    random_state=42
)

random_search_config.fit(X_rebalanceado_smote_tomek_adasyn, y_rebalanceado_smote_tomek_adasyn)

print("melhores parametros:", random_search_config.best_params_)

#resultados:
#melhores parametros: n_iter=10
"""
{
'n_estimators': 300, 
'min_samples_split': 2, 
'min_samples_leaf': 1, 
'max_features': 'log2', 
'max_depth': 20
}
"""


#melhores parametros: n_iter=30
"""
{'n_estimators': 800, 
 'min_samples_split': 2,
 'min_samples_leaf': 1, 
 'max_features': 'sqrt', 
 'max_depth': 20}
"\n{\n'n_estimators': 300, 
'min_samples_split': 2, 
\n'min_samples_leaf': 1, 
\n'max_features': 'log2',
\n'max_depth': 20
\n}\
n"
"""

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] END max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   9.0s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  27.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   8.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=  25.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=  26.4s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   9.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=  31.2s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estima



melhores parametros: {'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20}


"\n{\n'n_estimators': 300, \n'min_samples_split': 2, \n'min_samples_leaf': 1, \n'max_features': 'log2', \n'max_depth': 20\n}\n"

[CV] END max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   9.2s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  27.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   8.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=  24.4s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=  27.5s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   8.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=  31.0s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time= 1.3min
[CV] END max_depth=None, max_fea

In [6]:
#Testando RandomSearchCV com XGB

parametros_dicionario = {
    'n_estimators': [100, 300, 500, 800], 
    'max_depth': [3, 5, 10, 20], 
    'learning_rate': [0.01, 0.05, 0.1, 0.2], 
    'subsample': [0.6, 0.8, 1.0], 
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xgb_modelo = XGBClassifier(random_state=42, eval_metric="mlogloss")

random_search_xgb = RandomizedSearchCV(
    xgb_modelo, parametros_dicionario, 
    n_iter=30,  # menos testes
    scoring='f1_weighted', 
    cv=3,  # Validação cruzada com 3 folds (mais rápido)
    verbose=2, 
    n_jobs=-1,  # Usa todos os núcleos do processador
    random_state=42
)

random_search_xgb.fit(X_rebalanceado_smote_tomek_adasyn, y_rebalanceado_smote_tomek_adasyn)

# Exibir melhores parâmetros
print("Melhores parâmetros", random_search_xgb.best_params_)

#resultados: n_iter=10
#Melhores parâmetros {'subsample': 1.0, 'n_estimators': 500, 
#'max_depth': 20, 'learning_rate': 0.2, 'colsample_bytree': 0.8}

#resultados: n_iter=30
#Melhores parâmetros {'subsample': 0.8, 'n_estimators': 800, 
#'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 1.0}


Fitting 3 folds for each of 30 candidates, totalling 90 fits
Melhores parâmetros {'subsample': 0.8, 'n_estimators': 800, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=20, n_estimators=500, subsample=0.6; total time= 3.2min
[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=5, n_estimators=800, subsample=0.6; total time=  31.7s
[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=10, n_estimators=800, subsample=0.8; total time= 1.1min
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=5, n_estimators=800, subsample=0.8; total time=  18.5s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=5, n_estimators=500, subsample=0.8; total time=  12.7s
[CV] END colsample_bytree=0.8, learning_rate=0.2, max_depth=20, n_estimators=500, subsample=1.0; total time=  49.2s
[CV] END colsample_bytree=1.0, learning_rate=0.05, max_depth=20, n_estimators=100, subsample=1.0; total time= 1.0min
[C

In [19]:
#Treinando os modelos com os parâmetros encontrados:

[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=  53.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=  59.1s
[CV] END colsample_bytree=0.8, learning_rate=0.2, max_depth=3, n_estimators=800, subsample=0.8; total time=   8.8s
[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.6; total time=  12.8s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=10, n_estimators=300, subsample=1.0; total time=  54.5s
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=5, n_estimators=800, subsample=0.8; total time=  16.4s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=5, n_estimators=500, subsample=0.8; total time=  13.7s
[CV] END colsample_bytree=0.8, learning_rate=0.2, max_depth=20, n_estimators=500, subsample=1.0; total time=  40.2s
[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=5, 

In [7]:
random_state_42 = 42

balanceador = Pipeline([
    ('adasyn', ADASYN(random_state=random_state_42)),
    ('smote_tomek_balanceamento', SMOTETomek(random_state=random_state_42))
])

#melhores parametros encontrados para n_iter=10
"""
modelos = {
    "Random Forest": RandomForestClassifier(
        random_state=random_state_42,
        n_estimators=300, 
        min_samples_split=2, 
        min_samples_leaf=1, 
        max_features='log2', 
        max_depth=20
    ),
    "XGBoost": XGBClassifier(
        objective="multi:softmax",
        random_state=random_state_42,
        subsample=1.0, 
        n_estimators=500, 
        max_depth=20, 
        learning_rate=0.2, 
        colsample_bytree=0.8
    ),
}
"""
#melhores parametros encontrados para n_iter=30
modelos = {
    "Random Forest": RandomForestClassifier(
        random_state=random_state_42,
        n_estimators=800, 
        min_samples_split=2, 
        min_samples_leaf=1, 
        max_features='sqrt', 
        max_depth=20
    ),
    "XGBoost": XGBClassifier(
        objective="multi:softmax",
        random_state=random_state_42,
        subsample=0.8, 
        n_estimators=800, 
        colsample_bytree=1.0
    ),
}





#validacao cruzada com stratifieldkfold
validacao_cruzada = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state_42)

resultados = []

for name, modelo in modelos.items():
    f1_macro_scores = []
    f1_micro_scores = []
    f1_balanceada_scores = []
    accuracy_scores = []

    for treino_dados, validacao_dados in validacao_cruzada.split(X_train, y_train):
        X_train_vc, X_val_vc = X_train.iloc[treino_dados], X_train.iloc[validacao_dados]
        y_train_vc, y_val_vc = y_train.iloc[treino_dados], y_train.iloc[validacao_dados]


        X_rebalanceado, y_rebalanceado = balanceador.fit_resample(X_train_vc, y_train_vc)

        modelo.fit(X_rebalanceado, y_rebalanceado)

        y_pred = modelo.predict(X_val_vc)

        ac_score = accuracy_score(y_val_vc, y_pred) #nao será muito útil por que os dados são multiclasse, mas pode ser interessante verificar diferenças gritantes
        f1_balanceada = f1_score(y_val_vc, y_pred, average='weighted')
        f1_macro = f1_score(y_val_vc, y_pred, average='macro')
        f1_micro = f1_score(y_val_vc, y_pred, average='micro')

        accuracy_scores.append(ac_score)
        f1_macro_scores.append(f1_macro)
        f1_micro_scores.append(f1_micro)
        f1_balanceada_scores.append(f1_balanceada)


        """matriz_confusao = confusion_matrix(y_val_vc, y_pred)
        display = ConfusionMatrixDisplay(confusion_matrix=matriz_confusao)
        display.plot(cmap='Blues')
        plt.title(f'{name} - Fold')
        print(name, matriz_confusao)
        plt.show()"""

    #resultados: média das métricas dos folds

    """resultados.append({
        "Model": name,
        "Accuracy": np.mean(accuracy_scores),
        "F1 Macro": np.mean(f1_macro_scores),
        "F1 Micro": np.mean(f1_micro_scores),
        "F1 Balanceada": np.mean(f1_balanceada_scores)
    })"""

    # priorizando a métrica f1-score que será cobrada
    resultados.append({
        "Model": name,
      # "Accuracy": np.mean(accuracy_scores),
       # "F1 Macro": np.mean(f1_macro_scores),
       # "F1 Micro": np.mean(f1_micro_scores),
        "F1 Balanceada": np.mean(f1_balanceada_scores)
    })


print(pd.DataFrame(resultados))


           Model  F1 Balanceada
0  Random Forest       0.764129
1        XGBoost       0.757705


In [1]:
#resultados com n_iter=10:
"""

n_iter=10:
     Model  F1 Balanceada
0  Random Forest       0.762110
1        XGBoost       0.765687

n_iter=30:
 Model  F1 Balanceada
0  Random Forest       0.764129
1        XGBoost       0.757705

In [26]:
#Testando RandomSearchCV com RandomForest
#fazendo balanceamento dos dados fora da validacao cruzada para fazer o randomsearchCV

random_state_42 = 42

balanceador = Pipeline([
    ('adasyn', ADASYN(random_state=random_state_42)),
    ('smote_tomek_balanceamento', SMOTETomek(random_state=random_state_42))
])

X_rebalanceado_smote_tomek_adasyn, y_rebalanceado_smote_tomek_adasyn = balanceador.fit_resample(
    X_train, y_train)

#print(X_rebalanceado_smote_tomek_adasyn)
#print(y_rebalanceado_smote_tomek_adasyn)

In [27]:
.value_counts()

target
2    4224
3    4224
4    4219
1    4216
0    4186
Name: count, dtype: int64

In [16]:
#fazendo predição final:

#XGBoost n_iter=10

modelo_final_XGBoost = XGBClassifier(
        objective="multi:softmax",
        random_state=random_state_42,
        subsample=1.0, 
        n_estimators=500, 
        max_depth=20, 
        learning_rate=0.2, 
        colsample_bytree=0.8
    )

modelo_final_XGBoost.fit(X_rebalanceado_smote_tomek_adasyn, y_rebalanceado_smote_tomek_adasyn)



arq_teste = pd.read_csv('arquivos/brutos/teste.csv')

arq_teste_ids = arq_teste['id']

arq_teste_X = arq_teste.drop(columns=['id'])






arq_teste_pred = modelo_final_XGBoost.predict(arq_teste_X)

arq_teste_saida = pd.DataFrame({'id':arq_teste_ids, 'target': arq_teste_pred})

arq_teste_saida.to_csv('arquivos/final/ezequiel_melo_nogueira_3_fase_onia_resposta_teste.csv', index=False)



In [23]:
#verificando o arquivo resposta para ver se está tudo conforme as demandas da ONIA

c


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      4500 non-null   int64
 1   target  4500 non-null   int64
dtypes: int64(2)
memory usage: 70.4 KB
None
                 id       target
count   4500.000000  4500.000000
mean    7515.308222     1.302667
std     4290.491626     1.516287
min        4.000000     0.000000
25%     3857.750000     0.000000
50%     7498.500000     1.000000
75%    11121.250000     2.000000
max    14994.000000     4.000000
id        0
target    0
dtype: int64
