<a href="https://colab.research.google.com/github/tbrocco/mestrado/blob/main/aula5a_gridsearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

datasets = [load_iris(), load_digits()]

best_configs = []

for data in datasets:
    X, y = data.data, data.target

    # estimadores e os hiperparâmetros
    estimators = [
        ('SVM', SVC(), {'SVM__C': [0.1, 1.0, 10.0]}),
        ('RandomForest', RandomForestClassifier(), {'RandomForest__n_estimators': [10, 50, 100]})
    ]

    results = {}

    for name, estimator, param_grid in estimators:
        pipeline = Pipeline([
            ('Scaler', StandardScaler()),
            (name, estimator)
        ])

        inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)  # Validação cruzada aninhada

        grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=inner_cv)

        # avlia o desempenho do modelo usando validação cruzada aninhada
        grid_search.fit(X, y)

        # obtem os melhores parâmetros e o melhor estimador
        best_params = grid_search.best_params_
        best_estimator = grid_search.best_estimator_

        scores = cross_val_score(grid_search.best_estimator_, X, y, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring='accuracy')

        results[name] = (best_params, best_estimator, scores)

    # Calcula a diferença entre as pontuações médias de treinamento e teste
    differences = {name: np.mean(scores) for name, (_, _, scores) in results.items()}

    best_estimator_name = max(differences, key=differences.get)
    best_params, best_estimator, _ = results[best_estimator_name]
    best_configs.append((data, best_estimator_name, differences[best_estimator_name], best_params, best_estimator))




In [3]:
for dataset, best_estimator_name, difference, best_params, best_estimator in best_configs:
    print(f"Dataset: {dataset.DESCR.splitlines()[0]}")
    print(f"Melhor Estimador: {best_estimator_name}")
    print(f"Diferença: {difference:.4f}")
    print(f"Melhores Parâmetros: {best_params}")
    print(f"Melhor Estimador: {best_estimator}\n")

Dataset: .. _iris_dataset:
Melhor Estimador: RandomForest
Diferença: 0.9600
Melhores Parâmetros: {'RandomForest__n_estimators': 50}
Melhor Estimador: Pipeline(steps=[('Scaler', StandardScaler()),
                ('RandomForest', RandomForestClassifier(n_estimators=50))])

Dataset: .. _digits_dataset:
Melhor Estimador: SVM
Diferença: 0.9839
Melhores Parâmetros: {'SVM__C': 10.0}
Melhor Estimador: Pipeline(steps=[('Scaler', StandardScaler()), ('SVM', SVC(C=10.0))])

