# Optimización de hiperparámetros para k-NN

In [1]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy  as np

In [2]:
mi_dir = '/home/xavi/Escritorio/midirectorio/'

# Load the full training table, then keep a reproducible 30% random subsample
# of its rows (fixed seed) to work with in the rest of the notebook.
df = pd.read_csv(mi_dir + 'DATA/train_scaffold.csv')
sub_train = df.sample(frac=0.3, random_state=42)

# Target column to predict; every other column of the table is treated as a
# feature unless it is one of the explicitly excluded columns below.
target = 'peakwavs_max'
non_feature_cols = ('smiles', 'solvent', 'source', 'scaffold', target)
features = [name for name in df.columns if name not in non_feature_cols]

# Feature matrix and target vector taken from the subsample, followed by a
# seeded 80/20 train/test split.
x, y = sub_train[features], sub_train[target]
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=30,
)


In [5]:
# Hyperparameter search space: each key is a KNeighborsRegressor argument and
# each list holds the candidate values that the search may draw from.
param_dist = dict(
    n_neighbors=[3, 4, 5],
    weights=['uniform', 'distance'],
    algorithm=['ball_tree', 'kd_tree', 'brute'],
    leaf_size=[20, 30, 40],
    p=[1, 1.3, 1.5, 2],
)

# Randomized search over the space above: 30 sampled combinations, each scored
# with 3-fold cross-validation on negative mean absolute error.
knn = KNeighborsRegressor()
random_search = RandomizedSearchCV(
    estimator=knn,
    param_distributions=param_dist,
    n_iter=30,
    cv=3,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=10,
    random_state=42,  # fixed seed so the sampled combinations are reproducible
    return_train_score=False,
)

# Run the randomized search on the training split (this is a randomized
# search, not a grid search: only the sampled combinations are fitted).
print("Buscando mejores hipeparámetros... \n ")
random_search.fit(x_train, y_train)

# Report the best parameter combination found by the search.
print("Mejores hiperparámetros encontrados:")
print(random_search.best_params_)

# Score the best estimator found by the search on the held-out test split.
best_model = random_search.best_estimator_
y_pred = best_model.predict(x_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Error Absoluto Medio (MAE) en el conjunto de prueba: {mae:.4f}")

Buscando mejores hipeparámetros... 
 
Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 14.9min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 26.4min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 29.4min
[Parallel(n_jobs=-1)]: Done  85 out of  90 | elapsed: 35.3min remaining:  2.1min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 36.3min finished


Mejores hiperparámetros encontrados:
{'weights': 'distance', 'p': 1, 'n_neighbors': 3, 'leaf_size': 30, 'algorithm': 'ball_tree'}
Error Absoluto Medio (MAE) en el conjunto de prueba: 22.9680
