# Vecinos más cercanos o k-NN

In [3]:
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error

#### SOBRE NOMENCLATURA DE VARIABLES

Al final del nombre se indica si una variable proviene de un random split: '_r' o de un scaffold split: '_s'

## **Usando un split random**

### Validación cruzada todo

In [3]:
df = pd.read_csv('scaled_df.csv')

#identificar las variables y división de datos
target = 'peakwavs_max'
features = [col for col in df.columns if col not in ['smiles', 'solvent', 'source', target]]

x_r = df[features]
y_r = df[target]

knn = KNeighborsRegressor(n_neighbors=3, 
                          weights='distance', 
                          algorithm='ball_tree', 
                          leaf_size=30, 
                          p=1, 
                          metric='minkowski')

print(cross_val_score(knn, x_r, y_r, cv=5, scoring='neg_mean_absolute_error' ))

[-29.7569113  -36.54047008 -27.40794643 -26.54648529 -21.0019733 ]


### Validación final 
    con los hiperparametros obtenidos mediante optimización

In [7]:
df = pd.read_csv('scaled_df.csv')

# Identificar características y variable objetivo
target = 'peakwavs_max'
features = [col for col in df.columns if col not in ['smiles', 'solvent', 'source', target]]

# Dividir en conjunto de entrenamiento y prueba
x = df[features]
y = df[target]
x_train_r, x_test_r, y_train_r, y_test_r = train_test_split(x, y, test_size=0.2, random_state=42)

# contruir el árbol
knn = KNeighborsRegressor(n_neighbors=3, 
                          weights='distance', 
                          algorithm='ball_tree', 
                          leaf_size=30, 
                          p=1, 
                          metric='minkowski')
knn.fit(x_train_r, y_train_r)

# Hacer predicciones
y_pred_r = knn.predict(x_test_r)

## GUARDAR RESULTADOS
y_test_r = np.array(y_test_r).flatten()
y_pred_r = np.array(y_pred_r).flatten()

resultados_knn_rand = pd.DataFrame({
    "peakwavs_max_real": y_test_r,
    "peakwavs_max_predicho": y_pred_r
})

### Cálculo de error y visualización 
mae = mean_absolute_error(y_test_r, y_pred_r)
print(f'Error absoluto medio: {mae:.3f}')

resultados_knn_rand.to_csv('Resultados/knn/random split/knn_rand.csv', index=False)

## **Usando el split por scaffold**

### Validación cruzada

In [11]:
archivos_train = ['train_1.csv', 'train_2.csv', 'train_3.csv', 'train_4.csv', 'train_5.csv']
archivos_test = ['test_1.csv', 'test_2.csv', 'test_3.csv', 'test_4.csv', 'test_5.csv']

resultados= []

for archivo_train, archivo_test in zip(archivos_train, archivos_test):
  
    train_cv = pd.read_csv(f'CV/{archivo_train}')
    test_cv = pd.read_csv(f'CV/{archivo_test}')
    target = 'peakwavs_max'
    features = [col for col in train_cv.columns if col not in ['smiles', 'solvent', 'source', target]]
    
    x_train = train_cv[features]
    y_train = train_cv[target]
    x_test = test_cv[features]
    y_test = test_cv[target]
    
    knn = KNeighborsRegressor(n_neighbors=3, 
                          weights='distance', 
                          algorithm='ball_tree', 
                          leaf_size=30, 
                          p=1, 
                          metric='minkowski')
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)

    resultados.append(mean_absolute_error(y_test, y_pred))
    
print(f'Error absoluto medio de cada fold: {resultados}')
print('Error medio de todos los folds: ', np.mean(resultados))

Error absoluto medio de cada fold: [35.6460668444029, 36.785608278354225, 35.62873945507117, 35.074697560802214, 34.32038363419368]
Error medio de todos los folds:  35.49109915456484


### Validación final
con los hiperparámetros obtenidos en la optimización.

In [7]:
mi_dir = '/home/xavi/Escritorio/midirectorio/'

# Identificar características y variable objetivo
train = pd.read_csv(f'{mi_dir}CV/train_1.csv')
test = pd.read_csv(f'{mi_dir}CV/test_1.csv')
target = 'peakwavs_max'
features = [col for col in train.columns if col not in ['smiles', 'solvent', 'source', target]]

# Dividir en conjunto de entrenamiento y prueba
x_train_s = train[features]
y_train_s = train[target]

x_test_s = test[features]
y_test_s = test[target]
# contruir el árbol
knn = KNeighborsRegressor(n_neighbors=3, 
                          weights='distance', 
                          algorithm='ball_tree', 
                          leaf_size=30, 
                          p=1, 
                          metric='minkowski')
knn.fit(x_train_s, y_train_s)


y_pred_s = knn.predict(x_test_s)

## GUARDAR RESULTADOS
y_test_s = np.array(y_test_s).flatten()
y_pred_s = np.array(y_pred_s).flatten()

resultados_knn_scaffol1d = pd.DataFrame({
    "peakwavs_max_real": y_test_s,
    "peakwavs_max_predicho": y_pred_s
})
resultados_knn_scaffold.to_csv('Resultados/knn/scaffold split/knn_scaffold.csv', index=False)

#Cálculo de error y visualización 
mae = mean_absolute_error(y_test_s, y_pred_s)
print(f'Error absoluto medio: {mae:.3f}')


Error absoluto medio: 35.646


NameError: name 'resultados_knn_scaffold' is not defined