In [None]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import time

In [None]:
# Load the 'MNPs_11' sheet and split it into predictors (X) and target (y).
feat_no = 11  # number of leading columns used as features
data_input = pd.read_excel(
    '../Data/MNPs_dataset.xlsx',
    sheet_name='MNPs_11',
)
# Keep the first feat_no + 1 columns: the features followed by one more column.
dataset = data_input.iloc[:, :feat_no + 1]
# X is the first feat_no columns; y is the last retained column.
X, y = dataset.iloc[:, :feat_no], dataset.iloc[:, -1]

In [None]:
# Optimize the hyperparameters with grid search, repeated over 50 random
# train/test splits so the reported metrics are not tied to a single split.
def _regression_metrics(y_true, y_pred):
    """Return the tuple (R^2, RMSE, MAE) of ``y_pred`` measured against ``y_true``."""
    r2 = r2_score(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred) ** 0.5  # RMSE = sqrt(MSE)
    mae = mean_absolute_error(y_true, y_pred)
    return r2, rmse, mae


# One random_state per repetition (0..49).
rs = np.arange(50)
# Winning hyperparameters of each repetition.
best_params = {
        'n_neighbors':[],'metric':[]
}
# Train/test scores of each repetition (one list entry per split).
metric = {
    'r2_train':[],'rmse_train':[],'mae_train':[],
    'r2_test':[],'rmse_test':[],'mae_test':[]
}
# Search grid.
# NOTE(review): with KNeighborsRegressor's default p=2, 'minkowski' is the same
# distance as 'euclidean', so that entry looks redundant; it is kept here so the
# search (and any previously reported results) stays unchanged.
params = {
    'n_neighbors':np.arange(1,11),
    'metric':['euclidean','manhattan','minkowski']
}
# perf_counter is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments during the (long) search.
start_t = time.perf_counter()
for i in rs:
    # 80/20 split, seeded by the repetition index for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=i)
    model = KNeighborsRegressor()
    # 10-fold CV on the training part only; the test part is never seen by the search.
    model_opt = GridSearchCV(model, scoring='r2', param_grid=params, cv=10)
    model_opt.fit(X_train, y_train)

    # With the default refit=True, model_opt.predict delegates to the best
    # estimator refitted on the whole training set.
    y_pred_train = model_opt.predict(X_train)
    r2_train, rmse_train, mae_train = _regression_metrics(y_train, y_pred_train)

    y_pred_test = model_opt.predict(X_test)
    r2_test, rmse_test, mae_test = _regression_metrics(y_test, y_pred_test)

    metric['r2_train'].append(r2_train)
    metric['rmse_train'].append(rmse_train)
    metric['mae_train'].append(mae_train)
    metric['r2_test'].append(r2_test)
    metric['rmse_test'].append(rmse_test)
    metric['mae_test'].append(mae_test)

    best_params['n_neighbors'].append(model_opt.best_params_['n_neighbors'])
    best_params['metric'].append(model_opt.best_params_['metric'])

end_t = time.perf_counter()

print('Training time', round(end_t - start_t, 2), 's')

In [None]:
# Tabulate the per-split scores and the selected hyperparameters
# (one row per repetition, one column per dictionary key).
metric_df = pd.DataFrame.from_dict(metric)
best_params_df = pd.DataFrame.from_dict(best_params)
# Optional export of both tables (left disabled):
#metric_df.to_excel('../Data_saved/knn_metrics.xlsx')
#best_params_df.to_excel('../Data_saved/knn_params.xlsx')

In [None]:
# Mean test-set R^2 over all repetitions.
np.mean(metric['r2_test'])

In [None]:
# Summary statistics of every train/test metric column across the repetitions.
metric_df.describe()