In [2]:
from sklearn.neighbors import KNeighborsRegressor
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
import optuna as opt
from sklearn.preprocessing import StandardScaler
from optuna.samplers import RandomSampler
from optuna.samplers import TPESampler
from sklearn.preprocessing import MinMaxScaler
import time
from sklearn.pipeline import Pipeline 

In [3]:
CWD = Path.cwd()
DATA_DIR = CWD.parent / "ready data new"
print(DATA_DIR)

/Users/antonyjiao/Desktop/UOA Master of DS/COMPSCI 760/ready data new


In [4]:
df_train = pd.read_parquet(DATA_DIR/"train_main.parquet.snappy")
df_test = pd.read_parquet(DATA_DIR/"test_main.parquet.snappy")
# train_text = pd.read_parquet(DATA_DIR/"train_text.parquet.snappy")
#test_text = pd.read_parquet(DATA_DIR/"test_text.parquet.snappy")

In [5]:
print(f"Shape of the training data : {df_train.shape}")
print(f"Shape of the test data : {df_test.shape}")

Shape of the training data : (400000, 13)
Shape of the test data : (100000, 13)


In [6]:
X_train, y_train = df_train.drop(['r_useful', 'r_id'], axis=1).values, df_train['r_useful'].values
X_test, y_test = df_test.drop(['r_useful', 'r_id'], axis=1).values, df_test['r_useful'].values


## StandardScaler dataset

In [23]:
def objective(trial):
    start = time.time()
    
    params = {
       "n_neighbors":trial.suggest_int('n_neighbors', 2, 100),
       "weights":trial.suggest_categorical("weights", ['uniform', 'distance']),
       "metric":trial.suggest_categorical("metric", ['euclidean', 'manhattan', 'minkowski'])
        }
    print("Currently running with:")
    print(params)

    model = KNeighborsRegressor(**params)

    # model.fit(X_train, y_train)
    #y_pred = model.predict(X_val)
    
    # cv for the mode
    cvscore = np.sqrt(-np.mean(cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv=3)))
    print("Time cost:", time.time()-start)

    return cvscore

In [24]:
RANDOM_SEED = 2
study = opt.create_study(direction='minimize', sampler=TPESampler(seed=RANDOM_SEED))
study.optimize(objective, n_trials=50)
study.best_params

[32m[I 2022-09-27 00:01:10,212][0m A new study created in memory with name: no-name-59bdf33a-7d62-422a-92ee-101684793c48[0m


Currently running with:
{'n_neighbors': 45, 'weights': 'distance', 'metric': 'euclidean'}


[32m[I 2022-09-27 00:10:49,616][0m Trial 0 finished with value: 3.704444670270919 and parameters: {'n_neighbors': 45, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 0 with value: 3.704444670270919.[0m


Time cost: 579.4017629623413
Currently running with:
{'n_neighbors': 22, 'weights': 'uniform', 'metric': 'manhattan'}


[32m[I 2022-09-27 00:26:22,767][0m Trial 1 finished with value: 3.7103044961956244 and parameters: {'n_neighbors': 22, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 0 with value: 3.704444670270919.[0m


Time cost: 933.1492552757263
Currently running with:
{'n_neighbors': 15, 'weights': 'uniform', 'metric': 'manhattan'}


[32m[I 2022-09-27 00:40:05,689][0m Trial 2 finished with value: 3.7270311228078814 and parameters: {'n_neighbors': 15, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 0 with value: 3.704444670270919.[0m


Time cost: 822.9196400642395
Currently running with:
{'n_neighbors': 85, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 01:04:42,004][0m Trial 3 finished with value: 3.7001719128700192 and parameters: {'n_neighbors': 85, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 3 with value: 3.7001719128700192.[0m


Time cost: 1476.3129630088806
Currently running with:
{'n_neighbors': 14, 'weights': 'uniform', 'metric': 'minkowski'}


[32m[I 2022-09-27 01:10:36,186][0m Trial 4 finished with value: 3.7532324029177095 and parameters: {'n_neighbors': 14, 'weights': 'uniform', 'metric': 'minkowski'}. Best is trial 3 with value: 3.7001719128700192.[0m


Time cost: 354.18048191070557
Currently running with:
{'n_neighbors': 48, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 01:28:57,298][0m Trial 5 finished with value: 3.6938349213434005 and parameters: {'n_neighbors': 48, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 5 with value: 3.6938349213434005.[0m


Time cost: 1101.1102769374847
Currently running with:
{'n_neighbors': 80, 'weights': 'uniform', 'metric': 'manhattan'}


[32m[I 2022-09-27 01:50:48,890][0m Trial 6 finished with value: 3.709920758114732 and parameters: {'n_neighbors': 80, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 5 with value: 3.6938349213434005.[0m


Time cost: 1311.5912010669708
Currently running with:
{'n_neighbors': 90, 'weights': 'distance', 'metric': 'minkowski'}


[32m[I 2022-09-27 02:02:23,042][0m Trial 7 finished with value: 3.706972591263961 and parameters: {'n_neighbors': 90, 'weights': 'distance', 'metric': 'minkowski'}. Best is trial 5 with value: 3.6938349213434005.[0m


Time cost: 694.1510629653931
Currently running with:
{'n_neighbors': 55, 'weights': 'uniform', 'metric': 'minkowski'}


[32m[I 2022-09-27 02:11:59,424][0m Trial 8 finished with value: 3.710286119894419 and parameters: {'n_neighbors': 55, 'weights': 'uniform', 'metric': 'minkowski'}. Best is trial 5 with value: 3.6938349213434005.[0m


Time cost: 576.3802671432495
Currently running with:
{'n_neighbors': 42, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 02:29:30,594][0m Trial 9 finished with value: 3.692289680252434 and parameters: {'n_neighbors': 42, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1051.1688959598541
Currently running with:
{'n_neighbors': 67, 'weights': 'distance', 'metric': 'euclidean'}


[32m[I 2022-09-27 02:39:51,115][0m Trial 10 finished with value: 3.703420413392382 and parameters: {'n_neighbors': 67, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 620.5198738574982
Currently running with:
{'n_neighbors': 38, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 02:56:47,300][0m Trial 11 finished with value: 3.6924677572900295 and parameters: {'n_neighbors': 38, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1016.181624174118
Currently running with:
{'n_neighbors': 31, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 03:12:38,550][0m Trial 12 finished with value: 3.69362301195107 and parameters: {'n_neighbors': 31, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 951.2489941120148
Currently running with:
{'n_neighbors': 2, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 03:18:38,405][0m Trial 13 finished with value: 4.263863314497401 and parameters: {'n_neighbors': 2, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 359.85178327560425
Currently running with:
{'n_neighbors': 34, 'weights': 'distance', 'metric': 'euclidean'}


[32m[I 2022-09-27 03:26:36,961][0m Trial 14 finished with value: 3.7065066610403026 and parameters: {'n_neighbors': 34, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 478.55370688438416
Currently running with:
{'n_neighbors': 60, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 03:46:25,232][0m Trial 15 finished with value: 3.695752297876924 and parameters: {'n_neighbors': 60, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1188.268387079239
Currently running with:
{'n_neighbors': 37, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 04:03:10,043][0m Trial 16 finished with value: 3.692470887415427 and parameters: {'n_neighbors': 37, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1004.8105721473694
Currently running with:
{'n_neighbors': 71, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 04:24:12,878][0m Trial 17 finished with value: 3.6973782221041214 and parameters: {'n_neighbors': 71, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1262.8332998752594
Currently running with:
{'n_neighbors': 42, 'weights': 'distance', 'metric': 'minkowski'}


[32m[I 2022-09-27 04:32:51,532][0m Trial 18 finished with value: 3.7044186752795016 and parameters: {'n_neighbors': 42, 'weights': 'distance', 'metric': 'minkowski'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 518.6525478363037
Currently running with:
{'n_neighbors': 98, 'weights': 'distance', 'metric': 'euclidean'}


[32m[I 2022-09-27 04:44:49,730][0m Trial 19 finished with value: 3.707959954838379 and parameters: {'n_neighbors': 98, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 718.1938619613647
Currently running with:
{'n_neighbors': 26, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 04:59:41,508][0m Trial 20 finished with value: 3.695690809528321 and parameters: {'n_neighbors': 26, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 891.7757029533386
Currently running with:
{'n_neighbors': 39, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 05:16:49,598][0m Trial 21 finished with value: 3.6924957675507297 and parameters: {'n_neighbors': 39, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1028.0886871814728
Currently running with:
{'n_neighbors': 35, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 05:33:16,760][0m Trial 22 finished with value: 3.693037425189343 and parameters: {'n_neighbors': 35, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 987.1615831851959
Currently running with:
{'n_neighbors': 54, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 05:52:23,376][0m Trial 23 finished with value: 3.694820185285728 and parameters: {'n_neighbors': 54, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1146.6141860485077
Currently running with:
{'n_neighbors': 22, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 06:06:26,094][0m Trial 24 finished with value: 3.69928846883761 and parameters: {'n_neighbors': 22, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 842.7170250415802
Currently running with:
{'n_neighbors': 63, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 06:26:36,109][0m Trial 25 finished with value: 3.6962358883542197 and parameters: {'n_neighbors': 63, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1210.0138850212097
Currently running with:
{'n_neighbors': 50, 'weights': 'uniform', 'metric': 'manhattan'}


[32m[I 2022-09-27 06:45:12,537][0m Trial 26 finished with value: 3.7053007675912957 and parameters: {'n_neighbors': 50, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1116.4265022277832
Currently running with:
{'n_neighbors': 29, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 07:00:32,467][0m Trial 27 finished with value: 3.6939019827833874 and parameters: {'n_neighbors': 29, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 919.9286708831787
Currently running with:
{'n_neighbors': 39, 'weights': 'distance', 'metric': 'minkowski'}


[32m[I 2022-09-27 07:08:57,735][0m Trial 28 finished with value: 3.7052588581771726 and parameters: {'n_neighbors': 39, 'weights': 'distance', 'metric': 'minkowski'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 505.26739621162415
Currently running with:
{'n_neighbors': 45, 'weights': 'distance', 'metric': 'euclidean'}


[32m[I 2022-09-27 07:17:50,694][0m Trial 29 finished with value: 3.704444670270919 and parameters: {'n_neighbors': 45, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 532.9569540023804
Currently running with:
{'n_neighbors': 14, 'weights': 'distance', 'metric': 'euclidean'}


[32m[I 2022-09-27 07:23:31,477][0m Trial 30 finished with value: 3.74751677275894 and parameters: {'n_neighbors': 14, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 340.78199791908264
Currently running with:
{'n_neighbors': 39, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 07:40:36,037][0m Trial 31 finished with value: 3.6924957675507297 and parameters: {'n_neighbors': 39, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1024.55921292305
Currently running with:
{'n_neighbors': 46, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 07:58:40,966][0m Trial 32 finished with value: 3.6932635845889212 and parameters: {'n_neighbors': 46, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1084.9278919696808
Currently running with:
{'n_neighbors': 22, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 08:12:43,620][0m Trial 33 finished with value: 3.69928846883761 and parameters: {'n_neighbors': 22, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 842.650680065155
Currently running with:
{'n_neighbors': 38, 'weights': 'uniform', 'metric': 'manhattan'}


[32m[I 2022-09-27 08:29:40,000][0m Trial 34 finished with value: 3.7039129510152473 and parameters: {'n_neighbors': 38, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1016.3797509670258
Currently running with:
{'n_neighbors': 54, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 08:48:42,478][0m Trial 35 finished with value: 3.694820185285728 and parameters: {'n_neighbors': 54, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1142.4764318466187
Currently running with:
{'n_neighbors': 6, 'weights': 'uniform', 'metric': 'manhattan'}


[32m[I 2022-09-27 08:57:39,469][0m Trial 36 finished with value: 3.83429495284841 and parameters: {'n_neighbors': 6, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 536.9898700714111
Currently running with:
{'n_neighbors': 27, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 09:12:45,332][0m Trial 37 finished with value: 3.6946620550757663 and parameters: {'n_neighbors': 27, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 905.8620538711548
Currently running with:
{'n_neighbors': 43, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 09:30:30,973][0m Trial 38 finished with value: 3.6923853536518423 and parameters: {'n_neighbors': 43, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1065.6399738788605
Currently running with:
{'n_neighbors': 59, 'weights': 'uniform', 'metric': 'minkowski'}


[32m[I 2022-09-27 09:41:11,882][0m Trial 39 finished with value: 3.7103047711425283 and parameters: {'n_neighbors': 59, 'weights': 'uniform', 'metric': 'minkowski'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 640.9060189723969
Currently running with:
{'n_neighbors': 44, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 10:00:16,439][0m Trial 40 finished with value: 3.692721785211065 and parameters: {'n_neighbors': 44, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1144.555456161499
Currently running with:
{'n_neighbors': 34, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 10:17:38,457][0m Trial 41 finished with value: 3.693240689960641 and parameters: {'n_neighbors': 34, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1042.0165839195251
Currently running with:
{'n_neighbors': 22, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 10:32:08,115][0m Trial 42 finished with value: 3.69928846883761 and parameters: {'n_neighbors': 22, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 869.6563789844513
Currently running with:
{'n_neighbors': 50, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 10:50:44,320][0m Trial 43 finished with value: 3.6939023025871265 and parameters: {'n_neighbors': 50, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1116.2044961452484
Currently running with:
{'n_neighbors': 41, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 11:08:22,157][0m Trial 44 finished with value: 3.692365506136003 and parameters: {'n_neighbors': 41, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1057.835070848465
Currently running with:
{'n_neighbors': 32, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 11:24:39,387][0m Trial 45 finished with value: 3.6931627568849605 and parameters: {'n_neighbors': 32, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 977.2291760444641
Currently running with:
{'n_neighbors': 42, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 11:42:23,152][0m Trial 46 finished with value: 3.692289680252434 and parameters: {'n_neighbors': 42, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1063.7638850212097
Currently running with:
{'n_neighbors': 48, 'weights': 'uniform', 'metric': 'minkowski'}


[32m[I 2022-09-27 11:51:35,701][0m Trial 47 finished with value: 3.711807366642186 and parameters: {'n_neighbors': 48, 'weights': 'uniform', 'metric': 'minkowski'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 552.5480182170868
Currently running with:
{'n_neighbors': 44, 'weights': 'distance', 'metric': 'euclidean'}


[32m[I 2022-09-27 12:00:09,524][0m Trial 48 finished with value: 3.7041789248484953 and parameters: {'n_neighbors': 44, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 513.8208420276642
Currently running with:
{'n_neighbors': 57, 'weights': 'distance', 'metric': 'manhattan'}


[32m[I 2022-09-27 12:19:23,752][0m Trial 49 finished with value: 3.695314355091258 and parameters: {'n_neighbors': 57, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 9 with value: 3.692289680252434.[0m


Time cost: 1154.2261180877686


{'n_neighbors': 42, 'weights': 'distance', 'metric': 'manhattan'}

In [28]:
model = KNeighborsRegressor(**study.best_params, n_jobs=-1)
model.fit(X_train, y_train)



In [29]:
y_pred_train = model.predict(X_train)
y_pred_test  = model.predict(X_test)

mae = mean_absolute_error(y_train, y_pred_train)
mse = mean_squared_error(y_train, y_pred_train)
rmse = np.sqrt(mse)
print("On training set: RMSE:", rmse, "MAE", mae)

#Prediction(test) 
mae = mean_absolute_error(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)
print("On test set: RMSE:", rmse, "MAE", mae)

On training set: RMSE: 0.0 MAE 0.0
On test set: RMSE: 11.310277371717827 MAE 10.033748505353776


The result totally overfitting, try other method for this model.

# Grid search cv

In [11]:
k_range = list(range(1, 52, 2))
metric = ['euclidean']
param_grid = {'knn__n_neighbors':k_range,'knn__metric':metric}  
print(param_grid)


{'knn__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51], 'knn__metric': ['euclidean']}


In [12]:
pipe = Pipeline(steps=[('ss', StandardScaler()), ('knn', KNeighborsRegressor())])

grid_search = GridSearchCV(pipe,param_grid,
                           n_jobs=-1,verbose=3,
                           cv=KFold(3),scoring='neg_root_mean_squared_error')
print("start fitting")
grid_search.fit(X_train,y_train)
# print('Grid search result:',grid_search.cv_results_) 
print('best_score:',grid_search.best_score_) 
print('best_params:',grid_search.best_params_)  

start fitting
Fitting 3 folds for each of 26 candidates, totalling 78 fits
[CV 2/3] END knn__metric=euclidean, knn__n_neighbors=9;, score=-3.987 total time= 6.1min
[CV 1/3] END knn__metric=euclidean, knn__n_neighbors=21;, score=-3.487 total time= 8.3min
[CV 2/3] END knn__metric=euclidean, knn__n_neighbors=31;, score=-3.915 total time= 8.7min
[CV 3/3] END knn__metric=euclidean, knn__n_neighbors=41;, score=-3.744 total time= 9.3min
[CV 1/3] END knn__metric=euclidean, knn__n_neighbors=11;, score=-3.546 total time= 6.6min
[CV 2/3] END knn__metric=euclidean, knn__n_neighbors=21;, score=-3.921 total time= 8.3min
[CV 3/3] END knn__metric=euclidean, knn__n_neighbors=31;, score=-3.751 total time= 8.6min
[CV 1/3] END knn__metric=euclidean, knn__n_neighbors=43;, score=-3.462 total time= 9.5min
best_score: -3.706170218238256
best_params: {'knn__metric': 'euclidean', 'knn__n_neighbors': 51}
[CV 2/3] END knn__metric=euclidean, knn__n_neighbors=1;, score=-5.067 total time= 2.5min
[CV 3/3] END knn__me

In [13]:
# model = KNeighborsRegressor(metric='euclidean', n_neighbors=40, weights='distance', n_jobs=-1) # best paramater k[1:50]

model = KNeighborsRegressor(metric='euclidean', n_neighbors=51, n_jobs=-1) # best paramater list(range(1, 52, 2))
pipe = Pipeline(steps=[('ss', StandardScaler()), ('knn', model)])
pipe.fit(X_train, y_train)

In [14]:

y_pred_train = pipe.predict(X_train)
y_pred_test  = pipe.predict(X_test)

mae = mean_absolute_error(y_train, y_pred_train)
mse = mean_squared_error(y_train, y_pred_train)
rmse = np.sqrt(mse)
print("On training set: RMSE:", rmse, "MAE", mae)

#Prediction(test) 
mae = mean_absolute_error(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)
print("On test set: RMSE:", rmse, "MAE", mae)

On training set: RMSE: 3.626017043422799 MAE 1.578559362745098
On test set: RMSE: 4.3049023184677955 MAE 1.6339531372549019


In [None]:
# The best knn__n_neighbors is 51 then test on the 51 - 70

In [15]:
k_range = list(range(51, 70, 2))
metric = ['euclidean']
param_grid = {'knn__n_neighbors':k_range,'knn__metric':metric}  
print(param_grid)
 

{'knn__n_neighbors': [51, 53, 55, 57, 59, 61, 63, 65, 67, 69], 'knn__metric': ['euclidean']}


In [16]:
pipe = Pipeline(steps=[('ss', StandardScaler()), ('knn', KNeighborsRegressor())])

grid_search = GridSearchCV(pipe,param_grid,
                           n_jobs=-1,verbose=3,
                           cv=KFold(3),scoring='neg_root_mean_squared_error')
print("start fitting")
grid_search.fit(X_train,y_train)
# print('Grid search result:',grid_search.cv_results_) 
print('best_score:',grid_search.best_score_) 
print('best_params:',grid_search.best_params_) 

start fitting
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 2/3] END knn__metric=euclidean, knn__n_neighbors=59;, score=-3.914 total time=12.2min
[CV 1/3] END knn__metric=euclidean, knn__n_neighbors=61;, score=-3.459 total time=12.3min
best_score: -3.7050443550636296
best_params: {'knn__metric': 'euclidean', 'knn__n_neighbors': 57}


In [17]:
model = KNeighborsRegressor(metric='euclidean', n_neighbors=57, n_jobs=-1) # best paramater list(range(1, 52, 2))
pipe = Pipeline(steps=[('ss', StandardScaler()), ('knn', model)])
pipe.fit(X_train, y_train)

In [18]:
y_pred_train = pipe.predict(X_train)
y_pred_test  = pipe.predict(X_test)

mae = mean_absolute_error(y_train, y_pred_train)
mse = mean_squared_error(y_train, y_pred_train)
rmse = np.sqrt(mse)
print("On training set: RMSE:", rmse, "MAE", mae)

#Prediction(test) 
mae = mean_absolute_error(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)
print("On test set: RMSE:", rmse, "MAE", mae)

[CV 3/3] END knn__metric=euclidean, knn__n_neighbors=51;, score=-3.743 total time=11.3min
[CV 2/3] END knn__metric=euclidean, knn__n_neighbors=61;, score=-3.914 total time=12.1min
[CV 2/3] END knn__metric=euclidean, knn__n_neighbors=51;, score=-3.916 total time=11.4min
[CV 3/3] END knn__metric=euclidean, knn__n_neighbors=61;, score=-3.743 total time=11.9min
[CV 1/3] END knn__metric=euclidean, knn__n_neighbors=51;, score=-3.459 total time=11.5min
[CV 1/3] END knn__metric=euclidean, knn__n_neighbors=63;, score=-3.459 total time=12.2min
[CV 3/3] END knn__metric=euclidean, knn__n_neighbors=53;, score=-3.743 total time=11.5min
[CV 2/3] END knn__metric=euclidean, knn__n_neighbors=63;, score=-3.915 total time=12.1min
[CV 1/3] END knn__metric=euclidean, knn__n_neighbors=53;, score=-3.460 total time=11.6min
[CV 3/3] END knn__metric=euclidean, knn__n_neighbors=63;, score=-3.744 total time=12.0min
[CV 2/3] END knn__metric=euclidean, knn__n_neighbors=53;, score=-3.915 total time=11.6min
[CV 1/3] E

In [19]:
# Result of n_neighbors=57
On training set: RMSE: 3.6338720839780096 MAE 1.5801672368421051
On test set: RMSE: 4.3050481662529725 MAE 1.632191403508772

# Result of n_neighbors=51
On training set: RMSE: 3.626017043422799 MAE 1.578559362745098
On test set: RMSE: 4.3049023184677955 MAE 1.6339531372549019     