In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale 
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Location of the auto-mpg CSV, held in a named constant so it is easy to find and change.
# NOTE(review): this is an absolute path into one user's home directory, so the
# cell only runs on a machine where that exact file exists.
DATA_PATH = r"/Users/mehmetkorkmaz/Applications/venv/auto-mpg.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Clean the loaded frame before modelling:
#   1. turn the '?' placeholder into a real missing value (NaN),
#   2. drop the 'car name' column,
#   3. convert every remaining column to a numeric dtype,
#   4. fill each column's missing entries with that column's median.
#
# Step 3 is the fix. Replacing '?' with NaN does not change the column's dtype:
# a column that contained '?' still holds its numbers as strings. Taking the
# median of such a column depends on pandas silently coercing strings to
# floats, which recent pandas versions refuse with a TypeError, and which
# otherwise leaves the column as object dtype after the fill.
# pd.to_numeric keeps its default errors="raise", so any unexpected
# non-numeric text surfaces as an error instead of being silently imputed.
df = (
    df.replace('?', np.nan)
      .drop('car name', axis=1)
      .apply(pd.to_numeric)
)

# Column-wise median imputation; columns without missing values are unchanged.
df = df.fillna(df.median())

## KNN

In [4]:
# Predictor columns for the KNN models in this section; the target column is "mpg".
feature_cols = ['cylinders', 'displacement', 'horsepower',
                'weight', 'acceleration', 'model year']
X = df[feature_cols]
y = df["mpg"]

# Hold out 25% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
split = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split

In [5]:
# Baseline model: a k-nearest-neighbours regressor built with its default
# constructor arguments and fitted on the training split.
knn_model = KNeighborsRegressor().fit(X_train, y_train)

In [6]:
# Show the fitted estimator as the cell output.
knn_model

In [7]:
# Number of neighbours the baseline model uses (recorded output: 5).
knn_model.n_neighbors

5

In [8]:
# Distance metric the fitted baseline model resolved to (recorded output: 'euclidean').
knn_model.effective_metric_

'euclidean'

In [9]:
# Predictions of the baseline model for the held-out test rows.
y_pred = knn_model.predict(X_test)

In [10]:
# Test-set RMSE of the baseline model: root of the mean squared error between
# the true test targets and the predictions stored in y_pred.
test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(test_mse)

3.72184147969792

In [11]:
# Training-set RMSE for k = 1..10.
# Each model is scored on the same rows it was fitted on, so these figures are
# optimistic (the recorded output shows exactly 0.0 for k = 1).
# NOTE: knn_model and y_pred are rebound on every pass; after this cell they
# hold the k = 10 model and its training-set predictions, not the baseline
# model / test predictions assigned in earlier cells.
RMSE = []

for k in range(1, 11):
    knn_model = KNeighborsRegressor(n_neighbors=k)
    knn_model.fit(X_train, y_train)
    y_pred = knn_model.predict(X_train)
    train_mse = mean_squared_error(y_train, y_pred)
    rmse = np.sqrt(train_mse)
    RMSE.append(rmse)
    print("for k =" , k , "RMSE value: ", rmse)

for k = 1 RMSE value:  0.0
for k = 2 RMSE value:  2.672892445805786
for k = 3 RMSE value:  3.087526322470192
for k = 4 RMSE value:  3.372731962261713
for k = 5 RMSE value:  3.5769920999663216
for k = 6 RMSE value:  3.7422288357621243
for k = 7 RMSE value:  3.7535762976925944
for k = 8 RMSE value:  3.90057416505751
for k = 9 RMSE value:  3.9619435176631477
for k = 10 RMSE value:  3.985354648943096


## Model tuning

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
# Hyper-parameter grid for the search: every integer k from 1 to 29 inclusive.
knn_params = {"n_neighbors": np.arange(1, 30)}

In [14]:
# Unfitted regressor with default settings; it is the estimator passed to the grid search.
knn = KNeighborsRegressor()

In [15]:
# 10-fold cross-validated grid search over the candidate values in knn_params.
# No `scoring` argument is given, so candidates are ranked with the
# estimator's own score method.
# NOTE(review): the later train-vs-CV comparison cell scores with
# "neg_mean_squared_error"; confirm whether the search should use the same metric.
knn_cv_model = GridSearchCV(estimator=knn, param_grid=knn_params, cv=10)

In [16]:
# Run the search using the training split only; the test split is not passed in here.
knn_cv_model.fit(X_train, y_train)

In [17]:
# Value of n_neighbors selected by the grid search.
# NOTE(review): the recorded output (29) is the largest value in the searched
# grid (1..29), so the optimum may lie beyond it; consider widening the grid.
knn_cv_model.best_params_["n_neighbors"]

29

In [18]:
# Training RMSE next to 10-fold cross-validated RMSE for k = 1..10.
# The training figure is measured on the fitted rows themselves; the CV figure
# is computed on the training split with cross_val_score.
# NOTE: knn_model and y_pred are rebound on every pass and end up holding the
# k = 10 model and its training-set predictions.
RMSE = []
RMSE_CV = []

for k in range(1, 11):
    knn_model = KNeighborsRegressor(n_neighbors=k)
    knn_model.fit(X_train, y_train)

    y_pred = knn_model.predict(X_train)
    rmse = np.sqrt(mean_squared_error(y_train, y_pred))

    # The scorer reports negated MSE per fold, so the mean across folds is
    # negated back to a positive MSE before taking the square root.
    neg_mse_folds = cross_val_score(knn_model, X_train, y_train, cv=10,
                                    scoring="neg_mean_squared_error")
    rmse_cv = np.sqrt(-neg_mse_folds.mean())

    RMSE.append(rmse)
    RMSE_CV.append(rmse_cv)
    print("k =" , k , "RMSE value: ", rmse, "RMSE_CV value: ", rmse_cv )


k = 1 RMSE value:  0.0 RMSE_CV value:  5.508766263613518
k = 2 RMSE value:  2.672892445805786 RMSE_CV value:  4.663543279475407
k = 3 RMSE value:  3.087526322470192 RMSE_CV value:  4.508926919655538
k = 4 RMSE value:  3.372731962261713 RMSE_CV value:  4.51803040128069
k = 5 RMSE value:  3.5769920999663216 RMSE_CV value:  4.507817745604431
k = 6 RMSE value:  3.7422288357621243 RMSE_CV value:  4.462828696517178
k = 7 RMSE value:  3.7535762976925944 RMSE_CV value:  4.498560478174291
k = 8 RMSE value:  3.90057416505751 RMSE_CV value:  4.438033886047616
k = 9 RMSE value:  3.9619435176631477 RMSE_CV value:  4.4379404389717685
k = 10 RMSE value:  3.985354648943096 RMSE_CV value:  4.429896388347503


In [19]:
# Build the final model with the n_neighbors value chosen by the grid search.
best_k = knn_cv_model.best_params_["n_neighbors"]
knn_tuned = KNeighborsRegressor(n_neighbors=best_k)

In [20]:
# Fit the tuned model on the training split.
knn_tuned.fit(X_train, y_train)

In [21]:
# Test-set RMSE of the tuned model, for comparison with the baseline test RMSE
# computed earlier in the notebook.
tuned_test_pred = knn_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, tuned_test_pred))

3.5820094799410613