# KNN Regression

In [1]:
from warnings import filterwarnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by later
# cells are hidden as well.
filterwarnings('ignore')

### Read IRIS dataset

In [2]:
import pandas as pd
# Load the dataset from a CSV given by relative path.
df = pd.read_csv('iris.csv')
# Bare last expression: render the frame as the cell output.
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [3]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [4]:
# Number of missing values in each column.
s = df.isna().sum()
s

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [5]:
# Keep only the columns that have at least one missing value;
# an empty result means nothing is missing.
s[s>0]

Series([], dtype: int64)

### Separating X and Y

In [6]:
# Regression target: petal width, kept as a one-column DataFrame.
Y = df.loc[:, ['petal_width']]
# Features: every remaining column (including the categorical species).
X = df.drop(columns=['petal_width'])

In [7]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,species
0,5.1,3.5,1.4,setosa
1,4.9,3.0,1.4,setosa
2,4.7,3.2,1.3,setosa
3,4.6,3.1,1.5,setosa
4,5.0,3.6,1.4,setosa


In [8]:
# Preview the first rows of the target frame.
Y.head()

Unnamed: 0,petal_width
0,0.2
1,0.2
2,0.2
3,0.2
4,0.2


### Categorical / Continuous Feature Separation

In [9]:
# Split feature names by dtype: object columns are treated as categorical,
# everything else as continuous.
cat = [name for name in X.columns if X[name].dtype == 'object']
con = [name for name in X.columns if X[name].dtype != 'object']

In [11]:
# Show the categorical feature names.
cat

['species']

In [12]:
# Show the continuous feature names.
con

['sepal_length', 'sepal_width', 'petal_length']

### Build the sklearn pipeline

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [14]:
# Continuous branch: fill gaps with the column mean, then standardise.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen at fit time are ignored rather than raising.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
pre = ColumnTransformer(transformers=[
    ('num', num_pipe, con),
    ('cat', cat_pipe, cat),
])

In [15]:
# Fit the preprocessor on the full feature frame and transform it in one step.
# NOTE(review): this runs before the train/test split below, so the imputer and
# scaler statistics are learned from rows that later land in the test set.
# Consider fitting on the training rows only.
X_pre = pre.fit_transform(X)
X_pre

array([[-0.90068117,  1.01900435, -1.34022653,  1.        ,  0.        ,
         0.        ],
       [-1.14301691, -0.13197948, -1.34022653,  1.        ,  0.        ,
         0.        ],
       [-1.38535265,  0.32841405, -1.39706395,  1.        ,  0.        ,
         0.        ],
       [-1.50652052,  0.09821729, -1.2833891 ,  1.        ,  0.        ,
         0.        ],
       [-1.02184904,  1.24920112, -1.34022653,  1.        ,  0.        ,
         0.        ],
       [-0.53717756,  1.93979142, -1.16971425,  1.        ,  0.        ,
         0.        ],
       [-1.50652052,  0.78880759, -1.34022653,  1.        ,  0.        ,
         0.        ],
       [-1.02184904,  0.78880759, -1.2833891 ,  1.        ,  0.        ,
         0.        ],
       [-1.74885626, -0.36217625, -1.34022653,  1.        ,  0.        ,
         0.        ],
       [-1.14301691,  0.09821729, -1.2833891 ,  1.        ,  0.        ,
         0.        ],
       [-0.53717756,  1.47939788, -1.2833891 ,  1.

In [16]:
# Output column names of the fitted preprocessor, in transformed-column order.
cols = pre.get_feature_names_out()
cols

array(['num__sepal_length', 'num__sepal_width', 'num__petal_length',
       'cat__species_setosa', 'cat__species_versicolor',
       'cat__species_virginica'], dtype=object)

In [17]:
# Wrap the transformed array in a DataFrame labelled with the feature names.
# Note: this rebinds X_pre from the array produced earlier to a DataFrame.
X_pre = pd.DataFrame(X_pre,columns=cols)
X_pre.head()

Unnamed: 0,num__sepal_length,num__sepal_width,num__petal_length,cat__species_setosa,cat__species_versicolor,cat__species_virginica
0,-0.900681,1.019004,-1.340227,1.0,0.0,0.0
1,-1.143017,-0.131979,-1.340227,1.0,0.0,0.0
2,-1.385353,0.328414,-1.397064,1.0,0.0,0.0
3,-1.506521,0.098217,-1.283389,1.0,0.0,0.0
4,-1.021849,1.249201,-1.340227,1.0,0.0,0.0


### Train Test Split

In [19]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_pre,
    Y,
    test_size=0.33,
    random_state=42,
)

In [20]:
# (rows, columns) of the training features.
xtrain.shape

(100, 6)

In [21]:
# (rows, columns) of the test features.
xtest.shape

(50, 6)

### Build and tune the KNN regressor

In [18]:
# Candidate neighbour counts to search over: 2 through 9 inclusive.
params = dict(n_neighbors=list(range(2, 10)))

In [23]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV
knnr = KNeighborsRegressor()
# Search n_neighbors with 5-fold cross-validation, scored by negative MSE
# (higher is better, so the best candidate has the score closest to zero).
# n_iter is set to the number of candidates: the default of 10 exceeds this
# 8-value grid, which makes scikit-learn warn and fall back to trying every
# value anyway. random_state pins the order in which candidates are drawn so
# repeated runs evaluate them identically.
rscv = RandomizedSearchCV(knnr,
                          param_distributions=params,
                          n_iter=len(params['n_neighbors']),
                          cv=5,
                          scoring='neg_mean_squared_error',
                          random_state=42)
rscv.fit(xtrain,ytrain)

In [24]:
# Hyperparameter setting that achieved the best cross-validated score.
rscv.best_params_

{'n_neighbors': 3}

In [25]:
# Best mean cross-validated score; negative because the search was scored
# with 'neg_mean_squared_error'.
rscv.best_score_

-0.03272222222222222

In [26]:
# Keep the estimator from the search that used the best parameters.
best_knnr = rscv.best_estimator_
best_knnr

### Evaluate R² scores

In [27]:
# Score of the tuned model on the training split (R² for a regressor).
best_knnr.score(xtrain,ytrain)

0.9735458829172137

In [28]:
# Score of the tuned model on the held-out test split (R² for a regressor).
best_knnr.score(xtest,ytest)

0.9505024588411375

### Predict values for training and testing

In [29]:
# Predicted petal widths for the training and test splits.
ypred_tr = best_knnr.predict(xtrain)
ypred_ts = best_knnr.predict(xtest)

In [31]:
# First five training predictions, for comparison with ytrain.head().
ypred_tr[0:5]

array([[1.26666667],
       [2.06666667],
       [1.33333333],
       [0.26666667],
       [2.        ]])

In [32]:
# First five true training targets.
ytrain.head()

Unnamed: 0,petal_width
96,1.3
105,2.1
66,1.5
0,0.2
122,2.0


In [35]:
# First five test predictions, for comparison with ytest.head().
ypred_ts[0:5]

array([[1.43333333],
       [0.26666667],
       [1.9       ],
       [1.36666667],
       [1.5       ]])

In [36]:
# First five true test targets.
ytest.head()

Unnamed: 0,petal_width
73,1.2
18,0.3
118,2.3
78,1.5
76,1.4


### Evaluate all the scores

In [37]:
# evaluate_model comes from the project-local custom_def module (not shown here).
# Judging by the recorded output it prints MSE, RMSE, MAE and R2 for the
# training and testing splits -- TODO confirm against custom_def.
from custom_def import evaluate_model
evaluate_model(xtrain,ytrain,xtest,ytest,best_knnr)

Training Results:
MSE  : 0.01
RMSE : 0.12
MAE  : 0.09
R2   : 0.9735


Testing Results:
MSE  : 0.03
RMSE : 0.18
MAE  : 0.13
R2   : 0.9505


In [38]:
# r2_adj comes from the project-local custom_def module (not shown here);
# presumably it returns the adjusted R² of the model on the given data --
# TODO confirm against custom_def.
from custom_def import r2_adj
r2_adj(xtrain,ytrain,best_knnr)

0.9718391656860662

In [39]:
# Raw feature column names; a new data point must supply exactly these.
X.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'species'], dtype='object')

### Predict a new data point

In [40]:
def predict_data(model, pre):
    """Interactively predict petal width for one flower typed in by the user.

    Prompts for sepal length, sepal width, petal length and species, builds a
    one-row DataFrame using the raw feature column names, passes it through
    the preprocessor and asks the model for a prediction. The raw and the
    preprocessed frames are displayed and the prediction is printed.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    pre : object
        Fitted preprocessor exposing ``transform`` and
        ``get_feature_names_out``.

    Returns
    -------
    The value returned by ``model.predict`` for the single preprocessed row.

    Raises
    ------
    ValueError
        If any of the three measurements cannot be parsed as a float.
    """
    sep_len = float(input('Please enter Sepal Length : '))
    sep_wid = float(input('Please enter Sepal Width : '))
    pet_len = float(input('Please enter petal length : '))
    # Strip stray whitespace so a correctly spelled species still matches the
    # categories the encoder was fitted on.
    spe = input('Please Enter the species : ').strip()
    # Build the row column-by-column: a mixed list transposed into a frame
    # would turn every column (including the numeric ones) into object dtype.
    xnew = pd.DataFrame({'sepal_length': [sep_len],
                         'sepal_width': [sep_wid],
                         'petal_length': [pet_len],
                         'species': [spe]})
    print('DataFrame Before Preprocessing : \n')
    display(xnew)
    xnew_pre = pre.transform(xnew)
    xnew_pre = pd.DataFrame(xnew_pre,columns=pre.get_feature_names_out())
    print('DataFrame After Preprocessing : \n')
    display(xnew_pre)
    pred = model.predict(xnew_pre)
    print(f'Predicted Petal Width is : {pred}')
    return pred

In [41]:
# Interactive demo: prompts for one flower's measurements and species, then
# predicts its petal width with the tuned model and fitted preprocessor.
predict_data(best_knnr,pre)

DataFrame Before Preprocessing : 



Unnamed: 0,sepal_length,sepal_width,petal_length,species
0,6.2,2.2,4.5,versicolor


DataFrame After Preprocessing : 



Unnamed: 0,num__sepal_length,num__sepal_width,num__petal_length,cat__species_setosa,cat__species_versicolor,cat__species_virginica
0,0.432165,-1.973554,0.421734,0.0,1.0,0.0


Predicted Petal Width is : [[1.26666667]]


array([[1.26666667]])