In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Loading the built in dataset
from sklearn.datasets import load_diabetes

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

import warnings
# Suppress every warning for the rest of the session. This keeps cell output
# compact, but it also hides any warnings raised by the libraries imported above.
warnings.filterwarnings("ignore")

## Problem Statement

In [None]:
# Goal: predict a quantitative measure of diabetes disease progression (one year after baseline) from patient features

## Data Gathering/ Data Validation

In [2]:
# The target column holds continuous values, so this is a regression problem.
diabetes = load_diabetes()

# Build the feature frame and attach the target as the 'outcome' column.
diabetes_df = pd.DataFrame(
    data=diabetes.data,
    columns=diabetes.feature_names,
).assign(outcome=diabetes.target)
diabetes_df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,outcome
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930,220.0


## EDA

In [3]:
# Count missing values per column.
diabetes_df.isna().sum()

age        0
sex        0
bmi        0
bp         0
s1         0
s2         0
s3         0
s4         0
s5         0
s6         0
outcome    0
dtype: int64

In [4]:
# Show the dtype of each column.
diabetes_df.dtypes

age        float64
sex        float64
bmi        float64
bp         float64
s1         float64
s2         float64
s3         float64
s4         float64
s5         float64
s6         float64
outcome    float64
dtype: object

In [5]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   age      442 non-null    float64
 1   sex      442 non-null    float64
 2   bmi      442 non-null    float64
 3   bp       442 non-null    float64
 4   s1       442 non-null    float64
 5   s2       442 non-null    float64
 6   s3       442 non-null    float64
 7   s4       442 non-null    float64
 8   s5       442 non-null    float64
 9   s6       442 non-null    float64
 10  outcome  442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
diabetes_df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,outcome
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-1.444295e-18,2.543215e-18,-2.255925e-16,-4.854086e-17,-1.428596e-17,3.898811e-17,-6.0283600000000005e-18,-1.7881000000000002e-17,9.243486e-17,1.3517700000000002e-17,152.133484
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,77.093005
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260971,-0.1377672,25.0
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665608,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324559,-0.03317903,87.0
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947171,-0.001077698,140.5
75%,0.03807591,0.05068012,0.03124802,0.03564379,0.02835801,0.02984439,0.0293115,0.03430886,0.03243232,0.02791705,211.5
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137,0.198788,0.1811791,0.1852344,0.1335973,0.1356118,346.0


## Feature engineering

## Feature Selection

## Model training and model selection

In [8]:
# Separate the predictors from the target column.
x = diabetes_df.drop(columns=['outcome'])
y = diabetes_df['outcome']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=45
)

#### 1. Linear Regression

In [10]:
# Fit an ordinary least-squares linear regression on the training split.
lr_model = LinearRegression()
lr_model.fit(X=x_train, y=y_train)

In [11]:
# Linear regression: error metrics on the training split.
y_pred_train = lr_model.predict(x_train)
mse = mean_squared_error(y_train, y_pred_train)
rmse = np.sqrt(mse)  # same units as the target
mae = mean_absolute_error(y_train, y_pred_train)
r2value = r2_score(y_train, y_pred_train)

# Report in a fixed order: MSE, RMSE, MAE, R2.
print("MSE : ", mse)
print("RMSE : ", rmse)
print("MAE : ", mae)
print("R2 Score : ", r2value)

MSE :  2991.423365209392
RMSE :  54.69390610670801
MAE :  44.300039445291624
R2 Score :  0.5159857758740993


In [12]:
# Linear regression: error metrics on the held-out test split.
y_pred_test = lr_model.predict(x_test)
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)  # same units as the target
mae = mean_absolute_error(y_test, y_pred_test)
r2value = r2_score(y_test, y_pred_test)

# Report in a fixed order: MSE, RMSE, MAE, R2.
print("MSE : ", mse)
print("RMSE : ", rmse)
print("MAE : ", mae)
print("R2 Score : ", r2value)

MSE :  2374.333939618319
RMSE :  48.72713760953253
MAE :  39.3845728615384
R2 Score :  0.5188113124539249


#### 2. KNN regressor

In [13]:
# Baseline KNN regressor with the library defaults spelled out (k=5, Minkowski p=2).
knn_model = KNeighborsRegressor(n_neighbors=5, p=2)
knn_model.fit(X=x_train, y=y_train)

In [14]:
# Baseline KNN: error metrics on the training split.
y_pred_train = knn_model.predict(x_train)
mse = mean_squared_error(y_train, y_pred_train)
rmse = np.sqrt(mse)  # same units as the target
mae = mean_absolute_error(y_train, y_pred_train)
r2value = r2_score(y_train, y_pred_train)

# Report in a fixed order: MSE, RMSE, MAE, R2.
print("MSE : ", mse)
print("RMSE : ", rmse)
print("MAE : ", mae)
print("R2 Score : ", r2value)

MSE :  2521.8155240793203
RMSE :  50.21768138892237
MAE :  39.314447592067985
R2 Score :  0.5919686265503027


In [15]:
# Baseline KNN: error metrics on the held-out test split.
y_pred_test = knn_model.predict(x_test)
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)  # same units as the target
mae = mean_absolute_error(y_test, y_pred_test)
r2value = r2_score(y_test, y_pred_test)

# Report in a fixed order: MSE, RMSE, MAE, R2.
print("MSE : ", mse)
print("RMSE : ", rmse)
print("MAE : ", mae)
print("R2 Score : ", r2value)

MSE :  3112.0750561797754
RMSE :  55.78597544347303
MAE :  43.77752808988763
R2 Score :  0.36929878024287066


### Hyperparameter Tuning

#### 1. GridSearchCV

In [21]:
# Search space: neighbour counts 3..29 and Minkowski power p in {1, 2}.
hyperparameter = dict(
    n_neighbors=np.arange(3, 30),
    p=[1, 2],
)

# Exhaustive search over the grid with 5-fold cross-validation on the training split.
gscv_obj = GridSearchCV(estimator=knn_model, param_grid=hyperparameter, cv=5)
gscv_obj.fit(x_train, y_train)
gscv_obj.best_params_

{'n_neighbors': 14, 'p': 2}

In [24]:
# Build the tuned KNN regressor directly from the grid-search result instead of
# hard-coding the winning values, so this cell cannot drift out of sync when the
# search space, data or split changes.
knn_model_01 = KNeighborsRegressor(**gscv_obj.best_params_)
knn_model_01.fit(x_train,y_train)

In [25]:
# Tuned KNN: error metrics on the training split.
y_pred_train = knn_model_01.predict(x_train)
mse = mean_squared_error(y_train, y_pred_train)
rmse = np.sqrt(mse)  # same units as the target
mae = mean_absolute_error(y_train, y_pred_train)
r2value = r2_score(y_train, y_pred_train)

# Report in a fixed order: MSE, RMSE, MAE, R2.
print("MSE : ", mse)
print("RMSE : ", rmse)
print("MAE : ", mae)
print("R2 Score : ", r2value)

MSE :  2909.9341937908307
RMSE :  53.94380588900667
MAE :  43.29603399433428
R2 Score :  0.5291707762112404


In [31]:
# Tuned KNN: error metrics on the held-out test split.
y_pred_test = knn_model_01.predict(x_test)
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)  # same units as the target
mae = mean_absolute_error(y_test, y_pred_test)
r2value = r2_score(y_test, y_pred_test)

# Report in a fixed order: MSE, RMSE, MAE, R2.
print("MSE : ", mse)
print("RMSE : ", rmse)
print("MAE : ", mae)
print("R2 Score : ", r2value)

MSE :  2817.6844760376057
RMSE :  53.081865792731946
MAE :  40.800963081861966
R2 Score :  0.42896074039128596


In [27]:
import pickle
import json

In [29]:
# Record the feature column names, in training order, as a JSON-serialisable dict.
data = {'columns': x_train.columns.tolist()}
data

{'columns': ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']}

In [32]:
# Persist the tuned KNN regressor so the artifact matches its file name
# ("KNN_model.pkl"); pickling the linear model here would mislabel the file.
# NOTE(review): if the linear model is the intended artifact, rename the file instead.
with open("KNN_model.pkl","wb") as file:
    pickle.dump(knn_model_01,file)

# Store the feature column names next to the model file.
with open("Data.json",'w') as file1:
    json.dump(data,file1)