In this notebook:
- We will use the `sklearn.datasets.load_diabetes` dataset
- Fit diferent regression models to this dataset
- Optimize the model hyperparameters to fine tune our models

## Import modules

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import (LinearRegression,
                                    ElasticNet,
                                    BayesianRidge,
                                    Lasso,
                                    LassoCV)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import (r2_score,
                                mean_absolute_error,
                                mean_squared_error,
                                root_mean_squared_error,
                                max_error)
import warnings
warnings.filterwarnings('ignore')

## Dataset for regression

Diabetes dataset from scikit-learn

In [2]:
diabetes = load_diabetes()

In [3]:
diabetes.data.shape

(442, 10)

In [4]:
diabetes.target.shape

(442,)

In [5]:
diabetes.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
                                                    diabetes.target,
                                                    test_size=0.20,
                                                    random_state=101)

## Regresion without hyperparameter optimization

In [7]:
models = {'Linear Regresion': LinearRegression(),
          'Lasso': Lasso(),
          'DecisionTreeRegressor': DecisionTreeRegressor(),
          'Elastic Net': ElasticNet(),
          'SVR': SVR()
         }

In [8]:
for model_name, model in models.items(): 
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    
    print(model_name,":")
    print("R2 score =", r2)
    print("Root mean squared error =", rmse)
    print("Coefficient of determination =", model.score(X_test, y_test))
    print("Mean absolute error =", mean_absolute_error(y_test, y_pred))
    print("Mean squared error =", mean_squared_error(y_test, y_pred))
    print("Maximum error =", max_error(y_test, y_pred))
    print('.'*50)

Linear Regresion :
R2 score = 0.5651503625247858
Root mean squared error = 52.71224226668631
Coefficient of determination = 0.5651503625247858
Mean absolute error = 42.43427461237549
Mean squared error = 2778.580484781831
Maximum error = 156.1655182278922
..................................................
Lasso :
R2 score = 0.3719842416281822
Root mean squared error = 63.3471681441053
Coefficient of determination = 0.3719842416281822
Mean absolute error = 53.28710847651649
Mean squared error = 4012.8637118775487
Maximum error = 138.41413742758016
..................................................
DecisionTreeRegressor :
R2 score = 0.2295701853330384
Root mean squared error = 70.16305247481947
Coefficient of determination = 0.2295701853330384
Mean absolute error = 56.78651685393258
Mean squared error = 4922.853932584269
Maximum error = 172.0
..................................................
Elastic Net :
R2 score = 0.007967734647177038
Root mean squared error = 79.61682107977181
Coeffi

## Hyperparameters optimization

#### ElasticNet

In [9]:
time_start = datetime.now()

params = {'alpha': np.linspace(0.0001,1,100),
          'l1_ratio' : np.linspace(0,1,10),
          'selection' : ['cyclic', 'random']
          }

elastic_net = ElasticNet()
gscv = GridSearchCV(estimator=elastic_net, 
                    param_grid=params, 
                    cv=20,
                    scoring='neg_mean_squared_error')
gscv.fit(X_train, y_train)
print("Best Estimator :", gscv.best_estimator_)

model = gscv.best_estimator_
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("R2 score =", r2)
print("Root mean squared error =", rmse)
print("Coefficient of determination =", model.score(X_test, y_test))
print("Mean absolute error =", mean_absolute_error(y_test, y_pred))
print("Mean squared error =", mean_squared_error(y_test, y_pred))
print("Maximum error =", max_error(y_test, y_pred))

time_stop = datetime.now()
delta = time_stop - time_start
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

result1 = ('ElasticNet', r2, rmse, seconds)

Best Estimator : ElasticNet(alpha=0.0001, l1_ratio=0.0)
R2 score = 0.559687042454662
Root mean squared error = 53.04233907335097
Coefficient of determination = 0.559687042454662
Mean absolute error = 42.870603422306786
Mean squared error = 2813.4897343723355
Maximum error = 150.9457214013692
Time to compute:  56.693178 seconds


#### BayesianRidge

In [10]:
time_start = datetime.now()

params = {'alpha_1': [1e-03,1e-04,1e-05,1e-06,1e-07],
          'alpha_2' : [1e-03,1e-04,1e-05,1e-06,1e-07],
          'lambda_1' : [1e-03,1e-04,1e-05,1e-06,1e-07],
          'lambda_2' : [1e-03,1e-04,1e-05,1e-06,1e-07]
          }

baye_ridg = BayesianRidge()
gscv = GridSearchCV(estimator=baye_ridg, 
                    param_grid=params, 
                    cv=20,
                    scoring='neg_mean_squared_error')
gscv.fit(X_train, y_train)
print("Best Estimator :", gscv.best_estimator_)

model = gscv.best_estimator_
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("R2 score =", r2)
print("Root mean squared error =", rmse)
print("Coefficient of determination =", model.score(X_test, y_test))
print("Mean absolute error =", mean_absolute_error(y_test, y_pred))
print("Mean squared error =", mean_squared_error(y_test, y_pred))
print("Maximum error =", max_error(y_test, y_pred))

time_stop = datetime.now()
delta = time_stop - time_start
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

result2 = ('BayesianRidge', r2, rmse, seconds)

Best Estimator : BayesianRidge(alpha_1=1e-07, alpha_2=0.001, lambda_1=0.001, lambda_2=1e-07)
R2 score = 0.5597045445884017
Root mean squared error = 53.04128486483638
Coefficient of determination = 0.5597045445884017
Mean absolute error = 42.86821875253007
Mean squared error = 2813.3779001127205
Maximum error = 151.0234237219454
Time to compute:  19.550008 seconds


#### KNeighborsRegressor

In [11]:
time_start = datetime.now()

params = {'n_neighbors': [3,4,5,6,7,8,9,10,11,12],
          'weights': ['uniform', 'distance'], 
          'p': [1, 2]  
          }

knn = KNeighborsRegressor()
gscv = GridSearchCV(estimator=knn, 
                    param_grid=params, 
                    cv=20,
                    scoring='neg_mean_squared_error')
gscv.fit(X_train, y_train)
print("Best Estimator :", gscv.best_estimator_)

model = gscv.best_estimator_
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("R2 score =", r2)
print("Root mean squared error =", rmse)
print("Coefficient of determination =", model.score(X_test, y_test))
print("Mean absolute error =", mean_absolute_error(y_test, y_pred))
print("Mean squared error =", mean_squared_error(y_test, y_pred))
print("Maximum error =", max_error(y_test, y_pred))

time_stop = datetime.now()
delta = time_stop - time_start
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

result3 = ('KNN', r2, rmse, seconds)

Best Estimator : KNeighborsRegressor(n_neighbors=12, weights='distance')
R2 score = 0.556223541896756
Root mean squared error = 53.25054588613651
Coefficient of determination = 0.556223541896756
Mean absolute error = 41.01160971319077
Mean squared error = 2835.6206371715293
Maximum error = 143.88626759243783
Time to compute:  1.117267 seconds


#### SVR

In [12]:
time_start = datetime.now()

params = {'C': [1, 10, 100],
        'gamma': [0.01, 0.1, 1],
        'kernel': ['linear', 'rbf'],   
       }

kfold = KFold(n_splits=5, random_state=1, shuffle=True)

svr = SVR()
gscv = GridSearchCV(estimator=svr,
                    param_grid=params,
                    cv=kfold,
                    scoring='neg_mean_squared_error')
gscv.fit(X_train, y_train)
print("Best Estimator :", gscv.best_estimator_)

model = gscv.best_estimator_
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("R2 score =", r2)
print("Root mean squared error =", rmse)
print("Coefficient of determination =", model.score(X_test, y_test))
print("Mean absolute error =", mean_absolute_error(y_test, y_pred))
print("Mean squared error =", mean_squared_error(y_test, y_pred))
print("Maximum error =", max_error(y_test, y_pred))

time_stop = datetime.now()
delta = time_stop - time_start
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

result4 = ('SVR', r2, rmse, seconds)

Best Estimator : SVR(C=100, gamma=1)
R2 score = 0.5434083888700001
Root mean squared error = 54.01394506633181
Coefficient of determination = 0.5434083888700001
Mean absolute error = 43.788349919203256
Mean squared error = 2917.5062616287105
Maximum error = 132.12652964377392
Time to compute:  0.482517 seconds


#### DecisionTreeRegressor

In [13]:
time_start = datetime.now()

params = {'max_depth': np.linspace(1, 10, 10).astype("int"),
          'max_features': ['sqrt', 'log2'],
          'min_samples_leaf': np.linspace(1, 10, 10).astype("int"),
          'min_samples_split': [0.1, 0.2, 0.3],
          'criterion': ['squared_error']
         }
kfold = KFold(n_splits=5, random_state=1, shuffle=True)

decision_tree = DecisionTreeRegressor()
gscv = GridSearchCV(estimator=decision_tree,
                    param_grid=params,
                    cv=kfold,
                    scoring='neg_mean_squared_error')
gscv.fit(X_train, y_train)
print("Best Estimator :", gscv.best_estimator_)

model = gscv.best_estimator_
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("R2 score =", r2)
print("Root mean squared error =", rmse)
print("Coefficient of determination =", model.score(X_test, y_test))
print("Mean absolute error =", mean_absolute_error(y_test, y_pred))
print("Mean squared error =", mean_squared_error(y_test, y_pred))
print("Maximum error =", max_error(y_test, y_pred))

time_stop = datetime.now()
delta = time_stop - time_start
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

result5 = ('DecisionTreeRegressor', r2, rmse, seconds)

Best Estimator : DecisionTreeRegressor(max_depth=9, max_features='sqrt', min_samples_leaf=10,
                      min_samples_split=0.2)
R2 score = 0.4997499362386807
Root mean squared error = 56.53735809805423
Coefficient of determination = 0.4997499362386807
Mean absolute error = 46.7137324046993
Mean squared error = 3196.4728607076177
Maximum error = 173.44444444444446
Time to compute:  3.282655 seconds


#### RandomForestRegressor

In [14]:
time_start = datetime.now()

params = {'max_depth': np.linspace(1, 10, 10).astype("int"),
          'max_features': ['sqrt', 'log2'],
          'min_samples_leaf': np.linspace(1, 10, 10).astype("int"),
          'min_samples_split': [0.1, 0.2, 0.3],
          'criterion': ['squared_error']
         }
kfold = KFold(n_splits=5, random_state=1, shuffle=True)

rand_forest = RandomForestRegressor()
gscv = GridSearchCV(estimator=rand_forest,
                    param_grid=params,
                    cv=kfold,
                    scoring='neg_mean_squared_error')
gscv.fit(X_train, y_train)
print("Best Estimator :", gscv.best_estimator_)

model = gscv.best_estimator_
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("R2 score =", r2)
print("Root mean squared error =", rmse)
print("Coefficient of determination =", model.score(X_test, y_test))
print("Mean absolute error =", mean_absolute_error(y_test, y_pred))
print("Mean squared error =", mean_squared_error(y_test, y_pred))
print("Maximum error =", max_error(y_test, y_pred))

time_stop = datetime.now()
delta = time_stop - time_start
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

result6 = ('RandomForestRegressor', r2, rmse, seconds)

Best Estimator : RandomForestRegressor(max_depth=7, max_features='log2', min_samples_leaf=2,
                      min_samples_split=0.1)
R2 score = 0.5231122295870623
Root mean squared error = 55.2013917039004
Coefficient of determination = 0.5231122295870623
Mean absolute error = 45.19262210087315
Mean squared error = 3047.1936460474435
Maximum error = 154.17298439608723
Time to compute:  241.134673 seconds


#### Lasso

In [15]:
time_start = datetime.now()

params = {'alpha': np.linspace(0.0001,1,100),
          'selection' : ['cyclic']
          }

lasso = Lasso()
gscv = GridSearchCV(estimator=lasso, 
                    param_grid=params, 
                    cv=20,
                    scoring='neg_mean_squared_error')
gscv.fit(X_train, y_train)
print("Best Estimator :", gscv.best_estimator_)

model = gscv.best_estimator_
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("R2 score =", r2)
print("Root mean squared error =", rmse)
print("Coefficient of determination =", model.score(X_test, y_test))
print("Mean absolute error =", mean_absolute_error(y_test, y_pred))
print("Mean squared error =", mean_squared_error(y_test, y_pred))
print("Maximum error =", max_error(y_test, y_pred))

time_stop = datetime.now()
delta = time_stop - time_start
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

result7 = ('Lasso', r2, rmse, seconds)

Best Estimator : Lasso(alpha=0.0203)
R2 score = 0.55896639327956
Root mean squared error = 53.085727860336334
Coefficient of determination = 0.55896639327956
Mean absolute error = 42.909462057320745
Mean squared error = 2818.094502461689
Maximum error = 152.87233131326207
Time to compute:  2.248498 seconds


#### LassoCV

In [16]:
time_start = datetime.now()

model = LassoCV(alphas=np.linspace(0.0001,1,100),
                cv=20,
                selection='cyclic').fit(X_train, y_train)

y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("Fitted alpha =", model.alpha_)
print("R2 score =", r2)
print("Root mean squared error =", rmse)
print("Coefficient of determination =", model.score(X_test, y_test))
print("Mean absolute error =", mean_absolute_error(y_test, y_pred))
print("Mean squared error =", mean_squared_error(y_test, y_pred))
print("Maximum error =", max_error(y_test, y_pred))

time_stop = datetime.now()
delta = time_stop - time_start
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

result8 = ('LassoCV', r2, rmse, seconds)

Fitted alpha = 0.0203
R2 score = 0.55896639327956
Root mean squared error = 53.085727860336334
Coefficient of determination = 0.55896639327956
Mean absolute error = 42.909462057320745
Mean squared error = 2818.094502461689
Maximum error = 152.87233131326207
Time to compute:  0.174258 seconds


## Performance report 

In [17]:
df = pd.DataFrame([result1,
                   result2,
                   result3,
                   result4,
                   result5,
                   result6,
                   result7],
                  columns=['model', 'R2_score','RMSE','computing_time'])
df.sort_values('R2_score', ascending=False)

Unnamed: 0,model,R2_score,RMSE,computing_time
1,BayesianRidge,0.559705,53.041285,19.550008
0,ElasticNet,0.559687,53.042339,56.693178
6,Lasso,0.558966,53.085728,2.248498
2,KNN,0.556224,53.250546,1.117267
3,SVR,0.543408,54.013945,0.482517
5,RandomForestRegressor,0.523112,55.201392,241.134673
4,DecisionTreeRegressor,0.49975,56.537358,3.282655
