In this notebook:
- We will generate a random regression dataset with `sklearn.datasets.make_regression`
- Fit diferent regression models to this dataset
- Optimize the model hyperparameters to fine tune our models

## Import modules

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import (LinearRegression,
                                    Lasso,
                                    ElasticNet,
                                    BayesianRidge)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import (r2_score,
                                mean_absolute_error,
                                mean_squared_error,
                                root_mean_squared_error,
                                max_error)
import warnings
warnings.filterwarnings('ignore')

## Dataset for regression

A random regression dataset with a specified number of samples, features, and noise

In [2]:
X, y = make_regression(n_samples=2000, n_features=5, noise=0.1, random_state=101)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

## Regresion without hyperparameter optimization

In [4]:
models = {'Linear Regresion': LinearRegression(),
          'Lasso': Lasso(),
          'DecisionTreeRegressor': DecisionTreeRegressor(),
          'Elastic Net': ElasticNet(),
          'SVR': SVR()
         }

In [5]:
for model_name, model in models.items(): 
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print(model_name,":")
    print("Coefficient of determination =", model.score(X_test, y_test))
    print("Mean absolute error =", mean_absolute_error(y_test, y_pred))
    print("Mean squared error =", mean_squared_error(y_test, y_pred))
    print("Root mean squared error =", root_mean_squared_error(y_test, y_pred))
    print("Maximum error =", max_error(y_test, y_pred))
    print('.'*50)

Linear Regresion :
Coefficient of determination = 0.9999995076279349
Mean absolute error = 0.0754229237603947
Mean squared error = 0.008847068799527628
Root mean squared error = 0.09405885816619096
Maximum error = 0.33131109978414486
..................................................
Lasso :
Coefficient of determination = 0.9996939079495237
Mean absolute error = 1.8697153508200814
Mean squared error = 5.499941246721452
Root mean squared error = 2.3451953536371875
Maximum error = 8.412865439454833
..................................................
DecisionTreeRegressor :
Coefficient of determination = 0.8302701591900851
Mean absolute error = 42.05758784810864
Mean squared error = 3049.749742985566
Root mean squared error = 55.22453931890755
Maximum error = 247.49855009467223
..................................................
Elastic Net :
Coefficient of determination = 0.8795577227523895
Mean absolute error = 36.83003436250544
Mean squared error = 2164.1380344654103
Root mean squared er

## Hyperparameters optimization

#### Lasso

In [6]:
time_start = datetime.now()

params = {'alpha': np.linspace(0.0001,1,100),
          'selection' : ['cyclic', 'random']
          }

lasso = Lasso()
gscv = GridSearchCV(estimator=lasso, 
                    param_grid=params, 
                    cv=20,
                    scoring='neg_mean_squared_error')
gscv.fit(X_train, y_train)
print("Best Estimator :", gscv.best_estimator_)

model = gscv.best_estimator_
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("R2 score =", r2)
print("Root mean squared error =", rmse)
print("Coefficient of determination =", model.score(X_test, y_test))
print("Mean absolute error =", mean_absolute_error(y_test, y_pred))
print("Mean squared error =", mean_squared_error(y_test, y_pred))
print("Maximum error =", max_error(y_test, y_pred))

time_stop = datetime.now()
delta = time_stop - time_start
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

result0 = ('Lasso', r2, rmse, seconds)

Best Estimator : Lasso(alpha=0.0001)
R2 score = 0.9999995076375631
Root mean squared error = 0.09405793851368871
Coefficient of determination = 0.9999995076375631
Mean absolute error = 0.07542469592008828
Mean squared error = 0.008846895797444845
Maximum error = 0.3314408488294944
Time to compute:  4.605217 seconds


#### ElasticNet

In [7]:
time_start = datetime.now()

params = {'alpha': np.linspace(0.0001,1,100),
          'l1_ratio' : np.linspace(0,1,10),
          'selection' : ['cyclic', 'random']
          }

elastic_net = ElasticNet()
gscv = GridSearchCV(estimator=elastic_net, 
                    param_grid=params, 
                    cv=20,
                    scoring='neg_mean_squared_error')
gscv.fit(X_train, y_train)
print("Best Estimator :", gscv.best_estimator_)

model = gscv.best_estimator_
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("R2 score =", r2)
print("Root mean squared error =", rmse)
print("Coefficient of determination =", model.score(X_test, y_test))
print("Mean absolute error =", mean_absolute_error(y_test, y_pred))
print("Mean squared error =", mean_squared_error(y_test, y_pred))
print("Maximum error =", max_error(y_test, y_pred))

time_stop = datetime.now()
delta = time_stop - time_start
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

result1 = ('ElasticNet', r2, rmse, seconds)

Best Estimator : ElasticNet(alpha=0.0001, l1_ratio=1.0)
R2 score = 0.9999995076375631
Root mean squared error = 0.09405793851368871
Coefficient of determination = 0.9999995076375631
Mean absolute error = 0.07542469592008828
Mean squared error = 0.008846895797444845
Maximum error = 0.3314408488294944
Time to compute:  65.757792 seconds


#### BayesianRidge

In [8]:
time_start = datetime.now()

params = {'alpha_1': [1e-03,1e-04,1e-05,1e-06,1e-07],
          'alpha_2' : [1e-03,1e-04,1e-05,1e-06,1e-07],
          'lambda_1' : [1e-03,1e-04,1e-05,1e-06,1e-07],
          'lambda_2' : [1e-03,1e-04,1e-05,1e-06,1e-07]
          }

baye_ridg = BayesianRidge()
gscv = GridSearchCV(estimator=baye_ridg, 
                    param_grid=params, 
                    cv=20,
                    scoring='neg_mean_squared_error')
gscv.fit(X_train, y_train)
print("Best Estimator :", gscv.best_estimator_)

model = gscv.best_estimator_
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("R2 score =", r2)
print("Root mean squared error =", rmse)
print("Coefficient of determination =", model.score(X_test, y_test))
print("Mean absolute error =", mean_absolute_error(y_test, y_pred))
print("Mean squared error =", mean_squared_error(y_test, y_pred))
print("Maximum error =", max_error(y_test, y_pred))

time_stop = datetime.now()
delta = time_stop - time_start
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

result2 = ('BayesianRidge', r2, rmse, seconds)

Best Estimator : BayesianRidge(alpha_1=1e-07)
R2 score = 0.9999995076279592
Root mean squared error = 0.09405885585132667
Coefficient of determination = 0.9999995076279592
Mean absolute error = 0.07542291991145109
Mean squared error = 0.008847068364060651
Maximum error = 0.3313111967855562
Time to compute:  19.264602 seconds


#### KNeighborsRegressor

In [9]:
time_start = datetime.now()

params = {'n_neighbors': [3,4,5,6,7,8,9,10,11,12],
          'weights': ['uniform', 'distance'], 
          'p': [1, 2]  
          }

knn = KNeighborsRegressor()
gscv = GridSearchCV(estimator=knn, 
                    param_grid=params, 
                    cv=20,
                    scoring='neg_mean_squared_error')
gscv.fit(X_train, y_train)
print("Best Estimator :", gscv.best_estimator_)

model = gscv.best_estimator_
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("R2 score =", r2)
print("Root mean squared error =", rmse)
print("Coefficient of determination =", model.score(X_test, y_test))
print("Mean absolute error =", mean_absolute_error(y_test, y_pred))
print("Mean squared error =", mean_squared_error(y_test, y_pred))
print("Maximum error =", max_error(y_test, y_pred))

time_stop = datetime.now()
delta = time_stop - time_start
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

result3 = ('KNN', r2, rmse, seconds)

Best Estimator : KNeighborsRegressor(n_neighbors=8, weights='distance')
R2 score = 0.9400235104995984
Root mean squared error = 32.82793168042318
Coefficient of determination = 0.9400235104995984
Mean absolute error = 23.192706528985983
Mean squared error = 1077.6730984145315
Maximum error = 195.86678120716346
Time to compute:  1.915726 seconds


#### SVR

In [10]:
time_start = datetime.now()

params = {'C': [1, 10, 100],
        'gamma': [0.01, 0.1, 1],
        'kernel': ['linear', 'rbf'],   
       }

kfold = KFold(n_splits=5, random_state=1, shuffle=True)

svr = SVR()
gscv = GridSearchCV(estimator=svr,
                    param_grid=params,
                    cv=kfold,
                    scoring='neg_mean_squared_error')
gscv.fit(X_train, y_train)
print("Best Estimator :", gscv.best_estimator_)

model = gscv.best_estimator_
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("R2 score =", r2)
print("Root mean squared error =", rmse)
print("Coefficient of determination =", model.score(X_test, y_test))
print("Mean absolute error =", mean_absolute_error(y_test, y_pred))
print("Mean squared error =", mean_squared_error(y_test, y_pred))
print("Maximum error =", max_error(y_test, y_pred))

time_stop = datetime.now()
delta = time_stop - time_start
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

result4 = ('SVR', r2, rmse, seconds)

Best Estimator : SVR(C=100, gamma=0.01, kernel='linear')
R2 score = 0.9999995069115029
Root mean squared error = 0.09412726404752102
Coefficient of determination = 0.9999995069115029
Mean absolute error = 0.07545018331821629
Mean squared error = 0.008859941837071743
Maximum error = 0.3378939578173359
Time to compute:  74.57242 seconds


#### DecisionTreeRegressor

In [11]:
time_start = datetime.now()

params = {'max_depth': np.linspace(1, 10, 10).astype("int"),
          'max_features': ['sqrt', 'log2'],
          'min_samples_leaf': np.linspace(1, 10, 10).astype("int"),
          'min_samples_split': [0.1, 0.2, 0.3],
          'criterion': ['squared_error']
         }
kfold = KFold(n_splits=5, random_state=1, shuffle=True)

decision_tree = DecisionTreeRegressor()
gscv = GridSearchCV(estimator=decision_tree,
                    param_grid=params,
                    cv=kfold,
                    scoring='neg_mean_squared_error')
gscv.fit(X_train, y_train)
print("Best Estimator :", gscv.best_estimator_)

model = gscv.best_estimator_
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("R2 score =", r2)
print("Root mean squared error =", rmse)
print("Coefficient of determination =", model.score(X_test, y_test))
print("Mean absolute error =", mean_absolute_error(y_test, y_pred))
print("Mean squared error =", mean_squared_error(y_test, y_pred))
print("Maximum error =", max_error(y_test, y_pred))

time_stop = datetime.now()
delta = time_stop - time_start
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

result5 = ('DecisionTreeRegressor', r2, rmse, seconds)

Best Estimator : DecisionTreeRegressor(max_depth=10, max_features='log2', min_samples_leaf=5,
                      min_samples_split=0.1)
R2 score = 0.6433496848274426
Root mean squared error = 80.05239060985791
Coefficient of determination = 0.6433496848274426
Mean absolute error = 61.584604507189695
Mean squared error = 6408.385242353266
Maximum error = 336.792117507726
Time to compute:  4.909028 seconds


## Performance report 

In [12]:
df = pd.DataFrame([result0,
                   result1,
                   result2,
                   result3,
                   result4,
                   result5],
                  columns=['model', 'R2_score','RMSE','computing_time'])
df.sort_values('R2_score', ascending=False)

Unnamed: 0,model,R2_score,RMSE,computing_time
0,Lasso,1.0,0.094058,4.605217
1,ElasticNet,1.0,0.094058,65.757792
2,BayesianRidge,1.0,0.094059,19.264602
4,SVR,1.0,0.094127,74.57242
3,KNN,0.940024,32.827932,1.915726
5,DecisionTreeRegressor,0.64335,80.052391,4.909028
