###### HOW TO DEVELOP RIDGE REGRESSION MODELS IN PYTHON

###### TUTORIAL OVERVIEW

###### RIDGE REGRESSION

###### EXAMPLE OF RIDGE REGRESSION

###### IMPORT LIBRARIES

In [5]:
from numpy import mean
from numpy import std
from numpy import absolute
from pandas import read_csv
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge
from matplotlib import pyplot

###### IMPORT DATASET

In [7]:
# fetch the housing data; the CSV has no header row, so pandas numbers the columns
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
dataframe = read_csv(url, header=None)

# report the (rows, columns) dimensions
print(dataframe.shape)

# preview the first five rows
print(dataframe.head())

# keep the underlying NumPy array for the modelling cells that follow
data = dataframe.values

(506, 14)
        0     1     2   3      4      5     6       7   8      9     10  \
0  0.00632  18.0  2.31   0  0.538  6.575  65.2  4.0900   1  296.0  15.3   
1  0.02731   0.0  7.07   0  0.469  6.421  78.9  4.9671   2  242.0  17.8   
2  0.02729   0.0  7.07   0  0.469  7.185  61.1  4.9671   2  242.0  17.8   
3  0.03237   0.0  2.18   0  0.458  6.998  45.8  6.0622   3  222.0  18.7   
4  0.06905   0.0  2.18   0  0.458  7.147  54.2  6.0622   3  222.0  18.7   

       11    12    13  
0  396.90  4.98  24.0  
1  396.90  9.14  21.6  
2  392.83  4.03  34.7  
3  394.63  2.94  33.4  
4  396.90  5.33  36.2  
           0     1      2    3      4      5     6       7    8      9    10  \
0    0.00632  18.0   2.31  0.0  0.538  6.575  65.2  4.0900  1.0  296.0  15.3   
1    0.02731   0.0   7.07  0.0  0.469  6.421  78.9  4.9671  2.0  242.0  17.8   
2    0.02729   0.0   7.07  0.0  0.469  7.185  61.1  4.9671  2.0  242.0  17.8   
3    0.03237   0.0   2.18  0.0  0.458  6.998  45.8  6.0622  3.0  222.0  18.

In [9]:
# wrap the raw array back into a DataFrame so the notebook renders it as a table
pd.DataFrame(data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


###### DEFINE X and y VARIABLES

In [10]:
# split the array column-wise: every column except the last is an input feature,
# and the last column is the target to predict
X = data[:, :-1]
y = data[:, -1]

###### DEFINE RIDGE MODEL INSTANCE

In [14]:
# ridge regressor with the penalty strength (alpha) set to 1.0
model = Ridge(alpha=1.0)

###### DEFINE CROSS-VALIDATION EVALUATION METHOD

In [15]:
# evaluation procedure: 10-fold cross-validation repeated 3 times,
# with a fixed seed so the splits are the same on every run
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

###### EVALUATE MODEL with PARAMETERS

In [18]:
# score the model on every split produced by the repeated k-fold procedure.
# The "neg_mean_absolute_error" scorer reports MAE with its sign flipped,
# so the returned scores are negative; n_jobs=-1 requests all available cores.
scores = cross_val_score(
    model,
    X,
    y,
    scoring="neg_mean_absolute_error",
    cv=cv,
    n_jobs=-1,
)

###### FORCE SCORES TO BE POSITIVE

In [20]:
# report the raw scores first: they come from the "neg_mean_absolute_error"
# scorer, so the mean printed here is negative
print('Mean MAE: %.3f (%.3f)' % (mean(scores), std(scores)))

# force scores to be positive, then report the mean and standard deviation again
scores = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (mean(scores), std(scores)))

Mean MAE: -3.382 (0.519)
Mean MAE: 3.382 (0.519)


###### MAKE A PREDICTION WITH A RIDGE REGRESSION MODEL ON THE DATASET

In [21]:
# make a prediction with a ridge regression model on the dataset
from pandas import read_csv
from sklearn.linear_model import Ridge

# load the dataset; the CSV has no header row, so columns are numbered by position
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
dataframe = read_csv(url, header=None)
data = dataframe.values

# every column except the last is an input feature; the last column is the target
X, y = data[:, :-1], data[:, -1]

# define model
model = Ridge(alpha=1.0)

# fit model on the whole dataset (no hold-out: this cell only demonstrates prediction)
model.fit(X, y)

# define new data: a single row of 13 input values, in the same column order as X
row = [0.00632, 18.00, 2.310, 0, 0.5380, 6.5750, 65.20, 4.0900, 1, 296.0, 15.30, 396.90, 4.98]

# make a prediction; predict expects a 2-D input, so the single row is wrapped in a list
yhat = model.predict([row])

# summarize prediction; predict returns a length-1 array, so take element 0
# before %-formatting -- formatting the array itself relies on NumPy's
# deprecated implicit array-to-scalar conversion
print('Predicted: %.3f' % yhat[0])

Predicted: 30.253


###### TUNING RIDGE HYPERPARAMETERS

In [22]:
# grid search hyperparameters for ridge regression
from numpy import arange
from pandas import read_csv
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold

# load the dataset (no header row) and split it into inputs and target
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
dataframe = read_csv(url, header=None)
data = dataframe.values
X, y = data[:, :-1], data[:, -1]

# estimator whose alpha will be tuned
model = Ridge()

# evaluation procedure: 10-fold cross-validation, repeated 3 times, fixed seed
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# candidate alpha values: 0.00, 0.01, ..., 0.99
grid = {'alpha': arange(0, 1, 0.01)}

# exhaustive search over the grid, scored by negated mean absolute error
search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
)

# run the search; fit returns the fitted search object
results = search.fit(X, y)

# report the best score (negated MAE, hence negative) and the alpha that produced it
print('MAE: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

MAE: -3.379
Config: {'alpha': 0.51}


In [23]:
# use the ridge regression variant that tunes alpha automatically (RidgeCV)
from numpy import arange
from pandas import read_csv
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold

# load the dataset (no header row) and split it into inputs and target
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
dataframe = read_csv(url, header=None)
data = dataframe.values
X, y = data[:, :-1], data[:, -1]

# evaluation procedure: 10-fold cross-validation, repeated 3 times, fixed seed
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# model that picks alpha from 0.00, 0.01, ..., 0.99 using the cv procedure above,
# scored by negated mean absolute error
model = RidgeCV(
    alphas=arange(0, 1, 0.01),
    cv=cv,
    scoring='neg_mean_absolute_error',
)

# fit model; the alpha search happens as part of fitting
model.fit(X, y)

# report the alpha value that was selected
print('alpha: %f' % model.alpha_)

alpha: 0.510000
