In [9]:
# In cross-validation, also known as CV, the training data is split into five folds (any number will do, but five is standard).
# The ML algorithm is fit on all of the folds except one and tested on the held-out fold, rotating until every fold has served once as the test set.
# The result is five different training and test sets that are all representative of the same data.
# The mean of the scores is usually taken as the estimate of the model's performance.

In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge, Lasso

In [11]:
# Load the housing data and drop every row that contains a null value.
# Loading and cleaning are chained so that re-running this cell always rebuilds
# the cleaned frame from the CSV rather than relying on earlier kernel state.
# (The former mid-cell `housing_df.head()` was removed: a bare expression that
# is not the last line of a cell is evaluated and discarded, never displayed.)
housing_df = pd.read_csv('./datasets/HousingData.csv').dropna()

# X for the predictor columns (everything except the last column)
# and y for the target column (the last column).
X = housing_df.iloc[:, :-1]
y = housing_df.iloc[:, -1]  # The target column is MEDV

In [12]:
def regression_model_cv(model, k=5, features=None, target=None):
    """Cross-validate a regressor and print its per-fold and mean RMSE.

    Parameters
    ----------
    model : estimator
        Unfitted regressor handed to ``cross_val_score``.
    k : int, default 5
        Passed to ``cross_val_score`` as ``cv``.
    features, target : optional
        Predictors and target to evaluate on. When omitted, the notebook-level
        ``X`` and ``y`` are used, so existing ``regression_model_cv(model, k)``
        calls behave exactly as before.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    # Fall back to the notebook globals only when the caller supplied no data,
    # which keeps the dependency on kernel state explicit and overridable.
    if features is None:
        features = X
    if target is None:
        target = y

    # cross_val_score offers no plain mean_squared_error scorer. Its scorers
    # follow a "higher is better" convention, so the MSE is exposed negated as
    # 'neg_mean_squared_error'; the best attainable value is therefore 0.
    scores = cross_val_score(
        model, features, target, scoring='neg_mean_squared_error', cv=k
    )

    # Flip the sign back to a positive MSE before taking the square root.
    rmse = np.sqrt(-scores)
    print('Reg rmse:', rmse)
    print('Reg mean:', rmse.mean())

# Baseline: plain LinearRegression scored with the function's default of 5 folds.
regression_model_cv(LinearRegression())

Reg rmse: [3.26123843 4.42712448 5.66151114 8.09493087 5.24453989]
Reg mean: 5.3378689628783516


In [13]:
# Let's try with 3 folds to see how sensitive the score is to the fold count.
# In the recorded output the third fold's RMSE (about 23.2) is far larger than
# the other two and drags the mean up to about 10.98.
# NOTE(review): presumably an integer `cv` splits the rows in file order without
# shuffling, so one fold may contain atypical rows; confirm against the
# cross_val_score documentation.
regression_model_cv(LinearRegression(), k=3)

Reg rmse: [ 3.72504914  6.01655701 23.20863933]
Reg mean: 10.983415161090788


In [14]:
# Let's try with 6 folds.
# In the recorded output the mean RMSE (about 5.09) is close to the 5-fold
# result, although one fold (about 9.88) is still clearly worse than the rest.
regression_model_cv(LinearRegression(), k=6)

Reg rmse: [3.23879491 3.97041949 5.58329663 3.92861033 9.88399671 3.91442679]
Reg mean: 5.086590810801092


In [None]:
# Regularization is an important concept in ML; it’s used to counteract overfitting.
# In the world of big data, it’s easy to overfit a model to the training set.
# When this happens, the model will often perform badly on the test set, as indicated by a high mean_squared_error or some other error metric.

# There are two main problems with fitting an ML model on all the data:
# 1. There is no way to test the model on unseen data. ML models are powerful when they make
# good predictions on new data. Models are trained on known results, but they perform in the
# real world on data that has never been seen before. It’s not vital to see how well a model fits
# known results (the training set), but it’s absolutely crucial to see how well it performs on unseen
# data (the test set).
# 2. The model may overfit the data. Models exist that may fit any set of data points perfectly.

# There are many models and approaches to counteract overfitting. Let’s go over a couple of linear models now:

In [15]:
# 1. Ridge = Ridge includes an L2 penalty term (L2 is based on Euclidean distance) that shrinks the linear coefficients
#            based on their size. The coefficients are the weights: numbers that determine how influential
#            each column is on the output. Larger weights carry greater penalties in Ridge.
# Ridge() is constructed with no arguments, so the library's default settings apply,
# and it is scored with the function's default of 5 folds.
# In the recorded output the mean RMSE (about 5.23) is slightly lower than the
# 5-fold LinearRegression result (about 5.34).
regression_model_cv(Ridge())

Reg rmse: [3.17202127 4.54972372 5.36604368 8.03715216 5.03988501]
Reg mean: 5.23296516625177


In [16]:
# 2. Lasso = Lasso adds a penalty proportional to the sum of the absolute values of the coefficients.
#            This L1 regularization (L1 is taxicab distance) can shrink some coefficients all the way to zero,
#            eliminating those columns' influence entirely.
#            NOTE(review): the original text says Lasso is less widely used than Ridge because the L1
#            distance metric is less common than L2; treat that as the author's opinion rather than a rule.
# Lasso() is constructed with no arguments, so the library's default settings apply,
# and it is scored with the function's default of 5 folds.
# In the recorded output the mean RMSE (about 5.60) is higher than both the
# LinearRegression (about 5.34) and Ridge (about 5.23) 5-fold results.
regression_model_cv(Lasso())

Reg rmse: [3.52318747 5.70083491 7.82318757 6.9878025  3.97229348]
Reg mean: 5.60146118538429
