Lasso regularisation
-------

Regularisation consists in adding a penalty to the different parameters of the machine learning model to reduce the freedom of the model and in other words to avoid overfitting. In linear model regularisation, the penalty is applied over the coefficients that multiply each of the predictors. From the different types of regularisation, Lasso or l1 has the property that is able to shrink some of the coefficients to zero. Therefore, that feature can be removed from the model.

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings

from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings(action='ignore')

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
file_path = '/Users/wontaek/Documents/Lecture_dataset/BNP_Paribas_Cardif_claims/train.csv'
data = pd.read_csv(file_path, nrows=50000)
data.shape

# In practice, feature selection should be done after data pre-processing,
# so ideally, all the categorical variables are encoded into numbers,
# and then you can assess how deterministic they are of the target

# here for simplicity I will use only numerical variables
# select numerical columns:

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_vars = list(data.select_dtypes(include=numerics).columns)
data = data[numerical_vars]
data.shape


# separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target', 'ID'], axis=1),
    data['target'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((35000, 112), (15000, 112))

In [3]:
# linear models benefit from feature scaling

scaler = StandardScaler()
scaler.fit(X_train.fillna(0))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [5]:
SelectFromModel?

[0;31mInit signature:[0m
[0mSelectFromModel[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mestimator[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mthreshold[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprefit[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnorm_order[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_features[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Meta-transformer for selecting features based on importance weights.

.. versionadded:: 0.17

Parameters
----------
estimator : object
    The base estimator from which the transformer is built.
    This can be both a fitted (if ``prefit`` is set to True)
    or a non-fitted estimator. The estimator must have either a
    ``feature_importances_`` or ``coef_`` attribute after fitting.

threshold : string, float, optional default None
    The threshold value to use for f

In [4]:
# here I will do the model fitting and feature selection
# altogether in one line of code

# first I specify the Logistic Regression model, and I
# make sure I select the Lasso (l1) penalty.

# Then I use the selectFromModel object from sklearn, which
# will select in theory the features which coefficients are non-zero

sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l1'))
sel_.fit(scaler.transform(X_train.fillna(0)), y_train)

SelectFromModel(estimator=LogisticRegression(C=1, class_weight=None, dual=False,
                                             fit_intercept=True,
                                             intercept_scaling=1, l1_ratio=None,
                                             max_iter=100, multi_class='warn',
                                             n_jobs=None, penalty='l1',
                                             random_state=None, solver='warn',
                                             tol=0.0001, verbose=0,
                                             warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [6]:
# this command let's me visualise those features that were kept
sel_.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True, False,  True,
       False,  True,  True,  True, False,  True,  True,  True, False,
       False,  True,  True,  True,  True, False, False,  True,  True,
       False,  True, False,  True,  True,  True,  True,  True, False,
       False, False, False, False, False,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False, False, False,  True,  True,  True,  True, False, False,
       False, False,  True,  True,  True,  True,  True, False, False,
        True, False,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False, False,  True,  True, False,  True,
        True,  True,  True, False])

In [7]:
# Now I make a list with the selected features
selected_feat = X_train.columns[(sel_.get_support())]

print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
    np.sum(sel_.estimator_.coef_ == 0)))

total features: 112
selected features: 79
features with coefficients shrank to zero: 33


In [8]:
# 가중치를 0으로 만들어 버렸다.

sel_.estimator_.coef_

array([[-0.05409264,  0.11261324, -0.01775505,  0.08203875,  0.18097962,
         0.04698098,  0.22232209,  0.0172384 ,  1.07241271,  0.        ,
        -1.30044837, -0.05759033,  0.11693635, -0.14017872, -0.16478723,
        -0.17934891,  0.03814743,  0.01265627,  0.        , -0.02968195,
         0.06337596,  0.17883558, -0.03230851, -0.02365772, -0.00386115,
         0.        ,  0.02579414,  0.        ,  0.77872597,  0.10769527,
        -0.13186004,  0.        ,  0.27091026, -0.01308322,  0.16711316,
         0.        ,  0.        , -0.03796082,  0.0654757 , -0.18652269,
        -0.34461832,  0.        ,  0.        ,  0.31432984,  0.0366303 ,
         0.        ,  0.02008912,  0.        ,  0.07331628, -0.08013521,
         0.29939859, -0.08455828, -0.30384394,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.13630567,
         0.12961183,  0.02536176,  0.21097968, -0.0380792 , -0.33272399,
        -0.16327186, -0.0035045 ,  0.        , -0.0

In [16]:
np.ravel?

[0;31mSignature:[0m [0mnp[0m[0;34m.[0m[0mravel[0m[0;34m([0m[0ma[0m[0;34m,[0m [0morder[0m[0;34m=[0m[0;34m'C'[0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return a contiguous flattened array.

A 1-D array, containing the elements of the input, is returned.  A copy is
made only if needed.

As of NumPy 1.10, the returned array will have the same type as the input
array. (for example, a masked array will be returned for a masked array
input)

Parameters
----------
a : array_like
    Input array.  The elements in `a` are read in the order specified by
    `order`, and packed as a 1-D array.
order : {'C','F', 'A', 'K'}, optional

    The elements of `a` are read using this index order. 'C' means
    to index the elements in row-major, C-style order,
    with the last axis index changing fastest, back to the first
    axis index changing slowest.  'F' means to index the elements
    in column-major, Fortran-style order, with the
    first index changing fastest, and 

In [15]:
(sel_.estimator_.coef_ == 0).ravel().tolist()

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True]

In [9]:
# we can identify the removed features like this:
removed_feats = X_train.columns[(sel_.estimator_.coef_ == 0).ravel().tolist()]
removed_feats

Index(['v11', 'v20', 'v29', 'v33', 'v37', 'v41', 'v42', 'v48', 'v49', 'v53',
       'v55', 'v62', 'v63', 'v64', 'v65', 'v67', 'v68', 'v81', 'v86', 'v87',
       'v88', 'v94', 'v95', 'v96', 'v97', 'v103', 'v104', 'v106', 'v115',
       'v121', 'v122', 'v126', 'v131'],
      dtype='object')

In [21]:
len(X_train.fillna(0).columns)

112

In [17]:
# we can then remove the features from the training and testing set
# like this
X_train_selected = sel_.transform(X_train.fillna(0))
X_test_selected = sel_.transform(X_test.fillna(0))

X_train_selected.shape, X_test_selected.shape

((35000, 79), (15000, 79))

### L2 regularisation does not shrink coefficients to zero

In [26]:
# separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target', 'ID'], axis=1),
    data['target'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((35000, 112), (15000, 112))

In [27]:
l2_logit = LogisticRegression(C=1, penalty='l2')
l2_logit.fit(scaler.transform(X_train.fillna(0)), y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
# 0으로 coef값을 낮추지는 않는다.

l2_logit.coef_

array([[-0.08644661,  0.11848921, -0.02066516,  0.09546023,  0.19054746,
         0.20834857,  0.24117876,  0.08616941,  1.02850353,  0.03025465,
        -1.266752  ,  0.01211486,  0.11856195, -0.17169921, -0.2301516 ,
        -0.27243439,  0.04458787,  0.03295399, -0.23608815, -0.03093415,
         0.06569448,  0.21791182, -0.03558688, -0.02002269, -0.00889964,
        -0.05099209,  0.11136112,  0.22278852,  0.79323327,  0.20570042,
        -0.16315617,  0.00724886,  0.18194687,  0.0075772 ,  0.17694584,
         0.08960193, -0.10463332, -0.32323667,  0.0877821 , -0.22034634,
        -0.39611284,  0.02752573, -0.28436817,  0.33044464,  0.04428683,
         0.32332883,  0.02038317,  0.03885324,  0.15682228, -0.17900453,
         0.32748066, -0.13913087, -0.40529031, -0.11110495,  0.01518036,
         0.02659939, -0.15656106,  0.09889271,  0.12806048,  0.27403597,
         0.171565  ,  0.17197311,  0.280187  , -0.00174109, -0.63020932,
        -0.32268864, -0.04581572, -0.00180941, -0.0

In [28]:
np.sum(l2_logit.coef_==0)

0

### Regression

In [33]:
# load dataset
data = pd.read_csv('/Users/wontaek/Documents/Lecture_dataset/House_Sale_Price/train.csv')
data.shape

# In practice, feature selection should be done after data pre-processing,
# so ideally, all the categorical variables are encoded into numbers,
# and then you can assess how deterministic they are of the target

# here for simplicity I will use only numerical variables
# select numerical columns:

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_vars = list(data.select_dtypes(include=numerics).columns)
data = data[numerical_vars]
data.shape


# separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['SalePrice'], axis=1),
    data['SalePrice'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((1022, 37), (438, 37))

In [34]:
# the features in the house dataset are in very
# different scales, so it helps the regression to scale
# them

scaler = StandardScaler()
scaler.fit(X_train.fillna(0))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [35]:
# here, again I will train a Lasso Linear regression and select
# the non zero features in one line.
# bear in mind that the linear regression object from sklearn does
# not allow for regularisation. So If you want to make a regularised
# linear regression you need to import specifically "Lasso"
# that is the l1 version of the linear regression
# alpha is the penalisation here, so I set it high in order
# to force the algorithm to shrink some coefficients

sel_ = SelectFromModel(Lasso(alpha=100))
sel_.fit(scaler.transform(X_train.fillna(0)), y_train)

SelectFromModel(estimator=Lasso(alpha=100, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=None,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [36]:
sel_.get_support()

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True])

In [37]:
# make a list with the selected features and print the outputs
selected_feat = X_train.columns[(sel_.get_support())]

print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
    np.sum(sel_.estimator_.coef_ == 0)))

total features: 37
selected features: 33
features with coefficients shrank to zero: 4


In [46]:
from sklearn.linear_model import Ridge

In [47]:
sel_ = SelectFromModel(Ridge(alpha=1.0))
sel_.fit(scaler.transform(X_train.fillna(0)), y_train)

SelectFromModel(estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                                max_iter=None, normalize=False,
                                random_state=None, solver='auto', tol=0.001),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [48]:
sel_.get_support()

array([False,  True, False,  True,  True,  True,  True, False,  True,
        True, False, False,  True,  True,  True, False,  True, False,
       False, False, False,  True, False,  True, False,  True,  True,
        True, False, False, False, False, False, False, False, False,
       False])

In [49]:
selected_feat = X_train.columns[(sel_.get_support())]

In [50]:
selected_feat

Index(['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'MasVnrArea', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'GrLivArea', 'BedroomAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt',
       'GarageCars', 'GarageArea'],
      dtype='object')