# SLU09: Model Selection & Overfitting -- Exercises
---

*Exercises are graded unless otherwise indicated.*

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from utils import generate_test_data

## Exercise 1: Detecting bias and variance in the real world (not graded)

For each of the following, identify if they are more likely to be sources of bias or variance:

1. Using very flexible models (e.g., non-parametric, non-linear), such as K-nearest neighbors or decision trees (bias/variance)
2. Using models with simplistic assumptions, such as linear or logistic regressions (bias/variance)
3. Increasing the polynomial degree of our hypothesis function (bias/variance)
4. Ignoring important features (bias/variance)

## Exercise 2: Create training and test datasets (train-test split)

In [2]:
from sklearn.model_selection import train_test_split

def implement_hold_out_method(X, y, test_size=.4, random_state=0):
    """ 
    Implementing the holdout method, using sklearn.
    
    Args:
        X (pd.DataFrame): a pandas dataframe containing the features
        y (pd.Series): a pandas series containing the target variable
        test_size (float): proportion of the dataset to include in the test set
        random_state (int): the seed used by the random number generator

    Returns:
        X_train (pd.DataFrame): the features for the training examples
        X_test (pd.DataFrame): the features for the test examples
        y_train (pd.Series): target for the training set 
        y_test (pd.Series): target for the test set

    """
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = test_size, random_state=random_state)
    
    return X_train, X_test, y_train, y_test

In [3]:
"""Check that the solution is correct."""

X, y = generate_test_data(m=100, n=4)
X_train, X_test, y_train, y_test = implement_hold_out_method(X, y)

assert X_train.shape == (60, 3)
assert X_test.shape == (40, 3)
assert y_train.shape == (60,)
assert y_test.shape == (40,)

X_train2, X_test2, y_train2, y_test2 = implement_hold_out_method(X, y, test_size=.1)

assert X_train2.shape == (90, 3)
assert X_test2.shape == (10, 3)
assert y_train2.shape == (90,)
assert y_test2.shape == (10,)

## Exercise 3: Creating a validation dataset

In [4]:
def implement_validation_dataset(X, y, test_size=.25, val_size=.25, random_state=0):
    """ 
    Implementing the holdout method with validation, using sklearn.
    
    Args:
        X (pd.DataFrame): a pandas dataframe containing the features
        y (pd.Series): a pandas series containing the target variable
        test_size (float): proportion of the dataset to include in the test set
        val_size (float): proportion of the dataset to include in the validation set
        random_state (int): the seed used by the random number generator

    Returns:
        X_train (pd.DataFrame): the features for the training examples
        X_test (pd.DataFrame): the features for the test examples
        X_val (pd.DataFrame): the features of the validation examples
        y_train (pd.Series): target for the training set 
        y_test (pd.Series): target for the test set
        y_val (pd.Series): target for the validation set

    """
    X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    the_right_val_size = (val_size*X.shape[0])/(X.shape[0]-X_test.shape[0])
    
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp,
                                                               test_size=the_right_val_size, random_state=random_state)
    del X_temp, y_temp
    
    return X_train, X_test, X_val, y_train, y_test, y_val

In [5]:
"""Check that the solution is correct."""
X, y = generate_test_data(m=1000, n=5)
X_train, X_test, X_val, y_train, y_test, y_val = implement_validation_dataset(X, y)

assert X_train.shape == (500, 4)
assert X_test.shape == (250, 4)
assert X_val.shape == (250, 4)
assert y_train.shape == (500,)
assert y_test.shape == (250,)
assert y_val.shape == (250,)


X_train2, X_test2, X_val2, y_train2, y_test2, y_val2 = implement_validation_dataset(X, y, test_size=.1, val_size=.3)

assert X_train2.shape == (600, 4)
assert X_test2.shape == (100, 4)
assert X_val2.shape == (300, 4)
assert y_train2.shape == (600,)
assert y_test2.shape == (100,)
assert y_val2.shape == (300,)

## Exercise 4: Implementing K-fold cross-validation

Get the mean cross validation score for linear regression (with 4 folds) and logistic regression (with 6 folds). 

Leave the scoring parameter with the default value. 

In [6]:
# here is your data 
X_train = pd.read_csv('data/something')
y_train_num = pd.read_csv('data/something_else', header=None)
y_train_cat = y_train_num.apply(lambda x: x < 50).astype('int')

from sklearn.model_selection import cross_val_score

# here are your models
lin_reg = LinearRegression()
log_reg = LogisticRegression(solver='liblinear')

# store the mean cross val scores as lin_reg_result and log_reg_result
# use y_train_num for linear regression
# use y_train_cat for logistic regression
lin_reg_result = cross_val_score(lin_reg,X_train,y_train_num,cv=4).mean()
log_reg_result = cross_val_score(log_reg,X_train,y_train_cat,cv=6).mean()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [7]:
np.testing.assert_almost_equal(lin_reg_result, -0.1477, 3)
np.testing.assert_almost_equal(log_reg_result, 0.5333, 3)

## Exercise 5: Regularization loss functions

In [8]:
# data 
X = pd.read_csv('data/something')
y = pd.read_csv('data/something_else', header=None)

# model
lr = LinearRegression()

# fit model
lr.fit(X, y)

# get variables for calculating loss functions
y = np.array(y[0])
y_hat = lr.predict(X).reshape(60,)
betas = np.append(lr.intercept_, lr.coef_[0])

Compute the $L_1$ and $L_2$ loss functions for the linear regression model trained above.

$$J_{L_1} = \frac{1}{N} \sum_{n=1}^N (y_n - \hat{y}_n)^2 + \lambda_1 \sum_{k=1}^K \left|\beta_k\right|$$

### Exercise 5.1: MSE

First, let's compute the Mean Squared Error part:

In [9]:
def mean_squared_error(y, y_hat):
    """
    Args: 
        y : numpy array with shape (num_observations,)
            The targets.
        y_hat : numpy array with shape (num_observations,)
            The predictions.
            
    Returns:
        Mean squared error : float
    """

    mse = ((y-y_hat)**2).mean()
    
    return mse

In [10]:
mse = mean_squared_error(y, y_hat)
np.testing.assert_almost_equal(mse, 635.2071, 3)

### Exercise 5.2: L1 loss

Now, we can compute the $L_1$ loss

In [11]:
def l1_loss(y, y_hat, betas, lamb1):
    """
    Args: 
        y : numpy array with shape (num_observations,)
            The targets.
        y_hat : numpy array with shape (num_observations,)
            The predictions.
        betas : numpy array with shape (num_features+1,)
            The parameters of your regression model. 
            The first value is the intercept and the 
            remaining ones are the feature coefficients.
        lamb1 : float
            The strength of the L1 regularizer.
            
    Returns:
        loss : float
    """
        
    # Compute the mean squared error loss part of the general loss function.
    # Hint: use the function we created above
    mse =  mean_squared_error(y, y_hat)
    
    # Compute the L1 part of the general loss function.
    # Do not forget that we are not supposed to include the intercept in the L1 regularization 
    l1 = lamb1*(np.absolute(betas[1:]).sum())
    
    # Compute the total loss by combining the parts.
    L = mse + l1
    
    
    return L

In [12]:
lamb1 = 2
loss = l1_loss(y, y_hat, betas, lamb1)
np.testing.assert_almost_equal(loss, 635.8783, 3)

lamb1_2 = 6
loss2 = l1_loss(y, y_hat, betas, lamb1_2)
np.testing.assert_almost_equal(loss2, 637.2206, 3)

### Exercise 5.3: L2 loss

Finally, let's compute the $L_2$ loss:

$$J_{L_2} = \frac{1}{N} \sum_{n=1}^N (y_n - \hat{y}_n)^2 + \lambda_2 \sum_{k=1}^K \beta_k^2$$


In [13]:
def l2_loss(y, y_hat, betas, lamb2):
    """
    Args: 
        y : numpy array with shape (num_observations,)
            The targets.
        y_hat : numpy array with shape (num_observations,)
            The predictions.
        betas : numpy array with shape (num_features+1,)
            The parameters of your regression model. 
            The first value is the intercept and the 
            remaining ones are the feature coefficients.
        lamb2 : float
            The strength of the L2 regularizer.
            
    Returns:
        loss : float
    """
    
    # Compute the mean squared error loss part of the general loss function.
    # Hint: use the function we created above
    mse = mean_squared_error(y,y_hat)
    
    # Compute the L2 part of the general loss function.
    # Do not forget that we are not supposed to include the intercept in l2 regularization
    l2 = lamb2*((betas[1:]**2).sum())
    
    # Compute the total loss by combining the parts.
    L = mse + l2 
    
    return L

In [14]:
lamb2 = 2
loss = l2_loss(y, y_hat, betas, lamb2)
np.testing.assert_almost_equal(loss, 635.2940, 3)

lamb2_2 = 6
loss2 = l2_loss(y, y_hat, betas, lamb2_2)
np.testing.assert_almost_equal(loss2, 635.4676, 3)

## Exercise 6: Regularized linear regression in practice

### Exercise 6.1: Lasso

In [15]:
from sklearn.preprocessing import PolynomialFeatures

# create data
X_temp = pd.read_csv('data/something')
y = pd.read_csv('data/something_else', header=None)
poly = PolynomialFeatures(degree=2)
X = pd.DataFrame(poly.fit_transform(X_temp))

In [16]:
from sklearn.linear_model import Lasso 

def lasso_regression(X, y, lamb):
    """
    Implements lasso (L1) regression, using sklearn
    
    Args: 
        X (pd.DataFrame): a pandas dataframe containing the features
        y (pd.Series): a pandas series containing the target variable
        lamb (float): the strength of the regularizer
            
    Returns:
        lasso : a Lasso regression model fitted to X and y
    """
    
    # instantiate an instance of sklearn's Lasso model
    # use named arguments, such as alpha=... and random_state=...
    # use a random state of 42
    
    # We should normalize the data in order to measure and compare the coefficients, 
    # however for the purpose of doing the exercise we did not do it 
    lasso = Lasso(alpha = lamb,random_state = 42)
    
    lasso.fit(X,y)
    
    return lasso

In [17]:
lamb = 0.1
check_lasso = lasso_regression(X, y, lamb)
np.testing.assert_almost_equal(np.sum(check_lasso.coef_), 0.396970, 6)
np.testing.assert_almost_equal(check_lasso.score(X, y), 0.169879, 6)

lamb2 = 100
check_lasso2 = lasso_regression(X, y, lamb2)
np.testing.assert_almost_equal(np.sum(check_lasso2.coef_), 0.003354, 6)
np.testing.assert_almost_equal(check_lasso2.score(X, y), 0.146606, 6)

### Exercise 6.2: Ridge

In [18]:
from sklearn.linear_model import Ridge

def ridge_regression(X, y, lamb):
    """
    Implements ridge (L2) regression, using sklearn
    
    Args: 
        X (pd.DataFrame): a pandas dataframe containing the features
        y (pd.Series): a pandas series containing the target variable
        lamb (float): the strength of the regularizer
            
    Returns:
        ridge : a Ridge regression model fitted to X and y
    """
    
    # instantiate an instance of sklearn's Ridge model
    # use named arguments, such as alpha=... and random_state=...
    # use a random state of 42
    ridge = Ridge(alpha = lamb, random_state = 42)
    
    ridge.fit(X,y)
    
    return ridge

In [19]:
lamb = 0.1
check_ridge = ridge_regression(X, y, lamb)
np.testing.assert_almost_equal(np.sum(check_ridge.coef_), 0.401361, 6)
np.testing.assert_almost_equal(check_ridge.score(X, y), 0.169880, 6)

lamb2 = 100
check_ridge2 = ridge_regression(X, y, lamb2)
np.testing.assert_almost_equal(np.sum(check_ridge2.coef_), 0.373489, 6)
np.testing.assert_almost_equal(check_ridge2.score(X, y), 0.169831, 6)

### Exercise 6.3: Elastic Net

In [20]:
from sklearn.linear_model import ElasticNet

def elasticnet_regression(X, y, lamb, l1_ratio):
    """
    Implements elastic net regression, using sklearn
    
    Args: 
        X (pd.DataFrame): a pandas dataframe containing the features
        y (pd.Series): a pandas series containing the target variable
        lamb (float): total weight of regularization terms
        l1_ratio: the ratio of l1 to l2 loss terms
            
    Returns:
        elasticnet : an Elastic Net regression model fitted to X and y
    """
    
    # instantiate an instance of sklearn's Elastic Net model
    # use named arguments, such as alpha=..., l1_ratio=..., and random_state=...
    # use a random state of 42
    elasticnet = ElasticNet(alpha = lamb, l1_ratio=l1_ratio, random_state = 42)
    
    elasticnet.fit(X,y)
    
    return elasticnet

In [21]:
lamb = 0.001
l1_ratio = 0.3
check_enet = elasticnet_regression(X, y, lamb, l1_ratio)
np.testing.assert_almost_equal(np.sum(check_enet.coef_), 0.401365, 6)
np.testing.assert_almost_equal(check_enet.score(X, y), 0.169880, 6)

lamb2 = 10
l1_ratio2 = 0.9
check_enet2 = elasticnet_regression(X, y, lamb2, l1_ratio2)
np.testing.assert_almost_equal(np.sum(check_enet2.coef_), 0.039402, 6)
np.testing.assert_almost_equal(check_enet2.score(X, y), 0.159236, 6)