# Ridge Regression and Lasso

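This notebook explores ridge regression and lasso, two regularized alternatives to ordinary least-squares fitting. By penalizing large coefficients, they can improve a linear model's predictive performance and, in the case of lasso, its interpretability.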

## Import libraries 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

## Exploratory Data Analysis 

In [1]:
# Location of the advertising CSV, relative to the notebook's working directory.
DATAPATH = './data/Advertising.csv'

# Load the file into a DataFrame and preview the first five rows.
data = pd.read_csv(filepath_or_buffer=DATAPATH)
data.head(n=5)

[0;31m----------------------------------------------------------------[0m
[0;31mFileNotFoundError[0m              Traceback (most recent call last)
Cell [0;32mIn[7], line 3[0m
[1;32m      1[0m DATAPATH [38;5;241m=[39m [38;5;124m'[39m[38;5;124mdata/Advertising.csv[39m[38;5;124m'[39m
[0;32m----> 3[0m data [38;5;241m=[39m [43mpd[49m[38;5;241;43m.[39;49m[43mread_csv[49m[43m([49m[43mDATAPATH[49m[43m)[49m
[1;32m      4[0m data[38;5;241m.[39mhead()

File [0;32m/home/linuxbrew/.linuxbrew/Cellar/ipython/8.21.0/libexec/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1024[0m, in [0;36mread_csv[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunks

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index that was saved into the
# CSV -- confirm against the file). Rebinding `data` instead of mutating it in
# place, together with errors='ignore', keeps this cell idempotent: re-running
# it after the column is already gone no longer raises KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Preview the first rows again to check the frame after dropping 'Unnamed: 0'.
data.head()

In [None]:
# List the column names; the cells below use them as feature/target keys.
data.columns

In [None]:
def scatter_plot(feature, target, df=None):
    """Draw a scatter plot of one column against another.

    Parameters
    ----------
    feature : str
        Name of the column plotted on the x-axis.
    target : str
        Name of the column plotted on the y-axis.
    df : pandas.DataFrame, optional
        Frame to read both columns from. When omitted, the notebook-level
        ``data`` frame is used, so existing two-argument calls are unchanged.
    """
    frame = data if df is None else df

    fig, ax = plt.subplots(figsize=(16, 8))
    ax.scatter(frame[feature], frame[target], c='black')
    ax.set_xlabel("Money spent on {} ads ($)".format(feature))
    # Build the y-label from `target` instead of hard-coding it, so the label
    # stays correct for other targets. 'sales' still renders as "Sales ($k)".
    # NOTE(review): the "($k)" unit is carried over from the original label --
    # confirm it applies to any other target column you plot.
    ax.set_ylabel("{} ($k)".format(target[:1].upper() + target[1:]))
    plt.show()

In [None]:
# Sales against TV advertising spend.
scatter_plot(feature='TV', target='sales')

In [None]:
# Sales against radio advertising spend.
scatter_plot(feature='radio', target='sales')

In [None]:
# Sales against newspaper advertising spend.
scatter_plot(feature='newspaper', target='sales')

## Modelling 

### Multiple linear regression - least squares fitting 

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

# Features are every column except the target; the target is 'sales',
# reshaped into a single-column 2-D array.
Xs = data.drop(columns=['sales'])
y = data['sales'].to_numpy().reshape(-1, 1)

lin_reg = LinearRegression()

# 5-fold cross-validation. With scoring='neg_mean_squared_error' each fold's
# score is the *negated* MSE, so the values are <= 0 and closer to 0 is better.
MSEs = cross_val_score(
    estimator=lin_reg,
    X=Xs,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
)

# Average of the negated per-fold MSEs (kept negative so it is directly
# comparable with GridSearchCV's best_score_ under the same scoring).
mean_MSE = MSEs.mean()

print(mean_MSE)

### Ridge regression 

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# Candidate regularization strengths, from effectively unregularized (1e-15)
# up to strong shrinkage (20).
alpha = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]

ridge = Ridge()

# Reuse the grid defined above instead of repeating the literal list, so the
# searched values live in exactly one place.
parameters = {'alpha': alpha}

# Exhaustive search over `alpha` with 5-fold CV, scored by negated MSE
# (higher, i.e. closer to 0, is better).
ridge_regressor = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error', cv=5)

ridge_regressor.fit(Xs, y)

In [None]:
# Alpha value from the grid that achieved the best mean cross-validated score.
ridge_regressor.best_params_

In [None]:
# Best mean CV score. The search used scoring='neg_mean_squared_error', so this
# is a negated MSE: closer to 0 is better.
ridge_regressor.best_score_

### Lasso 

In [None]:
from sklearn.linear_model import Lasso

lasso = Lasso()

# Search the same `alpha` grid that the ridge cell defines, rather than keeping
# another copy of the literal list, so the two models are always compared over
# identical candidates. (GridSearchCV, Xs and y also come from earlier cells.)
parameters = {'alpha': alpha}

# 5-fold CV grid search scored by negated MSE (closer to 0 is better).
lasso_regressor = GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', cv=5)

lasso_regressor.fit(Xs, y)

In [None]:
# Alpha value from the grid that achieved the best mean cross-validated score.
lasso_regressor.best_params_

In [None]:
# Best mean CV score. The search used scoring='neg_mean_squared_error', so this
# is a negated MSE: closer to 0 is better.
lasso_regressor.best_score_