In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from src.handle_data import merge_data
from src.handle_data import preprocess_data
from src.handle_data import split_data
from src.modelling import fit_model

In [2]:
# Location of the input CSV.
data_path = "data/boston_housing_data.csv"
# One seed shared by every part of the analysis so results are reproducible.
random_state = 20230325

# Produce train/test features and targets, with "MEDV" as the target column.
# NOTE(review): `proportion=0.75` is presumably the training share -- confirm
# against src.handle_data.split_data.
X_train, X_test, y_train, y_test = split_data(
    data_path,
    proportion=0.75,
    target="MEDV",
    random_state=random_state,
)

In [3]:
# Candidate Lasso penalties: 200 values evenly spaced on a natural-log scale,
# running from e**15 down to e**-5 (largest penalty first).
log_alphas = np.linspace(15, -5, 200)
alphas_range = np.exp(log_alphas)
def optimize_lasso(alphas_range, cv=5):
    """Cross-validate a preprocessed Lasso model over a grid of penalties.

    For each alpha, a pipeline of ``preprocess_data(X_train)`` followed by
    ``Lasso(alpha=alpha)`` is scored on the notebook-level ``X_train`` /
    ``y_train`` using RMSE, MSE and MAPE.

    Parameters
    ----------
    alphas_range : iterable of float
        Candidate Lasso penalties; evaluated in the order given.
    cv : int or cross-validation splitter, default 5
        Passed straight through to ``cross_validate``.

    Returns
    -------
    pandas.DataFrame
        One row per alpha with columns ``root_mean_squared_error``,
        ``mean_squared_error``, ``mean_absolute_percentage_error`` and
        ``alpha``, sorted ascending by RMSE and then MAPE. The index is the
        position of each alpha in ``alphas_range``.
    """
    metrics = [
        'neg_root_mean_squared_error',
        'neg_mean_squared_error',
        'neg_mean_absolute_percentage_error',
    ]
    # Materialise once so a one-shot iterable can be both looped over and
    # stored in the result table.
    alphas = list(alphas_range)
    result = {metric: [] for metric in metrics}

    preprocessor = preprocess_data(X_train)
    for a in alphas:
        mod = make_pipeline(preprocessor, Lasso(alpha=a))
        # A single cross_validate call scores every metric from the same
        # fitted folds, instead of refitting the model once per metric.
        scores = cross_validate(mod, X_train, y_train, cv=cv, scoring=metrics)
        for metric in metrics:
            # The "neg_" scorers return negated errors; flip the sign so that
            # smaller means better.
            result[metric].append(-1 * scores[f"test_{metric}"].mean())
    result["alpha"] = alphas

    out = (
        pd.DataFrame(result)
        .rename(columns=lambda s: s.replace("neg_", ""))
        .sort_values(by=["root_mean_squared_error", "mean_absolute_percentage_error"])
    )
    return out

In [4]:
# Score every candidate penalty; the resulting table comes back sorted by RMSE.
out = optimize_lasso(alphas_range)

In [5]:
# First row of the sorted table, i.e. the alpha with the lowest mean CV RMSE.
out.iloc[:1]

Unnamed: 0,root_mean_squared_error,mean_squared_error,mean_absolute_percentage_error,alpha
185,4.649109,22.25651,0.165758,0.027517


In [6]:
# Keep rows meeting all three thresholds, then show the last of them in the
# table's (RMSE-sorted) row order.
matches = out.query("alpha > 1 & root_mean_squared_error < 6 & mean_squared_error < 30")
matches.iloc[-1:]

Unnamed: 0,root_mean_squared_error,mean_squared_error,mean_absolute_percentage_error,alpha
143,5.431409,29.906345,0.207002,1.874123


In [7]:
def lasso_coeff(alpha, preprocessor=None):
    """Fit a Lasso model on the training data and tabulate its coefficients.

    Uses the notebook-level ``X_train`` / ``y_train``.

    Parameters
    ----------
    alpha : float
        Lasso penalty strength.
    preprocessor : transformer, optional
        When given, an unfitted copy is fitted on ``X_train`` and placed in
        front of the Lasso, so the coefficients are estimated on the same
        transformed features used when alpha was tuned through a preprocessing
        pipeline. When omitted, the model is fitted on the raw ``X_train``
        columns.

    Returns
    -------
    pandas.DataFrame
        A single ``"Coefficients"`` column with one row per model feature.
        Without a preprocessor the index is ``X_train.columns``. With one, it
        is the preprocessor's ``get_feature_names_out()`` when that is
        available, otherwise a positional index.
    """
    if preprocessor is None:
        coeffs = Lasso(alpha=alpha).fit(X_train, y_train).coef_
        feature_names = X_train.columns
    else:
        # Work on a clone so the caller's preprocessor is not fitted in place.
        fitted_pre = clone(preprocessor)
        model = make_pipeline(fitted_pre, Lasso(alpha=alpha)).fit(X_train, y_train)
        coeffs = model[-1].coef_
        try:
            # The transformer may reorder, drop or expand columns, so take the
            # labels from it rather than assuming X_train's column order.
            feature_names = fitted_pre.get_feature_names_out()
        except AttributeError:
            # No reliable names available; use positions rather than risk
            # attaching the wrong labels.
            feature_names = None
    out = pd.DataFrame(coeffs,
              index=feature_names,
              columns=["Coefficients"])
    return out

In [8]:
# Alpha is the (rounded) value shown in the best-RMSE row displayed earlier.
mo = lasso_coeff(0.027517)

In [9]:
# Display the fitted coefficients as a Series indexed by feature.
mo["Coefficients"]

CRIM      -0.103664
ZN         0.054449
INDUS      0.011923
CHAS       2.377299
NOX       -9.771399
RM         3.195303
AGE       -0.004238
DIS       -1.219419
RAD        0.307551
TAX       -0.015716
PTRATIO   -0.727845
B          0.008159
LSTAT     -0.485616
Name: Coefficients, dtype: float64