In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score

In [3]:
from google.colab import files
uploaded = files.upload()

Saving AmesHousing.csv to AmesHousing.csv


In [11]:
ames = pd.read_csv("AmesHousing.csv")

good_cols = ames.isna().sum() < 100
ames = ames.loc[:,good_cols]

ames = ames.dropna()
ames.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,...,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,31770,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,0,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,11622,Pave,Reg,Lvl,AllPub,Inside,...,0,0,120,0,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,14267,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,0,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,11160,Pave,Reg,Lvl,AllPub,Corner,...,0,0,0,0,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,13830,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,3,2010,WD,Normal,189900


In [5]:
X = ames.drop(["SalePrice", "Order", "PID"], axis = 1)
y = ames["SalePrice"]


ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

In [6]:
cross_val_score(lr_pipeline_1, X, y, cv = 5, scoring = 'r2')

array([-2.59303720e+21, -1.13145211e+19, -7.57138616e+20, -4.47669752e+18,
       -2.55949915e+20])

In [9]:
X = ames.drop(["SalePrice", "Order", "PID"], axis = 1)
y = ames["SalePrice"]


ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_2 = Pipeline(
  [("preprocessing", ct),
  ("ridge_regression", Ridge(alpha = 1))]
)

In [10]:
cross_val_score(lr_pipeline_2, X, y, cv = 5, scoring = 'r2')

array([0.89815807, 0.91744024, 0.79493606, 0.78522563, 0.91389818])

In [25]:
from sklearn.model_selection import GridSearchCV, cross_val_score
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False, handle_unknown='ignore'), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ],
    remainder="passthrough"
)

#define pipeline
lr_pipeline_3 = Pipeline([
    ("preprocessing", ct),
    ("ridge_regression", Ridge())
]).set_output(transform="pandas")

#parameter grid for Ridge regression
param_grid = {
    "ridge_regression__alpha": [0.001, 0.01, 0.1, 1, 10]
}

#GridSearchCV with the pipeline and parameter grid
grid_search = GridSearchCV(estimator=lr_pipeline_3, param_grid=param_grid, cv=5, scoring='r2')

grid_search.fit(X, y)

In [28]:
grid_search.cv_results_

{'mean_fit_time': array([0.37112327, 0.23419213, 0.15524645, 0.135149  , 0.11048121]),
 'std_fit_time': array([0.09480347, 0.07481112, 0.05789662, 0.02833309, 0.01546037]),
 'mean_score_time': array([0.08986692, 0.04674115, 0.04290428, 0.04999123, 0.04361639]),
 'std_score_time': array([0.03480113, 0.01874216, 0.01141751, 0.00577791, 0.0102195 ]),
 'param_ridge_regression__alpha': masked_array(data=[0.001, 0.01, 0.1, 1.0, 10.0],
              mask=[False, False, False, False, False],
        fill_value=1e+20),
 'params': [{'ridge_regression__alpha': 0.001},
  {'ridge_regression__alpha': 0.01},
  {'ridge_regression__alpha': 0.1},
  {'ridge_regression__alpha': 1},
  {'ridge_regression__alpha': 10}],
 'split0_test_score': array([0.8972854 , 0.89734306, 0.89774358, 0.89815807, 0.8977621 ]),
 'split1_test_score': array([0.91040618, 0.91061417, 0.91230557, 0.91744024, 0.92081211]),
 'split2_test_score': array([0.78901601, 0.7891259 , 0.79010977, 0.79493606, 0.80057243]),
 'split3_test_score'

In [26]:
print("Best alpha:", grid_search.best_params_['ridge_regression__alpha'])
print("Best R^2 score:", grid_search.best_score_)

Best alpha: 10
Best R^2 score: 0.8642722110166747


In [37]:
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False, handle_unknown='ignore'), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ],
    remainder="passthrough"
)

#define the pipeline
lr_pipeline_4 = Pipeline([
    ("preprocessing", ct),
    ("lasso_regression", Lasso())
]).set_output(transform="pandas")

#define parameter grid for Ridge regression
param_grid2 = {
    "lasso_regression__alpha": [0.001, 0.01, 0.1, 1, 10]
}


#GridSearchCV with the pipeline and parameter grid
grid_search2 = GridSearchCV(estimator=lr_pipeline_4, param_grid=param_grid2, cv=5, scoring='r2')

grid_search2.fit(X, y)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [36]:
cross_val_score(lr_pipeline_4, X, y, cv = 5, scoring = 'r2')

  model = cd_fast.enet_coordinate_descent(


array([0.89774385, 0.91093785, 0.79691806, 0.77426245, 0.90589888])

In [38]:
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False, handle_unknown='ignore'), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ],
    remainder="passthrough"
)

elastic_net_pipeline = Pipeline([
    ("preprocessing", ct),
    ("elastic_net", ElasticNet())
]).set_output(transform="pandas")

param_grid = {
    "elastic_net__alpha": [0.001, 0.01, 0.1, 1, 10],
    "elastic_net__l1_ratio": [0.1, 0.5, 0.7, 0.9, 1.0]
}

grid_search_elastic = GridSearchCV(estimator=elastic_net_pipeline, param_grid=param_grid, cv=5, scoring='r2')
grid_search_elastic.fit(X, y)

# Get the best ElasticNet model
best_elastic_net_model = grid_search_elastic.best_estimator_
print("Best Elastic Net alpha:", grid_search_elastic.best_params_['elastic_net__alpha'])
print("Best Elastic Net l1_ratio:", grid_search_elastic.best_params_['elastic_net__l1_ratio'])
print("Best Elastic Net R^2 score:", grid_search_elastic.best_score_)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best Elastic Net alpha: 0.01
Best Elastic Net l1_ratio: 0.5
Best Elastic Net R^2 score: 0.8642678883652006


  model = cd_fast.enet_coordinate_descent(


In [39]:
#fit the best ElasticNet model on the full dataset
best_elastic_net_model.fit(X, y)

# Ridge model
ridge_model = Pipeline([
    ("preprocessing", ct),
    ("ridge", Ridge(alpha=grid_search_elastic.best_params_['elastic_net__alpha']))
]).set_output(transform="pandas")
ridge_model.fit(X, y)

#OLS model
ols_model = Pipeline([
    ("preprocessing", ct),
    ("ols", LinearRegression())
]).set_output(transform="pandas")
ols_model.fit(X, y)

#comparing the coefficients
elastic_net_coefs = best_elastic_net_model.named_steps["elastic_net"].coef_
ridge_coefs = ridge_model.named_steps["ridge"].coef_
ols_coefs = ols_model.named_steps["ols"].coef_

print("\nElastic Net Coefficients:", elastic_net_coefs)
print("Ridge Coefficients:", ridge_coefs)
print("OLS Coefficients:", ols_coefs)

  model = cd_fast.enet_coordinate_descent(



Elastic Net Coefficients: [-4.36707316e+03  2.58930223e+03 -9.19417357e+02  3.85391906e+03
  2.21693926e+03 -3.37367006e+03 -5.42672957e+03  5.42672956e+03
  1.86723305e+03  6.98591112e+03 -1.10890057e+04  2.23386196e+03
 -1.09990714e+04  8.99233117e+03 -1.25763948e+03  3.26437977e+03
  2.26595098e+03 -1.82550578e+03 -4.39445198e+02 -1.94736847e+01
  7.32006891e+03 -5.43410203e+03 -2.44365652e+03  5.78163411e+02
 -6.16362027e+02  5.18209052e+03 -4.56472859e+03 -3.87569981e+03
  1.90088683e+03  4.70492586e+03  3.00238837e+02 -3.50912503e+03
 -8.19981053e+03  9.22438676e+03 -1.44898572e+04 -1.06699450e+04
  1.14082112e+03  9.02194540e+03 -5.36482423e+03 -2.84779674e+02
  1.44588571e+03 -9.23421603e+03 -1.01670815e+04  3.40782016e+03
 -1.23392127e+04  2.60565188e+04  2.09499486e+04 -8.52307157e+03
 -5.88863748e+03 -5.64447620e+03 -8.43567282e+03  8.49411921e+03
  2.82637493e+04 -5.25961524e+03 -3.02122151e+03 -2.05208355e+03
 -4.12029964e+03  5.88166083e+03  7.06090161e+03  4.69101544e+0

In [42]:
feature_names = best_elastic_net_model.named_steps["preprocessing"].get_feature_names_out()

coef_df = pd.DataFrame({
    "Elastic Net": elastic_net_coefs,
    "Ridge": ridge_coefs,
    "OLS": ols_coefs
}, index=feature_names)

print("\nCoefficient Comparison:\n", coef_df)


Coefficient Comparison:
                             Elastic Net        Ridge           OLS
dummify__MS Zoning_C (all) -4367.073164 -4667.372061 -2.283838e+13
dummify__MS Zoning_FV       2589.302226  1542.980095 -2.283838e+13
dummify__MS Zoning_I (all)  -919.417357 -7515.308071 -2.283838e+13
dummify__MS Zoning_RH       3853.919064  7886.097997 -2.283838e+13
dummify__MS Zoning_RL       2216.939257  3768.112676 -2.283838e+13
...                                 ...          ...           ...
standardize__Screen Porch   3119.443565  2676.091536  2.672000e+03
standardize__Pool Area       410.933060  2743.480950  2.784000e+03
standardize__Misc Val      -4159.204931 -5120.987113 -5.092000e+03
standardize__Mo Sold        -614.755896  -773.819443 -7.680000e+02
standardize__Yr Sold        -979.307592  -618.612897 -6.080000e+02

[254 rows x 3 columns]
