In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

### Step 1: Problem Definition

In [34]:
# Load a regression dataset (Boston housing prices)
df = pd.read_csv('BostonHousePriceData.csv')
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [35]:
X = pd.DataFrame(df, columns=df.columns)
y = pd.Series(df.PRICE)

### Step 2: Data Preparation

In [36]:

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
# Scale the features (important for regularization)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


### Step 3: Model Definition - Linear Regression

In [38]:
linear_regressor = LinearRegression()

### Step 4: Train the model

In [39]:
linear_regressor.fit(X_train_scaled, y_train)

### Step 5: Model Evaluation

In [40]:

# Predictions on the test set
y_pred = linear_regressor.predict(X_test_scaled)

In [41]:
# Calculate the MSE, RMSE, and R-squared
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [42]:
print("Linear Regression Results:")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r2}")

Linear Regression Results:
MSE: 1.1530578769424002e-26
RMSE: 1.073805325439579e-13
R-squared: 1.0


### Step 6: Regularization (Ridge and Lasso)

In [43]:
# Ridge Regression (L2 regularization)
ridge = Ridge()
ridge.fit(X_train_scaled, y_train)
y_pred_ridge = ridge.predict(X_test_scaled)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

In [44]:
# Lasso Regression (L1 regularization)
lasso = Lasso()
lasso.fit(X_train_scaled, y_train)
y_pred_lasso = lasso.predict(X_test_scaled)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

In [45]:
print("\nRidge Regression Results:")
print(f"MSE: {mse_ridge}")
print(f"RMSE: {rmse_ridge}")
print(f"R-squared: {r2_ridge}")


Ridge Regression Results:
MSE: 0.0023260425405819568
RMSE: 0.048229063235583965
R-squared: 0.9999682814326448


In [46]:
print("\nLasso Regression Results:")
print(f"MSE: {mse_lasso}")
print(f"RMSE: {rmse_lasso}")
print(f"R-squared: {r2_lasso}")


Lasso Regression Results:
MSE: 0.8638481636830039
RMSE: 0.9294343245668324
R-squared: 0.9882203245699945


### Step 7: Hyperparameter Tuning (using GridSearchCV)

In [47]:
# Hyperparameter tuning for Ridge and Lasso (regularization strength alpha)
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

In [48]:
ridge_search = GridSearchCV(Ridge(), param_grid, cv=5)
lasso_search = GridSearchCV(Lasso(), param_grid, cv=5)

In [49]:
# Perform grid search for Ridge
ridge_search.fit(X_train_scaled, y_train)
best_ridge = ridge_search.best_estimator_


In [50]:
# Perform grid search for Lasso
lasso_search.fit(X_train_scaled, y_train)
best_lasso = lasso_search.best_estimator_

In [51]:
# Evaluate the best models from GridSearchCV
y_pred_best_ridge = best_ridge.predict(X_test_scaled)
y_pred_best_lasso = best_lasso.predict(X_test_scaled)

mse_best_ridge = mean_squared_error(y_test, y_pred_best_ridge)
rmse_best_ridge = np.sqrt(mse_best_ridge)
r2_best_ridge = r2_score(y_test, y_pred_best_ridge)

mse_best_lasso = mean_squared_error(y_test, y_pred_best_lasso)
rmse_best_lasso = np.sqrt(mse_best_lasso)
r2_best_lasso = r2_score(y_test, y_pred_best_lasso)

print("\nBest Ridge Regression Results after Grid Search:")
print(f"Best Alpha: {ridge_search.best_params_}")
print(f"MSE: {mse_best_ridge}")
print(f"RMSE: {rmse_best_ridge}")
print(f"R-squared: {r2_best_ridge}")

print("\nBest Lasso Regression Results after Grid Search:")
print(f"Best Alpha: {lasso_search.best_params_}")
print(f"MSE: {mse_best_lasso}")
print(f"RMSE: {rmse_best_lasso}")
print(f"R-squared: {r2_best_lasso}")


Best Ridge Regression Results after Grid Search:
Best Alpha: {'alpha': 0.01}
MSE: 2.3974686843684674e-07
RMSE: 0.0004896395290791449
R-squared: 0.9999999967307446

Best Lasso Regression Results after Grid Search:
Best Alpha: {'alpha': 0.01}
MSE: 8.638481636829308e-05
RMSE: 0.00929434324566793
R-squared: 0.999998822032457
