<a href="https://colab.research.google.com/github/syedabusafwan/ML-practice/blob/main/Linear_Regression_California_Housing_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Loading the Data**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Fetch the California housing dataset; as_frame=True returns pandas objects
housing = fetch_california_housing(as_frame=True)

# Combined table (feature columns plus the MedHouseVal target) used for EDA
df = housing.frame

# **EDA**

In [None]:
# List the fields available on the loaded dataset object
housing.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [None]:
# Preview the first rows of the combined feature/target table
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
# Column dtypes, non-null counts and memory footprint
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [None]:
# Count missing values per column
df.isna().sum()

Unnamed: 0,0
MedInc,0
HouseAge,0
AveRooms,0
AveBedrms,0
Population,0
AveOccup,0
Latitude,0
Longitude,0
MedHouseVal,0


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for every column
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


# **Linear Regression Model (Baseline)**

In [None]:
# Feature matrix and target vector, both taken from the loaded dataset
X, y = housing.data, housing.target

# Column labels, kept for labelling model coefficients later
feature_names = housing.feature_names

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Learn the scaling statistics from the training split only, so that no
# information from the test split leaks into preprocessing
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transform to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit an unregularized linear model on the standardized training features
lin_reg = LinearRegression().fit(X_train_scaled, y_train)

# Predict on the standardized test features
y_pred_lr = lin_reg.predict(X_test_scaled)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Score the linear model's test-set predictions
r2_lr = r2_score(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)  # square root of the MSE computed above

# Report each metric on its own line
print("Linear Regression Performance:")
for metric_label, metric_value in (
    ("MAE:", mae_lr),
    ("MSE:", mse_lr),
    ("RMSE:", rmse_lr),
    ("R-Squared:", r2_lr),
):
    print(metric_label, metric_value)

Linear Regression Performance:
MAE: 0.5332001304956565
MSE: 0.5558915986952442
RMSE: 0.7455813830127763
R-Squared: 0.575787706032451


# **Ridge Regression**

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Fit Ridge (L2-regularized) regression on the standardized training features.
# alpha sets the strength of the L2 penalty; it is fixed here, not tuned.
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_scaled, y_train)

In [None]:
# Predict on the standardized test features with the fitted Ridge model
y_pred_ridge = ridge.predict(X_test_scaled)

In [None]:
# Score the Ridge model's test-set predictions
r2_ridge = r2_score(y_test, y_pred_ridge)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)  # square root of the MSE computed above

# Report each metric on its own line
print("Ridge Regression Performance:")
for metric_label, metric_value in (
    ("MAE:", mae_ridge),
    ("MSE:", mse_ridge),
    ("RMSE:", rmse_ridge),
    ("R-Squared:", r2_ridge),
):
    print(metric_label, metric_value)

Ridge Regression Performance:
MAE: 0.5331931195789733
MSE: 0.5558548589435971
RMSE: 0.7455567442814779
R-Squared: 0.5758157428913684


# **Lasso Regression**

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Fit Lasso (L1-regularized) regression on the standardized training features.
# alpha=0.01 is a fixed penalty strength; it is not tuned in this notebook.
lasso = Lasso(alpha=0.01).fit(X_train_scaled, y_train)

# Predict on the standardized test features
y_pred_lasso = lasso.predict(X_test_scaled)

In [None]:
# Score the Lasso model's test-set predictions
r2_lasso = r2_score(y_test, y_pred_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)  # square root of the MSE computed above

# Report each metric on its own line
print("Lasso Regression Performance:")
for metric_label, metric_value in (
    ("MAE:", mae_lasso),
    ("MSE:", mse_lasso),
    ("RMSE:", rmse_lasso),
    ("R-Squared:", r2_lasso),
):
    print(metric_label, metric_value)

Lasso Regression Performance:
MAE: 0.5353261423609051
MSE: 0.5482548967938964
RMSE: 0.7404423656125414
R-Squared: 0.5816154300698727


In [None]:
# Label each fitted Ridge coefficient with its feature name, then print
# the header and the labelled series on separate lines
ridge_coefficients = pd.Series(data=ridge.coef_, index=feature_names)
print("Ridge Coefficients:", ridge_coefficients, sep="\n")

Ridge Coefficients:
MedInc        0.854327
HouseAge      0.122624
AveRooms     -0.294210
AveBedrms     0.339008
Population   -0.002282
AveOccup     -0.040833
Latitude     -0.896168
Longitude    -0.869071
dtype: float64


In [None]:
# Label each fitted Lasso coefficient with its feature name, then print
# the header and the labelled series on separate lines
lasso_coefficients = pd.Series(data=lasso.coef_, index=feature_names)
print("Lasso Coefficients:", lasso_coefficients, sep="\n")

Lasso Coefficients:
MedInc        0.800957
HouseAge      0.127087
AveRooms     -0.162759
AveBedrms     0.206207
Population   -0.000000
AveOccup     -0.030602
Latitude     -0.790113
Longitude    -0.755674
dtype: float64


**Observation**

Ridge Regression shrinks coefficients toward zero but does not set them exactly to zero (here, `Population` is shrunk to about -0.002 but remains non-zero).

Lasso Regression can reduce some coefficients to exactly **zero** (here, `Population`), effectively performing **feature selection**.

# **Analysis and Observation**

In [34]:
# One row per model, holding its test-set scores side by side
summary = pd.DataFrame(
    [
        ("Linear Regression", r2_lr, rmse_lr, mae_lr),
        ("Ridge Regression", r2_ridge, rmse_ridge, mae_ridge),
        ("Lasso Regression", r2_lasso, rmse_lasso, mae_lasso),
    ],
    columns=["Model", "R-Squared", "RMSE", "MAE"],
)

summary

Unnamed: 0,Model,R-Squared,RMSE,MAE
0,Linear Regression,0.575788,0.745581,0.5332
1,Ridge Regression,0.575816,0.745557,0.533193
2,Lasso Regression,0.581615,0.740442,0.535326


**Best Model Selection**

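Based on the evaluation metrics, Lasso Regression performs the best overall on the test set for predicting median house values in California. It achieves the **highest R-Squared (0.5816) and the lowest RMSE (0.7404)**, while its MAE (0.5353) is only marginally higher than that of the other two models. Ridge Regression is practically indistinguishable from plain Linear Regression (R-Squared of about 0.5758 for both), with only a negligibly lower MAE and RMSE. Lasso's improvement comes even though it sets the `Population` coefficient to zero, which suggests that this feature adds little predictive value in this dataset. The differences between all three models are small, and the regularization strengths (`alpha`) were fixed rather than tuned, so this ranking should be treated as indicative rather than conclusive.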

**Role of Regularization**

Regularization helps prevent overfitting by controlling the magnitude of model coefficients. **Ridge Regression** (L2 regularization) reduces coefficient values smoothly, improving generalization without eliminating features. **Lasso Regression** (L1 regularization) can reduce some coefficients to exactly zero, effectively performing feature selection. Both methods improve model robustness, especially when features are correlated, but Ridge is more suitable when all predictors contribute to the target variable.