In [89]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

##**1. Loading and Preprocessing**

---
 **Load the California Housing dataset using the fetch_california_housing function from sklearn.**




##**Step 1:**

In [90]:
housing = fetch_california_housing() #load data from california housing
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

##**Step 2:**

 **Convert the dataset into a pandas DataFrame for easier handling. Handle
missing values (if any) and perform necessary feature scaling (e.g.,
standardization).**

In [91]:
# since the data load as a disctionary, we need to convert the dataset into dataframe
df=pd.DataFrame(housing.data,columns=housing.feature_names)
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [92]:
df['MedHouseVal']=housing.target # adding target column to the dataframe
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


##**Step 3:**

In [93]:
print("Shape of DataFrame",df.shape)
print("\nColumns of DataFrame",df.columns)

df.describe()



Shape of DataFrame (20640, 9)

Columns of DataFrame Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'MedHouseVal'],
      dtype='object')


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


##**Step 4:**

In [94]:
# Checking for missing values.
df.isna().sum()

Unnamed: 0,0
MedInc,0
HouseAge,0
AveRooms,0
AveBedrms,0
Population,0
AveOccup,0
Latitude,0
Longitude,0
MedHouseVal,0


##**Step 5:**

In [95]:
#Check for duplicate

duplicate_rows=df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_rows}")

Number of duplicate rows: 0


In [96]:
# Finding outliers and handle it
num_cols=df.select_dtypes(include=['int64','float64']).columns

for col in num_cols:
    Q1 = df[col].quantile(0.25)
    print(f"Q1 for {col}: {Q1}")
    Q2 = df[col].quantile(0.50)
    print(f"Q2 for {col}: {Q2}")
    Q3 = df[col].quantile(0.75)
    print(f"Q3 for {col}: {Q3}")
    IQR = Q3 - Q1
    print(f"IQR for {col}: {IQR}")
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    print(f"Outliers count for {col}:{len(outliers)}\n")
    df[col] = df[col].clip(lower_bound, upper_bound)

Q1 for MedInc: 2.5633999999999997
Q2 for MedInc: 3.5347999999999997
Q3 for MedInc: 4.74325
IQR for MedInc: 2.17985
Outliers count for MedInc:681

Q1 for HouseAge: 18.0
Q2 for HouseAge: 29.0
Q3 for HouseAge: 37.0
IQR for HouseAge: 19.0
Outliers count for HouseAge:0

Q1 for AveRooms: 4.440716235896959
Q2 for AveRooms: 5.229128787878788
Q3 for AveRooms: 6.052380952380952
IQR for AveRooms: 1.6116647164839932
Outliers count for AveRooms:511

Q1 for AveBedrms: 1.006079046038478
Q2 for AveBedrms: 1.048780487804878
Q3 for AveBedrms: 1.099526066350711
IQR for AveBedrms: 0.09344702031223284
Outliers count for AveBedrms:1424

Q1 for Population: 787.0
Q2 for Population: 1166.0
Q3 for Population: 1725.0
IQR for Population: 938.0
Outliers count for Population:1196

Q1 for AveOccup: 2.4297411475535755
Q2 for AveOccup: 2.818115654360196
Q3 for AveOccup: 3.2822609242736216
IQR for AveOccup: 0.8525197767200461
Outliers count for AveOccup:711

Q1 for Latitude: 33.93
Q2 for Latitude: 34.26
Q3 for Latitude

##**Step 6:**

In [97]:
#checking for categorical value to check whether encoding is needed. There is no need for encoding as there is no categorical values
cat_cols=df.select_dtypes(include=['object']).columns
print(cat_cols)

# Numerical columns
num_cols=df.select_dtypes(include=['int64','float64']).columns
print(num_cols)

Index([], dtype='object')
Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'MedHouseVal'],
      dtype='object')


In [98]:
#Performing feature scaling for numerical values

scaler=StandardScaler()
scaled_features = scaler.fit_transform(df[num_cols])

scaled_df=pd.DataFrame(scaled_features,columns=num_cols)
scaled_df['MedHouseVal']=df['MedHouseVal']
print("\nScaled DataFrame:\n")
scaled_df


Scaled DataFrame:



Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,2.541006,0.982143,1.347665,-0.424488,-1.325821,-0.497871,1.052548,-1.327835,4.526
1,2.541006,-0.607019,0.749027,-1.070004,1.389936,-1.142781,1.043185,-1.322844,3.585
2,2.085156,1.856182,2.394098,0.192534,-1.098528,-0.140910,1.038503,-1.332827,3.521
3,1.111288,1.856182,0.411358,0.187723,-1.017539,-0.508882,1.038503,-1.337818,3.413
4,0.027262,1.856182,0.784108,0.287439,-1.008395,-1.039145,1.038503,-1.337818,3.422
...,...,...,...,...,...,...,...,...,...
20635,-1.351765,-0.289187,-0.208070,0.936973,-0.642637,-0.490563,1.801647,-0.758826,0.781
20636,-0.750601,-0.845393,0.649438,2.259147,-1.281408,0.322894,1.806329,-0.818722,0.771
20637,-1.267488,-0.924851,-0.079603,0.772378,-0.431019,-0.830546,1.778237,-0.823713,0.923
20638,-1.166620,-0.845393,0.019880,1.416631,-0.778490,-1.123439,1.778237,-0.873626,0.847


For Loading and Preprocessing the following steps are performed,

**Step 1**: Loaded the california housing dataset(Fetch_california_housing) from sklearn library which is an inbuilt dataset provided by sklearn for Machine Learning practice. It contains the features describing California districts (e.g., median income, average rooms, population) and the target value (median house value).

**Step 2** : Since the loaded dataset is in dictionary format, we need to convert them into Pandas DataFrame format to make it easier for viewing and perform preprocessing techniques. Then add the target column 'MedHouseVal' to the DataFrame to make the features and target together.

**Step 3** : Display the overall informations about the dataset such as shape,columns etc and statistical measurements using describe() method. This will help us to identify whether the dataset loaded correctly and an overall idea about the columns.The method describe() provides a quick numerical summary of the dataset.

**Step 4** : Checking for the duplicate value using isna() method. If there is larger amount of missing data , it can cause dataset bias which lead to low performance. Here there is no missing datas found.

**Step 5** : After missing data handling, next step will be the handling of outliers. Here I used the Inter Quartile Range(IQR) method to check and manage outliers. Outliers will affect the model perfoemance and lead to bias. Capping keeps the data within reasonable limits without losing valuable samples.

**Step 6** : Here Step 1 - 5 performing the data cleaning peocess in preprocessing. Next we need to perform data tranformation to convert the data into a suitable format for machine learning algorithms. In transformation categorical values and numerical values are treated separately.
Encoding is used for categorical data.It convert categorical data to numerical values. So here first check if categorical data is available, here it is zero. So no need to encode
Scaling is applied on numerical features inorder to make them in a unique format.Scaling brings all features to a similar scale, improving model stability. Here I performed standardscalar for scaling.

##**2. Regression Algorithm Implementation:**

● Implement the following regression algorithms:

○ Linear Regression

○ Decision Tree Regressor

○ Random Forest Regressor


● For each algorithm: Provide a brief explanation of how it works. Explain why it
might be suitable for this dataset.

In [99]:
#Define X and Y from the scaled dataset
x = scaled_df.drop('MedHouseVal', axis=1)
y = scaled_df['MedHouseVal']

#Split the data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


# Linear Regression
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred_linear = linear_reg.predict(X_test)



**Linear Regression**

---



**Linear Regression is the relationship between one dependent variable (target) and one or more independent variables (features) by fitting a straight line. It uses the least squares method to minimize the difference between the actual and predicted values.**

**The California housing dataset has numeric features like income, number of rooms, and population, so Linear Regression can be used.It works well as a basic model to see how the features relate to house prices.**

In [100]:
# Decision Tree Regression
decision_tree_reg = DecisionTreeRegressor(random_state=42)
decision_tree_reg.fit(X_train, y_train)
y_pred_decision_tree = decision_tree_reg.predict(X_test)



**Decision Tree**

---



**Decision Tree is a Supervised Machine Learning algorithm used for both classification and prediction tasks.It splits data into smaller subsets based on feature thresholds, forming a hierachial structure. Each split tries to make the target values in each group as similar as possible. At the end, each leaf of the tree predicts the outcome forthat feature.**

**It is suitable for The California Housing dataset because it captures nonlinear relationships between features (like location, income, and rooms).It is also easy to understand, as you can see how the data is split at each step to make predictions.**

In [101]:
# Random Forest Regression
random_forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_reg.fit(X_train, y_train)
y_pred_random_forest = random_forest_reg.predict(X_test)

**Random Forest**

---



**Random Forest is an ensemble model that builds many Decision Trees on different random subsets of the data and features, then averages their predictions to reduce overfitting and improve accuracy.**

**It works well on datasets with different types of features. It can handle nonlinear relationships and interactions between features, and usually gives accurate predictions on structured data like California Housing Dataset.**

##**3. Model Evaluation and Comparison:**

● Evaluate the performance of each algorithm using the following metrics:

○ Mean Squared Error (MSE)

○ Mean Absolute Error (MAE)

○ R-squared Score (R2)

● Compare the results of all models and identify: The best-performing algorithm
with justification and the worst-performing algorithm with reasoning.

In [102]:
def evaluate_model(y_true, y_pred, name):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"\n{name} Performance:")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"R2 Score: {r2:.4f}")


evaluate_model(y_test, y_pred_linear, "Linear Regression")
evaluate_model(y_test, y_pred_decision_tree, "Decision Tree Regression")
evaluate_model(y_test, y_pred_random_forest, "Random Forest Regression")


Linear Regression Performance:
Mean Squared Error (MSE): 0.4424
Mean Absolute Error (MAE): 0.4947
R2 Score: 0.6501

Decision Tree Regression Performance:
Mean Squared Error (MSE): 0.4864
Mean Absolute Error (MAE): 0.4504
R2 Score: 0.6153

Random Forest Regression Performance:
Mean Squared Error (MSE): 0.2427
Mean Absolute Error (MAE): 0.3238
R2 Score: 0.8080


##**Linear Regression:**
**The Linear Regression model has an MSE of 0.4424, MAE of 0.4947, and R2 of 0.6501. This means it explains about 65% of the variation in house prices. The errors are moderate, so the model predicts reasonably well. However, it cannot capture nonlinear relationships in the data, which limits its accuracy for more complex patterns.**

##**Decision Tree Regressor:**
**The Decision Tree Regressor shows an MSE of 0.4864, MAE of 0.4504, and R2 of 0.6153. Its R2 is slightly lower than Linear Regression, and the MSE is higher, indicating that it generalizes a bit worse. While it can capture some nonlinear patterns, it may overfit the training data, which reduces its performance on unseen data.So we can conclude that the worst model is the Decision Tree Regressor.**

##**Random Forest Regressor:**
**The Random Forest Regressor has an MSE of 0.2427, MAE of 0.3238, and R2 of 0.8080, explaining over 80% of the variance in house prices. Both the MSE and MAE are much lower than the other models, meaning its predictions are closer to the actual values. It captures complex patterns in the data while avoiding overfitting, making it the best-performing model among the three.**