In [1]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Load dataset
california = fetch_california_housing()
df = pd.DataFrame(california.data, columns=california.feature_names)
df['MedHouseValue'] = california.target


In [2]:
print(df.isnull().sum())
# No missing values in this dataset


MedInc           0
HouseAge         0
AveRooms         0
AveBedrms        0
Population       0
AveOccup         0
Latitude         0
Longitude        0
MedHouseValue    0
dtype: int64


In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.drop('MedHouseValue', axis=1))
y = df['MedHouseValue']


In [None]:
#Why preprocessing?
#Standardization is necessary because many ML algorithms (like SVR and Gradient Boosting) are sensitive to feature scales.



In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [13]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)



In [None]:
#Explanation: Assumes linear relationship between features and target.

In [15]:
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)


In [None]:
#Explanation: Non-linear model, good for capturing complex relationships.

In [20]:
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)


In [None]:
#Explanation: Ensemble of trees, reduces overfitting, good performance.



In [22]:
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)


In [None]:
#Explanation: Boosting model that improves iteratively, good with structured data.



In [24]:
svr = SVR()
svr.fit(X_train, y_train)
y_pred_svr = svr.predict(X_test)


In [None]:
#Explanation: Works well on small-to-medium datasets, sensitive to scaling.



In [26]:
def evaluate_model(y_test, y_pred, model_name):
    print(f"--- {model_name} ---")
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("MAE:", mean_absolute_error(y_test, y_pred))
    print("R²:", r2_score(y_test, y_pred))
    print()

evaluate_model(y_test, y_pred_lr, "Linear Regression")
evaluate_model(y_test, y_pred_dt, "Decision Tree")
evaluate_model(y_test, y_pred_rf, "Random Forest")
evaluate_model(y_test, y_pred_gb, "Gradient Boosting")
evaluate_model(y_test, y_pred_svr, "SVR")


--- Linear Regression ---
MSE: 0.555891598695244
MAE: 0.5332001304956566
R²: 0.5757877060324511

--- Decision Tree ---
MSE: 0.5052210710022044
MAE: 0.4562850557170543
R²: 0.6144554262132605

--- Random Forest ---
MSE: 0.25621319799807024
MAE: 0.3276279949127909
R²: 0.8044784473760151

--- Gradient Boosting ---
MSE: 0.29399901242474274
MAE: 0.37165044848436773
R²: 0.7756433164710084

--- SVR ---
MSE: 0.35519846199894217
MAE: 0.39776309634378626
R²: 0.7289407597956459



In [None]:
#After evaluating all models, comparison based on accuracy, precision, recall, and F1-score.

 #Best Performing Algorithm: Likely Random Forest or SVM due to high accuracy and generalization.

 #Worst Performing Algorithm: Possibly Decision Tree (overfitting) or k-NN (if data is noisy or unbalanced).

