In [2]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression

from sklearn.decomposition import PCA

from sklearn.metrics import mean_squared_error

In [3]:
# Load the California Housing dataset

data = fetch_california_housing(as_frame=True)
df = data.frame

In [4]:
# Features and Target

X = df.drop(columns="MedHouseVal")
y = df["MedHouseVal"]


In [5]:
# Standardize the data (important for PCA)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
# Apply PCA

pca = PCA()

X_pca = pca.fit_transform(X_scaled)

In [7]:
# Explained variance ratio

explained_variance = pca.explained_variance_ratio_

print('Explained variance ratio :', explained_variance)

Explained variance ratio : [0.25336868 0.23516245 0.15888635 0.12887971 0.12538195 0.0824225
 0.01020022 0.00569814]


In [8]:
# Choose number of components (e.g., to capture 95% variance)

cumulative_variance = np.cumsum(explained_variance)

n_components = np.argmax( cumulative_variance > 0.95)  + 1        

print(f"Number of components to retain 95% variance: {n_components}")


Number of components to retain 95% variance: 6


In [9]:
# Reduce the dataset to the selected number of components

pca = PCA(n_components=n_components)

X_reduced = pca.fit_transform(X_scaled)

In [10]:
# Train a regression model

X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state= 42)

model = LinearRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [11]:
# Evaluate the model

mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error with PCA{n_components} components :  {mse}")

Mean Squared Error with PCA6 components :  0.6676194933473529
