In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd


### Load the California Housing Dataset

In [2]:
data = fetch_california_housing()
X, y = data.data, data.target  # Features and target

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
y_test = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)



In [3]:
print(data.DESCR)


.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [4]:
# Create a DataFrame with feature names
df = pd.DataFrame(data.data, columns=data.feature_names)

# Add the target column
df['Target'] = data.target

# Display the first few rows of the dataset
print("Number of rows:", df.shape[0])
df.head()

Number of rows: 20640


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


### Define the model

In [6]:
class HousePricePredictor(nn.Module):
    def __init__(self, input_size):
        super(HousePricePredictor, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_size, 64),  # Input features
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.fc(x)





### Define the model without using the nn.Sequential class

In [23]:
class HousePricePredictor2(nn.Module):
    def __init__(self, input_size):
        super(HousePricePredictor2, self).__init__()

        # Correct input size to match the dataset
        self.fc1 = nn.Linear(input_size, 64)  # Input layer now accepts 8 features
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(32, 1)  # Output layer for regression

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x


In [22]:
input_size = X_train.shape[1]  # Automatically get the number of features
model = HousePricePredictor2(input_size)

# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
num_epochs = 500

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    predictions = model(X_train)
    loss = criterion(predictions, y_train)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 50 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")



Epoch 50/500, Loss: 7.7160
Epoch 100/500, Loss: 2.5645
Epoch 150/500, Loss: 2.0109
Epoch 200/500, Loss: 1.6540
Epoch 250/500, Loss: 1.4179
Epoch 300/500, Loss: 1.2478
Epoch 350/500, Loss: 1.1155
Epoch 400/500, Loss: 1.0090
Epoch 450/500, Loss: 0.9226
Epoch 500/500, Loss: 0.8530


In [11]:
# Evaluate the model
model.eval()
with torch.no_grad():
    test_predictions = model(X_test)
    mae = mean_absolute_error(y_test.numpy(), test_predictions.numpy())
    print(f"Test MAE: {mae:.2f}")


Test MAE: 0.58
