In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import os

# 设置中文字体支持
plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号

# 加载加利福尼亚房价数据集
housing = fetch_california_housing()
X = housing.data
y = housing.target

# 数据拆分：训练集(60%)、验证集(20%)、测试集(20%)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)  # 0.25 x 0.8 = 0.2

print(f"训练集大小: {X_train.shape[0]}")
print(f"验证集大小: {X_val.shape[0]}")
print(f"测试集大小: {X_test.shape[0]}")

# 数据标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# 自定义数据集类
class HousingDataset(Dataset):
    def __init__(self, features, targets):
        self.features = torch.FloatTensor(features)
        self.targets = torch.FloatTensor(targets).view(-1, 1)
        
    def __len__(self):
        return len(self.features) #返回样本数量
    
    def __getitem__(self, idx): #传入索引，返回对应索引样本的特征和目标
        return self.features[idx], self.targets[idx]

# 创建数据集实例
train_dataset = HousingDataset(X_train_scaled, y_train)
val_dataset = HousingDataset(X_val_scaled, y_val)
test_dataset = HousingDataset(X_test_scaled, y_test)


训练集大小: 13209
验证集大小: 3303
测试集大小: 4128


# 加载数据，构建模型

In [3]:
# 创建数据加载器
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# 定义神经网络模型
class RegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(RegressionModel, self).__init__()
        self.layer1 = nn.Linear(input_dim, 30)
        self.activation = nn.ReLU()
        self.output = nn.Linear(30, 1)
        
    def forward(self, x):
        x = self.activation(self.layer1(x))
        x = self.output(x)
        return x

# 初始化模型、损失函数和优化器
input_dim = X_train.shape[1]
model = RegressionModel(input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

# 打印模型结构
print(model)

RegressionModel(
  (layer1): Linear(in_features=8, out_features=30, bias=True)
  (activation): ReLU()
  (output): Linear(in_features=30, out_features=1, bias=True)
)


In [4]:
# 训练模型
from tqdm import tqdm
epochs = 100
train_losses = []
val_losses = []

global_step = 0

# 使用tqdm显示总体进度
for epoch in tqdm(range(epochs), desc="训练进度"):
    # 训练模式
    model.train()
    running_loss = 0.0
    running_mae = 0.0
    
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * inputs.size(0)

        
        # 更新全局步数
        global_step += 1
    
    epoch_train_loss = running_loss / len(train_loader.dataset)

    train_losses.append(epoch_train_loss)

    
    # 验证模式
    model.eval()
    running_loss = 0.0

    
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            
            running_loss += loss.item() * inputs.size(0)

    epoch_val_loss = running_loss / len(val_loader.dataset)

    val_losses.append(epoch_val_loss)

    
    if (epoch + 1) % 10 == 0:
        print(f'轮次 {epoch+1}/{epochs}, 训练损失: {epoch_train_loss:.4f}, 验证损失: {epoch_val_loss:.4f}, 全局步数: {global_step}')

训练进度:  10%|█         | 10/100 [00:06<00:55,  1.63it/s]

轮次 10/100, 训练损失: 0.3644, 验证损失: 0.3975, 全局步数: 4130


训练进度:  20%|██        | 20/100 [00:11<00:47,  1.67it/s]

轮次 20/100, 训练损失: 0.3316, 验证损失: 0.3928, 全局步数: 8260


训练进度:  30%|███       | 30/100 [00:16<00:34,  2.03it/s]

轮次 30/100, 训练损失: 0.3122, 验证损失: 0.4992, 全局步数: 12390


训练进度:  40%|████      | 40/100 [00:21<00:29,  2.04it/s]

轮次 40/100, 训练损失: 0.3051, 验证损失: 0.3271, 全局步数: 16520


训练进度:  50%|█████     | 50/100 [00:26<00:24,  2.06it/s]

轮次 50/100, 训练损失: 0.3003, 验证损失: 0.3290, 全局步数: 20650


训练进度:  60%|██████    | 60/100 [00:31<00:22,  1.74it/s]

轮次 60/100, 训练损失: 0.2961, 验证损失: 0.3524, 全局步数: 24780


训练进度:  70%|███████   | 70/100 [00:37<00:17,  1.76it/s]

轮次 70/100, 训练损失: 0.2934, 验证损失: 0.3641, 全局步数: 28910


训练进度:  80%|████████  | 80/100 [00:43<00:11,  1.81it/s]

轮次 80/100, 训练损失: 0.2901, 验证损失: 0.3288, 全局步数: 33040


训练进度:  90%|█████████ | 90/100 [00:48<00:05,  1.90it/s]

轮次 90/100, 训练损失: 0.2896, 验证损失: 0.3436, 全局步数: 37170


训练进度: 100%|██████████| 100/100 [00:53<00:00,  1.86it/s]

轮次 100/100, 训练损失: 0.2893, 验证损失: 0.3625, 全局步数: 41300





In [5]:
# 评估模型
def evaluate(model, dataloader, criterion):
    model.eval()
    running_loss = 0.0

    
    with torch.no_grad():
        for inputs, targets in dataloader:
            outputs = model(inputs) #前向计算
            loss = criterion(outputs, targets) #计算损失
            
            running_loss += loss.item() * inputs.size(0)
    
    return running_loss / len(dataloader.dataset)

train_loss = evaluate(model, train_loader, criterion)
val_loss= evaluate(model, val_loader, criterion)
test_loss = evaluate(model, test_loader, criterion)

print(f"训练集 - 均方误差: {train_loss:.4f}")
print(f"验证集 - 均方误差: {val_loss:.4f}")
print(f"测试集 - 均方误差: {test_loss:.4f}")

训练集 - 均方误差: 0.2890
验证集 - 均方误差: 0.3625
测试集 - 均方误差: 0.3384
