In [1]:
# 导入模块
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
# 读取数据集
data = pd.read_csv('../data/pollution.csv',parse_dates=['date'])
data.head()

Unnamed: 0,date,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain
0,2010-01-02 00:00:00,129.0,-16,-4.0,1020.0,SE,1.79,0,0
1,2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,SE,2.68,0,0
2,2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,SE,3.57,0,0
3,2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,SE,5.36,1,0
4,2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,SE,6.25,2,0


In [4]:
# 对 'Category' 列进行编码
encoder = LabelEncoder()
data['wnd_dir'] = encoder.fit_transform(data['wnd_dir'])
data = data.set_index('date')
data.head()

Unnamed: 0_level_0,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-02 00:00:00,129.0,-16,-4.0,1020.0,2,1.79,0,0
2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,2,2.68,0,0
2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,2,3.57,0,0
2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,2,5.36,1,0
2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,2,6.25,2,0


In [5]:
# 归一化
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data)
data = pd.DataFrame(scaled_values, columns=data.columns)
data.head()

Unnamed: 0,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain
0,0.129779,0.352941,0.245902,0.527273,0.666667,0.00229,0.0,0.0
1,0.148893,0.367647,0.245902,0.527273,0.666667,0.003811,0.0,0.0
2,0.15996,0.426471,0.229508,0.545455,0.666667,0.005332,0.0,0.0
3,0.182093,0.485294,0.229508,0.563636,0.666667,0.008391,0.037037,0.0
4,0.138833,0.485294,0.229508,0.563636,0.666667,0.009912,0.074074,0.0


In [6]:
data.shape

(43800, 8)

In [7]:
# 构造24小时数据集预测1小时数据
input = []
label = []
for i in range(data.shape[0]-24):
    input.append((data.iloc[i:i+24,1:]).values)
    label.append(data.iloc[i+24,0])

input=np.array(input)
label = np.array(label)
print(input.shape,label.shape)

(43776, 24, 7) (43776,)


In [8]:
X = torch.tensor(input, dtype=torch.float32).to(device)
y = torch.tensor(label, dtype=torch.float32).to(device)

In [9]:
time_length = 24*365

In [10]:
Xtrain = X[:24*365]
Xtest = X[24*365:]

ytrain = y[:24*365]
ytest=y[24*365:]

In [11]:
# 创建训练集数据加载器
# TensorDataset用于将输入特征和标签数据打包成一个可迭代的对象。
trainset = TensorDataset(Xtrain, ytrain)
batch_size = 64
train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
len(train_loader)

137

In [12]:
# 创建测试集数据加载器
# TensorDataset用于将输入特征和标签数据打包成一个可迭代的对象。
testset = TensorDataset(Xtest, ytest)
batch_size = 64
test_loader = DataLoader(testset, batch_size=batch_size, shuffle=True)
len(test_loader)

548

In [13]:
# 获取数据加载器中的第一个批次
for data, labels in test_loader:
    print("数据批次形状:", data.shape)  # 打印数据的形状
    print("标签批次形状:", labels.shape)  # 打印标签的形状
    break  # 只获取第一个批次，避免遍历整个数据集

数据批次形状: torch.Size([64, 24, 7])
标签批次形状: torch.Size([64])


In [14]:
import torch.nn as nn
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        lstm_out, (h_n, c_n) = self.lstm(x)
        out = self.fc(lstm_out[:, -1, :])  # 只取最后一个时间步的输出
        return out

# 初始化模型、损失函数和优化器
input_size = Xtrain.shape[2]  # 特征数
hidden_size = 50               # LSTM 隐藏层单元数
output_size = 1                # 假设 y 是一个标量

model = LSTMModel(input_size, hidden_size, output_size).to(device)
loss_funciton = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [15]:

# 训练模型
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for inputs, targets in train_loader:
        optimizer.zero_grad()

        # 前向传播
        outputs = model(inputs)

        # 计算损失
        loss = loss_funciton(outputs, targets)

        # 反向传播和优化
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # 输出每个 epoch 的平均损失
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader)}")

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/10], Loss: 0.008826154207701992
Epoch [2/10], Loss: 0.008801617907754479
Epoch [3/10], Loss: 0.00880196440733806
Epoch [4/10], Loss: 0.008767340209906119
Epoch [5/10], Loss: 0.008784593096560371
Epoch [6/10], Loss: 0.008755727365177914
Epoch [7/10], Loss: 0.008768403389402768
Epoch [8/10], Loss: 0.008796901136201664
Epoch [9/10], Loss: 0.008761707326247745
Epoch [10/10], Loss: 0.008749207216620881


In [16]:
# 设置模型为评估模式
model.eval()
with torch.no_grad():
    # 创建评估变量
    total_loss = 0
    total_samples = 0
    correct_predictions = 0

    # 评估测试集
    for inputs, targets in test_loader:
        # 前向传播
        outputs = model(inputs)

        # 计算损失
        loss = loss_funciton(outputs, targets)
        total_loss += loss.item() * inputs.size(0)  # 累积损失

        # 统计正确预测
        correct_predictions += (outputs.round() == targets).sum().item()  # 对于回归任务可修改为适应任务类型

        total_samples += inputs.size(0)

    # 计算平均损失和准确率
    avg_loss = total_loss / total_samples
    accuracy = correct_predictions / total_samples

    # 输出结果
    print(f"Test Loss: {avg_loss:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")


Test Loss: 0.0089
Test Accuracy: 2.5134


  return F.mse_loss(input, target, reduction=self.reduction)
