#### 使用RNN实现一个天气预测模型，能预测5天的最高气温

In [2]:
import pandas as pd
import torch
import math
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error
from torch.utils.tensorboard import SummaryWriter

In [7]:
def read_data(path='/kaggle/input/weatherww2/Summary of Weather.csv', min_days=365):
    """Load per-station maximum temperatures as one equal-length tensor.

    Rows are grouped by the ``STA`` column. Stations with fewer than
    ``min_days`` rows are dropped, and every remaining ``MaxTemp`` series is
    truncated to the length of the shortest kept series so they can be
    stacked.

    Args:
        path: CSV file containing at least the ``STA`` and ``MaxTemp`` columns.
        min_days: Minimum number of rows a station needs in order to be kept.

    Returns:
        A float32 tensor of shape ``(stations, min_len, 1)``.

    Raises:
        ValueError: If no station has at least ``min_days`` rows.
    """
    # Only these two columns are used, so the rest of the file is not parsed.
    df = pd.read_csv(path, usecols=['STA', 'MaxTemp'])

    sta_temp_list = [
        torch.tensor(group['MaxTemp'].values, dtype=torch.float32)
        for _, group in df.groupby('STA')
        if len(group) >= min_days
    ]
    if not sta_temp_list:
        raise ValueError(f"no station in {path!r} has at least {min_days} rows")

    # Truncate to the shortest kept series so all stations share one length.
    min_len = min(len(series) for series in sta_temp_list)
    stacked = torch.stack([series[:min_len] for series in sta_temp_list])
    # Add the trailing feature dimension: (stations, min_len) -> (stations, min_len, 1).
    return stacked.unsqueeze(-1)

def _write_series(log_dir, series, max_series, offset, first_feature=False):
    """Log up to ``max_series`` leading rows of ``series`` as TensorBoard scalars."""
    writer = SummaryWriter(log_dir=log_dir)
    try:
        # Clamp the count so having fewer rows than requested is not an IndexError.
        for n in range(min(max_series, len(series))):
            item = series[n]
            for i in range(item.shape[0]):
                value = item[i][0] if first_feature else item[i]
                writer.add_scalar(f'series/num{n}', value, global_step=i + offset)
    finally:
        # Always release the event file, even if logging fails midway.
        writer.close()


def plot_series(x, y=None, y_pred=None):
    """Write the first (up to 10) series to TensorBoard for visual comparison.

    Each series ``n`` is logged under the tag ``series/num{n}`` in three
    separate runs so the curves overlay in TensorBoard: inputs in
    ``./runs/x5``, targets in ``./runs/y5`` and predictions in
    ``./runs/y_pred5``. Targets and predictions are shifted by ``x.shape[1]``
    steps so they continue where the input series ends.

    Args:
        x: Input series indexed as ``x[n][i][0]`` (series, step, feature 0).
        y: Optional target values indexed as ``y[n][i]``.
        y_pred: Optional predicted values indexed as ``y_pred[n][i]``.
    """
    scalar_num = 10
    _write_series('./runs/x5', x, scalar_num, 0, first_feature=True)

    future_offset = x.shape[1]
    if y is not None:
        _write_series('./runs/y5', y, scalar_num, future_offset)
    if y_pred is not None:
        _write_series('./runs/y_pred5', y_pred, scalar_num, future_offset)

class TimeSeriesDataset(Dataset):
    """Index-aligned wrapper around input series and optional targets.

    In training mode each item is an ``(X[idx], y[idx])`` pair; otherwise
    only ``X[idx]`` is returned.
    """

    def __init__(self, X, y=None, train=True):
        self.X, self.y, self.train = X, y, train

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sample = self.X[idx]
        if not self.train:
            return sample
        return sample, self.y[idx]

In [51]:
class DeepRNN(torch.nn.Module):
    """Stacked GRU followed by a linear head applied at every time step.

    Args:
        n_out: Number of values emitted per time step.
        dropout: Dropout probability forwarded to ``torch.nn.GRU``.
        hidden_size: Width of each GRU layer.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, n_out=5, dropout=0, hidden_size=200, num_layers=2):
        super().__init__()
        self.rnn = torch.nn.GRU(
            input_size=1,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc = torch.nn.Linear(hidden_size, n_out)

    def forward(self, x):
        """Return per-step outputs of shape ``(batch, steps, n_out)``."""
        hidden_seq, _ = self.rnn(x)
        batch = hidden_seq.size(0)
        # Fold batch and time together so the head sees one row per step.
        flat = hidden_seq.reshape(-1, hidden_seq.size(-1))
        projected = self.fc(flat)
        # Unfold back to (batch, steps, n_out).
        return projected.reshape(batch, -1, projected.size(-1))

In [66]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


def fit(model, dataloader, epochs=10):
    """Train ``model`` in place with Adam (lr=1e-3) and an MSE objective.

    Args:
        model: Module to optimise; it is moved to ``device``.
        dataloader: Mapping whose ``'train'`` entry yields ``(X, y)`` batches.
        epochs: Number of passes over the training batches.

    Every tenth epoch the mean squared error of the last time step, averaged
    over that epoch's batches, is printed.
    """
    model.to(device)
    loss_fn = torch.nn.MSELoss()
    adam = torch.optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(epochs):
        model.train()
        last_step_mse = []
        for inputs, targets in dataloader['train']:
            inputs = inputs.to(device)
            targets = targets.to(device)
            outputs = model(inputs)
            batch_loss = loss_fn(outputs, targets)

            adam.zero_grad()
            batch_loss.backward()
            # Clip the global gradient norm before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            adam.step()

            # The reported metric tracks only the final time step, while the
            # optimised loss above covers every step.
            residual = targets[:, -1] - outputs[:, -1]
            last_step_mse.append(residual.pow(2).mean().item())

        if epoch % 10 == 9:
            print(f"eopch={epoch},loss={np.mean(last_step_mse)}")

def predict(model, dataloader):
    """Run ``model`` over every batch and return the concatenated outputs.

    The model is switched to evaluation mode and gradients are disabled.
    Inputs are moved to the device the model's parameters live on, so the
    call works whether or not the model has been moved beforehand.

    Args:
        model: A ``torch.nn.Module`` to evaluate.
        dataloader: Iterable yielding input batches (inputs only, no targets).

    Returns:
        The outputs of all batches concatenated along dimension 0, on the
        model's device. An empty 1-D tensor is returned when ``dataloader``
        yields no batches.
    """
    model.eval()
    # Follow the model's own device instead of assuming where it was placed.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device("cpu")

    outputs = []
    with torch.no_grad():
        for X in dataloader:
            outputs.append(model(X.to(target)))

    if not outputs:
        return torch.tensor([]).to(target)
    # Concatenate once instead of re-copying the growing result per batch.
    return torch.cat(outputs)

In [10]:
# Load the per-station maximum-temperature series: (stations, days, 1).
weather_series = read_data()
batch_size, s_steps = weather_series.shape[0], weather_series.shape[1]
train_size = math.ceil(batch_size * 0.8)

# Split by station (first 80% train, rest test). Inputs drop the final 5 days
# so that every input step still has 5 following days to use as targets.
X_train, X_test = (
    weather_series[:train_size, :s_steps - 5],
    weather_series[train_size:, :s_steps - 5],
)

# Y[b, t, k] is the temperature (k + 1) days after input step t.
Y = np.stack(
    [weather_series[:, ahead:ahead + s_steps - 5, 0].numpy() for ahead in range(1, 5 + 1)],
    axis=-1,
)
Y_train, Y_test = Y[:train_size], Y[train_size:]

# The test dataset is built with train=False, so it yields inputs only.
dataset = dict(
    train=TimeSeriesDataset(X_train, Y_train),
    test=TimeSeriesDataset(X_test, Y_test, train=False),
)

dataloader = dict(
    train=DataLoader(dataset['train'], shuffle=True, batch_size=64),
    test=DataLoader(dataset['test'], shuffle=False, batch_size=64),
)

  df = pd.read_csv('/kaggle/input/weatherww2/Summary of Weather.csv')


In [71]:
# Train a two-layer GRU with 500 hidden units per layer.
rnn = DeepRNN(hidden_size=500, num_layers=2)
fit(rnn, dataloader, 2000)

# Predict on the held-out stations; the last time step of each series carries
# the forecast for the 5 days that follow the input window.
y_pred = predict(rnn, dataloader['test'])
last_step_true = Y_test[:, -1]
last_step_pred = y_pred[:, -1].cpu()
plot_series(X_test, last_step_true, last_step_pred)
mean_squared_error(last_step_true, last_step_pred)

eopch=9,loss=259.0362091064453
eopch=19,loss=71.84402084350586
eopch=29,loss=13.31692123413086
eopch=39,loss=9.654211044311523
eopch=49,loss=9.51150894165039
eopch=59,loss=7.477546453475952
eopch=69,loss=7.185171604156494
eopch=79,loss=7.370149374008179
eopch=89,loss=6.55795693397522
eopch=99,loss=5.652295112609863
eopch=109,loss=5.718421697616577
eopch=119,loss=9.011672973632812
eopch=129,loss=5.838027238845825
eopch=139,loss=6.916633367538452
eopch=149,loss=5.3243255615234375
eopch=159,loss=6.562546014785767
eopch=169,loss=7.256799221038818
eopch=179,loss=5.8081581592559814
eopch=189,loss=6.242969036102295
eopch=199,loss=5.355278253555298
eopch=209,loss=5.626178741455078
eopch=219,loss=6.477880477905273
eopch=229,loss=5.426557540893555
eopch=239,loss=5.865006685256958
eopch=249,loss=5.564425468444824
eopch=259,loss=5.948466539382935
eopch=269,loss=5.574783802032471
eopch=279,loss=6.010721683502197
eopch=289,loss=4.991860389709473
eopch=299,loss=5.122057914733887
eopch=309,loss=4.6364

3.5016303