In [1]:
import torch
import torch.nn as nn
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler


#### 数据预处理

In [2]:
df = pd.read_csv("./data/archive/Summary of Weather.csv")
df.head()

  df = pd.read_csv("./data/archive/Summary of Weather.csv")


Unnamed: 0,STA,Date,Precip,WindGustSpd,MaxTemp,MinTemp,MeanTemp,Snowfall,PoorWeather,YR,...,FB,FTI,ITH,PGT,TSHDSBRSGF,SD3,RHX,RHN,RVG,WTE
0,10001,1942-7-1,1.016,,25.555556,22.222222,23.888889,0.0,,42,...,,,,,,,,,,
1,10001,1942-7-2,0.0,,28.888889,21.666667,25.555556,0.0,,42,...,,,,,,,,,,
2,10001,1942-7-3,2.54,,26.111111,22.222222,24.444444,0.0,,42,...,,,,,,,,,,
3,10001,1942-7-4,2.54,,26.666667,22.222222,24.444444,0.0,,42,...,,,,,,,,,,
4,10001,1942-7-5,0.0,,26.666667,21.666667,24.444444,0.0,,42,...,,,,,,,,,,


In [3]:
# T值表示微小量，用数值0.05替代
df = df.replace(to_replace='T', value=0.05)

In [4]:
print(df.isnull().sum())

STA                 0
Date                0
Precip              0
WindGustSpd    118508
MaxTemp             0
MinTemp             0
MeanTemp            0
Snowfall         1163
PoorWeather     84803
YR                  0
MO                  0
DA                  0
PRCP             1932
DR             118507
SPD            118508
MAX               474
MIN               468
MEA               498
SNF              1163
SND            113477
FT             119040
FB             119040
FTI            119040
ITH            119040
PGT            118515
TSHDSBRSGF      84803
SD3            119040
RHX            119040
RHN            119040
RVG            119040
WTE            119040
dtype: int64


In [5]:
# 统计STA列不同值的数量
print(df['STA'].value_counts())


STA
22508    2192
10701    2185
22502    2154
22504    2118
10803    1750
         ... 
42206      50
33123      47
43307      46
34101      31
33121      13
Name: count, Length: 159, dtype: int64


In [6]:

# 取STA列为10701的所有行数据
df = df[df['STA'] == 22504]
# 删除STA列,对数据重新按时间排序
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')  
df = df.sort_values(by=['Date']).reset_index(drop=True)
# 重新索引
df = df.reset_index(drop=True)
# 删除STA列
df = df.drop(columns=['STA'])
df.head(20)

Unnamed: 0,Date,Precip,WindGustSpd,MaxTemp,MinTemp,MeanTemp,Snowfall,PoorWeather,YR,MO,...,FB,FTI,ITH,PGT,TSHDSBRSGF,SD3,RHX,RHN,RVG,WTE
0,1940-01-01,2.286,,26.666667,17.222222,22.222222,0,,40,1,...,,,,,,,,,,
1,1940-01-02,5.334,,25.555556,18.333333,22.222222,0,,40,1,...,,,,,,,,,,
2,1940-01-03,0.0,,26.111111,17.222222,21.666667,0,,40,1,...,,,,,,,,,,
3,1940-01-04,7.112,,26.666667,22.222222,24.444444,0,,40,1,...,,,,,,,,,,
4,1940-01-05,0.0,,25.555556,17.777778,21.666667,0,,40,1,...,,,,,,,,,,
5,1940-01-06,0.0,,26.666667,16.666667,21.666667,0,,40,1,...,,,,,,,,,,
6,1940-01-07,23.114,,27.222222,18.888889,23.333333,0,,40,1,...,,,,,,,,,,
7,1940-01-08,21.082,,24.444444,17.222222,21.111111,0,,40,1,...,,,,,,,,,,
8,1940-01-09,2.54,,25.0,16.666667,21.111111,0,,40,1,...,,,,,,,,,,
9,1940-01-10,0.0,,26.111111,16.666667,21.111111,0,,40,1,...,,,,,,,,,,


In [7]:
# 对Snowfall列nan值进行插值
df['Snowfall'] = pd.to_numeric(df['Snowfall'], errors='coerce')  # coerce 将非数值转为 NaN
df['Snowfall'] = df['Snowfall'].interpolate(method='linear', limit_direction='both')

In [8]:
# PoorWeather表示有雷电的天气；雨夹雪；冰雹；尘土或沙尘；烟雾或霾；吹雪；雨；雪；雨凇；雾；0 = 否，1 = 是

df['PoorWeather'] = pd.to_numeric(df['PoorWeather'], errors='coerce')
df['PoorWeather'] = df['PoorWeather'].fillna(0)  # 填充 NaN 值为 0

In [9]:
print(df.isna().sum())  # 检查缺失值  

Date              0
Precip            0
WindGustSpd    2118
MaxTemp           0
MinTemp           0
MeanTemp          0
Snowfall          0
PoorWeather       0
YR                0
MO                0
DA                0
PRCP             31
DR             2118
SPD            2118
MAX              17
MIN              17
MEA              17
SNF               1
SND            2118
FT             2118
FB             2118
FTI            2118
ITH            2118
PGT            2118
TSHDSBRSGF     1739
SD3            2118
RHX            2118
RHN            2118
RVG            2118
WTE            2118
dtype: int64


In [10]:
# 选择特征

features = ["MaxTemp", "MinTemp", "Precip", "Snowfall","PoorWeather"]
df = df[features]

# 将数据集中所有的列转换为数值类型  
df = df.apply(pd.to_numeric, errors='coerce')
df.head(10)


Unnamed: 0,MaxTemp,MinTemp,Precip,Snowfall,PoorWeather
0,26.666667,17.222222,2.286,0.0,0.0
1,25.555556,18.333333,5.334,0.0,0.0
2,26.111111,17.222222,0.0,0.0,0.0
3,26.666667,22.222222,7.112,0.0,0.0
4,25.555556,17.777778,0.0,0.0,0.0
5,26.666667,16.666667,0.0,0.0,0.0
6,27.222222,18.888889,23.114,0.0,0.0
7,24.444444,17.222222,21.082,0.0,0.0
8,25.0,16.666667,2.54,0.0,0.0
9,26.111111,16.666667,0.0,0.0,0.0


In [11]:
# 数据标准化
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df)


In [12]:
def create_sequences(data, seq_length,pred_length):
    xs, ys = [], []
    for i in range(len(data) - seq_length - pred_length):
        x = data[i:(i + seq_length)]
        y = data[(i + seq_length):(i + seq_length + pred_length), 0]  # 预测第一个特征（MaxTemp）
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)
    

In [13]:
seq_length = 10  # 时间步长
pred_length_1 = 1  # 预测 1 天
pred_length_5 = 5  # 预测 5 天

In [14]:
from sklearn.model_selection import train_test_split

X_1, y_1 = create_sequences(data_scaled, seq_length, pred_length_1)
X_5, y_5 = create_sequences(data_scaled, seq_length, pred_length_5)


X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, test_size=0.3)
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(X_5, y_5, test_size=0.3)

# 转换为 PyTorch 张量
X_train_1 = torch.tensor(X_train_1, dtype=torch.float32)
y_train_1 = torch.tensor(y_train_1, dtype=torch.float32)
X_test_1 = torch.tensor(X_test_1, dtype=torch.float32)
y_test_1 = torch.tensor(y_test_1, dtype=torch.float32)

X_train_5 = torch.tensor(X_train_5, dtype=torch.float32)
y_train_5 = torch.tensor(y_train_5, dtype=torch.float32)
X_test_5 = torch.tensor(X_test_5, dtype=torch.float32)
y_test_5 = torch.tensor(y_test_5, dtype=torch.float32)

train_loader_1 = DataLoader(list(zip(X_train_1, y_train_1)), batch_size=32, shuffle=True)
train_loader_5 = DataLoader(list(zip(X_train_5, y_train_5)), batch_size=32, shuffle=True)

test_loader_1 = DataLoader(list(zip(X_test_1, y_test_1)), batch_size=32, shuffle=False)
test_loader_5 = DataLoader(list(zip(X_test_5, y_test_5)), batch_size=32, shuffle=False)


In [15]:
# 获取原始温度数据的标准差
def get_original_temp_std():
    if 'scaler' in globals():
        temp_index = features.index("MaxTemp")  
        return scaler.scale_[temp_index]
    else:
        return df["MaxTemp"].std()

# 计算不同容忍度对应的实际温度误差
temp_std = get_original_temp_std()
tolerances = [0.1, 0.2, 0.5,0.7, 1, 1.5,2]  # 容忍度列表

print("容忍度与实际温度误差对应关系:")
print("-" * 40)
print(f"{'容忍度':<10}{'实际温度误差 (°C)':<20}")
print("-" * 40)
for tolerance in tolerances:
    actual_error = tolerance * temp_std
    print(f"{tolerance:<10.1f}±{actual_error:<18.2f}")

容忍度与实际温度误差对应关系:
----------------------------------------
容忍度       实际温度误差 (°C)         
----------------------------------------
0.1       ±0.45              
0.2       ±0.91              
0.5       ±2.27              
0.7       ±3.18              
1.0       ±4.54              
1.5       ±6.82              
2.0       ±9.09              


#### RNN模型定义与训练

In [16]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)


    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)  # 初始化隐藏状态
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out


In [17]:
# 定义模型参数
input_size = X_train_1.shape[2]  # 特征数量   
hidden_size = 50  # 隐藏层大小
num_layers = 3 # RNN 层数
output_size_1 = 1  
output_size_5 = 5  

In [18]:
writer = SummaryWriter(log_dir='./logs/rnn_weather')

In [19]:
def train_model(model, train_loader, criterion, optimizer, device, epoch):
    model.train()
    total_loss = 0
    for inputs, labels in train_loader: 
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch}] Training Loss: {avg_loss:.4f}')
    writer.add_scalar('Training Loss', avg_loss, epoch)



In [20]:
def test_model(model, test_loader, criterion, device, epoch, tolerance):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            # 相对误差小于tolerance视为正确
            relative_error = torch.abs((outputs - labels) / (labels + 1e-8))
            correct += (relative_error <= tolerance).sum().item()
            total += labels.numel()
    
    accuracy = 100 * correct / total
    
    print(f'Epoch [{epoch}] Testing Accuracy: {accuracy:.2f}% (tolerance={tolerance:.2f})')
    writer.add_scalar(f'Testing Accuracy)', accuracy, epoch)

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.MSELoss()

model_1 = RNN(input_size, hidden_size, num_layers, output_size_1).to(device)
model_5 = RNN(input_size, hidden_size, num_layers, output_size_5).to(device)

optimizer_1 = torch.optim.Adam(model_1.parameters(), lr=0.001, weight_decay=1e-5)
optimizer_5 = torch.optim.Adam(model_5.parameters(), lr=0.001, weight_decay=1e-5)

num_epochs = 30


In [22]:
for epoch in range(num_epochs):
    train_model(model_1, train_loader_1, criterion, optimizer_1, device, epoch)
    test_model(model_1, test_loader_1, criterion, device, epoch, tolerance=0.7)  

writer.close()

torch.save(model_1.state_dict(), './model/model_1.pth')


Epoch [0] Training Loss: 0.5822
Epoch [0] Testing Accuracy: 60.03% (tolerance=0.70)
Epoch [1] Training Loss: 0.4115
Epoch [1] Testing Accuracy: 59.08% (tolerance=0.70)
Epoch [2] Training Loss: 0.3284
Epoch [2] Testing Accuracy: 55.45% (tolerance=0.70)
Epoch [3] Training Loss: 0.2694
Epoch [3] Testing Accuracy: 65.88% (tolerance=0.70)
Epoch [4] Training Loss: 0.2187
Epoch [4] Testing Accuracy: 65.09% (tolerance=0.70)
Epoch [5] Training Loss: 0.1929
Epoch [5] Testing Accuracy: 58.45% (tolerance=0.70)
Epoch [6] Training Loss: 0.1619
Epoch [6] Testing Accuracy: 65.24% (tolerance=0.70)
Epoch [7] Training Loss: 0.1501
Epoch [7] Testing Accuracy: 67.77% (tolerance=0.70)
Epoch [8] Training Loss: 0.1334
Epoch [8] Testing Accuracy: 69.04% (tolerance=0.70)
Epoch [9] Training Loss: 0.1224
Epoch [9] Testing Accuracy: 67.61% (tolerance=0.70)
Epoch [10] Training Loss: 0.1191
Epoch [10] Testing Accuracy: 60.51% (tolerance=0.70)
Epoch [11] Training Loss: 0.1228
Epoch [11] Testing Accuracy: 62.24% (tole

In [23]:
for epoch in range(num_epochs):
    train_model(model_5, train_loader_5, criterion, optimizer_5, device, epoch)
    test_model(model_5, test_loader_5, criterion, device, epoch, tolerance=1.0)

writer.close()

torch.save(model_5.state_dict(), './model/model_5.pth')

Epoch [0] Training Loss: 1.0389
Epoch [0] Testing Accuracy: 73.31% (tolerance=1.00)
Epoch [1] Training Loss: 0.8685
Epoch [1] Testing Accuracy: 70.11% (tolerance=1.00)
Epoch [2] Training Loss: 0.7616
Epoch [2] Testing Accuracy: 71.19% (tolerance=1.00)
Epoch [3] Training Loss: 0.6953
Epoch [3] Testing Accuracy: 67.13% (tolerance=1.00)
Epoch [4] Training Loss: 0.6346
Epoch [4] Testing Accuracy: 67.61% (tolerance=1.00)
Epoch [5] Training Loss: 0.6007
Epoch [5] Testing Accuracy: 69.06% (tolerance=1.00)
Epoch [6] Training Loss: 0.5721
Epoch [6] Testing Accuracy: 69.76% (tolerance=1.00)
Epoch [7] Training Loss: 0.5533
Epoch [7] Testing Accuracy: 71.70% (tolerance=1.00)
Epoch [8] Training Loss: 0.5253
Epoch [8] Testing Accuracy: 68.65% (tolerance=1.00)
Epoch [9] Training Loss: 0.4949
Epoch [9] Testing Accuracy: 72.90% (tolerance=1.00)
Epoch [10] Training Loss: 0.4776
Epoch [10] Testing Accuracy: 72.36% (tolerance=1.00)
Epoch [11] Training Loss: 0.4703
Epoch [11] Testing Accuracy: 74.55% (tole

#### 模型预测

In [24]:
# 预测
def prepare_data(raw_data, seq_length=10):
    """改进的数据准备函数"""
    # 转换为DataFrame以保持特征名称
    if not isinstance(raw_data, pd.DataFrame):
        raw_data_df = pd.DataFrame(raw_data, columns=features)
    else:
        raw_data_df = raw_data
        
    # 数据标准化
    data_scaled = scaler.transform(raw_data_df)
    
    # 创建时序序列
    X = []
    for i in range(len(data_scaled) - seq_length + 1):
        X.append(data_scaled[i:i+seq_length])
    
    return torch.tensor(np.array(X), dtype=torch.float32)


def load_model(model_path, input_size, hidden_size, num_layers, output_size):
    """加载保存的模型"""
    model = RNN(input_size, hidden_size, num_layers, output_size)
    # 根据当前设备加载模型
    if device.type == 'cuda':
        model.load_state_dict(torch.load(model_path))
    else:
        model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    model = model.to(device)  
    model.eval()
    return model



In [25]:
def predict_next_day(input_data):
    """预测未来一天的最高温度"""
    # 准备数据
    inputs = prepare_data(input_data).to(device)
    
    # 加载模型
    model = load_model('./model/model_1.pth', input_size, hidden_size, num_layers, output_size_1)
    
    # 预测
    with torch.no_grad():
        outputs = model(inputs)
    
    # 反标准化
    temp_index = features.index("MaxTemp")  
    temp_scale = scaler.scale_[temp_index]  
    temp_mean = scaler.mean_[temp_index]  
    predictions = outputs.cpu().numpy() * temp_scale + temp_mean
    
    return predictions.flatten()

In [26]:
def predict_five_days(input_data):
    """预测未来五天的最高温度"""
    # 准备数据
    inputs = prepare_data(input_data).to(device)
    
    # 加载模型
    model = load_model('./model/model_5.pth', input_size, hidden_size, num_layers, output_size_5)
    
    # 预测
    with torch.no_grad():
        outputs = model(inputs)
    
    # 反标准化
    temp_index = features.index("MaxTemp")  
    temp_scale = scaler.scale_[temp_index]  
    temp_mean = scaler.mean_[temp_index]  
    predictions = outputs.cpu().numpy() * temp_scale + temp_mean
    
    return predictions

In [27]:
# 检查原始温度范围
print("原始最高温度范围:", df["MaxTemp"].min(), "到", df["MaxTemp"].max())

原始最高温度范围: -17.77777778 到 33.33333333


In [28]:
# 随机获取连续10天的气象数据作为输入
recent_data = df.sample(10).sort_index().reset_index(drop=True)
print("最近10天的气象数据:")
print(recent_data)

# 预测未来一天温度
next_day_temp = predict_next_day(recent_data)
print(f"明天预测最高温度: {next_day_temp[0]:.1f}°C")

# 预测未来五天温度
five_day_temps = predict_five_days(recent_data)
for i, temp in enumerate(five_day_temps[0]):
    print(f"未来第{i+1}天预测最高温度: {temp:.1f}°C")


最近10天的气象数据:
     MaxTemp    MinTemp  Precip  Snowfall  PoorWeather
0  30.555556  21.666667   0.000       0.0          0.0
1  31.111111  21.111111   0.000       0.0          0.0
2  29.444444  23.333333   0.000       0.0          0.0
3  27.777778  20.000000   0.050       0.0          0.0
4  32.222222  23.333333   0.050       0.0          0.0
5  25.000000  20.000000   1.778       0.0          0.0
6  30.000000  24.444444   0.050       0.0          0.0
7  27.777778  18.333333   0.254       0.0          1.0
8  31.111111  22.777778   0.000       0.0          0.0
9  31.111111  23.888889   0.000       0.0          0.0
明天预测最高温度: 30.3°C
未来第1天预测最高温度: 29.6°C
未来第2天预测最高温度: 29.4°C
未来第3天预测最高温度: 29.5°C
未来第4天预测最高温度: 29.3°C
未来第5天预测最高温度: 29.1°C
