In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from scipy.stats import randint, uniform
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error, r2_score
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
import torch.nn.functional as F

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are all projected from the same input. The
    feature axis is cut into ``num_heads`` equal slices that attend
    independently; the per-head results are then merged and passed through
    a final linear projection.

    Args:
        feature_dim: Size of the last axis of the input (and of the output).
        num_heads: Number of attention heads; must divide ``feature_dim``.
    """

    def __init__(self, feature_dim: int, num_heads: int = 8) -> None:
        super().__init__()
        assert feature_dim % num_heads == 0, "feature_dim必须能被num_heads整除"
        self.num_heads = num_heads
        self.head_dim = feature_dim // num_heads

        # Query / key / value projections, followed by the output projection.
        self.Wq = nn.Linear(feature_dim, feature_dim)
        self.Wk = nn.Linear(feature_dim, feature_dim)
        self.Wv = nn.Linear(feature_dim, feature_dim)
        self.fc_out = nn.Linear(feature_dim, feature_dim)

        # 1 / sqrt(head_dim): scaling applied to the raw dot-product scores.
        self.scale = self.head_dim ** -0.5

    def _split_heads(self, projected: torch.Tensor, batch: int, steps: int) -> torch.Tensor:
        """Turn (batch, steps, feature_dim) into (batch, num_heads, steps, head_dim)."""
        per_head = projected.view(batch, steps, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply self-attention to ``x`` of shape (batch, seq_len, feature_dim).

        Returns:
            A tensor with the same shape as ``x``.
        """
        batch, steps, _ = x.shape

        queries = self._split_heads(self.Wq(x), batch, steps)
        keys = self._split_heads(self.Wk(x), batch, steps)
        values = self._split_heads(self.Wv(x), batch, steps)

        # Scaled similarity between every pair of positions, normalised over
        # the key axis so each query's weights sum to one.
        similarity = (queries @ keys.transpose(-2, -1)) * self.scale
        weights = similarity.softmax(dim=-1)

        # Weighted sum of values, then fold the heads back into the feature axis.
        per_head_context = weights @ values
        merged = per_head_context.transpose(1, 2).reshape(batch, steps, -1)

        return self.fc_out(merged)

class EnhancedSequenceAttention(nn.Module):
    """Self-attention block that pools a sequence into one feature vector.

    Pipeline: multi-head self-attention -> dropout -> residual add ->
    LayerNorm -> mean over the sequence axis -> linear projection -> ReLU.

    Args:
        feature_dim: Size of the last axis of the input sequence.
        num_heads: Number of attention heads passed to ``MultiHeadAttention``.
        output_dim: Size of the pooled feature vector that is returned.
    """

    def __init__(self, feature_dim: int, num_heads: int = 8, output_dim: int = 64) -> None:
        super().__init__()
        self.multihead_attn = MultiHeadAttention(feature_dim, num_heads)
        self.layer_norm = nn.LayerNorm(feature_dim)
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.reduce_dim = nn.Linear(feature_dim, output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map ``x`` of shape (batch, seq_len, feature_dim) to (batch, output_dim)."""
        # Residual connection around the (dropout-regularised) attention output.
        attended = self.dropout(self.multihead_attn(x))
        normalized = self.layer_norm(x + attended)

        # Global average pooling over the sequence axis: (batch, feature_dim).
        pooled = normalized.mean(dim=1)

        # Project down to output_dim and apply the non-linearity.
        return self.relu(self.reduce_dim(pooled))
class ImprovedBidirectionalGRU(nn.Module):
    """Stacked bidirectional GRU regressor with attention pooling and external features.

    The time-series input runs through three bidirectional GRU layers, each
    followed by LayerNorm and ReLU. ``EnhancedSequenceAttention`` then pools
    the sequence into a single vector, which is concatenated with the raw
    external features and fed to a small fully connected head.

    Args:
        input_dim: Feature size of every time step in ``ts_input``.
        hidden_dims: Indexable with at least three hidden sizes, one per GRU
            layer; only the first three entries are used.
        output_dim: Size of the model output.
        external_dim: Number of external features in ``ext_input``.
        num_heads: Attention heads used by the pooling block. It must divide
            ``hidden_dims[2] * 2``. Defaults to 8, the value that was
            previously implied.
        attn_output_dim: Size of the pooled time-series feature. Defaults to
            64, the value that was previously hard-coded.
    """

    def __init__(self, input_dim, hidden_dims, output_dim, external_dim=2,
                 *, num_heads=8, attn_output_dim=64):
        super().__init__()

        # ------------------- time-series branch -------------------
        # Each GRU is bidirectional, so it emits 2 * hidden features per step.
        self.gru1 = nn.GRU(input_dim, hidden_dims[0], batch_first=True, bidirectional=True)
        self.ln1 = nn.LayerNorm(hidden_dims[0] * 2)

        self.gru2 = nn.GRU(hidden_dims[0] * 2, hidden_dims[1], batch_first=True, bidirectional=True)
        self.ln2 = nn.LayerNorm(hidden_dims[1] * 2)

        self.gru3 = nn.GRU(hidden_dims[1] * 2, hidden_dims[2], batch_first=True, bidirectional=True)
        self.ln3 = nn.LayerNorm(hidden_dims[2] * 2)

        # Pass the pooled size explicitly so fc1 below cannot drift out of
        # sync with the attention block's default.
        self.attention = EnhancedSequenceAttention(
            hidden_dims[2] * 2,
            num_heads=num_heads,
            output_dim=attn_output_dim,
        )

        # ------------------- joint fully connected head -------------------
        # External features are concatenated as-is (no dedicated layer), so
        # the head input is the pooled feature plus external_dim raw values.
        self.fc1 = nn.Linear(attn_output_dim + external_dim, 32)
        self.ln_fc1 = nn.LayerNorm(32)
        self.output_fc = nn.Linear(32, output_dim)
        self.relu = nn.ReLU()

    def forward(self, ts_input, ext_input):
        """Run the model.

        Args:
            ts_input: Time-series tensor of shape (batch, seq_len, input_dim).
            ext_input: External features of shape (batch, external_dim).
                They are used without further scaling, so the caller is
                expected to have normalised them already.

        Returns:
            Tensor of shape (batch, output_dim).
        """
        # Three identical stages: GRU -> LayerNorm -> ReLU.
        stages = (
            (self.gru1, self.ln1),
            (self.gru2, self.ln2),
            (self.gru3, self.ln3),
        )
        out = ts_input
        for gru, norm in stages:
            out, _ = gru(out)
            out = self.relu(norm(out))

        # Attention pooling collapses the sequence axis: (batch, attn_output_dim).
        ts_feature = self.attention(out)

        # Append the raw external features: (batch, attn_output_dim + external_dim).
        combined = torch.cat([ts_feature, ext_input], dim=1)

        # Fully connected head.
        hidden = self.relu(self.ln_fc1(self.fc1(combined)))
        return self.output_fc(hidden)