# 优化后的 LSTM-AdaBoost 时间序列预测模型

## 1. 模型介绍
本项目实现了一个基于**LSTM (长短期记忆网络)**和**AdaBoost (自适应增强)**的集成学习模型，用于时间序列预测。

- **LSTM**: 作为模型的“弱学习器”，擅长捕捉时间序列中的长期依赖关系。
- **AdaBoost**: 作为集成框架，通过串行训练多个LSTM，让后续模型更关注先前模型预测错误的样本，从而逐步提升整体模型的预测精度和鲁棒性。

## 2. 核心优化点
此版本相比原始代码，进行了以下关键优化：

1.  **修正数据划分**: 采用严格的**时间顺序**划分训练、验证和测试集，杜绝了未来数据泄露，保证了模型评估的有效性。
2.  **修正数据标准化**: 标准化器 (Scaler) **仅在训练集上拟合**，然后应用于验证集和测试集，避免了数据泄露。
3.  **增强模型封装**: 添加了统一的 `save` 和 `load` 方法，可以一键保存和加载模型的完整状态（包括所有弱学习器、标准化器和权重），极大地方便了模型的部署和复用。
4.  **提升代码可读性**: 优化了代码结构，并增加了详细的中文注释。

In [None]:
import json
import os

# Show only errors from TensorFlow's native logger ('2' hides INFO and WARNING).
# The variable is set before TensorFlow is imported so that messages emitted
# while the library loads are filtered as well.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import joblib  # used to persist the fitted scalers
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam

## 3. LSTMAdaBoost 模型类定义

In [None]:
class LSTMAdaBoost:
    """AdaBoost-style ensemble of stacked-LSTM regressors for time-series forecasting.

    Estimators are trained one after another. After every round the per-window
    sample weights are increased for the windows the newest estimator predicted
    badly, so the next estimator concentrates on them. The final prediction is
    the estimator-weight-averaged output of all estimators, mapped back to the
    original target scale.

    Features and target are standardized with scalers that are fitted on the
    training split only.
    """

    def __init__(self, n_estimators=10, seq_length=24, learning_rate=0.001, use_gpu=True,
                 early_stopping_patience=5, reduce_lr_patience=3):
        """Store the hyper-parameters and create the (still unfitted) scalers.

        Args:
            n_estimators: Number of LSTM estimators trained by ``fit``.
            seq_length: Number of consecutive time steps in one input window.
            learning_rate: Initial Adam learning rate of every estimator.
            use_gpu: When True, try to enable memory growth on visible GPUs.
            early_stopping_patience: Patience (epochs) of the EarlyStopping callback.
            reduce_lr_patience: Patience (epochs) of the ReduceLROnPlateau callback.
        """
        self.n_estimators = n_estimators
        self.seq_length = seq_length
        self.learning_rate = learning_rate
        self.estimators = []
        self.estimator_weights = []
        self.scaler_X = StandardScaler()  # feature scaler
        self.scaler_y = StandardScaler()  # target scaler
        self.early_stopping_patience = early_stopping_patience
        self.reduce_lr_patience = reduce_lr_patience
        self.is_fitted = False  # set to True once fit() has completed

        if use_gpu:
            self._configure_gpu()

    def _configure_gpu(self):
        """Enable memory growth on every visible GPU, or report that none was found."""
        gpus = tf.config.list_physical_devices('GPU')
        if gpus:
            try:
                for gpu in gpus:
                    tf.config.experimental.set_memory_growth(gpu, True)
                print(f"检测到 {len(gpus)} 个GPU，已配置内存增长模式。")
            except RuntimeError as e:
                # Reported instead of raised so that training can still proceed.
                print(f"GPU配置失败: {e}")
        else:
            print("未检测到GPU，将使用CPU进行训练。")

    def _create_sequences(self, X, y):
        """Slice ``X`` into sliding windows and pair each with the next target.

        Window ``i`` covers rows ``i .. i + seq_length - 1`` of ``X`` and is
        paired with ``y[i + seq_length]``.

        Returns:
            Tuple ``(windows, targets)`` of numpy arrays. Both are empty when
            ``X`` has no more than ``seq_length`` rows.
        """
        n_windows = len(X) - self.seq_length
        Xs = [X[i:i + self.seq_length] for i in range(n_windows)]
        ys = [y[i + self.seq_length] for i in range(n_windows)]
        return np.array(Xs), np.array(ys)

    def _build_lstm_model(self, input_shape):
        """Build and compile one weak learner: LSTM(32) -> LSTM(16) -> Dense(1).

        Args:
            input_shape: ``(seq_length, n_features)`` of a single window.
        """
        model = Sequential([
            LSTM(32, return_sequences=True, input_shape=input_shape),
            Dropout(0.3),
            LSTM(16),
            Dropout(0.3),
            Dense(1)
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate), metrics=['mae'])
        return model

    @staticmethod
    def _boost_update(sample_weights, error):
        """Run one boosting step on the sample weights.

        Args:
            sample_weights: Current per-window weights.
            error: Absolute prediction error per window (same length).

        Returns:
            Tuple ``(alpha, weighted_error, new_sample_weights)`` where ``alpha``
            is the estimator weight and ``new_sample_weights`` sums to 1.
        """
        weighted_error = np.sum(sample_weights * error)

        # Keep the logarithm below finite: cap the error just under 1 and
        # floor the denominator just above 0.
        if weighted_error >= 1.0:
            weighted_error = 1.0 - 1e-10
        alpha = 0.5 * np.log((1 - weighted_error) / max(weighted_error, 1e-10))

        # Errors are rescaled by the largest error before being used to
        # re-weight the windows.
        error_rate = error / (np.max(error) + 1e-10)
        new_weights = sample_weights * np.exp(alpha * error_rate)
        new_weights = new_weights / np.sum(new_weights)
        return float(alpha), float(weighted_error), new_weights

    @staticmethod
    def _combine_predictions(predictions, weights):
        """Blend per-estimator predictions into one prediction per window.

        Args:
            predictions: Array of shape ``(n_estimators, n_windows)``.
            weights: One weight per estimator.

        Returns:
            The weighted average when the weights sum to a positive value,
            otherwise the plain (unweighted) mean over estimators.
        """
        predictions = np.asarray(predictions, dtype=float)
        weights = np.asarray(weights, dtype=float)
        total_weight = weights.sum()
        if total_weight > 0:
            return weights @ predictions / total_weight
        # A non-positive total cannot normalize a weighted average, so fall
        # back to treating every estimator equally.
        return predictions.mean(axis=0)

    def fit(self, X_train, y_train, X_val, y_val, epochs=50, batch_size=32, verbose=1):
        """Train the ensemble on a chronologically split training/validation set.

        Args:
            X_train: 2-D training features, one row per time step.
            y_train: 1-D training targets aligned with ``X_train``.
            X_val: 2-D validation features.
            y_val: 1-D validation targets aligned with ``X_val``.
            epochs: Maximum number of epochs per estimator.
            batch_size: Mini-batch size per estimator.
            verbose: Verbosity forwarded to Keras ``fit``.

        Returns:
            self

        Raises:
            ValueError: If the training or validation split is too short to
                form a single window of ``seq_length`` steps.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        X_val = np.asarray(X_val)
        y_val = np.asarray(y_val)

        # Start from a clean slate so that calling fit() again retrains the
        # ensemble instead of appending to the previous one.
        self.estimators = []
        self.estimator_weights = []
        self.is_fitted = False

        # Fit the scalers on the training split only ...
        X_train_scaled = self.scaler_X.fit_transform(X_train)
        y_train_scaled = self.scaler_y.fit_transform(y_train.reshape(-1, 1)).flatten()

        # ... and reuse them unchanged for the validation split.
        X_val_scaled = self.scaler_X.transform(X_val)
        y_val_scaled = self.scaler_y.transform(y_val.reshape(-1, 1)).flatten()

        X_train_seq, y_train_seq = self._create_sequences(X_train_scaled, y_train_scaled)
        X_val_seq, y_val_seq = self._create_sequences(X_val_scaled, y_val_scaled)

        if X_train_seq.shape[0] == 0:
            raise ValueError("训练数据太少，无法创建任何时间序列样本。请增加训练数据量或减小seq_length。")
        if X_val_seq.shape[0] == 0:
            # The callbacks below monitor val_loss, so validation windows are required.
            raise ValueError("验证数据太少，无法创建任何时间序列样本。请增加验证数据量或减小seq_length。")

        # Every window starts with the same weight.
        sample_weights = np.ones(len(X_train_seq)) / len(X_train_seq)
        input_shape = (self.seq_length, X_train.shape[1])

        for i in range(self.n_estimators):
            print(f"\n--- 训练第 {i+1}/{self.n_estimators} 个LSTM模型 ---")
            model = self._build_lstm_model(input_shape)

            # Fresh callbacks per estimator: they keep state between epochs.
            callbacks = [
                EarlyStopping(monitor='val_loss', patience=self.early_stopping_patience,
                              restore_best_weights=True, verbose=1),
                ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=self.reduce_lr_patience,
                                  min_lr=1e-6, verbose=1)
            ]

            model.fit(X_train_seq, y_train_seq,
                      epochs=epochs,
                      batch_size=batch_size,
                      validation_data=(X_val_seq, y_val_seq),
                      verbose=verbose,
                      sample_weight=sample_weights,
                      callbacks=callbacks)

            y_pred_train = model.predict(X_train_seq).flatten()
            error = np.abs(y_pred_train - y_train_seq)
            alpha, weighted_error, sample_weights = self._boost_update(sample_weights, error)

            self.estimators.append(model)
            self.estimator_weights.append(alpha)
            print(f"模型 {i+1} 训练完成: 误差率={weighted_error:.4f}, 学习器权重={alpha:.4f}")

        self.is_fitted = True
        return self

    def predict(self, X):
        """Predict the target for every full window contained in ``X``.

        Args:
            X: 2-D features, one row per time step, in chronological order.

        Returns:
            1-D array with ``len(X) - seq_length`` predictions in the original
            target scale; element ``k`` corresponds to time step
            ``k + seq_length``. Empty when ``X`` is too short for one window.

        Raises:
            RuntimeError: If the model holds no trained estimators.
        """
        if not self.is_fitted or not self.estimators:
            raise RuntimeError("模型尚未训练，请先调用fit()方法。")

        X_scaled = self.scaler_X.transform(X)
        # The targets passed here are placeholders; only the windows are used.
        X_seq, _ = self._create_sequences(X_scaled, np.zeros(len(X_scaled)))

        if X_seq.shape[0] == 0:
            return np.array([])  # not enough rows to form a single window

        # zip() pairs each estimator with its own weight.
        pairs = list(zip(self.estimator_weights, self.estimators))
        weights = [weight for weight, _ in pairs]
        predictions = np.stack([model.predict(X_seq).flatten() for _, model in pairs])

        final_predictions_scaled = self._combine_predictions(predictions, weights)

        # Undo the target standardization.
        final_predictions = self.scaler_y.inverse_transform(
            final_predictions_scaled.reshape(-1, 1)).flatten()
        return final_predictions

    def save(self, directory_path):
        """Persist estimators, scalers and configuration into ``directory_path``.

        Writes ``estimator_<i>.h5`` per estimator, ``scaler_X.gz``,
        ``scaler_y.gz`` and ``config.json``. The directory is created if needed.
        """
        os.makedirs(directory_path, exist_ok=True)

        for i, model in enumerate(self.estimators):
            model.save(os.path.join(directory_path, f'estimator_{i}.h5'))

        joblib.dump(self.scaler_X, os.path.join(directory_path, 'scaler_X.gz'))
        joblib.dump(self.scaler_y, os.path.join(directory_path, 'scaler_y.gz'))

        config = {
            'n_estimators': self.n_estimators,
            'seq_length': self.seq_length,
            'learning_rate': self.learning_rate,
            'early_stopping_patience': self.early_stopping_patience,
            'reduce_lr_patience': self.reduce_lr_patience,
            # Plain floats so the weights are always JSON-serializable.
            'estimator_weights': [float(w) for w in self.estimator_weights],
            'is_fitted': self.is_fitted
        }
        with open(os.path.join(directory_path, 'config.json'), 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=4)

        print(f"模型已完整保存至: {directory_path}")

    @classmethod
    def load(cls, directory_path):
        """Rebuild a model from a directory written by ``save``.

        Args:
            directory_path: Directory containing ``config.json``, both scaler
                files and the ``estimator_<i>.h5`` files.

        Returns:
            A ``LSTMAdaBoost`` instance ready for ``predict``.
        """
        config_path = os.path.join(directory_path, 'config.json')
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)

        model_instance = cls(
            n_estimators=config['n_estimators'],
            seq_length=config['seq_length'],
            learning_rate=config['learning_rate'],
            # Only skips the memory-growth setup; loading must not try to
            # reconfigure devices that may already be initialized.
            use_gpu=False,
            # Older config files do not contain these keys; fall back to the
            # constructor defaults.
            early_stopping_patience=config.get('early_stopping_patience', 5),
            reduce_lr_patience=config.get('reduce_lr_patience', 3)
        )
        model_instance.estimator_weights = config['estimator_weights']
        model_instance.is_fitted = config['is_fitted']

        model_instance.scaler_X = joblib.load(os.path.join(directory_path, 'scaler_X.gz'))
        model_instance.scaler_y = joblib.load(os.path.join(directory_path, 'scaler_y.gz'))

        # save() writes one file per trained estimator and fit() records one
        # weight per estimator, so the weight count is the number of files.
        model_instance.estimators = []
        for i in range(len(model_instance.estimator_weights)):
            estimator_path = os.path.join(directory_path, f'estimator_{i}.h5')
            # Loaded estimators are only used by predict(), so the optimizer
            # and loss do not need to be restored.
            estimator = load_model(estimator_path, compile=False)
            model_instance.estimators.append(estimator)

        print(f"模型已从 {directory_path} 完整加载。")
        return model_instance

## 4. 辅助函数（数据加载、评估、可视化）

In [None]:
def load_and_split_data(file_path, train_ratio=0.7, val_ratio=0.15):
    """Load a CSV file and split it chronologically into train/val/test sets.

    The column ``value`` is the target; every other column is a feature. Rows
    are kept in file order: the first ``train_ratio`` share is the training
    set, the next ``val_ratio`` share the validation set and the remainder the
    test set.

    Args:
        file_path: Path of the CSV file.
        train_ratio: Fraction of rows used for training.
        val_ratio: Fraction of rows used for validation.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)`` of numpy arrays.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more than 1.
    """
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1:
        raise ValueError("train_ratio 和 val_ratio 必须为非负数，且二者之和不能超过 1。")

    df = pd.read_csv(file_path)
    if df.isnull().any().any():
        print("检测到缺失值，使用前向填充(ffill)处理。")
        # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
        # Forward filling only copies earlier rows, so gaps at the very
        # beginning of the file stay missing.
        df = df.ffill()

    X = df.drop(['value'], axis=1).values
    y = df['value'].values

    # Split by position, never shuffled, so later rows cannot leak into
    # earlier splits.
    train_end = int(len(X) * train_ratio)
    val_end = train_end + int(len(X) * val_ratio)

    X_train, y_train = X[:train_end], y[:train_end]
    X_val, y_val = X[train_end:val_end], y[train_end:val_end]
    X_test, y_test = X[val_end:], y[val_end:]

    print("数据加载与划分完成:")
    print(f"  - 训练集: {len(X_train)} 条")
    print(f"  - 验证集: {len(X_val)} 条")
    print(f"  - 测试集: {len(X_test)} 条")

    return X_train, y_train, X_val, y_val, X_test, y_test

def evaluate_performance(y_true, y_pred):
    """Compute and print MSE, MAE, R² and MAPE for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned element-wise with ``y_true``.

    Returns:
        Dict with the keys ``'mse'``, ``'mae'``, ``'r2'`` and ``'mape'``
        (MAPE expressed in percent).
    """
    # Accept lists and other sequences, not just numpy arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Clamp the magnitude of the denominator instead of adding an offset:
    # an offset can cancel a small negative target and divide by zero.
    denominator = np.maximum(np.abs(y_true), 1e-8)
    mape = np.mean(np.abs(y_true - y_pred) / denominator) * 100

    print("\n--- 模型评估结果 ---")
    print(f"均方误差 (MSE): {mse:.4f}")
    print(f"平均绝对误差 (MAE): {mae:.4f}")
    print(f"R-squared (R²): {r2:.4f}")
    print(f"平均绝对百分比误差 (MAPE): {mape:.2f}%")

    return {'mse': mse, 'mae': mae, 'r2': r2, 'mape': mape}

def plot_predictions(y_true, y_pred, title="模型预测结果", save_path=None):
    """Draw true vs. predicted values: a full-range panel and a detail panel.

    The upper panel shows the complete series; the lower panel shows only the
    first 100 points (or fewer when the series is shorter).

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values.
        title: Title of the upper panel.
        save_path: When given, the figure is written to this path; otherwise
            it is shown interactively.
    """
    # Font settings needed to render the Chinese labels and the minus sign.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    fig, (overview_ax, detail_ax) = plt.subplots(2, 1, figsize=(15, 10))

    # Upper panel: the whole series.
    overview_ax.plot(y_true, label='真实值', color='blue', alpha=0.8)
    overview_ax.plot(y_pred, label='预测值', color='red', linestyle='--')
    overview_ax.set_title(title, fontsize=16)

    # Lower panel: zoom on the first points.
    sample_size = min(100, len(y_true))
    detail_ax.plot(y_true[:sample_size], 'o-', label='真实值', color='blue')
    detail_ax.plot(y_pred[:sample_size], 'x--', label='预测值', color='red')
    detail_ax.set_title(f'预测细节（前 {sample_size} 个数据点）', fontsize=14)

    # Decorations shared by both panels.
    for axis in (overview_ax, detail_ax):
        axis.set_xlabel('时间步')
        axis.set_ylabel('负荷值')
        axis.legend()
        axis.grid(True)

    fig.tight_layout()

    if not save_path:
        plt.show()
        return

    fig.savefig(save_path)
    print(f"图表已保存至: {save_path}")


## 5. 主执行流程
在这里我们将执行完整的模型训练、评估、保存、加载和预测流程。

In [None]:
# End-to-end demo: load data, train, evaluate, plot, then save and reload.

# --- 1. Configuration and data loading ---
file_path = '../../code/new_data.csv' # make sure this path points to the data file
X_train, y_train, X_val, y_val, X_test, y_test = load_and_split_data(file_path)

# --- 2. Model training ---
model = LSTMAdaBoost(
    n_estimators=5,       # [tip] start small (e.g. 3-5) to iterate quickly, increase once tuned
    seq_length=48,        # [tip] adjust to the data, e.g. 24 (one day) or 168 (one week)
    use_gpu=True,
    early_stopping_patience=10,
    reduce_lr_patience=5
)

model.fit(X_train, y_train, X_val, y_val, epochs=100, batch_size=64)

# --- 3. Evaluation on the test set ---
# predict() yields one value per full window, so the first seq_length targets
# have no prediction and are dropped to align y_test with the output.
y_pred_test = model.predict(X_test)
y_true_test_aligned = y_test[model.seq_length:]
evaluate_performance(y_true_test_aligned, y_pred_test)

# --- 4. Visualize the predictions ---
plot_predictions(y_true_test_aligned, y_pred_test, save_path='./optimized_prediction_plot.png')

# --- 5. Save / load round-trip demo ---
model_save_dir = './final_lstm_adaboost_model'
model.save(model_save_dir)

# Reload the model from disk
loaded_model = LSTMAdaBoost.load(model_save_dir)

# Predict with the reloaded model to verify that it works
y_pred_from_loaded_model = loaded_model.predict(X_test)

# Both prediction arrays should match (within floating-point tolerance)
if np.allclose(y_pred_test, y_pred_from_loaded_model):
    print("\n加载的模型预测结果与原模型一致，保存和加载功能验证成功！")
else:
    print("\n警告：加载的模型预测结果与原模型不一致！")