In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from statsmodels.tsa.statespace.sarimax import SARIMAX
import scipy.stats as stats

# 设置matplotlib使用支持中文的字体
plt.rcParams['font.sans-serif'] = ['SimHei']  # 使用黑体显示中文
plt.rcParams['axes.unicode_minus'] = False  # 解决负号显示问题

# 加载数据
data = pd.read_excel('Attachment 1.xlsx')
data.columns = ['MJD(days)', 'PT-TT(s)']

# 设置MJD(days)为索引
data.set_index('MJD(days)', inplace=True)

# 清理数据，将非数值转换为NaN并删除
data['PT-TT(s)'] = pd.to_numeric(data['PT-TT(s)'], errors='coerce')
data = data.dropna()

# 标准化数据
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data['PT-TT(s)'].values.reshape(-1, 1))

# 划分训练集和测试集
train_size = int(len(data_scaled) * 0.8)
train_data = data_scaled[:train_size]
test_data = data_scaled[train_size:]

# 对于SARIMA模型，获取未标准化的训练和测试数据
train_data_sarima = data.iloc[:train_size]
test_data_sarima = data.iloc[train_size:]


In [2]:
# 准备LSTM的输入数据
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:(i + seq_length), 0])
        y.append(data[i + seq_length, 0])
    return np.array(X), np.array(y)

seq_length = 100  # 序列长度

X, y = create_sequences(data_scaled, seq_length)

# 划分训练集和测试集
X_train, X_test = X[:train_size - seq_length], X[train_size - seq_length:]
y_train, y_test = y[:train_size - seq_length], y[train_size - seq_length:]

# 重塑数据以适应LSTM输入格式 [samples, time steps, features]
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

# 构建LSTM模型
model = Sequential([
    Bidirectional(LSTM(200, activation='relu', return_sequences=True), input_shape=(seq_length, 1)),
    Dropout(0.3),
    Bidirectional(LSTM(150, activation='relu', return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(100, activation='relu')),
    Dropout(0.3),
    Dense(50, activation='relu'),
    Dense(1)
])

# 编译模型
model.compile(optimizer=Adam(learning_rate=0.0005), loss='mse')

# 训练模型
history = model.fit(X_train, y_train, epochs=300, batch_size=64, validation_split=0.1, verbose=1)

# 在测试集上进行预测
y_pred_lstm = model.predict(X_test)

# 反向转换预测值和实际值
y_pred_lstm_inv = scaler.inverse_transform(y_pred_lstm)
y_test_inv = scaler.inverse_transform(y_test.reshape(-1, 1))


  super().__init__(**kwargs)


Epoch 1/300
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 622ms/step - loss: 0.2199 - val_loss: 7.8248e-04
Epoch 2/300
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 432ms/step - loss: 0.0450 - val_loss: 0.0044
Epoch 3/300
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 448ms/step - loss: 0.0158 - val_loss: 0.0070
Epoch 4/300
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 440ms/step - loss: 0.0120 - val_loss: 0.0054
Epoch 5/300
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 430ms/step - loss: 0.0089 - val_loss: 0.0018
Epoch 6/300
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 444ms/step - loss: 0.0073 - val_loss: 5.2833e-04
Epoch 7/300
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 440ms/step - loss: 0.0063 - val_loss: 4.7234e-04
Epoch 8/300
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 452ms/step - loss: 0.0055 - val_loss: 7.8700e-04
Epoch 9/300
[1m8/8[0m [32m━━

In [12]:
# 定义SARIMA模型参数
p, d, q = 1, 1, 1
P, D, Q, s = 1, 1, 1, 365  # 季节性周期设为365天

# 拟合SARIMA模型
model_sarima = SARIMAX(train_data_sarima['PT-TT(s)'], order=(p, d, q), seasonal_order=(P, D, Q, s))
model_sarima_fit = model_sarima.fit(disp=False)

# 在测试集上进行预测
forecast_sarima = model_sarima_fit.get_forecast(steps=len(test_data_sarima))
y_pred_sarima = forecast_sarima.predicted_mean.values
y_test_sarima = test_data_sarima['PT-TT(s)'].values


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


MemoryError: Unable to allocate 2.51 GiB for an array with shape (733, 733, 627) and data type float64

In [2]:
# LSTM模型的预测误差
error_lstm = y_test_inv.flatten() - y_pred_lstm_inv.flatten()

# SARIMA模型的预测误差
error_sarima = y_test_sarima - y_pred_sarima


NameError: name 'y_test_inv' is not defined

In [3]:
def dm_test(e1, e2, h=1, crit='MSE'):
    """
    执行Diebold-Mariano检验。

    参数:
    e1, e2: 两个模型的预测误差数组
    h: 预测步长（默认为1）
    crit: 损失函数标准（'MSE' 或 'MAD'）

    返回值:
    DM统计量和p值
    """
    e1, e2 = np.array(e1), np.array(e2)
    T = len(e1)
    d = e1 ** 2 - e2 ** 2 if crit == 'MSE' else np.abs(e1) - np.abs(e2)
    mean_d = np.mean(d)
    var_d = np.var(d, ddof=1)
    DM_stat = mean_d / np.sqrt(var_d / T)
    p_value = 2 * (1 - stats.norm.cdf(np.abs(DM_stat)))
    return DM_stat, p_value

# 执行DM检验
DM_stat, p_value = dm_test(error_lstm, error_sarima, crit='MSE')

print(f'DM统计量: {DM_stat:.4f}')
print(f'对应的p值: {p_value:.4f}')

if p_value < 0.05:
    print("LSTM模型的预测能力显著优于SARIMA模型")
else:
    print("无法证明LSTM模型的预测能力显著优于SARIMA模型")

def plot_forecast(y, yhat, yhat_lower, yhat_upper, title):
    plt.figure(figsize=(12, 6))
    
    # 绘制实际值
    plt.plot(y.flatten(), label='实际值', color='#2E86C1', linewidth=2)
    
    # 绘制预测值
    plt.plot(yhat.flatten(), color='#E74C3C', label='预测值', linewidth=2, linestyle='--')
    
    # 绘制置信区间
    plt.fill_between(range(len(yhat)), 
                    yhat_lower.flatten(), 
                    yhat_upper.flatten(), 
                    color='#E74C3C', 
                    alpha=0.2,
                    label='95%置信区间')
    
    plt.title(title, fontsize=14, pad=20)
    plt.xlabel('时间', fontsize=12)
    plt.ylabel('值', fontsize=12)
    plt.legend(fontsize=10, loc='best')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

# 绘制预测结果对比图
plot_forecast(y_test_inv, y_pred_lstm_inv, y_pred_lstm_inv*0.95, y_pred_lstm_inv*1.05, 'LSTM模型预测结果')
plot_forecast(y_test_sarima.reshape(-1,1), y_pred_sarima.reshape(-1,1), 
             y_pred_sarima.reshape(-1,1)*0.95, y_pred_sarima.reshape(-1,1)*1.05, 
             'SARIMA模型预测结果')

def plot_residuals(residuals, title):
    plt.figure(figsize=(12, 6))
    
    # 绘制残差散点图
    plt.scatter(range(len(residuals)), residuals, 
               color='#2ECC71', alpha=0.6, s=30)
    
    # 绘制零线
    plt.axhline(y=0, color='#E74C3C', linestyle='--')
    
    plt.title(title, fontsize=14, pad=20)
    plt.xlabel('观测点', fontsize=12)
    plt.ylabel('残差值', fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

# 绘制残差图
plot_residuals(error_lstm, 'LSTM模型残差分析')
plot_residuals(error_sarima, 'SARIMA模型残差分析')


NameError: name 'error_lstm' is not defined