In [None]:
import logging
import os
import warnings

import matplotlib
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Only surface WARNING-and-above records from matplotlib's logger.
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)
warnings.filterwarnings(
    action="ignore",
    category=UserWarning,
    module="matplotlib.font_manager",
)

# Plot-wide matplotlib settings.
matplotlib.rcParams.update({
    # Use the 'Songti SC' font family — presumably so the Chinese text in the
    # plot annotations renders; confirm the font is installed on this machine.
    'font.family': ['Songti SC'],
    # Work around the minus-sign display problem.
    'axes.unicode_minus': False,
})

In [None]:
# Load the training data. `parse_dates=True` only asks pandas to parse the
# index, which leaves the 'date' column unparsed here, so the column is named
# explicitly.
df = pd.read_csv('../data/train.csv', parse_dates=['date'])
df.head()

In [None]:
target = 'close'

# Every column except the timestamp and the target is a model input.
features = df.drop(columns=['date', target]).values
X = features.copy()
y = df[target].copy().values.reshape(-1, 1)

# Fit the scalers on the leading (training) part of the series only. Fitting on
# every row would leak the min/max of the validation and test periods into
# training. The data is later split chronologically 70/15/15, and the first 70%
# of raw rows is a subset of the rows used by the training sequences.
TRAIN_FRACTION = 0.7
n_fit_rows = max(1, int(len(X) * TRAIN_FRACTION))
scaler_X = MinMaxScaler().fit(X[:n_fit_rows])
scaler_y = MinMaxScaler().fit(y[:n_fit_rows])

# NOTE: despite the names, these arrays hold *all* rows (scaled with the
# training-period statistics); they are split into train/val/test further down.
# Values outside [0, 1] are therefore possible for later rows.
X_train_scaled = scaler_X.transform(X)
y_train_scaled = scaler_y.transform(y)

In [None]:
# Build model inputs and labels with a sliding window.
def create_sequneces(X, y, time_steps=24):
    """Cut `X` into overlapping windows and pair each with the next label.

    Window `i` is `X[i:i + time_steps]`; its label is `y[i + time_steps]`,
    i.e. the row immediately after the window.

    Args:
        X: Sequence of feature rows, indexable/sliceable along its first axis.
        y: Sequence of labels with the same length as `X`.
        time_steps: Number of consecutive rows per window.

    Returns:
        A `(windows, labels)` pair of NumPy arrays with
        `len(X) - time_steps` entries each (both empty when that is <= 0).
    """
    window_starts = range(len(X) - time_steps)
    windows = [X[start:start + time_steps] for start in window_starts]
    labels = [y[start + time_steps] for start in window_starts]
    return np.array(windows), np.array(labels)

# Hyper-parameters of the LSTM experiment. `input_window` is the number of
# time steps fed to the network for each sample.
lstm_params = {
    'layers': 2,
    'units': 64,
    'dropout_rate': 0.3,
    'learning_rate': 0.0001,
    'clipvalue': 0.5,
    'epochs': 20,
    'batch_size': 64,
    'input_window': 120,
}
layers = lstm_params['layers']

# Encodes every hyper-parameter in the name so saved figures can be traced back
# to the run that produced them.
model_name = (
    "LSTM-w{input_window}-layers{layers}-u{units}-d{dropout_rate}"
    "-l{learning_rate}-c{clipvalue}-e{epochs}-b{batch_size}"
).format(**lstm_params)

# Pass the configured window explicitly: relying on the function's default of
# 24 steps silently disagreed with `input_window` and with `model_name`.
X_train, y_train = create_sequneces(
    X_train_scaled,
    y_train_scaled,
    time_steps=lstm_params['input_window'],
)

In [None]:
print("划分数据集...")
# Chronological 70 / 15 / 15 split: shuffle=False keeps the time order, so the
# hold-out part is the tail of the series and is then halved into validation
# and test.
# NOTE(review): this cell overwrites X_train / y_train with their own split, so
# re-running it without re-running the sequence-creation cell shrinks the
# training set again.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42, shuffle=False
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42, shuffle=False
)

In [None]:
print("构建LSTM模型...")
print(f"输入数据形状检查：{X_train.shape}")  # expected: (samples, time steps, features)

n_units = lstm_params.get('units')
dropout_rate = lstm_params.get('dropout_rate')
window_len, n_features = X_train.shape[1], X_train.shape[2]

model = tf.keras.models.Sequential()

# First LSTM layer: it only returns the full sequence when another LSTM layer
# is stacked on top of it.
model.add(
    tf.keras.layers.LSTM(
        units=n_units,
        input_shape=(window_len, n_features),
        return_sequences=layers > 1,
    )
)
model.add(tf.keras.layers.Dropout(dropout_rate))

# Remaining LSTM layers (numbered from 2). Every layer except the last one
# returns sequences so the next LSTM receives a 3-D input.
for depth in range(2, lstm_params.get('layers') + 1):
    feeds_another_lstm = layers > depth
    print(f"增加一层layer = {depth}, return_sequences={feeds_another_lstm}")
    model.add(tf.keras.layers.LSTM(units=n_units, return_sequences=feeds_another_lstm))
    model.add(tf.keras.layers.Dropout(dropout_rate))

# Single-unit regression head.
model.add(tf.keras.layers.Dense(units=1))

optimizer = tf.keras.optimizers.Adam(
    learning_rate=lstm_params.get('learning_rate'),
    clipvalue=lstm_params.get('clipvalue'),
)
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae'])
model.summary()

In [None]:
print("训练模型...")
# Stop once the validation loss has not improved for 5 epochs and roll the
# weights back to the best epoch seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

fit_options = dict(
    epochs=lstm_params.get('epochs'),
    batch_size=lstm_params.get('batch_size'),
    validation_data=(X_val, y_val),
    verbose=1,
    callbacks=[early_stopping],
)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
print("预测模型...")
# The model works in scaled space; map both its output and the test labels
# back through the target scaler so they can be compared as prices.
predictions = model.predict(X_test)
real_prices = scaler_y.inverse_transform(y_test.reshape(-1, 1))
predicted_prices = scaler_y.inverse_transform(predictions)

In [None]:
# Plot the LSTM predictions against the real prices of the test set.
def plot_price_comparison(dates, predicted, real, title, save_path=None):
    """Draw predicted and real prices over time on one figure.

    Args:
        dates: Datetime values for the x-axis, one per plotted sample.
        predicted: Predicted prices, same length as `dates`.
        real: Real prices, same length as `dates`.
        title: Figure title.
        save_path: Optional file path; when given, the figure is written there
            and the parent directory is created if it is missing.

    Returns:
        The `(figure, axes)` pair that was drawn.
    """
    figure, axes = plt.subplots(1, 1, figsize=(12, 6))
    axes.plot(dates, predicted, label='predicted_prices')
    axes.plot(dates, real, label='real_prices')
    axes.set_title(title)
    axes.set_ylabel('Price')
    axes.set_xlabel('Date')
    axes.legend()
    # The date locator/formatter only make sense because the x values are real
    # datetimes; with plain sample indices they would label ticks as 1970 dates.
    axes.xaxis.set_major_locator(mdates.AutoDateLocator())
    axes.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d %H:%M"))
    figure.autofmt_xdate()
    axes.grid(True)
    if save_path is not None:
        os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
        figure.savefig(save_path)
    plt.show()
    return figure, axes


# The split is chronological (shuffle=False) and the test part is its tail, so
# the test labels correspond to the last len(real_prices) rows of `df`.
# Assumes the rows of `df` are in time order — TODO confirm.
test_dates = pd.to_datetime(df['date']).iloc[len(df) - len(real_prices):].to_numpy()

fig, ax = plot_price_comparison(
    test_dates,
    predicted_prices,
    real_prices,
    'Predicted vs. Real Prices (test set)',
    save_path=f'results/model_predicted_prices-{model_name}.png',
)

# Zoom on the most recent 100 test samples.
fig2, ax2 = plot_price_comparison(
    test_dates[-100:],
    predicted_prices[-100:],
    real_prices[-100:],
    'Predicted vs. Real Prices (last 100 test samples)',
)


In [None]:
"""模型验证和评估"""
print("评估模型...")
# Error metrics of the LSTM on the test set, computed on de-scaled prices.
mse = mean_squared_error(real_prices, predicted_prices)
mae = mean_absolute_error(real_prices, predicted_prices)
r2 = r2_score(real_prices, predicted_prices)
rmse = np.sqrt(mse)

metric_report = (
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("Mean Absolute Error (MAE)", mae),
    ("R-squared (R2)", r2),
)
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value}")

In [None]:
"""打印验证损失曲线"""
# Training vs. validation loss per epoch, annotated with the test-set metrics.
fig2, ax_loss = plt.subplots(figsize=(16, 6))
ax_loss.plot(history.history['loss'], label='Training Loss')
# Validation loss is present because a validation set was passed to fit().
ax_loss.plot(history.history['val_loss'], label='Validation Loss')

# Statistics box.
stats_text = f"""模型验证和评估：
• Mean Squared Error (MSE):{mse}
• Root Mean Squared Error (RMSE):{rmse}
• Mean Absolute Error (MAE):{mae}
• R-squared (R2): {r2}
"""

ax_loss.annotate(
    stats_text,
    xy=(0.78, 0.85),
    xycoords='axes fraction',
    bbox=dict(boxstyle="round", fc="white", ec="#999999", alpha=0.8),
)
ax_loss.set_title('Model Training History')
ax_loss.set_ylabel('Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.legend()
ax_loss.grid(True)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('results', exist_ok=True)
fig2.savefig(f'results/model_loss-{model_name}.png')
plt.show()

In [None]:
# Feature reconstruction (adds a temporal position feature).
def create_ensemble_features(X, predictions):
    """Assemble the 2-D feature matrix for the correction model.

    Each output row is, in order: the sample's values from `X` flattened to one
    dimension, the matching value from `predictions`, and the sample's
    0-based position within this batch.

    Args:
        X: Array whose first axis indexes samples.
        predictions: One prediction per sample; reshaped to a single column.

    Returns:
        Array with `X.shape[0]` rows and `(flattened width of X) + 2` columns.
    """
    n_samples = X.shape[0]
    flattened_inputs = X.reshape(n_samples, -1)            # original features
    prediction_column = predictions.reshape(-1, 1)         # LSTM prediction
    position_column = np.arange(len(predictions)).reshape(-1, 1)  # position feature
    return np.hstack([flattened_inputs, prediction_column, position_column])

# Train the residual-correction model on top of the LSTM.
lstm_train_predictions = model.predict(X_train)
ensemble_train_features = create_ensemble_features(X_train, lstm_train_predictions)

# Shape checks. `y` was reshaped to a single column, so it is (n_rows, 1);
# the sequence labels in `y_train` are (n_samples, 1) as well.
print(f"原始y形状：{y.shape}")
print(f"序列y形状：{y_train.shape}")

# The correction model must learn what the LSTM got wrong, i.e. the residual
# (true - predicted) in scaled space. Training it on the target itself and then
# adding its output to the LSTM prediction would roughly double-count the
# signal.
train_residuals = y_train.ravel() - lstm_train_predictions.ravel()

print(f"ensemble特征形状：{ensemble_train_features.shape}")
print(f"残差模型y形状：{train_residuals.shape}")
residual_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=3,
    random_state=42,  # fixed seed so repeated runs give the same correction model
)
residual_model.fit(ensemble_train_features, train_residuals)

# Combined prediction.
test_features = create_ensemble_features(X_test, predictions)
# The last column is the position feature, which restarts at 0 for every call.
# Shift it so test samples continue the timeline after the training and
# validation samples instead of looking like the start of the training period.
test_features[:, -1] += len(X_train) + len(X_val)

# Apply the correction in scaled space, then map back to prices. Adding the
# scaled correction directly to de-scaled prices would mix two different units.
adjusted_scaled = predictions.ravel() + residual_model.predict(test_features)
adjusted_predictions = scaler_y.inverse_transform(adjusted_scaled.reshape(-1, 1)).ravel()

# Evaluation of the residual-adjusted predictions.
def evaluate_adjusted():
    """Report and plot the adjusted predictions against the plain LSTM.

    Reads the notebook-level `real_prices`, `predicted_prices`,
    `adjusted_predictions`, `mse`, `mae` and `model_name`. Prints the adjusted
    MSE/MAE with their relative improvement over the LSTM baseline, plots the
    last 100 test samples, and saves the figure under `results/`.
    """

    def _improvement_pct(baseline, adjusted):
        # Relative improvement in percent; undefined when the baseline is 0.
        if baseline == 0:
            return float('nan')
        return 100 * (baseline - adjusted) / baseline

    adj_mse = mean_squared_error(real_prices, adjusted_predictions)
    adj_rmse = np.sqrt(adj_mse)
    adj_mae = mean_absolute_error(real_prices, adjusted_predictions)
    adj_r2 = r2_score(real_prices, adjusted_predictions)
    print(f"修正后 MSE: {adj_mse:.4f} (改善 {_improvement_pct(mse, adj_mse):.1f}%)")
    print(f"修正后 MAE: {adj_mae:.4f} (改善 {_improvement_pct(mae, adj_mae):.1f}%)")

    fig4, ax4 = plt.subplots(figsize=(12, 6))
    ax4.plot(real_prices[-100:], label='True')
    ax4.plot(predicted_prices[-100:], label='LSTM', alpha=0.7)
    ax4.plot(adjusted_predictions[-100:], label='Adjusted', linestyle='--')
    ax4.set_title('Prediction Comparison')

    # Built line by line so the source indentation does not end up inside the
    # annotation box.
    stats_text = "\n".join([
        "模型验证和评估：",
        f"• Mean Squared Error (MSE):{adj_mse}",
        f"• Root Mean Squared Error (RMSE):{adj_rmse}",
        f"• Mean Absolute Error (MAE):{adj_mae}",
        f"• R-squared (R2): {adj_r2}",
    ]) + "\n"

    ax4.annotate(
        stats_text,
        xy=(0.78, 0.85),
        xycoords='axes fraction',
        bbox=dict(boxstyle="round", fc="white", ec="#999999", alpha=0.8),
    )
    ax4.legend()

    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs('results', exist_ok=True)
    fig4.savefig(f'results/evaluate_adjusted-{model_name}.png')
    plt.show()

evaluate_adjusted()