In [17]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import tensorflow as tf
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit

import warnings
import logging
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)  # 设置 matplotlib 日志级别为 WARNING
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib.font_manager")

matplotlib.rcParams['font.family']= ['Songti SC']  # 使用黑体-简
matplotlib.rcParams['axes.unicode_minus'] = False # 解决负号显示问题

tf.config.threading.set_inter_op_parallelism_threads(16)
tf.config.threading.set_intra_op_parallelism_threads(16)

def create_sequneces(X, y, time_steps=24):
    Xs, ys = [], []
    for i in range(len(X)-time_steps):
        Xs.append(X[i:i+time_steps])
        ys.append(y[i+time_steps])
    return np.array(Xs), np.array(ys)

"""数据增强"""
def moving_average_smoothing(series, window_size=3):
    if series.ndim == 1:  # 处理一维情况
        series = series.reshape(-1, 1)
    smoothed_data = np.empty_like(series)  # 创建一个与原始数据形状相同的空数组
    for col in range(series.shape[1]):
        # 对每一列(每个特征)进行平滑处理
        smoothed_data[:,col] = np.convolve(series[:,col], np.ones(window_size)/window_size, mode='same')
    return smoothed_data

def random_noise(data, noise_factor=0.01):
    """随机噪声"""
    noise = noise_factor + np.random.randn(*data.shape)
    return data + noise

def time_series_shift(series, shift_range=5):
    """时间序列平移"""
    shift = np.random.randint(-shift_range, shift_range + 1)
    return np.roll(series, shift, axis=0)

def data_augmentation(X, y, num_augmentations=5):
    augmented_X, augmented_y = [], []
    for i in range(len(X)):
        # 移动平均平滑
        augmented_X.append(X[i])
        augmented_y.append(y[i])
        for _ in range(num_augmentations):
            # 增强方法1 平滑处理
            X_smooth = moving_average_smoothing(X[i])
            # 增加方法2 添加噪声
            X_noise = random_noise(X_smooth)
            # 增加方法3 时间偏移
            X_shift = time_series_shift(X_noise)
            augmented_X.append(X_shift)
            augmented_y.append(y[i])
    return np.array(augmented_X), np.array(augmented_y)

In [18]:
df = pd.read_csv('../data/train.csv', parse_dates=True)
df.head()

Unnamed: 0,date,open,close,low,high,volume,count,amount,EMA20,EMA100,...,close_volatility,Volatility_10,close_Volume_volatility,Volume_volatility_10,UpperBB,MiddleBB,LowerBB,return_lag1,return_lag3,return_lag5
0,2017-10-28 13:30:00,292.37,292.74,292.37,292.75,160.682134,20.0,0.5489,294.090399,294.8432,...,0.001299,0.016358,0.208714,20.221269,298.296695,294.6315,290.966305,0.0013,0.001437,-0.00756
1,2017-10-28 14:00:00,292.73,291.96,291.86,292.75,176.74089,22.0,0.6049,293.887504,294.786107,...,-0.002668,0.013005,-0.471551,2.280677,298.160943,294.411,290.661057,-0.002664,-0.00263,-0.002562
2,2017-10-28 14:30:00,291.97,292.91,291.2,294.59,234.267629,22.0,0.8014,293.794408,294.748956,...,0.003249,0.013102,0.761039,2.320067,297.903752,294.2305,290.557248,0.003254,0.001881,0.002018
3,2017-10-28 15:00:00,292.94,292.94,291.29,294.58,133.953471,21.0,0.4573,293.713036,294.713135,...,0.000102,0.012696,0.013719,2.177855,297.630285,294.0575,290.484715,0.000102,0.000683,0.000717
4,2017-10-28 15:30:00,292.89,292.87,291.2,292.89,271.680139,23.0,0.9288,293.632747,294.676638,...,-0.000239,0.009353,-0.064928,1.628303,297.295646,293.875,290.454354,-0.000239,0.003117,0.001744


In [19]:
target ='close'
features = df.drop(columns=['date', target]).values
X = features.copy()
y = df[target].copy().values.reshape(-1, 1)
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_train_scaled = scaler_X.fit_transform(X)
y_train_scaled = scaler_y.fit_transform(y)

In [None]:
print("划分数据集...")


划分数据集...


: 

In [None]:
"""
将时间序列数据划分为多个窗口，每个窗口包含过去window_size个时间步的数据和下一个时间步的标签。
"""
metrics = ['MSE', 'RMSE', 'MAE', 'R2']
window_sizes = [10, 30, 60, 90, 120]
lstm_params={"layers":2, 'units': 64,  'dropout_rate': 0.3, 'learning_rate': 0.0001, 'clipvalue': 0.5, 'epochs': 20,'batch_size': 64, 'input_window': 120}
layers = lstm_params.get('layers')
model_name=f"Cross-LSTM-w{lstm_params.get('input_window')}-layers{layers}-u{lstm_params.get('units')}-d{lstm_params.get('dropout_rate')}-l{lstm_params.get('learning_rate')}-c{lstm_params.get('clipvalue')}-e{lstm_params.get('epochs')}-b{lstm_params.get('batch_size')}"

print("划分数据集...")
X_train_full, y_train_full = create_sequneces(X_train_scaled, y_train_scaled, lstm_params.get('input_window'))
X_train, X_test, y_train, y_test = train_test_split(X_train_full, y_train_full, test_size=0.3, random_state=42, shuffle=False)
print("数据增强...")
X_train_full_augmented, y_train_full_augmented = data_augmentation(X_train, y_train)



print(f"Origin traning data shape: {X_train.shape}")
print(f"Augmented traning data shape: {X_train_full_augmented.shape}")
print("交叉验证...")
tscv = TimeSeriesSplit(n_splits=5, test_size=30)
result = {metric: [] for metric in metrics}
for fold,(train_index, val_index) in enumerate(tscv.split(X_train_full_augmented)):
    print(f"Fold {fold+1}")
    X_train_fold, X_val_fold = X_train_full_augmented[train_index], X_train_full_augmented[val_index]
    y_train_fold, y_val_fold = y_train_full_augmented[train_index], y_train_full_augmented[val_index]

    print("构建LSTM模型...")
    print(f"输入数据形状检查：{X_train_fold.shape}")  # 应为 (样本数, 时间步, 特征数)
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.LSTM(units=lstm_params.get('units'), 
                                input_shape=( X_train_fold.shape[1], X_train_fold.shape[2]), return_sequences=layers > 1, 
                                kernel_regularizer=tf.keras.regularizers.l2(0.01)))
    model.add(tf.keras.layers.Dropout(lstm_params.get('dropout_rate')))
    for i in range(1, lstm_params.get('layers')):
        print(f"增加一层layer = {i+1}, return_sequences={layers -1 > i}")
        model.add(tf.keras.layers.LSTM(units=lstm_params.get('units'), return_sequences=(layers -1 > i)))
        model.add(tf.keras.layers.Dropout(lstm_params.get('dropout_rate')))
    model.add(tf.keras.layers.Dense(units=1))

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate= lstm_params.get('learning_rate'),
                                                    clipvalue=lstm_params.get('clipvalue')), loss='mean_squared_error', metrics=['mae'])
    model.summary()

    print("训练模型...")
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    history = model.fit(X_train, y_train, epochs=lstm_params.get('epochs'), 
                        batch_size=lstm_params.get('batch_size'), 
                        validation_data=(X_val_fold, y_val_fold), verbose=1, callbacks=[early_stopping])
    
    print("预测模型...")
    predictions = model.predict(X_val_fold)
    predicted_prices =  scaler_y.inverse_transform(predictions)
    real_prices = scaler_y.inverse_transform(y_train_scaled.reshape(-1, 1))
    """模型验证和评估"""
    print("评估模型...")
    mse = mean_squared_error(real_prices, predicted_prices)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(real_prices, predicted_prices)
    r2 = r2_score(real_prices, predicted_prices)
    result['MSE'].append(mse)
    result['RMSE'].append(rmse)
    result['MAE'].append(mae)
    result['R2'].append(r2)
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"R-squared (R2): {r2}")

    # 打印结果
    fig, (ax) = plt.subplots(1,1,figsize=(12, 6))
    plt.plot(predicted_prices, label='predicted_prices')
    plt.plot(real_prices, label='real_prices')
    plt.title('Model Accuracy History')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()
    ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d %H:%M"))
    plt.grid(True)
    fig.savefig(f'results/model_predicted_prices-{model_name}.png')
    plt.show()

    fig2, (ax2) = plt.subplots(1,1,figsize=(12, 6))
    plt.plot(predicted_prices[-100:], label='predicted_prices')
    plt.plot(real_prices[-100:], label='real_prices')
    ax2.xaxis.set_major_locator(mdates.AutoDateLocator())
    ax2.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d %H:%M"))
    plt.title('Model Accuracy History')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()
    plt.grid(True)
    plt.show()


    """打印验证损失曲线"""
    fig2 = plt.figure(figsize=(16, 6))
    plt.plot(history.history['loss'], label='Training Loss')
    # 绘制验证损失曲线（如果有验证集）
    plt.plot(history.history['val_loss'], label='Validation Loss')
    # 添加统计信息框
    stats_text = f"""模型验证和评估：
    • Mean Squared Error (MSE):{mse}
    • Root Mean Squared Error (RMSE):{rmse}
    • Mean Absolute Error (MAE):{mae}
    • R-squared (R2): {r2}
    """

    plt.annotate(stats_text, 
                xy=(0.78, 0.85), 
                xycoords='axes fraction',
                bbox=dict(boxstyle="round", fc="white", ec="#999999", alpha=0.8))
    plt.title(f'Model Training History')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend()
    plt.grid(True)
    fig2.savefig(f'results/model_loss-{model_name}.png')
    plt.show()

print(f"交叉验证结果：{result}")

划分数据集...
数据增强...
