In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import tensorflow as tf
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit

import warnings
import logging
import platform
# Keep matplotlib quiet: only WARNING and above from its logger, and hide the
# font_manager UserWarnings emitted when a requested font is looked up.
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib.font_manager")

# Font used for CJK text in figure labels, chosen per operating system.
# Anything that is neither macOS nor Windows falls back to the Linux font.
_CJK_FONT_BY_SYSTEM = {
    'Darwin': 'Songti SC',   # macOS
    'Windows': 'SimSun',
}
plt.rcParams.update({
    'font.family': [_CJK_FONT_BY_SYSTEM.get(platform.system(), 'Noto Sans CJK SC')],
    'axes.unicode_minus': False,  # render the minus sign correctly with the CJK font
})

# Thread pools TensorFlow uses between independent ops and inside a single op.
_TF_THREADS = 16
tf.config.threading.set_inter_op_parallelism_threads(_TF_THREADS)
tf.config.threading.set_intra_op_parallelism_threads(_TF_THREADS)

def create_sequneces(X, y, time_steps=24):
    """Cut a series into sliding windows paired with the next-step target.

    Window ``k`` is ``X[k:k + time_steps]`` and its target is
    ``y[k + time_steps]``, for every ``k`` in ``range(len(X) - time_steps)``.

    Args:
        X: Indexable sequence of feature rows.
        y: Indexable sequence of targets, aligned with ``X``.
        time_steps: Number of consecutive rows per window.

    Returns:
        Tuple ``(windows, targets)`` of NumPy arrays. Both are empty when
        ``len(X) <= time_steps``.
    """
    n_windows = len(X) - time_steps
    windows = [X[start:start + time_steps] for start in range(n_windows)]
    targets = [y[start + time_steps] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

"""数据增强"""
def moving_average_smoothing(series, window_size=5):
    """Smooth every column of ``series`` with a centred moving average.

    Each column is convolved with a uniform kernel of length ``window_size``
    using ``mode='same'``, so the output has the same number of rows as the
    input and the edges are averaged against implicit zeros.

    Args:
        series: 1-D or 2-D array-like. A 1-D input is treated as one column.
        window_size: Length of the averaging window. It should not exceed the
            number of rows, otherwise the convolution output no longer fits.

    Returns:
        2-D array of shape ``(n_rows, n_columns)`` (``(n_rows, 1)`` for 1-D
        input). Floating/complex inputs keep their dtype; any other dtype
        (e.g. integers) yields ``float64`` so the averages are not truncated.
    """
    series = np.asarray(series)
    if series.ndim == 1:  # treat a 1-D series as a single column
        series = series.reshape(-1, 1)

    # An integer buffer would silently floor every averaged value.
    if np.issubdtype(series.dtype, np.inexact):
        out_dtype = series.dtype
    else:
        out_dtype = np.float64

    kernel = np.ones(window_size) / window_size
    smoothed_data = np.empty(series.shape, dtype=out_dtype)
    for col in range(series.shape[1]):
        # Smooth each column (feature) independently.
        smoothed_data[:, col] = np.convolve(series[:, col], kernel, mode='same')
    return smoothed_data

def random_noise(data, noise_factor=0.01):
    """Return ``data`` perturbed by zero-mean Gaussian noise.

    The noise is drawn from a standard normal distribution and scaled by
    ``noise_factor``, so ``noise_factor`` is the standard deviation of the
    perturbation and ``noise_factor=0`` leaves the data unchanged.

    Args:
        data: NumPy array to perturb (not modified in place).
        noise_factor: Standard deviation of the added noise.

    Returns:
        New array with the same shape as ``data``.
    """
    # Scale the noise; adding noise_factor instead would inject unit-variance
    # noise with a constant offset and drown out normalised inputs.
    noise = noise_factor * np.random.randn(*data.shape)
    return data + noise

def time_series_shift(series, shift_range=5):
    """Circularly shift ``series`` along axis 0 by a random offset.

    The offset is drawn uniformly from the integers in
    ``[-shift_range, shift_range]``; rows pushed past one end wrap around to
    the other end.

    Args:
        series: Array to shift.
        shift_range: Largest absolute offset that can be drawn.

    Returns:
        Shifted copy of ``series``.
    """
    lowest, upper_exclusive = -shift_range, shift_range + 1
    offset = np.random.randint(lowest, upper_exclusive)
    return np.roll(series, offset, axis=0)

def data_augmentation(X, y, num_augmentations=5):
    """Expand a dataset with perturbed copies of every sample.

    For each sample the untouched ``(X[i], y[i])`` pair is emitted first,
    followed by ``num_augmentations`` synthetic pairs:

    * target: ``y[i]`` multiplied by a factor drawn uniformly from
      ``[0.95, 1.05)``, plus Gaussian noise with standard deviation 0.02;
    * features: ``X[i]`` passed through ``moving_average_smoothing``, then
      ``random_noise``, then ``time_series_shift`` (default arguments).

    Args:
        X: Indexable collection of feature samples.
        y: Indexable collection of targets aligned with ``X``.
        num_augmentations: Synthetic copies generated per original sample.

    Returns:
        Tuple ``(augmented_X, augmented_y)`` of NumPy arrays holding
        ``len(X) * (1 + num_augmentations)`` entries each, originals and
        their copies interleaved.
    """
    augmented_X, augmented_y = [], []
    for idx, sample in enumerate(X):
        # Always keep the original pair.
        augmented_X.append(sample)
        augmented_y.append(y[idx])

        for _ in range(num_augmentations):
            # Target: multiplicative jitter, then additive Gaussian noise.
            jittered_target = y[idx] * np.random.uniform(0.95, 1.05)
            jittered_target += np.random.normal(0, 0.02)
            augmented_y.append(jittered_target)

            # Features: smooth -> add noise -> random circular shift.
            perturbed = time_series_shift(
                random_noise(moving_average_smoothing(sample))
            )
            augmented_X.append(perturbed)
    return np.array(augmented_X), np.array(augmented_y)

In [None]:
# Load the training data. `parse_dates=True` only attempts to parse the index,
# so the 'date' column must be named explicitly to become datetime64; the
# plotting code later formats it with a date formatter.
df = pd.read_csv('../data/train.csv', parse_dates=['date'])
df.tail()

In [None]:
# Column to predict; every other column except 'date' is used as a feature.
target ='close'

# Target as an (n_rows, 1) column vector, features as an (n_rows, n_features) matrix.
y_origin = df[target].to_numpy(copy=True).reshape(-1, 1)
features = df.drop(columns=['date', target]).to_numpy()
X_origin = features.copy()

# Independent min-max scalers so predictions can be mapped back to prices
# through scaler_y.inverse_transform.
# NOTE(review): both scalers are fit on the full dataset, before any
# train/test split is made downstream — confirm this leakage is acceptable.
scaler_X, scaler_y = MinMaxScaler(), MinMaxScaler()
X_train_scaled = scaler_X.fit_transform(X_origin)
y_train_scaled = scaler_y.fit_transform(y_origin)

In [None]:
def build_model(lstm_params, X_train_fold):
    """Assemble and compile a stacked-LSTM regression network.

    Architecture, in order: ``layers`` LSTM layers, each followed by Dropout
    (only the first LSTM carries an L1/L2 kernel regulariser), then
    ``dense_layer`` ReLU Dense layers of width ``units``, then a single-unit
    Dense output. Compiled with Adam and mean-squared-error loss, tracking MAE.

    Args:
        lstm_params: Dict read for the keys 'layers', 'units', 'dropout_rate',
            'dense_layer', 'learning_rate' and 'clipvalue'.
        X_train_fold: Training array; its axes 1 and 2 define the input shape
            of the first LSTM layer.

    Returns:
        The compiled ``tf.keras`` Sequential model (its summary is printed).
    """
    print("构建LSTM模型...")
    keras_layers = tf.keras.layers
    n_lstm = lstm_params.get('layers')
    width = lstm_params.get('units')
    window_shape = (X_train_fold.shape[1], X_train_fold.shape[2])

    model = tf.keras.models.Sequential()

    # First LSTM: declares the input shape and is the only regularised layer.
    model.add(keras_layers.LSTM(
        units=width,
        input_shape=window_shape,
        return_sequences=n_lstm > 1,
        kernel_regularizer=tf.keras.regularizers.l1_l2(0.01, 0.01),
    ))
    model.add(keras_layers.Dropout(lstm_params.get('dropout_rate')))

    # Remaining LSTM layers; every one but the last keeps emitting sequences.
    for depth in range(2, n_lstm + 1):
        emits_sequence = n_lstm > depth
        print(f"增加一层layer = {depth}, return_sequences={emits_sequence}")
        model.add(keras_layers.LSTM(
            units=width,
            return_sequences=emits_sequence,
            activation='tanh',
            recurrent_activation='sigmoid',
        ))
        model.add(keras_layers.Dropout(lstm_params.get('dropout_rate')))

    # Fully connected head.
    for dense_no in range(1, lstm_params.get('dense_layer') + 1):
        print(f"增加一层Dense layer = {dense_no}")
        model.add(keras_layers.Dense(units=width, activation='relu'))
    model.add(keras_layers.Dense(units=1))

    optimizer = tf.keras.optimizers.Adam(
        learning_rate=lstm_params.get('learning_rate'),
        clipvalue=lstm_params.get('clipvalue'),
    )
    model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae'])
    model.summary()
    return model

In [None]:
"""
将时间序列数据划分为多个窗口，每个窗口包含过去window_size个时间步的数据和下一个时间步的标签。
"""
metrics = ['MSE', 'RMSE', 'MAE', 'R2']
 
def _plot_price_comparison(dates, real_prices, predicted_prices, stats_text=None, save_path=None):
    """Draw predicted vs. actual prices over ``dates``, then show and close the figure."""
    fig, ax = plt.subplots(1, 1, figsize=(12, 6))
    ax.plot(dates, predicted_prices, label='predicted_prices', linestyle='--')
    ax.plot(dates, real_prices, label='real_prices')
    ax.set_title('Predicted vs. Actual Price')
    ax.set_ylabel('Price')
    ax.set_xlabel('Date')
    ax.legend()
    if stats_text is not None:
        # Metrics summary box in the upper-right corner of the axes.
        ax.annotate(stats_text,
                    xy=(0.78, 0.85),
                    xycoords='axes fraction',
                    bbox=dict(boxstyle="round", fc="white", ec="#999999", alpha=0.8))
    ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d %H:%M"))
    fig.autofmt_xdate()  # rotate/align the date tick labels
    ax.grid(True)
    if save_path is not None:
        fig.savefig(save_path)
    plt.show()
    plt.close(fig)  # figures are created per fold; release them explicitly


def fit_data(X_train_scaled, y_train_scaled, window_size = 60, layers =3, units = 64, batch_size = 64, epochs= 20, dense_layer=1):
    """Train and evaluate one LSTM configuration with time-series cross-validation.

    Steps:
      1. Window the scaled series with ``create_sequneces``.
      2. Hold out the last 20% of windows (no shuffling) as the test set.
      3. Augment the remaining windows with ``data_augmentation``.
      4. For each of 5 ``TimeSeriesSplit`` folds over the augmented data,
         build a fresh model, train it on that fold's training part with
         early stopping on the fold's validation part, and score it on the
         held-out test set after mapping predictions back to prices.
      5. Save/show a prediction plot per fold and average the metrics.

    Relies on notebook-level names: ``scaler_y``, ``df``, ``metrics``,
    ``create_sequneces``, ``data_augmentation`` and ``build_model``.

    NOTE(review): the augmented set interleaves each original window with its
    synthetic copies, so a fold's validation slice contains perturbed copies
    of a handful of windows — confirm this is the intended validation signal.

    Args:
        X_train_scaled: Scaled feature matrix, one row per time step.
        y_train_scaled: Scaled target column aligned with ``X_train_scaled``.
        window_size: Number of past time steps per input window.
        layers: Number of stacked LSTM layers.
        units: Width of every LSTM / hidden Dense layer.
        batch_size: Training batch size.
        epochs: Maximum number of training epochs per fold.
        dense_layer: Number of hidden Dense layers after the LSTM stack.

    Returns:
        Tuple ``(model_name, scores)`` where ``scores`` maps 'MSE', 'RMSE',
        'MAE' and 'R2' to their mean over the folds.
    """
    import os  # local import: only needed to make sure the output folder exists

    print(f"窗口大小: {window_size}, LSTM层数: {layers}, 单元数: {units}, 批量大小: {batch_size}, 训练周期: {epochs}, Dense层数: {dense_layer}")
    lstm_params={'layers': layers, 'units': units,  'dropout_rate': 0.3, 'learning_rate': 0.0001, 'clipvalue': 0.5, 'epochs': epochs,'batch_size': batch_size, 'dense_layer': dense_layer}
    # The name (including the space before "-epochs") is kept verbatim because
    # it is returned to callers and used as a results key and file name.
    model_name=f"Cross-LSTM-win{window_size}-layers{layers}-unit{lstm_params.get('units')}-dense{dense_layer} -epochs{epochs}-b{batch_size}"

    print("划分数据集...")
    X_train_full, y_train_full = create_sequneces(X_train_scaled, y_train_scaled, window_size)
    X_train, X_test, y_train, y_test = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42, shuffle=False)

    X_train_full_augmented, y_train_full_augmented = data_augmentation(X_train, y_train)

    # Fold-independent quantities: true prices of the test windows and their
    # timestamps (the test set is the tail of the series because shuffle=False).
    real_prices = scaler_y.inverse_transform(y_test.reshape(-1, 1))
    dates = pd.to_datetime(df['date'].iloc[-len(real_prices):]).values
    os.makedirs('results', exist_ok=True)

    tscv = TimeSeriesSplit(n_splits=5, test_size=30)
    result = {metric: [] for metric in metrics}
    for fold,(train_index, val_index) in enumerate(tscv.split(X_train_full_augmented)):
        print(f"Fold {fold+1}")
        X_train_fold, X_val_fold = X_train_full_augmented[train_index], X_train_full_augmented[val_index]
        y_train_fold, y_val_fold = y_train_full_augmented[train_index], y_train_full_augmented[val_index]

        # Drop graph/state left by the previous fold's model before building a new one.
        tf.keras.backend.clear_session()
        model = build_model(lstm_params, X_train_fold)

        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

        # Train on THIS fold's training slice. Fitting on the full X_train would
        # make every fold identical and let validation data leak into training.
        model.fit(X_train_fold, y_train_fold, epochs=lstm_params.get('epochs'),
                  batch_size=lstm_params.get('batch_size'),
                  validation_data=(X_val_fold, y_val_fold), verbose=1, callbacks=[early_stopping])

        # Score on the held-out test set in original price units.
        predictions = model.predict(X_test)
        predicted_prices =  scaler_y.inverse_transform(predictions)

        mse = mean_squared_error(real_prices, predicted_prices)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(real_prices, predicted_prices)
        r2 = r2_score(real_prices, predicted_prices)
        result['MSE'].append(mse)
        result['RMSE'].append(rmse)
        result['MAE'].append(mae)
        result['R2'].append(r2)

        stats_text = f"""模型验证和评估：
        • Mean Squared Error (MSE):{mse}
        • Root Mean Squared Error (RMSE):{rmse}
        • Mean Absolute Error (MAE):{mae}
        • R-squared (R2): {r2}
        """

        # Full test span, annotated and saved to disk.
        _plot_price_comparison(
            dates, real_prices, predicted_prices,
            stats_text=stats_text,
            save_path=f'results/model_predicted_prices-{model_name}-{fold+1}.png',
        )
        # Close-up of the most recent 500 points (display only).
        _plot_price_comparison(dates[-500:], real_prices[-500:], predicted_prices[-500:])

    print(f"交叉验证结果：{result}")

    return model_name, {
        'MSE': np.mean(result['MSE']),
        'RMSE': np.mean(result['RMSE']),
        'MAE': np.mean(result['MAE']),
        'R2': np.mean(result['R2'])
    }


In [None]:
# Hyper-parameter grid: every combination below is trained and evaluated once.
window_sizes = [10,30,60,90,120,200]
layers = [2, 3,4,5]
denes_layers = [1,2,3]
units = [64, 128, 256]
results = {}
for window_size in window_sizes:
    for layer in layers:
        for dense_layer in denes_layers:
            for unit in units:
                model_name,  result = fit_data(X_train_scaled, y_train_scaled, window_size =window_size, layers=layer, dense_layer=dense_layer, units=unit)
                results[model_name] = result
                # Checkpoint after every configuration so an interrupted run
                # keeps the scores gathered so far.
                pd.DataFrame(results).to_csv('results/results.csv')


print(f'results:\n {results}')

# Final table: one column per model, one row per metric.
results_df = pd.DataFrame(results)
results_df.to_csv('results/results.csv')
# print(f"模型训练完成，模型保存为 {model_name}.h5")

[1m701/701[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 98ms/step - loss: 0.0027 - mae: 0.0166 - val_loss: 0.0105 - val_mae: 0.0696
Epoch 10/20
[1m 18/701[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:04[0m 183ms/step - loss: 0.0028 - mae: 0.0185

In [None]:
# # 计算每个窗口大小的平均结果
# averages = { window_size: {
#     metrics:np.mean(results[window_size][metric] for metric in metrics)
# } for window_size in window_sizes }

# Log-transform the error metrics that can span orders of magnitude (MSE, RMSE);
# MAE and R2 are kept as-is. Entries are keyed by model name, exactly like
# `results` — keying by a leftover loop variable would collapse them into one.
log_transformed_results = {
    _key: {
        'MSE': np.log1p(_scores['MSE']),
        'RMSE': np.log1p(_scores['RMSE']),
        'MAE': _scores['MAE'],
        'R2': _scores['R2']
    } for _key, _scores in results.items()}

In [None]:
# Pull each metric out of the per-model score dicts. `results[model]` alone is
# the whole dict, which cannot be plotted or passed to np.sqrt.
model_names = list(results)
mse = [results[model]['MSE'] for model in model_names]
rmse = [results[model]['RMSE'] for model in model_names]
mae = [results[model]['MAE'] for model in model_names]
r2 = [results[model]['R2'] for model in model_names]
results_d = pd.DataFrame({'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'R2': r2}, index=model_names)

fig6, ax6 = plt.subplots(figsize=(10, 6))
bar_width = 0.2
# One group of four bars per model.
index = np.arange(len(model_names))
ax6.bar(index, mse, bar_width, label='MSE', color='blue')
ax6.bar(index + bar_width, rmse, bar_width, label='RMSE', color='green')
ax6.bar(index + 2 * bar_width, mae, bar_width, label='MAE', color='red')
ax6.bar(index + 3 * bar_width, r2, bar_width, label='R2', color='purple')
# Title and labels.
ax6.set_title('Comparison of Regression Models')
ax6.set_xlabel('Model Complexity')
ax6.set_ylabel('Metrics Values (Log Scale)')
# NOTE(review): a log axis cannot display non-positive values, so a negative
# R2 will not be visible here.
ax6.set_yscale('log')
# Centre each tick under its four-bar group.
ax6.set_xticks(index + 1.5 * bar_width)
ax6.set_xticklabels(model_names, rotation=90)
ax6.legend()
fig6.tight_layout()
fig6.savefig('results/model_comparison.png')
plt.show()

In [None]:
# Grouped bar chart: one group per metric, one bar per trained model.
# Everything is sized by len(results) because the loop iterates over models,
# not over window sizes.
colors = sns.color_palette("Blues", len(results))
r = np.arange(len(metrics))
plt.figure(figsize=(10, 6))
# Width of a single bar; shrinks so a whole group never exceeds 0.8 axis units.
barWidth = min(0.15, 0.8 / max(len(results), 1))
# Metrics shown after log1p; the rest are shown unchanged. Values are derived
# straight from `results` so this cell does not depend on how any intermediate
# dictionary happens to be keyed.
_log_scaled_metrics = {'MSE', 'RMSE'}
for i, (model_name, result) in enumerate(results.items()):
    avg_metrics = [
        np.log1p(result[metric]) if metric in _log_scaled_metrics else result[metric]
        for metric in metrics
    ]
    bars = plt.bar(r + i * barWidth, avg_metrics, width=barWidth, color=colors[i], edgecolor='white', label=model_name)
    for bar in bars:
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2, yval, f"{yval:.2f}", va='bottom', ha='center')
plt.xlabel("Metrics", fontweight='bold')
plt.ylabel("log-transformed / Original Value", fontweight='bold')
# Put each metric label under the centre of its group of bars.
plt.xticks(r + barWidth * (len(results) / 2 - 0.5), metrics)
plt.title("Log-Transformed Evaluation Metrics for Different Model Configurations", fontweight='bold')

plt.legend()
plt.tight_layout()
plt.show()