In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
import os

# Mount Google Drive so that files stored there are visible under /content/drive.
drive.mount('/content/drive')

# Path of the Excel workbook inside Google Drive; edit it if the file is stored elsewhere.
file_path = '/content/drive/My Drive/data.xlsx'

# Load the workbook. 'time' is parsed as a datetime column but is not used as the index.
df1 = pd.read_excel(file_path, parse_dates=['time'])

display(df1.head(6))
print(df1.dtypes)

# Work on everything except the last 5 rows of the loaded frame.
df = df1.iloc[:-5]
print(f"删除后数据形状：{df.shape}")

Mounted at /content/drive


Unnamed: 0,time,ln_gdp_sa,ln_gdp_seasonal_factor,dln_gdp_sa,city_mean_light,region_mean_light,ln_city,ln_region,GDP,ln_gdp,pc1,pc2,pc3,dpc2,quarter,dln_city,dln_region
0,2008-01-01,7.477351,-0.078167,,12.553247,0.953015,2.529979,-0.048125,1634.65,7.399184,-0.930701,0.185353,0.281816,,1,,
1,2008-04-01,7.498888,-0.053482,0.021537,13.280818,0.986813,2.586321,-0.013275,1711.98,7.445406,-0.810655,0.314787,0.089018,0.129435,2,0.056341,0.03485
2,2008-07-01,7.529319,0.074136,0.03043,13.19119,0.972665,2.579549,-0.027716,2005.11,7.603454,0.061691,0.290403,0.006259,-0.024384,3,-0.006772,-0.014441
3,2008-10-01,7.554362,0.074117,0.025043,13.697327,1.017466,2.617201,0.017316,2055.92,7.628479,0.236543,0.422133,-0.012724,0.13173,4,0.037652,0.045031
4,2009-01-01,7.565732,-0.103575,0.01137,12.906907,0.98098,2.557763,-0.019203,1740.9,7.462157,0.327092,0.505859,-0.071886,0.083726,1,-0.059438,-0.036519
5,2009-04-01,7.599352,-0.054602,0.03362,13.654976,1.015769,2.614104,0.015646,1890.79,7.54475,-0.003495,0.485063,-0.394677,-0.020796,2,0.056341,0.03485


time                      datetime64[ns]
ln_gdp_sa                        float64
ln_gdp_seasonal_factor           float64
dln_gdp_sa                       float64
city_mean_light                  float64
region_mean_light                float64
ln_city                          float64
ln_region                        float64
GDP                              float64
ln_gdp                           float64
pc1                              float64
pc2                              float64
pc3                              float64
dpc2                             float64
quarter                            int64
dln_city                         float64
dln_region                       float64
dtype: object
删除后数据形状：(59, 17)


In [None]:
# 1. 数据准备
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

target = 'dln_gdp_sa'
base_vars = ['pc1', 'pc3', 'dpc2', 'dln_city', 'dln_region']
max_lag = 4

# Add lag-1 .. lag-max_lag copies of every base variable and of the target itself;
# each new column name is also recorded as a candidate regressor.
data = df.copy()
candidates = []

for var in [*base_vars, target]:
    for lag in range(1, max_lag + 1):
        lag_name = f'{var}_lag{lag}'
        candidates.append(lag_name)
        data[lag_name] = df[var].shift(lag)

print(f"候选变量数: {len(candidates)}")
print(f"原始数据: {data.shape}")

# Drop every row that has a missing value in any column.
data_clean1 = data.dropna()
print(f"清理后数据: {data_clean1.shape}")
print(f"删除了 {len(data) - len(data_clean1)} 行缺失值")

# Min-max scale to [-1, 1]: one scaler for the lagged features, one for the target.
# NOTE(review): both scalers are fitted on every row of data_clean1, i.e. before the
# train/validation split made in later cells — confirm this is intended.
scaler_X = MinMaxScaler(feature_range=(-1, 1))
scaler_y = MinMaxScaler(feature_range=(-1, 1))

# Keep only the lagged features, the target and the timestamp.
cols_to_keep_and_scale = [*candidates, target, 'time']
data_to_scale = data_clean1.loc[:, cols_to_keep_and_scale].copy()

# Scale the feature columns.
feature_cols_to_scale = candidates
data_to_scale[feature_cols_to_scale] = scaler_X.fit_transform(data_to_scale[feature_cols_to_scale])

# Scale the target column (kept as a one-column frame for the scaler).
target_col_to_scale = [target]
data_to_scale[target_col_to_scale] = scaler_y.fit_transform(data_to_scale[target_col_to_scale])

# data_clean is the scaled frame; data_clean1 stays unscaled.
data_clean = data_to_scale
print("数据归一化完成")

候选变量数: 24
原始数据: (59, 41)
清理后数据: (54, 41)
删除了 5 行缺失值
数据归一化完成


In [None]:
import joblib
import os
from google.colab import drive

# Make sure Google Drive is mounted; the target folder below lives on it.
drive.mount('/content/drive')

# Folder on Drive that receives the serialized scalers; created if it does not exist.
scaler_save_path = '/content/drive/My Drive'
os.makedirs(scaler_save_path, exist_ok=True)

# One joblib file per scaler.
scaler_x_filename = os.path.join(scaler_save_path, 'scaler_X.joblib')
scaler_y_filename = os.path.join(scaler_save_path, 'scaler_y.joblib')

joblib.dump(value=scaler_X, filename=scaler_x_filename)
joblib.dump(value=scaler_y, filename=scaler_y_filename)
print("归一化器已保存")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
归一化器已保存


In [None]:
# Build X_data / y_data / time_data from the unscaled frame data_clean1,
# provided that frame exists and carries every column needed.
if 'data_clean1' in globals():
    # Name of the dependent variable.
    target = 'dln_gdp_sa'

    # Hand-picked list of lagged regressors used as independent variables.
    independent_variables = [
        'pc3_lag4', 'dln_gdp_sa_lag1', 'dln_gdp_sa_lag4', 'dpc2_lag4',
        'dln_region_lag3', 'pc1_lag1', 'dln_city_lag1', 'dln_gdp_sa_lag3',
        'dln_gdp_sa_lag2', 'dln_region_lag2', 'pc3_lag3', 'dln_city_lag2',
        'dpc2_lag3', 'dpc2_lag1', 'pc1_lag4', 'pc3_lag1',
    ]

    # Every column that must be present: target, regressors and the timestamp.
    required_columns = [target] + independent_variables + ['time']
    missing_cols = [col for col in required_columns if col not in data_clean1.columns]

    if missing_cols:
        print(f"错误：data_clean1 中缺少必需的列：{missing_cols}。请检查数据预处理步骤。")
    else:
        # Independent copies, so later edits do not touch data_clean1.
        X_data = data_clean1.loc[:, independent_variables].copy()
        y_data = data_clean1.loc[:, target].copy()
        time_data = data_clean1.loc[:, 'time'].copy()

        print(f"已从 data_clean1 中创建自变量集 X_data ({X_data.shape}) 和因变量集 y_data ({y_data.shape})。")
        print(f"自变量列: {X_data.columns.tolist()}")
        print(f"因变量列: {target}")
        print(f"时间列已单独保留为 time_data ({time_data.shape})。")

        # Show the first rows of each piece for a quick visual check.
        display("X_data head:", X_data.head())
        display("y_data head:", y_data.head())
        display("time_data head:", time_data.head())
else:
    print("错误：未找到 data_clean1 数据框。请确保您已运行数据加载和预处理代码。")

已从 data_clean1 中创建自变量集 X_data ((54, 16)) 和因变量集 y_data ((54,))。
自变量列: ['pc3_lag4', 'dln_gdp_sa_lag1', 'dln_gdp_sa_lag4', 'dpc2_lag4', 'dln_region_lag3', 'pc1_lag1', 'dln_city_lag1', 'dln_gdp_sa_lag3', 'dln_gdp_sa_lag2', 'dln_region_lag2', 'pc3_lag3', 'dln_city_lag2', 'dpc2_lag3', 'dpc2_lag1', 'pc1_lag4', 'pc3_lag1']
因变量列: dln_gdp_sa
时间列已单独保留为 time_data ((54,))。


'X_data head:'

Unnamed: 0,pc3_lag4,dln_gdp_sa_lag1,dln_gdp_sa_lag4,dpc2_lag4,dln_region_lag3,pc1_lag1,dln_city_lag1,dln_gdp_sa_lag3,dln_gdp_sa_lag2,dln_region_lag2,pc3_lag3,dln_city_lag2,dpc2_lag3,dpc2_lag1,pc1_lag4,pc3_lag1
5,0.089018,0.01137,0.021537,0.129435,-0.014441,0.327092,-0.059438,0.03043,0.025043,0.045031,0.006259,0.037652,-0.024384,0.083726,-0.810655,-0.071886
6,0.006259,0.03362,0.03043,-0.024384,0.045031,-0.003495,0.056341,0.025043,0.01137,-0.036519,-0.012724,-0.059438,0.13173,-0.020796,0.061691,-0.394677
7,-0.012724,0.026588,0.025043,0.13173,-0.036519,0.160463,-0.006772,0.01137,0.03362,0.03485,-0.071886,0.056341,0.083726,-0.300986,0.236543,-0.114024
8,-0.071886,0.040039,0.01137,0.083726,0.03485,0.214866,0.037652,0.03362,0.026588,-0.014441,-0.394677,-0.006772,-0.020796,0.096575,0.327092,-0.031341
9,-0.394677,0.053382,0.03362,-0.020796,-0.014441,-1.119556,-0.015751,0.026588,0.040039,0.045031,-0.114024,0.037652,-0.300986,-0.144311,-0.003495,0.231504


'y_data head:'

Unnamed: 0,dln_gdp_sa
5,0.03362
6,0.026588
7,0.040039
8,0.053382
9,0.049996


'time_data head:'

Unnamed: 0,time
5,2009-04-01
6,2009-07-01
7,2009-10-01
8,2010-01-01
9,2010-04-01


In [None]:
# Target column and the candidate regressors considered by forward selection.
target = 'dln_gdp_sa'
candidate_independent_variables = ['pc3_lag4', 'dln_gdp_sa_lag1', 'dln_gdp_sa_lag4', 'dpc2_lag4', 'dln_region_lag3', 'pc1_lag1', 'dln_city_lag1', 'dln_gdp_sa_lag3', 'dln_gdp_sa_lag2', 'dln_region_lag2', 'pc3_lag3', 'dln_city_lag2', 'dpc2_lag3', 'dpc2_lag1', 'pc1_lag4', 'pc3_lag1']

# All checks run against `data_clean` (the scaled frame), because that is the frame the
# sliding-window builder and the GRU training loops read from. The previous version checked
# `data_clean1` but displayed `data_clean` under a "data_clean1" label.
if target in data_clean.columns:
    print(f"Target variable '{target}' is defined and exists in data_clean.")
else:
    print(f"Error: Target column '{target}' not found in data_clean.")

# Candidate columns that are absent from the frame (empty list when everything is present).
missing_candidate_cols = [col for col in candidate_independent_variables if col not in data_clean.columns]

if missing_candidate_cols:
    print(f"Error: Missing candidate independent variable columns in data_clean: {missing_candidate_cols}")
else:
    print("All candidate independent variable columns exist in data_clean.")
    print(f"Candidate independent variables: {candidate_independent_variables}")

# Show the first rows and the column list of the frame that was just validated.
display("First 5 rows of data_clean:", data_clean.head())
display("Columns in data_clean:", data_clean.columns.tolist())

Target variable 'dln_gdp_sa' is defined and exists in data_clean1.
All candidate independent variable columns exist in data_clean1.
Candidate independent variables: ['pc3_lag4', 'dln_gdp_sa_lag1', 'dln_gdp_sa_lag4', 'dpc2_lag4', 'dln_region_lag3', 'pc1_lag1', 'dln_city_lag1', 'dln_gdp_sa_lag3', 'dln_gdp_sa_lag2', 'dln_region_lag2', 'pc3_lag3', 'dln_city_lag2', 'dpc2_lag3', 'dpc2_lag1', 'pc1_lag4', 'pc3_lag1']


'First 5 rows of data_clean1:'

Unnamed: 0,pc1_lag1,pc1_lag2,pc1_lag3,pc1_lag4,pc3_lag1,pc3_lag2,pc3_lag3,pc3_lag4,dpc2_lag1,dpc2_lag2,...,dln_region_lag1,dln_region_lag2,dln_region_lag3,dln_region_lag4,dln_gdp_sa_lag1,dln_gdp_sa_lag2,dln_gdp_sa_lag3,dln_gdp_sa_lag4,dln_gdp_sa,time
5,0.379795,0.303054,0.154866,-0.584454,-0.088699,0.078325,0.131918,0.365562,0.614922,0.801469,...,-0.040532,0.157725,0.013142,0.132972,0.170179,0.253089,0.285753,0.231828,0.305093,2009-04-01
6,0.09962,0.379795,0.303054,0.154866,-1.0,-0.088699,0.078325,0.131918,0.208738,0.614922,...,0.132972,-0.040532,0.157725,0.013142,0.305093,0.170179,0.253089,0.285753,0.262455,2009-07-01
7,0.238576,0.09962,0.379795,0.303054,-0.207664,-1.0,-0.088699,0.078325,-0.88011,0.208738,...,0.013142,0.132972,-0.040532,0.157725,0.262455,0.305093,0.170179,0.253089,0.344014,2009-10-01
8,0.284683,0.238576,0.09962,0.379795,0.025767,-0.207664,-1.0,-0.088699,0.664855,-0.88011,...,0.157725,0.013142,0.132972,-0.040532,0.344014,0.262455,0.305093,0.170179,0.424926,2010-01-01
9,-0.846249,0.284683,0.238576,0.09962,0.767828,0.025767,-0.207664,-1.0,-0.271255,0.664855,...,0.555243,0.157725,0.013142,0.132972,0.424926,0.344014,0.262455,0.305093,0.404393,2010-04-01


'Columns in data_clean1:'

['pc1_lag1',
 'pc1_lag2',
 'pc1_lag3',
 'pc1_lag4',
 'pc3_lag1',
 'pc3_lag2',
 'pc3_lag3',
 'pc3_lag4',
 'dpc2_lag1',
 'dpc2_lag2',
 'dpc2_lag3',
 'dpc2_lag4',
 'dln_city_lag1',
 'dln_city_lag2',
 'dln_city_lag3',
 'dln_city_lag4',
 'dln_region_lag1',
 'dln_region_lag2',
 'dln_region_lag3',
 'dln_region_lag4',
 'dln_gdp_sa_lag1',
 'dln_gdp_sa_lag2',
 'dln_gdp_sa_lag3',
 'dln_gdp_sa_lag4',
 'dln_gdp_sa',
 'time']

## Define GRU model architecture

### Subtask:
Create a function to build the GRU model with specified layers, dropout, activation, and output shape.


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam

def build_gru_model(input_shape, output_shape, units=3, dropout_rate=0.1, learning_rate=0.001):
    """
    Build and compile a single-layer GRU regression model.

    The network is GRU -> Dropout -> Dense(tanh) and is compiled with the Adam
    optimizer and mean-squared-error loss. The tanh output is bounded to (-1, 1),
    which matches the [-1, 1] min-max scaling applied to the target in this notebook.

    With the default keyword values the model is the same 3-unit architecture as
    before; the extra keywords let callers vary the width, dropout and learning
    rate without copying the function or re-compiling the returned model.

    Args:
        input_shape (tuple): Shape of one input sample, (timesteps, features).
        output_shape (int): Number of units in the output layer (forecast horizon).
        units (int): Number of GRU units. Defaults to 3.
        dropout_rate (float): Dropout rate applied after the GRU layer. Defaults to 0.1.
        learning_rate (float): Adam learning rate. Defaults to 0.001, which is
            Adam's own default.

    Returns:
        A compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # return_sequences=False: only the last GRU state is passed on to the dense head.
    model.add(GRU(units=units, return_sequences=False, input_shape=input_shape))
    model.add(Dropout(dropout_rate))
    model.add(Dense(units=output_shape, activation='tanh'))
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    return model

print("GRU model building function defined.")

GRU model building function defined.


## Implement rolling window cross-validation

### Subtask:
Develop a function that builds fixed-length sliding-window samples (an input window followed by the forecast horizon), then split the samples in order into training and validation sets of fixed size.


In [None]:
import numpy as np

def create_sliding_window_datasets(data, input_length, output_length, independent_variables, target):
    """
    Create sliding-window samples for time series forecasting.

    Window ``i`` uses rows ``i .. i + input_length - 1`` of the independent
    variables as input and the ``output_length`` target values that immediately
    follow as output. Windows advance one row at a time.

    Args:
        data (pd.DataFrame): The input data, ordered in time.
        input_length (int): The length of the input sequence (X).
        output_length (int): The length of the output sequence (y).
        independent_variables (list): Column names of the independent variables.
        target (str): Column name of the target variable.

    Returns:
        tuple: ``(X, y)`` where ``X`` has shape
        ``(n_windows, input_length, n_features)`` and ``y`` has shape
        ``(n_windows, output_length)``. When ``data`` has fewer than
        ``input_length + output_length`` rows, ``n_windows`` is 0 and both
        arrays are empty but keep those trailing dimensions.
    """
    # Select the columns once instead of re-indexing the frame for every window.
    features = data[independent_variables].values
    targets = data[target].values

    n_windows = len(data) - input_length - output_length + 1
    if n_windows <= 0:
        # Too few rows for a single window: return empty arrays with the expected rank,
        # so callers can still rely on X.shape[1:] and y.shape[1:].
        empty_X = np.empty((0, input_length) + features.shape[1:], dtype=features.dtype)
        empty_y = np.empty((0, output_length) + targets.shape[1:], dtype=targets.dtype)
        return empty_X, empty_y

    X = np.stack([features[i:i + input_length] for i in range(n_windows)])
    y = np.stack([
        targets[i + input_length:i + input_length + output_length]
        for i in range(n_windows)
    ])
    return X, y

# Target column and the candidate regressors used to build the full window set.
target = 'dln_gdp_sa'
candidate_independent_variables = [
    'pc3_lag4', 'dln_gdp_sa_lag1', 'dln_gdp_sa_lag4', 'dpc2_lag4',
    'dln_region_lag3', 'pc1_lag1', 'dln_city_lag1', 'dln_gdp_sa_lag3',
    'dln_gdp_sa_lag2', 'dln_region_lag2', 'pc3_lag3', 'dln_city_lag2',
    'dpc2_lag3', 'dpc2_lag1', 'pc1_lag4', 'pc3_lag1',
]

# Window geometry: 40 consecutive rows as input, the following 5 target values as output.
input_length = 40
output_length = 5

# Build the windows from the scaled frame using every candidate variable.
X_full, y_full = create_sliding_window_datasets(
    data_clean, input_length, output_length, candidate_independent_variables, target
)

# The first 8 windows are used for training and the next 2 for validation.
num_samples = len(X_full)
num_train_samples = 8
num_val_samples = 2

if num_samples >= num_train_samples + num_val_samples:
    X_train_full, y_train_full = X_full[:num_train_samples], y_full[:num_train_samples]
    X_val_full = X_full[num_train_samples:num_train_samples + num_val_samples]
    y_val_full = y_full[num_train_samples:num_train_samples + num_val_samples]

    print("Created datasets with sliding window:")
    print(f"X_full shape: {X_full.shape}")
    print(f"y_full shape: {y_full.shape}")
    print(f"X_train_full shape: {X_train_full.shape}")
    print(f"y_train_full shape: {y_train_full.shape}")
    print(f"X_val_full shape: {X_val_full.shape}")
    print(f"y_val_full shape: {y_val_full.shape}")
else:
    print(f"Error: Not enough samples ({num_samples}) for the requested train/validation split ({num_train_samples}/{num_val_samples}).")

Created datasets with sliding window:
X_full shape: (10, 40, 16)
y_full shape: (10, 5)
X_train_full shape: (8, 40, 16)
y_train_full shape: (8, 5)
X_val_full shape: (2, 40, 16)
y_val_full shape: (2, 5)


## Implement forward selection and train GRU models

### Subtask:
Implement the forward selection algorithm to iteratively add variables, train GRU models with different learning rates and a fixed set of random seeds, evaluate performance using the validation loss (averaged over the seeds) with Early Stopping, and save the best model at each step.

In [None]:
import tensorflow as tf
import numpy as np
import random
import joblib
import os
import pandas as pd
from sklearn.metrics import mean_squared_error
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Reproducibility settings.
# Each (variable set, learning rate) combination is trained once per seed in this list.
random_seeds = [1, 2]
# NOTE(review): this is set after `import tensorflow` has already run in this cell;
# presumably TensorFlow still honours it at this point — confirm.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# GRU model factory used by the forward-selection loop in this cell.
def build_gru_model(input_shape, output_shape):
    """Return a compiled 3-unit GRU -> Dropout(0.1) -> Dense(tanh) model.

    Args:
        input_shape (tuple): Shape of one input sample, (timesteps, features).
        output_shape (int): Number of units in the dense output layer.

    Returns:
        A Keras ``Sequential`` model compiled with Adam (default settings) and MSE loss.
    """
    layer_stack = [
        GRU(units=3, return_sequences=False, input_shape=input_shape),
        Dropout(0.1),
        Dense(units=output_shape, activation='tanh'),
    ]
    model = Sequential(layer_stack)
    model.compile(optimizer=Adam(), loss='mse')
    return model

# Target variable and the pool of candidate regressors.
target = 'dln_gdp_sa'
candidate_independent_variables = [
    'pc3_lag4', 'dln_gdp_sa_lag1', 'dln_gdp_sa_lag4', 'dpc2_lag4',
    'dln_region_lag3', 'pc1_lag1', 'dln_city_lag1', 'dln_gdp_sa_lag3',
    'dln_gdp_sa_lag2', 'dln_region_lag2', 'pc3_lag3', 'dln_city_lag2',
    'dpc2_lag3', 'dpc2_lag1', 'pc1_lag4', 'pc3_lag1',
]

# Sliding-window geometry: rows per input window and target values per output window.
input_length = 40
output_length = 5

# Training hyper-parameters.
learning_rates = [0.01, 0.001]
epochs = 50
batch_size = 4
early_stopping_patience = 10
early_stopping_min_delta = 0.001

# Forward-selection state: nothing selected yet, every candidate still available.
selected_variables = []
remaining_variables = list(candidate_independent_variables)
forward_selection_results = []  # one summary row per completed step

print("开始前向选择，使用多个随机种子...")
print(f"随机种子: {random_seeds}")

# Forward selection: in each of up to 7 steps, try adding every remaining candidate to the
# already-selected variables, train one model per (learning rate, seed), and keep the
# candidate whose seed-averaged validation loss is lowest.
for step in range(7):
    best_variable_for_this_step = None
    best_avg_loss_for_this_step = float('inf')
    best_lr_for_this_step = None
    # Full record (metrics + trained models) of the best candidate seen so far in this step.
    best_candidate_data = None
    # Metrics of every (candidate, learning rate) pair tried in this step. Trained models are
    # deliberately NOT stored here: only the best candidate's models are kept alive, so the
    # models of losing runs can be garbage-collected instead of accumulating for the whole step.
    candidate_results = []

    print(f"\n步骤 {step + 1}: 选择第 {len(selected_variables) + 1} 个变量")
    print(f"当前已选变量: {selected_variables}")
    print(f"剩余候选变量: {remaining_variables}")

    for candidate_variable in remaining_variables:
        current_variables = selected_variables + [candidate_variable]
        print(f"\n测试变量组合: {current_variables}")

        # Build the sliding-window samples for this variable combination.
        X_full_current, y_full_current = create_sliding_window_datasets(
            data_clean, input_length, output_length, current_variables, target
        )

        num_samples_current = X_full_current.shape[0]
        num_train_samples_current = 8
        num_val_samples_current = 2

        if num_samples_current < num_train_samples_current + num_val_samples_current:
            print(f"跳过组合，样本不足 ({num_samples_current})")
            continue

        # First 8 windows for training, the following 2 for validation.
        X_train_current = X_full_current[:num_train_samples_current]
        y_train_current = y_full_current[:num_train_samples_current]
        X_val_current = X_full_current[num_train_samples_current:num_train_samples_current + num_val_samples_current]
        y_val_current = y_full_current[num_train_samples_current:num_train_samples_current + num_val_samples_current]

        # Train with each learning rate.
        for lr in learning_rates:
            print(f"  学习率: {lr}")
            seed_losses = []
            seed_models = {}
            seed_epochs = []

            for seed in random_seeds:
                # Seed every random number generator in use before building the model.
                np.random.seed(seed)
                tf.random.set_seed(seed)
                # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
                # here does not change hashing in the running kernel.
                os.environ['PYTHONHASHSEED'] = str(seed)
                random.seed(seed)

                print(f"    种子 {seed}:")

                # Build the model, then re-compile it so Adam uses the learning rate under test.
                model = build_gru_model(input_shape=(input_length, len(current_variables)),
                                        output_shape=output_length)
                model.compile(optimizer=Adam(learning_rate=lr), loss='mse')

                # NOTE(review): min_delta (0.001) is of the same order as the validation losses
                # printed by this run, so small improvements do not count as improvements for
                # the callback. The loss recorded below is the minimum over the history and may
                # therefore belong to a different epoch than the weights the callback keeps —
                # confirm this is acceptable.
                early_stopping = EarlyStopping(
                    monitor='val_loss',
                    patience=early_stopping_patience,
                    min_delta=early_stopping_min_delta,
                    restore_best_weights=True
                )

                history = model.fit(
                    X_train_current, y_train_current,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_data=(X_val_current, y_val_current),
                    callbacks=[early_stopping],
                    verbose=0
                )

                # Lowest validation loss of the run and the (1-based) epoch where it occurred.
                val_loss_history = history.history['val_loss']
                min_val_loss = min(val_loss_history)
                best_epoch = val_loss_history.index(min_val_loss) + 1

                seed_losses.append(min_val_loss)
                seed_models[seed] = model
                seed_epochs.append(best_epoch)

                print(f"      验证损失: {min_val_loss:.4f} (轮次: {best_epoch})")

            # Average the per-seed losses for this (candidate, learning rate) pair.
            avg_loss = np.mean(seed_losses)
            print(f"  平均验证损失: {avg_loss:.4f} (学习率 {lr})")

            candidate_results.append({
                'variable': candidate_variable,
                'lr': lr,
                'avg_loss': avg_loss,
                'seed_losses': seed_losses,
                'seed_epochs': seed_epochs,
            })

            # Strict '<' keeps the first pair that reaches the lowest average loss.
            if avg_loss < best_avg_loss_for_this_step:
                best_avg_loss_for_this_step = avg_loss
                best_variable_for_this_step = candidate_variable
                best_lr_for_this_step = lr
                best_candidate_data = {
                    'variable': candidate_variable,
                    'lr': lr,
                    'avg_loss': avg_loss,
                    'seed_losses': seed_losses,
                    'seed_epochs': seed_epochs,
                    'seed_models': seed_models,
                }

                # Checkpoint the best model so far (the seed with the lowest loss).
                best_seed_idx = np.argmin(seed_losses)
                best_seed = random_seeds[best_seed_idx]
                best_model_for_step = seed_models[best_seed]

                temp_model_save_path = f'/content/drive/My Drive/GRU3temp_best_model_step_{step+1}.keras'
                best_model_for_step.save(temp_model_save_path)
                print(f"  临时保存步骤 {step+1} 最佳模型: '{candidate_variable}', 学习率 {lr}, 种子 {best_seed}")

    # Move the winning variable from the remaining pool to the selected list.
    if best_variable_for_this_step is not None:
        selected_variables.append(best_variable_for_this_step)
        remaining_variables.remove(best_variable_for_this_step)

        print(f"\n--- 步骤 {step + 1} 结果 ---")
        print(f"选中变量: '{best_variable_for_this_step}'")
        print(f"最佳平均验证损失: {best_avg_loss_for_this_step:.4f}")
        print(f"最佳学习率: {best_lr_for_this_step}")
        print(f"当前已选变量: {selected_variables}")

        # Summary row for the results table.
        step_result = {
            'Step': step + 1,
            'Variable number': len(selected_variables),
            'Selected Variables': ', '.join(selected_variables),
            'Best average Val Loss': best_avg_loss_for_this_step,
            'Best average LR': best_lr_for_this_step,
        }

        # Per-seed details of the winning (variable, learning rate) pair.
        for i, seed in enumerate(random_seeds):
            step_result[f'seed {seed} Val Loss'] = best_candidate_data['seed_losses'][i]
            step_result[f'seed {seed} Epochs Used'] = best_candidate_data['seed_epochs'][i]

        forward_selection_results.append(step_result)

        # Save the final model of the step: the seed with the lowest validation loss.
        best_seed_idx = np.argmin(best_candidate_data['seed_losses'])
        best_seed = random_seeds[best_seed_idx]
        best_model = best_candidate_data['seed_models'][best_seed]

        # File naming rule: GRU3_{number of selected variables}.keras
        final_model_name = f'GRU3_{len(selected_variables)}.keras'
        final_model_save_path = os.path.join('/content/drive/My Drive', final_model_name)
        best_model.save(final_model_save_path)
        print(f"已保存最优模型到Google Drive: '{final_model_name}' (seed {best_seed})")

    else:
        # Reached only when every candidate was skipped (no run produced a loss).
        print(f"\n--- 步骤 {step + 1} 结果 ---")
        print("本步骤无改进变量")
        break

print("\n前向选择完成")
print(f"最终选中变量: {selected_variables}")

# Persist the per-step summary table to Google Drive.
results_df = pd.DataFrame(forward_selection_results)
results_csv_path = os.path.join('/content/drive/My Drive', 'GRU3_results.csv')
results_df.to_csv(results_csv_path, index=False)
print(f"已保存前向选择结果表格到Google Drive: {results_csv_path}")

# Show the summary table.
print("\n前向选择详细结果:")
display(results_df)

开始前向选择，使用多个随机种子...
随机种子: [1, 2]

步骤 1: 选择第 1 个变量
当前已选变量: []
剩余候选变量: ['pc3_lag4', 'dln_gdp_sa_lag1', 'dln_gdp_sa_lag4', 'dpc2_lag4', 'dln_region_lag3', 'pc1_lag1', 'dln_city_lag1', 'dln_gdp_sa_lag3', 'dln_gdp_sa_lag2', 'dln_region_lag2', 'pc3_lag3', 'dln_city_lag2', 'dpc2_lag3', 'dpc2_lag1', 'pc1_lag4', 'pc3_lag1']

测试变量组合: ['pc3_lag4']
  学习率: 0.01
    种子 1:
      验证损失: 0.0026 (轮次: 20)
    种子 2:
      验证损失: 0.0025 (轮次: 14)
  平均验证损失: 0.0025 (学习率 0.01)
  临时保存步骤 1 最佳模型: 'pc3_lag4', 学习率 0.01, 种子 2
  学习率: 0.001
    种子 1:
      验证损失: 0.0029 (轮次: 50)
    种子 2:
      验证损失: 0.0064 (轮次: 50)
  平均验证损失: 0.0046 (学习率 0.001)

测试变量组合: ['dln_gdp_sa_lag1']
  学习率: 0.01
    种子 1:
      验证损失: 0.0025 (轮次: 17)
    种子 2:
      验证损失: 0.0035 (轮次: 25)
  平均验证损失: 0.0030 (学习率 0.01)
  学习率: 0.001
    种子 1:
      验证损失: 0.0071 (轮次: 50)
    种子 2:
      验证损失: 0.0098 (轮次: 50)
  平均验证损失: 0.0085 (学习率 0.001)

测试变量组合: ['dln_gdp_sa_lag4']
  学习率: 0.01
    种子 1:
      验证损失: 0.0023 (轮次: 19)
    种子 2:
      验证损失: 0.0030 (轮次: 10)
  平均

Unnamed: 0,Step,Variable number,Selected Variables,Best average Val Loss,Best average LR,seed 1 Val Loss,seed 1 Epochs Used,seed 2 Val Loss,seed 2 Epochs Used
0,1,1,dpc2_lag3,0.001549,0.01,0.001641,8,0.001458,11
1,2,2,"dpc2_lag3, dln_region_lag3",0.001558,0.01,0.00164,9,0.001475,14
2,3,3,"dpc2_lag3, dln_region_lag3, dln_city_lag2",0.001432,0.01,0.001335,9,0.001529,22
3,4,4,"dpc2_lag3, dln_region_lag3, dln_city_lag2, dln...",0.001399,0.01,0.001266,11,0.001532,18
4,5,5,"dpc2_lag3, dln_region_lag3, dln_city_lag2, dln...",0.001237,0.01,0.001233,9,0.001242,20
5,6,6,"dpc2_lag3, dln_region_lag3, dln_city_lag2, dln...",0.000943,0.01,0.000772,9,0.001113,18
6,7,7,"dpc2_lag3, dln_region_lag3, dln_city_lag2, dln...",0.00077,0.01,0.000779,9,0.000762,21


# 试试GRU 4

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam

def build_gru_model_4_neurons(input_shape, output_shape):
    """
    Build and compile a one-layer GRU model with 4 units.

    The layers are GRU(4) -> Dropout(0.1) -> Dense(output_shape, tanh); the model
    is compiled with Adam (default settings) and mean-squared-error loss.

    Args:
        input_shape (tuple): Shape of one input sample, (timesteps, features).
        output_shape (int): Number of units in the dense output layer.

    Returns:
        A compiled Keras ``Sequential`` model.
    """
    layer_stack = [
        GRU(units=4, return_sequences=False, input_shape=input_shape),
        Dropout(0.1),
        Dense(units=output_shape, activation='tanh'),
    ]
    model = Sequential(layer_stack)
    model.compile(optimizer=Adam(), loss='mse')
    return model

print("New GRU model building function with 4 neurons defined.")

New GRU model building function with 4 neurons defined.


In [None]:
import tensorflow as tf
import numpy as np
import random
import joblib
import os
import pandas as pd
from sklearn.metrics import mean_squared_error
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Reproducibility settings.
# Each (variable set, learning rate) combination is trained once per seed in this list.
random_seeds = [1, 2]
# NOTE(review): this is set after `import tensorflow` has already run in this cell;
# presumably TensorFlow still honours it at this point — confirm.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# 3-unit GRU model factory.
# NOTE(review): the training loop in this cell calls build_gru_model_4_neurons instead,
# so this redefinition looks unused here — confirm before relying on it.
def build_gru_model(input_shape, output_shape):
    """Return a compiled 3-unit GRU -> Dropout(0.1) -> Dense(tanh) model.

    Args:
        input_shape (tuple): Shape of one input sample, (timesteps, features).
        output_shape (int): Number of units in the dense output layer.

    Returns:
        A Keras ``Sequential`` model compiled with Adam (default settings) and MSE loss.
    """
    layer_stack = [
        GRU(units=3, return_sequences=False, input_shape=input_shape),
        Dropout(0.1),
        Dense(units=output_shape, activation='tanh'),
    ]
    model = Sequential(layer_stack)
    model.compile(optimizer=Adam(), loss='mse')
    return model

# Target variable and the pool of candidate regressors.
target = 'dln_gdp_sa'
candidate_independent_variables = [
    'pc3_lag4', 'dln_gdp_sa_lag1', 'dln_gdp_sa_lag4', 'dpc2_lag4',
    'dln_region_lag3', 'pc1_lag1', 'dln_city_lag1', 'dln_gdp_sa_lag3',
    'dln_gdp_sa_lag2', 'dln_region_lag2', 'pc3_lag3', 'dln_city_lag2',
    'dpc2_lag3', 'dpc2_lag1', 'pc1_lag4', 'pc3_lag1',
]

# Sliding-window geometry: rows per input window and target values per output window.
input_length = 40
output_length = 5

# Training hyper-parameters.
learning_rates = [0.01, 0.001]
epochs = 50
batch_size = 4
early_stopping_patience = 10
early_stopping_min_delta = 0.001

# Forward-selection state: nothing selected yet, every candidate still available.
selected_variables = []
remaining_variables = list(candidate_independent_variables)
forward_selection_results = []  # one summary row per completed step

print("开始前向选择，使用多个随机种子...")
print(f"随机种子: {random_seeds}")

for step in range(7):
    best_variable_for_this_step = None
    best_avg_loss_for_this_step = float('inf')
    best_lr_for_this_step = None
    candidate_results = []  # 存储候选变量结果

    print(f"\n步骤 {step + 1}: 选择第 {len(selected_variables) + 1} 个变量")
    print(f"当前已选变量: {selected_variables}")
    print(f"剩余候选变量: {remaining_variables}")

    for candidate_variable in remaining_variables:
        current_variables = selected_variables + [candidate_variable]
        print(f"\n测试变量组合: {current_variables}")

        # 创建滑动窗口数据集
        X_full_current, y_full_current = create_sliding_window_datasets(
            data_clean, input_length, output_length, current_variables, target
        )

        num_samples_current = X_full_current.shape[0]
        num_train_samples_current = 8
        num_val_samples_current = 2

        if num_samples_current < num_train_samples_current + num_val_samples_current:
             print(f"跳过组合，样本不足 ({num_samples_current})")
             continue

        X_train_current = X_full_current[:num_train_samples_current]
        y_train_current = y_full_current[:num_train_samples_current]
        X_val_current = X_full_current[num_train_samples_current:num_train_samples_current + num_val_samples_current]
        y_val_current = y_full_current[num_train_samples_current:num_train_samples_current + num_val_samples_current]

        # 不同学习率训练
        for lr in learning_rates:
            print(f"  学习率: {lr}")
            seed_losses = []
            seed_models = {}
            seed_epochs = []

            for seed in random_seeds:
                # 设置种子
                np.random.seed(seed)
                tf.random.set_seed(seed)
                os.environ['PYTHONHASHSEED'] = str(seed)
                random.seed(seed)

                print(f"    种子 {seed}:")

                # 构建模型
                model = build_gru_model_4_neurons(input_shape=(input_length, len(current_variables)),
                                      output_shape=output_length)
                model.compile(optimizer=Adam(learning_rate=lr), loss='mse')

                # 早停回调
                early_stopping = EarlyStopping(
                    monitor='val_loss',
                    patience=early_stopping_patience,
                    min_delta=early_stopping_min_delta,
                    restore_best_weights=True
                )

                # 训练模型
                history = model.fit(
                    X_train_current, y_train_current,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_data=(X_val_current, y_val_current),
                    callbacks=[early_stopping],
                    verbose=0
                )

                # 获取最佳验证损失和轮次
                min_val_loss = min(history.history['val_loss'])
                best_epoch = history.history['val_loss'].index(min_val_loss) + 1

                # 记录结果
                seed_losses.append(min_val_loss)
                seed_models[seed] = model
                seed_epochs.append(best_epoch)

                print(f"      验证损失: {min_val_loss:.4f} (轮次: {best_epoch})")

            # 计算平均损失
            avg_loss = np.mean(seed_losses)
            print(f"  平均验证损失: {avg_loss:.4f} (学习率 {lr})")

            # 存储候选结果
            candidate_results.append({
                'variable': candidate_variable,
                'lr': lr,
                'avg_loss': avg_loss,
                'seed_losses': seed_losses,
                'seed_epochs': seed_epochs,
                'seed_models': seed_models
            })

            # 检查是否最佳组合
            if avg_loss < best_avg_loss_for_this_step:
                best_avg_loss_for_this_step = avg_loss
                best_variable_for_this_step = candidate_variable
                best_lr_for_this_step = lr

                # 暂时保存最佳模型
                best_seed_idx = np.argmin(seed_losses)
                best_seed = random_seeds[best_seed_idx]
                best_model_for_step = seed_models[best_seed]

                temp_model_save_path = f'/content/drive/My Drive/GRU4temp_best_model_step_{step+1}.keras'
                best_model_for_step.save(temp_model_save_path)
                print(f"  临时保存步骤 {step+1} 最佳模型: '{candidate_variable}', 学习率 {lr}, 种子 {best_seed}")

    # 添加最佳变量到已选列表
    if best_variable_for_this_step:
        selected_variables.append(best_variable_for_this_step)
        remaining_variables.remove(best_variable_for_this_step)

        print(f"\n--- 步骤 {step + 1} 结果 ---")
        print(f"选中变量: '{best_variable_for_this_step}'")
        print(f"最佳平均验证损失: {best_avg_loss_for_this_step:.4f}")
        print(f"最佳学习率: {best_lr_for_this_step}")
        print(f"当前已选变量: {selected_variables}")

        # 获取最佳候选结果
        best_candidate_data = [d for d in candidate_results
                              if d['variable'] == best_variable_for_this_step
                              and d['lr'] == best_lr_for_this_step][0]

        # 记录结果到表格
        step_result = {
            'Step': step + 1,
            'Variable number': len(selected_variables),
            'Selected Variables': ', '.join(selected_variables),
            'Best average Val Loss': best_avg_loss_for_this_step,
            'Best average LR': best_lr_for_this_step,
        }

        # 添加种子详细结果
        for i, seed in enumerate(random_seeds):
            step_result[f'seed {seed} Val Loss'] = best_candidate_data['seed_losses'][i]
            step_result[f'seed {seed} Epochs Used'] = best_candidate_data['seed_epochs'][i]

        forward_selection_results.append(step_result)

        # 保存最终模型（两个种子中MSE最低的）
        best_seed_idx = np.argmin(best_candidate_data['seed_losses'])
        best_seed = random_seeds[best_seed_idx]
        best_model = best_candidate_data['seed_models'][best_seed]

        # 修改：使用新命名规则 GRU4_{变量数量}
        final_model_name = f'GRU4_{len(selected_variables)}.keras'  # 修改点
        final_model_save_path = os.path.join('/content/drive/My Drive', final_model_name)
        best_model.save(final_model_save_path)
        print(f"已保存最优模型到Google Drive: '{final_model_name}' (seed {best_seed})")

    else:
        print(f"\n--- 步骤 {step + 1} 结果 ---")
        print("本步骤无改进变量")
        break

print("\n前向选择完成")
print(f"最终选中变量: {selected_variables}")

# 保存结果表格
results_df = pd.DataFrame(forward_selection_results)
results_csv_path = os.path.join('/content/drive/My Drive', 'GRU4_results.csv')
results_df.to_csv(results_csv_path, index=False)
print(f"已保存前向选择结果表格到Google Drive: {results_csv_path}")

# 显示结果表格
print("\n前向选择详细结果:")
display(results_df)

开始前向选择，使用多个随机种子...
随机种子: [1, 2]

步骤 1: 选择第 1 个变量
当前已选变量: []
剩余候选变量: ['pc3_lag4', 'dln_gdp_sa_lag1', 'dln_gdp_sa_lag4', 'dpc2_lag4', 'dln_region_lag3', 'pc1_lag1', 'dln_city_lag1', 'dln_gdp_sa_lag3', 'dln_gdp_sa_lag2', 'dln_region_lag2', 'pc3_lag3', 'dln_city_lag2', 'dpc2_lag3', 'dpc2_lag1', 'pc1_lag4', 'pc3_lag1']

测试变量组合: ['pc3_lag4']
  学习率: 0.01
    种子 1:
      验证损失: 0.0017 (轮次: 12)
    种子 2:
      验证损失: 0.0017 (轮次: 18)
  平均验证损失: 0.0017 (学习率 0.01)
  临时保存步骤 1 最佳模型: 'pc3_lag4', 学习率 0.01, 种子 2
  学习率: 0.001
    种子 1:
      验证损失: 0.0017 (轮次: 50)
    种子 2:
      验证损失: 0.0043 (轮次: 50)
  平均验证损失: 0.0030 (学习率 0.001)

测试变量组合: ['dln_gdp_sa_lag1']
  学习率: 0.01
    种子 1:
      验证损失: 0.0023 (轮次: 14)
    种子 2:
      验证损失: 0.0024 (轮次: 19)
  平均验证损失: 0.0023 (学习率 0.01)
  学习率: 0.001
    种子 1:
      验证损失: 0.0030 (轮次: 50)
    种子 2:
      验证损失: 0.0088 (轮次: 50)
  平均验证损失: 0.0059 (学习率 0.001)

测试变量组合: ['dln_gdp_sa_lag4']
  学习率: 0.01
    种子 1:
      验证损失: 0.0019 (轮次: 24)
    种子 2:
      验证损失: 0.0022 (轮次: 14)
  平均

Unnamed: 0,Step,Variable number,Selected Variables,Best average Val Loss,Best average LR,seed 1 Val Loss,seed 1 Epochs Used,seed 2 Val Loss,seed 2 Epochs Used
0,1,1,pc1_lag1,0.001646,0.01,0.001384,24,0.001908,17
1,2,2,"pc1_lag1, dln_region_lag2",0.001432,0.01,0.001406,26,0.001457,19
2,3,3,"pc1_lag1, dln_region_lag2, dln_region_lag3",0.001344,0.01,0.001406,27,0.001281,11
3,4,4,"pc1_lag1, dln_region_lag2, dln_region_lag3, dl...",0.001372,0.01,0.001414,27,0.00133,11
4,5,5,"pc1_lag1, dln_region_lag2, dln_region_lag3, dl...",0.001532,0.01,0.001309,8,0.001755,9
5,6,6,"pc1_lag1, dln_region_lag2, dln_region_lag3, dl...",0.001336,0.01,0.001206,7,0.001466,9
6,7,7,"pc1_lag1, dln_region_lag2, dln_region_lag3, dl...",0.001322,0.01,0.000994,7,0.001651,10


# GRU (2 layers × 2 units)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam

def build_gru_model_2_layers_2_neurons(input_shape, output_shape, learning_rate=None, dropout_rate=0.1):
    """
    Build and compile a stacked GRU forecaster with two GRU layers of 2 units each.

    Architecture: GRU(2, return_sequences=True) -> Dropout -> GRU(2) -> Dropout
    -> Dense(output_shape, activation='tanh').

    Args:
        input_shape (tuple): Shape of one input sample, (timesteps, features).
        output_shape (int): Number of units in the output layer.
        learning_rate (float, optional): Learning rate handed to the Adam
            optimizer. When None (the default) Adam is created without
            arguments, exactly as before, so existing callers are unaffected.
            Passing a value removes the need to re-compile the model afterwards.
        dropout_rate (float): Dropout fraction applied after each GRU layer.
            Defaults to 0.1, the value that used to be hard-coded.

    Returns:
        keras.Sequential: Model compiled with the Adam optimizer and MSE loss.
    """
    optimizer = Adam() if learning_rate is None else Adam(learning_rate=learning_rate)

    model = Sequential()
    # The first GRU layer returns the full sequence so that the second GRU
    # layer still receives a time axis to iterate over.
    model.add(GRU(units=2, return_sequences=True, input_shape=input_shape))
    model.add(Dropout(dropout_rate))
    # The last GRU layer only emits its final state.
    model.add(GRU(units=2, return_sequences=False))
    model.add(Dropout(dropout_rate))
    # tanh keeps every predicted value inside (-1, 1).
    model.add(Dense(units=output_shape, activation='tanh'))
    model.compile(optimizer=optimizer, loss='mse')
    return model

print("New GRU model building function with 2 layers (2 neurons each) defined.")

New GRU model building function with 2 layers (2 neurons each) defined.


In [None]:
import tensorflow as tf
import numpy as np
import random
import joblib
import os
import pandas as pd
from sklearn.metrics import mean_squared_error
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Reproducibility settings: set the TF_DETERMINISTIC_OPS flag and fix the list
# of seeds each candidate model is trained with.
# NOTE(review): the flag is set after `import tensorflow` - confirm it is still honoured.
os.environ['TF_DETERMINISTIC_OPS'] = '1'
random_seeds = [1, 2]

# Single-layer GRU model builder.
# NOTE(review): nothing in this cell calls it (the selection loop below uses
# build_gru_model_2_layers_2_neurons); kept because other cells may rely on it.
def build_gru_model(input_shape, output_shape):
    """Build and compile a one-layer GRU forecaster (3 units).

    Args:
        input_shape (tuple): Shape of one input sample, (timesteps, features).
        output_shape (int): Number of units in the tanh output layer.

    Returns:
        The Sequential model, compiled with a default Adam optimizer and MSE loss.
    """
    layer_stack = [
        GRU(units=3, return_sequences=False, input_shape=input_shape),
        Dropout(0.1),
        Dense(units=output_shape, activation='tanh'),
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer=Adam(), loss='mse')
    return network

# Target series and the pool of lagged regressors forward selection may pick from
target = 'dln_gdp_sa'
candidate_independent_variables = [
    'pc3_lag4', 'dln_gdp_sa_lag1', 'dln_gdp_sa_lag4', 'dpc2_lag4',
    'dln_region_lag3', 'pc1_lag1', 'dln_city_lag1', 'dln_gdp_sa_lag3',
    'dln_gdp_sa_lag2', 'dln_region_lag2', 'pc3_lag3', 'dln_city_lag2',
    'dpc2_lag3', 'dpc2_lag1', 'pc1_lag4', 'pc3_lag1',
]

# Sliding-window geometry: rows fed to the model / rows predicted
input_length, output_length = 40, 5

# Training hyper-parameters
learning_rates = [0.01, 0.001]
epochs, batch_size = 50, 4
early_stopping_patience = 10
# NOTE(review): the validation losses printed by this notebook are of the same
# order of magnitude (~0.001-0.002) as this threshold - confirm it is intended.
early_stopping_min_delta = 0.001

# Forward-selection state
selected_variables = []
remaining_variables = list(candidate_independent_variables)  # independent copy
forward_selection_results = []  # one summary row per completed step

print("开始前向选择，使用多个随机种子...")
print(f"随机种子: {random_seeds}")

# Greedy forward selection. Each of the 7 steps tries every remaining variable
# on top of the already selected ones and keeps the (variable, learning rate)
# pair with the lowest validation loss averaged over `random_seeds`.
for step in range(7):
    best_variable_for_this_step = None
    best_avg_loss_for_this_step = float('inf')
    best_lr_for_this_step = None
    candidate_results = []  # one record per (candidate variable, learning rate)

    print(f"\n步骤 {step + 1}: 选择第 {len(selected_variables) + 1} 个变量")
    print(f"当前已选变量: {selected_variables}")
    print(f"剩余候选变量: {remaining_variables}")

    for candidate_variable in remaining_variables:
        current_variables = selected_variables + [candidate_variable]
        print(f"\n测试变量组合: {current_variables}")

        # Build the sliding-window samples for this variable combination
        X_full_current, y_full_current = create_sliding_window_datasets(
            data_clean, input_length, output_length, current_variables, target
        )

        num_samples_current = X_full_current.shape[0]
        num_train_samples_current = 8
        num_val_samples_current = 2

        if num_samples_current < num_train_samples_current + num_val_samples_current:
            print(f"跳过组合，样本不足 ({num_samples_current})")
            continue

        # The first 8 windows train the model, the following 2 validate it
        X_train_current = X_full_current[:num_train_samples_current]
        y_train_current = y_full_current[:num_train_samples_current]
        X_val_current = X_full_current[num_train_samples_current:num_train_samples_current + num_val_samples_current]
        y_val_current = y_full_current[num_train_samples_current:num_train_samples_current + num_val_samples_current]

        # Train once per learning rate and seed
        for lr in learning_rates:
            print(f"  学习率: {lr}")
            seed_losses = []
            seed_models = {}
            seed_epochs = []

            for seed in random_seeds:
                # Re-seed every random source before the model is built
                np.random.seed(seed)
                tf.random.set_seed(seed)
                os.environ['PYTHONHASHSEED'] = str(seed)
                random.seed(seed)

                print(f"    种子 {seed}:")

                # Build the model, then re-compile it with the learning rate under test
                model = build_gru_model_2_layers_2_neurons(input_shape=(input_length, len(current_variables)),
                                      output_shape=output_length)
                model.compile(optimizer=Adam(learning_rate=lr), loss='mse')

                # Early-stopping callback
                early_stopping = EarlyStopping(
                    monitor='val_loss',
                    patience=early_stopping_patience,
                    min_delta=early_stopping_min_delta,
                    restore_best_weights=True
                )

                # Train the model
                history = model.fit(
                    X_train_current, y_train_current,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_data=(X_val_current, y_val_current),
                    callbacks=[early_stopping],
                    verbose=0
                )

                # Record the validation loss of the weights the model holds
                # after training. EarlyStopping only treats an epoch as "best"
                # when it beats the previous best by more than `min_delta`, so
                # min(history) may belong to an epoch whose weights were
                # discarded. Evaluating the trained model keeps the recorded
                # loss consistent with the model that is saved below.
                val_loss_history = np.asarray(history.history['val_loss'])
                min_val_loss = float(np.ravel(model.evaluate(
                    X_val_current, y_val_current,
                    batch_size=batch_size, verbose=0
                ))[0])
                # Epoch whose logged validation loss matches the kept weights
                best_epoch = int(np.argmin(np.abs(val_loss_history - min_val_loss))) + 1

                # Record the per-seed results
                seed_losses.append(min_val_loss)
                seed_models[seed] = model
                seed_epochs.append(best_epoch)

                print(f"      验证损失: {min_val_loss:.4f} (轮次: {best_epoch})")

            # Average the loss over the seeds
            avg_loss = np.mean(seed_losses)
            print(f"  平均验证损失: {avg_loss:.4f} (学习率 {lr})")

            # Store the record for this (variable, learning rate) pair
            candidate_results.append({
                'variable': candidate_variable,
                'lr': lr,
                'avg_loss': avg_loss,
                'seed_losses': seed_losses,
                'seed_epochs': seed_epochs,
                'seed_models': seed_models
            })

            # Is this the best combination seen so far in this step?
            if avg_loss < best_avg_loss_for_this_step:
                best_avg_loss_for_this_step = avg_loss
                best_variable_for_this_step = candidate_variable
                best_lr_for_this_step = lr

                # Checkpoint the better of the seed models for the running best
                best_seed_idx = np.argmin(seed_losses)
                best_seed = random_seeds[best_seed_idx]
                best_model_for_step = seed_models[best_seed]

                temp_model_save_path = f'/content/drive/My Drive/GRU22temp_best_model_step_{step+1}.keras'
                best_model_for_step.save(temp_model_save_path)
                print(f"  临时保存步骤 {step+1} 最佳模型: '{candidate_variable}', 学习率 {lr}, 种子 {best_seed}")

    # Move the winning variable from the candidate pool to the selected list
    if best_variable_for_this_step is not None:
        selected_variables.append(best_variable_for_this_step)
        remaining_variables.remove(best_variable_for_this_step)

        print(f"\n--- 步骤 {step + 1} 结果 ---")
        print(f"选中变量: '{best_variable_for_this_step}'")
        print(f"最佳平均验证损失: {best_avg_loss_for_this_step:.4f}")
        print(f"最佳学习率: {best_lr_for_this_step}")
        print(f"当前已选变量: {selected_variables}")

        # Fetch the record of the winning (variable, learning rate) pair
        best_candidate_data = [d for d in candidate_results
                              if d['variable'] == best_variable_for_this_step
                              and d['lr'] == best_lr_for_this_step][0]

        # Summary row for the results table
        step_result = {
            'Step': step + 1,
            'Variable number': len(selected_variables),
            'Selected Variables': ', '.join(selected_variables),
            'Best average Val Loss': best_avg_loss_for_this_step,
            'Best average LR': best_lr_for_this_step,
        }

        # Per-seed details
        for i, seed in enumerate(random_seeds):
            step_result[f'seed {seed} Val Loss'] = best_candidate_data['seed_losses'][i]
            step_result[f'seed {seed} Epochs Used'] = best_candidate_data['seed_epochs'][i]

        forward_selection_results.append(step_result)

        # Save the final model for this step (the seed with the lowest loss)
        best_seed_idx = np.argmin(best_candidate_data['seed_losses'])
        best_seed = random_seeds[best_seed_idx]
        best_model = best_candidate_data['seed_models'][best_seed]

        # Naming rule: GRU22_{number of selected variables}
        final_model_name = f'GRU22_{len(selected_variables)}.keras'
        final_model_save_path = os.path.join('/content/drive/My Drive', final_model_name)
        best_model.save(final_model_save_path)
        print(f"已保存最优模型到Google Drive: '{final_model_name}' (seed {best_seed})")

    else:
        # Only reached when every candidate was skipped for lack of samples;
        # the loop never compares against the previous step's loss.
        print(f"\n--- 步骤 {step + 1} 结果 ---")
        print("本步骤无改进变量")
        break

print("\n前向选择完成")
print(f"最终选中变量: {selected_variables}")

# Save the results table
results_df = pd.DataFrame(forward_selection_results)
results_csv_path = os.path.join('/content/drive/My Drive', 'GRU22_results.csv')
results_df.to_csv(results_csv_path, index=False)
print(f"已保存前向选择结果表格到Google Drive: {results_csv_path}")

# Show the results table
print("\n前向选择详细结果:")
display(results_df)

开始前向选择，使用多个随机种子...
随机种子: [1, 2]

步骤 1: 选择第 1 个变量
当前已选变量: []
剩余候选变量: ['pc3_lag4', 'dln_gdp_sa_lag1', 'dln_gdp_sa_lag4', 'dpc2_lag4', 'dln_region_lag3', 'pc1_lag1', 'dln_city_lag1', 'dln_gdp_sa_lag3', 'dln_gdp_sa_lag2', 'dln_region_lag2', 'pc3_lag3', 'dln_city_lag2', 'dpc2_lag3', 'dpc2_lag1', 'pc1_lag4', 'pc3_lag1']

测试变量组合: ['pc3_lag4']
  学习率: 0.01
    种子 1:
      验证损失: 0.0019 (轮次: 11)
    种子 2:
      验证损失: 0.0018 (轮次: 10)
  平均验证损失: 0.0019 (学习率 0.01)
  临时保存步骤 1 最佳模型: 'pc3_lag4', 学习率 0.01, 种子 2
  学习率: 0.001
    种子 1:
      验证损失: 0.0081 (轮次: 50)
    种子 2:
      验证损失: 0.0072 (轮次: 50)
  平均验证损失: 0.0076 (学习率 0.001)

测试变量组合: ['dln_gdp_sa_lag1']
  学习率: 0.01
    种子 1:
      验证损失: 0.0016 (轮次: 10)
    种子 2:
      验证损失: 0.0019 (轮次: 9)
  平均验证损失: 0.0017 (学习率 0.01)
  临时保存步骤 1 最佳模型: 'dln_gdp_sa_lag1', 学习率 0.01, 种子 1
  学习率: 0.001
    种子 1:
      验证损失: 0.0076 (轮次: 50)
    种子 2:
      验证损失: 0.0075 (轮次: 50)
  平均验证损失: 0.0075 (学习率 0.001)

测试变量组合: ['dln_gdp_sa_lag4']
  学习率: 0.01
    种子 1:
      验证损失: 0.0020 (

Unnamed: 0,Step,Variable number,Selected Variables,Best average Val Loss,Best average LR,seed 1 Val Loss,seed 1 Epochs Used,seed 2 Val Loss,seed 2 Epochs Used
0,1,1,dln_gdp_sa_lag2,0.00139,0.01,0.001407,11,0.001372,10
1,2,2,"dln_gdp_sa_lag2, pc1_lag4",0.001264,0.01,0.001175,10,0.001352,9
2,3,3,"dln_gdp_sa_lag2, pc1_lag4, dln_region_lag2",0.001244,0.01,0.001184,10,0.001304,10
3,4,4,"dln_gdp_sa_lag2, pc1_lag4, dln_region_lag2, pc...",0.001275,0.01,0.001186,10,0.001364,10
4,5,5,"dln_gdp_sa_lag2, pc1_lag4, dln_region_lag2, pc...",0.00131,0.01,0.001176,10,0.001445,10
5,6,6,"dln_gdp_sa_lag2, pc1_lag4, dln_region_lag2, pc...",0.001351,0.01,0.001261,11,0.00144,11
6,7,7,"dln_gdp_sa_lag2, pc1_lag4, dln_region_lag2, pc...",0.001284,0.01,0.001242,10,0.001326,11


In [None]:
import os
from google.colab import drive
import pandas as pd

# Mount Google Drive unless the mount point already exists
if os.path.exists('/content/drive'):
    print("Google Drive is already mounted.")
else:
    drive.mount('/content/drive')

# Destination of the summary CSV
save_path = '/content/drive/My Drive'
summary_filename_2l2n = os.path.join(save_path, 'forward_selection_GRU2L2N_summary.csv')

# Write the summary table if an earlier cell created it.
# NOTE(review): `summary_df_2l2n` is looked up in globals(); confirm which cell defines it.
if 'summary_df_2l2n' not in globals():
    print("Error: 'summary_df_2l2n' DataFrame not found. Please ensure the forward selection code for GRU 2 Layers, 2 Neurons Each has been executed.")
else:
    summary_df_2l2n.to_csv(summary_filename_2l2n, index=False)
    print(f"Forward selection summary (GRU 2 Layers, 2 Neurons Each) saved to: {summary_filename_2l2n}")

Google Drive is already mounted.
Forward selection summary (GRU 2 Layers, 2 Neurons Each) saved to: /content/drive/My Drive/forward_selection_GRU2L2N_summary.csv


# CNN

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.optimizers import Adam

def build_cnn_model(input_shape, output_shape, filters=3, kernel_size=4, learning_rate=None):
    """
    Build and compile a small 1-D CNN for time series forecasting.

    Architecture: Conv1D(tanh, causal padding) -> MaxPooling1D(2) -> Flatten
    -> Dense(output_shape, activation='tanh').

    Args:
        input_shape (tuple): Shape of one input sample, (timesteps, features).
        output_shape (int): Number of units in the output layer (prediction length).
        filters (int): Number of convolution filters. Defaults to 3, the value
            that used to be hard-coded.
        kernel_size (int): Width of the convolution kernel. Defaults to 4, the
            value that used to be hard-coded.
        learning_rate (float, optional): Learning rate handed to the Adam
            optimizer. When None (the default) Adam is created without
            arguments, exactly as before, so existing callers are unaffected.

    Returns:
        keras.Sequential: Model compiled with the Adam optimizer and MSE loss.
    """
    optimizer = Adam() if learning_rate is None else Adam(learning_rate=learning_rate)

    model = Sequential()
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation='tanh', padding='causal', input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2))
    # Collapse the (time, filters) feature map into one vector for the dense head
    model.add(Flatten())
    # tanh keeps every predicted value inside (-1, 1).
    model.add(Dense(units=output_shape, activation='tanh'))

    model.compile(optimizer=optimizer, loss='mse')
    return model

print("CNN model building function defined.")

CNN model building function defined.


In [None]:
import tensorflow as tf
import numpy as np
import random
import joblib
import os
import pandas as pd
from sklearn.metrics import mean_squared_error
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Reproducibility settings: set the TF_DETERMINISTIC_OPS flag and fix the list
# of seeds each candidate model is trained with.
# NOTE(review): the flag is set after `import tensorflow` - confirm it is still honoured.
os.environ['TF_DETERMINISTIC_OPS'] = '1'
random_seeds = [1, 2]

# Single-layer GRU model builder.
# NOTE(review): nothing in this cell calls it (the selection loop below uses
# build_cnn_model); kept because other cells may rely on it.
def build_gru_model(input_shape, output_shape):
    """Build and compile a one-layer GRU forecaster (3 units).

    Args:
        input_shape (tuple): Shape of one input sample, (timesteps, features).
        output_shape (int): Number of units in the tanh output layer.

    Returns:
        The Sequential model, compiled with a default Adam optimizer and MSE loss.
    """
    layer_stack = [
        GRU(units=3, return_sequences=False, input_shape=input_shape),
        Dropout(0.1),
        Dense(units=output_shape, activation='tanh'),
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer=Adam(), loss='mse')
    return network

# Target series and the pool of lagged regressors forward selection may pick from
target = 'dln_gdp_sa'
candidate_independent_variables = [
    'pc3_lag4', 'dln_gdp_sa_lag1', 'dln_gdp_sa_lag4', 'dpc2_lag4',
    'dln_region_lag3', 'pc1_lag1', 'dln_city_lag1', 'dln_gdp_sa_lag3',
    'dln_gdp_sa_lag2', 'dln_region_lag2', 'pc3_lag3', 'dln_city_lag2',
    'dpc2_lag3', 'dpc2_lag1', 'pc1_lag4', 'pc3_lag1',
]

# Sliding-window geometry: rows fed to the model / rows predicted
input_length, output_length = 40, 5

# Training hyper-parameters
learning_rates = [0.01, 0.001]
epochs, batch_size = 50, 4
early_stopping_patience = 10
# NOTE(review): confirm this threshold is appropriate for the magnitude of the
# validation losses printed by this cell.
early_stopping_min_delta = 0.001

# Forward-selection state
selected_variables = []
remaining_variables = list(candidate_independent_variables)  # independent copy
forward_selection_results = []  # one summary row per completed step

print("开始前向选择，使用多个随机种子...")
print(f"随机种子: {random_seeds}")

# Greedy forward selection. Each of the 7 steps tries every remaining variable
# on top of the already selected ones and keeps the (variable, learning rate)
# pair with the lowest validation loss averaged over `random_seeds`.
for step in range(7):
    best_variable_for_this_step = None
    best_avg_loss_for_this_step = float('inf')
    best_lr_for_this_step = None
    candidate_results = []  # one record per (candidate variable, learning rate)

    print(f"\n步骤 {step + 1}: 选择第 {len(selected_variables) + 1} 个变量")
    print(f"当前已选变量: {selected_variables}")
    print(f"剩余候选变量: {remaining_variables}")

    for candidate_variable in remaining_variables:
        current_variables = selected_variables + [candidate_variable]
        print(f"\n测试变量组合: {current_variables}")

        # Build the sliding-window samples for this variable combination
        X_full_current, y_full_current = create_sliding_window_datasets(
            data_clean, input_length, output_length, current_variables, target
        )

        num_samples_current = X_full_current.shape[0]
        num_train_samples_current = 8
        num_val_samples_current = 2

        if num_samples_current < num_train_samples_current + num_val_samples_current:
            print(f"跳过组合，样本不足 ({num_samples_current})")
            continue

        # The first 8 windows train the model, the following 2 validate it
        X_train_current = X_full_current[:num_train_samples_current]
        y_train_current = y_full_current[:num_train_samples_current]
        X_val_current = X_full_current[num_train_samples_current:num_train_samples_current + num_val_samples_current]
        y_val_current = y_full_current[num_train_samples_current:num_train_samples_current + num_val_samples_current]

        # Train once per learning rate and seed
        for lr in learning_rates:
            print(f"  学习率: {lr}")
            seed_losses = []
            seed_models = {}
            seed_epochs = []

            for seed in random_seeds:
                # Re-seed every random source before the model is built
                np.random.seed(seed)
                tf.random.set_seed(seed)
                os.environ['PYTHONHASHSEED'] = str(seed)
                random.seed(seed)

                print(f"    种子 {seed}:")

                # Build the model, then re-compile it with the learning rate under test
                model = build_cnn_model(input_shape=(input_length, len(current_variables)),
                                      output_shape=output_length)
                model.compile(optimizer=Adam(learning_rate=lr), loss='mse')

                # Early-stopping callback
                early_stopping = EarlyStopping(
                    monitor='val_loss',
                    patience=early_stopping_patience,
                    min_delta=early_stopping_min_delta,
                    restore_best_weights=True
                )

                # Train the model
                history = model.fit(
                    X_train_current, y_train_current,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_data=(X_val_current, y_val_current),
                    callbacks=[early_stopping],
                    verbose=0
                )

                # Record the validation loss of the weights the model holds
                # after training. EarlyStopping only treats an epoch as "best"
                # when it beats the previous best by more than `min_delta`, so
                # min(history) may belong to an epoch whose weights were
                # discarded. Evaluating the trained model keeps the recorded
                # loss consistent with the model that is saved below.
                val_loss_history = np.asarray(history.history['val_loss'])
                min_val_loss = float(np.ravel(model.evaluate(
                    X_val_current, y_val_current,
                    batch_size=batch_size, verbose=0
                ))[0])
                # Epoch whose logged validation loss matches the kept weights
                best_epoch = int(np.argmin(np.abs(val_loss_history - min_val_loss))) + 1

                # Record the per-seed results
                seed_losses.append(min_val_loss)
                seed_models[seed] = model
                seed_epochs.append(best_epoch)

                print(f"      验证损失: {min_val_loss:.4f} (轮次: {best_epoch})")

            # Average the loss over the seeds
            avg_loss = np.mean(seed_losses)
            print(f"  平均验证损失: {avg_loss:.4f} (学习率 {lr})")

            # Store the record for this (variable, learning rate) pair
            candidate_results.append({
                'variable': candidate_variable,
                'lr': lr,
                'avg_loss': avg_loss,
                'seed_losses': seed_losses,
                'seed_epochs': seed_epochs,
                'seed_models': seed_models
            })

            # Is this the best combination seen so far in this step?
            if avg_loss < best_avg_loss_for_this_step:
                best_avg_loss_for_this_step = avg_loss
                best_variable_for_this_step = candidate_variable
                best_lr_for_this_step = lr

                # Checkpoint the better of the seed models for the running best
                best_seed_idx = np.argmin(seed_losses)
                best_seed = random_seeds[best_seed_idx]
                best_model_for_step = seed_models[best_seed]

                temp_model_save_path = f'/content/drive/My Drive/cnn3temp_best_model_step_{step+1}.keras'
                best_model_for_step.save(temp_model_save_path)
                print(f"  临时保存步骤 {step+1} 最佳模型: '{candidate_variable}', 学习率 {lr}, 种子 {best_seed}")

    # Move the winning variable from the candidate pool to the selected list
    if best_variable_for_this_step is not None:
        selected_variables.append(best_variable_for_this_step)
        remaining_variables.remove(best_variable_for_this_step)

        print(f"\n--- 步骤 {step + 1} 结果 ---")
        print(f"选中变量: '{best_variable_for_this_step}'")
        print(f"最佳平均验证损失: {best_avg_loss_for_this_step:.4f}")
        print(f"最佳学习率: {best_lr_for_this_step}")
        print(f"当前已选变量: {selected_variables}")

        # Fetch the record of the winning (variable, learning rate) pair
        best_candidate_data = [d for d in candidate_results
                              if d['variable'] == best_variable_for_this_step
                              and d['lr'] == best_lr_for_this_step][0]

        # Summary row for the results table
        step_result = {
            'Step': step + 1,
            'Variable number': len(selected_variables),
            'Selected Variables': ', '.join(selected_variables),
            'Best average Val Loss': best_avg_loss_for_this_step,
            'Best average LR': best_lr_for_this_step,
        }

        # Per-seed details
        for i, seed in enumerate(random_seeds):
            step_result[f'seed {seed} Val Loss'] = best_candidate_data['seed_losses'][i]
            step_result[f'seed {seed} Epochs Used'] = best_candidate_data['seed_epochs'][i]

        forward_selection_results.append(step_result)

        # Save the final model for this step (the seed with the lowest loss)
        best_seed_idx = np.argmin(best_candidate_data['seed_losses'])
        best_seed = random_seeds[best_seed_idx]
        best_model = best_candidate_data['seed_models'][best_seed]

        # Naming rule: CNN3_{number of selected variables}
        final_model_name = f'CNN3_{len(selected_variables)}.keras'
        final_model_save_path = os.path.join('/content/drive/My Drive', final_model_name)
        best_model.save(final_model_save_path)
        print(f"已保存最优模型到Google Drive: '{final_model_name}' (seed {best_seed})")

    else:
        # Only reached when every candidate was skipped for lack of samples;
        # the loop never compares against the previous step's loss.
        print(f"\n--- 步骤 {step + 1} 结果 ---")
        print("本步骤无改进变量")
        break

print("\n前向选择完成")
print(f"最终选中变量: {selected_variables}")

# Save the results table
results_df = pd.DataFrame(forward_selection_results)
results_csv_path = os.path.join('/content/drive/My Drive', 'CNN3_results.csv')
results_df.to_csv(results_csv_path, index=False)
print(f"已保存前向选择结果表格到Google Drive: {results_csv_path}")

# Show the results table
print("\n前向选择详细结果:")
display(results_df)

开始前向选择，使用多个随机种子...
随机种子: [1, 2]

步骤 1: 选择第 1 个变量
当前已选变量: []
剩余候选变量: ['pc3_lag4', 'dln_gdp_sa_lag1', 'dln_gdp_sa_lag4', 'dpc2_lag4', 'dln_region_lag3', 'pc1_lag1', 'dln_city_lag1', 'dln_gdp_sa_lag3', 'dln_gdp_sa_lag2', 'dln_region_lag2', 'pc3_lag3', 'dln_city_lag2', 'dpc2_lag3', 'dpc2_lag1', 'pc1_lag4', 'pc3_lag1']

测试变量组合: ['pc3_lag4']
  学习率: 0.01
    种子 1:
      验证损失: 0.0136 (轮次: 7)
    种子 2:
      验证损失: 0.0232 (轮次: 4)
  平均验证损失: 0.0184 (学习率 0.01)
  临时保存步骤 1 最佳模型: 'pc3_lag4', 学习率 0.01, 种子 1
  学习率: 0.001
    种子 1:
      验证损失: 0.0291 (轮次: 50)
    种子 2:
      验证损失: 0.0269 (轮次: 36)
  平均验证损失: 0.0280 (学习率 0.001)

测试变量组合: ['dln_gdp_sa_lag1']
  学习率: 0.01
    种子 1:
      验证损失: 0.0158 (轮次: 11)
    种子 2:
      验证损失: 0.0350 (轮次: 15)
  平均验证损失: 0.0254 (学习率 0.01)
  学习率: 0.001
    种子 1:
      验证损失: 0.0284 (轮次: 40)
    种子 2:
      验证损失: 0.0561 (轮次: 50)
  平均验证损失: 0.0423 (学习率 0.001)

测试变量组合: ['dln_gdp_sa_lag4']
  学习率: 0.01
    种子 1:
      验证损失: 0.0301 (轮次: 17)
    种子 2:
      验证损失: 0.0486 (轮次: 2)
  平均验证损

Unnamed: 0,Step,Variable number,Selected Variables,Best average Val Loss,Best average LR,seed 1 Val Loss,seed 1 Epochs Used,seed 2 Val Loss,seed 2 Epochs Used
0,1,1,dln_region_lag3,0.006609,0.01,0.004201,8,0.009016,10
1,2,2,"dln_region_lag3, dln_region_lag2",0.010909,0.01,0.005903,6,0.015915,15
2,3,3,"dln_region_lag3, dln_region_lag2, dln_city_lag2",0.011786,0.01,0.011168,11,0.012405,10
3,4,4,"dln_region_lag3, dln_region_lag2, dln_city_lag...",0.005873,0.01,0.007279,12,0.004466,16
4,5,5,"dln_region_lag3, dln_region_lag2, dln_city_lag...",0.013651,0.01,0.014328,14,0.012974,14
5,6,6,"dln_region_lag3, dln_region_lag2, dln_city_lag...",0.016414,0.01,0.019012,12,0.013817,34
6,7,7,"dln_region_lag3, dln_region_lag2, dln_city_lag...",0.039162,0.01,0.012866,31,0.065458,12


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.optimizers import Adam

def build_cnn_model_filters4(input_shape, output_shape):
    """
    Build and compile the 1-D CNN forecaster variant that uses 4 Conv1D filters.

    Architecture: Conv1D(4 filters, kernel 4, tanh, causal padding)
    -> MaxPooling1D(2) -> Flatten -> Dense(output_shape, activation='tanh').

    Args:
        input_shape (tuple): Shape of one input sample, (timesteps, features).
        output_shape (int): Number of units in the output layer (prediction length).

    Returns:
        The Sequential model, compiled with a default Adam optimizer and MSE loss.
    """
    layer_stack = [
        Conv1D(filters=4, kernel_size=4, activation='tanh', padding='causal', input_shape=input_shape),
        MaxPooling1D(pool_size=2),
        Flatten(),  # flatten the pooled feature map for the dense head
        Dense(units=output_shape, activation='tanh'),  # forecasting output layer
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer=Adam(), loss='mse')
    return network

print("CNN model building function with filters=4 defined.")

CNN model building function with filters=4 defined.


In [None]:
# Reproducibility settings: set the TF_DETERMINISTIC_OPS flag and fix the list
# of seeds each candidate model is trained with.
# NOTE(review): the flag is set after TensorFlow was imported by earlier cells -
# confirm it is still honoured.
os.environ['TF_DETERMINISTIC_OPS'] = '1'
random_seeds = [1, 2]

# Single-layer GRU model builder (same definition as in the earlier selection cells).
def build_gru_model(input_shape, output_shape):
    """Build and compile a one-layer GRU forecaster (3 units).

    Args:
        input_shape (tuple): Shape of one input sample, (timesteps, features).
        output_shape (int): Number of units in the tanh output layer.

    Returns:
        The Sequential model, compiled with a default Adam optimizer and MSE loss.
    """
    layer_stack = [
        GRU(units=3, return_sequences=False, input_shape=input_shape),
        Dropout(0.1),
        Dense(units=output_shape, activation='tanh'),
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer=Adam(), loss='mse')
    return network

# Target series and the pool of candidate predictors
target = 'dln_gdp_sa'
candidate_independent_variables = [
    'pc3_lag4', 'dln_gdp_sa_lag1', 'dln_gdp_sa_lag4', 'dpc2_lag4',
    'dln_region_lag3', 'pc1_lag1', 'dln_city_lag1', 'dln_gdp_sa_lag3',
    'dln_gdp_sa_lag2', 'dln_region_lag2', 'pc3_lag3', 'dln_city_lag2',
    'dpc2_lag3', 'dpc2_lag1', 'pc1_lag4', 'pc3_lag1',
]

# Sliding-window geometry: window length fed to the model / number of steps predicted
input_length, output_length = 40, 5

# Training hyperparameters
learning_rates = [0.01, 0.001]
epochs, batch_size = 50, 4
early_stopping_patience, early_stopping_min_delta = 10, 0.001

# Forward-selection state
forward_selection_results = []  # one summary row per completed step
selected_variables = []
remaining_variables = list(candidate_independent_variables)

print("开始前向选择，使用多个随机种子...")
print(f"随机种子: {random_seeds}")

# Greedy forward selection (7 steps). At each step every remaining candidate is
# appended to the already-selected variables, a CNN is trained for each
# learning rate and each random seed, and the candidate/learning-rate pair with
# the lowest seed-averaged validation loss is kept.
#
# NOTE(review): best_avg_loss_for_this_step is reset to infinity at every step,
# so a variable is always added even when the loss is worse than at the
# previous step. The "no improving variable" branch is only reached when every
# candidate was skipped for lack of samples.


class _LowestValLossWeights(tf.keras.callbacks.Callback):
    """Snapshot the weights of the epoch with the lowest val_loss.

    EarlyStopping(restore_best_weights=True) only treats an epoch as "best"
    when it beats the previous best by more than min_delta, so the weights it
    restores may come from a different epoch than min(history['val_loss']).
    This callback tracks the plain minimum so that the saved model is the one
    whose loss is reported.
    """

    def on_train_begin(self, logs=None):
        self.best_val_loss = float('inf')
        self.best_weights = None

    def on_epoch_end(self, epoch, logs=None):
        val_loss = (logs or {}).get('val_loss')
        # Strict '<' keeps the first epoch that reaches the minimum, matching
        # history['val_loss'].index(min(...)) used for the reported epoch.
        if val_loss is not None and val_loss < self.best_val_loss:
            self.best_val_loss = val_loss
            self.best_weights = self.model.get_weights()


# Split sizes are the same for every candidate, so they are set once:
# the first 8 windows train the model and the next 2 validate it.
num_train_samples_current = 8
num_val_samples_current = 2

for step in range(7):
    best_variable_for_this_step = None
    best_avg_loss_for_this_step = float('inf')
    best_lr_for_this_step = None
    candidate_results = []  # one record per (candidate variable, learning rate)

    print(f"\n步骤 {step + 1}: 选择第 {len(selected_variables) + 1} 个变量")
    print(f"当前已选变量: {selected_variables}")
    print(f"剩余候选变量: {remaining_variables}")

    for candidate_variable in remaining_variables:
        current_variables = selected_variables + [candidate_variable]
        print(f"\n测试变量组合: {current_variables}")

        # Build the sliding-window dataset for this variable combination
        X_full_current, y_full_current = create_sliding_window_datasets(
            data_clean, input_length, output_length, current_variables, target
        )

        num_samples_current = X_full_current.shape[0]

        if num_samples_current < num_train_samples_current + num_val_samples_current:
            print(f"跳过组合，样本不足 ({num_samples_current})")
            continue

        X_train_current = X_full_current[:num_train_samples_current]
        y_train_current = y_full_current[:num_train_samples_current]
        X_val_current = X_full_current[num_train_samples_current:num_train_samples_current + num_val_samples_current]
        y_val_current = y_full_current[num_train_samples_current:num_train_samples_current + num_val_samples_current]

        # Train once per learning rate and per seed
        for lr in learning_rates:
            print(f"  学习率: {lr}")
            seed_losses = []
            seed_models = {}
            seed_epochs = []

            for seed in random_seeds:
                # Seed every RNG before the model is built so that weight
                # initialisation is reproducible for this seed
                np.random.seed(seed)
                tf.random.set_seed(seed)
                os.environ['PYTHONHASHSEED'] = str(seed)
                random.seed(seed)

                print(f"    种子 {seed}:")

                # Build the model, then recompile it with the learning rate under test
                model = build_cnn_model_filters4(input_shape=(input_length, len(current_variables)),
                                      output_shape=output_length)
                model.compile(optimizer=Adam(learning_rate=lr), loss='mse')

                # Early-stopping callback
                early_stopping = EarlyStopping(
                    monitor='val_loss',
                    patience=early_stopping_patience,
                    min_delta=early_stopping_min_delta,
                    restore_best_weights=True
                )
                best_weights_tracker = _LowestValLossWeights()

                # Train. The tracker is listed first so it snapshots an epoch's
                # weights before EarlyStopping can overwrite them on the
                # stopping epoch.
                history = model.fit(
                    X_train_current, y_train_current,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_data=(X_val_current, y_val_current),
                    callbacks=[best_weights_tracker, early_stopping],
                    verbose=0
                )

                # Best validation loss and the (1-based) epoch that produced it
                min_val_loss = min(history.history['val_loss'])
                best_epoch = history.history['val_loss'].index(min_val_loss) + 1

                # Put the model in the state that achieved min_val_loss, so that
                # any model saved below matches the loss recorded for it
                if best_weights_tracker.best_weights is not None:
                    model.set_weights(best_weights_tracker.best_weights)

                # Record the outcome for this seed
                seed_losses.append(min_val_loss)
                seed_models[seed] = model
                seed_epochs.append(best_epoch)

                print(f"      验证损失: {min_val_loss:.4f} (轮次: {best_epoch})")

            # Average the loss over seeds
            avg_loss = np.mean(seed_losses)
            print(f"  平均验证损失: {avg_loss:.4f} (学习率 {lr})")

            # Keep the full record for this candidate / learning rate
            candidate_results.append({
                'variable': candidate_variable,
                'lr': lr,
                'avg_loss': avg_loss,
                'seed_losses': seed_losses,
                'seed_epochs': seed_epochs,
                'seed_models': seed_models
            })

            # Is this the best combination seen so far in this step?
            if avg_loss < best_avg_loss_for_this_step:
                best_avg_loss_for_this_step = avg_loss
                best_variable_for_this_step = candidate_variable
                best_lr_for_this_step = lr

                # Provisionally save the better of the seed models
                best_seed_idx = np.argmin(seed_losses)
                best_seed = random_seeds[best_seed_idx]
                best_model_for_step = seed_models[best_seed]

                temp_model_save_path = f'/content/drive/My Drive/cnn4temp_best_model_step_{step+1}.keras'
                best_model_for_step.save(temp_model_save_path)
                print(f"  临时保存步骤 {step+1} 最佳模型: '{candidate_variable}', 学习率 {lr}, 种子 {best_seed}")

    # Move the winning variable from the remaining pool to the selected list
    if best_variable_for_this_step is not None:
        selected_variables.append(best_variable_for_this_step)
        remaining_variables.remove(best_variable_for_this_step)

        print(f"\n--- 步骤 {step + 1} 结果 ---")
        print(f"选中变量: '{best_variable_for_this_step}'")
        print(f"最佳平均验证损失: {best_avg_loss_for_this_step:.4f}")
        print(f"最佳学习率: {best_lr_for_this_step}")
        print(f"当前已选变量: {selected_variables}")

        # Fetch the record of the winning (variable, learning rate) pair
        best_candidate_data = next(
            d for d in candidate_results
            if d['variable'] == best_variable_for_this_step
            and d['lr'] == best_lr_for_this_step
        )

        # Summary row for the results table
        step_result = {
            'Step': step + 1,
            'Variable number': len(selected_variables),
            'Selected Variables': ', '.join(selected_variables),
            'Best average Val Loss': best_avg_loss_for_this_step,
            'Best average LR': best_lr_for_this_step,
        }

        # Per-seed details
        for i, seed in enumerate(random_seeds):
            step_result[f'seed {seed} Val Loss'] = best_candidate_data['seed_losses'][i]
            step_result[f'seed {seed} Epochs Used'] = best_candidate_data['seed_epochs'][i]

        forward_selection_results.append(step_result)

        # Save the final model for this step: the seed with the lowest validation loss
        best_seed_idx = np.argmin(best_candidate_data['seed_losses'])
        best_seed = random_seeds[best_seed_idx]
        best_model = best_candidate_data['seed_models'][best_seed]

        # File naming rule: CNN4_{number of selected variables}
        final_model_name = f'CNN4_{len(selected_variables)}.keras'
        final_model_save_path = os.path.join('/content/drive/My Drive', final_model_name)
        best_model.save(final_model_save_path)
        print(f"已保存最优模型到Google Drive: '{final_model_name}' (seed {best_seed})")

    else:
        print(f"\n--- 步骤 {step + 1} 结果 ---")
        print("本步骤无改进变量")
        break

# Report the outcome of the forward selection
print("\n前向选择完成", f"最终选中变量: {selected_variables}", sep="\n")

# Persist the per-step results table to Google Drive as CSV
results_csv_path = os.path.join('/content/drive/My Drive', 'CNN4_results.csv')
results_df = pd.DataFrame(forward_selection_results)
results_df.to_csv(results_csv_path, index=False)
print(f"已保存前向选择结果表格到Google Drive: {results_csv_path}")

# Show the table inline
print("\n前向选择详细结果:")
display(results_df)

开始前向选择，使用多个随机种子...
随机种子: [1, 2]

步骤 1: 选择第 1 个变量
当前已选变量: []
剩余候选变量: ['pc3_lag4', 'dln_gdp_sa_lag1', 'dln_gdp_sa_lag4', 'dpc2_lag4', 'dln_region_lag3', 'pc1_lag1', 'dln_city_lag1', 'dln_gdp_sa_lag3', 'dln_gdp_sa_lag2', 'dln_region_lag2', 'pc3_lag3', 'dln_city_lag2', 'dpc2_lag3', 'dpc2_lag1', 'pc1_lag4', 'pc3_lag1']

测试变量组合: ['pc3_lag4']
  学习率: 0.01
    种子 1:
      验证损失: 0.0095 (轮次: 11)
    种子 2:
      验证损失: 0.0304 (轮次: 3)
  平均验证损失: 0.0199 (学习率 0.01)
  临时保存步骤 1 最佳模型: 'pc3_lag4', 学习率 0.01, 种子 1
  学习率: 0.001
    种子 1:
      验证损失: 0.0128 (轮次: 50)
    种子 2:
      验证损失: 0.0431 (轮次: 35)
  平均验证损失: 0.0280 (学习率 0.001)

测试变量组合: ['dln_gdp_sa_lag1']
  学习率: 0.01
    种子 1:
      验证损失: 0.0389 (轮次: 1)
    种子 2:
      验证损失: 0.0430 (轮次: 10)
  平均验证损失: 0.0409 (学习率 0.01)
  学习率: 0.001
    种子 1:
      验证损失: 0.0364 (轮次: 8)
    种子 2:
      验证损失: 0.0477 (轮次: 47)
  平均验证损失: 0.0420 (学习率 0.001)

测试变量组合: ['dln_gdp_sa_lag4']
  学习率: 0.01
    种子 1:
      验证损失: 0.0183 (轮次: 2)
    种子 2:
      验证损失: 0.0518 (轮次: 8)
  平均验证损失:

Unnamed: 0,Step,Variable number,Selected Variables,Best average Val Loss,Best average LR,seed 1 Val Loss,seed 1 Epochs Used,seed 2 Val Loss,seed 2 Epochs Used
0,1,1,dln_city_lag1,0.004625,0.001,0.004598,26,0.004652,20
1,2,2,"dln_city_lag1, dln_city_lag2",0.005959,0.01,0.003881,6,0.008038,8
2,3,3,"dln_city_lag1, dln_city_lag2, pc3_lag4",0.01027,0.001,0.010703,50,0.009837,13
3,4,4,"dln_city_lag1, dln_city_lag2, pc3_lag4, pc3_lag3",0.019232,0.01,0.028578,13,0.009885,6
4,5,5,"dln_city_lag1, dln_city_lag2, pc3_lag4, pc3_la...",0.021572,0.01,0.02947,10,0.013674,15
5,6,6,"dln_city_lag1, dln_city_lag2, pc3_lag4, pc3_la...",0.009689,0.01,0.007192,10,0.012186,5
6,7,7,"dln_city_lag1, dln_city_lag2, pc3_lag4, pc3_la...",0.040992,0.01,0.016795,1,0.065189,24


In [None]:
import os
from google.colab import drive
import pandas as pd

# Mount Google Drive unless the mount point already exists
if os.path.exists('/content/drive'):
    print("Google Drive is already mounted.")
else:
    drive.mount('/content/drive')

# Destination of the summary CSV (CNN, filters=4)
save_path = '/content/drive/My Drive'
summary_filename_cnn4 = os.path.join(save_path, 'CNN4_summary.csv')

# Write the summary table if it exists in the kernel, otherwise report it missing.
# NOTE(review): the forward-selection cell above stores its table in `results_df`;
# `summary_df_cnn4` is presumably created in another cell -- confirm it is still
# defined after Restart & Run All.
if 'summary_df_cnn4' not in globals():
    print("Error: 'summary_df_cnn4' DataFrame not found. Please ensure the forward selection code for CNN Filters=4 has been executed.")
else:
    summary_df_cnn4.to_csv(summary_filename_cnn4, index=False)
    print(f"Forward selection summary (CNN Filters=4) saved to: {summary_filename_cnn4}")

Google Drive is already mounted.
Forward selection summary (CNN Filters=4) saved to: /content/drive/My Drive/CNN4_summary.csv


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.optimizers import Adam

def build_cnn_model_2layers_2filters(input_shape, output_shape, learning_rate=0.001):
    """
    Build and compile a two-block 1D CNN (2 filters per Conv1D layer) for
    multi-step time series forecasting.

    Architecture: Input -> [Conv1D(filters=2, kernel_size=4, tanh, causal padding)
    -> MaxPooling1D(pool_size=2)] x 2 -> Flatten -> Dense(output_shape, tanh).

    Args:
        input_shape (tuple): Shape of one input sample, (timesteps, features).
        output_shape (int): Number of units in the output layer (prediction length).
        learning_rate (float): Learning rate handed to the Adam optimizer.
            Defaults to 0.001, Adam's own default, so existing two-argument
            calls compile exactly as before.

    Returns:
        tensorflow.keras.models.Sequential: Model compiled with Adam and MSE loss.

    Note:
        The output layer uses a tanh activation, so predictions are confined to
        the interval (-1, 1).
    """
    # Local import: `Input` is not in this cell's import list. Declaring the
    # input as its own layer replaces the `input_shape=` layer argument, which
    # Keras 3 warns against in Sequential models.
    from tensorflow.keras.layers import Input

    model = Sequential([
        Input(shape=input_shape),
        # First convolutional block
        Conv1D(filters=2, kernel_size=4, activation='tanh', padding='causal'),
        MaxPooling1D(pool_size=2),
        # Second convolutional block; its input is the pooled output of the first
        Conv1D(filters=2, kernel_size=4, activation='tanh', padding='causal'),
        MaxPooling1D(pool_size=2),
        # Flatten before the Dense head
        Flatten(),
        # Output layer: one unit per forecast step
        Dense(units=output_shape, activation='tanh'),
    ])

    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    return model

print("New 2-layer 1D CNN model building function with 2 filters per layer defined.")

New 2-layer 1D CNN model building function with 2 filters per layer defined.


In [None]:
# Reproducibility settings.
# Seeds for repeated training runs: the selection loop below trains every
# (variable set, learning rate) configuration once per seed in this list.
random_seeds = [1, 2]
# NOTE(review): presumably meant to force deterministic TensorFlow ops; confirm the
# variable is honoured when it is set at this point rather than before TensorFlow starts.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# GRU model builder. The forward-selection loop in this cell does not call it;
# that loop builds its models with build_cnn_model_2layers_2filters.
def build_gru_model(input_shape, output_shape):
    """
    Build and compile a small GRU forecaster.

    Layers: GRU(3 units, last state only) -> Dropout(0.1) -> Dense(output_shape, tanh).

    Args:
        input_shape: Shape passed to the GRU layer as `input_shape`
            (presumably (timesteps, features) -- confirm against the caller).
        output_shape: Number of units in the Dense output layer.

    Returns:
        The Sequential model, compiled with a default Adam optimizer and MSE loss.
    """
    gru_net = Sequential([
        GRU(units=3, return_sequences=False, input_shape=input_shape),
        Dropout(0.1),
        Dense(units=output_shape, activation='tanh'),
    ])
    gru_net.compile(optimizer=Adam(), loss='mse')
    return gru_net

# Target series and the pool of candidate predictors
target = 'dln_gdp_sa'
candidate_independent_variables = [
    'pc3_lag4', 'dln_gdp_sa_lag1', 'dln_gdp_sa_lag4', 'dpc2_lag4',
    'dln_region_lag3', 'pc1_lag1', 'dln_city_lag1', 'dln_gdp_sa_lag3',
    'dln_gdp_sa_lag2', 'dln_region_lag2', 'pc3_lag3', 'dln_city_lag2',
    'dpc2_lag3', 'dpc2_lag1', 'pc1_lag4', 'pc3_lag1',
]

# Sliding-window geometry: window length fed to the model / number of steps predicted
input_length, output_length = 40, 5

# Training hyperparameters
learning_rates = [0.01, 0.001]
epochs, batch_size = 50, 4
early_stopping_patience, early_stopping_min_delta = 10, 0.001

# Forward-selection state
forward_selection_results = []  # one summary row per completed step
selected_variables = []
remaining_variables = list(candidate_independent_variables)

print("开始前向选择，使用多个随机种子...")
print(f"随机种子: {random_seeds}")

# Greedy forward selection (7 steps). At each step every remaining candidate is
# appended to the already-selected variables, a 2-layer CNN is trained for each
# learning rate and each random seed, and the candidate/learning-rate pair with
# the lowest seed-averaged validation loss is kept.
#
# NOTE(review): best_avg_loss_for_this_step is reset to infinity at every step,
# so a variable is always added even when the loss is worse than at the
# previous step. The "no improving variable" branch is only reached when every
# candidate was skipped for lack of samples.


class _LowestValLossWeights(tf.keras.callbacks.Callback):
    """Snapshot the weights of the epoch with the lowest val_loss.

    EarlyStopping(restore_best_weights=True) only treats an epoch as "best"
    when it beats the previous best by more than min_delta, so the weights it
    restores may come from a different epoch than min(history['val_loss']).
    This callback tracks the plain minimum so that the saved model is the one
    whose loss is reported.
    """

    def on_train_begin(self, logs=None):
        self.best_val_loss = float('inf')
        self.best_weights = None

    def on_epoch_end(self, epoch, logs=None):
        val_loss = (logs or {}).get('val_loss')
        # Strict '<' keeps the first epoch that reaches the minimum, matching
        # history['val_loss'].index(min(...)) used for the reported epoch.
        if val_loss is not None and val_loss < self.best_val_loss:
            self.best_val_loss = val_loss
            self.best_weights = self.model.get_weights()


# Split sizes are the same for every candidate, so they are set once:
# the first 8 windows train the model and the next 2 validate it.
num_train_samples_current = 8
num_val_samples_current = 2

for step in range(7):
    best_variable_for_this_step = None
    best_avg_loss_for_this_step = float('inf')
    best_lr_for_this_step = None
    candidate_results = []  # one record per (candidate variable, learning rate)

    print(f"\n步骤 {step + 1}: 选择第 {len(selected_variables) + 1} 个变量")
    print(f"当前已选变量: {selected_variables}")
    print(f"剩余候选变量: {remaining_variables}")

    for candidate_variable in remaining_variables:
        current_variables = selected_variables + [candidate_variable]
        print(f"\n测试变量组合: {current_variables}")

        # Build the sliding-window dataset for this variable combination
        X_full_current, y_full_current = create_sliding_window_datasets(
            data_clean, input_length, output_length, current_variables, target
        )

        num_samples_current = X_full_current.shape[0]

        if num_samples_current < num_train_samples_current + num_val_samples_current:
            print(f"跳过组合，样本不足 ({num_samples_current})")
            continue

        X_train_current = X_full_current[:num_train_samples_current]
        y_train_current = y_full_current[:num_train_samples_current]
        X_val_current = X_full_current[num_train_samples_current:num_train_samples_current + num_val_samples_current]
        y_val_current = y_full_current[num_train_samples_current:num_train_samples_current + num_val_samples_current]

        # Train once per learning rate and per seed
        for lr in learning_rates:
            print(f"  学习率: {lr}")
            seed_losses = []
            seed_models = {}
            seed_epochs = []

            for seed in random_seeds:
                # Seed every RNG before the model is built so that weight
                # initialisation is reproducible for this seed
                np.random.seed(seed)
                tf.random.set_seed(seed)
                os.environ['PYTHONHASHSEED'] = str(seed)
                random.seed(seed)

                print(f"    种子 {seed}:")

                # Build the model, then recompile it with the learning rate under test
                model = build_cnn_model_2layers_2filters(input_shape=(input_length, len(current_variables)),
                                      output_shape=output_length)
                model.compile(optimizer=Adam(learning_rate=lr), loss='mse')

                # Early-stopping callback
                early_stopping = EarlyStopping(
                    monitor='val_loss',
                    patience=early_stopping_patience,
                    min_delta=early_stopping_min_delta,
                    restore_best_weights=True
                )
                best_weights_tracker = _LowestValLossWeights()

                # Train. The tracker is listed first so it snapshots an epoch's
                # weights before EarlyStopping can overwrite them on the
                # stopping epoch.
                history = model.fit(
                    X_train_current, y_train_current,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_data=(X_val_current, y_val_current),
                    callbacks=[best_weights_tracker, early_stopping],
                    verbose=0
                )

                # Best validation loss and the (1-based) epoch that produced it
                min_val_loss = min(history.history['val_loss'])
                best_epoch = history.history['val_loss'].index(min_val_loss) + 1

                # Put the model in the state that achieved min_val_loss, so that
                # any model saved below matches the loss recorded for it
                if best_weights_tracker.best_weights is not None:
                    model.set_weights(best_weights_tracker.best_weights)

                # Record the outcome for this seed
                seed_losses.append(min_val_loss)
                seed_models[seed] = model
                seed_epochs.append(best_epoch)

                print(f"      验证损失: {min_val_loss:.4f} (轮次: {best_epoch})")

            # Average the loss over seeds
            avg_loss = np.mean(seed_losses)
            print(f"  平均验证损失: {avg_loss:.4f} (学习率 {lr})")

            # Keep the full record for this candidate / learning rate
            candidate_results.append({
                'variable': candidate_variable,
                'lr': lr,
                'avg_loss': avg_loss,
                'seed_losses': seed_losses,
                'seed_epochs': seed_epochs,
                'seed_models': seed_models
            })

            # Is this the best combination seen so far in this step?
            if avg_loss < best_avg_loss_for_this_step:
                best_avg_loss_for_this_step = avg_loss
                best_variable_for_this_step = candidate_variable
                best_lr_for_this_step = lr

                # Provisionally save the better of the seed models
                best_seed_idx = np.argmin(seed_losses)
                best_seed = random_seeds[best_seed_idx]
                best_model_for_step = seed_models[best_seed]

                temp_model_save_path = f'/content/drive/My Drive/cnn22temp_best_model_step_{step+1}.keras'
                best_model_for_step.save(temp_model_save_path)
                print(f"  临时保存步骤 {step+1} 最佳模型: '{candidate_variable}', 学习率 {lr}, 种子 {best_seed}")

    # Move the winning variable from the remaining pool to the selected list
    if best_variable_for_this_step is not None:
        selected_variables.append(best_variable_for_this_step)
        remaining_variables.remove(best_variable_for_this_step)

        print(f"\n--- 步骤 {step + 1} 结果 ---")
        print(f"选中变量: '{best_variable_for_this_step}'")
        print(f"最佳平均验证损失: {best_avg_loss_for_this_step:.4f}")
        print(f"最佳学习率: {best_lr_for_this_step}")
        print(f"当前已选变量: {selected_variables}")

        # Fetch the record of the winning (variable, learning rate) pair
        best_candidate_data = next(
            d for d in candidate_results
            if d['variable'] == best_variable_for_this_step
            and d['lr'] == best_lr_for_this_step
        )

        # Summary row for the results table
        step_result = {
            'Step': step + 1,
            'Variable number': len(selected_variables),
            'Selected Variables': ', '.join(selected_variables),
            'Best average Val Loss': best_avg_loss_for_this_step,
            'Best average LR': best_lr_for_this_step,
        }

        # Per-seed details
        for i, seed in enumerate(random_seeds):
            step_result[f'seed {seed} Val Loss'] = best_candidate_data['seed_losses'][i]
            step_result[f'seed {seed} Epochs Used'] = best_candidate_data['seed_epochs'][i]

        forward_selection_results.append(step_result)

        # Save the final model for this step: the seed with the lowest validation loss
        best_seed_idx = np.argmin(best_candidate_data['seed_losses'])
        best_seed = random_seeds[best_seed_idx]
        best_model = best_candidate_data['seed_models'][best_seed]

        # File naming rule: CNN22_{number of selected variables}
        final_model_name = f'CNN22_{len(selected_variables)}.keras'
        final_model_save_path = os.path.join('/content/drive/My Drive', final_model_name)
        best_model.save(final_model_save_path)
        print(f"已保存最优模型到Google Drive: '{final_model_name}' (seed {best_seed})")

    else:
        print(f"\n--- 步骤 {step + 1} 结果 ---")
        print("本步骤无改进变量")
        break

# Report the outcome of the forward selection
print("\n前向选择完成", f"最终选中变量: {selected_variables}", sep="\n")

# Persist the per-step results table to Google Drive as CSV
results_csv_path = os.path.join('/content/drive/My Drive', 'CNN22_results.csv')
results_df = pd.DataFrame(forward_selection_results)
results_df.to_csv(results_csv_path, index=False)
print(f"已保存前向选择结果表格到Google Drive: {results_csv_path}")

# Show the table inline
print("\n前向选择详细结果:")
display(results_df)

开始前向选择，使用多个随机种子...
随机种子: [1, 2]

步骤 1: 选择第 1 个变量
当前已选变量: []
剩余候选变量: ['pc3_lag4', 'dln_gdp_sa_lag1', 'dln_gdp_sa_lag4', 'dpc2_lag4', 'dln_region_lag3', 'pc1_lag1', 'dln_city_lag1', 'dln_gdp_sa_lag3', 'dln_gdp_sa_lag2', 'dln_region_lag2', 'pc3_lag3', 'dln_city_lag2', 'dpc2_lag3', 'dpc2_lag1', 'pc1_lag4', 'pc3_lag1']

测试变量组合: ['pc3_lag4']
  学习率: 0.01
    种子 1:
      验证损失: 0.0046 (轮次: 12)
    种子 2:
      验证损失: 0.0044 (轮次: 4)
  平均验证损失: 0.0045 (学习率 0.01)
  临时保存步骤 1 最佳模型: 'pc3_lag4', 学习率 0.01, 种子 2
  学习率: 0.001
    种子 1:
      验证损失: 0.0073 (轮次: 50)
    种子 2:
      验证损失: 0.0068 (轮次: 46)
  平均验证损失: 0.0070 (学习率 0.001)

测试变量组合: ['dln_gdp_sa_lag1']
  学习率: 0.01
    种子 1:
      验证损失: 0.0055 (轮次: 14)
    种子 2:
      验证损失: 0.0128 (轮次: 8)
  平均验证损失: 0.0091 (学习率 0.01)
  学习率: 0.001
    种子 1:
      验证损失: 0.0127 (轮次: 41)
    种子 2:
      验证损失: 0.0168 (轮次: 50)
  平均验证损失: 0.0147 (学习率 0.001)

测试变量组合: ['dln_gdp_sa_lag4']
  学习率: 0.01
    种子 1:
      验证损失: 0.0050 (轮次: 15)
    种子 2:
      验证损失: 0.0155 (轮次: 22)
  平均验证

Unnamed: 0,Step,Variable number,Selected Variables,Best average Val Loss,Best average LR,seed 1 Val Loss,seed 1 Epochs Used,seed 2 Val Loss,seed 2 Epochs Used
0,1,1,dln_city_lag2,0.004163,0.01,0.002445,13,0.00588,7
1,2,2,"dln_city_lag2, dln_city_lag1",0.003269,0.01,0.002601,15,0.003936,7
2,3,3,"dln_city_lag2, dln_city_lag1, dln_gdp_sa_lag4",0.004656,0.01,0.003744,12,0.005569,6
3,4,4,"dln_city_lag2, dln_city_lag1, dln_gdp_sa_lag4,...",0.007856,0.01,0.002983,12,0.012729,8
4,5,5,"dln_city_lag2, dln_city_lag1, dln_gdp_sa_lag4,...",0.007439,0.01,0.010667,16,0.004211,5
5,6,6,"dln_city_lag2, dln_city_lag1, dln_gdp_sa_lag4,...",0.006916,0.01,0.009022,4,0.004811,17
6,7,7,"dln_city_lag2, dln_city_lag1, dln_gdp_sa_lag4,...",0.007816,0.01,0.010266,4,0.005366,9
