In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/all-province/all_province_data.xlsx


In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold
import time
# Fix the random seeds so repeated runs start from the same RNG state.
SEED = 42
for seed_fn in (
    np.random.seed,             # NumPy global generator
    torch.manual_seed,          # PyTorch CPU generator
    torch.cuda.manual_seed,     # PyTorch generator of the current GPU
    torch.cuda.manual_seed_all, # PyTorch generators of all GPUs (multi-GPU setups)
):
    seed_fn(SEED)

# cuDNN flags: request deterministic algorithms and switch benchmark mode off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# ===============================
# 1. Data preprocessing
# ===============================
file_path = '/kaggle/input/all-province/all_province_data.xlsx'
data = pd.read_excel(file_path)
# Extract the first run of digits from the '时间' column as the integer year.
# The pattern is a raw string so that '\d' is not parsed as a (invalid) string escape.
data['Year'] = data['时间'].str.extract(r'(\d+)', expand=False).astype(int)
# One-hot encode the province; the new columns are named 'Province_<value>'.
data = pd.get_dummies(data, columns=['Province'])

# Feature selection: the year, the numeric indicators, and every province dummy column.
features = [
    'Year', 'Ammonia Nitrogen Emissions', 'Average Temperature',
    'Average Years of Education per Capita', 'Chemical Oxygen Demand Emissions',
    'Electricity Consumption', 'Geographic-Mean PM2',
    'Government Expenditure on Environmental Protection',
    'NOx Emissions', 'Number of Healthcare Institutions',
    'Number of Healthcare Personnel', 'Oil Emissions',
    'Per Capita Disposable Income', 'Resident Population',
    'SO2 Emissions', 'Total Nitrogen Emissions',
    'Total Phosphorus Emissions'
] + list(data.columns[data.columns.str.startswith('Province_')])

X = data[features]
y = data['Total CO2 emissions']

# Rows from these years are held out for testing; everything else is training data.
test_years = [2020, 2021, 2022]
is_test_row = X['Year'].isin(test_years)

# Fill missing feature values with column means computed on the TRAINING rows only,
# so that no statistic of the held-out years leaks into the imputed values.
X = X.fillna(X[~is_test_row].mean())

# Split into the training set and one test set per held-out year.
X_test_2020 = X[X['Year'] == 2020]
X_test_2021 = X[X['Year'] == 2021]
X_test_2022 = X[X['Year'] == 2022]
y_test_2020 = y[X['Year'] == 2020]
y_test_2021 = y[X['Year'] == 2021]
y_test_2022 = y[X['Year'] == 2022]
X_train_all = X[~is_test_row]
y_train_all = y[~is_test_row]

# Scale features to [0, 1]; the scaler is fitted on the training rows only.
scaler_x = MinMaxScaler()
X_train_all_scaled = scaler_x.fit_transform(X_train_all)
X_test_2020_scaled = scaler_x.transform(X_test_2020)
X_test_2021_scaled = scaler_x.transform(X_test_2021)
X_test_2022_scaled = scaler_x.transform(X_test_2022)

# The target gets its own scaler so that predictions can be mapped back to original units.
scaler_y = MinMaxScaler()
y_train_all_scaled = scaler_y.fit_transform(y_train_all.values.reshape(-1, 1)).ravel()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# ===============================
# 2. CNN_LSTM model definition
# ===============================
class CNN_LSTMModel(nn.Module):
    """1-D convolution followed by a stacked LSTM and a linear output layer.

    ``forward`` inserts a channel axis into its input, runs a width-3 convolution
    (padding 1) along the last axis, feeds the result to the LSTM with that axis
    as the sequence dimension, and maps the LSTM output at the last sequence
    position through the linear layer.

    Args:
        input_dim: Accepted for interface compatibility; no layer uses it.
        hidden_dim: Number of convolution output channels and LSTM hidden size.
        output_dim: Size of the linear layer's output.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2):
        super().__init__()
        self.conv1 = nn.Conv1d(1, hidden_dim, kernel_size=3, padding=1)
        self.lstm = nn.LSTM(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the model output for ``x``; a (batch, num_features) input yields (batch, output_dim)."""
        with_channel = torch.unsqueeze(x, dim=1)        # (batch, 1, num_features)
        conv_out = self.conv1(with_channel).relu()      # (batch, hidden_dim, num_features)
        sequence = conv_out.transpose(1, 2)             # (batch, num_features, hidden_dim) for the LSTM
        lstm_out, _state = self.lstm(sequence)
        last_step = lstm_out[:, -1]                     # output at the final sequence position
        return self.fc(last_step)



# ===============================
# 3. Five-fold cross-validation (hyperparameter selection)
# ===============================
# Grid of candidates: every (learning rate, hidden size, patience) combination is tried.
candidate_learning_rates = [0.001, 0.005, 0.01]
candidate_hidden_dims = [32, 64, 128]
candidate_patience_values = [50, 100, 150]

num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
max_epochs = 1000  # upper bound on epochs per fold; early stopping may end training sooner
best_hyperparams = None
best_avg_r2 = float('-inf')

print("\n====== Begin CNN_LSTM Grid Search ======")
for lr in candidate_learning_rates:
    for hidden_dim in candidate_hidden_dims:
        for patience in candidate_patience_values:
            print(f"\nTesting Learning Rate: {lr}, Hidden Dim: {hidden_dim}, Patience: {patience}")
            fold_r2_scores = []
            fold_best_epochs = []

            for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_all_scaled), 1):
                print(f"  Fold {fold} - Training...")

                X_train_fold, X_val_fold = X_train_all_scaled[train_idx], X_train_all_scaled[val_idx]
                y_train_fold, y_val_fold = y_train_all_scaled[train_idx], y_train_all_scaled[val_idx]

                X_train_tensor = torch.tensor(X_train_fold, dtype=torch.float32).to(device)
                y_train_tensor = torch.tensor(y_train_fold, dtype=torch.float32).to(device)
                X_val_tensor = torch.tensor(X_val_fold, dtype=torch.float32).to(device)
                # The validation target is only needed on the host for r2_score; prepare it
                # once per fold (as float32, matching the predictions) instead of copying it
                # back from the device on every epoch.
                y_val_true = y_val_fold.astype(np.float32)

                model = CNN_LSTMModel(X_train_all.shape[1], hidden_dim, 1).to(device)
                optimizer = optim.Adam(model.parameters(), lr=lr)
                criterion = nn.MSELoss()

                best_val_r2 = float('-inf')
                best_epoch = 0
                no_improve_counter = 0

                for epoch in range(max_epochs):
                    # One full-batch gradient step on the training part of the fold.
                    model.train()
                    optimizer.zero_grad()
                    # squeeze(-1) drops only the output axis, so a one-row batch keeps its batch axis.
                    predictions = model(X_train_tensor).squeeze(-1)
                    loss = criterion(predictions, y_train_tensor)
                    loss.backward()
                    optimizer.step()

                    # Score the validation part of the fold after every step.
                    model.eval()
                    with torch.no_grad():
                        preds_val = model(X_val_tensor).squeeze(-1).cpu().numpy()
                    val_r2 = r2_score(y_val_true, preds_val)

                    # Early stopping: give up after `patience` consecutive epochs without a new best R².
                    if val_r2 > best_val_r2:
                        best_val_r2 = val_r2
                        best_epoch = epoch + 1
                        no_improve_counter = 0
                    else:
                        no_improve_counter += 1
                    if no_improve_counter >= patience:
                        print(f"    Fold {fold}: Early stopping at epoch {epoch+1} (patience={patience})")
                        break

                # NOTE(review): the fold score is the best validation R² seen during training,
                # i.e. it is selected on the same data it is measured on.
                fold_r2_scores.append(best_val_r2)
                fold_best_epochs.append(best_epoch)

            avg_r2 = np.mean(fold_r2_scores)
            avg_best_epoch = np.mean(fold_best_epochs)

            print(f"Avg R² for this set of hyperparameters: {avg_r2:.4f}")

            # Keep the configuration with the highest mean R²; the mean best epoch is stored
            # so that the final training can run for a fixed number of epochs.
            if avg_r2 > best_avg_r2:
                best_avg_r2 = avg_r2
                best_hyperparams = (lr, hidden_dim, avg_best_epoch)

print("\n====== CNN_LSTM Grid Search Completed ======")
if best_hyperparams is None:
    # Happens when no configuration produced a comparable score (e.g. every mean R² was NaN).
    raise RuntimeError("Grid search did not select any hyperparameters; check the fold R² scores.")
best_learning_rate, best_hidden_dim, final_num_epochs = best_hyperparams
print(f"Best Hyperparameters: Learning Rate = {best_learning_rate}, Hidden Dim = {best_hidden_dim}, Epochs = {int(final_num_epochs)}")

# ===============================
# 4. Compute RMSE, MAE, R², NMAE, NRMSE
# ===============================
def evaluate_model(model, X_test, y_test):
    """Score ``model`` on one test set, in the original (unscaled) target units.

    The features are scaled with the module-level ``scaler_x``, the model is run in
    eval mode without gradients, and its predictions are mapped back through the
    module-level ``scaler_y`` before the metrics are computed.

    Args:
        model: Trained model to evaluate; called with a float32 tensor on ``device``.
        X_test: Unscaled test features accepted by ``scaler_x.transform``.
        y_test: Unscaled true target values, one per row of ``X_test``.

    Returns:
        Tuple ``(rmse, mae, r2, nmae, nrmse)``. ``nmae`` and ``nrmse`` are MAE and RMSE
        divided by the mean of ``y_test``; both are NaN when that mean is zero.
    """
    X_test_scaled = scaler_x.transform(X_test)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

    model.eval()
    with torch.no_grad():
        preds_test = model(X_test_tensor).cpu().numpy()
    # reshape(-1, 1) gives the column layout the scaler expects, whatever the batch size.
    preds_test_unscaled = scaler_y.inverse_transform(preds_test.reshape(-1, 1)).ravel()

    rmse = np.sqrt(mean_squared_error(y_test, preds_test_unscaled))
    mae = mean_absolute_error(y_test, preds_test_unscaled)
    r2 = r2_score(y_test, preds_test_unscaled)

    # Normalise by the mean of the true values; undefined when that mean is zero.
    mean_target = np.mean(y_test)
    if mean_target != 0:
        nmae = mae / mean_target
        nrmse = rmse / mean_target
    else:
        nmae = np.nan
        nrmse = np.nan

    return rmse, mae, r2, nmae, nrmse

# ===============================
# 5. Train 10 times and save the results
# ===============================

results = []

# Held-out test sets keyed by year; evaluated in this order after every run.
test_sets = {
    2020: (X_test_2020, y_test_2020),
    2021: (X_test_2021, y_test_2021),
    2022: (X_test_2022, y_test_2022),
}

# Build the training tensors once instead of re-creating and re-uploading them every epoch.
X_train_all_tensor = torch.tensor(X_train_all_scaled, dtype=torch.float32).to(device)
y_train_all_tensor = torch.tensor(y_train_all_scaled, dtype=torch.float32).to(device)
num_epochs = int(final_num_epochs)  # the grid search stores a (possibly fractional) mean epoch count

for i in range(10):
    print(f"\n====== Training CNN_LSTM Run {i+1}/10 ======")
    start_time = time.time()

    # Fresh CNN_LSTM model and optimizer for this run, using the selected hyperparameters.
    model = CNN_LSTMModel(X_train_all.shape[1], best_hidden_dim, 1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=best_learning_rate)
    criterion = nn.MSELoss()

    # Full-batch training for a fixed number of epochs.
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        # squeeze(-1) drops only the output axis, so the batch axis is always kept.
        predictions = model(X_train_all_tensor).squeeze(-1)
        loss = criterion(predictions, y_train_all_tensor)
        loss.backward()
        optimizer.step()

    # CUDA work is queued asynchronously; wait for it to finish so the timing is accurate.
    if device.type == 'cuda':
        torch.cuda.synchronize()
    end_time = time.time()
    training_time = end_time - start_time

    # Save the trained weights of this run.
    model_filename = f'final_CNN_LSTM_model_run_{i+1}.pth'
    torch.save(model.state_dict(), model_filename)
    print(f"Model {i+1} saved as {model_filename}")

    # Evaluate on each held-out year: year -> (rmse, mae, r2, nmae, nrmse).
    metrics_by_year = {
        year: evaluate_model(model, X_year, y_year)
        for year, (X_year, y_year) in test_sets.items()
    }

    print(f"Run {i+1}: Training Time = {training_time:.2f} s")
    for year, (rmse, mae, r2, nmae, nrmse) in metrics_by_year.items():
        print(f"{year} - RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}, NMAE: {nmae:.4f}, NRMSE: {nrmse:.4f}")

    # Record one flat row per run; the metric columns carry a '_<year>' suffix.
    record = {'Run': i+1, 'Training Time (s)': training_time}
    for year, (rmse, mae, r2, nmae, nrmse) in metrics_by_year.items():
        record.update({
            f'RMSE_{year}': rmse, f'MAE_{year}': mae, f'R²_{year}': r2,
            f'NMAE_{year}': nmae, f'NRMSE_{year}': nrmse,
        })
    results.append(record)

# Save all runs to CSV.
df_results = pd.DataFrame(results)
df_results.to_csv("CNN_LSTM_results.csv", index=False)
print("Results saved to CNN_LSTM_results.csv")

Using device: cuda


Testing Learning Rate: 0.001, Hidden Dim: 32, Patience: 50
  Fold 1 - Training...
  Fold 2 - Training...
    Fold 2: Early stopping at epoch 425 (patience=50)
  Fold 3 - Training...
    Fold 3: Early stopping at epoch 660 (patience=50)
  Fold 4 - Training...
    Fold 4: Early stopping at epoch 156 (patience=50)
  Fold 5 - Training...
    Fold 5: Early stopping at epoch 891 (patience=50)
Avg R² for this set of hyperparameters: 0.7435

Testing Learning Rate: 0.001, Hidden Dim: 32, Patience: 100
  Fold 1 - Training...
  Fold 2 - Training...
    Fold 2: Early stopping at epoch 680 (patience=100)
  Fold 3 - Training...
  Fold 4 - Training...
  Fold 5 - Training...
    Fold 5: Early stopping at epoch 879 (patience=100)
Avg R² for this set of hyperparameters: 0.9060

Testing Learning Rate: 0.001, Hidden Dim: 32, Patience: 150
  Fold 1 - Training...
  Fold 2 - Training...
  Fold 3 - Training...
  Fold 4 - Training...
    Fold 4: Early stopping at epoch 901 (patience=150)
 