In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/all-province/all_province_data.xlsx


In [2]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv,global_mean_pool
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import KFold

# 种子固定
np.random.seed(42)
torch.manual_seed(42)  # 设置 CPU 随机种子
torch.cuda.manual_seed(42)  # 设置 GPU 随机种子
torch.cuda.manual_seed_all(42)  # 如果使用多 GPU，设置所有 GPU 的随机种子
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# -------------------------
# 1. 环境及数据准备
# -------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

file_path = '/kaggle/input/all-province/all_province_data.xlsx'  # 请根据实际路径修改
data = pd.read_excel(file_path)

# 将时间列转换为年份（假设 '时间' 列格式可提取数字）
data['Year'] = data['时间'].str.extract('(\d+)').astype(int)

# 独热编码省份
data = pd.get_dummies(data, columns=['Province'])

# 选择特征
features = [
    'Year', 'Ammonia Nitrogen Emissions', 'Average Temperature', 
    'Average Years of Education per Capita', 'Chemical Oxygen Demand Emissions', 
    'Electricity Consumption', 'Geographic-Mean PM2', 
    'Government Expenditure on Environmental Protection', 
    'NOx Emissions', 'Number of Healthcare Institutions', 
    'Number of Healthcare Personnel', 'Oil Emissions', 
    'Per Capita Disposable Income', 'Resident Population', 
    'SO2 Emissions', 'Total Nitrogen Emissions', 
    'Total Phosphorus Emissions'
] + list(data.columns[data.columns.str.startswith('Province_')])

X = data[features]
y = data['Total CO2 emissions']

X = X.fillna(X.mean())

# -------------------------
# 2. 划分数据集
# -------------------------
# 将 2020, 2021, 2022 年的数据作为最终测试集，不参与交叉验证
X_test_2020 = X[X['Year'] == 2020]
X_test_2021 = X[X['Year'] == 2021]
X_test_2022 = X[X['Year'] == 2022]

y_test_2020 = y[X['Year'] == 2020]
y_test_2021 = y[X['Year'] == 2021]
y_test_2022 = y[X['Year'] == 2022]

# 其他年份作为训练集进行交叉验证（例如：<=2019）
X_train_all = X[~X['Year'].isin([2020, 2021, 2022])]
y_train_all = y[~X['Year'].isin([2020, 2021, 2022])]

# -------------------------
# 3. 定义模型
# -------------------------
class GraphSAGEModel(torch.nn.Module):
    def __init__(self, num_node_features, hidden_dim, out_dim):
        super(GraphSAGEModel, self).__init__()
        self.conv1 = SAGEConv(num_node_features, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, out_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = self.fc(x)
        return x

# -------------------------
# 4. 定义构图函数
# -------------------------
def build_graph(x_array, threshold_percent=90):
    """
    根据给定的特征数组构建图，使用欧氏距离与指定百分位阈值构图。
    """
    distances = euclidean_distances(x_array)
    threshold = np.percentile(distances, threshold_percent)
    edge_index = np.array(np.where(distances < threshold))
    # 去除自环
    edge_index = edge_index[:, edge_index[0] != edge_index[1]]
    edge_index = torch.tensor(edge_index, dtype=torch.long)
    x_tensor = torch.tensor(x_array, dtype=torch.float)
    return Data(x=x_tensor, edge_index=edge_index)

# -------------------------
# 5. 五折交叉验证+早停+网格搜索
# -------------------------
candidate_learning_rates = [0.001, 0.005, 0.01]
candidate_patience_values = [10, 20, 30]
candidate_hidden_dims = [32, 64, 128]

num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
grid_search_results = []

print("\n====== Begin Grid Search ======")
for lr in candidate_learning_rates:
    for patience in candidate_patience_values:
        for hidden_dim in candidate_hidden_dims:
            fold_r2_scores = []
            fold_best_epochs = []
            print(f"\nTesting combination: learning_rate={lr}, patience={patience}, hidden_dim={hidden_dim}")
            for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_all)):
                # 划分本折训练和验证数据
                X_train_fold = X_train_all.iloc[train_idx].copy()
                y_train_fold = y_train_all.iloc[train_idx].copy()
                X_val_fold = X_train_all.iloc[val_idx].copy()
                y_val_fold = y_train_all.iloc[val_idx].copy()
                
                scaler_x = MinMaxScaler()
                X_train_fold_scaled = scaler_x.fit_transform(X_train_fold)
                X_val_fold_scaled = scaler_x.transform(X_val_fold)
                scaler_y = MinMaxScaler()
                y_train_fold_scaled = scaler_y.fit_transform(y_train_fold.values.reshape(-1, 1)).ravel()
                
                train_data_fold = build_graph(X_train_fold_scaled).to(device)
                val_data_fold = build_graph(X_val_fold_scaled).to(device)
                y_train_fold_scaled_t = torch.tensor(y_train_fold_scaled, dtype=torch.float).to(device)
                y_val_fold_t = torch.tensor(y_val_fold.values, dtype=torch.float).to(device)
                
                model = GraphSAGEModel(num_node_features=X_train_fold_scaled.shape[1],
                                       hidden_dim=hidden_dim,
                                       out_dim=1).to(device)
                optimizer = torch.optim.Adam(model.parameters(), lr=lr)
                criterion = torch.nn.MSELoss()
                
                best_val_r2 = float('-inf')
                best_epoch = 0
                no_improve_counter = 0
                max_epochs = 200
                
                for epoch in range(max_epochs):
                    model.train()
                    optimizer.zero_grad()
                    out = model(train_data_fold).squeeze()
                    loss = criterion(out, y_train_fold_scaled_t)
                    loss.backward()
                    optimizer.step()
                    
                    # 验证
                    model.eval()
                    with torch.no_grad():
                        preds_val = model(val_data_fold).squeeze().cpu().numpy()
                    preds_val_unscaled = scaler_y.inverse_transform(preds_val.reshape(-1, 1)).ravel()
                    val_r2 = r2_score(y_val_fold_t.cpu().numpy(), preds_val_unscaled)
                    
                    if val_r2 > best_val_r2:
                        best_val_r2 = val_r2
                        best_epoch = epoch + 1
                        no_improve_counter = 0
                    else:
                        no_improve_counter += 1
                    if no_improve_counter >= patience:
                        print(f"Fold {fold+1}: Early stopping at epoch {epoch+1} (patience={patience})")
                        break
                fold_r2_scores.append(best_val_r2)
                fold_best_epochs.append(best_epoch)
                print(f"Fold {fold+1}: Best Epoch = {best_epoch}, Val_R² = {best_val_r2:.4f}")
            
            avg_r2 = np.mean(fold_r2_scores)
            avg_best_epoch = np.mean(fold_best_epochs)
            print(f"Combination lr={lr}, patience={patience}, hidden_dim={hidden_dim}: Avg Val_R² = {avg_r2:.4f}, Avg Best Epoch = {avg_best_epoch:.2f}")
            grid_search_results.append({
                'learning_rate': lr,
                'patience': patience,
                'hidden_dim': hidden_dim,
                'avg_val_r2': avg_r2,
                'avg_best_epoch': avg_best_epoch
            })

# 选择最佳超参数组合
best_combo = max(grid_search_results, key=lambda x: x['avg_val_r2'])
best_learning_rate = best_combo['learning_rate']
best_patience = best_combo['patience']
best_hidden_dim = best_combo['hidden_dim']
final_num_epochs = int(best_combo['avg_best_epoch'])  # 使用平均 best epoch 作为最终训练轮次

print("\n====== Grid Search Completed ======")
print("Best hyperparameters from grid search:")
print(best_combo)
print(f"Final training epochs set to: {final_num_epochs}")

Using device: cuda


Testing combination: learning_rate=0.001, patience=10, hidden_dim=32
Fold 1: Early stopping at epoch 37 (patience=10)
Fold 1: Best Epoch = 27, Val_R² = 0.1621
Fold 2: Best Epoch = 200, Val_R² = 0.9714
Fold 3: Early stopping at epoch 31 (patience=10)
Fold 3: Best Epoch = 21, Val_R² = 0.1011
Fold 4: Early stopping at epoch 29 (patience=10)
Fold 4: Best Epoch = 19, Val_R² = 0.0669
Fold 5: Early stopping at epoch 29 (patience=10)
Fold 5: Best Epoch = 19, Val_R² = 0.1140
Combination lr=0.001, patience=10, hidden_dim=32: Avg Val_R² = 0.2831, Avg Best Epoch = 57.20

Testing combination: learning_rate=0.001, patience=10, hidden_dim=64
Fold 1: Early stopping at epoch 31 (patience=10)
Fold 1: Best Epoch = 21, Val_R² = 0.2108
Fold 2: Early stopping at epoch 24 (patience=10)
Fold 2: Best Epoch = 14, Val_R² = 0.1304
Fold 3: Best Epoch = 200, Val_R² = 0.9615
Fold 4: Early stopping at epoch 21 (patience=10)
Fold 4: Best Epoch = 11, Val_R² = 0.1185
Fold 5: Early stopping at epoch 

In [4]:
import pandas as pd
import numpy as np
import torch
import time
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 种子固定
np.random.seed(42)
torch.manual_seed(42)  # 设置 CPU 随机种子
torch.cuda.manual_seed(42)  # 设置 GPU 随机种子
torch.cuda.manual_seed_all(42)  # 如果使用多 GPU，设置所有 GPU 的随机种子
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# -------------------------
# 7. 可选：使用最佳超参数以及数据训练最终模型，并对 2020-2022 预测
# -------------------------
# 对所有训练数据重新标准化
scaler_x_final = MinMaxScaler()
X_train_all_scaled = scaler_x_final.fit_transform(X_train_all)
scaler_y_final = MinMaxScaler()
y_train_all_scaled = scaler_y_final.fit_transform(y_train_all.values.reshape(-1, 1)).ravel()
train_data_final = build_graph(X_train_all_scaled).to(device)
y_train_all_scaled_t = torch.tensor(y_train_all_scaled, dtype=torch.float).to(device)

results = []

print("\n====== Begin Final Training with Best Hyperparameters ======")
for i in range(10):
    print(f"\nFinal Training run {i+1}/10")
    start_time = time.time()
    
    model_final = GraphSAGEModel(num_node_features=X_train_all_scaled.shape[1],
                                 hidden_dim=best_hidden_dim,
                                 out_dim=1).to(device)
    optimizer_final = torch.optim.Adam(model_final.parameters(), lr=best_learning_rate)
    criterion_final = torch.nn.MSELoss()
    
    for epoch in range(final_num_epochs):
        model_final.train()
        optimizer_final.zero_grad()
        out_final = model_final(train_data_final).squeeze()
        loss_final = criterion_final(out_final, y_train_all_scaled_t)
        loss_final.backward()
        optimizer_final.step()
        
        if (epoch+1) % 50 == 0:
            print(f"[Final Train] Epoch {epoch+1}/{final_num_epochs}, Loss: {loss_final.item():.4f}")
    
    model_filename = f'final_model_run_{i+1}.pth'
    torch.save(model_final.state_dict(), model_filename)
    
    def predict_and_evaluate(model, X_test, y_test):
        X_test_scaled = scaler_x_final.transform(X_test)
        data_test = build_graph(X_test_scaled).to(device)
        y_test_t = torch.tensor(y_test.values, dtype=torch.float).to(device)
        model.eval()
        with torch.no_grad():
            preds_test = model(data_test).squeeze().cpu().numpy()
        preds_test_unscaled = scaler_y_final.inverse_transform(preds_test.reshape(-1, 1)).ravel()
        rmse = np.sqrt(mean_squared_error(y_test_t.cpu().numpy(), preds_test_unscaled))
        mae = mean_absolute_error(y_test_t.cpu().numpy(), preds_test_unscaled)
        r2 = r2_score(y_test_t.cpu().numpy(), preds_test_unscaled)
        nmae = mae / np.mean(y_test_t.cpu().numpy()) if np.mean(y_test_t.cpu().numpy()) != 0 else np.nan
        nrmse = rmse / np.mean(y_test_t.cpu().numpy()) if np.mean(y_test_t.cpu().numpy()) != 0 else np.nan
        return rmse, mae, r2, nmae, nrmse
    
    rmse_2020, mae_2020, r2_2020, nmae_2020, nrmse_2020 = predict_and_evaluate(model_final, X_test_2020, y_test_2020)
    rmse_2021, mae_2021, r2_2021, nmae_2021, nrmse_2021 = predict_and_evaluate(model_final, X_test_2021, y_test_2021)
    rmse_2022, mae_2022, r2_2022, nmae_2022, nrmse_2022 = predict_and_evaluate(model_final, X_test_2022, y_test_2022)
    
    end_time = time.time()
    training_time = end_time - start_time
    
    print(f"Run {i+1}: Training Time = {training_time:.2f} s")
    print(f"2020 - RMSE: {rmse_2020:.4f}, MAE: {mae_2020:.4f}, R²: {r2_2020:.4f}, NMAE: {nmae_2020:.4f}, NRMSE: {nrmse_2020:.4f}")
    print(f"2021 - RMSE: {rmse_2021:.4f}, MAE: {mae_2021:.4f}, R²: {r2_2021:.4f}, NMAE: {nmae_2021:.4f}, NRMSE: {nrmse_2021:.4f}")
    print(f"2022 - RMSE: {rmse_2022:.4f}, MAE: {mae_2022:.4f}, R²: {r2_2022:.4f}, NMAE: {nmae_2022:.4f}, NRMSE: {nrmse_2022:.4f}")
    
    results.append({
        'Run': i+1,
        'Training_Time (s)': training_time,
        'RMSE_2020': rmse_2020, 'MAE_2020': mae_2020, 'R²_2020': r2_2020, 'NMAE_2020': nmae_2020, 'NRMSE_2020': nrmse_2020,
        'RMSE_2021': rmse_2021, 'MAE_2021': mae_2021, 'R²_2021': r2_2021, 'NMAE_2021': nmae_2021, 'NRMSE_2021': nrmse_2021,
        'RMSE_2022': rmse_2022, 'MAE_2022': mae_2022, 'R²_2022': r2_2022, 'NMAE_2022': nmae_2022, 'NRMSE_2022': nrmse_2022
    })

results_df = pd.DataFrame(results)
results_df.to_csv('model_training_results_with_time.csv', index=False)
print("\nResults saved to 'model_training_results_with_time.csv'.")



Final Training run 1/10
[Final Train] Epoch 50/88, Loss: 0.0015
Run 1: Training Time = 0.22 s
2020 - RMSE: 2670.8430, MAE: 2163.3206, R²: 0.9684, NMAE: 0.0601, NRMSE: 0.0742
2021 - RMSE: 3128.9417, MAE: 2631.5212, R²: 0.9595, NMAE: 0.0716, NRMSE: 0.0852
2022 - RMSE: 3123.2458, MAE: 2641.5251, R²: 0.9559, NMAE: 0.0718, NRMSE: 0.0849

Final Training run 2/10
[Final Train] Epoch 50/88, Loss: 0.0017
Run 2: Training Time = 0.22 s
2020 - RMSE: 2767.6313, MAE: 2345.6641, R²: 0.9661, NMAE: 0.0651, NRMSE: 0.0769
2021 - RMSE: 3262.3987, MAE: 2783.6306, R²: 0.9559, NMAE: 0.0758, NRMSE: 0.0888
2022 - RMSE: 2786.8828, MAE: 2159.6018, R²: 0.9649, NMAE: 0.0587, NRMSE: 0.0758

Final Training run 3/10
[Final Train] Epoch 50/88, Loss: 0.0018
Run 3: Training Time = 0.21 s
2020 - RMSE: 2881.1753, MAE: 2438.5386, R²: 0.9632, NMAE: 0.0677, NRMSE: 0.0800
2021 - RMSE: 3188.2766, MAE: 2738.1638, R²: 0.9579, NMAE: 0.0745, NRMSE: 0.0868
2022 - RMSE: 2759.3784, MAE: 2245.3315, R²: 0.9656, NMAE: 0.0611, NRMSE: 0