In [2]:
import os
import xarray as xr
from joblib import Parallel, delayed
import pandas as pd
from typing import List, Dict, Any
from pathlib import Path
import math

from tools.tools import get_path,get_year,filter_all_from_dims
import tools.config as config

In [3]:
def _run_label_from_path(input_path: str) -> str:
    """
    Pick the path component used to label the output workbook for a run.

    The first component starting with ``Run_`` is preferred, so the label does
    not depend on how deeply the run directory is nested. If there is no such
    component, the sixth component (index 5) is used, as the original
    fixed-depth code did; for shorter paths the final component is used.

    Args:
        input_path (str): Directory of the run being summarised.

    Returns:
        str: Label embedded in the Excel file name.
    """
    parts = Path(input_path).parts
    for part in parts:
        if part.startswith('Run_'):
            return part
    # Legacy fixed-depth lookup, kept for paths that carry no 'Run_' component.
    if len(parts) > 5:
        return parts[5]
    return Path(input_path).name


def summarize_netcdf_to_excel(
    input_path: str,
    years: List[int],
    files: List[str],
    n_jobs: int = 41,
    excel_type: str = "all",
) -> pd.DataFrame:
    """
    Sum single-variable NetCDF files per year and save the totals to Excel.

    For every year and every base name, the file
    ``{input_path}/{year}/{file}_{year}.nc`` is opened as a DataArray, passed
    through ``filter_all_from_dims`` and summed over all remaining elements.
    The per-year totals are collected into a DataFrame indexed by ``Year``,
    divided by 1e6, renamed through ``config.KEY_TO_COLUMN_MAP`` and written to
    ``../../../output/{config.TASK_NAME}/carbon_price/1_excel/
    0_Origin_{excel_type}_{run label}.xlsx``.

    Args:
        input_path (str): Base directory that contains one ``{year}``
            sub-directory per year.
        years (List[int]): Years to process; they are processed in sorted order.
        files (List[str]): Base file names, without the ``_{year}`` suffix and
            the ``.nc`` extension.
        n_jobs (int): Number of joblib workers; one task is submitted per year.
        excel_type (str): Tag inserted into the output file name.

    Returns:
        pd.DataFrame: The table that was written to Excel. A cell is missing
        (None/NaN) when the corresponding file did not exist.
    """
    print(f"Start {input_path}...")

    def _process_single_year(
        year: int,
        files: List[str],
        input_path: str,
    ) -> Dict[str, Any]:
        """
        Compute the totals of all requested files for one year.

        Called in parallel through joblib.

        Returns:
            A dictionary such as {'Year': 2025, 'file1': 123.4, ...}; the value
            is None for a file that does not exist.
        """
        print(f"Processing year: {year}")
        year_data: Dict[str, Any] = {'Year': year}

        for file in files:
            total_sum = None
            file_path = os.path.join(
                input_path,
                f'{year}',
                f'{file}_{year}.nc'
            )

            # A missing file is reported and recorded as None rather than raised.
            if os.path.exists(file_path):
                # NOTE(review): a recorded run of this notebook failed with a
                # MemoryError while allocating an 11.8 GiB array. If the sum
                # below loads the whole array in every worker, n_jobs may need
                # to be lowered for large files -- confirm.
                with xr.open_dataarray(file_path) as da:
                    filtered_da = filter_all_from_dims(da)
                    total_sum = filtered_da.sum().item()
            else:
                print(f"  - WARNING: File '{file_path}' for year {year} does not exist.")

            year_data[file] = total_sum

        return year_data

    all_data = Parallel(n_jobs=n_jobs)(
        delayed(_process_single_year)(year, files, input_path)
        for year in sorted(years)
    )

    # Passing the columns explicitly keeps the 'Year' column present even when
    # `years` is empty, so set_index() below does not raise KeyError.
    column_order = list(dict.fromkeys(['Year', *files]))
    results_df = pd.DataFrame(all_data, columns=column_order)

    results_df = results_df.set_index('Year')
    results_df = results_df / 1e6
    results_df = results_df.rename(columns=config.KEY_TO_COLUMN_MAP)

    output_dir = os.path.join(
        f'../../../output/{config.TASK_NAME}',
        'carbon_price',
        '1_excel',
    )
    # to_excel() does not create missing parent directories.
    os.makedirs(output_dir, exist_ok=True)

    # Derive the label from the run's own directory name instead of a fixed path
    # depth: with runs stored under '.../carbon_price/0_base_data/<run>' index 5
    # is 'carbon_price', which made every run overwrite the same workbook.
    run_label = _run_label_from_path(input_path)
    output_excel_path = os.path.join(
        output_dir,
        f'0_Origin_{excel_type}_{run_label}.xlsx',
    )
    results_df.to_excel(output_excel_path)
    return results_df

In [4]:
# Base names of the NetCDF files summed for the 'economic' summary.
economic_files = [
    'xr_cost_ag',
    'xr_cost_agricultural_management',
    'xr_cost_non_ag',
    'xr_cost_transition_ag2ag',
    'xr_transition_cost_ag2non_ag',
    'xr_transition_cost_ag2non_ag_amortised',
    'xr_revenue_ag',
    'xr_revenue_agricultural_management',
    'xr_revenue_non_ag',
]
# Base names of the NetCDF files summed for the 'carbon' summary.
carbon_files = [
    'xr_GHG_ag',
    'xr_GHG_ag_management',
    'xr_GHG_non_ag',
    'xr_transition_GHG',
]
# Base names of the NetCDF files summed for the 'biodiversity' summary.
bio_files = [
    'xr_biodiversity_GBF2_priority_ag',
    'xr_biodiversity_GBF2_priority_ag_management',
    'xr_biodiversity_GBF2_priority_non_ag',
]

# Run directory names, grouped by the GHG / BIO settings encoded in each name.
input_files_0 = [
    'Run_13_GHG_off_BIO_off_CUT_50',
]
input_files_1 = [
    'Run_06_GHG_high_BIO_off_CUT_50',
    'Run_12_GHG_low_BIO_off_CUT_50',
]
input_files_2 = [
    'Run_01_GHG_high_BIO_high_CUT_50',
    'Run_02_GHG_high_BIO_high_CUT_40',
    'Run_03_GHG_high_BIO_high_CUT_30',
    'Run_04_GHG_high_BIO_high_CUT_20',
    'Run_05_GHG_high_BIO_high_CUT_10',
    'Run_07_GHG_low_BIO_high_CUT_50',
    'Run_08_GHG_low_BIO_high_CUT_40',
    'Run_09_GHG_low_BIO_high_CUT_30',
    'Run_10_GHG_low_BIO_high_CUT_20',
    'Run_11_GHG_low_BIO_high_CUT_10',
]
# Every run, in group order 0 -> 1 -> 2.
input_files = [*input_files_0, *input_files_1, *input_files_2]

In [5]:
# Worker count handed to every summarize_netcdf_to_excel call.
# NOTE(review): the recorded run of this cell stopped with a MemoryError while
# allocating an 11.8 GiB array; the worker count may be too high for the
# largest files on this machine -- confirm.
njobs = math.ceil(41 / 4)
task_dir = f'../../../output/{config.TASK_NAME}/carbon_price/0_base_data'
years = list(range(2020, 2051))

# One entry per summary: (run names, how to locate a run's directory,
# base file names to sum, tag used in the Excel file name).
summary_jobs = [
    (input_files, lambda run_name: os.path.join(task_dir, run_name), economic_files, 'economic'),
    (input_files_1, lambda run_name: get_path(config.TASK_NAME, run_name), carbon_files, 'carbon'),
    (input_files_2, lambda run_name: get_path(config.TASK_NAME, run_name), bio_files, 'biodiversity'),
]

for run_names, locate_run, file_keys, excel_tag in summary_jobs:
    for input_file in run_names:
        input_path = locate_run(input_file)
        # Each call writes its own workbook; `df` only keeps the latest table.
        df = summarize_netcdf_to_excel(input_path, years, file_keys, njobs, excel_tag)

Start ../../../output/20250908_Paper2_Results_NCI/carbon_price/0_base_data\Run_13_GHG_off_BIO_off_CUT_50...
Start ../../../output/20250908_Paper2_Results_NCI/carbon_price/0_base_data\Run_06_GHG_high_BIO_off_CUT_50...


MemoryError: Unable to allocate 11.8 GiB for an array with shape (2, 168778, 28, 2, 28, 6) and data type float32

In [None]:
# Every year from 2020 through 2050 inclusive.
years = list(range(2020, 2051))

In [None]:
# Bare expression: the notebook displays the current value of `years`.
years