In [1]:
import os
import pandas as pd
from joblib import Parallel, delayed
import zipfile
import shutil
import tempfile

def _strip_run_archive(zip_path, keep_prefix="DATA_REPORT/"):
    """Rewrite ``zip_path`` in place, keeping only file members under ``keep_prefix``.

    Directory entries and every member outside ``keep_prefix`` are dropped.
    The replacement is built in a temporary file next to the original and
    swapped in with ``os.replace``; if anything fails the temporary file is
    removed and the original archive is left untouched.

    Args:
        zip_path: Path of the zip archive to shrink.
        keep_prefix: Member-name prefix that marks the entries to keep.

    Raises:
        zipfile.BadZipFile: If ``zip_path`` is not a valid zip archive.
        OSError: If the archive cannot be read or replaced.
    """
    print(f"Processing {zip_path}")
    target_dir = os.path.dirname(zip_path)
    # Create the temporary file in the target directory so the final
    # os.replace() never has to cross filesystems.
    with tempfile.NamedTemporaryFile(delete=False, dir=target_dir, suffix=".tmp") as tmp:
        tmp_zip_path = tmp.name
    try:
        with zipfile.ZipFile(zip_path, "r") as src:
            # dict.fromkeys de-duplicates repeated member names while keeping order.
            kept_names = dict.fromkeys(
                name for name in src.namelist()
                if name.startswith(keep_prefix) and not name.endswith("/")
            )
            with zipfile.ZipFile(tmp_zip_path, "w", zipfile.ZIP_DEFLATED) as dst:
                # Copy one member at a time so only a single file is held in memory.
                for name in kept_names:
                    info = src.getinfo(name)
                    # Carry over timestamp and attributes instead of stamping "now".
                    out_info = zipfile.ZipInfo(info.filename, date_time=info.date_time)
                    out_info.external_attr = info.external_attr
                    dst.writestr(out_info, src.read(info), compress_type=zipfile.ZIP_DEFLATED)
        # Temp files are created owner-only; keep the original archive's mode.
        shutil.copymode(zip_path, tmp_zip_path)
        os.replace(tmp_zip_path, zip_path)
    except BaseException:
        # Do not leave half-written temp files behind in the run directory.
        if os.path.exists(tmp_zip_path):
            os.remove(tmp_zip_path)
        raise


def clean_and_recompress(root_path, njobs=-1, archive_root=True):
    """Shrink every ``Run_Archive.zip`` under ``root_path`` and optionally zip the tree.

    WARNING: this is destructive. Each ``Run_Archive.zip`` is rewritten in
    place so that only the files under ``DATA_REPORT/`` remain.

    Args:
        root_path: Directory to search recursively. Trailing separators and
            relative forms such as ``"."`` are accepted.
        njobs: Number of joblib workers used to process the archives. With
            ``njobs == 1`` (or fewer than two archives) the work runs in-process.
        archive_root: When true, compress the whole ``root_path`` tree into
            ``<parent of root_path>/<name of root_path>.zip``.
    """
    # Normalise first: with a trailing separator or "." the basename would be
    # empty / ".", which places the archive inside the tree being archived.
    root_path = os.path.abspath(root_path)

    # 1. Find every Run_Archive.zip below root_path.
    run_zip_paths = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(root_path)
        for filename in filenames
        if filename == "Run_Archive.zip"
    ]

    # 2. Process the archives, in parallel when that is worthwhile.
    if njobs == 1 or len(run_zip_paths) < 2:
        for zip_path in run_zip_paths:
            _strip_run_archive(zip_path)
    else:
        Parallel(n_jobs=njobs)(delayed(_strip_run_archive)(zip_path) for zip_path in run_zip_paths)

    # 3. Optional: compress the whole root_path into its parent directory.
    if archive_root:
        parent_dir = os.path.dirname(root_path)
        base_name = os.path.basename(root_path)
        archive_path = os.path.join(parent_dir, base_name)
        archive_file = f"{archive_path}.zip"
        with zipfile.ZipFile(archive_file, 'w', zipfile.ZIP_DEFLATED, compresslevel=3) as zipf:
            for root, dirs, files in os.walk(root_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Guard for roots without a parent (e.g. "/"): never add
                    # the archive that is currently being written to itself.
                    if os.path.abspath(file_path) == archive_file:
                        continue
                    arcname = os.path.relpath(file_path, root_path)
                    zipf.write(file_path, arcname)
        print(f"Recompressed whole directory to {archive_path}.zip")

def find_missing_reports_by_col(path):
    """Return the template columns that have no ``Run_Archive.zip`` on disk.

    Reads ``grid_search_template.csv`` from ``path`` (first CSV column is the
    index) and, for every remaining column name, checks whether
    ``<path>/<column name>/Run_Archive.zip`` exists.

    Args:
        path: Directory holding the template CSV and one sub-directory per column.

    Returns:
        List of column names, in CSV order, whose archive is missing.
    """
    template = pd.read_csv(os.path.join(path, "grid_search_template.csv"), index_col=0)
    return [
        run_name
        for run_name in template.columns.tolist()
        if not os.path.exists(os.path.join(path, str(run_name), "Run_Archive.zip"))
    ]

In [2]:
# Task output folder to audit, given relative to the notebook's working directory.
path = "../../output/20251004_Cost_curve_task"
# Show which grid-search columns have no Run_Archive.zip yet.
print(find_missing_reports_by_col(path=path))

['Run_91_GBF2_off_CUT_50_CarbonPrice_133.84', 'Run_92_GBF2_off_CUT_50_CarbonPrice_178.46', 'Run_93_GBF2_off_CUT_50_CarbonPrice_233.07', 'Run_94_GBF2_off_CUT_50_CarbonPrice_267.69', 'Run_95_GBF2_off_CUT_50_CarbonPrice_312.3', 'Run_96_GBF2_off_CUT_50_CarbonPrice_356.92']


In [3]:
# Strip every Run_Archive.zip under `path` down to DATA_REPORT/ and zip the whole folder.
clean_and_recompress(root_path=path, archive_root=True)

Recompressed whole directory to ../../output\20251004_Cost_curve_task.zip
