In [1]:
# python
import sys
import os
import importlib
# columnar analysis
from coffea import processor
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
import awkward as ak
from dask.distributed import Client, performance_report
# local

sidm_path = str(os.getcwd()).split("/sidm")[0]
# sidm_path = str(sys.path[0]).split("/sidm")[0]
if sidm_path not in sys.path: sys.path.insert(1, sidm_path)
from sidm.tools import utilities, sidm_processor, scaleout, cutflow
from sidm.tools import llpnanoaodschema
# always reload local modules to pick up changes during development
importlib.reload(utilities)
importlib.reload(sidm_processor)
importlib.reload(scaleout)
# plotting
import matplotlib.pyplot as plt
utilities.set_plot_style()
%matplotlib inline
from tqdm.notebook import tqdm
import coffea.util
import numpy as np

In [2]:
client = scaleout.make_dask_client("tls://localhost:8786")
client

0,1
Connection method: Direct,
Dashboard: /user/dongyub.lee@cern.ch/proxy/8787/status,

0,1
Comm: tls://192.168.121.97:8786,Workers: 0
Dashboard: /user/dongyub.lee@cern.ch/proxy/8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [2]:
samples = [
    "2Mu2E_200GeV_5p0GeV_0p2mm",
    "2Mu2E_200GeV_5p0GeV_200p0mm",
    "2Mu2E_1000GeV_0p25GeV_0p002mm",
    "2Mu2E_1000GeV_0p25GeV_2p0mm",
    "4Mu_200GeV_5p0GeV_0p2mm",
    "4Mu_200GeV_5p0GeV_200p0mm",
    "4Mu_1000GeV_0p25GeV_0p002mm",
    "4Mu_1000GeV_0p25GeV_2p0mm",
    "DYJetsToMuMu_M10to50", 
    "DYJetsToMuMu_M50",    
    "TTJets",
    "QCD_Pt120To170",
    "QCD_Pt170To300",
    "QCD_Pt300To470",
    "QCD_Pt470To600",
    "QCD_Pt600To800",
    "QCD_Pt800To1000",
    "QCD_Pt1000",
]

In [3]:
samples_bkg = [
    "DYJetsToMuMu_M10to50", 
    "DYJetsToMuMu_M50",    
    "TTJets",
    "QCD_Pt170To300",
    "QCD_Pt300To470",
    "QCD_Pt470To600",
    "QCD_Pt600To800",
    "QCD_Pt120To170",
    "QCD_Pt800To1000",
    "QCD_Pt1000",
]

In [3]:
fileset = utilities.make_fileset(samples[0:1], 
                                 "llpNanoAOD_v2", 
                                 location_cfg="signal_2mu2e_v10.yaml",
                                 max_files = 1,
                                )

In [4]:
runner = processor.Runner(
    # executor=processor.DaskExecutor(client=client),
    executor=processor.IterativeExecutor(),
    schema=llpnanoaodschema.LLPNanoAODSchema,
    skipbadfiles=True
)

channels = [
    "base", "base_isosel"
]

p = sidm_processor.SidmProcessor(
    channels,
    ["matched_jet_base", "fraction_base", "isolation_base", "mother_tracking_base", "muon_crosscleaning_base"],
    unweighted_hist=False,
)

out = {}
for i, sample in enumerate(samples):

    # print(f"Processing {sample}")
    fileset_one_sample = {samples[i]:fileset.get(samples[i])}
    
    output = runner.run(fileset_one_sample, treename='Events', processor_instance=p)

    #Add this sample's output to the out variable
    out[sample] = output["out"][sample]

    #Save output to a file!!
    out_file_name = "output_" + sample + ".coffea"
    coffea.util.save(output,out_file_name)

Output()

Output()

#--------------------------------------------------------------------------
#                         FastJet release 3.4.3
#                 M. Cacciari, G.P. Salam and G. Soyez                  
#     A software package for jet finding and analysis at colliders      
#                           http://fastjet.fr                           
#	                                                                      
# Please cite EPJC72(2012)1896 [arXiv:1111.6097] if you use this package
# for scientific work and optionally PLB641(2006)57 [hep-ph/0512210].   
#                                                                       
# FastJet is provided without warranty under the GNU GPL v2 or higher.  
# It uses T. Chan's closest pair algorithm, S. Fortune's Voronoi code,
# CGAL and 3rd party plugin jet algorithms. See COPYING file for details.
#--------------------------------------------------------------------------


Signal not in xs cfg, assuming 1fb


ValueError: list of filenames in fileset must be a list or a dict

In [5]:
import os
import math
from copy import deepcopy
from coffea.util import save, load
from dask.distributed import performance_report
try:
    from coffea.processor import accumulate
except Exception:
    # 구버전 호환
    from coffea.processor.accumulator import accumulate


# ==============================
# 0) 파라미터
# ==============================
SAMPLES_SLICE = samples_bkg[9:10]    # 질문에서와 동일
TREE_NAME     = "Events"
LOCATION_CFG  = "background.yaml"
MAX_FILES     = -1
N_BATCHES     = 2                   # 1500개 기준으로 자동 분할하려면 아래 AUTO_SPLIT 을 True로
AUTO_SPLIT    = True                # True면 파일 수 기준(≈1500개)으로 배치 수 산정
FILES_PER_BATCH_TARGET = 1600       # AUTO_SPLIT=True일 때 목표 파일 수

# ======================================
# 1) fileset 생성
# ======================================
fileset = utilities.make_fileset(
    SAMPLES_SLICE,
    "skimmed_llpNanoAOD_v2",
    location_cfg=LOCATION_CFG,
    max_files=MAX_FILES,
)

# 단일 샘플 가정
assert len(fileset) == 1, "이 스크립트는 단일 샘플 처리에 맞춰져 있습니다."
sample_name = next(iter(fileset.keys()))
sample_entry = fileset[sample_name]  # list / tuple / dict 가능

# ======================================
# 2) 파일 리스트 정규화 함수
# ======================================
def normalize_files(entry):
    """
    fileset[sample]에서 '파일 경로 리스트'만 뽑아내고, 원래 entry 구조 유형을 반환.
    반환: (file_list, entry_kind)
      - entry_kind == "list"            : 원래가 리스트/튜플
      - entry_kind == "dict_files_key"  : {"files": [...], ...} 형태
      - entry_kind == "dict_filemap"    : {"/path1.root": meta, "/path2.root": meta, ...} 형태
    """
    # case A: 이미 리스트/튜플
    if isinstance(entry, (list, tuple)):
        return list(entry), "list"

    # case B: 딕셔너리
    if isinstance(entry, dict):
        # 표준 coffea 스타일: {"files": [...], ...}
        if "files" in entry:
            files = entry["files"]
            if isinstance(files, dict):   # 드물지만 "files"가 dict 인 경우
                return list(files.keys()), "dict_files_key"
            return list(files), "dict_files_key"
        # 파일 경로를 key로 메타를 value로 들고 있는 형태
        return list(entry.keys()), "dict_filemap"

    raise TypeError(f"Unsupported fileset entry type: {type(entry)}")

file_list, entry_kind = normalize_files(sample_entry)

# ======================================
# 3) 배치 분할 함수
# ======================================
def split_even(lst, n):
    """
    lst를 n개의 (가능한 균등한) 조각으로 분할하여 리스트[chunks] 반환
    """
    if n <= 0:
        raise ValueError("n must be positive")
    k, m = divmod(len(lst), n)
    chunks = []
    start = 0
    for i in range(n):
        size = k + (1 if i < m else 0)
        if size > 0:
            chunks.append(lst[start:start+size])
            start += size
    return chunks

# 배치 수 확정
if AUTO_SPLIT:
    N_BATCHES = max(1, math.ceil(len(file_list) / max(1, FILES_PER_BATCH_TARGET)))

batches = split_even(file_list, N_BATCHES)

print(f"[Info] sample={sample_name}, total_files={len(file_list)}, batches={len(batches)}")
for i, b in enumerate(batches, 1):
    print(f"  - Batch {i}: {len(b)} files")

# ======================================
# 4) 부분 실행 (이미 만들어진 파트는 스킵)
# ======================================
part_output_files = []

for i, files_in_batch in enumerate(batches, start=1):
    out_file_name = f"output_{sample_name}_part{i}.coffea"
    report_name   = f"dask-report_part{i}.html"

    # 이미 결과 파일이 있으면 스킵(재시도/재개)
    if os.path.exists(out_file_name):
        print(f"[Batch {i}] 이미 {out_file_name} 존재 → 스킵합니다.")
        part_output_files.append(out_file_name)
        continue

    # 원래 entry 구조에 맞춰 부분 fileset 구성
    if entry_kind == "list":
        sub_fileset_entry = files_in_batch

    elif entry_kind in ("dict_files_key", "dict_filemap"):
        if isinstance(sample_entry, dict) and "files" in sample_entry:
            # {"files":[...], ...} 형태 유지: 다른 메타는 그대로 복사
            sub_fileset_entry = dict(sample_entry)  # 얕은 복사
            sub_fileset_entry["files"] = files_in_batch
        else:
            # {"/path": meta, ...} 형태였으면 부분 키만 선택
            # (값이 필요 없는 변형이면 아래 dict comp가 빈 dict가 될 수 있어 안전장치 포함)
            try:
                sub_fileset_entry = {f: sample_entry[f] for f in files_in_batch}
            except Exception:
                # 키 접근이 안 되는 경우엔 단순 리스트로 전달
                sub_fileset_entry = files_in_batch
    else:
        raise RuntimeError("Unexpected entry_kind")

    sub_fileset = {sample_name: sub_fileset_entry}

    # Processor 인스턴스 준비(상태성 있을 수 있으니 가급적 새로 만들거나 deepcopy)
    try:
        p_to_use = deepcopy(p)
    except Exception:
        p_to_use = p

    print(f"[Batch {i}/{len(batches)}] files: {len(files_in_batch)} → running...")

    with performance_report(filename=report_name):
        output = runner.run(
            sub_fileset,
            treename=TREE_NAME,
            processor_instance=p_to_use,
        )

    save(output, out_file_name)
    part_output_files.append(out_file_name)
    print(f"[Batch {i}] done → saved {out_file_name} and {report_name}")

[Info] sample=QCD_Pt1000, total_files=3087, batches=2
  - Batch 1: 1544 files
  - Batch 2: 1543 files
[Batch 1/2] files: 1544 → running...
[Batch 1] done → saved output_QCD_Pt1000_part1.coffea and dask-report_part1.html
[Batch 2/2] files: 1543 → running...
[Batch 2] done → saved output_QCD_Pt1000_part2.coffea and dask-report_part2.html
