In [1]:
import os
import re
import pandas as pd
from datetime import datetime
from glob import glob

In [4]:
# Search configuration for the CTC statistics files
variable = 'Ozone'  # species to process: Ozone or PM2p5
base_dir = '/glade/work/swei/Git/JEDI-METplus/output/wrfchem_evaluate/stats'

# Glob patterns: the CTC output files of the selected variable, and the
# two-digit "fNN" sub-folders of base_dir (first digit 0-2) that hold them.
file_pattern = f'CTC.{variable}.*.out'
folder_pattern = os.path.join(base_dir, 'f[0-2][0-9]')

In [5]:
# Collect CTC contingency-table counts from every file matched by
# `folder_pattern` / `file_pattern` (set in the configuration cell), sum them
# per threshold expression, and write the summary to a CSV under `base_dir`.
# Leaves `records`, `df` and `sum_df` in the notebook namespace.

# File names must start with CTC.<pollutant>.<10-digit date+hour>.out
filename_re = re.compile(r'CTC\.(\w+)\.(\d{10})\.out')
# \S+ instead of "anything but a space": keeps the newline that terminates the
# JOB_LIST line (or a tab) out of the captured threshold expression.
thresh_re = re.compile(r'-out_thresh (\S+)')
count_columns = ['total', 'FY_OY', 'FY_ON', 'FN_OY', 'FN_ON']

# Storage for parsed data: one dict per CTC line
records = []

for folder in sorted(glob(folder_pattern)):
    for filepath in sorted(glob(os.path.join(folder, file_pattern))):
        filename = os.path.basename(filepath)

        # Parse pollutant and date/hour from the file name
        match = filename_re.match(filename)
        if not match:
            print(f'not match file: {filename}')
            continue
        pollutant, datestr = match.groups()
        # Stored under its own name so the imported `datetime` class is not
        # rebound to an instance for the rest of the notebook session.
        timestamp = datetime.strptime(datestr, "%Y%m%d%H")

        with open(filepath, 'r') as f:
            lines = f.readlines()

        # The thresholds are listed on the JOB_LIST line, one per -out_thresh
        job_line = next((l for l in lines if l.startswith('JOB_LIST:')), None)
        if not job_line:
            print(f'no JOB_LIST line, skipping: {filepath}')
            continue
        thresholds = thresh_re.findall(job_line)

        ctc_lines = [l.strip() for l in lines if l.strip().startswith('CTC:')]

        # Thresholds and CTC lines are paired by position; zip stops at the
        # shorter list, which guards against a count mismatch.
        for threshold, line in zip(thresholds, ctc_lines):
            parts = line.split()
            # Exactly 8 whitespace-separated fields are unpacked below; any
            # other count would either raise or misalign the columns.
            if len(parts) != 8:
                print(f'unexpected CTC line ({len(parts)} fields) in {filepath}: {line}')
                continue

            _, fcst_var, total, fy_oy, fy_on, fn_oy, fn_on, ec_value = parts
            records.append({
                'folder': os.path.basename(folder),
                'file': filename,
                'pollutant': pollutant,
                'datetime': timestamp,
                'threshold_expr': threshold,
                'fcst_var': fcst_var,
                'total': int(total),
                'FY_OY': int(fy_oy),
                'FY_ON': int(fy_on),
                'FN_OY': int(fn_oy),
                'FN_ON': int(fn_on),
                'EC_VALUE': float(ec_value)
            })

# An empty DataFrame has no 'threshold_expr' column, so the groupby below
# would fail with an uninformative KeyError; report the real cause instead.
if not records:
    raise ValueError(
        f'no CTC records parsed from {os.path.join(folder_pattern, file_pattern)}'
    )

# Create DataFrame
df = pd.DataFrame(records)

# Sum the contingency counts over all files for each threshold expression
sum_df = df.groupby(['threshold_expr'])[count_columns].sum()
sum_df.to_csv(f'{base_dir}/CTC.{variable}.csv')



In [27]:
# Show the per-threshold sums of the contingency-table counts
sum_df

Unnamed: 0_level_0,total,FY_OY,FY_ON,FN_OY,FN_ON
threshold_expr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
>0&&<=7e-8,52488,0,0,50874,1614
>7e-8,52488,415,52073,0,0
