# 2. Preprocess the MSWX_V100_r data

### 2.1 Transfer the MSWX data

In [2]:
from pathlib import Path
import subprocess

# Source of the MSWX forcing files and the local destination for the raw copies.
input_dir = Path("/share/home/dq013/zhwei/colm/data/CoLM_Forcing/MSWX_V100_r")
output_dir = Path("/share/home/dq076/bedrock/data/P/MSWX_V100_r/rawdata")
output_dir.mkdir(parents=True, exist_ok=True)

# Sync only P_2003_01.nc ... P_2020_12.nc. rsync applies filters in order, so
# the include rules must come before the final catch-all exclude.
cmd = (
    [
        "rsync",
        "-avh",  # archive + verbose + human-readable
        "--info=progress2",
        "--prune-empty-dirs",
        "--include=*/",  # keep descending into sub-directories
    ]
    + [
        f"--include={pattern}"
        for pattern in (
            "P_200[3-9]_[0-1][0-9].nc",  # 2003-2009
            "P_201[0-9]_[0-1][0-9].nc",  # 2010-2019
            "P_2020_[0-1][0-9].nc",      # 2020
        )
    ]
    + [
        "--exclude=*",
        # Trailing slashes: copy the directory contents, not the directory itself.
        f"{input_dir}/",
        f"{output_dir}/",
    ]
)

# check=True raises CalledProcessError if rsync exits with a non-zero status.
subprocess.run(cmd, check=True)
print("完成：2003-2020 全部 P_YYYY_MM.nc 同步")


building file list ... done
./
P_2005_11.nc
        496.75M   0%  155.36MB/s    0:00:03 (xfr#1, to-chk=181/217)
P_2005_12.nc
          1.00G   0%  152.40MB/s    0:00:06 (xfr#2, to-chk=180/217)
P_2006_01.nc
          1.51G   1%  152.26MB/s    0:00:09 (xfr#3, to-chk=179/217)
P_2006_02.nc
          1.98G   1%  151.67MB/s    0:00:12 (xfr#4, to-chk=178/217)
P_2006_03.nc
          2.49G   2%  150.02MB/s    0:00:15 (xfr#5, to-chk=177/217)
P_2006_04.nc
          2.99G   2%  149.47MB/s    0:00:19 (xfr#6, to-chk=176/217)
P_2006_05.nc
          3.51G   3%  150.05MB/s    0:00:22 (xfr#7, to-chk=175/217)
P_2006_06.nc
          4.03G   3%  150.19MB/s    0:00:25 (xfr#8, to-chk=174/217)
P_2006_07.nc
          4.56G   4%  150.51MB/s    0:00:28 (xfr#9, to-chk=173/217)
P_2006_08.nc
          5.10G   4%  150.72MB/s    0:00:32 (xfr#10, to-chk=172/217)
P_2006_09.nc
          5.62G   5%  150.73MB/s    0:00:35 (xfr#11, to-chk=171/217)
P_2006_10.nc
          6.14G   5%  150.62MB/s    0:00:38 (xfr#12, to-chk=170

### 2.2 Merge the monthly data into yearly files

In [6]:
#!/usr/bin/env python3
from pathlib import Path
import subprocess

# Monthly raw files (input) and yearly 3-hourly files (output).
input_dir = "/share/home/dq076/bedrock/data/P/MSWX_V100_r/rawdata"
output_dir = "/share/home/dq076/bedrock/data/P/MSWX_V100_r/3H"

# parents=True so the cell also works when intermediate directories are missing.
Path(output_dir).mkdir(parents=True, exist_ok=True)

for y in range(2003, 2021):
    # The twelve monthly files of year y, in calendar order.
    files = [f"{input_dir}/P_{y}_{m:02d}.nc" for m in range(1, 13)]
    # Argument list instead of a shell string: no shell parsing of the paths.
    cmd = [
        "cdo", "-f", "nc4c", "-z", "zip_3",
        "-mergetime",
        *files,
        f"{output_dir}/P_MSWX_{y}_3H_p1_mm3h.nc",
    ]
    # check=True: a failing cdo call raises CalledProcessError instead of being
    # reported as completed below.
    subprocess.run(cmd, check=True)
    print(f"完成 {y}")

完成 2003
完成 2004
完成 2005
完成 2006
完成 2007
完成 2008
完成 2009
完成 2010
完成 2011
完成 2012
完成 2013
完成 2014
完成 2015
完成 2016
完成 2017
完成 2018
完成 2019
完成 2020


### 2.3 Aggregate the 3-hourly data into 8-day sums

In [1]:
#!/usr/bin/env python3
"""
One-click: MSWX yearly 3-hourly (3H) -> 8-day (8D) non-overlapping sums

Definition (consistent with your CHIRPS script):
- Always output 46 windows per year:
  - first 45 windows: 8-day sums aligned to Jan 1 (each = 64 x 3H timesteps)
  - last window: sum of the remaining timesteps (end-of-year remainder)
Output format: NetCDF4 classic + zip_3 compression (nc4c, zip_3)

Requires: CDO available in PATH.
"""

from pathlib import Path
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import sys

# =========================
# CONFIG
# =========================
# Yearly 3-hourly inputs and 8-day outputs live side by side under one root.
_DATA_ROOT = Path("/share/home/dq076/bedrock/data/P/MSWX_V100_r")
IN_DIR = (_DATA_ROOT / "3H").resolve()
OUT_DIR = (_DATA_ROOT / "8D").resolve()

# Inclusive range of years to process.
START_YEAR = 2003
END_YEAR = 2020

# File name templates, filled in with str.format(year=...).
IN_PATTERN = "P_MSWX_{year}_3H_p1_mm3h.nc"
OUT_PATTERN = "P_MSWX_{year}_8D_p1_mm8d.nc"

# CDO executable and the output format / compression passed to -f and -z.
CDO = "cdo"
FORMAT = "nc4c"
ZIP = "zip_3"

# 3-hourly -> 8-day: 8 timesteps per day * 8 days = 64 timesteps per window.
STEPS_PER_8DAY = (24 // 3) * 8
FULL_WINDOWS = 45
FULL_STEPS = FULL_WINDOWS * STEPS_PER_8DAY  # 2880 (first 360 days)
N_WINDOWS_TOTAL = FULL_WINDOWS + 1  # 45 full windows + the end-of-year remainder

# Parallelism (years in parallel). Recommend 2-6 depending on filesystem.
MAX_WORKERS = 4

# OpenMP threads for each CDO call (if your CDO supports OpenMP)
OMP_NUM_THREADS = "8"

# Temporary directory (local scratch is best if available)
TMP_DIR = OUT_DIR / "_tmp_3H_to_8D"

# Create the output and scratch directories up front.
OUT_DIR.mkdir(parents=True, exist_ok=True)
TMP_DIR.mkdir(parents=True, exist_ok=True)
# =========================


def run(cmd: list[str]) -> None:
    """Execute *cmd* as a child process and fail loudly on a non-zero exit.

    The child inherits the current environment and standard streams.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            return code.
    """
    completed = subprocess.run(cmd, check=False)
    completed.check_returncode()


def process_year(year: int) -> str:
    """Aggregate one yearly 3-hourly file into 46 non-overlapping sums.

    The first ``FULL_STEPS`` timesteps are summed in groups of
    ``STEPS_PER_8DAY`` (45 windows); all remaining timesteps are summed into
    one final window. The two pieces are then merged along time.

    The merged result is first written to a staging file next to the final
    output and renamed into place only after all CDO calls succeeded, so a
    failed or interrupted run never leaves a partial file that a later run
    would skip as "exists". Temporary files are removed on success and on
    failure.

    Args:
        year: Calendar year used to fill IN_PATTERN / OUT_PATTERN.

    Returns:
        A one-line status message starting with "[SKIP]" (input missing or
        output already present) or "[OK]".

    Raises:
        subprocess.CalledProcessError: if any CDO command fails.
    """
    in_file = IN_DIR / IN_PATTERN.format(year=year)
    out_file = OUT_DIR / OUT_PATTERN.format(year=year)

    if not in_file.exists():
        return f"[SKIP] {year}: missing {in_file.name}"

    # Quick idempotency: a present output is always complete, because it is
    # only ever created by the atomic rename below.
    if out_file.exists():
        return f"[SKIP] {year}: exists {out_file.name}"

    # Temp files
    part45 = TMP_DIR / f"part45_{year}.nc"
    tail = TMP_DIR / f"tail_{year}.nc"
    # Staged in the output directory so the final rename stays on one
    # filesystem, even if TMP_DIR is moved to local scratch.
    staged = out_file.with_name(f".{out_file.name}.partial")

    # Options shared by all three CDO calls.
    cdo_base = [CDO, "-O", "-L", "-f", FORMAT, "-z", ZIP]

    # 1) First 45 windows (2880 timesteps) -> timselsum,64 => 45 timesteps
    cmd1 = [
        *cdo_base,
        f"timselsum,{STEPS_PER_8DAY}",
        f"-seltimestep,1/{FULL_STEPS}",
        str(in_file),
        str(part45),
    ]

    # 2) Tail remainder timesteps (2881..end) -> timsum => 1 timestep
    #    NOTE(review): relies on CDO accepting -1 as "last timestep" in
    #    seltimestep, and on the input holding more than FULL_STEPS steps.
    cmd2 = [
        *cdo_base,
        "timsum",
        f"-seltimestep,{FULL_STEPS + 1}/-1",
        str(in_file),
        str(tail),
    ]

    # 3) Merge => 46 timesteps, written to the staging file first
    cmd3 = [
        *cdo_base,
        "mergetime",
        str(part45),
        str(tail),
        str(staged),
    ]

    try:
        run(cmd1)
        run(cmd2)
        run(cmd3)
        # Publish the finished file in one step.
        staged.replace(out_file)
    finally:
        # Best-effort cleanup on both success and failure; after a successful
        # rename the staging path no longer exists and unlink is a no-op.
        for leftover in (part45, tail, staged):
            try:
                leftover.unlink(missing_ok=True)
            except OSError as exc:
                print(
                    f"[WARN] {year}: could not remove {leftover}: {exc}",
                    file=sys.stderr,
                )

    return f"[OK] {year}: {out_file.name} (46 windows: 45*8day + tail)"


def main() -> int:
    """Process every configured year in a thread pool and report the outcome.

    Sets OMP_NUM_THREADS in this process's environment (inherited by the
    child processes), prints a configuration banner, submits one
    ``process_year`` task per year and prints each status line as the task
    finishes. Failures are reported on stderr and counted.

    Returns:
        0 if no year raised an exception, 1 otherwise.
    """
    os.environ["OMP_NUM_THREADS"] = OMP_NUM_THREADS

    rule = "=" * 70
    banner = (
        rule,
        "MSWX yearly 3H -> 8D (46 windows/year) one-click",
        f"IN : {IN_DIR}",
        f"OUT: {OUT_DIR}",
        f"Years: {START_YEAR}-{END_YEAR}",
        f"CDO: {CDO} | format={FORMAT} | compression={ZIP}",
        f"Parallel years: {MAX_WORKERS} | OMP_NUM_THREADS={OMP_NUM_THREADS}",
        rule,
    )
    for line in banner:
        print(line)

    n_failed = 0
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Map each future back to its year so failures can be attributed.
        year_of = {
            pool.submit(process_year, yr): yr
            for yr in range(START_YEAR, END_YEAR + 1)
        }
        # Report in completion order, not submission order.
        for done in as_completed(year_of):
            yr = year_of[done]
            try:
                print(done.result())
            except subprocess.CalledProcessError as err:
                n_failed += 1
                print(f"[FAIL] {yr}: command failed with code {err.returncode}", file=sys.stderr)
            except Exception as err:
                n_failed += 1
                print(f"[FAIL] {yr}: {err}", file=sys.stderr)

    print(rule)
    if not n_failed:
        print("Done. All requested years processed.")
        return 0
    print(f"Done with failures: {n_failed} year(s) failed.", file=sys.stderr)
    return 1


# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())


MSWX yearly 3H -> 8D (46 windows/year) one-click
IN : /share/home/dq076/bedrock/data/P/MSWX_V100_r/3H
OUT: /share/home/dq076/bedrock/data/P/MSWX_V100_r/8D
Years: 2003-2020
CDO: cdo | format=nc4c | compression=zip_3
Parallel years: 4 | OMP_NUM_THREADS=8
[SKIP] 2003: exists P_MSWX_2003_8D_p1_mm8d.nc
[OK] 2006: P_MSWX_2006_8D_p1_mm8d.nc (46 windows: 45*8day + tail)
[OK] 2004: P_MSWX_2004_8D_p1_mm8d.nc (46 windows: 45*8day + tail)
[OK] 2005: P_MSWX_2005_8D_p1_mm8d.nc (46 windows: 45*8day + tail)
[OK] 2007: P_MSWX_2007_8D_p1_mm8d.nc (46 windows: 45*8day + tail)
[OK] 2008: P_MSWX_2008_8D_p1_mm8d.nc (46 windows: 45*8day + tail)
[OK] 2009: P_MSWX_2009_8D_p1_mm8d.nc (46 windows: 45*8day + tail)
[OK] 2010: P_MSWX_2010_8D_p1_mm8d.nc (46 windows: 45*8day + tail)
[OK] 2011: P_MSWX_2011_8D_p1_mm8d.nc (46 windows: 45*8day + tail)
[OK] 2013: P_MSWX_2013_8D_p1_mm8d.nc (46 windows: 45*8day + tail)
[OK] 2012: P_MSWX_2012_8D_p1_mm8d.nc (46 windows: 45*8day + tail)
[OK] 2015: P_MSWX_2015_8D_p1_mm8d.nc (46 

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
