# 2. Preprocess the MSWEP_V280_r

### 2.1 Transfer the MSWEP data

In [4]:
from pathlib import Path
import subprocess

# Remote rsync source (user@host:path) and the local directory receiving the raw yearly files.
input_dir = Path("xuxh22@172.16.102.36:/tera11/zhwei/share/Reference/Grid/sources/Precipitation/MSWEP_V280/Precipitation_merged")
output_dir = Path("/share/home/dq076/bedrock/data/P/MSWEP_V280/rawdata")
output_dir.mkdir(parents=True, exist_ok=True)

# Year globs that together select Precipitation_2003.nc .. Precipitation_2020.nc.
year_filters = [
    f"--include=Precipitation_{year_glob}.nc"
    for year_glob in ("200[3-9]", "201[0-9]", "2020")
]

# -avh = archive + verbose + human-readable. Directories are included so the
# file filters can be evaluated inside them; everything not explicitly
# included is excluded by the final "--exclude=*".
cmd = [
    "rsync",
    "-avh",
    "--info=progress2",
    "--prune-empty-dirs",
    "--include=*/",
    *year_filters,
    "--exclude=*",
    f"{input_dir}/",
    f"{output_dir}/",
]

# check=True turns a non-zero rsync exit status into CalledProcessError.
subprocess.run(cmd, check=True)
print("完成：2003-2020 全部 Precipitation_YYYY.nc 同步")


/home/xuxh22/.bashrc: line 13: /opt/intel/oneapi/compiler/latest/env/vars.sh: No such file or directory


receiving file list ... done
./
Precipitation_2003.nc
          9.50G   5%   14.22MB/s    0:10:36 (xfr#1, to-chk=18/20)
Precipitation_2004.nc
         19.02G  11%   14.49MB/s    0:20:51 (xfr#2, to-chk=17/20)
Precipitation_2005.nc
         28.52G  16%   13.43MB/s    0:33:44 (xfr#3, to-chk=16/20)
Precipitation_2006.nc
         38.02G  22%   14.31MB/s    0:42:13 (xfr#4, to-chk=15/20)
Precipitation_2007.nc
         47.51G  27%   14.87MB/s    0:50:46 (xfr#5, to-chk=14/20)
Precipitation_2008.nc
         57.04G  33%   14.87MB/s    1:00:57 (xfr#6, to-chk=13/20)
Precipitation_2009.nc
         66.54G  38%   15.74MB/s    1:07:11 (xfr#7, to-chk=12/20)
Precipitation_2010.nc
         76.03G  44%   15.85MB/s    1:16:14 (xfr#8, to-chk=11/20)
Precipitation_2011.nc
         85.53G  49%   15.71MB/s    1:26:32 (xfr#9, to-chk=10/20)
Precipitation_2012.nc
         95.05G  55%   15.83MB/s    1:35:26 (xfr#10, to-chk=9/20)
Precipitation_2013.nc
        104.55G  61%   15.86MB/s    1:44:45 (xfr#11, to-chk=8/20)


### 2.2 Aggregate the daily data to 8-day sums

In [2]:
#!/usr/bin/env python3
"""
One-click: MSWEP yearly 1-day (1D) -> 8-day (8D) non-overlapping sums

Definition (consistent with your CHIRPS script):
- Always output 46 windows per year:
  - first 45 windows: 8-day sums aligned to Jan 1 (each = 8 x 1D timesteps)
  - last window: sum of the remaining timesteps (end-of-year remainder)
Output format: NetCDF4 classic + zip_3 compression (nc4c, zip_3)

Requires: CDO available in PATH.
"""

from pathlib import Path
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import sys

# =========================
# CONFIG
# =========================
# Yearly input files and the 8-day outputs live under one dataset root.
_DATASET_ROOT = Path("/share/home/dq076/bedrock/data/P/MSWEP_V280")
IN_DIR = (_DATASET_ROOT / "rawdata").resolve()
OUT_DIR = (_DATASET_ROOT / "8D").resolve()

# Years to process (both ends inclusive).
START_YEAR = 2003
END_YEAR = 2020

# File-name templates, filled in with the year.
IN_PATTERN = "Precipitation_{year}.nc"
OUT_PATTERN = "P_MSWEP_{year}_8D_p1_mm8d.nc"

# CDO executable plus the output format / compression passed via -f and -z.
CDO = "cdo"
FORMAT = "nc4c"
ZIP = "zip_3"

# Daily (1D) input -> 8-day windows: each full window spans 8 timesteps.
STEPS_PER_8DAY = 8
FULL_WINDOWS = 45
FULL_STEPS = STEPS_PER_8DAY * FULL_WINDOWS  # timesteps covered by the 45 full windows (first 360 days)
N_WINDOWS_TOTAL = FULL_WINDOWS + 1  # 45 full windows + 1 end-of-year remainder window

# Number of years processed concurrently. Recommend 2-6 depending on filesystem.
MAX_WORKERS = 4

# OpenMP threads for each CDO call (if your CDO supports OpenMP)
OMP_NUM_THREADS = "8"

# Scratch directory for intermediate files (local scratch is best if available)
TMP_DIR = OUT_DIR / "_tmp_1D_to_8D"

# Create the output and scratch directories up front.
OUT_DIR.mkdir(parents=True, exist_ok=True)
TMP_DIR.mkdir(parents=True, exist_ok=True)
# =========================


def run(cmd: list[str]) -> None:
    """Execute *cmd* as a child process and fail loudly on a non-zero exit.

    The child inherits the current environment and standard streams.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            status.
    """
    completed = subprocess.run(cmd)
    completed.check_returncode()


def process_year(year: int) -> str:
    """Aggregate one yearly daily file into 46 non-overlapping window sums.

    The first FULL_WINDOWS (45) windows each sum STEPS_PER_8DAY (8)
    consecutive timesteps starting at timestep 1; the 46th window sums
    whatever timesteps remain after FULL_STEPS (360).

    The merged result is first written next to the final output under a
    temporary name and only renamed into place once CDO has finished, so an
    interrupted run cannot leave a truncated file that a later run would
    mistake for a completed year.

    Args:
        year: Calendar year used to fill IN_PATTERN / OUT_PATTERN.

    Returns:
        A one-line status message starting with "[SKIP]" (input missing or
        output already present) or "[OK]".

    Raises:
        subprocess.CalledProcessError: if any CDO invocation fails.
    """
    in_file = IN_DIR / IN_PATTERN.format(year=year)
    out_file = OUT_DIR / OUT_PATTERN.format(year=year)

    if not in_file.exists():
        return f"[SKIP] {year}: missing {in_file.name}"

    # Quick idempotency: a present output is treated as a finished year.
    if out_file.exists():
        return f"[SKIP] {year}: exists {out_file.name}"

    # Intermediate files.
    part45 = TMP_DIR / f"part45_{year}.nc"
    tail = TMP_DIR / f"tail_{year}.nc"
    # Kept in the output directory (not TMP_DIR) so the final rename stays on
    # one filesystem even if TMP_DIR is pointed at local scratch.
    merged = out_file.with_name(f"{out_file.stem}.partial{out_file.suffix}")

    cdo_base = [CDO, "-O", "-L", "-f", FORMAT, "-z", ZIP]

    try:
        # 1) Timesteps 1..FULL_STEPS (360) summed in groups of
        #    STEPS_PER_8DAY (8) => 45 timesteps.
        run(cdo_base + [
            f"timselsum,{STEPS_PER_8DAY}",
            f"-seltimestep,1/{FULL_STEPS}",
            str(in_file),
            str(part45),
        ])

        # 2) Remaining timesteps (FULL_STEPS + 1 = 361 .. last) => 1 timestep.
        run(cdo_base + [
            "timsum",
            f"-seltimestep,{FULL_STEPS + 1}/-1",
            str(in_file),
            str(tail),
        ])

        # 3) Concatenate along time => 46 timesteps.
        run(cdo_base + [
            "mergetime",
            str(part45),
            str(tail),
            str(merged),
        ])

        # Publish the finished file under its final name.
        merged.replace(out_file)
    finally:
        # Best-effort cleanup, also on failure; a file that cannot be removed
        # is reported but never masks the real error.
        for leftover in (part45, tail, merged):
            try:
                leftover.unlink(missing_ok=True)
            except OSError as exc:
                print(f"[WARN] {year}: could not remove {leftover}: {exc}", file=sys.stderr)

    return f"[OK] {year}: {out_file.name} (46 windows: 45*8day + tail)"


def main() -> int:
    """Process every configured year in a thread pool and report the outcome.

    Sets OMP_NUM_THREADS in this process's environment, prints a banner
    describing the configuration, submits one process_year() task per year
    from START_YEAR to END_YEAR (inclusive) and prints each status message
    as the task completes. Years reported as "[SKIP]" are not failures.

    Returns:
        0 if no task raised, 1 if at least one year failed.
    """
    os.environ["OMP_NUM_THREADS"] = OMP_NUM_THREADS

    rule = "=" * 70
    print(rule)
    print("MSWEP yearly 1D -> 8D (46 windows/year) one-click")
    print(f"IN : {IN_DIR}")
    print(f"OUT: {OUT_DIR}")
    print(f"Years: {START_YEAR}-{END_YEAR}")
    print(f"CDO: {CDO} | format={FORMAT} | compression={ZIP}")
    print(f"Parallel years: {MAX_WORKERS} | OMP_NUM_THREADS={OMP_NUM_THREADS}")
    print(rule)

    messages = []
    n_failed = 0

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Map each future back to its year so failures can be attributed.
        year_of = {
            pool.submit(process_year, year): year
            for year in range(START_YEAR, END_YEAR + 1)
        }
        # Report in completion order, not submission order.
        for finished in as_completed(year_of):
            year = year_of[finished]
            try:
                message = finished.result()
                print(message)
                messages.append(message)
            except subprocess.CalledProcessError as err:
                n_failed += 1
                print(f"[FAIL] {year}: command failed with code {err.returncode}", file=sys.stderr)
            except Exception as err:
                n_failed += 1
                print(f"[FAIL] {year}: {err}", file=sys.stderr)

    print(rule)
    if not n_failed:
        print("Done. All requested years processed.")
        return 0

    print(f"Done with failures: {n_failed} year(s) failed.", file=sys.stderr)
    return 1


# Entry point when executed as a script / notebook cell. main() returns 0 or
# 1, but the value is discarded here rather than passed to sys.exit(), so it
# does not become the process exit status.
if __name__ == "__main__":
    main()


MSWEP yearly 1D -> 8D (46 windows/year) one-click
IN : /share/home/dq076/bedrock/data/P/MSWEP_V280/rawdata
OUT: /share/home/dq076/bedrock/data/P/MSWEP_V280/8D
Years: 2003-2020
CDO: cdo | format=nc4c | compression=zip_3
Parallel years: 4 | OMP_NUM_THREADS=8
[SKIP] 2004: exists P_MSWEP_2004_8D_p1_mm8d.nc
[SKIP] 2003: exists P_MSWEP_2003_8D_p1_mm8d.nc
[SKIP] 2005: exists P_MSWEP_2005_8D_p1_mm8d.nc
[SKIP] 2006: exists P_MSWEP_2006_8D_p1_mm8d.nc
[SKIP] 2007: exists P_MSWEP_2007_8D_p1_mm8d.nc
[SKIP] 2009: exists P_MSWEP_2009_8D_p1_mm8d.nc
[SKIP] 2008: exists P_MSWEP_2008_8D_p1_mm8d.nc
[SKIP] 2010: exists P_MSWEP_2010_8D_p1_mm8d.nc
[SKIP] 2011: exists P_MSWEP_2011_8D_p1_mm8d.nc
[SKIP] 2012: exists P_MSWEP_2012_8D_p1_mm8d.nc
[SKIP] 2013: exists P_MSWEP_2013_8D_p1_mm8d.nc
[SKIP] 2014: exists P_MSWEP_2014_8D_p1_mm8d.nc
[SKIP] 2017: exists P_MSWEP_2017_8D_p1_mm8d.nc
[SKIP] 2016: exists P_MSWEP_2016_8D_p1_mm8d.nc
[SKIP] 2015: exists P_MSWEP_2015_8D_p1_mm8d.nc
[SKIP] 2019: exists P_MSWEP_2019_8D_p1