# 2. Preprocess the MSWEP_V280_r

### 2.1 Transfer the MSWEP data

In [4]:
from pathlib import Path
import subprocess

# Remote rsync source in `user@host:/path` form. It is kept as a plain string
# because it is not a local filesystem path: pathlib would treat it as one and
# may normalise it (separators, repeated or trailing slashes).
input_dir = "xuxh22@172.16.102.36:/tera11/zhwei/share/Reference/Grid/sources/Precipitation/MSWEP_V280/Precipitation_merged"

# Local destination directory for the raw yearly files; created when missing.
output_dir = Path("/share/home/dq076/bedrock/data/P/MSWEP_V280/rawdata")
output_dir.mkdir(parents=True, exist_ok=True)

# rsync evaluates filter rules in order and the first match wins, so the
# includes must come before the catch-all exclude.
cmd = [
    "rsync",
    "-avh",                                   # archive + verbose + human-readable
    "--info=progress2",                       # overall transfer progress
    "--prune-empty-dirs",                     # do not create directories that end up empty
    "--include=*/",                           # keep directories so rsync can descend into them
    "--include=Precipitation_200[3-9].nc",    # 2003-2009
    "--include=Precipitation_201[0-9].nc",    # 2010-2019
    "--include=Precipitation_2020.nc",        # 2020
    "--exclude=*",                            # everything else is skipped
    f"{input_dir.rstrip('/')}/",              # trailing slash: copy the directory's contents
    f"{output_dir}/",
]

# check=True raises CalledProcessError when rsync exits non-zero, so the
# completion message below is only printed after a successful run.
subprocess.run(cmd, check=True)
print("完成：2003-2020 全部 Precipitation_YYYY.nc 同步")


/home/xuxh22/.bashrc: line 13: /opt/intel/oneapi/compiler/latest/env/vars.sh: No such file or directory


receiving file list ... done
./
Precipitation_2003.nc
          9.50G   5%   14.22MB/s    0:10:36 (xfr#1, to-chk=18/20)
Precipitation_2004.nc
         19.02G  11%   14.49MB/s    0:20:51 (xfr#2, to-chk=17/20)
Precipitation_2005.nc
         28.52G  16%   13.43MB/s    0:33:44 (xfr#3, to-chk=16/20)
Precipitation_2006.nc
         38.02G  22%   14.31MB/s    0:42:13 (xfr#4, to-chk=15/20)
Precipitation_2007.nc
         47.51G  27%   14.87MB/s    0:50:46 (xfr#5, to-chk=14/20)
Precipitation_2008.nc
         57.04G  33%   14.87MB/s    1:00:57 (xfr#6, to-chk=13/20)
Precipitation_2009.nc
         66.54G  38%   15.74MB/s    1:07:11 (xfr#7, to-chk=12/20)
Precipitation_2010.nc
         76.03G  44%   15.85MB/s    1:16:14 (xfr#8, to-chk=11/20)
Precipitation_2011.nc
         85.53G  49%   15.71MB/s    1:26:32 (xfr#9, to-chk=10/20)
Precipitation_2012.nc
         95.05G  55%   15.83MB/s    1:35:26 (xfr#10, to-chk=9/20)
Precipitation_2013.nc
        104.55G  61%   15.86MB/s    1:44:45 (xfr#11, to-chk=8/20)


### 2.2 Convert the per-year 1D p1 NetCDF files into a single all-years 8D p05 NetCDF-4 file

In [1]:
import os
import subprocess
import calendar
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed

# --- Directory layout --------------------------------------------------------
ROOT_DIR = Path("/share/home/dq076/bedrock/data/P/MSWEP_V280").resolve()
IN_DIR = ROOT_DIR.joinpath("rawdata")   # yearly input files
TMP_DIR = ROOT_DIR.joinpath("_tmp")     # per-year intermediate files
OUT_DIR = ROOT_DIR.joinpath("8D")       # per-year final files

# --- Remapping inputs (both are handed to the cdo `remap` operator) ----------
p05_path = Path("/share/home/dq076/bedrock/data/p05.txt").resolve()
# NOTE(review): presumably precomputed p1 -> p05 remapping weights; it is
# expected to sit next to the raw data - confirm the file exists there.
p1_to_p05_path = IN_DIR.joinpath("p1_to_p05.nc")

# --- Period to process (inclusive) -------------------------------------------
START_YEAR, END_YEAR = 2003, 2020

# --- File name templates -----------------------------------------------------
IN_PATTERN = "Precipitation_{year}.nc"
# NOTE(review): this template ends in `_p1_mm8d.nc`, whereas the per-year files
# written by process_one_year end in `_p05_mm8d-1.nc` - confirm which is meant.
OUT_PATTERN = "P_MSWEP_{year}_8D_p1_mm8d.nc"

# --- Temporal aggregation ----------------------------------------------------
STEPS_PER_8DAY = 8                           # input timesteps summed per window
FULL_WINDOWS = 45                            # number of complete windows per year
FULL_STEPS = STEPS_PER_8DAY * FULL_WINDOWS   # 360
N_WINDOWS_TOTAL = 46                         # complete windows + one tail window

# --- Parallelism -------------------------------------------------------------
_cpu_count = os.cpu_count()
JOBS = min(4, _cpu_count) if _cpu_count else 4   # at most 4 worker processes
OMP_NUM_THREADS = "8"                            # exported to the environment in main()

# Make sure the output and scratch directories exist before any job starts.
for _directory in (OUT_DIR, TMP_DIR):
    _directory.mkdir(parents=True, exist_ok=True)


def run(cmd, cwd=None):
    """Execute an external command and return its stripped standard output.

    Args:
        cmd: Iterable of command arguments; every item is converted with
            ``str`` before the command is launched.
        cwd: Optional working directory for the child process. A falsy value
            means the current working directory is inherited.

    Returns:
        The command's captured stdout with surrounding whitespace removed.

    Raises:
        RuntimeError: If the command exits with a non-zero status. The message
            contains the command line plus the captured stdout and stderr.
    """
    argv = list(map(str, cmd))
    workdir = None if not cwd else str(cwd)

    proc = subprocess.run(argv, cwd=workdir, capture_output=True, text=True)

    if proc.returncode == 0:
        return proc.stdout.strip()

    raise RuntimeError(
        "Command failed:\n"
        f"{' '.join(argv)}\n\n"
        f"STDOUT:\n{proc.stdout}\n\n"
        f"STDERR:\n{proc.stderr}\n"
    )


def process_one_year(year: int) -> str:
    """Aggregate one yearly file into 8-step sums and remap it to the p05 grid.

    The work is done by four consecutive ``cdo`` calls:

    1. sum timesteps 1..FULL_STEPS in groups of STEPS_PER_8DAY,
    2. sum the remaining timesteps (FULL_STEPS+1 .. last) into one tail step,
    3. concatenate both results along time,
    4. remap the concatenated file with ``p05_path`` and ``p1_to_p05_path``.

    Intermediate files are written below ``TMP_DIR/<year>`` and are kept; the
    final file is written to ``OUT_DIR``.

    Args:
        year: Calendar year to process; selects the input file via IN_PATTERN.

    Returns:
        A one-line ``[OK]`` status message for logging.

    Raises:
        FileNotFoundError: If the yearly input file does not exist.
        RuntimeError: If any ``cdo`` call fails (raised by ``run``).
    """
    # NOTE(review): ndays is only used for the tail file name and assumes the
    # input holds one timestep per day - confirm against the raw data.
    ndays = 366 if calendar.isleap(year) else 365

    in_path = IN_DIR / IN_PATTERN.format(year=year)
    if not in_path.is_file():
        # Fail early with a clear message instead of a cdo open error.
        raise FileNotFoundError(f"Input file not found for {year}: {in_path}")

    subdir = TMP_DIR / f"{year}"
    subdir.mkdir(parents=True, exist_ok=True)

    tail_start = FULL_STEPS + 1
    tmp1_path = subdir / f"P_MSWEP_{year}001-{year}{FULL_STEPS:03d}_8D_p1_mm8d-1.nc"
    tmp2_path = subdir / f"P_MSWEP_{year}{tail_start:03d}-{year}{ndays:03d}_8D_p1_mm8d-1.nc"
    tmp3_path = subdir / f"P_MSWEP_{year}_8D_p1_mm8d-1.nc"
    out_path = OUT_DIR / f"P_MSWEP_{year}_8D_p05_mm8d-1.nc"

    # Options shared by every call: overwrite output, lock I/O, write
    # compressed NetCDF-4 classic.
    cdo = ["cdo", "-O", "-L", "-f", "nc4c", "-z", "zip_3"]

    # 1) first FULL_STEPS timesteps -> FULL_WINDOWS timesteps (8-step sums)
    run([
        *cdo,
        f"timselsum,{STEPS_PER_8DAY}",
        f"-seltimestep,1/{FULL_STEPS}",
        in_path,
        tmp1_path,
    ])

    # 2) remainder -> 1 timestep (tail sum); "-1" selects up to the last step
    run([
        *cdo,
        "timsum",
        f"-seltimestep,{tail_start}/-1",
        in_path,
        tmp2_path,
    ])

    # 3) merge -> N_WINDOWS_TOTAL timesteps
    run([
        *cdo,
        "mergetime",
        tmp1_path,
        tmp2_path,
        tmp3_path,
    ])

    # 4) remap the merged file to the p05 grid
    run([
        *cdo,
        f"remap,{p05_path},{p1_to_p05_path}",
        tmp3_path,
        out_path,
    ])

    # cleanup (intentionally disabled: intermediates are kept for inspection)
    # tmp1_path.unlink(missing_ok=True)
    # tmp2_path.unlink(missing_ok=True)

    # NOTE(review): the message names the intermediate p1 file, not the final
    # p05 output written to out_path - confirm this is the intended report.
    return (
        f"[OK] {year}: {tmp3_path.name} "
        f"({N_WINDOWS_TOTAL} windows: {FULL_WINDOWS}*{STEPS_PER_8DAY}day + tail)"
    )


def main():
    """Process every year in parallel, then merge the yearly outputs.

    Each year from START_YEAR to END_YEAR (inclusive) is submitted to a pool
    of JOBS worker processes running ``process_one_year``. When all years have
    succeeded, the per-year p05 files are concatenated along time into a
    single NetCDF-4 file under ROOT_DIR.

    Raises:
        Exception: Re-raises the first failure reported by a worker. Years
            that have not started yet are cancelled; years already running
            still finish before the exception propagates, because leaving the
            executor's ``with`` block waits for them.
        RuntimeError: If the final ``cdo mergetime`` call fails.
    """
    # Set before the pool starts so the worker processes, and the cdo
    # subprocesses they launch, inherit it.
    os.environ["OMP_NUM_THREADS"] = OMP_NUM_THREADS
    years = list(range(START_YEAR, END_YEAR + 1))

    print(f"[INFO] IN={IN_DIR}")
    print(f"[INFO] OUT={OUT_DIR}")
    print(f"[INFO] Years={START_YEAR}-{END_YEAR}")
    print(f"[INFO] cpu={os.cpu_count()} max_workers={JOBS} OMP_NUM_THREADS={OMP_NUM_THREADS}")

    with ProcessPoolExecutor(max_workers=JOBS) as ex:
        futs = {ex.submit(process_one_year, y): y for y in years}
        for fut in as_completed(futs):
            y = futs[fut]
            try:
                print(fut.result())
            except Exception as e:
                print(f"[ERR] {y} failed:\n{e}")
                # Drop the queued years; otherwise they would all still be
                # processed before the error surfaces. cancel() is a no-op
                # for futures that are already running or finished.
                for pending in futs:
                    pending.cancel()
                raise

    files1 = [
        str(OUT_DIR / f"P_MSWEP_{year}_8D_p05_mm8d-1.nc")
        for year in years
    ]

    # Derive the period from the configured years so the file name always
    # matches its content.
    merge1_path = ROOT_DIR / f"P_MSWEP_{START_YEAR}-{END_YEAR}_8D_p05_mm8d-1.nc"
    run(["cdo", "-O", "-L", "-f", "nc4", "-z", "zip_3", "mergetime", *files1, str(merge1_path)])

    print("===> ALL DONE")

# Entry point: start the full pipeline when this code is executed directly
# (as a script or as a notebook cell), but not when it is imported.
if __name__ == "__main__":
    main()


[INFO] IN=/share/home/dq076/bedrock/data/P/MSWEP_V280/rawdata
[INFO] OUT=/share/home/dq076/bedrock/data/P/MSWEP_V280/8D
[INFO] Years=2003-2020
[INFO] cpu=48 max_workers=4 OMP_NUM_THREADS=8
[OK] 2006: P_MSWEP_2006_8D_p1_mm8d-1.nc (46 windows: 45*8day + tail)
[OK] 2003: P_MSWEP_2003_8D_p1_mm8d-1.nc (46 windows: 45*8day + tail)
[OK] 2005: P_MSWEP_2005_8D_p1_mm8d-1.nc (46 windows: 45*8day + tail)
[OK] 2004: P_MSWEP_2004_8D_p1_mm8d-1.nc (46 windows: 45*8day + tail)
[OK] 2008: P_MSWEP_2008_8D_p1_mm8d-1.nc (46 windows: 45*8day + tail)
[OK] 2007: P_MSWEP_2007_8D_p1_mm8d-1.nc (46 windows: 45*8day + tail)
[OK] 2010: P_MSWEP_2010_8D_p1_mm8d-1.nc (46 windows: 45*8day + tail)
[OK] 2009: P_MSWEP_2009_8D_p1_mm8d-1.nc (46 windows: 45*8day + tail)
[OK] 2011: P_MSWEP_2011_8D_p1_mm8d-1.nc (46 windows: 45*8day + tail)
[OK] 2013: P_MSWEP_2013_8D_p1_mm8d-1.nc (46 windows: 45*8day + tail)
[OK] 2014: P_MSWEP_2014_8D_p1_mm8d-1.nc (46 windows: 45*8day + tail)
[OK] 2012: P_MSWEP_2012_8D_p1_mm8d-1.nc (46 windows: