# 1. Preprocess the MOD16A2GF_v6.1 data

### 1.1 Download the MOD16A2GF_v6.1 data from NASA Earthdata using the `earthaccess` library (https://search.earthdata.nasa.gov/search?q=C2565791021-LPCLOUD)

In [None]:
import os
import time
from pathlib import Path
import calendar
import earthaccess

# =====================
# CONFIG
# =====================
# Directory that receives the downloaded HDF granules; created when this cell runs.
OUTROOT = Path("/share/home/dq076/bedrock/data/ET/MOD16A2GF_v6.1/rawdata").resolve()  # change this to your real path
OUTROOT.mkdir(parents=True, exist_ok=True)

# Collection short name and version passed to earthaccess.search_data().
SHORT_NAME = "MOD16A2GF"
VERSION = "061"

# Inclusive (year, month) bounds of the download period, consumed by month_range().
START_YM = (2003, 1)
END_YM   = (2020, 12)

# =====================
# UTILS
# =====================
def dir_size_bytes(d: Path) -> int:
    """Return the combined size, in bytes, of every regular file below ``d``.

    The tree is walked recursively. A file that disappears between being
    listed and being inspected contributes nothing instead of raising.
    """
    def _size_or_zero(entry: Path) -> int:
        try:
            return entry.stat().st_size
        except FileNotFoundError:
            return 0

    return sum(_size_or_zero(entry) for entry in d.rglob("*") if entry.is_file())

def month_range(start=None, end=None):
    """Yield ``(year, month)`` pairs from ``start`` through ``end``, inclusive.

    Args:
        start: ``(year, month)`` of the first pair. Defaults to ``START_YM``.
        end: ``(year, month)`` of the last pair. Defaults to ``END_YM``.

    Yields:
        Consecutive ``(year, month)`` tuples in chronological order. Nothing
        is yielded when ``start`` is later than ``end``.

    Raises:
        ValueError: if either month is outside 1..12 (raised on first iteration).
    """
    y, m = START_YM if start is None else start
    end_y, end_m = END_YM if end is None else end
    for label, month in (("start", m), ("end", end_m)):
        if not 1 <= month <= 12:
            raise ValueError(f"{label} month must be in 1..12, got {month}")

    # Tuple comparison orders by year first, then by month.
    while (y, m) <= (end_y, end_m):
        yield y, m
        m += 1
        if m > 12:
            # Roll over into January of the next year.
            y += 1
            m = 1

def ensure_netrc_permissions():
    """Check that ``~/.netrc`` exists and try to restrict it to mode 600.

    Raises:
        FileNotFoundError: if ``~/.netrc`` does not exist.
    """
    netrc = Path.home().joinpath(".netrc")
    if netrc.exists():
        # Mode 600 is the recommended permission; failing to set it is tolerated.
        try:
            os.chmod(netrc, 0o600)
        except PermissionError:
            pass
        return
    raise FileNotFoundError(f"未找到 {netrc}，请先创建并写入 urs.earthdata.nasa.gov 的登录信息。")

def login_with_netrc():
    """Log in to Earthdata, preferring the credentials stored in ``~/.netrc``.

    The ``login`` arguments accepted by earthaccess vary between releases, so
    two call forms are tried in order:

    - ``earthaccess.login(strategy="netrc")`` when the keyword is supported;
    - otherwise plain ``earthaccess.login()``, which usually also tries netrc.

    Raises:
        FileNotFoundError: if ``~/.netrc`` is missing (from
            ``ensure_netrc_permissions``).
    """
    ensure_netrc_permissions()
    try:
        # Keyword form, supported by some releases.
        earthaccess.login(strategy="netrc")
    except TypeError:
        # Fallback for older releases that reject the keyword.
        earthaccess.login()

def retry(func, retries=10, sleep=60):
    """Call ``func()`` until it succeeds or ``retries`` attempts have failed.

    Args:
        func: Zero-argument callable to invoke.
        retries: Maximum number of attempts; must be at least 1.
        sleep: Base delay in seconds. The wait after failed attempt ``i`` is
            ``sleep * i`` (linear back-off).

    Returns:
        Whatever ``func()`` returns on its first successful call.

    Raises:
        ValueError: if ``retries`` is smaller than 1. Previously the loop body
            never ran in that case and ``None`` was returned without calling
            ``func`` at all.
        Exception: the exception from the final failed attempt is re-raised.
    """
    if retries < 1:
        raise ValueError(f"retries must be >= 1, got {retries}")

    for attempt in range(1, retries + 1):
        try:
            return func()
        except Exception as exc:
            print(f"[WARN] attempt {attempt}/{retries} failed: {exc}")
            if attempt == retries:
                raise
            # Wait longer after every failure before trying again.
            time.sleep(sleep * attempt)

# =====================
# DOWNLOAD
# =====================
def download_one_month(year: int, month: int):
    """Search for and download all granules of one calendar month into ``OUTROOT``.

    The search uses ``SHORT_NAME`` / ``VERSION`` and a temporal window running
    from the first to the last day of the month. After the download, the
    growth of ``OUTROOT`` and the average throughput are printed.

    Args:
        year: Four-digit year.
        month: Month number, 1..12.

    Returns:
        None. Progress is reported via ``print``.
    """
    # First and last calendar day of the month.
    last_day = calendar.monthrange(year, month)[1]
    t_start = f"{year:04d}-{month:02d}-01"
    t_end = f"{year:04d}-{month:02d}-{last_day:02d}"
    tag = f"[{year:04d}-{month:02d}]"

    outdir = OUTROOT
    outdir.mkdir(parents=True, exist_ok=True)

    # Baseline for the size/throughput report printed at the end.
    size0 = dir_size_bytes(outdir)
    t0 = time.time()

    granules = earthaccess.search_data(
        short_name=SHORT_NAME,
        version=VERSION,
        temporal=(t_start, t_end),
    )

    print(f"{tag} temporal=({t_start},{t_end}) granules={len(granules)}")

    if not granules:
        # Nothing matched. Skip the download call: earthaccess.download is not
        # guaranteed to accept an empty list (TODO confirm for the installed
        # version), and a failure here would be retried by the caller for no
        # benefit.
        print(f"{tag} no granules found, nothing to download")
        return

    earthaccess.download(granules, local_path=str(outdir))

    elapsed = max(time.time() - t0, 1e-6)  # avoid division by zero
    added = dir_size_bytes(outdir) - size0
    speed = added / elapsed / (1024**2)

    print(f"{tag} +{added/1024**3:.2f} GB in {elapsed/60:.1f} min, avg {speed:.2f} MB/s")


# =====================
# RUN
# =====================
# Authenticate once, then fetch the archive month by month; every month is
# wrapped in retry() so that a failure only repeats that month.
login_with_netrc()

for y, m in month_range():
    # y/m are bound as lambda defaults so the callable keeps this iteration's values.
    retry(lambda y=y, m=m: download_one_month(y, m), retries=10, sleep=60)


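### 1.2 Mosaic the tiles of each 8-day composite, reproject to a 0.05° global grid, and convert to NetCDF-4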

In [None]:
import os
import re
import shutil
import subprocess
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
from datetime import datetime, timedelta

# Directory layout:
#   ROOT_DIR - receives the final merged time series
#   RAW_DIR  - downloaded HDF tiles (input)
#   TMP_DIR  - per-composite working directories with intermediate files
#   OUT_DIR  - one NetCDF-4 file per 8-day composite
ROOT_DIR = Path("/share/home/dq076/bedrock/data/ET/MOD16A2GF_v6.1").resolve()
RAW_DIR = Path("/share/home/dq076/bedrock/data/ET/MOD16A2GF_v6.1/rawdata").resolve()
TMP_DIR = Path("/share/home/dq076/bedrock/data/ET/MOD16A2GF_v6.1/_tmp").resolve()
OUT_DIR = Path("/share/home/dq076/bedrock/data/ET/MOD16A2GF_v6.1/8D").resolve()

# PRODUCT is the file-name prefix of the tiles; GRID_NAME and SDS_NAME select
# the subdataset inside each HDF file (used in the HDF4_EOS URI).
PRODUCT = "MOD16A2GF"
GRID_NAME = "MOD_Grid_MOD16A2"
SDS_NAME = "ET_500m"
# Name and attributes given to the output variable by cdo.
VAR_NAME = "ET"
VAR_LONG_NAME = "Evapotranspiration"
VAR_UNITS = "mm 8d-1"

# gdalwarp settings (kept as strings because they are passed on the command line):
# target resolution in degrees, resampling method, source/destination nodata
# values, and the DEFLATE compression level.
RES_DEG = "0.05"
RESAMPLE = "near"
SRC_NODATA = "32767"
# NOTE(review): with 0 as destination nodata, cells whose value is exactly 0
# cannot be told apart from missing cells - confirm this is intended.
DST_NODATA = "0"
ZLEVEL = "3"

# JOBS worker processes run in parallel, each starting gdalwarp with NUM_THREADS threads.
JOBS = min(4, os.cpu_count() or 4)
NUM_THREADS = 8

# If True: build DOY list by fixed range. Otherwise scan RAW_DIR (recommended).
USE_FIXED_RANGE = False
START_YEAR, START_DOY = 2003, 1
END_YEAR, END_DOY = 2020, 361

# Make sure the output and scratch directories exist before any worker starts.
OUT_DIR.mkdir(parents=True, exist_ok=True)
TMP_DIR.mkdir(parents=True, exist_ok=True)


def run(cmd, cwd=None):
    """Execute an external command and return its stripped standard output.

    Args:
        cmd: Sequence of command-line tokens; each is converted with ``str``.
        cwd: Optional working directory for the child process.

    Returns:
        The captured stdout with leading/trailing whitespace removed.

    Raises:
        RuntimeError: if the command exits with a non-zero status. The message
            contains the command line plus the captured stdout and stderr.
    """
    argv = list(map(str, cmd))
    workdir = str(cwd) if cwd else None
    proc = subprocess.run(argv, cwd=workdir, capture_output=True, text=True)

    if proc.returncode == 0:
        return proc.stdout.strip()

    report = (
        "Command failed:\n"
        f"{' '.join(argv)}\n\n"
        f"STDOUT:\n{proc.stdout}\n\n"
        f"STDERR:\n{proc.stderr}\n"
    )
    raise RuntimeError(report)


def leq_yd(y1: int, d1: int, y2: int, d2: int) -> bool:
    """Return True when (year ``y1``, day ``d1``) is not later than (``y2``, ``d2``)."""
    # Tuples compare element by element: year first, then day of year.
    return (y1, d1) <= (y2, d2)


def build_doy_list_scan() -> list[str]:
    """Collect the acquisition keys of all HDF tiles found below ``RAW_DIR``.

    ``RAW_DIR`` is searched recursively for files named
    ``MOD16A2GF.AYYYYDDD.*.hdf``; the seven-digit ``YYYYDDD`` part of every
    matching name is extracted.

    Returns:
        The distinct keys in ascending order, e.g. ``["2003001", "2003009", ...]``.
    """
    name_re = re.compile(rf"^{re.escape(PRODUCT)}\.A(\d{{7}})\..*\.hdf$", re.IGNORECASE)
    hits = (name_re.match(path.name) for path in RAW_DIR.rglob(f"{PRODUCT}.A*.hdf"))
    # The set drops the duplicates produced by the many tiles of one composite.
    return sorted({hit.group(1) for hit in hits if hit})


def build_doy_list_fixed(start=None, end=None) -> list[str]:
    """Build the list of 8-day composite keys between two (year, DOY) bounds.

    The composites handled in this file start on day of year 001, 009, ...,
    361 and restart at 001 every year, so only those days are generated. The
    previous implementation produced every single day (including DOY 366 in
    non-leap years), for which no input tiles exist.

    Args:
        start: ``(year, doy)`` lower bound, inclusive. Defaults to
            ``(START_YEAR, START_DOY)``.
        end: ``(year, doy)`` upper bound, inclusive. Defaults to
            ``(END_YEAR, END_DOY)``.

    Returns:
        Keys formatted as ``"YYYYDDD"`` in chronological order; empty when
        ``start`` is later than ``end``.
    """
    start_year, start_doy = (START_YEAR, START_DOY) if start is None else start
    end_year, end_doy = (END_YEAR, END_DOY) if end is None else end

    keys = []
    for year in range(start_year, end_year + 1):
        # The bounds only clip the first and the last year of the range.
        first = start_doy if year == start_year else 1
        last = end_doy if year == end_year else 366
        keys.extend(
            f"{year:04d}{doy:03d}"
            for doy in range(1, 367, 8)  # 1, 9, ..., 361
            if first <= doy <= last
        )
    return keys


# Report the parallelism settings: worker processes and gdalwarp threads per worker.
print(f"===> Parallel: JOBS={JOBS}, gdalwarp NUM_THREADS={NUM_THREADS}")


def process_one_doy(doy_key: str) -> str:
    """Mosaic, reproject and convert all tiles of one composite to NetCDF-4.

    Every step is an external command executed through ``run``:

    1. ``gdalbuildvrt`` mosaics the ``SDS_NAME`` subdataset of all tiles.
    2. ``gdalwarp`` reprojects the mosaic to EPSG:4326 on a global grid with
       ``RES_DEG`` resolution and writes a compressed GeoTIFF.
    3. ``gdal_translate`` converts the GeoTIFF to NetCDF-4 with ``-unscale``.
    4. ``cdo`` renames the variable, sets its attributes and the time axis and
       writes the final file into ``OUT_DIR``.

    Args:
        doy_key: "YYYYDDD" (e.g., "2003001").

    Returns:
        A one-line ``"[OK] ..."`` status message naming the output file.

    Raises:
        FileNotFoundError: if no HDF tile for ``doy_key`` exists under ``RAW_DIR``.
        RuntimeError: if one of the external commands fails (from ``run``).
    """
    key = f"A{doy_key}"
    year = doy_key[:4]
    doy3 = doy_key[4:7]
    # Calendar date of the composite's first day, used for the time axis.
    yyyymmdd = (datetime(int(year), 1, 1) + timedelta(days=int(doy3) - 1)).strftime("%Y-%m-%d")

    # Prefer the RAW_DIR/<year>/<doy> layout. If nothing is found there, search
    # RAW_DIR recursively so that other layouts (e.g. all tiles directly inside
    # RAW_DIR) work too; build_doy_list_scan discovers keys recursively as well.
    pattern = f"{PRODUCT}.{key}*.hdf"
    tiles = sorted((RAW_DIR / year / doy3).glob(pattern))
    if not tiles:
        tiles = sorted(RAW_DIR.rglob(pattern))
    if not tiles:
        # Fail early with a clear message instead of an obscure gdalbuildvrt error.
        raise FileNotFoundError(f"No tiles matching {pattern} found under {RAW_DIR}")

    work = TMP_DIR / key
    work.mkdir(parents=True, exist_ok=True)
    list_txt = work / f"hdf_{key}.txt"

    vrt = work / f"ET_{PRODUCT}_{year}{doy3}_8D_p05_mm8d-1.vrt"
    out_tif = work / f"ET_{PRODUCT}_{year}{doy3}_8D_p05_mm8d-1_tmp1.tif"
    out_nc = work / f"ET_{PRODUCT}_{year}{doy3}_8D_p05_mm8d-1_tmp1.nc4"
    data_path = OUT_DIR / f"ET_{PRODUCT}_{year}{doy3}_8D_p05_mm8d-1.nc4"

    # Write HDF4_EOS URIs using RELATIVE paths (relative to RAW_DIR).
    with list_txt.open("w") as f:
        for hdf in tiles:
            rel = hdf.relative_to(RAW_DIR).as_posix()
            f.write(f'HDF4_EOS:EOS_GRID:"{rel}":{GRID_NAME}:{SDS_NAME}\n')

    # Build VRT + warp executed with cwd=RAW_DIR so the relative paths resolve.
    run(["gdalbuildvrt", "-overwrite", "-input_file_list", str(list_txt), str(vrt)], cwd=RAW_DIR)

    run([
        "gdalwarp",
        "-overwrite",
        "-t_srs", "EPSG:4326",
        "-te", "-180", "-90", "180", "90",
        "-tr", RES_DEG, RES_DEG,
        "-tap",
        "-r", RESAMPLE,
        "-srcnodata", SRC_NODATA,
        "-dstnodata", DST_NODATA,
        "-multi", "-wo", f"NUM_THREADS={NUM_THREADS}",
        "-ot", "Float32",
        "-of", "GTiff",
        "-co", "TILED=YES",
        "-co", "COMPRESS=DEFLATE",
        "-co", f"ZLEVEL={ZLEVEL}",
        str(vrt),
        str(out_tif),
    ], cwd=RAW_DIR)

    run([
        "gdal_translate",
        "-q",
        "-of", "netCDF",
        "-co", "FORMAT=NC4",
        "-unscale",
        "-ot", "Float32",
        "-a_nodata", DST_NODATA,
        "-a_srs", "EPSG:4326",
        "-co", "COMPRESS=DEFLATE",
        "-co", f"ZLEVEL={ZLEVEL}",
        str(out_tif),
        str(out_nc),
    ])

    # cdo evaluates a chain of operators from right to left, i.e. the time-axis
    # operators listed last run first and the attribute operators run last.
    # NOTE(review): the setctomiss operators compare against the raw codes
    # 32767/32766, but they run on data that gdal_translate may already have
    # rescaled via -unscale - confirm that they still match anything.
    run([
        "cdo", "-O", "-L", "-b", "F32", "-f", "nc4", "-z", "zip_3",
        f"-setattribute,{VAR_NAME}@long_name={VAR_LONG_NAME}",
        f"-setattribute,{VAR_NAME}@units={VAR_UNITS}",
        f"-setattribute,{VAR_NAME}@_FillValue=0.f",
        f"-setattribute,{VAR_NAME}@missing_value=0.f",
        f"-setname,{VAR_NAME}",
        "-invertlat",

        "-setmisstoc,0",
        f"-setctomiss,{SRC_NODATA}",
        "-setctomiss,32766",

        "-setcalendar,proleptic_gregorian",
        f"-setreftime,{yyyymmdd},00:00:00",
        "-settunits,days",
        f"-setdate,{yyyymmdd}",
        "-settime,00:00:00",

        str(out_nc),
        str(data_path),
    ])

    # NOTE: cleanup of the intermediates in `work` is currently disabled.
    # Re-enable the following lines to reclaim disk space:
    # out_tif.unlink(missing_ok=True)
    # shutil.rmtree(work, ignore_errors=True)
    return f"[OK] {key} -> {data_path.name}"


def main():
    """Process every 8-day composite in parallel, then merge them along time.

    The list of composites comes from ``build_doy_list_fixed`` or
    ``build_doy_list_scan`` depending on ``USE_FIXED_RANGE``. Each composite is
    handled by ``process_one_doy`` in a pool of ``JOBS`` worker processes.
    Afterwards the per-composite files are concatenated with ``cdo mergetime``
    into one file below ``ROOT_DIR``.

    Raises:
        RuntimeError: if no composite is found, or if an external command fails.
        Exception: the first worker failure is re-raised after the jobs that
            have not started yet are cancelled.
    """
    if USE_FIXED_RANGE:
        doy_list = build_doy_list_fixed()
        print(f"===> Using fixed range: {START_YEAR}/{START_DOY:03d} -> {END_YEAR}/{END_DOY:03d}")
    else:
        doy_list = build_doy_list_scan()
        print("===> Scanning RAW_DIR to collect AYYYYDDD keys")

    print(f"===> Found {len(doy_list)} time slices")
    if not doy_list:
        raise RuntimeError(f"No {PRODUCT} time slices found under {RAW_DIR}; nothing to process")

    with ProcessPoolExecutor(max_workers=JOBS) as ex:
        futs = {ex.submit(process_one_doy, d): d for d in doy_list}
        for fut in as_completed(futs):
            d = futs[fut]
            try:
                print(fut.result())
            except Exception as e:
                print(f"[ERR] {d} failed:\n{e}")
                # Drop the jobs that have not started; otherwise leaving the
                # `with` block would wait for all of them before re-raising.
                for pending in futs:
                    pending.cancel()
                raise

    # Merge exactly the slices that were processed, in chronological order,
    # instead of assuming a fixed 2003-2020 period.
    ordered = sorted(doy_list)
    files = [str(OUT_DIR / f"ET_{PRODUCT}_{d}_8D_p05_mm8d-1.nc4") for d in ordered]
    first_year, last_year = ordered[0][:4], ordered[-1][:4]
    merge_path = ROOT_DIR / f"ET_{PRODUCT}_{first_year}-{last_year}_8D_p05_mm8d-1.nc4"
    run(["cdo", "-O", "-L", "-f", "nc4", "-z", "zip_3", "mergetime", *files, str(merge_path)])
    print(f"[OK] merged -> {merge_path}")

    print("===> ALL DONE")
    print(f"Output dir: {OUT_DIR}")

if __name__ == "__main__":
    main()

===> Parallel: JOBS=4, gdalwarp NUM_THREADS=8
===> Scanning RAW_DIR to collect AYYYYDDD keys
===> Found 828 time slices
[OK] A2003025 -> ET_MOD16A2GF_2003025_8D_p05_mm8d-1.nc4
[OK] A2003009 -> ET_MOD16A2GF_2003009_8D_p05_mm8d-1.nc4
[OK] A2003001 -> ET_MOD16A2GF_2003001_8D_p05_mm8d-1.nc4
[OK] A2003017 -> ET_MOD16A2GF_2003017_8D_p05_mm8d-1.nc4
[OK] A2003041 -> ET_MOD16A2GF_2003041_8D_p05_mm8d-1.nc4
[OK] A2003033 -> ET_MOD16A2GF_2003033_8D_p05_mm8d-1.nc4
[OK] A2003049 -> ET_MOD16A2GF_2003049_8D_p05_mm8d-1.nc4
[OK] A2003057 -> ET_MOD16A2GF_2003057_8D_p05_mm8d-1.nc4
[OK] A2003073 -> ET_MOD16A2GF_2003073_8D_p05_mm8d-1.nc4
[OK] A2003065 -> ET_MOD16A2GF_2003065_8D_p05_mm8d-1.nc4
[OK] A2003081 -> ET_MOD16A2GF_2003081_8D_p05_mm8d-1.nc4
[OK] A2003089 -> ET_MOD16A2GF_2003089_8D_p05_mm8d-1.nc4
[OK] A2003105 -> ET_MOD16A2GF_2003105_8D_p05_mm8d-1.nc4
[OK] A2003121 -> ET_MOD16A2GF_2003121_8D_p05_mm8d-1.nc4
[OK] A2003113 -> ET_MOD16A2GF_2003113_8D_p05_mm8d-1.nc4
[OK] A2003097 -> ET_MOD16A2GF_2003097_8D