# 1. Preprocess the GOSIF v2 data

### 1.1 Download data from https://data.globalecology.unh.edu/data/GOSIF_v2/8day/

In [None]:
#!/usr/bin/env python3
"""
Batch download GOSIF v2 8-day GeoTIFFs (tif.gz) for 2003-2020.

Assumptions:
- Files are named: GOSIF_YYYYDDD.tif.gz  (DDD = day of year, 3 digits)
- 8-day steps aligned to DOY=001,009,017,...,361 (46 files/year)
- Source directory:
  https://data.globalecology.unh.edu/data/GOSIF_v2/8day/

Download method:
- Generate a task list (OUTFILE<TAB>URL), one download per line
- Use xargs to run parallel wget:
  wget ... -O OUTFILE URL

If the server uses a different filename pattern, change FNAME_PATTERN below.
"""

from pathlib import Path
import subprocess

# =========================
# CONFIG
# =========================
# Remote directory holding the 8-day files (no trailing slash; main() appends "/<filename>").
BASE_URL = "https://data.globalecology.unh.edu/data/GOSIF_v2/8day"
# Local directory for the downloaded archives; created at import time if it is missing.
OUT_DIR = Path("/share/home/dq076/bedrock/data/SIF/GOSIF_v2/rawdata")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# First and last year to download (both inclusive).
START_YEAR = 2003
END_YEAR = 2020

# Parallel downloads (4-8 is usually good; too high may slow down or trigger rate limits)
N_JOBS = 6

# Filename pattern; formatted with the keyword arguments `year` and `doy`.
FNAME_PATTERN = "GOSIF_{year}{doy:03d}.tif.gz"

# wget options inserted between "wget" and "-O" on every download command.
# NOTE(review): --quiet presumably suppresses the output selected by --progress - confirm.
WGET_OPTS = "-c --quiet --progress=dot:giga"
# =========================


def main():
    """Build the list of missing GOSIF archives and fetch them with parallel wget.

    For every year in START_YEAR..END_YEAR and every 8-day DOY (001..361) the
    expected file name is derived from FNAME_PATTERN. Files that already exist
    in OUT_DIR with a non-zero size are skipped; all others are written as
    ``OUTFILE<TAB>URL`` lines to ``OUT_DIR/_wget_tasks.tsv`` and handed to
    ``xargs -P N_JOBS -n 2 wget ... -O``.

    Known limitation: a non-empty but truncated file is still treated as
    downloaded, because its expected size is not known here.

    Raises:
        subprocess.CalledProcessError: if xargs exits with a non-zero status
            (for example because at least one wget invocation failed).
    """
    # Local import: only needed to split WGET_OPTS the way a shell would.
    import shlex

    # 8-day DOY list: 001..361 step 8 => 46 files/year
    doys = list(range(1, 362, 8))

    task_file = OUT_DIR / "_wget_tasks.tsv"

    lines = []
    n_skip = 0

    for year in range(START_YEAR, END_YEAR + 1):
        for doy in doys:
            fname = FNAME_PATTERN.format(year=year, doy=doy)
            url = f"{BASE_URL}/{fname}"
            out_file = OUT_DIR / fname

            # `wget -O` can leave a zero-byte file behind when a request fails
            # (e.g. HTTP 404), so only non-empty files count as downloaded.
            if out_file.is_file() and out_file.stat().st_size > 0:
                n_skip += 1
                continue

            # IMPORTANT: put OUTFILE first, URL second
            lines.append(f"{out_file}\t{url}\n")

    if not lines:
        print(f"所有文件已存在，无需下载。（已存在/跳过：{n_skip}）")
        return

    task_file.write_text("".join(lines), encoding="utf-8")
    print(f"待下载文件数: {len(lines)} （并发={N_JOBS}，已存在/跳过：{n_skip}）")
    print(f"任务文件: {task_file}")

    # xargs reads two whitespace-separated fields per task (OUTFILE URL) from
    # stdin and runs: wget ... -O OUTFILE URL
    # The command is passed as an argument list with the task file on stdin, so
    # no shell is involved and no quoting of task_file is needed.
    cmd = [
        "xargs", "-P", str(N_JOBS), "-n", "2",
        "wget", *shlex.split(WGET_OPTS), "-O",
    ]

    with open(task_file, encoding="utf-8") as tasks:
        subprocess.run(cmd, stdin=tasks, check=True)
    print("下载完成。")


if __name__ == "__main__":
    main()


### 1.2 Decompress all .tif.gz archives to .tif

In [None]:
from pathlib import Path
import subprocess

# Directory that is searched for GOSIF_*.tif.gz archives.
IN_DIR  = Path("/share/home/dq076/bedrock/data/SIF/GOSIF_v2/rawdata").resolve()
# Directory that receives the decompressed .tif files; created here if it is missing.
OUT_DIR = Path("/share/home/dq076/bedrock/data/SIF/GOSIF_v2/tif").resolve()
OUT_DIR.mkdir(parents=True, exist_ok=True)

def gunzip_to_file(src_gz: Path, dst_tif: Path):
    """Decompress ``src_gz`` into ``dst_tif`` using the external ``gunzip`` tool.

    The data is first written to ``<dst_tif>.part`` and only renamed to
    ``dst_tif`` after gunzip succeeded. A failed or interrupted run therefore
    never leaves an empty or truncated ``dst_tif`` behind that an
    "already exists" check would mistake for a finished file.

    Args:
        src_gz: Path of the gzip-compressed input file.
        dst_tif: Path of the decompressed output file; parent directories are
            created as needed.

    Raises:
        subprocess.CalledProcessError: if gunzip exits with a non-zero status.
    """
    dst_tif.parent.mkdir(parents=True, exist_ok=True)
    part_path = dst_tif.with_name(dst_tif.name + ".part")
    try:
        with open(part_path, "wb") as f_out:
            subprocess.run(["gunzip", "-c", str(src_gz)], stdout=f_out, check=True)
    except BaseException:
        # Remove the incomplete output (also on Ctrl-C), then re-raise unchanged.
        if part_path.exists():
            part_path.unlink()
        raise
    # Same-directory rename: the final name appears only with complete content.
    part_path.replace(dst_tif)

# Collect every matching archive in a stable (sorted) order.
gz_files = sorted(IN_DIR.glob("GOSIF_*.tif.gz"))
print(f"Found {len(gz_files)} gz files")

# Tallies: newly extracted / output already present / gunzip failed.
ok, skip, fail = 0, 0, 0
for gz in gz_files:
    # Dropping the last suffix turns "<name>.tif.gz" into "<name>.tif".
    out_tif = OUT_DIR / gz.with_suffix("").name
    if not out_tif.exists():
        try:
            gunzip_to_file(gz, out_tif)
        except subprocess.CalledProcessError:
            # Report the failing archive and continue with the next one.
            print(f"[FAIL] {gz.name}")
            fail += 1
        else:
            ok += 1
    else:
        skip += 1

print(f"OK: {ok} SKIP: {skip} FAIL: {fail}")


### 1.3 Convert to NetCDF and merge along the time axis

In [1]:
#!/usr/bin/env python3
import os
import subprocess
import calendar
from pathlib import Path
from datetime import datetime, timedelta
from concurrent.futures import ProcessPoolExecutor, as_completed
# Set before any worker or child process is started so that they inherit it.
# NOTE(review): presumably disables HDF5 file locking for the NetCDF-4 writers - confirm.
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"

# Root of the GOSIF v2 data tree; all input and output paths below are relative to it.
ROOT_DIR = Path("/share/home/dq076/bedrock/data/SIF/GOSIF_v2/").resolve()

# Name and attributes given to the output variable by the cdo step in process_one_year().
VAR_NAME = "SIF"
VAR_LONG_NAME = "Solar-Induced Chlorophyll Fluorescence"
VAR_UNITS = "mW m-2 sr-1 nm-1"
# Raw value declared as nodata for gdal_translate and set to missing by cdo.
FILLVALUE = "32767"

TIF_DIR   = ROOT_DIR / "tif"    # input GeoTIFFs (GOSIF_YYYYDDD.tif)
TMP_DIR   = ROOT_DIR / "_tmp"   # intermediate gdal_translate output, one sub-directory per year
DATA1_DIR = ROOT_DIR / "8D"     # final per-date NetCDF-4 files
TMP_DIR.mkdir(parents=True, exist_ok=True)
DATA1_DIR.mkdir(parents=True, exist_ok=True)

def run(cmd):
    """Execute an external command and fail loudly if it does not succeed.

    Every element of ``cmd`` is converted to ``str`` before execution. Both
    output streams are captured as text; nothing is returned on success.

    Raises:
        RuntimeError: if the exit status is non-zero. The message contains the
            command line plus the captured stdout and stderr.
    """
    argv = list(map(str, cmd))
    proc = subprocess.run(argv, capture_output=True, text=True)
    if proc.returncode == 0:
        return

    command_line = " ".join(argv)
    raise RuntimeError(
        "Command failed:\n"
        f"{command_line}\n\n"
        f"STDOUT:\n{proc.stdout}\n\n"
        f"STDERR:\n{proc.stderr}\n"
    )

def process_one_year(year: int) -> str:
    """Convert all 8-day GOSIF GeoTIFFs of one year to per-date NetCDF-4 files.

    For each DOY 1, 9, 17, ... of ``year`` the file
    ``TIF_DIR/GOSIF_<year><doy>.tif`` is converted in two steps:

    1. ``gdal_translate`` writes a compressed NetCDF-4 copy into
       ``TMP_DIR/y<year>/`` with EPSG:4326 and FILLVALUE as nodata.
    2. ``cdo`` sets the time axis, masks the fill codes, flips the latitude
       axis, renames the variable to VAR_NAME, attaches its attributes and
       writes the final file into DATA1_DIR as 32-bit floats.

    The intermediate files in TMP_DIR are not removed.

    Args:
        year: Calendar year to process.

    Returns:
        The year as a string, so the caller can report which job finished.

    Raises:
        RuntimeError: propagated from ``run`` when gdal_translate or cdo fails
            (for example when an input GeoTIFF is missing).
    """
    ndays = 366 if calendar.isleap(year) else 365
    tmp_year_dir = TMP_DIR / f"y{year}"
    tmp_year_dir.mkdir(parents=True, exist_ok=True)
    jan1 = datetime(year, 1, 1)

    for doy in range(1, ndays + 1, 8):
        # Calendar date of this composite's first day, used for the time axis.
        yyyymmdd = (jan1 + timedelta(days=doy - 1)).strftime("%Y-%m-%d")

        print(f"[DAY] {yyyymmdd}  A{year}{doy}")

        tif_path = TIF_DIR / f"GOSIF_{year}{doy:03d}.tif"
        tmp1_path = tmp_year_dir / f"SIF_GOSIF_{year}{doy:03d}_8D_p05_mWm-2sr-1nm-1_tmp1.nc4"
        nc1_path  = DATA1_DIR / f"SIF_GOSIF_{year}{doy:03d}_8D_p05_mWm-2sr-1nm-1.nc4"

        # Step 1: GeoTIFF -> NetCDF-4 (deflate level 3), tagging CRS and nodata.
        run([
            "gdal_translate",
            "-q",
            "-of", "netCDF",
            "-co", "FORMAT=NC4",
            "-co", "COMPRESS=DEFLATE",
            "-co", "ZLEVEL=3",
            "-a_srs", "EPSG:4326",
            "-a_nodata", FILLVALUE,
            str(tif_path),
            str(tmp1_path),
        ])

        # Step 2: cdo applies chained operators from right to left, i.e. the
        # operator written last (next to the input file) runs first. Read the
        # list below bottom-up: time axis -> fill codes to missing -> missing
        # to 0 -> flip latitude -> rename -> attributes.
        # NOTE(review): because missing cells are replaced by 0 and 0 is then
        # declared as _FillValue/missing_value, a genuine SIF value of 0 looks
        # to be indistinguishable from a fill cell - confirm this is intended.
        run([
            "cdo", "-O", "-L", "-b", "F32", "-f", "nc4", "-z", "zip_3",
            f"-setattribute,{VAR_NAME}@long_name={VAR_LONG_NAME}",
            f"-setattribute,{VAR_NAME}@units={VAR_UNITS}",
            f"-setattribute,{VAR_NAME}@_FillValue=0.f",
            f"-setattribute,{VAR_NAME}@missing_value=0.f",
            f"-setname,{VAR_NAME}",
            "-invertlat",

            "-setmisstoc,0",
            f"-setctomiss,{FILLVALUE}",
            "-setctomiss,32766",

            "-setcalendar,proleptic_gregorian",
            f"-setreftime,{yyyymmdd},00:00:00",
            "-settunits,days",
            f"-setdate,{yyyymmdd}",
            "-settime,00:00:00",

            str(tmp1_path),
            str(nc1_path),
        ])

    return str(year)

def main():
    """Convert all years in parallel, then merge the per-date files along time.

    Each year 2003..2020 is processed by ``process_one_year`` in its own worker
    process (at most 6 at a time). Once all years have finished, every
    per-date file in DATA1_DIR is combined with ``cdo mergetime`` into
    ``ROOT_DIR/SIF_GOSIF_2003-2020_8D_p05_mWm-2sr-1nm-1.nc4``.

    Raises:
        Exception: the first failure reported by a worker is printed and
            re-raised, so the merge step is not attempted.
        RuntimeError: propagated from ``run`` when the merge command fails.
    """
    years = list(range(2003, 2021))
    # os.cpu_count() returns None when the count cannot be determined.
    max_workers = min(6, os.cpu_count() or 1)
    print(f"[INFO] cpu={os.cpu_count()} max_workers={max_workers}")

    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        futs = {ex.submit(process_one_year, y): y for y in years}
        for fut in as_completed(futs):
            y = futs[fut]
            try:
                done_year = fut.result()
                print(f"[OK] year {done_year} done")
            except Exception as e:
                print(f"[ERR] year {y} failed:\n{e}")
                raise

    # Rebuild the list of expected output files, in chronological order. The
    # leap-year check must use the comprehension's own `year`, not the `y`
    # left over from the loop above.
    files1 = [
        str(DATA1_DIR / f"SIF_GOSIF_{year}{doy:03d}_8D_p05_mWm-2sr-1nm-1.nc4")
        for year in years
        for doy in range(1, (366 if calendar.isleap(year) else 365) + 1, 8)
    ]
    merge1_path = ROOT_DIR / "SIF_GOSIF_2003-2020_8D_p05_mWm-2sr-1nm-1.nc4"

    run(["cdo", "-O", "-L", "-f", "nc4", "-z", "zip_3", "mergetime", *files1, str(merge1_path)])
    print(f"[OK] merged -> {merge1_path}")

if __name__ == "__main__":
    main()


[INFO] cpu=48 max_workers=6
[DAY] 2006-01-01  A20061[DAY] 2008-01-01  A20081[DAY] 2003-01-01  A20031[DAY] 2004-01-01  A20041

[DAY] 2005-01-01  A20051[DAY] 2007-01-01  A20071



[DAY] 2007-01-09  A20079
[DAY] 2008-01-09  A20089
[DAY] 2004-01-09  A20049
[DAY] 2005-01-09  A20059
[DAY] 2003-01-09  A20039
[DAY] 2006-01-09  A20069
[DAY] 2007-01-17  A200717
[DAY] 2004-01-17  A200417
[DAY] 2003-01-17  A200317
[DAY] 2008-01-17  A200817
[DAY] 2006-01-17  A200617
[DAY] 2005-01-17  A200517
[DAY] 2007-01-25  A200725
[DAY] 2003-01-25  A200325
[DAY] 2008-01-25  A200825
[DAY] 2004-01-25  A200425
[DAY] 2006-01-25  A200625
[DAY] 2005-01-25  A200525
[DAY] 2007-02-02  A200733
[DAY] 2008-02-02  A200833
[DAY] 2003-02-02  A200333
[DAY] 2004-02-02  A200433
[DAY] 2006-02-02  A200633
[DAY] 2005-02-02  A200533
[DAY] 2007-02-10  A200741
[DAY] 2003-02-10  A200341
[DAY] 2004-02-10  A200441
[DAY] 2008-02-10  A200841
[DAY] 2005-02-10  A200541
[DAY] 2006-02-10  A200641
[DAY] 2007-02-18  A200749
[DAY] 2003-02-18  A200