# 1. Preprocess the GOSIF v2 data

### 1.1 Download data from https://data.globalecology.unh.edu/data/GOSIF_v2/8day/

In [3]:
#!/usr/bin/env python3
"""
Batch download GOSIF v2 8-day GeoTIFFs (tif.gz) for 2003-2020.

Assumptions:
- Files are named: GOSIF_YYYYDDD.tif.gz  (DDD = day of year, zero-padded to 3 digits)
- 8-day steps aligned to DOY=001,009,017,...,361 (46 files/year)
- Source directory:
  https://data.globalecology.unh.edu/data/GOSIF_v2/8day/

Download method:
- Generate a task list with one line per missing file (OUTFILE<TAB>URL)
- Feed the task list to xargs, which runs wget in parallel,
  one invocation per task line

If the server uses a different filename pattern, change FNAME_PATTERN below.
"""

from pathlib import Path
import subprocess

# =========================
# CONFIG
# =========================
# Remote directory holding the 8-day files (no trailing slash; main() adds it).
BASE_URL = "https://data.globalecology.unh.edu/data/GOSIF_v2/8day"
# Local directory that receives the downloads and the generated task file.
OUT_DIR = Path("/share/home/dq076/bedrock/data/SIF/GOSIF_v2/rawdata")
# NOTE: this runs at import time, so merely importing the module creates OUT_DIR.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Year range to download; both bounds are inclusive.
START_YEAR = 2003
END_YEAR = 2020

# Parallel downloads (4-8 is usually good; too high may slow down or trigger rate limits)
N_JOBS = 6

# Filename pattern; formatted with integer `year` and `doy` (doy zero-padded to 3 digits)
FNAME_PATTERN = "GOSIF_{year}{doy:03d}.tif.gz"

# wget options, inserted verbatim into the wget command line.
# -c resumes a partially downloaded file instead of starting over.
# NOTE(review): --quiet turns off wget's output, so --progress=dot:giga has no
# visible effect while --quiet is present -- drop one of the two if progress
# output is wanted.
WGET_OPTS = "-c --quiet --progress=dot:giga"
# =========================


def main():
    """Download every missing GOSIF 8-day file with parallel wget processes.

    For each year in START_YEAR..END_YEAR (inclusive) and each 8-day DOY
    (001, 009, ..., 361) the expected local file is checked. Files that are
    missing -- or present but empty -- are written to a task file as
    ``OUTFILE<TAB>URL`` lines, which xargs turns into N_JOBS concurrent wget
    invocations.

    Each download goes to ``OUTFILE.part`` and is renamed to ``OUTFILE`` only
    after wget exits successfully. ``wget -O`` creates its output file before
    any data arrives, so writing straight to OUTFILE would leave a stub behind
    after a failed request, and the next run would skip it as "already
    downloaded". Non-empty existing files are trusted as complete.

    Raises:
        subprocess.CalledProcessError: if xargs exits with a non-zero status,
            which includes the case where any single download failed.
    """
    # 8-day DOY list: 001..361 step 8 => 46 files/year
    doys = range(1, 362, 8)

    task_file = OUT_DIR / "_wget_tasks.tsv"

    lines = []
    n_skip = 0

    for year in range(START_YEAR, END_YEAR + 1):
        for doy in doys:
            fname = FNAME_PATTERN.format(year=year, doy=doy)
            out_file = OUT_DIR / fname

            # A zero-byte file is the leftover of a failed `wget -O`, not a
            # finished download, so it is queued again rather than skipped.
            if out_file.is_file() and out_file.stat().st_size > 0:
                n_skip += 1
                continue

            # IMPORTANT: put OUTFILE first, URL second
            lines.append(f"{out_file}\t{BASE_URL}/{fname}\n")

    if not lines:
        print(f"所有文件已存在，无需下载。（已存在/跳过：{n_skip}）")
        return

    task_file.write_text("".join(lines), encoding="utf-8")
    print(f"待下载文件数: {len(lines)} （并发={N_JOBS}，已存在/跳过：{n_skip}）")
    print(f"任务文件: {task_file}")

    # xargs reads two whitespace-separated fields per task (OUTFILE URL) and
    # passes them to the shell snippet as $1 and $2; the trailing "sh" in the
    # argv fills $0. OUTFILE and URL must therefore not contain whitespace or
    # quote characters.
    fetch_script = (
        f'wget {WGET_OPTS} -O "$1.part" "$2" && mv -f "$1.part" "$1"'
    )
    cmd = [
        "xargs", "-P", str(N_JOBS), "-n", "2",
        "sh", "-c", fetch_script, "sh",
    ]

    # Feed the task file on stdin instead of `cat ... |` under shell=True, so
    # the task-file path is never interpreted by a shell.
    with task_file.open("rb") as tasks:
        subprocess.run(cmd, stdin=tasks, check=True)
    print("下载完成。")


# Start the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


待下载文件数: 828 （并发=6，已存在/跳过：0）
任务文件: /share/home/dq076/bedrock/data/SIF/GOSIF_v2/rawdata/_wget_tasks.tsv
下载完成。
