# 1. Preprocess the GOSIF v2 data

### 1.1 Download data from https://data.globalecology.unh.edu/data/GOSIF_v2/8day/

In [3]:
#!/usr/bin/env python3
"""
Batch download GOSIF v2 8-day GeoTIFFs (tif.gz) for 2003-2020.

Assumptions (match your request):
- Files are named: GOSIF_YYYYDDD.tif.gz  (DDD = DOY with 3 digits)
- 8-day steps aligned to DOY=001,009,017,...,361 (46 files/year)
- Source directory:
  https://data.globalecology.unh.edu/data/GOSIF_v2/8day/

Download method:
- Generate a task list (OUTFILE<TAB>URL)
- Use xargs to run parallel wget:
  wget ... -O OUTFILE URL

If your website uses a different filename pattern, change FNAME_PATTERN below.
"""

from pathlib import Path
import subprocess

# =========================
# CONFIG
# =========================
# Remote directory that holds the 8-day archives (no trailing slash; the file
# name is appended with "/" when the URL is built).
BASE_URL = "https://data.globalecology.unh.edu/data/GOSIF_v2/8day"
# Local directory for the downloaded .tif.gz archives; created on import/run.
OUT_DIR = Path("/share/home/dq076/bedrock/data/SIF/GOSIF_v2/rawdata")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# First and last year to download (both inclusive).
START_YEAR = 2003
END_YEAR = 2020

# Parallel downloads (4-8 is usually good; too high may slow down or trigger rate limits)
N_JOBS = 6

# Filename pattern: 4-digit year followed by the zero-padded 3-digit day of year.
FNAME_PATTERN = "GOSIF_{year}{doy:03d}.tif.gz"

# wget options, inserted verbatim into the download command line.
# NOTE(review): --quiet presumably suppresses the output that
# --progress=dot:giga would configure, making that flag redundant -- confirm
# against the wget manual.
WGET_OPTS = "-c --quiet --progress=dot:giga"
# =========================


def main():
    """Download every missing GOSIF 8-day archive in parallel with wget.

    Builds the expected file list for START_YEAR..END_YEAR (DOY 001, 009,
    ..., 361), skips archives that already exist in OUT_DIR with a non-zero
    size, writes the remaining tasks to ``_wget_tasks.tsv`` (one
    ``OUTFILE<TAB>URL`` per line) and feeds that list to ``xargs``, which runs
    up to N_JOBS downloads at a time.

    Each archive is fetched into ``OUTFILE.part`` and renamed to ``OUTFILE``
    only after wget exits successfully. A failed or interrupted transfer
    therefore never leaves a file under the final name, so the existence
    check on the next run cannot mistake it for a finished download.

    Note: xargs splits its input on whitespace, so OUT_DIR must not contain
    blanks.

    Raises:
        subprocess.CalledProcessError: if xargs reports that at least one
            download failed.
    """
    # 8-day DOY list: 001..361 step 8 => 46 files/year
    doys = range(1, 362, 8)

    task_file = OUT_DIR / "_wget_tasks.tsv"

    lines = []
    n_skip = 0

    for year in range(START_YEAR, END_YEAR + 1):
        for doy in doys:
            fname = FNAME_PATTERN.format(year=year, doy=doy)
            out_file = OUT_DIR / fname

            # A zero-byte file is the leftover of an earlier failed
            # "wget -O"; treat it as missing so it gets downloaded again.
            if out_file.exists() and out_file.stat().st_size > 0:
                n_skip += 1
                continue

            # IMPORTANT: put OUTFILE first, URL second
            lines.append(f"{out_file}\t{BASE_URL}/{fname}\n")

    if not lines:
        print(f"所有文件已存在，无需下载。（已存在/跳过：{n_skip}）")
        return

    task_file.write_text("".join(lines))
    print(f"待下载文件数: {len(lines)} （并发={N_JOBS}，已存在/跳过：{n_skip}）")
    print(f"任务文件: {task_file}")

    # xargs hands two fields (OUTFILE URL) to each "sh -c" invocation as
    # $1 and $2; the rename only happens when wget succeeded.
    download_one = f'wget {WGET_OPTS} -O "$1.part" "$2" && mv -f "$1.part" "$1"'
    cmd = [
        "xargs", "-P", str(N_JOBS), "-n", "2",
        "sh", "-c", download_one, "sh",
    ]

    # Argument list + stdin instead of a shell pipeline: no shell parsing of
    # the task-file path is involved.
    with task_file.open() as tasks:
        subprocess.run(cmd, stdin=tasks, check=True)
    print("下载完成。")


# Run the download only when executed as a script / notebook cell, not on import.
if __name__ == "__main__":
    main()


待下载文件数: 828 （并发=6，已存在/跳过：0）
任务文件: /share/home/dq076/bedrock/data/SIF/GOSIF_v2/rawdata/_wget_tasks.tsv
下载完成。


### 1.2 Decompress all .tif.gz archives to .tif

In [3]:
from pathlib import Path
import subprocess

# Directory holding the downloaded GOSIF_*.tif.gz archives.
IN_DIR  = Path("/share/home/dq076/bedrock/data/SIF/GOSIF_v2/rawdata").resolve()
# Directory that receives the decompressed .tif files; created if missing.
OUT_DIR = Path("/share/home/dq076/bedrock/data/SIF/GOSIF_v2/tif").resolve()
OUT_DIR.mkdir(parents=True, exist_ok=True)

def gunzip_to_file(src_gz: Path, dst_tif: Path):
    """Decompress ``src_gz`` into ``dst_tif`` with the external ``gunzip``.

    The decompressed stream is written to a sibling ``<name>.part`` file and
    renamed onto ``dst_tif`` only after gunzip exits successfully. If gunzip
    fails (or the call is interrupted) the partial file is removed, so
    ``dst_tif`` never exists in a truncated state and an ``exists()`` check
    by the caller can be trusted.

    Args:
        src_gz: Path of the gzip archive to read.
        dst_tif: Path of the file to create; parent directories are created
            as needed.

    Raises:
        subprocess.CalledProcessError: if gunzip exits with a non-zero status.
    """
    dst_tif.parent.mkdir(parents=True, exist_ok=True)
    part_path = dst_tif.with_name(dst_tif.name + ".part")
    try:
        with open(part_path, "wb") as f_out:
            subprocess.run(["gunzip", "-c", str(src_gz)], stdout=f_out, check=True)
        # Same directory => the rename replaces the target in one step.
        part_path.replace(dst_tif)
    finally:
        # Still present only when something above failed.
        if part_path.exists():
            part_path.unlink()

# Gather every downloaded archive in a stable, name-sorted order.
gz_files = sorted(IN_DIR.glob("GOSIF_*.tif.gz"))
print(f"Found {len(gz_files)} gz files")

# Tallies: newly extracted / already present / gunzip failures.
ok = skip = fail = 0
for gz in gz_files:
    # Path.stem drops the final suffix, i.e. the trailing ".gz".
    out_tif = OUT_DIR / gz.stem
    if out_tif.exists():
        skip += 1
    else:
        try:
            gunzip_to_file(gz, out_tif)
        except subprocess.CalledProcessError:
            print(f"[FAIL] {gz.name}")
            fail += 1
        else:
            ok += 1

print("OK:", ok, "SKIP:", skip, "FAIL:", fail)


Found 828 gz files
OK: 827 SKIP: 1 FAIL: 0


### 1.3 Convert to NetCDF-4 and merge each year along time (CDO mergetime)

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
GOSIF SIF 8-day TIFF -> NC4 conversion and annual aggregation
处理GOSIF_v2 SIF数据：
1. 将8天合成的TIFF文件转换为NC4格式
2. 将每年的46个8天文件聚合为单个年度文件
"""

import re
import subprocess
import tempfile
import shutil
from pathlib import Path
from datetime import datetime, timedelta

# ======================
# Configuration
# ======================
# Input directory with the decompressed GOSIF_YYYYDDD.tif files, and the two
# output directories for per-period and per-year NetCDF-4 files.
IN_DIR = Path("/share/home/dq076/bedrock/data/SIF/GOSIF_v2/tif").resolve()
OUT_DIR_8D = Path("/share/home/dq076/bedrock/data/SIF/GOSIF_v2/8D").resolve()
OUT_DIR_ANNUAL = Path("/share/home/dq076/bedrock/data/SIF/GOSIF_v2/annual").resolve()

# Create the output directories
OUT_DIR_8D.mkdir(parents=True, exist_ok=True)
OUT_DIR_ANNUAL.mkdir(parents=True, exist_ok=True)

# Variable metadata written into the NetCDF files
VAR_NAME = "SIF"
VAR_LONG_NAME = "Solar-Induced Chlorophyll Fluorescence"
UNITS = "mW m-2 sr-1 nm-1"

# Year range to process (both ends inclusive)
START_YEAR = 2003
END_YEAR = 2020
YEARS = list(range(START_YEAR, END_YEAR + 1))

# One file every 8 days, 46 files per year
DAYS_PER_PERIOD = 8
PERIODS_PER_YEAR = 46

# TIFF file naming pattern: GOSIF_YYYYDDD.tif (group 1 captures the 7 digits)
TIFF_PATTERN = re.compile(r'^GOSIF_(\d{7})\.tif$')

# Output compression setting
COMPRESSION_LEVEL = 3  # used as GDAL ZLEVEL and as CDO "zip_<level>"

# ======================
# 辅助函数
# ======================
def run_cmd(cmd, cwd=None):
    """Run an external command and return its captured standard output.

    Args:
        cmd: Argument list passed to ``subprocess.run``.
        cwd: Optional working directory for the child process.

    Returns:
        The child's stdout as text.

    Raises:
        RuntimeError: if the command exits with a non-zero status. The
            message contains the command line, the working directory and
            the captured stdout and stderr.
    """
    result = subprocess.run(cmd, cwd=cwd, text=True, capture_output=True)
    if result.returncode == 0:
        return result.stdout

    command_line = ' '.join(cmd)
    raise RuntimeError(
        f"命令执行失败:\n命令: {command_line}\n"
        f"工作目录: {cwd}\n标准输出:\n{result.stdout}\n标准错误:\n{result.stderr}"
    )

def yyyyddd_to_datetime(yyyyddd_str):
    """Convert a ``YYYYDDD`` string (year + day of year) to a datetime.

    The first four characters are the year, the rest is the 1-based day of
    year; the result is midnight of that day. A day number past the end of
    the year simply rolls over into the following year.
    """
    year, doy = int(yyyyddd_str[:4]), int(yyyyddd_str[4:])
    jan_first = datetime(year, 1, 1)
    return jan_first + timedelta(days=doy - 1)

def datetime_to_yyyyddd(dt):
    """Format a datetime as ``YYYYDDD`` (year + zero-padded day of year)."""
    # %j is the day of the year as a zero-padded three-digit number.
    return dt.strftime("%Y%j")

def get_8day_periods_for_year(year):
    """List the 8-day compositing periods of ``year``.

    Periods start on 1 January and advance in steps of DAYS_PER_PERIOD days,
    up to PERIODS_PER_YEAR entries. Generation stops early if a start date
    falls into the next year, and an end date that crosses the year boundary
    is clipped to 31 December.

    Returns:
        A list of dicts with keys ``'start'`` and ``'end'`` (datetimes) and
        ``'start_str'`` (the start date formatted as YYYYDDD).
    """
    jan_first = datetime(year, 1, 1)
    dec_last = datetime(year, 12, 31)
    last_day_offset = timedelta(days=DAYS_PER_PERIOD - 1)

    periods = []
    for index in range(PERIODS_PER_YEAR):
        begin = jan_first + timedelta(days=index * DAYS_PER_PERIOD)
        if begin.year > year:
            break

        tentative_end = begin + last_day_offset
        finish = dec_last if tentative_end.year > year else tentative_end

        periods.append({
            'start': begin,
            'end': finish,
            'start_str': datetime_to_yyyyddd(begin),
        })

    return periods

def find_tiff_files_for_year(year):
    """Collect the GOSIF TIFF files in IN_DIR that belong to ``year``.

    A file qualifies when its name matches TIFF_PATTERN and the date decoded
    from its YYYYDDD stamp falls inside ``year``.

    Returns:
        A list of dicts with keys ``'path'``, ``'date'`` (datetime) and
        ``'date_str'`` (the YYYYDDD stamp), ordered by date.
    """
    found = []

    for candidate in IN_DIR.glob(f"GOSIF_{year}*.tif"):
        hit = TIFF_PATTERN.match(candidate.name)
        if not hit:
            continue

        stamp = hit.group(1)
        when = yyyyddd_to_datetime(stamp)
        if when.year != year:
            continue

        found.append({'path': candidate, 'date': when, 'date_str': stamp})

    # Chronological order
    return sorted(found, key=lambda entry: entry['date'])

def convert_tiff_to_nc4(tiff_path, output_nc_path, date_obj):
    """Convert one GOSIF TIFF into a time-stamped NetCDF-4 file.

    Step 1 runs ``gdal_translate`` to produce a compressed NetCDF-4 file;
    step 2 runs CDO to rename the variable to VAR_NAME, attach long_name and
    units, and stamp the data with ``date_obj`` at 00:00:00.

    Both steps write into a temporary directory created next to
    ``output_nc_path``; only the finished file is moved to the final path.
    A failure or interrupt in either step therefore never leaves a
    half-processed file under the final name, which callers would otherwise
    treat as "already converted".

    NOTE(review): the GOSIF product documentation reportedly stores SIF with
    a 0.0001 scale factor and uses 32766/32767 as fill values; this
    conversion applies neither (it only tags -9999 as nodata) -- confirm
    against the data description before using the values as physical units.

    Args:
        tiff_path: Path of the source TIFF.
        output_nc_path: Path of the NetCDF-4 file to create; its parent
            directory must already exist.
        date_obj: datetime whose calendar date becomes the time stamp.

    Raises:
        RuntimeError: if gdal_translate or cdo exits with a non-zero status.
    """
    print(f"  转换: {tiff_path.name}")

    date_str = date_obj.strftime("%Y-%m-%d")

    # Stage next to the target so the final move stays on one filesystem.
    with tempfile.TemporaryDirectory(
        prefix="gosif_temp_", dir=str(output_nc_path.parent)
    ) as td:
        td_path = Path(td)
        raw_nc = td_path / "raw.nc4"
        temp_nc = td_path / "temp.nc4"

        # Step 1: TIFF -> NetCDF-4 with gdal_translate
        cmd = [
            "gdal_translate",
            "-q",
            "-of", "netCDF",
            "-co", "FORMAT=NC4",
            "-co", "COMPRESS=DEFLATE",
            "-co", f"ZLEVEL={COMPRESSION_LEVEL}",
            "-a_srs", "EPSG:4326",
            "-a_nodata", "-9999",
            str(tiff_path),
            str(raw_nc)
        ]

        run_cmd(cmd)

        # Step 2: add the time stamp and metadata with CDO.
        # Chained CDO operators are applied right-to-left, so -setname must
        # sit to the right of the -setattribute operators that refer to the
        # variable by its new name.
        cmd_cdo = [
            "cdo", "-O", "-L",
            "-f", "nc4", "-z", f"zip_{COMPRESSION_LEVEL}",
            f'-setattribute,{VAR_NAME}@long_name="{VAR_LONG_NAME}"',
            f'-setattribute,{VAR_NAME}@units="{UNITS}"',
            f"-setname,{VAR_NAME}",
            f"-setdate,{date_str}",
            "-settime,00:00:00",
            str(raw_nc),
            str(temp_nc)
        ]

        run_cmd(cmd_cdo)

        # Publish the finished file under its final name.
        shutil.move(str(temp_nc), str(output_nc_path))

    print(f"    完成: {output_nc_path.name}")

def aggregate_year_to_single_file(year, period_nc_files, annual_nc_path):
    """Merge one year's 8-day NetCDF-4 files into a single annual file.

    Runs ``cdo mergetime`` over ``period_nc_files`` and moves the result to
    ``annual_nc_path``. The merge is staged in a temporary directory created
    next to the destination, so the final move stays on one filesystem and
    an interrupted run does not leave a truncated annual file that a later
    ``exists()`` check would accept as complete.

    Args:
        year: Year being aggregated (used for messages and the temp prefix).
        period_nc_files: Paths of the per-period files to merge; if empty, a
            warning is printed and nothing is written.
        annual_nc_path: Path of the annual file to create; its parent
            directory must already exist.

    Raises:
        RuntimeError: if cdo exits with a non-zero status.
    """
    print(f"  聚合 {year} 年数据...")

    if not period_nc_files:
        print(f"    警告: {year} 年没有找到NC4文件")
        return

    with tempfile.TemporaryDirectory(
        prefix=f"gosif_agg_{year}_", dir=str(annual_nc_path.parent)
    ) as td:
        merged_nc = Path(td) / "merged.nc4"

        # Concatenate all time steps
        cmd_merge = [
            "cdo", "-O", "-L",
            "-f", "nc4", "-z", f"zip_{COMPRESSION_LEVEL}",
            "mergetime",
            *[str(f) for f in period_nc_files],
            str(merged_nc)
        ]

        run_cmd(cmd_merge)

        # Publish the merged file under its final name.
        shutil.move(str(merged_nc), str(annual_nc_path))

    print(f"    完成: {annual_nc_path.name}")

def process_year(year):
    """Convert one year's TIFFs to 8-day NetCDF-4 files and merge them.

    For every 8-day period of ``year`` the TIFF whose stamp equals the period
    start is converted, unless its output already exists. For a period
    starting after DOY 360 with no exact match, the first TIFF with DOY > 360
    is used instead. Periods without a usable TIFF, or whose conversion
    fails, are reported and skipped. The annual file is built from the
    available period files unless it already exists.

    Returns:
        ``(period_nc_files, annual_nc_path)``, or ``([], None)`` when the year
        has no TIFFs or nothing could be converted.
    """
    print(f"\n处理 {year} 年数据:")

    periods = get_8day_periods_for_year(year)
    tiff_files = find_tiff_files_for_year(year)

    if not tiff_files:
        print(f"  警告: {year} 年没有找到TIFF文件")
        return [], None

    period_nc_files = []
    missing_periods = []

    for period in periods:
        stamp = period['start_str']

        # Exact match on the period start first.
        chosen = next(
            (tf for tf in tiff_files if tf['date_str'] == stamp), None
        )

        # Year-end fallback: accept the first file whose DOY is past 360.
        if chosen is None and int(stamp[4:]) > 360:
            chosen = next(
                (tf for tf in tiff_files if int(tf['date_str'][4:]) > 360),
                None,
            )

        if chosen is None:
            print(f"    警告: 未找到周期 {stamp} 的TIFF文件")
            missing_periods.append(stamp)
            continue

        tiff_path = chosen['path']

        # Output name keeps the agreed naming scheme.
        output_nc_name = f"SIF_GOSIF_{stamp}_8D_p05_mWm-2sr-1nm-1.nc4"
        output_nc_path = OUT_DIR_8D / output_nc_name

        # Reuse an existing per-period file instead of converting again.
        if output_nc_path.exists():
            print(f"    跳过已存在: {output_nc_name}")
            period_nc_files.append(output_nc_path)
            continue

        try:
            convert_tiff_to_nc4(tiff_path, output_nc_path, period['start'])
        except Exception as e:
            print(f"    错误: 转换 {tiff_path.name} 失败: {e}")
            missing_periods.append(stamp)
        else:
            period_nc_files.append(output_nc_path)

    if missing_periods:
        print(f"  警告: 缺失 {len(missing_periods)} 个周期")

    if not period_nc_files:
        print(f"  错误: {year} 年没有成功转换的文件")
        return [], None

    # Annual file name keeps the agreed naming scheme.
    annual_nc_name = f"SIF_GOSIF_{year}_p05_mWm-2sr-1nm-1.nc4"
    annual_nc_path = OUT_DIR_ANNUAL / annual_nc_name

    if annual_nc_path.exists():
        print(f"  跳过已存在的年度文件: {annual_nc_name}")
    else:
        aggregate_year_to_single_file(year, period_nc_files, annual_nc_path)

    return period_nc_files, annual_nc_path

# ======================
# 主处理流程
# ======================
def main():
    """Run the full conversion for every year in YEARS.

    Prints the configuration, verifies that ``gdal_translate`` and ``cdo``
    are on PATH, processes each year (an error in one year is reported and
    the loop moves on), then prints a summary.

    Returns:
        ``(all_period_files, annual_files)``: the per-period and annual
        output paths gathered across all years.

    Raises:
        RuntimeError: if a required external tool is not found.
    """
    rule = "=" * 60

    print("GOSIF SIF 数据处理脚本")
    print(rule)
    print(f"输入目录: {IN_DIR}")
    print(f"8天输出目录: {OUT_DIR_8D}")
    print(f"年度输出目录: {OUT_DIR_ANNUAL}")
    print(f"处理年份: {START_YEAR} 到 {END_YEAR}")
    print(rule)

    # Make sure the external tools are available before doing any work.
    print("\n检查必要工具...")
    for tool in ("gdal_translate", "cdo"):
        if shutil.which(tool) is None:
            raise RuntimeError(f"缺少必要工具: {tool}")
        print(f"  ✓ {tool} 可用")

    all_period_files = []
    annual_files = []

    for year in YEARS:
        try:
            period_files, annual_file = process_year(year)

            if period_files:
                all_period_files += period_files

            if annual_file:
                annual_files.append(annual_file)

        except Exception as e:
            # Report and carry on with the next year.
            print(f"\n处理 {year} 年时发生错误: {e}")

    # Summary report
    print("\n" + rule)
    print("处理完成!")
    print(rule)

    print("\n处理统计:")
    print(f"  总8天文件数: {len(all_period_files)}")
    print(f"  总年度文件数: {len(annual_files)}")

    print("\n输出目录:")
    print(f"  8天文件: {OUT_DIR_8D}")
    print(f"  年度文件: {OUT_DIR_ANNUAL}")

    return all_period_files, annual_files

# ======================
# 执行主函数
# ======================
# Entry point: run the pipeline and report how it ended.
if __name__ == "__main__":
    try:
        # Keep the returned path lists available at module level.
        period_files, annual_files = main()
        
        print("\n" + "=" * 60)
        print("所有处理完成!")
        
    except KeyboardInterrupt:
        # Ctrl-C / kernel interrupt: print a short notice instead of a traceback.
        print("\n\n处理被用户中断")
    except Exception as e:
        # Any other failure: print the message followed by the full traceback.
        print(f"\n处理过程中发生错误: {e}")
        import traceback
        traceback.print_exc()

GOSIF v2 SIF 8-day TIFF -> NC4 + annual merge
IN_DIR      : /share/home/dq076/bedrock/data/SIF/GOSIF_v2/tif
OUT_DIR_8D  : /share/home/dq076/bedrock/data/SIF/GOSIF_v2/8D
OUT_DIR_YEAR: /share/home/dq076/bedrock/data/SIF/GOSIF_v2/annual
Years       : 2003-2020
Time axis   : time=start day, time_bnds=[start, end_exclusive) (half-open)
Last period : end_exclusive clipped to Jan 1 next year (365/366 actual)

=== Year 2003 ===
[CONV] GOSIF_2003001.tif -> SIF_GOSIF_2003001_8D_p05_mWm-2sr-1nm-1.nc4


HDF5-DIAG: Error detected in HDF5 (1.14.6) thread 1:
  #000: H5F.c line 496 in H5Fis_accessible(): unable to determine if file is accessible as HDF5
    major: File accessibility
    minor: Not an HDF5 file
  #001: H5VLcallback.c line 3913 in H5VL_file_specific(): file specific failed
    major: Virtual Object Layer
    minor: Can't operate on object
  #002: H5VLcallback.c line 3848 in H5VL__file_specific(): file specific failed
    major: Virtual Object Layer
    minor: Can't operate on object
  #003: H5VLnative_file.c line 344 in H5VL__native_file_specific(): error in HDF5 file check
    major: File accessibility
    minor: Can't get value
  #004: H5Fint.c line 1055 in H5F__is_hdf5(): unable to open file
    major: File accessibility
    minor: Unable to initialize object
  #005: H5FD.c line 787 in H5FD_open(): can't open file
    major: Virtual File Layer
    minor: Unable to open file
  #006: H5FDsec2.c line 323 in H5FD__sec2_open(): unable to open file: name = '/tmp/gosif_one_gras

[CONV] GOSIF_2003009.tif -> SIF_GOSIF_2003009_8D_p05_mWm-2sr-1nm-1.nc4


HDF5-DIAG: Error detected in HDF5 (1.14.6) thread 1:
  #000: H5F.c line 496 in H5Fis_accessible(): unable to determine if file is accessible as HDF5
    major: File accessibility
    minor: Not an HDF5 file
  #001: H5VLcallback.c line 3913 in H5VL_file_specific(): file specific failed
    major: Virtual Object Layer
    minor: Can't operate on object
  #002: H5VLcallback.c line 3848 in H5VL__file_specific(): file specific failed
    major: Virtual Object Layer
    minor: Can't operate on object
  #003: H5VLnative_file.c line 344 in H5VL__native_file_specific(): error in HDF5 file check
    major: File accessibility
    minor: Can't get value
  #004: H5Fint.c line 1055 in H5F__is_hdf5(): unable to open file
    major: File accessibility
    minor: Unable to initialize object
  #005: H5FD.c line 787 in H5FD_open(): can't open file
    major: Virtual File Layer
    minor: Unable to open file
  #006: H5FDsec2.c line 323 in H5FD__sec2_open(): unable to open file: name = '/tmp/gosif_one_at_4

KeyboardInterrupt: 