# 1. Preprocess the CHIRPS v3.0 IMERGlate-v07

### 1.1 Merge the month data to year

In [8]:
#!/usr/bin/env python3
from pathlib import Path
import subprocess

input_dir = "/share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/rawdata"
output_dir = "/share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D"

Path(output_dir).mkdir(exist_ok=True)

for y in range(2003, 2021):
    files = " ".join([f"{input_dir}/chirps-v3.0.{y}.{m:02d}.days_p05.nc" for m in range(1, 13)])
    cmd = f"cdo -f nc4c -z zip_3 -mergetime {files} {output_dir}/P_chirpsV3_{y}_1D_p05_mm1d.nc"
    subprocess.run(cmd, shell=True)
    print(f"完成 {y}")


cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2003_1D_p05_mm1d.nc already exists!


完成 2003



cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2004_1D_p05_mm1d.nc already exists!


完成 2004



cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2005_1D_p05_mm1d.nc already exists!


完成 2005



cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2006_1D_p05_mm1d.nc already exists!


完成 2006



cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2007_1D_p05_mm1d.nc already exists!


完成 2007



cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2008_1D_p05_mm1d.nc already exists!


完成 2008



cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2009_1D_p05_mm1d.nc already exists!


完成 2009



cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2010_1D_p05_mm1d.nc already exists!


完成 2010



cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2011_1D_p05_mm1d.nc already exists!


完成 2011



cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2012_1D_p05_mm1d.nc already exists!


完成 2012



cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2013_1D_p05_mm1d.nc already exists!


完成 2013



cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2014_1D_p05_mm1d.nc already exists!


完成 2014



cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2015_1D_p05_mm1d.nc already exists!


完成 2015



cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2016_1D_p05_mm1d.nc already exists!


完成 2016



cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2017_1D_p05_mm1d.nc already exists!


完成 2017



cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2018_1D_p05_mm1d.nc already exists!


完成 2018



cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2019_1D_p05_mm1d.nc already exists!


完成 2019
完成 2020



cdo    mergetime (Abort): Outputfile /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D/P_chirpsV3_2020_1D_p05_mm1d.nc already exists!


### 1.2 Transfer the daily to 8-days

In [None]:
#!/usr/bin/env python3
"""
高效处理每年daily数据得到8天数据
专门针对已合并的年数据文件进行优化
"""

import numpy as np
import pandas as pd
import xarray as xr
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# =========================
# CONFIG (EDIT THIS ONLY)
# =========================
IN_DIR = Path("/share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D").resolve()
OUT_DIR = Path("/share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/8D").resolve()
OUT_DIR.mkdir(parents=True, exist_ok=True)

START_YEAR = 2003
END_YEAR = 2020

VAR_NAME = "precip"
FILLVALUE = -9999.0
COMP_LEVEL = 1
# =========================


def process_one_year_fast(year: int):
    """
    高效处理单年daily数据为8天数据
    确保每年都有46个窗口，前45个是8天和，最后一个是剩余天数的和
    """
    input_file = IN_DIR / f"P_chirpsV3_{year}_1D_p05_mm1d.nc"
    
    if not input_file.exists():
        print(f"[SKIP] {year}: 输入文件不存在")
        return
    
    print(f"[{year}] 处理中...")
    
    # 高效加载数据 - 使用分块读取
    ds = xr.open_dataset(input_file, chunks={"time": 366})
    
    if VAR_NAME not in ds:
        raise KeyError(f"变量 '{VAR_NAME}' 未找到")
    
    da = ds[VAR_NAME]
    
    # 掩膜填充值
    da = da.where(da != FILLVALUE)
    
    # 获取时间信息
    time_values = da.time.values
    n_days = len(time_values)
    
    # 计算8天窗口
    # 每年都有46个窗口：前45个是8天，最后1个是剩余天数
    n_windows = 46
    
    # 预分配结果数组
    result_data = np.full((n_windows, da.shape[1], da.shape[2]), np.nan, dtype=np.float32)
    result_times = []
    
    # 计算每个窗口
    for w in range(n_windows):
        # 计算窗口的起始和结束索引
        start_idx = w * 8
        end_idx = min(start_idx + 8, n_days)  # 最后一个窗口可能不足8天
        
        if start_idx >= n_days:
            break
        
        # 选择该窗口的数据
        window_data = da.isel(time=slice(start_idx, end_idx))
        
        # 计算该窗口的和（跳过NaN）
        window_sum = window_data.sum(dim='time', skipna=True)
        
        # 存储结果
        result_data[w] = window_sum.values
        
        # 计算窗口的时间（起始日）
        window_start_time = pd.Timestamp(year, 1, 1) + pd.Timedelta(days=start_idx)
        result_times.append(window_start_time)
    
    # 创建结果数据集
    result_da = xr.DataArray(
        result_data,
        dims=['time', 'latitude', 'longitude'],
        coords={
            'time': np.array(result_times, dtype='datetime64[ns]'),
            'latitude': da.latitude.values,
            'longitude': da.longitude.values
        },
        name=VAR_NAME,
        attrs=da.attrs.copy()
    )
    
    # 更新属性
    result_da.attrs['units'] = 'mm/8day'
    result_da.attrs['long_name'] = '8-day accumulated precipitation'
    result_da.attrs['aggregation'] = 'Non-overlapping 8-day sum aligned to Jan 1'
    result_da.attrs.pop('missing_value', None)
    result_da.attrs.pop('_FillValue', None)
    
    # 创建输出数据集
    out_ds = xr.Dataset({VAR_NAME: result_da})
    
    # 输出文件
    out_path = OUT_DIR / f"P_chirpsV3_{year}_8D_p05_mm8d.nc"
    encoding = {
        VAR_NAME: {
            'dtype': 'float32',
            'zlib': True,
            'complevel': COMP_LEVEL,
            '_FillValue': np.nan,
        }
    }
    
    out_ds.to_netcdf(out_path, encoding=encoding)
    ds.close()
    
    # 验证结果
    result_ds = xr.open_dataset(out_path)
    n_output_windows = len(result_ds.time)
    result_ds.close()
    
    print(f"[{year}] ✓ 完成: {n_days}天 -> {n_output_windows}个8天窗口")
    print(f"      输出文件: {out_path.name}")
    
    return n_output_windows


def process_one_year_chunked(year: int):
    """
    使用分块处理的版本，适用于大数据
    """
    input_file = IN_DIR / f"P_chirpsV3_{year}_1D_p05_mm1d.nc"
    
    if not input_file.exists():
        print(f"[SKIP] {year}: 输入文件不存在")
        return
    
    print(f"[{year}] 分块处理中...")
    
    # 使用更大的分块
    ds = xr.open_dataset(input_file, chunks={'time': 366, 'latitude': 100, 'longitude': 100})
    da = ds[VAR_NAME].where(ds[VAR_NAME] != FILLVALUE)
    
    # 获取时间维度信息
    n_days = len(da.time)
    n_windows = 46
    
    # 使用列表推导式并行计算每个窗口
    results = []
    times = []
    
    for w in range(n_windows):
        start_idx = w * 8
        end_idx = min(start_idx + 8, n_days)
        
        if start_idx >= n_days:
            break
        
        # 选择窗口数据并计算和
        window_da = da.isel(time=slice(start_idx, end_idx))
        window_sum = window_da.sum(dim='time', skipna=True)
        
        # 立即计算并存储
        results.append(window_sum.compute())
        
        # 计算窗口起始时间
        window_time = pd.Timestamp(year, 1, 1) + pd.Timedelta(days=start_idx)
        times.append(window_time)
    
    # 合并结果
    result_da = xr.concat(results, dim='time')
    result_da = result_da.assign_coords(time=np.array(times, dtype='datetime64[ns]'))
    
    # 设置属性
    result_da.attrs.update({
        'units': 'mm/8day',
        'long_name': '8-day accumulated precipitation',
        'aggregation': 'Non-overlapping 8-day sum aligned to Jan 1'
    })
    
    # 保存
    out_path = OUT_DIR / f"P_chirpsV3_{year}_8D_p05_mm8d.nc"
    encoding = {
        VAR_NAME: {
            'dtype': 'float32',
            'zlib': True,
            'complevel': COMP_LEVEL,
            '_FillValue': np.nan,
        }
    }
    
    out_ds = xr.Dataset({VAR_NAME: result_da})
    out_ds.to_netcdf(out_path, encoding=encoding)
    ds.close()
    
    print(f"[{year}] ✓ 完成: {n_days}天 -> {len(times)}个8天窗口")
    return len(times)


def process_one_year_vectorized(year: int):
    """
    向量化版本，最快但需要更多内存
    """
    input_file = IN_DIR / f"P_chirpsV3_{year}_1D_p05_mm1d.nc"
    
    if not input_file.exists():
        print(f"[SKIP] {year}: 输入文件不存在")
        return
    
    print(f"[{year}] 向量化处理中...")
    
    # 一次性加载所有数据（如果内存足够）
    ds = xr.open_dataset(input_file)
    da = ds[VAR_NAME].where(ds[VAR_NAME] != FILLVALUE)
    
    n_days = len(da.time)
    
    # 创建8天索引
    bin_indices = np.arange(n_days) // 8
    
    # 但我们需要确保每年只有46个窗口
    # 所以需要调整最后一个窗口的索引
    max_bins = 46
    if bin_indices.max() >= max_bins:
        # 将所有超过45的索引都设为45（最后一个窗口）
        bin_indices = np.where(bin_indices >= max_bins, max_bins - 1, bin_indices)
    
    # 转换为xarray数组用于分组
    bin_da = xr.DataArray(bin_indices, dims=['time'], coords={'time': da.time})
    
    # 按8天窗口分组求和
    result_da = da.groupby(bin_da).sum(dim='time', skipna=True)
    
    # 创建时间坐标（每年从1月1日开始）
    time_coords = []
    for b in range(len(result_da.group)):
        start_day = b * 8
        window_time = pd.Timestamp(year, 1, 1) + pd.Timedelta(days=start_day)
        time_coords.append(window_time)
    
    # 更新坐标
    result_da = result_da.assign_coords(time=('group', np.array(time_coords, dtype='datetime64[ns]')))
    result_da = result_da.swap_dims({'group': 'time'}).drop_vars('group')
    
    # 设置属性
    result_da.attrs.update({
        'units': 'mm/8day',
        'long_name': '8-day accumulated precipitation',
        'aggregation': 'Non-overlapping 8-day sum aligned to Jan 1'
    })
    
    # 保存
    out_path = OUT_DIR / f"P_chirpsV3_{year}_8D_p05_mm8d.nc"
    encoding = {
        VAR_NAME: {
            'dtype': 'float32',
            'zlib': True,
            'complevel': COMP_LEVEL,
            '_FillValue': np.nan,
        }
    }
    
    out_ds = xr.Dataset({VAR_NAME: result_da})
    out_ds.to_netcdf(out_path, encoding=encoding)
    ds.close()
    
    print(f"[{year}] ✓ 完成: {n_days}天 -> {len(result_da.time)}个8天窗口")
    return len(result_da.time)


def main():
    """主函数"""
    print("=" * 60)
    print("高效8天数据生成工具")
    print(f"输入目录: {IN_DIR}")
    print(f"输出目录: {OUT_DIR}")
    print(f"年份范围: {START_YEAR}-{END_YEAR}")
    print("=" * 60)
    
    total_windows = 0
    
    for year in range(START_YEAR, END_YEAR + 1):
        try:
            # 使用向量化版本（最快）
            n_windows = process_one_year_vectorized(year)
            if n_windows:
                total_windows += n_windows
        except MemoryError:
            print(f"[{year}] ⚠ 内存不足，回退到分块版本")
            # 如果内存不足，使用分块版本
            n_windows = process_one_year_chunked(year)
            if n_windows:
                total_windows += n_windows
        except Exception as e:
            print(f"[{year}] ✗ 错误: {e}")
    
    print("\n" + "=" * 60)
    print(f"处理完成!")
    print(f"总生成窗口数: {total_windows}")
    print(f"输出目录: {OUT_DIR}")
    print("=" * 60)

if __name__ == "__main__":
    main()

高效8天数据生成工具
输入目录: /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/1D
输出目录: /share/home/dq076/bedrock/data/P/CHIRPS_v3_IMERGlate_v07/8D
年份范围: 2003-2020
[2003] 向量化处理中...
[2003] ✓ 完成: 365天 -> 46个8天窗口
[2004] 向量化处理中...
[2004] ✓ 完成: 366天 -> 46个8天窗口
[2005] 向量化处理中...
[2005] ✓ 完成: 365天 -> 46个8天窗口
[2006] 向量化处理中...
[2006] ✓ 完成: 365天 -> 46个8天窗口
[2007] 向量化处理中...
[2007] ✓ 完成: 365天 -> 46个8天窗口
[2008] 向量化处理中...
[2008] ✓ 完成: 366天 -> 46个8天窗口
[2009] 向量化处理中...
[2009] ✓ 完成: 365天 -> 46个8天窗口
[2010] 向量化处理中...
[2010] ✓ 完成: 365天 -> 46个8天窗口
[2011] 向量化处理中...
[2011] ✓ 完成: 365天 -> 46个8天窗口
[2012] 向量化处理中...
[2012] ✓ 完成: 366天 -> 46个8天窗口
[2013] 向量化处理中...
[2013] ✓ 完成: 365天 -> 46个8天窗口
[2014] 向量化处理中...
[2014] ✓ 完成: 365天 -> 46个8天窗口
[2015] 向量化处理中...
[2015] ✓ 完成: 365天 -> 46个8天窗口
[2016] 向量化处理中...
[2016] ✓ 完成: 366天 -> 46个8天窗口
[2017] 向量化处理中...
[2017] ✓ 完成: 365天 -> 46个8天窗口
[2018] 向量化处理中...
[2018] ✓ 完成: 365天 -> 46个8天窗口
[2019] 向量化处理中...
[2019] ✓ 完成: 365天 -> 46个8天窗口
[2020] 向量化处理中...
[2020] ✓ 完成: 366天 -> 46个8天窗口

处理完成!
总生成窗口数