In [None]:
# build_thinning_events.py
# Step 1: scan disturbance rasters, keep only "Thinning" pixels, write partitioned Parquet by year/tile.

import os, re, uuid, pathlib, warnings
from pathlib import Path
import numpy as np
import pandas as pd
import rasterio
from rasterio.windows import Window
from dbfread import DBF
import pyarrow as pa
import pyarrow.parquet as pq

# ---------------- config ----------------
# All paths below are relative, so they resolve against the current working
# directory of the process that runs this cell/script.
ROOT = Path("../../Dataset")
# Directory that main() scans (recursively) for "Tif/*.tif" rasters.
DATASET = ROOT / "AnnualDisturbance_1999_present"
# Root of the partitioned Parquet output (yr=/tile_i=/tile_j= sub-folders).
OUT = Path("../../Outputs/treatments")
TILE = 4096  # tile edge length in pixels; used to derive tile_i/tile_j and in-tile r/c
BAND = 1     # raster band index passed to block_windows()/read() (first band)
# Disturbance type to keep; compared against the code table after strip()+lower().
TREATMENT_NAME = "Thinning"
# ----------------------------------------

# NOTE: executed at import/cell-run time, not inside main().
OUT.mkdir(parents=True, exist_ok=True)

def find_tifs(base: Path):
    """Return, in sorted order, every ``*.tif`` located directly in a ``Tif`` folder under *base*.

    The search is recursive, which is what lets it cover both the
    US_DISTYYYY and the LF20YY_Dist_*** folder layouts.
    """
    matches = list(base.rglob("Tif/*.tif"))
    matches.sort()
    return matches

def year_from_path(p: Path):
    """Infer the four-digit year of raster *p*.

    The path text is searched first for ``US_DIST`` or ``LF`` followed
    (after optional non-digits) by four digits. If that fails, the raster is
    opened and its ``DIST_YEAR`` / ``Year`` tag is used instead.

    Returns the year as ``int``, or ``None`` when neither source yields one
    (any error while reading the raster tags is treated as "no year").
    """
    match = re.search(r"(?:US_DIST|LF)\D*?(\d{4})", str(p))
    if match is not None:
        return int(match.group(1))

    # Path gave nothing: fall back to the raster's own metadata tags.
    try:
        with rasterio.open(p) as src:
            tags = src.tags()
            raw_year = tags.get("DIST_YEAR") or tags.get("Year")
            if not raw_year:
                return None
            return int(raw_year)
    except Exception:
        return None

def find_vat_for_tif(tif_path: Path):
    """Locate the code table (VAT ``.dbf`` or codes ``.csv``) that belongs to *tif_path*.

    Lookup order:
      1. ``<name>.tif.vat.dbf`` next to the raster (exact file name).
      2. Any ``*.vat.dbf`` in the raster's folder (first in sorted order).
      3. Any ``*.csv`` in the sibling ``CSV_Data`` folder (first in sorted order).

    Returns the matching ``Path``, or ``None`` when nothing is found.
    """
    folder = tif_path.parent

    # Check the exact name directly rather than through glob(): a raster name
    # containing glob metacharacters ("[", "?", "*") would otherwise be
    # interpreted as a pattern and miss its own VAT.
    exact = folder / f"{tif_path.name}.vat.dbf"
    if exact.is_file():
        return exact

    # glob() order is filesystem-dependent; sort so the choice is reproducible.
    vats = sorted(folder.glob("*.vat.dbf"))
    if vats:
        return vats[0]

    # folder.parent (instead of parents[1]) also works for a bare file name,
    # where parents[1] would raise IndexError.
    csv_dir = folder.parent / "CSV_Data"
    if csv_dir.is_dir():
        csvs = sorted(csv_dir.glob("*.csv"))
        if csvs:
            return csvs[0]
    return None

def load_code_table(vat_path: Path):
    """Load a raster code table and reduce it to two normalized columns.

    Parameters
    ----------
    vat_path : Path
        A ``.dbf`` value-attribute table (read with dbfread) or a ``.csv``
        codes file (read with pandas). The suffix check is case-insensitive.

    Returns
    -------
    pandas.DataFrame
        Columns ``CODE`` (nullable ``Int64``) and ``DIST_TYPE`` (stripped,
        lower-cased ``str``). Rows whose code is not numeric are dropped.

    Raises
    ------
    FileNotFoundError
        If the suffix is neither ``.dbf`` nor ``.csv``.
    KeyError
        If no code column or no disturbance-type column can be identified.
    """
    suffix = vat_path.suffix.lower()
    if suffix == ".dbf":
        df = pd.DataFrame(DBF(str(vat_path), load=True))
    elif suffix == ".csv":
        df = pd.read_csv(vat_path)
    else:
        raise FileNotFoundError(f"Unsupported VAT/CSV: {vat_path}")

    # Normalize headers so that look-ups are case- and padding-insensitive
    # (e.g. " Dist_Type " -> "DIST_TYPE"). str() guards against non-string
    # header labels.
    df.columns = [str(c).strip().upper() for c in df.columns]
    columns = list(df.columns)

    # Code column: first present candidate wins.
    code_col = next(
        (k for k in ("VALUE", "GRIDCODE", "CODE", "VALUE_") if k in columns),
        None,
    )
    if code_col is None:
        raise KeyError(f"No code column in {vat_path}")

    # Type column: preferred exact names first, then any header that mentions
    # both "DIST" and "TYPE" (covers variants such as DIST_TYPE_2 or
    # DISTURBANCE_TYPE).
    type_col = next(
        (k for k in ("DIST_TYPE", "DISTTYPE", "DIST TYPE", "DIST_TYPE1") if k in columns),
        None,
    )
    if type_col is None:
        type_col = next((c for c in columns if "TYPE" in c and "DIST" in c), None)
    if type_col is None:
        raise KeyError(f"No disturbance type column in {vat_path}")

    out = df[[code_col, type_col]].copy()
    out.columns = ["CODE", "DIST_TYPE"]
    # Non-numeric codes become <NA> here and are removed by the final dropna.
    out["CODE"] = pd.to_numeric(out["CODE"], errors="coerce").astype("Int64")
    out["DIST_TYPE"] = out["DIST_TYPE"].astype(str).str.strip().str.lower()
    return out.dropna(subset=["CODE"])

def thinning_codes(vat_df: pd.DataFrame, treat_name=TREATMENT_NAME):
    """Return the set of integer codes whose ``DIST_TYPE`` equals *treat_name*.

    *treat_name* is stripped and lower-cased before the comparison, so it is
    expected to be matched against an already lower-cased ``DIST_TYPE``
    column. Rows with a missing ``CODE`` are ignored.
    """
    wanted = treat_name.strip().lower()
    is_match = vat_df["DIST_TYPE"] == wanted
    matching_codes = vat_df.loc[is_match, "CODE"].dropna().astype(int)
    return set(matching_codes.unique().tolist())

def ensure_part_dir(root: Path, year: int, ti: int, tj: int) -> Path:
    """Create (if needed) and return ``root/yr=<year>/tile_i=<ti>/tile_j=<tj>``."""
    part_dir = root.joinpath(f"yr={year}", f"tile_i={ti}", f"tile_j={tj}")
    # exist_ok makes repeated calls for the same partition harmless.
    part_dir.mkdir(parents=True, exist_ok=True)
    return part_dir

def write_part(df_chunk: pd.DataFrame, year: int, out_root: Path):
    """Write *df_chunk* as Parquet, one new file per (tile_i, tile_j) group.

    Each group goes to its own partition directory under *out_root* (created
    on demand) in a file named ``part-<8 hex chars>.parquet``, compressed
    with zstd and with dictionary encoding enabled. The DataFrame index is
    not stored.
    """
    by_tile = df_chunk.groupby(["tile_i", "tile_j"])
    for tile_key, tile_rows in by_tile:
        tile_i, tile_j = tile_key
        target_dir = ensure_part_dir(out_root, year, int(tile_i), int(tile_j))
        # Random suffix so repeated calls for the same tile add files
        # instead of overwriting earlier ones.
        target = target_dir / f"part-{uuid.uuid4().hex[:8]}.parquet"
        pq.write_table(
            pa.Table.from_pandas(tile_rows, preserve_index=False),
            target,
            compression="zstd",
            use_dictionary=True,
        )

def process_one_tif(tif_path: Path, out_root: Path, tile_size=TILE):
    """Extract the thinning pixels of one disturbance raster into partitioned Parquet.

    The raster's year and code table are resolved first; the band ``BAND`` is
    then read block window by block window, and every pixel whose value is one
    of the thinning codes (and is not nodata) becomes one output row.

    Parameters
    ----------
    tif_path : Path
        GeoTIFF to scan.
    out_root : Path
        Root directory of the partitioned output.
    tile_size : int
        Tile edge length in pixels used to derive ``tile_i``/``tile_j`` and
        the in-tile offsets ``r``/``c``.

    Output columns: ``year``, ``tile_i``, ``tile_j``, ``row``, ``col`` (full
    raster coordinates), ``r``, ``c`` (offsets inside the tile) and
    ``type_code`` (always 1 = Thinning).

    Raises
    ------
    ValueError
        If the year cannot be inferred, or *tile_size* is not positive.
    FileNotFoundError
        If no VAT/CSV code table is found for the raster.
    """
    if tile_size <= 0:
        raise ValueError(f"tile_size must be positive, got {tile_size}")

    year = year_from_path(tif_path)
    if not year:
        raise ValueError(f"Cannot infer year from path: {tif_path}")

    vat_path = find_vat_for_tif(tif_path)
    if not vat_path:
        raise FileNotFoundError(f"No VAT/CSV found for {tif_path}")

    codes = thinning_codes(load_code_table(vat_path))
    if not codes:
        print(f"[{year}] No '{TREATMENT_NAME}' codes in {vat_path}. Skipping.")
        return

    sorted_codes = sorted(codes)
    code_arr = np.array(sorted_codes)
    print(f"[{year}] {tif_path.name}: thinning codes = {sorted_codes} from {vat_path.name}")

    # In-tile offsets range over 0..tile_size-1; int16 is enough for the
    # default tile size but would silently wrap for tiles larger than 32768.
    offset_dtype = np.int16 if tile_size - 1 <= np.iinfo(np.int16).max else np.int32

    with rasterio.open(tif_path) as src:
        nodata = src.nodata

        # Stream over the raster's native blocks to keep memory bounded.
        # NOTE: every window with hits results in its own write_part() call.
        for _, window in src.block_windows(BAND):
            block = src.read(BAND, window=window)

            if nodata is not None:
                valid = block != nodata
                if not valid.any():
                    # Entirely nodata: skip the membership test altogether.
                    continue
                is_thin = valid & np.isin(block, code_arr)
            else:
                is_thin = np.isin(block, code_arr)

            if not is_thin.any():
                continue

            # Window-local indices -> full-raster row/col.
            local_rows, local_cols = np.nonzero(is_thin)
            rows = local_rows + int(window.row_off)
            cols = local_cols + int(window.col_off)
            n_hits = rows.shape[0]

            df = pd.DataFrame({
                "year": np.full(n_hits, year, dtype=np.int16),
                "tile_i": (rows // tile_size).astype(np.int32),
                "tile_j": (cols // tile_size).astype(np.int32),
                "row": rows.astype(np.int32),
                "col": cols.astype(np.int32),
                "r": (rows % tile_size).astype(offset_dtype),
                "c": (cols % tile_size).astype(offset_dtype),
                "type_code": np.full(n_hits, 1, dtype=np.int16),  # 1 = Thinning
            })
            write_part(df, year, out_root)

def main():
    """Process every GeoTIFF found under ``DATASET``.

    Exits via ``SystemExit`` when no raster is found. A failure on one raster
    is reported as a warning and does not stop the remaining ones.
    """
    rasters = find_tifs(DATASET)
    if len(rasters) == 0:
        raise SystemExit(f"No GeoTIFFs under {DATASET}")

    for raster in rasters:
        try:
            process_one_tif(raster, OUT, TILE)
        except Exception as err:
            # Best-effort batch: report the failure and move on.
            warnings.warn(f"Failed {raster}: {err}")


if __name__ == "__main__":
    main()


[2015] LC15_Dist_200.tif: thinning codes = [441, 442, 443, 741, 742, 743, 941] from LC15_Dist_200.tif.vat.dbf
[2016] LC16_Dist_200.tif: thinning codes = [441, 442, 443, 741, 742, 743, 941] from LC16_Dist_200.tif.vat.dbf
[2017] LC17_Dist_220.tif: thinning codes = [441, 442, 443, 741, 742, 743, 941] from LC17_Dist_220.tif.vat.dbf
[2018] LC18_Dist_220.tif: thinning codes = [441, 442, 443, 741, 742, 743, 941] from LC18_Dist_220.tif.vat.dbf
[2019] LC19_Dist_220.tif: thinning codes = [441, 442, 443, 741, 742, 743, 941] from LC19_Dist_220.tif.vat.dbf
[2020] LC20_Dist_220.tif: thinning codes = [441, 442, 443, 741, 742, 743, 941] from LC20_Dist_220.tif.vat.dbf
[2021] LC21_Dist_230.tif: thinning codes = [441, 442, 443, 741, 742, 743, 941] from LC21_Dist_230.tif.vat.dbf
[2022] LC22_Dist_230.tif: thinning codes = [441, 442, 443, 741, 742, 743, 941] from LC22_Dist_230.tif.vat.dbf
[2023] LC23_Dist_240.tif: thinning codes = [441, 442, 443, 741, 742, 743, 941] from LC23_Dist_240.tif.vat.dbf
[2024] LC2