In [15]:
import os
import glob
import numpy as np
import rasterio
from rasterio.mask import mask
from rasterio.warp import reproject, Resampling
import geopandas as gpd

# ---- 1) Automatically collect all feature TIFFs whose names start with "Assam_X_" ----
feature_tiffs = sorted(glob.glob("Assam_X_*.tif"))
gt_tiff       = "Assam_Y_Precipitation_GT_geotif.tif"

print(f"Feature files:\n{feature_tiffs}\n")
print(f"Ground truth file:\n{gt_tiff}\n")

# ---- 2) Load & mask Assam boundary ----
shp_path = "SateMask/gadm41_IND_1.shp"
assam = gpd.read_file(shp_path)

# Ensure the GeoDataFrame is in WGS84 lon/lat (EPSG:4326)
if assam.crs is None:
    assam.set_crs(epsg=4326, inplace=True)
else:
    assam = assam.to_crs(epsg=4326)

# Merge into a single polygon and convert to GeoJSON-like dict
geometry = [assam.geometry.unary_union.__geo_interface__]

# ---- 3) Helper: read & mask a multiband TIFF → (data, transform, crs) ----
def read_and_mask(tiff_path):
    with rasterio.open(tiff_path) as src:
        masked, _ = mask(src, geometry, crop=False, filled=False)
        transform = src.transform
        crs       = src.crs
        nodata    = src.nodata

    # Convert to float and turn masked pixels into NaN
    masked = masked.astype(float)
    data = masked.filled(np.nan)
    return data, transform, crs

# ---- 4) Read Ground Truth to get reference grid/CRS ----
gt_data, gt_transform, gt_crs = read_and_mask(gt_tiff)
_, H_ref, W_ref = gt_data.shape
print(f"Ground-truth grid size: {H_ref}×{W_ref}  (total pixels = {H_ref*W_ref})\n")

# (Optional) flatten ground-truth for later use:
GT_flat = gt_data.reshape(gt_data.shape[0], -1).T  # shape: (H_ref*W_ref, n_classes)

# ---- 5) Read each feature TIFF in a loop, reproject if needed, flatten, and store ----
feature_flat_list = []  # will hold (basename, flattened_array) pairs

for path in feature_tiffs:
    feat_data, feat_transform, feat_crs = read_and_mask(path)
    B_f, H_f, W_f = feat_data.shape
    basename = os.path.basename(path)

    # If grid/CRS differ from the ground truth, reproject each band
    if (feat_transform != gt_transform) or (feat_crs != gt_crs) or (H_f != H_ref) or (W_f != W_ref):
        reprojected = np.full((B_f, H_ref, W_ref), np.nan, dtype=float)
        for b in range(B_f):
            dst = np.full((H_ref, W_ref), np.nan, dtype=float)
            reproject(
                source       = feat_data[b],
                destination  = dst,
                src_transform= feat_transform,
                src_crs      = feat_crs,
                dst_transform= gt_transform,
                dst_crs      = gt_crs,
                resampling   = Resampling.nearest,
                src_nodata   = np.nan,
                dst_nodata   = np.nan
            )
            reprojected[b] = dst
        feat_data = reprojected

    # Flatten to shape (n_pixels, B_f)
    feat_flat = feat_data.reshape(B_f, -1).T
    feature_flat_list.append((basename, feat_flat))

# ---- 6) Print each feature’s name and first 5 rows of its flattened array ----
for (name, arr) in feature_flat_list:
    print(f"Feature '{name}'  →  shape: {arr.shape}")
    print("First 5 rows:")
    with np.printoptions(precision=4, suppress=True):
        print(arr[:5])
    print("\n" + "-"*50 + "\n")


Feature files:
[]

Ground truth file:
Assam_Y_Precipitation_GT_geotif.tif



  geometry = [assam.geometry.unary_union.__geo_interface__]


RasterioIOError: Assam_Y_Precipitation_GT_geotif.tif: No such file or directory