In [1]:
import os
import pandas as pd
import numpy as np
import xarray as xr

COL_QSIM = 'Q_sim'  # Simulated flow column

def load_zarr_data(hz_path: str) -> xr.Dataset:
    """
    Open the river-flow Zarr store(s) found at ``hz_path`` as a single Dataset.

    The path is handed to ``xr.open_mfdataset`` with the zarr engine; inputs are
    combined with ``combine='nested'`` along the ``rivid`` dimension and opened
    with ``parallel=True``. The caller is responsible for closing the result.
    """
    open_options = {
        'concat_dim': 'rivid',
        'combine': 'nested',
        'parallel': True,
        'engine': 'zarr',
    }
    return xr.open_mfdataset(hz_path, **open_options)

def extract_river_flow(hz: xr.Dataset, rivid: int) -> pd.DataFrame:
    """
    Pull the simulated flow series of one river out of the dataset.

    ``Qout`` is sliced positionally as ``[all rows, columns where rivid matches]``
    and the result is indexed by ``hz['time']``, so this assumes the first
    dimension of ``Qout`` is time and the second is rivid (TODO confirm against
    the store). The returned frame has a single column named ``COL_QSIM``.

    NOTE(review): the frame is built with exactly one column name, so a rivid
    that matches zero or several columns would make the DataFrame constructor
    fail on a shape mismatch.
    """
    # Boolean selector over the rivid coordinate for the requested river
    rivid_mask = hz.rivid.values == rivid
    flow_values = hz['Qout'][:, rivid_mask].values
    timestamps = hz['time'].values
    return pd.DataFrame(flow_values, index=timestamps, columns=[COL_QSIM])

def fdc(flows: np.array, steps: int = 101) -> pd.DataFrame:
    """
    Build a flow duration curve (FDC) table from a set of flow values.

    ``steps`` evenly spaced values between 0 and 100 (inclusive) are used as
    percentile ranks; NaNs in ``flows`` are ignored when the percentiles are
    computed.

    Returns a DataFrame with two columns:
      * ``p_exceed`` - the percentile ranks (0..100)
      * ``fdc``      - the flow at each of those percentile ranks

    NOTE(review): ``p_exceed`` is numerically the percentile rank, i.e. 0 maps to
    the smallest flow and 100 to the largest. Confirm downstream consumers
    expect this orientation for an "exceedance" axis.
    """
    percent_ranks = np.linspace(0, 100, steps)
    return pd.DataFrame({
        'p_exceed': percent_ranks,
        'fdc': np.nanpercentile(flows, percent_ranks),
    })

def compute_monthly_fdcs(hz_path: str, output_file: str = None, num_rivids: int = None,
                         start_year: int = 1941, end_year: int = 2025):
    """
    Compute monthly flow duration curves for each rivid and save or display results.

    For every selected rivid the simulated flow series is restricted to the
    years ``start_year``..``end_year`` (both inclusive), split by calendar
    month, stripped of NaN rows and clipped at zero before being passed to
    ``fdc``. The per-rivid/per-month tables are concatenated into one long
    DataFrame with columns ``p_exceed``, ``fdc``, ``rivid`` and ``Month``.

    Parameters
    ----------
    hz_path : str
        Path handed to ``load_zarr_data``.
    output_file : str, optional
        Parquet file to write. When falsy (``None`` or ``""``) the table is
        printed instead of saved.
    num_rivids : int, optional
        Process only the first ``num_rivids`` river IDs (useful for testing).
        ``None`` processes all of them.
    start_year, end_year : int
        Inclusive year window applied to each series. Defaults reproduce the
        previously hard-coded 1941-2025 range.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If no rivid/month combination yielded any data, so there is nothing to
        concatenate (e.g. ``num_rivids=0``).
    """
    all_fdc_data = []

    # The context manager closes the dataset even when a rivid fails part-way
    # through the loop; previously close() was only reached on full success.
    with load_zarr_data(hz_path) as hz:
        rivids = hz.rivid.values  # Get all river IDs in the dataset
        if num_rivids is not None:
            rivids = rivids[:num_rivids]
        print(f"Total rivids being processed: {len(rivids)}")

        for rivid in rivids:
            print(f"Processing rivid: {rivid}")
            sim_flow_df = extract_river_flow(hz, rivid)
            sim_flow_df.index = pd.to_datetime(sim_flow_df.index)

            # Keep only the requested (inclusive) year window
            years = sim_flow_df.index.year
            sim_flow_df = sim_flow_df[(years >= start_year) & (years <= end_year)]

            # Month of every remaining timestamp, computed once per rivid
            # instead of once per month iteration.
            months = sim_flow_df.index.month

            for month in range(1, 13):
                monthly_flow = sim_flow_df[months == month].dropna().clip(lower=0)
                if monthly_flow.empty:
                    print(f"  - No data for rivid {rivid} in month {month}")
                    continue

                fdc_df = fdc(monthly_flow[COL_QSIM].values)
                fdc_df['rivid'] = rivid
                fdc_df['Month'] = month
                all_fdc_data.append(fdc_df)

    # Everything needed is already materialised in pandas objects, so the
    # dataset can be closed before the results are assembled and written.
    if not all_fdc_data:
        # pd.concat would raise a bare "No objects to concatenate" here; keep
        # the exception type but say what actually went wrong.
        raise ValueError(
            f"No FDC data was produced: no rivid/month had any non-NaN flow "
            f"between {start_year} and {end_year}."
        )

    all_fdc_df = pd.concat(all_fdc_data, ignore_index=True)

    if output_file:
        all_fdc_df.to_parquet(output_file)
    else:
        print(all_fdc_df)

# --- Run configuration for the monthly FDC computation ---
# Number of rivids to process while testing; None processes every rivid.
num_rivids = None  # mention number of rivids for testing or None for using all rivids
# Input Zarr store with the simulated flows (machine-specific path).
hz_path = "/Users/yubinbaaniya/Documents/WORLD BIAS/saber workdir/2nd_iteration_simulation_data.zarr"  # Replace with the path to your Zarr file
# Output parquet table; a falsy value makes the function print the table instead.
output_file = "/Users/yubinbaaniya/Documents/WORLD BIAS/saber workdir/tables/monthly_simulated_FDC.parquet"  # Set to None if you don't want to save to a file during testing

compute_monthly_fdcs(
    hz_path,
    output_file=output_file,
    num_rivids=num_rivids,
)


Total rivids being processed: 19272
Processing rivid: 110099693
Processing rivid: 110149488
Processing rivid: 110206983
Processing rivid: 110207080
Processing rivid: 110220341
Processing rivid: 110231984
Processing rivid: 110247633
Processing rivid: 110252406
Processing rivid: 110275139
Processing rivid: 110284460
Processing rivid: 110287075
Processing rivid: 110296320
Processing rivid: 110306692
Processing rivid: 110310578
Processing rivid: 110313194
Processing rivid: 110314726
Processing rivid: 110315855
Processing rivid: 110325014
Processing rivid: 110326327
Processing rivid: 110330340
Processing rivid: 110334201
Processing rivid: 110351299
Processing rivid: 110357907
Processing rivid: 110384112
Processing rivid: 110392033
Processing rivid: 110394507
Processing rivid: 110397166
Processing rivid: 110397259
Processing rivid: 110410366
Processing rivid: 110414341
Processing rivid: 110415641
Processing rivid: 110418244
Processing rivid: 110418274
Processing rivid: 110426145
Processing r

Convert the monthly FDC Parquet table to Zarr

In [9]:
#ZARR functions
def create_xarray_zarr(melted_df):
    """
    Convert a long-format FDC table into a chunked 3-D xarray Dataset.

    Despite its name this function does not write anything to disk; it only
    builds the Dataset (pass the result to ``save_to_zarr`` to persist it).

    Parameters:
    melted_df (pd.DataFrame): Long-format DataFrame with columns 'rivid', 'Month',
        'p_exceed', 'fdc'. It is not modified. 'Month' and 'p_exceed' are cast
        to int; if the same (rivid, p_exceed, Month) key occurs more than once,
        the last such row in the input wins.

    Returns:
    xarray.Dataset: Dataset holding one variable 'fdc' on the dimensions
        ('rivid', 'p_exceed', 'month'), each coordinate sorted ascending.
        Cells without a matching input row are NaN.
    """
    key_cols = ['rivid', 'p_exceed', 'Month']

    # Work on just the needed columns. astype() returns a new frame, so the
    # caller's DataFrame is left untouched without copying every column.
    # NOTE(review): astype(int) truncates, so a non-integer p_exceed grid would
    # collapse onto whole numbers - confirm the input always uses integer steps.
    long_df = (
        melted_df[key_cols + ['fdc']]
        .astype({'Month': int, 'p_exceed': int})
        .drop_duplicates(subset=key_cols, keep='last')
    )

    # np.unique returns the sorted coordinate values together with, for every
    # row, the position of its value along that axis. This replaces both the
    # lookup dictionaries and the per-row Python loop over iterrows(), which
    # was prohibitively slow for millions of rows and up-cast each row to a
    # common dtype before the rivid lookup.
    gauges, gauge_pos = np.unique(long_df['rivid'].to_numpy(), return_inverse=True)
    p_exceed_values, p_pos = np.unique(long_df['p_exceed'].to_numpy(), return_inverse=True)
    months, month_pos = np.unique(long_df['Month'].to_numpy(), return_inverse=True)

    # Start from an all-NaN cube and scatter every row into its cell in one
    # vectorised assignment (keys are unique after drop_duplicates, so the
    # result does not depend on assignment order).
    data = np.full((len(gauges), len(p_exceed_values), len(months)), np.nan)
    data[gauge_pos, p_pos, month_pos] = long_df['fdc'].to_numpy()

    # Create xarray Dataset
    ds = xr.Dataset(
        {
            'fdc': (['rivid', 'p_exceed', 'month'], data)
        },
        coords={
            'rivid': gauges,
            'p_exceed': p_exceed_values,
            'month': months
        }
    )

    # Chunk the dataset - adjust chunk sizes based on your needs
    ds = ds.chunk({
        'rivid': min(50, len(gauges)),
        'p_exceed': min(101, len(p_exceed_values)),
        'month': min(12, len(months))
    })

    return ds

def save_to_zarr(ds, filename='/Users/yubinbaaniya/Documents/WORLD BIAS/saber workdir/simulated_monthly_fdc.zarr'):
    """
    Write an xarray Dataset to a Zarr store and report where it was written.

    Parameters:
    ds (xarray.Dataset): Dataset to save
    filename (str): Destination of the Zarr store. It is resolved to an absolute
        path before writing. The default is a hardcoded, machine-specific
        location - pass your own path when running elsewhere.

    Returns:
    str: Absolute path to the saved Zarr file
    """
    abs_path = os.path.abspath(filename)

    # Only the store and mode are given: no encoding is passed, so any
    # compression is whatever to_zarr applies by default.
    # NOTE(review): mode='w' is expected to replace an existing store at this
    # path - confirm that overwriting is intended.
    ds.to_zarr(store=abs_path, mode='w')

    print(f"Zarr file saved to: {abs_path}")
    return abs_path


# Build the chunked FDC Dataset from the long-format table, then write it to Zarr.
# NOTE(review): `melted_df` is not defined in the cells shown here - presumably
# it is the monthly FDC parquet loaded in another cell; confirm it exists when
# the notebook is run top-to-bottom on a fresh kernel.
ds = create_xarray_zarr(melted_df)
# No filename is passed, so save_to_zarr writes to its default path and
# returns the absolute location, kept here as `zarr_path`.
zarr_path = save_to_zarr(ds)

Zarr file saved to: /Users/yubinbaaniya/Documents/WORLD BIAS/saber workdir/simulated_monthly_fdc.zarr
