### Creating the obs_csv file
#### Author: Tadd Bindas

This notebook creates the list of observations to be used in marquette when creating training basins.

In [49]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point, LineString
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import zarr

import warnings
warnings.filterwarnings('ignore', category=UserWarning)

In [50]:
# Input layers: MERIT river flowlines and the CONUS gage locations.
RIV_SHP = "/Users/taddbindas/data/merit/riv_pfaf_7_MERIT_Hydro_v07_Basins_v01_bugfix1.shp"
GAGES_GPKG = "/Users/taddbindas/data/gages_3000/conus_3000_gages.gpkg"

riv = gpd.read_file(RIV_SHP)

# Gages are reprojected to EPSG:4326 before being compared against the flowlines.
gages_native = gpd.read_file(GAGES_GPKG)
gages = gages_native.to_crs("EPSG:4326")

# Spatial index over the flowlines, used for fast candidate lookup.
sindex = riv.sindex

In [51]:
def find_best_match(riv_gdf, point, drainage_area=None, buffer_dist=0.01):
    """Find the flowline in ``riv_gdf`` that best matches a gage location.

    Candidate flowlines are retrieved from the spatial index of ``riv_gdf``
    using the bounding box of ``point`` buffered by ``buffer_dist``. When a
    positive ``drainage_area`` is supplied, candidates are ranked by a score
    made of 30% normalised distance and 70% relative drainage-area difference
    (capped at 100%); otherwise the nearest candidate wins.

    Parameters
    ----------
    riv_gdf : GeoDataFrame
        Flowlines to search. Must have an ``uparea`` column when
        ``drainage_area`` is used.
    point : shapely geometry
        Location to match (must support ``.buffer`` and be accepted by
        ``GeoSeries.distance``).
    drainage_area : float, optional
        Reported drainage area of the gage, compared against ``uparea``.
        Values that are ``None`` or fail ``> 0`` (including NaN) switch the
        function to distance-only matching.
    buffer_dist : float, default 0.01
        Buffer radius around ``point`` used for the candidate search, in the
        coordinate units of the geometries.

    Returns
    -------
    tuple
        ``(best_idx, distance, area_diff_pct)`` where ``best_idx`` is the
        index *label* of the winning row in ``riv_gdf``, ``distance`` is its
        distance to ``point`` and ``area_diff_pct`` is the percentage
        drainage-area difference (``None`` for distance-only matching).
        ``(None, None, None)`` if no candidate is found.
    """
    bounds = point.buffer(buffer_dist).bounds

    # Use the spatial index of the frame that was passed in, not a module-level
    # global: the positional indices returned here are fed to riv_gdf.iloc
    # below, so they must come from the same frame. geopandas caches the
    # index, so repeated calls do not rebuild it.
    candidate_indices = list(riv_gdf.sindex.intersection(bounds))

    if not candidate_indices:
        return None, None, None

    # .copy() so the helper columns added below never touch riv_gdf itself.
    candidates = riv_gdf.iloc[candidate_indices].copy()
    candidates['distance'] = candidates.geometry.distance(point)

    if drainage_area is not None and drainage_area > 0:
        candidates['area_diff_pct'] = abs(candidates['uparea'] - drainage_area) / drainage_area * 100

        # Combined score (weighted average of distance and area difference).
        # `or 1.0` avoids dividing by zero when every candidate is at distance 0.
        max_dist = candidates['distance'].max() or 1.0
        candidates['score'] = (
            0.3 * (candidates['distance'] / max_dist) +
            0.7 * np.minimum(candidates['area_diff_pct'] / 100, 1.0)
        )

        best_idx = candidates['score'].idxmin()
        best_match = candidates.loc[best_idx]
        area_diff_pct = best_match['area_diff_pct']
    else:
        # Just use distance if drainage area not available
        best_idx = candidates['distance'].idxmin()
        best_match = candidates.loc[best_idx]
        area_diff_pct = None

    return best_idx, best_match['distance'], area_diff_pct

In [52]:
# Match every gage to its best flowline, grade the match, and summarise.
print("Matching gages to flowlines...")
matches = []

# iterrows() is a generator without a length, so tqdm needs an explicit total
# to render a real progress bar instead of "0it [00:00, ?it/s]".
for idx, gage in tqdm(gages.iterrows(), total=len(gages)):
    drainage_area = gage.get('DRAIN_SQKM')
    
    match_idx, distance, area_diff_pct = find_best_match(riv, gage.geometry, drainage_area)
    
    # Gages with no candidate flowline nearby are skipped (not recorded).
    if match_idx is not None:
        flowline = riv.loc[match_idx]
        
        # Determine match quality.
        # NOTE(review): distance thresholds are in the units of the layers' CRS;
        # gages are reprojected to EPSG:4326, so presumably degrees -- confirm
        # that `riv` shares that CRS.
        if area_diff_pct is not None:
            if distance < 0.001 and area_diff_pct < 10:  # Close match in both location and area
                quality = 'excellent'
            elif distance < 0.005 and area_diff_pct < 20:
                quality = 'good'
            elif distance < 0.01 and area_diff_pct < 50:
                quality = 'fair'
            else:
                quality = 'poor'
        else:
            # Base quality only on distance
            if distance < 0.001:
                quality = 'distance_only_excellent'
            elif distance < 0.005:
                quality = 'distance_only_good'
            else:
                quality = 'distance_only_fair'
        
        match_data = {
            'STAID': gage.get('STAID', str(idx)),
            'STANAME': gage.get('STANAME', f"Gage_{idx}"),
            'CLASS': gage.get('CLASS'),
            'STATE': gage.get('STATE'),
            'COMID': flowline.get('COMID', str(match_idx)),
            'distance': distance,
            'DRAIN_SQKM': drainage_area,
            'flowline_drainage_area': flowline.get('uparea'),
            'area_difference_pct': area_diff_pct,
            'match_quality': quality
        }
        
        matches.append(match_data)

# Create DataFrame with matches
matches_df = pd.DataFrame(matches)

# Sort by quality and area difference
if 'match_quality' in matches_df.columns and 'area_difference_pct' in matches_df.columns:
    quality_order = {
        'excellent': 0, 
        'good': 1, 
        'fair': 2, 
        'poor': 3,
        'distance_only_excellent': 4,
        'distance_only_good': 5,
        'distance_only_fair': 6
    }
    matches_df['quality_rank'] = matches_df['match_quality'].map(quality_order)
    matches_df = matches_df.sort_values(['quality_rank', 'area_difference_pct', 'distance'])
elif 'distance' in matches_df.columns:
    # Guarded: with zero matches the frame has no columns at all, and an
    # unconditional sort_values('distance') would raise KeyError.
    matches_df = matches_df.sort_values('distance')

# Print summary
print("\nMatching Summary:")
print(f"Total gages: {len(gages)}")
print(f"Total matched: {len(matches_df)}")

if 'match_quality' in matches_df.columns:
    quality_counts = matches_df['match_quality'].value_counts()
    for quality, count in quality_counts.items():
        print(f"{quality} matches: {count}")

Matching gages to flowlines...


0it [00:00, ?it/s]


Matching Summary:
Total gages: 9067
Total matched: 8787
excellent matches: 4985
good matches: 1604
poor matches: 1158
fair matches: 1040


In [53]:
# Inspect the "excellent" tier: how many rows it has, then a preview of the first few.
is_excellent = matches_df["match_quality"] == "excellent"
display(matches_df[is_excellent].shape)
matches_df[is_excellent].head()

(4985, 11)

Unnamed: 0,STAID,STANAME,CLASS,STATE,COMID,distance,DRAIN_SQKM,flowline_drainage_area,area_difference_pct,match_quality,quality_rank
2606,9057500,"BLUE RIVER BELOW GREEN MOUNTAIN RESERVOIR, CO.",Non-ref,CO,77006776,0.000249,1494.94,1494.942711,0.000181,excellent,0
5983,2387500,"OOSTANAULA RIVER AT RESACA, GA",Non-ref,GA,73011126,0.000446,4153.317,4153.307263,0.000234,excellent,0
7546,1613000,"POTOMAC RIVER AT HANCOCK, MD",Non-ref,MD,73005077,0.000889,10556.82,10556.879872,0.000567,excellent,0
7851,1350101,SCHOHARIE CREEK AT GILBOA NY,Non-ref,NY,73003443,0.000161,817.1208,817.115991,0.000589,excellent,0
5478,3372500,SALT CREEK NEAR HARRODSBURG IND,Non-ref,IN,74038878,0.000273,1120.253,1120.260203,0.000643,excellent,0


In [45]:
# Only the "excellent" matches are exported as the observation list.
excellent_rows = matches_df["match_quality"] == "excellent"
output_df = matches_df[excellent_rows]

# The DataFrame index is written as the first (unnamed) CSV column.
output_df.to_csv("/Users/taddbindas/data/merit/obs_5000.csv")

In [58]:
# Save the gage geometries whose station ID appears in the exported match table.
in_output = gages["STAID"].isin(output_df["STAID"])
subset_gages = gages[in_output]
subset_gages.to_file("/Users/taddbindas/data/gages_3000/subset_conus_3000_gages.gpkg")

### See how many gages are in our zarr store

In [47]:
# Open read-only: this notebook only inspects the store. The default mode
# ('a') would create an empty group if the path were wrong and would permit
# accidental writes.
root = zarr.open_group("/Users/taddbindas/projects/ddr/data/gages_9000.zarr", mode="r")
root

<zarr.hierarchy.Group '/'>

In [48]:
# Split the exported matches into those present in / absent from the zarr store.
not_arr = []
in_arr = []
# iterrows() has no length, so give tqdm the total for a meaningful progress bar.
for idx, gage in tqdm(output_df.iterrows(), total=len(output_df)):
    # Store keys are looked up as station IDs left-padded with zeros to 8
    # characters; str() keeps this working if STAID was read as an integer.
    if str(gage["STAID"]).zfill(8) in root:
        in_arr.append(idx)
    else:
        not_arr.append(idx)
        
print(f"{len(in_arr)} / {len(output_df)} present in the zarr store")

0it [00:00, ?it/s]

4985 / 4985 present in the zarr store
