### Creating the obs_csv file
#### Author: Tadd Bindas

This notebook creates the list of gage observations used by `marquette` when creating training basins.

In [25]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point, LineString
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import zarr

import warnings
warnings.filterwarnings('ignore', category=UserWarning)

In [29]:
# Input datasets: the MERIT river flowline shapefile, the CONUS gage
# locations (reprojected to EPSG:4326), and the CSV whose "id" column is used
# further down to select gages.
riv_path = "/projects/mhpi/data/MERIT/raw/continent/riv_pfaf_7_MERIT_Hydro_v07_Basins_v01_bugfix1.shp"
gages_path = "/projects/mhpi/data/conus_3000_gages.gpkg"
gages_3000_path = "/projects/mhpi/data/gages3000Info.csv"
# Alternative flowline source (disabled):
#   /projects/mhpi/data/MERIT/gage_information/gage_flowline_intersections/gage_9322_intersection.shp

riv = gpd.read_file(riv_path)
gages = gpd.read_file(gages_path).to_crs("EPSG:4326")
gages_3000 = pd.read_csv(gages_3000_path)

# Spatial index over the flowlines, kept as a notebook-level name.
sindex = riv.sindex

In [45]:
# Keep only the gages whose STAID appears among the gages_3000 ids, after those
# ids are rendered as strings left-padded with zeros to 8 characters.
wanted_staids = gages_3000["id"].astype(str).str.zfill(8)
is_wanted = gages["STAID"].isin(wanted_staids)
gages_filtered = gages[is_wanted]
gages_filtered

Unnamed: 0,STAID,STANAME,CLASS,AGGECOREGI,DRAIN_SQKM,HUC02,LAT_GAGE,LNG_GAGE,STATE,HCDN_2009,ACTIVE09,FLYRS1900,FLYRS1950,FLYRS1990,geometry
2,14190500,"LUCKIAMUTE RIVER NEAR SUVER, OR",Non-ref,WestMnts,603.4942,17,44.783175,-123.234543,OR,,yes,75.0,60.0,20.0,POINT (-123.23454 44.78318)
6,14301000,"NEHALEM RIVER NEAR FOSS, OR",Ref,WestMnts,1743.5440,17,45.703999,-123.755405,OR,yes,yes,70.0,60.0,20.0,POINT (-123.75541 45.704)
7,14301500,"WILSON RIVER NEAR TILLAMOOK, OR",Ref,WestMnts,417.5676,17,45.475940,-123.725121,OR,,yes,77.0,59.0,19.0,POINT (-123.72512 45.47594)
10,14303200,"TUCCA CREEK NEAR BLAINE, OR.",Ref,WestMnts,8.0595,17,45.324276,-123.546501,OR,yes,yes,26.0,26.0,20.0,POINT (-123.5465 45.32428)
12,14305500,"SILETZ RIVER AT SILETZ, OR",Ref,WestMnts,526.3299,17,44.715117,-123.887335,OR,yes,yes,88.0,60.0,20.0,POINT (-123.88733 44.71512)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9051,01030500,"Mattawamkeag River near Mattawamkeag, Maine",Ref,NorthEast,3676.1720,01,45.500975,-68.305956,ME,yes,yes,75.0,60.0,20.0,POINT (-68.30596 45.50097)
9053,01034000,"Piscataquis River at Medford, Maine",Non-ref,NorthEast,3011.2290,01,45.260611,-68.868614,ME,,yes,78.0,53.0,20.0,POINT (-68.86861 45.26061)
9054,01034500,"Penobscot River at West Enfield, Maine",Non-ref,NorthEast,17347.3800,01,45.236175,-68.651383,ME,,yes,107.0,60.0,20.0,POINT (-68.65138 45.23617)
9057,01015800,"Aroostook River near Masardis, Maine",Non-ref,NorthEast,2313.7550,01,46.523003,-68.371764,ME,,yes,51.0,51.0,19.0,POINT (-68.37176 46.523)


In [4]:
def find_best_match(riv_gdf, point, drainage_area=None, buffer_dist = 0.01):
    """Find the flowline in ``riv_gdf`` that best matches ``point``.

    Candidate flowlines are those returned by ``riv_gdf``'s own spatial index
    for the bounding box of ``point`` buffered by ``buffer_dist``. When a
    positive ``drainage_area`` is given, candidates are ranked by a combined
    score (30% normalised distance, 70% relative drainage-area difference
    capped at 100%); otherwise the nearest candidate wins.

    Parameters
    ----------
    riv_gdf : GeoDataFrame
        Flowlines. Must expose ``sindex``, ``geometry`` and, when
        ``drainage_area`` is used, an ``uparea`` column.
    point : geometry
        Location to match; must support ``buffer`` and be accepted by
        ``GeoSeries.distance``.
    drainage_area : float, optional
        Reported drainage area, compared against ``uparea``. Ignored when it
        is ``None`` or not greater than zero (a NaN also fails that test).
    buffer_dist : float, default 0.01
        Buffer radius used to build the search window, in the coordinate
        units of ``point``.

    Returns
    -------
    tuple
        ``(best_idx, distance, area_diff_pct)`` where ``best_idx`` is the index
        label of the chosen row in ``riv_gdf``, ``distance`` is its distance
        to ``point`` and ``area_diff_pct`` is the absolute drainage-area
        difference in percent (``None`` when matching on distance only).
        ``(None, None, None)`` when the search window contains no candidate.
    """
    bounds = point.buffer(buffer_dist).bounds
    # Query the index of the frame that was actually passed in rather than a
    # notebook-level global, so the function works on any frame and does not
    # depend on cell execution order.
    candidate_indices = list(riv_gdf.sindex.intersection(bounds))

    if not candidate_indices:
        return None, None, None

    # The spatial index yields integer positions, hence iloc.
    candidates = riv_gdf.iloc[candidate_indices].copy()
    candidates['distance'] = candidates.geometry.distance(point)

    if drainage_area is not None and drainage_area > 0:
        candidates['area_diff_pct'] = abs(candidates['uparea'] - drainage_area) / drainage_area * 100

        # Combined score (weighted average of distance and area difference).
        # Fall back to 1.0 when every candidate is at distance 0 to avoid 0/0.
        max_dist = candidates['distance'].max() or 1.0
        candidates['score'] = (
            0.3 * (candidates['distance'] / max_dist) +
            0.7 * np.minimum(candidates['area_diff_pct'] / 100, 1.0)
        )

        best_idx = candidates['score'].idxmin()
        best_match = candidates.loc[best_idx]
        area_diff_pct = best_match['area_diff_pct']
    else:
        # Just use distance if drainage area not available
        best_idx = candidates['distance'].idxmin()
        best_match = candidates.loc[best_idx]
        area_diff_pct = None

    return best_idx, best_match['distance'], area_diff_pct

In [47]:
def _classify_match(distance, area_diff_pct):
    """Grade a gage/flowline match from its distance and drainage-area error.

    ``distance`` is compared against fixed thresholds in the units returned by
    ``find_best_match`` (degrees if both layers are EPSG:4326 -- TODO confirm
    the CRS of ``riv``). ``area_diff_pct`` is the drainage-area difference in
    percent, or ``None`` when no drainage area was available, in which case
    the grade is based on distance alone and prefixed with ``distance_only_``.
    """
    if area_diff_pct is None:
        # Base quality only on distance
        if distance < 0.001:
            return 'distance_only_excellent'
        if distance < 0.005:
            return 'distance_only_good'
        return 'distance_only_fair'

    if distance < 0.001 and area_diff_pct < 10:  # Close match in both location and area
        return 'excellent'
    if distance < 0.005 and area_diff_pct < 20:
        return 'good'
    if distance < 0.01 and area_diff_pct < 50:
        return 'fair'
    return 'poor'


print("Matching gages to flowlines...")
matches = []

# iterrows() is a generator, so tqdm needs an explicit total to show progress.
for idx, gage in tqdm(gages_filtered.iterrows(), total=len(gages_filtered)):
    drainage_area = gage.get('DRAIN_SQKM')

    match_idx, distance, area_diff_pct = find_best_match(riv, gage.geometry, drainage_area)

    # Gages with no candidate flowline in the search window are skipped.
    if match_idx is None:
        continue

    flowline = riv.loc[match_idx]

    matches.append({
        'STAID': gage.get('STAID', str(idx)),
        'STANAME': gage.get('STANAME', f"Gage_{idx}"),
        'CLASS': gage.get('CLASS'),
        'STATE': gage.get('STATE'),
        'COMID': flowline.get('COMID', str(match_idx)),
        'distance': distance,
        'DRAIN_SQKM': drainage_area,
        'flowline_drainage_area': flowline.get('uparea'),
        'area_difference_pct': area_diff_pct,
        'match_quality': _classify_match(distance, area_diff_pct)
    })

# Create DataFrame with matches
matches_df = pd.DataFrame(matches)

# Sort by quality, then area difference, then distance. An empty result has no
# columns at all, so sorting is skipped instead of raising a KeyError.
if not matches_df.empty:
    quality_order = {
        'excellent': 0,
        'good': 1,
        'fair': 2,
        'poor': 3,
        'distance_only_excellent': 4,
        'distance_only_good': 5,
        'distance_only_fair': 6
    }
    matches_df['quality_rank'] = matches_df['match_quality'].map(quality_order)
    matches_df = matches_df.sort_values(['quality_rank', 'area_difference_pct', 'distance'])

# Print summary
print("\nMatching Summary:")
print(f"Total gages: {len(gages_filtered)}")
print(f"Total matched: {len(matches_df)}")

if 'match_quality' in matches_df.columns:
    quality_counts = matches_df['match_quality'].value_counts()
    for quality, count in quality_counts.items():
        print(f"{quality} matches: {count}")

Matching gages to flowlines...


0it [00:00, ?it/s]


Matching Summary:
Total gages: 3213
Total matched: 3155
excellent matches: 1969
good matches: 576
fair matches: 315
poor matches: 295


In [57]:
# Show the shape and the first rows of every match not graded "poor".
is_not_poor = matches_df["match_quality"] != "poor"
non_poor_matches = matches_df[is_not_poor]
display(non_poor_matches.shape)
non_poor_matches.head()

(2860, 11)

Unnamed: 0,STAID,STANAME,CLASS,STATE,COMID,distance,DRAIN_SQKM,flowline_drainage_area,area_difference_pct,match_quality,quality_rank
921,9057500,"BLUE RIVER BELOW GREEN MOUNTAIN RESERVOIR, CO.",Non-ref,CO,77006776,0.000249,1494.94,1494.942711,0.000181,excellent,0
1982,2387500,"OOSTANAULA RIVER AT RESACA, GA",Non-ref,GA,73011126,0.000446,4153.317,4153.307263,0.000234,excellent,0
2617,1613000,"POTOMAC RIVER AT HANCOCK, MD",Non-ref,MD,73005077,0.000889,10556.82,10556.879872,0.000567,excellent,0
1972,3421000,"COLLINS RIVER NEAR MCMINNVILLE, TN",Non-ref,TN,74054427,4.9e-05,1664.06,1664.048272,0.000705,excellent,0
2305,1541000,"West Branch Susquehanna River at Bower, PA",Non-ref,PA,73005411,0.000133,817.7463,817.75596,0.001181,excellent,0


In [58]:
# Everything except the "poor" matches goes into the observation CSV.
# NOTE(review): rows with over-long STAIDs are not removed here; a mask for
# them is computed separately further down -- confirm that is intended.
keep_rows = matches_df["match_quality"] != "poor"
output_df = matches_df[keep_rows]

obs_csv_path = "/projects/mhpi/tbindas/marquette/marquette/conf/obs_ref/obs_2860.csv"
output_df.to_csv(obs_csv_path)

In [53]:
# gages[gages["STAID"].isin(output_df["STAID"])].to_file("/Users/taddbindas/data/gages_3000/subset_conus_2995_gages.gpkg")

### See how many gauges are in our zarr store

In [10]:
# Open the observation store read-only: the notebook only checks membership,
# and the default mode ("a") would create an empty group if the path were
# wrong and would leave the shared store writable.
root = zarr.open_group("/projects/mhpi/data/observations/gages_9000.zarr", mode="r")
root

<zarr.hierarchy.Group '/'>

In [11]:
# Split the output rows by whether their zero-padded STAID is a key in the
# zarr store. STAID is coerced to str first so the check also works when the
# column holds integers.
not_arr = []
in_arr = []
staids_padded = output_df["STAID"].astype(str).str.zfill(8)
for idx, staid in tqdm(staids_padded.items(), total=len(staids_padded)):
    if staid in root:
        in_arr.append(idx)
    else:
        not_arr.append(idx)

print(f"{len(in_arr)} / {len(output_df)} present in the zarr store")

0it [00:00, ?it/s]

4014 / 4014 present in the zarr store


In [21]:
# Flag STAIDs that are longer than 8 characters and start with '0', or that
# are longer than 9 characters regardless of their first character.
staid_text = output_df["STAID"].astype(str)
staid_len = staid_text.str.len()
starts_with_zero = staid_text.str[0] == '0'
mask = ((staid_len > 8) & starts_with_zero) | (staid_len > 9)
mask.sum()

16

In [24]:
# Write the flagged STAIDs to a text file, one per line, with no index column
# and no header row.
filtered_ids_path = "/projects/mhpi/data/MERIT/gage_information/formatted_gage_csvs/filtered_large_staids.txt"
filtered_ids = output_df.loc[mask, "STAID"]
filtered_ids.to_csv(filtered_ids_path, index=False, header=False)