In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point, LineString
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import zarr

import warnings
warnings.filterwarnings('ignore', category=UserWarning)

In [2]:
riv = gpd.read_file("/projects/mhpi/data/MERIT/raw/continent/riv_pfaf_7_MERIT_Hydro_v07_Basins_v01_bugfix1.shp")
gages = gpd.read_file("/projects/mhpi/data/MERIT/gage_information/gage_flowline_intersections/gnn_dataset_v1_2.shp").to_crs("EPSG:4326")
# gages = gpd.read_file("")

In [3]:
gages["STAID"] = gages["STAID"].astype(int)
gages = gages.drop(columns=['lengthkm', 'lengthdir', 'sinuosity', 'slope',
       'uparea', 'order_', 'strmDrop_t', 'slope_taud', 'NextDownID', 'maxup',
       'up1', 'up2', 'up3', 'up4', 'BUFF_DIST', 'ORIG_FID', 'FID_riv__1'])

In [7]:
gages.to_csv("/projects/mhpi/tbindas/marquette/marquette/conf/obs_ref/chapter_2_exp.csv")

In [11]:
grdc = gpd.read_file("/projects/mhpi/hjj5218/data/dMC_global/gage_information/gage_flowline_intersections/info_stations_GRDC_large_river_intersection.shp")
sindex = riv.sindex
grdc.head()

Unnamed: 0,STAID,river,station,LAT_GAGE,LNG_GAGE,DRAIN_SQKM,altitude,COMID,uparea,geometry
0,1134900,RIVER NIGER,ANSONGO,15.6667,0.5,566000.0,242.0,14016055,489426.4,POINT (0.5 15.6667)
1,1147010,CONGO RIVER,KINSHASA,-4.3,15.3,3475000.0,-999.0,13060250,3605897.0,POINT (15.3 -4.3)
2,1159100,ORANGE RIVER,VIOOLSDRIF (27811003),-28.75799,17.721591,866486.0,152.0,12080641,788915.8,POINT (17.72159 -28.75799)
3,1159103,ORANGE RIVER,PELLA MISSION,-28.963403,19.151922,859920.0,-999.0,12080676,767285.8,POINT (19.15192 -28.9634)
4,1159105,ORANGE RIVER,SENDELINGSDRIF,-28.075833,16.898333,985370.0,-999.0,12080609,885167.1,POINT (16.89833 -28.07583)


In [4]:
def find_best_match(riv_gdf, point, drainage_area=None, buffer_dist = 0.01):
    """Finds the closest flowline to the point based on Drainage area and location
    """
    bounds = point.buffer(buffer_dist).bounds
    candidate_indices = list(sindex.intersection(bounds))
    
    if not candidate_indices:
        return None, None, None
    
    candidates = riv_gdf.iloc[candidate_indices].copy()
    candidates['distance'] = candidates.geometry.distance(point)
    
    if drainage_area is not None and drainage_area > 0:
        candidates['area_diff_pct'] = abs(candidates['uparea'] - drainage_area) / drainage_area * 100
        
        # Combined score (weighted average of distance and area difference)
        max_dist = candidates['distance'].max() or 1.0
        candidates['score'] = (
            0.3 * (candidates['distance'] / max_dist) + 
            0.7 * np.minimum(candidates['area_diff_pct'] / 100, 1.0)
        )
        
        best_idx = candidates['score'].idxmin()
        best_match = candidates.loc[best_idx]
        area_diff_pct = best_match['area_diff_pct']
    else:
        # Just use distance if drainage area not available
        best_idx = candidates['distance'].idxmin()
        best_match = candidates.loc[best_idx]
        area_diff_pct = None
    
    return best_idx, best_match['distance'], area_diff_pct

In [5]:
print("Matching gages to flowlines...")
matches = []

for idx, gage in tqdm(grdc.iterrows()):
    drainage_area = gage.get('DRAIN_SQKM')
    
    match_idx, distance, area_diff_pct = find_best_match(riv, gage.geometry, drainage_area)
    
    if match_idx is not None:
        flowline = riv.loc[match_idx]
        
        # Determine match quality
        if area_diff_pct is not None:
            if distance < 0.001 and area_diff_pct < 10:  # Close match in both location and area
                quality = 'excellent'
            elif distance < 0.005 and area_diff_pct < 20:
                quality = 'good'
            elif distance < 0.01 and area_diff_pct < 50:
                quality = 'fair'
            else:
                quality = 'poor'
        else:
            # Base quality only on distance
            if distance < 0.001:
                quality = 'distance_only_excellent'
            elif distance < 0.005:
                quality = 'distance_only_good'
            else:
                quality = 'distance_only_fair'
        
        match_data = {
            'STAID': gage.get('STAID', str(idx)),
            'STANAME': gage.get('STANAME', f"Gage_{idx}"),
            'COMID': flowline.get('COMID', str(match_idx)),
            'distance': distance,
            'DRAIN_SQKM': drainage_area,
            'flowline_drainage_area': flowline.get('uparea'),
            'area_difference_pct': area_diff_pct,
            'match_quality': quality
        }
        
        matches.append(match_data)

# Create DataFrame with matches
matches_df = pd.DataFrame(matches)

# Sort by quality and area difference
if 'match_quality' in matches_df.columns and 'area_difference_pct' in matches_df.columns:
    quality_order = {
        'excellent': 0, 
        'good': 1, 
        'fair': 2, 
        'poor': 3,
        'distance_only_excellent': 4,
        'distance_only_good': 5,
        'distance_only_fair': 6
    }
    matches_df['quality_rank'] = matches_df['match_quality'].map(quality_order)
    matches_df = matches_df.sort_values(['quality_rank', 'area_difference_pct', 'distance'])
else:
    matches_df = matches_df.sort_values('distance')

# Print summary
print("\nMatching Summary:")
print(f"Total gages: {len(grdc)}")
print(f"Total matched: {len(matches_df)}")

if 'match_quality' in matches_df.columns:
    quality_counts = matches_df['match_quality'].value_counts()
    for quality, count in quality_counts.items():
        print(f"{quality} matches: {count}")

Matching gages to flowlines...


NameError: name 'grdc' is not defined

In [19]:
display(matches_df[matches_df["match_quality"] != "poor"][matches_df["match_quality"] != "fair"].shape)
matches_df[matches_df["match_quality"] != "poor"][matches_df["match_quality"] != "fair"].tail()

(20, 9)

Unnamed: 0,STAID,STANAME,COMID,distance,DRAIN_SQKM,flowline_drainage_area,area_difference_pct,match_quality,quality_rank
8,4122903,Gage_61,74028912,0.001367,1084844.8,1063917.0,1.929063,good,1
4,4122650,Gage_57,74028928,0.001067,1072153.8,1051214.0,1.953047,good,1
22,4352100,Gage_79,77032218,0.001179,631960.0,619064.7,2.040523,good,1
1,4115201,Gage_54,78017389,0.001833,665371.0,651615.2,2.06739,good,1
0,4115200,Gage_53,78019675,0.0023,613830.0,601052.9,2.081536,good,1


In [21]:
output_df = matches_df[matches_df["match_quality"] != "poor"][matches_df["match_quality"] != "fair"]
output_df

Unnamed: 0,STAID,STANAME,COMID,distance,DRAIN_SQKM,flowline_drainage_area,area_difference_pct,match_quality,quality_rank
15,4152050,Gage_68,77032221,0.000867,618715.0,618903.8,0.030514,excellent,0
14,4143550,Gage_67,72045866,0.000849,773892.0,773367.3,0.067801,excellent,0
12,4127503,Gage_65,74036826,0.000633,1805230.0,1789711.0,0.859642,excellent,0
11,4127502,Gage_64,74045105,0.000542,1835274.0,1818953.0,0.889278,excellent,0
13,4127800,Gage_66,74069844,0.000613,2964255.0,2937502.0,0.902525,excellent,0
6,4122901,Gage_59,74036915,0.000767,1296813.0,1282949.0,1.069066,excellent,0
5,4122900,Gage_58,74036886,9.4e-05,1357678.0,1339481.0,1.3403,excellent,0
16,4213710,Gage_73,71011345,0.000783,1000000.0,975832.3,2.416768,excellent,0
3,4121801,Gage_56,74028940,0.000833,846305.8,824471.0,2.580016,excellent,0
9,4123050,Gage_62,74045132,0.0026,525770.0,526009.3,0.045515,good,1


In [22]:
output_df.to_csv("/projects/mhpi/tbindas/marquette/marquette/conf/obs_ref/grdc_conus_v1.csv")