Nearest stream LINKNO

In [None]:
import os
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

In [None]:
# Input gauge workbook, global stream network, and the workbook this cell writes.
excel_file = '/Users/yubinbaaniya/Documents/GAUGE REVIEW/master_file.xlsx'
gpkg_file = '/Users/yubinbaaniya/Library/CloudStorage/Box-Box/master thesis and what not/Geoglows AWS files except VPU/global_streams_simplified.gpkg'
output_excel = '/Users/yubinbaaniya/Documents/GAUGE REVIEW/other calculation_ on master file/master_file_with_nearest.xlsx'

# Read and Filter Excel Data

try:
    # Read Excel file into a pandas DataFrame
    df_excel = pd.read_excel(excel_file)
except Exception as e:
    raise Exception(f"Error reading Excel file: {e}")

# Validate every required column up front, before any filtering work is done.
for col in ['latitude', 'longitude']:
    if col not in df_excel.columns:
        raise ValueError(f"Excel file must contain a '{col}' column.")

if 'TDX-Hydro Paired Rivers' not in df_excel.columns:
    raise ValueError("Excel file must contain a 'TDX-Hydro Paired Rivers' column.")

# Filter out rows missing either latitude or longitude
df_excel_filtered = df_excel.dropna(subset=['latitude', 'longitude'])

# Keep only gauges that are not yet paired with a river, i.e. rows where
# "TDX-Hydro Paired Rivers" is 0, empty, or null.
paired = df_excel_filtered['TDX-Hydro Paired Rivers']

# "Empty" also covers cells holding only whitespace; those are neither NaN nor
# equal to 0, so they need their own test. astype(bool) keeps the mask boolean
# even when the frame has no rows.
is_blank_text = paired.map(
    lambda value: isinstance(value, str) and not value.strip()
).astype(bool)

# A zero stored as text ("0", "0.0") does not compare equal to the number 0,
# so additionally compare a numeric view of the column. Non-numeric text is
# coerced to NaN here and therefore never matches.
is_zero = (paired == 0) | (pd.to_numeric(paired, errors='coerce') == 0)

df_excel_filtered = df_excel_filtered[paired.isnull() | is_blank_text | is_zero]


# Turn the Filtered Gauge Table into a Point Layer
# Each row becomes a point with x = longitude and y = latitude.
point_geometry = gpd.points_from_xy(
    x=df_excel_filtered['longitude'],
    y=df_excel_filtered['latitude'],
)

# The layer is tagged as EPSG:4326, the coordinate system of the input values.
gdf_points = gpd.GeoDataFrame(df_excel_filtered, geometry=point_geometry, crs="EPSG:4326")


# Load the Stream Network from the GeoPackage

try:
    gdf_streams = gpd.read_file(gpkg_file)
except Exception as err:
    raise Exception(f"Error reading GeoPackage file: {err}")

# The stream layer is only usable here if it carries the 'LINKNO' identifier.
stream_columns = gdf_streams.columns
if "LINKNO" not in stream_columns:
    raise ValueError("GeoPackage file must contain a 'LINKNO' column.")


# Project Both Layers to a Metric CRS
# Distances are wanted in meters, so the points and the streams are both moved
# to the same projected CRS: EPSG:3857 (Web Mercator), chosen for simplicity.
# NOTE(review): Web Mercator stretches lengths away from the equator, so the
# resulting distances are approximate at higher latitudes.
gdf_points_proj, gdf_streams_proj = (
    layer.to_crs(epsg=3857) for layer in (gdf_points, gdf_streams)
)

# Find the Nearest Stream Segment for Each Point
# geopandas.sjoin_nearest attaches the nearest stream's 'LINKNO' to every point
# and stores the separation in a 'distance' column, expressed in the units of
# the projected CRS the two layers share.
try:
    # Only 'LINKNO' and the geometry are taken from the stream network so that
    # no other stream attribute ends up in the result.
    gdf_nearest = gpd.sjoin_nearest(
        gdf_points_proj,
        gdf_streams_proj[['LINKNO', 'geometry']],
        how="left",
        distance_col="distance",
    )
except Exception as e:
    raise Exception(f"Error during spatial join: {e}")

# sjoin_nearest emits one row per equally-near stream, so a point that is
# exactly as close to two segments would appear twice. Keep the first match so
# each input point stays a single row (the point index comes from the Excel
# table and is expected to be unique).
gdf_nearest = gdf_nearest[~gdf_nearest.index.duplicated(keep="first")]

# The join also adds the positional index of the matched stream row; it is a
# bookkeeping column, not a gauge attribute, so it is removed from the result.
gdf_nearest = gdf_nearest.drop(columns="index_right", errors="ignore")

# (Optional) Return the Joined Layer to EPSG:4326

gdf_nearest = gdf_nearest.to_crs(epsg=4326)

# Write the Result to Excel
# Every column of the joined layer is exported except the geometry, which is
# dropped before the table is written.
try:
    attribute_table = gdf_nearest.drop(columns=["geometry"])
    attribute_table.to_excel(output_excel, index=False)
    print(f"Output saved to {output_excel}")
except Exception as err:
    raise Exception(f"Error saving the output Excel file: {err}")

Flag the most downstream gauge LINKNOs (used to map the area covered by the gauges)

In [None]:
import pandas as pd
import networkx as nx
import numpy as np

In [None]:
# Read the datasets
network_df = pd.read_parquet('/Users/yubinbaaniya/Library/CloudStorage/Box-Box/master thesis and what not/Geoglows AWS files except VPU/v2-master-table.parquet') #contain LINKNO AND DSLINKNO to make a network
gauge_df = pd.read_excel('/Users/yubinbaaniya/Documents/GAUGE REVIEW/other calculation_ on master file/master_file_with_nearest_for_all_station&area.xlsx')

# Fail early with a clear message if a column used below is missing.
for col in ['VPUCode', 'LINKNO_excel']:
    if col not in gauge_df.columns:
        raise ValueError(f"Gauge Excel file must contain a '{col}' column.")
for col in ['VPUCode', 'LINKNO', 'DSLINKNO']:
    if col not in network_df.columns:
        raise ValueError(f"Network table must contain a '{col}' column.")


def _build_river_graph(network_subset):
    """Return a directed graph with an edge LINKNO -> DSLINKNO for every row.

    Rows with a null LINKNO are ignored; rows with a LINKNO but a null
    DSLINKNO contribute a node without an outgoing edge.
    """
    graph = nx.DiGraph()
    has_link = network_subset['LINKNO'].notnull()
    has_edge = has_link & network_subset['DSLINKNO'].notnull()
    # Bulk insertion avoids the per-row overhead of DataFrame.iterrows().
    graph.add_nodes_from(network_subset.loc[has_link, 'LINKNO'].tolist())
    graph.add_edges_from(zip(
        network_subset.loc[has_edge, 'LINKNO'].tolist(),
        network_subset.loc[has_edge, 'DSLINKNO'].tolist(),
    ))
    return graph


def _map_immediate_downstream(graph, candidates):
    """Map each gauge LINKNO to the nearest gauge LINKNO downstream of it.

    The value is "" when the gauge's LINKNO is not a node of ``graph`` and
    "downstream" when no other gauge lies downstream. Otherwise it is the
    downstream gauge LINKNO reached in the fewest edges; ties are broken in
    favour of the larger 'DSContArea' when that column is present.
    """
    candidate_ids = candidates['LINKNO_excel'].tolist()
    # Set membership keeps the descendant filter linear instead of quadratic.
    candidate_id_set = set(candidate_ids)

    # First 'DSContArea' seen for each gauge LINKNO; left empty when the column
    # does not exist so that the tie-break is skipped instead of raising.
    area_by_link = {}
    if 'DSContArea' in candidates.columns:
        for link, area in zip(candidate_ids, candidates['DSContArea'].tolist()):
            area_by_link.setdefault(link, area)

    mapping = {}
    for candidate in candidate_ids:
        # Several gauges may share a LINKNO; the answer is identical, so reuse it.
        if candidate in mapping:
            continue

        # If the candidate is not in the network graph, leave flag blank.
        if candidate not in graph:
            mapping[candidate] = ""
            continue

        # Keep only the downstream nodes that are themselves gauges.
        downstream_candidates = [
            d for d in nx.descendants(graph, candidate) if d in candidate_id_set
        ]
        if not downstream_candidates:
            # No downstream gauge candidate found: mark as "downstream".
            mapping[candidate] = "downstream"
            continue

        # Determine the immediate downstream candidate gauge.
        min_distance = np.inf
        immediate_downstream = None
        for d in downstream_candidates:
            try:
                distance = nx.shortest_path_length(graph, source=candidate, target=d)
            except nx.NetworkXNoPath:
                continue
            if distance < min_distance:
                min_distance = distance
                immediate_downstream = d
            elif (
                distance == min_distance
                and d in area_by_link
                and immediate_downstream in area_by_link
            ):
                # Tie: prefer the gauge with the larger DSContArea.
                if area_by_link[d] > area_by_link[immediate_downstream]:
                    immediate_downstream = d
        mapping[candidate] = immediate_downstream

    return mapping


# Create a new column for the downstream flag; default is blank.
# The column later holds both text flags and LINKNO values, so it is created
# as object dtype explicitly rather than relying on inference from "".
# NOTE(review): newer pandas may infer a string dtype from "" that rejects
# numeric values — confirm against the installed version.
gauge_df['DownstreamFlag'] = pd.Series("", index=gauge_df.index, dtype=object)

# Process each river system (VPUCode) separately.
for vpu in gauge_df['VPUCode'].dropna().unique():
    print(f"Processing VPU: {vpu}")

    # Subset the gauge and network data for the current VPUCode.
    gauge_subset = gauge_df[gauge_df['VPUCode'] == vpu]
    network_subset = network_df[network_df['VPUCode'] == vpu]

    # Build the directed graph for the river network.
    G = _build_river_graph(network_subset)

    # Filter gauge rows with a valid LINKNO_excel for connectivity analysis.
    candidates = gauge_subset[pd.notnull(gauge_subset['LINKNO_excel'])]
    print(f"  Found {len(candidates)} candidate gauge(s) for VPU {vpu}")

    # key: candidate gauge id, value: downstream gauge id, "downstream" or ""
    downstream_mapping = _map_immediate_downstream(G, candidates)
    print(f"  Downstream mapping for VPU {vpu}: {downstream_mapping}")

    # Update the original gauge dataset; every candidate row has a mapping entry.
    for idx, link in candidates['LINKNO_excel'].items():
        gauge_df.loc[idx, 'DownstreamFlag'] = downstream_mapping[link]

    print(f"Finished processing VPU: {vpu}\n")

# Write the output to a new Excel file.
output_path = '/Users/yubinbaaniya/Documents/GAUGE REVIEW/other calculation_ on master file/master_file_with_most downstream.xlsx'
gauge_df.to_excel(output_path, index=False)
print(f"Output written to '{output_path}'")
