In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from pathlib import Path
import geopandas as gpd
import warnings

warnings.filterwarnings('ignore')

DRAW_FIGURE = False
folder = Path('./2.swot-selection-fig')
folder.mkdir(parents=True, exist_ok=True)
print(f"Directory ready: {folder}")

# === Step 0: Load and merge data ===
df_all = pd.read_csv('1.all_matched_points_VersionD.csv')
df_comid = gpd.read_file('../1-buffer-match/600mRiverReach-straightBuffer-corrected-20station.shp')[['stnmpy','COMID','STCD']]
df_comid = df_comid.rename(columns={'stnmpy': 'stationid'})

df_all = df_all.replace(-999999999999, np.nan).dropna()
df_all = df_all.merge(df_comid, on='STCD', how='inner')

# Remove nodes with fewer than 10 observations
df_all['node_data_num'] = df_all.groupby('node_id')['lat'].transform('count')
df_all = df_all[df_all['node_data_num'] >= 10]
stationids = df_all['stationid'].unique()

# Compute rank correlation per node
def calc_spearman(group):
    try:
        return pd.Series({'rank_corr': spearmanr(group['width'], group['wse'])[0]})
    except:
        return pd.Series({'rank_corr': np.nan})

df_node = df_all.groupby('node_id').apply(calc_spearman).reset_index()
df_node = df_node.merge(df_all[['node_id', 'stationid']].drop_duplicates(), on='node_id')

# Select node with max rank_corr per station
idx = df_node.groupby('stationid')['rank_corr'].idxmax()
df_node_rmax = df_node.loc[idx]
df_all = df_all[df_all['node_id'].isin(df_node_rmax['node_id'])]

# Load width range table
df_w = pd.read_csv('1.width_range.csv', index_col='stationid')
df_final = []

# Process each station
def process_station(s):
    if s not in df_w.index:
        return None

    df = df_all[df_all['stationid'] == s].copy()
    df['width_u_r'] = df['width_u'] / df['width']
    df1 = df[(df['wse_u'] <= 0.4) & (df['width_u_r'] <= 0.1)]
    if len(df1) < 5:
        return None

    # Step 2: Order consistency filtering
    max_iter = 50
    for _ in range(max_iter):
        n = len(df1)
        w, h = df1['width'].values, df1['wse'].values
        w_diff = np.repeat(w, n).reshape(n, -1) - np.tile(w, (n, 1))
        h_diff = np.repeat(h, n).reshape(n, -1) - np.tile(h, (n, 1))
        wh = w_diff * h_diff
        inverse = np.count_nonzero(wh < 0, axis=1)
        idx_max = np.argmax(inverse)
        if inverse[idx_max] / n < 0.5:
            break
        df1 = df1.drop(df1.index[idx_max])

    # Step 3: Outlier filtering
    w_low, w_high = df_w.loc[s, ['w_low', 'w_high']]
    d_bankfull = 0.27 * (w_high / 7.2)**0.6
    h50 = df1['wse'].median()
    df3 = df1[(df1['wse'] <= h50 + d_bankfull) & (df1['wse'] >= h50 - d_bankfull)]
    if len(df3) < 5:
        return None

    if DRAW_FIGURE:
        plot_steps(df, df1, df3, s)

    return df3

def plot_steps(df, df1, df3, station_id):
    s1_drop = df[~df.index.isin(df1.index)]
    s2_drop = df1[~df1.index.isin(df3.index)]

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.errorbar(df['width'], df['wse'], xerr=df['width_u'], yerr=df['wse_u'], fmt='none', lw=0.5, ecolor='lightgray', zorder=0)
    ax.scatter(s1_drop['width'], s1_drop['wse'], c='blue', s=12, label='Step 1 dropped', alpha=0.6, marker='x')
    ax.scatter(s2_drop['width'], s2_drop['wse'], c='green', s=12, label='Step 2/3 dropped', alpha=0.6, marker='^')
    ax.scatter(df3['width'], df3['wse'], c='red', s=12, label='Retained', alpha=0.8, marker='o')
    ax.set_xlabel('Width (m)')
    ax.set_ylabel('WSE (m)')
    ax.legend()
    ax.set_title(f'{station_id} - Data QC steps')
    fig.tight_layout()
    plt.savefig(f'{folder}/{station_id}.png', dpi=200)
    plt.close()

# Main loop
for s in stationids:
    print(f"Processing station: {s}")
    result = process_station(s)
    if result is not None:
        df_final.append(result)

# Combine and export
if df_final:
    df_final = pd.concat(df_final).drop_duplicates(subset=['node_id', 'time', 'stationid'])
    df_final.reset_index(drop=True, inplace=True)
    df_final.insert(0, 'COMID', df_final.pop('COMID'))
    df_final.to_csv('2.swot-points-selection.csv', index=False)
    print("✅ Data saved: 2.swot-points-selection.csv")
else:
    print("⚠️ No valid data retained.")

Directory ready: 2.swot-selection-fig


KeyError: 'STCD'