In [None]:
from preprocess import load_and_merge_data
from fit_model import fit_all_stations
from generate_hypso import generate_all_hypso
from estimate_discharge import estimate_all_discharge

def main():
    """Run the pipeline end to end: load, fit, build curves, evaluate.

    Each stage's result is written to a CSV under ``3/`` before the next
    stage is started.
    """
    print("=== Loading and preparing data ===")
    swot_obs, station_attrs, station_widths = load_and_merge_data()

    print("\n=== Fitting WSE-Width models ===")
    fits = fit_all_stations(swot_obs, station_attrs, station_widths)
    # Written with the default index column, unlike the two files below.
    fits.to_csv('3/fit_proba_modified_q50.csv')

    print("\n=== Generating Hypsometric curves ===")
    curves = generate_all_hypso(fits)
    curves.to_csv('3/hypso_med_modified_q50.csv', index=False)

    print("\n=== Estimating discharge and evaluating ===")
    evaluation = estimate_all_discharge(fits, curves)
    evaluation.to_csv('3/q_kge_med_modified_q50.csv', index=False)
    print("\n=== All Done ===")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


import numpy as np

# Exponents tried for the low-width anchor (`r` in `a_low * w**r` during fitting).
R_LIST = np.array([0.5, 1, 2])
# Offsets applied to the high-width anchor elevation, as multiples of d_bankfull.
GAP_LIST = np.array([-0.1, 0, 0.1])
# Iterated during fitting and stored in the output's 'W' column.
# NOTE(review): the visible fitting code never uses the value in the fit itself.
W_LIST = np.array([0.3, 0.5, 0.7])
# Manning roughness coefficient.
# NOTE(review): the visible fitting and discharge code use the literal 0.035
# rather than importing this constant - keep the two in sync.
MANNING_N = 0.035
# First date assigned to the rows of each daily validation file.
START_DATE = '1979-01-01'


import pandas as pd

def load_and_merge_data():
    """Read the input CSVs and return ``(df_s3, df_attr, df_w)``.

    Returns:
        df_s3: rows of ``swot_s3.csv``, de-duplicated on
            (stationid, time) and limited to stations present in ``df_w``.
        df_attr: ``gages3000_GRFR_q50_slp.csv`` inner-joined with the
            ``q50_weighted`` column of ``q50_weighted.csv`` on stationid,
            de-duplicated on COMID and indexed by COMID.
        df_w: ``1_3_w50.csv`` indexed by stationid.
    """
    swot = pd.read_csv('swot_s3.csv').drop_duplicates(
        subset=['stationid', 'time'], keep='first'
    )

    gauges = pd.read_csv('gages3000_GRFR_q50_slp.csv')
    q50_cols = pd.read_csv('q50_weighted.csv')[['stationid', 'q50_weighted']]
    attrs = (
        gauges.merge(q50_cols, on='stationid', how='inner')
        .drop_duplicates(subset='COMID')
        .set_index('COMID')
    )

    widths = pd.read_csv('1_3_w50.csv', index_col='stationid')
    has_width_stats = swot['stationid'].isin(widths.index)

    return swot[has_width_stats], attrs, widths


import pandas as pd
import numpy as np
from scipy.optimize import least_squares
from scipy.stats import linregress
from config import R_LIST, GAP_LIST, W_LIST

def PowerFunc(params, X, y):
    """Return the residuals ``y - (wse0 + a * X**b)``.

    Args:
        params: sequence ``(wse0, a, b)``.
        X: predictor values.
        y: observed values.
    """
    offset, coeff, exponent = params
    predicted = offset + coeff * X**exponent
    return y - predicted

def Loss(z):
    """Evaluate ``2 * (sqrt(1 + z) - 1)`` and its first two derivatives.

    Returns a ``(3, len(z))`` array: row 0 is the function value, row 1 the
    first derivative with respect to ``z``, and row 2 the second derivative.
    """
    shifted = 1 + z
    rho = np.empty((3, len(z)))
    rho[0, :] = 2 * (shifted ** 0.5 - 1)
    rho[1, :] = shifted ** -0.5
    rho[2, :] = -0.5 * shifted ** -1.5
    return rho

def fit_all_stations(df_s3, df_attr, df_w):
    """Fit ``wse = wse0 + a * width**b`` for every station and parameter combo.

    For each station the function:
      1. looks up ``q50_weighted`` and ``slope`` in ``df_attr`` (by the
         station's first COMID) and ``w50``/``w_low``/``w_high`` in ``df_w``;
         stations missing from either table are skipped;
      2. estimates the elevation at ``w50`` from a linear regression over the
         five observations whose width is closest to ``w50`` (falling back to
         their mean elevation when the slope is negative or undefined);
      3. derives the area at ``w50`` by inverting Manning's equation with
         n = 0.035;
      4. for every (R, GAP, W) combination, adds two anchor points at
         ``w_low`` and ``w_high`` and fits the power law with a robust loss.

    Args:
        df_s3: observations with columns ``stationid``, ``COMID``, ``width``
            and ``wse``.
        df_attr: station attributes indexed by COMID with columns
            ``q50_weighted`` and ``slope``.
        df_w: width statistics indexed by stationid with columns ``w50``,
            ``w_low`` and ``w_high``.

    Returns:
        DataFrame with one row per (station, R, GAP, W). The columns are
        always present, even when no station could be fitted.
    """
    manning_n = 0.035
    columns = [
        'stationid', 'R', 'GAP', 'W',
        'wse0', 'a', 'b',
        'a50', 'w50', 'q50',
        'w_low', 'w_high',
        'h_low', 'h_high',
        'slp',
    ]
    results = []

    # sort=False keeps stations in order of first appearance, matching
    # iteration over df_s3['stationid'].unique().
    for s, group in df_s3.groupby('stationid', sort=False):
        comid = group['COMID'].iloc[0]
        try:
            q50 = df_attr.loc[comid, 'q50_weighted']
            slp = df_attr.loc[comid, 'slope']
            w50, w_low, w_high = df_w.loc[s, ['w50', 'w_low', 'w_high']]
        except (KeyError, ValueError):
            # Station or reach absent from a lookup table (or an unexpected
            # lookup shape): skip it, but let unrelated errors surface.
            continue

        d_bankfull = 0.27 * (w_high / 7.2) ** 0.6
        # assign() builds a new frame, so the caller's data is never written to.
        df = group.assign(w50_diff=(group['width'] - w50).abs())
        df = df.sort_values('w50_diff')

        if df['width'].nunique() < 2:
            continue

        nearest = df.iloc[:5]
        x = nearest['width'].values
        y = nearest['wse'].values
        if (x == x[0]).all():
            # The whole-station uniqueness check above does not guarantee the
            # five nearest widths differ; a regression is undefined here.
            h50 = y.mean()
        else:
            res = linregress(x, y)
            h50 = res.slope * w50 + res.intercept if res.slope >= 0 else y.mean()

        df_fit = df[(df['width'] >= w_low) & (df['width'] <= w_high)]
        if len(df_fit) < 3:
            continue

        swot_max = df_fit.sort_values('wse', ascending=False).iloc[0]
        d_wsemax = 0.27 * (swot_max['width'] / 7.2) ** 0.6
        a50 = (q50 * manning_n / slp**0.5 * w50**(2/3))**(3/5)

        # Cast to float first: np.insert would otherwise truncate the anchor
        # values to the dtype of an integer column.
        xdata = np.insert(df_fit['width'].to_numpy(dtype=float), 0, [w_low, w_high])
        wse_obs = df_fit['wse'].to_numpy(dtype=float)

        for r in R_LIST:
            a_low = a50 * (r+1) / r / w50**(r+1)
            h0 = h50 - a_low * w50**r
            h_low = h0 + a_low * w_low**r
            for gap in GAP_LIST:
                h_high = swot_max['wse'] + (d_bankfull - d_wsemax) + gap * d_bankfull
                ydata = np.insert(wse_obs, 0, [h_low, h_high])
                a_default = (h_high - h0) / w_high**2

                # W does not enter the fit, so solve once per (R, GAP) and
                # reuse the solution for every W row.
                ls = least_squares(PowerFunc, x0=[h0, a_default, 2], loss=Loss, args=(xdata, ydata))
                for w in W_LIST:
                    results.append({
                        'stationid': s, 'R': r, 'GAP': gap, 'W': w,
                        'wse0': ls.x[0], 'a': ls.x[1], 'b': ls.x[2],
                        'a50': a50, 'w50': w50, 'q50': q50,
                        'w_low': w_low, 'w_high': w_high,
                        'h_low': h_low, 'h_high': h_high,
                        'slp': slp
                    })
    return pd.DataFrame(results, columns=columns)


import pandas as pd
import numpy as np

def generate_all_hypso(df_fit_all):
    """Build a 100-point width/elevation/area curve for every station.

    For each station, the fitted curves ``wse0 + a * w**b`` of all its rows
    are evaluated on 100 evenly spaced widths between ``w_low`` and
    ``w_high``; the median, maximum and minimum elevation across the fits are
    kept. Area is anchored at ``a50`` for width ``w50`` and integrated
    outwards along the median curve with the trapezoid rule.

    Args:
        df_fit_all: fit table with columns ``stationid``, ``wse0``, ``a``,
            ``b``, ``w_low``, ``w_high``, ``w50`` and ``a50``.

    Returns:
        DataFrame with columns ``stationid``, ``width``, ``wse``,
        ``wse_max``, ``wse_min`` and ``area`` (100 rows per station, stations
        in sorted order). Empty, with those columns, if there are no fits.
    """
    columns = ['stationid', 'width', 'wse', 'wse_max', 'wse_min', 'area']
    if df_fit_all.empty:
        return pd.DataFrame(columns=columns)

    df_res = []
    # groupby yields stations in sorted key order.
    for s, df_fit in df_fit_all.groupby('stationid'):
        w_low, w_high, w50, a50 = df_fit.iloc[0][['w_low', 'w_high', 'w50', 'a50']]
        w_list = np.linspace(w_low, w_high, 100)
        n_pts = len(w_list)

        # Evaluate every fit once per width and reduce it three ways.
        h_list, hmax, hmin = [], [], []
        for w in w_list:
            heights = df_fit['wse0'] + df_fit['a'] * w**df_fit['b']
            h_list.append(np.median(heights))
            hmax.append(np.max(heights))
            hmin.append(np.min(heights))

        # Keep the bracketing segment inside the curve: searchsorted returns 0
        # when w50 <= w_low (index -1 would wrap to the last point) and n_pts
        # when w50 > w_high (out of bounds). For a w50 outside the range,
        # np.interp clamps, so a50 is anchored at the nearest end.
        idx50 = min(max(int(np.searchsorted(w_list, w50)), 1), n_pts - 1)
        h50 = np.interp(w50, [w_list[idx50-1], w_list[idx50]], [h_list[idx50-1], h_list[idx50]])

        area = np.empty(n_pts)
        area[idx50] = a50 + 0.5 * (w50 + w_list[idx50]) * (h_list[idx50] - h50)
        for i in range(idx50 + 1, n_pts):
            area[i] = area[i-1] + 0.5 * (w_list[i-1] + w_list[i]) * (h_list[i] - h_list[i-1])
        for i in range(idx50 - 1, -1, -1):
            area[i] = area[i+1] - 0.5 * (w_list[i+1] + w_list[i]) * (h_list[i+1] - h_list[i])

        df_res.append(pd.DataFrame({'stationid': s, 'width': w_list, 'wse': h_list,
                                    'wse_max': hmax, 'wse_min': hmin, 'area': area}))

    return pd.concat(df_res, ignore_index=True)

import pandas as pd
import numpy as np
import os
from config import START_DATE
from sklearn.metrics import mean_squared_error

def kge(obs, sim):
    """Return ``1 - sqrt((r-1)^2 + (m-1)^2 + (c-1)^2)`` for ``sim`` vs ``obs``.

    ``r`` is the Pearson correlation, ``m`` the ratio of means (sim / obs)
    and ``c`` the ratio of coefficients of variation (sim / obs).
    """
    corr = np.corrcoef(obs, sim)[0, 1]
    mean_ratio = np.mean(sim) / np.mean(obs)
    cv_sim = np.std(sim) / np.mean(sim)
    cv_obs = np.std(obs) / np.mean(obs)
    cv_ratio = cv_sim / cv_obs
    distance = np.sqrt((corr - 1)**2 + (mean_ratio - 1)**2 + (cv_ratio - 1)**2)
    return 1 - distance

def nse(obs, sim):
    """Return one minus the squared-error sum over the variance sum of ``obs``."""
    residual_ss = np.sum((obs - sim) ** 2)
    total_ss = np.sum((obs - np.mean(obs)) ** 2)
    return 1 - (residual_ss / total_ss)

def relative_rmse(obs, sim):
    """Return the root-mean-square error of ``sim`` divided by the mean of ``obs``."""
    return np.sqrt(mean_squared_error(obs, sim)) / np.mean(obs)


def estimate_all_discharge(df_fit_all, df_med_all):
    """Estimate discharge from width time series and score it per station.

    For each station in ``df_fit_all`` with a file ``<stationid>.csv`` under
    ``daily_long/daily_long``, the observed discharge (``qobs``) is joined to
    the width time series on (stationid, date), widths outside
    ``[w_low, w_high]`` are dropped, flow area is interpolated from the
    station's hypsometric curve, and discharge is computed as
    ``A**(5/3) * W**(-2/3) * slp**0.5 / n`` with n = 0.035. Stations with
    fewer than 10 usable rows are skipped.

    Args:
        df_fit_all: fit table with columns ``stationid``, ``w_low``,
            ``w_high`` and ``slp``.
        df_med_all: hypsometric curves with columns ``stationid``, ``width``,
            ``wse`` and ``area``; widths are assumed ascending per station.

    Returns:
        DataFrame with columns ``stationid``, ``date``, ``width``,
        ``area_hypso``, ``qobs``, ``Q_est``, ``kge``, ``nse`` and ``nrmse``;
        the three metrics are per-station values repeated on every row.
        Empty, with those columns, when no station qualifies.
    """
    manning_n = 0.035
    out_cols = ['stationid', 'date', 'width', 'area_hypso', 'qobs', 'Q_est', 'kge', 'nse', 'nrmse']

    folder = 'daily_long/daily_long'
    width_df = pd.read_csv('gages3000_glow_datemean_width_timeseries.csv')
    width_df['date'] = pd.to_datetime(width_df['date'])

    if df_fit_all.empty:
        return pd.DataFrame(columns=out_cols)

    all_results = []
    for s in df_fit_all['stationid'].unique():
        file_path = os.path.join(folder, f"{s}.csv")
        if not os.path.exists(file_path):
            continue
        df_val = pd.read_csv(file_path)
        # NOTE(review): assumes each file holds one row per consecutive day
        # starting at START_DATE - confirm against the data source.
        df_val['date'] = pd.date_range(start=START_DATE, periods=len(df_val), freq='D')
        df_val = df_val.dropna(subset=['qobs'])
        df_val = df_val.merge(width_df[width_df['stationid'] == s], on=['stationid', 'date'], how='inner')

        df_fit = df_fit_all[df_fit_all['stationid'] == s]
        df_med = df_med_all[df_med_all['stationid'] == s].reset_index(drop=True)
        if len(df_med) < 2:
            # No curve segment to interpolate on for this station.
            continue
        w_low, w_high, slp = df_fit.iloc[0][['w_low', 'w_high', 'slp']]
        in_range = (df_val['width'] >= w_low) & (df_val['width'] <= w_high)
        # Copy so the column assignments below write to an independent frame.
        df_val = df_val[in_range].copy()

        if len(df_val) < 10:
            continue

        curve_w = df_med['width'].to_numpy()
        curve_h = df_med['wse'].to_numpy()
        curve_a = df_med['area'].to_numpy()
        width = df_val['width'].to_numpy()

        # Clamp to a valid segment: a width equal to the first curve width
        # yields index 0, and index -1 would wrap around to the last point.
        idx = np.clip(np.searchsorted(curve_w, width), 1, len(curve_w) - 1)
        w_prev, w_next = curve_w[idx - 1], curve_w[idx]
        delta_h = curve_h[idx] - curve_h[idx - 1]
        frac = (width - w_prev) / (w_next - w_prev)

        df_val['area_hypso'] = curve_a[idx - 1] + 0.5 * (w_prev + width) * delta_h * frac
        df_val['Q_est'] = df_val['area_hypso']**(5/3) * df_val['width']**(-2/3) * slp**0.5 / manning_n

        # NOTE(review): drops rows with a NaN in any column, including
        # columns carried over from the input files.
        df_val = df_val.dropna()
        if df_val.empty:
            continue
        df_val['kge'] = kge(df_val['qobs'], df_val['Q_est'])
        df_val['nse'] = nse(df_val['qobs'], df_val['Q_est'])
        df_val['nrmse'] = relative_rmse(df_val['qobs'], df_val['Q_est'])

        all_results.append(df_val[out_cols])

    if not all_results:
        return pd.DataFrame(columns=out_cols)
    return pd.concat(all_results, ignore_index=True)
