In [None]:
import json
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
# Folder prefixes used to build file names; each one ends with a backslash separator.
path_df, path_model, path_fig = (folder + "\\" for folder in ("Table", "Model", "Figure"))

# Columns carried through to the filled output table.
col_list = "River Station Discharge".split()

In [None]:
# Global seaborn theme for every figure in the notebook.
sns.set_theme(font="Times New Roman", style="white")

# Font settings reused for axis labels and titles.
font1 = dict(family="Times New Roman", weight="bold", size=16)

# Line colour for each value of the "Source" column.
color_dict = {
    "Observed": "blue",
    "Predicted": "green",
    "Interpolated": "yellow",
    "Boundary matched": "red",
}
sources = list(color_dict.keys())
colors = list(color_dict.values())

# Model Training and Regression

In [15]:
# Read the model comparison table; the river name is taken from its first row.
file_model_performance = pd.read_csv(path_df + "Model_performance.csv")
river = file_model_performance["River"].values[0]

# Name of the first model flagged with "Best Model" == 1.
is_best = file_model_performance["Best Model"] == 1
best_model = file_model_performance.loc[is_best, "Model"].values[0]

# Load the stored hyper-parameters for that model.
file_best_paras = path_model + best_model + "_best_params.json"
with open(file_best_paras, "r") as f:
    best_params = json.load(f)

# Map the model name onto its estimator class and instantiate it.
regressors = {"RF": RandomForestRegressor, "XGB": XGBRegressor}
if best_model not in regressors:
    raise ValueError("No such model")
model = regressors[best_model](**best_params)

In [None]:
# The first column of the selected-features table lists the feature names.
file_features = f"{path_df}Selected_features_{best_model}.csv"
selected_features_table = pd.read_csv(file_features)
features = selected_features_table.iloc[:, 0].tolist()

# Full data table (gaps included), indexed by the parsed "Date" column.
df = pd.read_csv(
    path_df + "Data1_with_nan.csv",
    index_col="Date",
    parse_dates=["Date"],
)
df.iloc[0]

In [None]:
# Train only on rows where every selected feature and every output column is present.
required_columns = features + col_list
df_dropna = df.dropna(subset=required_columns)

X_train, y_train = df_dropna[features], df_dropna["Discharge"]

model.fit(X_train, y_train)

In [None]:
# Split df into rows with at least one NaN (to be predicted) and complete rows.
# NOTE(review): the mask looks at ALL columns, so a row with a recorded
# Discharge but a gap in any other column is also treated as missing and has
# its Discharge replaced by the model prediction - confirm this is intended.
has_nan = df.isnull().any(axis=1)

# Explicit copies: adding columns to a boolean-mask slice of `df` would
# otherwise trigger pandas' SettingWithCopyWarning.
df2 = df.loc[~has_nan, col_list].copy()
df2["Source"] = "Observed"

# Predict nan values
df1 = df.loc[has_nan].copy()
if not df1.empty:
    # Only call the model when there is something to predict.
    # NOTE(review): assumes the selected features of these rows are usable by
    # the model (they may themselves contain NaN) - TODO confirm.
    df1["Discharge"] = model.predict(df1[features])
# Negative predictions are floored at zero.
df1["Discharge"] = df1["Discharge"].clip(lower=0)
df1 = df1[col_list]
df1["Source"] = "Predicted"

# Combine df1 and df2 back into one table in index order
df = pd.concat([df1, df2]).sort_index()
df["Discharge"] = df["Discharge"].round(2)
df

# Post-processing for Temporal Consistency

In [None]:
# Array views of the combined table used by the post-processing steps.
dates = df.index
source = df["Source"].values
discharge = df["Discharge"].values

# Integer positions of rows by provenance label.
observed_indices = np.flatnonzero(source == "Observed")
predicted_indices = np.flatnonzero(source == "Predicted")

In [None]:
def find_missing_segments(predicted_indices):
    """Group indices into maximal runs of consecutive integers.

    Parameters
    ----------
    predicted_indices : sequence of int
        Positions in the order they should be scanned; a new run starts
        whenever an element is not exactly one more than its predecessor.

    Returns
    -------
    list of tuple
        One ``(start, end)`` pair per run, both ends inclusive. An empty
        input yields an empty list instead of raising ``IndexError``.
    """
    if len(predicted_indices) == 0:
        return []

    segments = []
    start = predicted_indices[0]
    for previous, current in zip(predicted_indices[:-1], predicted_indices[1:]):
        if current != previous + 1:
            # Gap in the sequence: close the current run and open a new one.
            segments.append((start, previous))
            start = current
    segments.append((start, predicted_indices[-1]))
    return segments


# Inclusive (start, end) position pairs, one per run of consecutive predicted indices.
missing_segments = find_missing_segments(predicted_indices)

In [None]:
# Objective minimised during boundary matching.
def objective_function(
    adjustment_factors,
    discharge,
    predicted_indices,
    boundary_obs_pred,
    boundary_pred_obs,
):
    """Sum of squared jumps across the given boundaries after scaling.

    A copy of ``discharge`` is scaled by ``adjustment_factors`` at
    ``predicted_indices``; the input array itself is left untouched.

    For every index ``b`` in ``boundary_obs_pred`` the unscaled value at ``b``
    is compared with the scaled value at ``b + 1``. For every index ``b`` in
    ``boundary_pred_obs`` the scaled value at ``b`` is compared with the
    unscaled value at ``b + 1``. A boundary array is skipped when it is empty
    or when any ``b + 1`` falls outside ``discharge``.

    Returns the accumulated squared mismatch (``0`` if nothing was compared).
    """
    n_values = len(discharge)
    scaled = discharge.copy()
    scaled[predicted_indices] *= adjustment_factors

    def usable(boundary):
        # Non-empty, and every right-hand neighbour exists.
        return len(boundary) > 0 and np.all(boundary + 1 < n_values)

    total = 0

    if usable(boundary_obs_pred):
        jump = discharge[boundary_obs_pred] - scaled[boundary_obs_pred + 1]
        total += np.sum(jump ** 2)

    if usable(boundary_pred_obs):
        jump = scaled[boundary_pred_obs] - discharge[boundary_pred_obs + 1]
        total += np.sum(jump ** 2)

    return total


In [None]:
# Process each missing segment.
#
# Work on private, writable copies of the arrays. Depending on the pandas
# version and settings, `Series.values` may be a view of the DataFrame's storage
# or a read-only array, so the in-place edits below must not rely on it; the
# results are written back to `df` explicitly after the loop.
discharge = np.array(discharge, dtype=float)
source = np.array(source, dtype=object)
n_total = len(discharge)

for start_idx, end_idx in missing_segments:
    segment_length = end_idx - start_idx + 1
    segment_dates = dates[start_idx:end_idx + 1]
    has_left = start_idx > 0
    has_right = end_idx < n_total - 1

    # Condition (1): Length does not exceed 20 days and does not contain data from May to July
    in_may_to_july = ((segment_dates.month >= 5) & (segment_dates.month <= 7)).any()
    if segment_length <= 20 and not in_may_to_july:
        if has_left and has_right:
            # Linear interpolation between the two values bracketing the gap
            interp_func = interp1d(
                [start_idx - 1, end_idx + 1],
                [discharge[start_idx - 1], discharge[end_idx + 1]],
                kind="linear",
            )
            discharge[start_idx:end_idx + 1] = interp_func(np.arange(start_idx, end_idx + 1))
            source[start_idx:end_idx + 1] = "Interpolated"
        elif has_right:
            # Segment at the start of the sequence: fill with the value to its right
            discharge[start_idx:end_idx + 1] = discharge[end_idx + 1]
            source[start_idx:end_idx + 1] = "Interpolated"
        elif has_left:
            # Segment at the end of the sequence: fill with the value to its left
            discharge[start_idx:end_idx + 1] = discharge[start_idx - 1]
            source[start_idx:end_idx + 1] = "Interpolated"
        # Otherwise the segment spans the whole series: there is no neighbour
        # to fill from, so the model predictions are kept unchanged.

    else:
        # Condition (2): Boundary condition matching
        predicted_segment_indices = np.arange(start_idx, end_idx + 1)
        left_observed = (start_idx - 1) in observed_indices
        right_observed = (end_idx + 1) in observed_indices
        if not (left_observed or right_observed):
            # No observed neighbour to match against; nothing to adjust.
            continue

        # objective_function compares index b with index b + 1 for both
        # boundary arrays: unscaled[b] vs scaled[b + 1] for `boundary_obs_pred`
        # and scaled[b] vs unscaled[b + 1] for `boundary_pred_obs`. The trailing
        # boundary must therefore be the LAST PREDICTED index (end_idx), not the
        # observation after it; otherwise two observed values are compared and
        # the trailing edge is never optimised.
        boundary_obs_pred = np.array([start_idx - 1] if left_observed else [], dtype=int)
        boundary_pred_obs = np.array([end_idx] if right_observed else [], dtype=int)

        # Initialize adjustment factors
        initial_factors = np.ones(len(predicted_segment_indices))

        # Optimize adjustment factors
        result = minimize(
            objective_function,
            initial_factors,
            args=(discharge, predicted_segment_indices, boundary_obs_pred, boundary_pred_obs),
            method="L-BFGS-B",
            bounds=[(0.2, 4)] * len(predicted_segment_indices),
        )

        # Compute X days: 10% of the segment length or 10 days, whichever is smaller
        X_days = min(int(segment_length * 0.1), 10)
        X_days = max(X_days, 3)  # Ensure X_days is at least 3
        X_days = min(X_days, segment_length)  # Ensure X_days does not exceed the segment length

        # Mark the first X days (observed data before the segment) and the last
        # X days (observed data after it). A mask guarantees every position is
        # scaled exactly once, even when the two windows overlap in a short
        # segment.
        adjust_mask = np.zeros(segment_length, dtype=bool)
        if left_observed:
            adjust_mask[:X_days] = True
        if right_observed:
            adjust_mask[segment_length - X_days:] = True

        adjusted_positions = predicted_segment_indices[adjust_mask]
        discharge[adjusted_positions] *= result.x[adjust_mask]
        source[adjusted_positions] = "Boundary matched"

# Save the adjusted results back to the DataFrame
df["Discharge"] = discharge
df["Source"] = source

df.to_csv(path_df + "Data3_filled.csv")

In [None]:
# Number of rows per value of the "Source" column.
df["Source"].value_counts()

# Plot

In [None]:
# Year labels at five-year steps between the two dates, used as x-axis ticks.
date1, date2 = pd.Timestamp(year=1950, month=1, day=1), pd.Timestamp(year=2025, month=1, day=1)
five_year_starts = pd.date_range(start=date1, end=date2, freq="5YS")
date_list = [stamp.strftime("%Y") for stamp in five_year_starts]

In [None]:
# Discharge time series, one line per data provenance label.
fig, ax = plt.subplots(figsize=(12, 6))

actual_sources = df["Source"].unique()
# The loop variable is deliberately NOT called `source`: that name holds the
# per-row label array used by the post-processing cell, and overwriting it
# here would make that cell fail when re-run after plotting.
for source_label in sources:
    if source_label in actual_sources:
        df_source = df[df["Source"] == source_label]
        # Reindex to the full date index so rows of other sources become NaN
        # and each line is broken there.
        df_source = df_source.reindex(dates)

        ax.plot(
            df_source.index,
            df_source["Discharge"],
            label=source_label,
            color=color_dict[source_label],
        )

ax.set_xlabel("Time", font=font1)
ax.set_ylabel("Discharge (m³/s)", font=font1)

plt.xticks(date_list, date_list, fontsize=13)
plt.yticks(fontsize=13)

plt.xlim(pd.Timestamp(1949, 1, 1), pd.Timestamp(2025, 1, 1))

plt.title(f"{river} River", font=font1)

ax.legend(fontsize=13)

plt.tight_layout()
plt.savefig(path_fig + f"Discharge_Time_Series_{river}.png", dpi=300)