In [1]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import IsolationForest
import time
from tqdm import tqdm  # Progress bar

# Read the station metadata export. STATION_ID is read as text (not parsed as a
# number) so it can be used verbatim in request URLs and as a merge key.
metadata_path = "C:\\Users\\pbenko\\Documents\\SQL saved scripts\\meta_data_for_all_active_systems_for_calculation_20250401(CSV).csv"
metadata_df = pd.read_csv(metadata_path, dtype={"STATION_ID": str})

# Independent copy of (at most) the first 840 metadata rows
filtered_metadata = metadata_df.iloc[:840].copy()

# Station IDs to process, in metadata order
site_ids = filtered_metadata["STATION_ID"].to_list()

# Empty frame that receives the per-station summaries
df_results = pd.DataFrame()

# Fetch the daily discharge series for every station, compute data-quality
# flags on it, and collect one summary row per station.
#
# Reads:  site_ids (list of station-id strings), df_results (DataFrame)
# Writes: df_results (one appended row per successfully processed station)

# Start progress bar
start_time = time.time()

# Loop-invariant request settings, computed once.
end_date = datetime.today().strftime("%Y-%m-%d")  # today's date in YYYY-MM-DD; fixed so a run crossing midnight stays consistent
request_timeout_s = 30  # seconds; without a timeout a single stalled connection would hang the whole run

# One summary dict per station; the DataFrame is built once after the loop
# instead of being re-concatenated on every iteration (which is quadratic).
station_summaries = []

# A Session reuses the HTTP connection across the many requests to the same host.
with requests.Session() as session, tqdm(total=len(site_ids), desc="Processing Stations", unit="station") as pbar:
    for i, DivrtID in enumerate(site_ids, start=1):
        api_url = f"https://www.waterrights.utah.gov/dvrtdb/daily-chart.asp?station_id={DivrtID}&end_date={end_date}&f=json"

        # --- Fetch: a network failure or non-JSON body skips this station, not the whole run ---
        data = None
        fetched = False
        try:
            response = session.get(api_url, timeout=request_timeout_s)
            if response.status_code == 200:
                data = response.json()
                fetched = True
            else:
                print(f"Error fetching data for STATION_ID {DivrtID}: {response.status_code}")
        except (requests.RequestException, ValueError) as exc:
            # RequestException: connection/timeout errors; ValueError: body was not valid JSON
            print(f"Error fetching data for STATION_ID {DivrtID}: {exc}")

        if fetched and isinstance(data, dict) and "data" in data:
            df = pd.DataFrame(data["data"], columns=["date", "value"])
            df = df.rename(columns={"date": "Date", "value": "DISCHARGE"})
            df["STATION_ID"] = DivrtID  # Ensure STATION_ID is included

            # Convert 'DISCHARGE' to numeric (unparseable values become NaN)
            df["DISCHARGE"] = pd.to_numeric(df["DISCHARGE"], errors="coerce")

            # === FLAGGING CRITERIA === #
            df["FLAG_NEGATIVE"] = df["DISCHARGE"] < 0
            df["FLAG_ZERO"] = df["DISCHARGE"] == 0

            # Statistics are computed on strictly positive readings only
            # (this filter also drops NaN and negative values).
            df_nonzero = df[df["DISCHARGE"] > 0].copy()

            if not df_nonzero.empty:
                # 95th percentile of the positive readings
                discharge_95th_percentile = np.percentile(df_nonzero["DISCHARGE"].dropna(), 95)

                # Inter-quartile range of the positive readings
                Q1, Q3 = df_nonzero["DISCHARGE"].quantile([0.25, 0.75])
                IQR = Q3 - Q1

                # Absolute change between consecutive positive readings
                df_nonzero["RATE_OF_CHANGE"] = df_nonzero["DISCHARGE"].diff().abs()

                # A run is a maximal stretch of identical consecutive values;
                # every member of a run of length >= 3 is flagged as repeated.
                run_id = (df_nonzero["DISCHARGE"] != df_nonzero["DISCHARGE"].shift()).cumsum()
                df_nonzero["FLAG_REPEATED"] = (
                    df_nonzero["DISCHARGE"].groupby(run_id).transform("count") >= 3
                )

                # Isolation Forest marks outliers with -1; convert to boolean
                model = IsolationForest(contamination=0.05, random_state=42)
                df_nonzero["OUTLIER_IF"] = model.fit_predict(df_nonzero[["DISCHARGE"]]) == -1

                # Bring the per-row results back by index alignment rather than
                # merging on "Date": df_nonzero keeps df's row index, so this is
                # exact even if a date occurs twice (a merge would multiply rows).
                # Rows without a positive reading get NaN / False.
                df["RATE_OF_CHANGE"] = df_nonzero["RATE_OF_CHANGE"]
                df["OUTLIER_IF"] = df_nonzero["OUTLIER_IF"].reindex(df.index, fill_value=False)
                df["FLAG_REPEATED"] = df_nonzero["FLAG_REPEATED"].reindex(df.index, fill_value=False)
            else:
                # No positive readings: define every statistic explicitly so that
                # nothing leaks in from a previously processed station. NaN
                # quartiles make both IQR comparisons below evaluate to False.
                discharge_95th_percentile = 0
                Q1 = Q3 = np.nan
                IQR = 0
                df["RATE_OF_CHANGE"] = np.nan
                df["OUTLIER_IF"] = False
                df["FLAG_REPEATED"] = False

            # Apply flagging based on computed values
            df["FLAG_Discharge"] = df["DISCHARGE"] > discharge_95th_percentile
            df["FLAG_IQR"] = (df["DISCHARGE"] < Q1 - 1.5 * IQR) | (df["DISCHARGE"] > Q3 + 1.5 * IQR)
            # NOTE(review): the rate of change is compared against the discharge
            # 95th percentile, not a percentile of the rate of change - confirm intended.
            df["FLAG_RoC"] = df["RATE_OF_CHANGE"] > discharge_95th_percentile

            df["FLAGGED"] = df[
                ["FLAG_NEGATIVE", "FLAG_ZERO", "FLAG_REPEATED", "FLAG_IQR", "OUTLIER_IF", "FLAG_Discharge", "FLAG_RoC"]
            ].any(axis=1)

            # Create summary for the current station
            total_records = len(df)
            total_zero = df["FLAG_ZERO"].sum()
            total_flagged = df["FLAGGED"].sum()
            station_summary = {
                "STATION_ID": DivrtID,
                "TOTAL_RECORDS": total_records,
                "TOTAL_NEGATIVE": df["FLAG_NEGATIVE"].sum(),
                "TOTAL_ZERO": total_zero,
                "TOTAL_95th": df["FLAG_Discharge"].sum(),
                "TOTAL_IQR": df["FLAG_IQR"].sum(),
                "TOTAL_RoC": df["FLAG_RoC"].sum(),
                "TOTAL_REPEATED": df["FLAG_REPEATED"].sum(),
                "TOTAL_IF": df["OUTLIER_IF"].sum(),
                "TOTAL_FLAGGED": total_flagged,
                "FLAG_RATIO": f"{(total_flagged / total_records * 100):.2f}%" if total_records > 0 else "0.00%",
                "ZERO_RATIO": f"{(total_zero / total_records * 100):.2f}%" if total_records > 0 else "0.00%"
            }

            station_summaries.append(station_summary)
        elif fetched:
            print(f"Error: 'data' key not found for STATION_ID {DivrtID}")

        # Update progress bar and estimated time
        pbar.update(1)
        elapsed_time = time.time() - start_time
        avg_time_per_station = elapsed_time / i
        estimated_time_remaining = avg_time_per_station * (len(site_ids) - i)
        pbar.set_postfix({"ETA (min)": f"{estimated_time_remaining / 60:.2f}"})

# Append all collected summaries to the results frame in a single concat
if station_summaries:
    df_results = pd.concat([df_results, pd.DataFrame(station_summaries)], ignore_index=True)

# Merge metadata with results on STATION_ID and write the combined table to CSV.
if "STATION_ID" in df_results.columns:
    # Keep one summary row per station: if the metadata lists a station more
    # than once it was summarised more than once, and a many-to-many merge
    # would multiply the metadata rows.
    station_results = df_results.drop_duplicates(subset="STATION_ID", keep="last")
    merged_df = pd.merge(filtered_metadata, station_results, on="STATION_ID", how="left")
else:
    # No station produced a summary (e.g. every request failed), so df_results
    # has no STATION_ID column and merging on it would raise KeyError.
    print("Warning: no station summaries available; saving metadata without flag columns")
    merged_df = filtered_metadata.copy()

# Save the final merged CSV
output_filename = "merged_840_station_metadata_flagging_criteria_results.csv"
merged_df.to_csv(output_filename, index=False)

print(f"Final merged data saved as {output_filename}")

Processing Stations: 100%|██████████| 839/839 [09:21<00:00,  1.49station/s, ETA (min)=0.00] 

Final merged data saved as merged_840_station_metadata_flagging_criteria_results.csv



