In [18]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import IsolationForest
import time
from tqdm import tqdm  # Progress bar

# --- Station metadata -------------------------------------------------------
# STATION_ID is parsed as text (dtype=str) rather than letting pandas infer a
# numeric type for it.
metadata_path = "C:\\Users\\pbenko\\Documents\\SQL saved scripts\\meta_data_for_all_active_systems_for_calculation_20250401(CSV).csv"
metadata_df = pd.read_csv(metadata_path, dtype={"STATION_ID": str})

# Work on an independent copy of the first 10 metadata rows only.
filtered_metadata = metadata_df.iloc[:10].copy()

# Station identifiers to process, in the same order as the selected rows.
site_ids = filtered_metadata["STATION_ID"].tolist()

# Empty frame that will receive the per-station summary rows.
df_results = pd.DataFrame()

# === Per-station processing ==================================================
# For every station: download the daily discharge series, flag suspect
# readings, and build ONE summary row that holds the overall, irrigation-season
# and non-irrigation-season counts side by side (so all of them survive a later
# join on STATION_ID).

DAILY_CHART_URL = "https://www.waterrights.utah.gov/dvrtdb/daily-chart.asp"
REQUEST_TIMEOUT_SECONDS = 30  # never let one unresponsive request hang the run
IRRIGATION_SEASON = "Irrigation Season"
NON_IRRIGATION_SEASON = "Non-Irrigation Season"
RSD_THRESHOLD_PERCENT = 1000  # percent deviation from the mean above which a value is flagged

# (flag column, name used in the TOTAL_* keys, name used in the *_RATIO keys)
SUMMARY_FLAGS = [
    ("FLAG_NEGATIVE", "NEGATIVE", "NEGATIVE"),
    ("FLAG_ZERO", "ZERO", "ZERO"),
    ("FLAG_Discharge", "95th", "DISCHARGE"),
    ("FLAG_IQR", "IQR", "IQR"),
    ("FLAG_RoC", "RoC", "RoC"),
    ("FLAG_REPEATED", "REPEATED", "REPEATED"),
    ("OUTLIER_IF", "IF", "IF"),
    ("FLAG_RSD", "RSD", "RSD"),
]


def _fetch_station_frame(station_id, end_date):
    """Download one station's daily record.

    Returns a DataFrame with columns Date (datetime), DISCHARGE (numeric,
    unparseable values coerced to NaN) and STATION_ID, or None when the
    request fails, the status is not 200, the body is not JSON, or the JSON
    has no "data" field. Failures are reported through ``tqdm.write`` so the
    progress bar is not corrupted.
    """
    try:
        response = requests.get(
            DAILY_CHART_URL,
            params={"station_id": station_id, "end_date": end_date, "f": "json"},
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
    except requests.RequestException as exc:
        tqdm.write(f"Station {station_id}: request failed ({exc})")
        return None

    if response.status_code != 200:
        tqdm.write(f"Station {station_id}: HTTP status {response.status_code}")
        return None

    try:
        payload = response.json()
    except ValueError as exc:  # body was not valid JSON
        tqdm.write(f"Station {station_id}: invalid JSON ({exc})")
        return None

    if not isinstance(payload, dict) or "data" not in payload:
        tqdm.write(f"Station {station_id}: response has no 'data' field")
        return None

    frame = pd.DataFrame(payload["data"], columns=["date", "value"])
    frame = frame.rename(columns={"date": "Date", "value": "DISCHARGE"})
    frame["STATION_ID"] = station_id
    frame["Date"] = pd.to_datetime(frame["Date"])
    frame["DISCHARGE"] = pd.to_numeric(frame["DISCHARGE"], errors="coerce")
    return frame


def _flag_discharge(frame):
    """Return a copy of *frame* with SEASON and every flag column added.

    Statistics (quartiles, 95th percentile, mean, Isolation Forest, repeated
    runs, rate of change) are computed on strictly positive discharge values
    only and aligned back to the full frame by index, so rows are never
    duplicated and flag columns stay boolean. When a station has no positive
    values, every statistics-based flag is False.
    """
    frame = frame.copy()

    # Irrigation season: April 1st through October 1st (inclusive).
    month = frame["Date"].dt.month
    day = frame["Date"].dt.day
    in_irrigation = month.between(4, 9) | ((month == 10) & (day == 1))
    frame["SEASON"] = np.where(in_irrigation, IRRIGATION_SEASON, NON_IRRIGATION_SEASON)

    discharge = frame["DISCHARGE"]
    frame["FLAG_NEGATIVE"] = discharge < 0
    frame["FLAG_ZERO"] = discharge == 0

    # Zero, negative and missing values are excluded from the statistics.
    positive = discharge[discharge > 0]

    if positive.empty:
        frame["RATE_OF_CHANGE"] = np.nan
        frame["PERCENT_DEV"] = np.nan
        for column in ("FLAG_REPEATED", "OUTLIER_IF", "FLAG_RSD",
                       "FLAG_Discharge", "FLAG_IQR", "FLAG_RoC"):
            frame[column] = False
    else:
        q1, q3 = positive.quantile([0.25, 0.75])
        iqr = q3 - q1
        discharge_95th_percentile = np.percentile(positive, 95)

        # Absolute change between consecutive positive readings; assignment
        # aligns on the index, leaving NaN on non-positive rows.
        frame["RATE_OF_CHANGE"] = positive.diff().abs()

        # A run is a maximal stretch of identical consecutive positive values;
        # runs of 4 or more readings are flagged.
        run_id = (positive != positive.shift()).cumsum()
        repeated = positive.groupby(run_id).transform("count") >= 4
        frame["FLAG_REPEATED"] = repeated.reindex(frame.index, fill_value=False)

        # Isolation Forest on the positive values; -1 marks an outlier.
        model = IsolationForest(contamination=0.05, random_state=42)
        is_outlier = pd.Series(
            model.fit_predict(positive.to_frame()) == -1, index=positive.index
        )
        frame["OUTLIER_IF"] = is_outlier.reindex(frame.index, fill_value=False)

        # Percent deviation of every reading from the mean positive discharge.
        mean_discharge = positive.mean()
        frame["PERCENT_DEV"] = (discharge - mean_discharge).abs() / mean_discharge * 100
        frame["FLAG_RSD"] = (frame["PERCENT_DEV"] > RSD_THRESHOLD_PERCENT) & (discharge != 0)

        frame["FLAG_Discharge"] = discharge > discharge_95th_percentile
        frame["FLAG_IQR"] = (discharge < q1 - 1.5 * iqr) | (discharge > q3 + 1.5 * iqr)
        # Rate of change is compared against the 95th percentile of discharge.
        frame["FLAG_RoC"] = frame["RATE_OF_CHANGE"] > discharge_95th_percentile

    frame["FLAGGED"] = frame[[column for column, _, _ in SUMMARY_FLAGS]].any(axis=1)
    return frame


def _summarize_flags(frame, tag=""):
    """Build the TOTAL_* / *_RATIO summary for *frame*.

    *tag* is inserted into every key ("" for the whole record, "IR_" for the
    irrigation season, "NIR_" for the non-irrigation season). Ratios are each
    flag's count as a percentage of all flagged rows, or 0 when nothing is
    flagged.
    """
    total_flagged = int(frame["FLAGGED"].sum())
    counts = {column: int(frame[column].sum()) for column, _, _ in SUMMARY_FLAGS}

    summary = {f"TOTAL_{tag}RECORDS": len(frame)}
    for column, total_name, _ in SUMMARY_FLAGS:
        summary[f"TOTAL_{tag}{total_name}"] = counts[column]
    summary[f"TOTAL_{tag}FLAGGED"] = total_flagged
    for column, _, ratio_name in SUMMARY_FLAGS:
        summary[f"{ratio_name}_{tag}RATIO"] = (
            (counts[column] / total_flagged) * 100 if total_flagged else 0
        )
    return summary


start_time = time.time()
end_date = datetime.today().strftime("%Y-%m-%d")  # same end date for every station
station_rows = []

with tqdm(total=len(site_ids), desc="Processing Stations", unit="station") as pbar:
    for DivrtID in site_ids:
        df = _fetch_station_frame(DivrtID, end_date)
        if df is not None:
            df = _flag_discharge(df)
            irrigation_mask = df["SEASON"] == IRRIGATION_SEASON

            # One row per station: overall, irrigation and non-irrigation
            # summaries share the row so they all carry the STATION_ID.
            station_summary = {"STATION_ID": DivrtID}
            station_summary.update(_summarize_flags(df))
            station_summary.update(_summarize_flags(df[irrigation_mask], "IR_"))
            station_summary.update(_summarize_flags(df[~irrigation_mask], "NIR_"))
            station_rows.append(station_summary)

        # Advance for every station, including the ones that were skipped.
        pbar.update(1)

# Build the results frame once; keep a STATION_ID column even when every
# station failed so downstream joins on STATION_ID still work.
df_results = pd.DataFrame(station_rows) if station_rows else pd.DataFrame(columns=["STATION_ID"])

# Attach the per-station summaries to the selected metadata rows and export.
# The left join keeps every metadata row, so a station without a summary still
# appears in the output with empty summary columns.
if "STATION_ID" in df_results.columns:
    merged_df = pd.merge(filtered_metadata, df_results, on="STATION_ID", how="left")
else:
    # No station produced a summary, so df_results has no STATION_ID column and
    # pd.merge(on="STATION_ID") would raise KeyError; export the metadata alone.
    merged_df = filtered_metadata.copy()

output_filename = "merged_seasonal_flagging_results.csv"
merged_df.to_csv(output_filename, index=False)

print(f"Final merged data saved as {output_filename}")

Processing Stations: 100%|██████████| 10/10 [00:09<00:00,  1.09station/s]

Final merged data saved as merged_seasonal_flagging_results.csv



