In [3]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import IsolationForest
import time
from tqdm import tqdm  # Progress bar

# === Configuration === #
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
metadata_path = "C:\\Users\\pbenko\\Documents\\SQL saved scripts\\meta_data_for_all_active_systems_for_calculation_20250401(CSV).csv"
output_filename = "merged_3.0_839_station_metadata_flagging_criteria_results.csv"

MAX_STATIONS = 840        # process at most this many leading metadata rows
API_URL = "https://www.waterrights.utah.gov/dvrtdb/daily-chart.asp"
REQUEST_TIMEOUT_S = 30    # seconds; without a timeout a single stalled request blocks the run
MIN_REPEAT_RUN = 4        # consecutive identical positive readings that count as "repeated"
IF_CONTAMINATION = 0.05   # contamination passed to IsolationForest
RSD_THRESHOLD_PCT = 1000  # percent deviation from the mean treated as extreme

# Every per-record criterion that feeds the combined FLAGGED column.
FLAG_COLUMNS = [
    "FLAG_NEGATIVE", "FLAG_ZERO", "FLAG_REPEATED", "FLAG_IQR",
    "OUTLIER_IF", "FLAG_Discharge", "FLAG_RoC", "FLAG_RSD",
]


def fetch_station_discharge(station_id, end_date, timeout=REQUEST_TIMEOUT_S):
    """Download the daily discharge series for one station.

    Parameters
    ----------
    station_id : str
        Value sent as the ``station_id`` query parameter.
    end_date : str
        Value sent as the ``end_date`` query parameter (YYYY-MM-DD).
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``Date``, ``DISCHARGE`` (numeric, unparseable values become NaN)
        and ``STATION_ID``. ``None`` when the request fails, the status is not
        200, the body is not JSON, or the payload has no ``"data"`` entry.
    """
    query = {"station_id": station_id, "end_date": end_date, "f": "json"}
    try:
        response = requests.get(API_URL, params=query, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable station must not abort a run over hundreds of stations.
        tqdm.write(f"Station {station_id}: request failed ({exc}); skipped")
        return None
    if response.status_code != 200:
        return None

    try:
        payload = response.json()
    except ValueError:
        tqdm.write(f"Station {station_id}: response is not valid JSON; skipped")
        return None
    if not isinstance(payload, dict) or "data" not in payload:
        return None

    records = pd.DataFrame(payload["data"], columns=["date", "value"]).rename(
        columns={"date": "Date", "value": "DISCHARGE"}
    )
    records["STATION_ID"] = station_id
    records["DISCHARGE"] = pd.to_numeric(records["DISCHARGE"], errors="coerce")
    return records


def flag_discharge_records(records):
    """Return a copy of ``records`` with the quality-control flag columns added.

    Adds ``FLAG_NEGATIVE``, ``FLAG_ZERO``, ``RATE_OF_CHANGE``, ``FLAG_REPEATED``,
    ``OUTLIER_IF``, ``PERCENT_DEV``, ``FLAG_RSD``, ``FLAG_Discharge``,
    ``FLAG_IQR``, ``FLAG_RoC`` and the combined ``FLAGGED`` column.

    All statistics (quartiles, 95th percentile, mean, Isolation Forest, run
    lengths, rate of change) are computed on strictly positive readings only.
    When a station has no positive reading, the thresholds fall back to 0 and
    the statistical flags to False, so nothing is carried over from a
    previously processed station.
    """
    # A fresh RangeIndex guarantees the index-aligned assignments below are 1:1.
    df = records.reset_index(drop=True)

    df["FLAG_NEGATIVE"] = df["DISCHARGE"] < 0
    df["FLAG_ZERO"] = df["DISCHARGE"] == 0

    positive = df.loc[df["DISCHARGE"] > 0, "DISCHARGE"]

    if positive.empty:
        q1 = q3 = iqr = discharge_95th_percentile = 0.0
        df["RATE_OF_CHANGE"] = np.nan
        df["OUTLIER_IF"] = False
        df["FLAG_REPEATED"] = False
        df["PERCENT_DEV"] = np.nan
        df["FLAG_RSD"] = False
    else:
        q1, q3 = positive.quantile([0.25, 0.75])
        iqr = q3 - q1
        discharge_95th_percentile = np.percentile(positive, 95)

        # Results are written back by index rather than by merging on "Date":
        # a merge duplicates rows whenever a station reports a date twice.
        df["RATE_OF_CHANGE"] = positive.diff().abs()

        # Label each run of identical consecutive positive readings, then
        # flag every member of a run that is long enough.
        run_id = (positive != positive.shift()).cumsum()
        repeated = positive.groupby(run_id).transform("count") >= MIN_REPEAT_RUN
        df["FLAG_REPEATED"] = repeated.reindex(df.index, fill_value=False)

        model = IsolationForest(contamination=IF_CONTAMINATION, random_state=42)
        is_outlier = pd.Series(
            model.fit_predict(positive.to_frame()) == -1, index=positive.index
        )
        df["OUTLIER_IF"] = is_outlier.reindex(df.index, fill_value=False)

        mean_discharge = positive.mean()
        df["PERCENT_DEV"] = (df["DISCHARGE"] - mean_discharge).abs() / mean_discharge * 100
        df["FLAG_RSD"] = (df["PERCENT_DEV"] > RSD_THRESHOLD_PCT) & (df["DISCHARGE"] != 0)

    df["FLAG_Discharge"] = df["DISCHARGE"] > discharge_95th_percentile
    df["FLAG_IQR"] = (df["DISCHARGE"] < q1 - 1.5 * iqr) | (df["DISCHARGE"] > q3 + 1.5 * iqr)
    # NOTE(review): the day-to-day change is compared with the 95th percentile
    # of the discharge itself, not of the change -- confirm this is intended.
    df["FLAG_RoC"] = df["RATE_OF_CHANGE"] > discharge_95th_percentile

    df["FLAGGED"] = df[FLAG_COLUMNS].any(axis=1)
    return df


def summarize_station(station_id, flagged):
    """Collapse one station's flagged records into a single summary row (dict)."""
    def count(column):
        return int(flagged[column].sum())

    return {
        "STATION_ID": station_id,
        "TOTAL_RECORDS": len(flagged),
        "TOTAL_NEGATIVE": count("FLAG_NEGATIVE"),
        "TOTAL_ZERO": count("FLAG_ZERO"),
        "TOTAL_95th": count("FLAG_Discharge"),
        "TOTAL_IQR": count("FLAG_IQR"),
        "TOTAL_RoC": count("FLAG_RoC"),
        "TOTAL_REPEATED": count("FLAG_REPEATED"),
        "TOTAL_IF": count("OUTLIER_IF"),
        "TOTAL_RSD": count("FLAG_RSD"),
        "TOTAL_FLAGGED": count("FLAGGED"),
    }


# A notebook kernel executes cells with __name__ == "__main__", so the run below
# still happens when the cell is executed; the guard only keeps the helpers
# importable without touching the file system or the network.
if __name__ == "__main__":
    # Load metadata CSV (STATION_ID kept as a string)
    metadata_df = pd.read_csv(metadata_path, dtype={"STATION_ID": str})
    filtered_metadata = metadata_df.head(MAX_STATIONS).copy()
    site_ids = filtered_metadata["STATION_ID"].tolist()

    # Same end date for every station, even if the run crosses midnight.
    end_date = datetime.today().strftime("%Y-%m-%d")

    # Collect one dict per station and build the frame once at the end;
    # concatenating inside the loop copies the growing frame every iteration.
    station_summaries = []
    for DivrtID in tqdm(site_ids, desc="Processing Stations", unit="station"):
        station_records = fetch_station_discharge(DivrtID, end_date)
        if station_records is None:
            continue
        station_summaries.append(
            summarize_station(DivrtID, flag_discharge_records(station_records))
        )

    if station_summaries:
        df_results = pd.DataFrame(station_summaries)
    else:
        # Keep the join key so the merge below still works when every station failed.
        df_results = pd.DataFrame({"STATION_ID": pd.Series(dtype=str)})

    # Merge metadata with results on STATION_ID
    merged_df = pd.merge(filtered_metadata, df_results, on="STATION_ID", how="left")

    # Save the final merged CSV
    merged_df.to_csv(output_filename, index=False)

    print(f"Final merged data saved as {output_filename}")


Processing Stations: 100%|██████████| 839/839 [41:55<00:00,  3.00s/station]  

Final merged data saved as merged_2.0_840_station_metadata_flagging_criteria_results.csv





In [6]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import IsolationForest
import time
from tqdm import tqdm  # Progress bar

# === Configuration === #
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
metadata_path = "C:\\Users\\pbenko\\Documents\\SQL saved scripts\\meta_data_for_all_active_systems_for_calculation_20250401(CSV).csv"
output_filename = "merged_3.0_10_station_metadata_flagging_criteria_results.csv"

MAX_STATIONS = 10         # process at most this many leading metadata rows
API_URL = "https://www.waterrights.utah.gov/dvrtdb/daily-chart.asp"
REQUEST_TIMEOUT_S = 30    # seconds; without a timeout a single stalled request blocks the run
MIN_REPEAT_RUN = 4        # consecutive identical positive readings that count as "repeated"
IF_CONTAMINATION = 0.05   # contamination passed to IsolationForest
RSD_THRESHOLD_PCT = 1000  # percent deviation from the mean treated as extreme

# Every per-record criterion that feeds the combined FLAGGED column.
FLAG_COLUMNS = [
    "FLAG_NEGATIVE", "FLAG_ZERO", "FLAG_REPEATED", "FLAG_IQR",
    "OUTLIER_IF", "FLAG_Discharge", "FLAG_RoC", "FLAG_RSD",
]

# (flag column, total key, ratio key) in the order the summary columns are written.
SUMMARY_FIELDS = [
    ("FLAG_NEGATIVE", "TOTAL_NEGATIVE", "NEGATIVE_RATIO"),
    ("FLAG_ZERO", "TOTAL_ZERO", "ZERO_RATIO"),
    ("FLAG_Discharge", "TOTAL_95th", "DISCHARGE_RATIO"),
    ("FLAG_IQR", "TOTAL_IQR", "IQR_RATIO"),
    ("FLAG_RoC", "TOTAL_RoC", "RoC_RATIO"),
    ("FLAG_REPEATED", "TOTAL_REPEATED", "REPEATED_RATIO"),
    ("OUTLIER_IF", "TOTAL_IF", "IF_RATIO"),
    ("FLAG_RSD", "TOTAL_RSD", "RSD_RATIO"),
]

# Season label -> prefix of the per-season count columns in the summary.
SEASON_PREFIXES = {"Irrigation": "IRRIGATION", "Non-Irrigation": "NON_IRRIGATION"}
SEASONAL_COLUMNS = [
    "FLAG_NEGATIVE", "FLAG_ZERO", "FLAG_Discharge", "FLAG_IQR", "FLAG_RoC",
    "FLAG_REPEATED", "OUTLIER_IF", "FLAG_RSD", "FLAGGED",
]


def fetch_station_discharge(station_id, end_date, timeout=REQUEST_TIMEOUT_S):
    """Download the daily discharge series for one station.

    Parameters
    ----------
    station_id : str
        Value sent as the ``station_id`` query parameter.
    end_date : str
        Value sent as the ``end_date`` query parameter (YYYY-MM-DD).
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``Date``, ``DISCHARGE`` (numeric, unparseable values become NaN)
        and ``STATION_ID``. ``None`` when the request fails, the status is not
        200, the body is not JSON, or the payload has no ``"data"`` entry.
    """
    query = {"station_id": station_id, "end_date": end_date, "f": "json"}
    try:
        response = requests.get(API_URL, params=query, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable station must not abort the whole run.
        tqdm.write(f"Station {station_id}: request failed ({exc}); skipped")
        return None
    if response.status_code != 200:
        return None

    try:
        payload = response.json()
    except ValueError:
        tqdm.write(f"Station {station_id}: response is not valid JSON; skipped")
        return None
    if not isinstance(payload, dict) or "data" not in payload:
        return None

    records = pd.DataFrame(payload["data"], columns=["date", "value"]).rename(
        columns={"date": "Date", "value": "DISCHARGE"}
    )
    records["STATION_ID"] = station_id
    records["DISCHARGE"] = pd.to_numeric(records["DISCHARGE"], errors="coerce")
    return records


def flag_discharge_records(records):
    """Return a copy of ``records`` with the quality-control flag columns added.

    Adds ``FLAG_NEGATIVE``, ``FLAG_ZERO``, ``RATE_OF_CHANGE``, ``FLAG_REPEATED``,
    ``OUTLIER_IF``, ``PERCENT_DEV``, ``FLAG_RSD``, ``FLAG_Discharge``,
    ``FLAG_IQR``, ``FLAG_RoC`` and the combined ``FLAGGED`` column.

    All statistics (quartiles, 95th percentile, mean, Isolation Forest, run
    lengths, rate of change) are computed on strictly positive readings only.
    When a station has no positive reading, the thresholds fall back to 0 and
    the statistical flags to False, so nothing is carried over from a
    previously processed station.
    """
    # A fresh RangeIndex guarantees the index-aligned assignments below are 1:1.
    df = records.reset_index(drop=True)

    df["FLAG_NEGATIVE"] = df["DISCHARGE"] < 0
    df["FLAG_ZERO"] = df["DISCHARGE"] == 0

    positive = df.loc[df["DISCHARGE"] > 0, "DISCHARGE"]

    if positive.empty:
        q1 = q3 = iqr = discharge_95th_percentile = 0.0
        df["RATE_OF_CHANGE"] = np.nan
        df["OUTLIER_IF"] = False
        df["FLAG_REPEATED"] = False
        df["PERCENT_DEV"] = np.nan
        df["FLAG_RSD"] = False
    else:
        q1, q3 = positive.quantile([0.25, 0.75])
        iqr = q3 - q1
        discharge_95th_percentile = np.percentile(positive, 95)

        # Results are written back by index rather than by merging on "Date":
        # a merge duplicates rows whenever a station reports a date twice.
        df["RATE_OF_CHANGE"] = positive.diff().abs()

        # Label each run of identical consecutive positive readings, then
        # flag every member of a run that is long enough.
        run_id = (positive != positive.shift()).cumsum()
        repeated = positive.groupby(run_id).transform("count") >= MIN_REPEAT_RUN
        df["FLAG_REPEATED"] = repeated.reindex(df.index, fill_value=False)

        model = IsolationForest(contamination=IF_CONTAMINATION, random_state=42)
        is_outlier = pd.Series(
            model.fit_predict(positive.to_frame()) == -1, index=positive.index
        )
        df["OUTLIER_IF"] = is_outlier.reindex(df.index, fill_value=False)

        mean_discharge = positive.mean()
        df["PERCENT_DEV"] = (df["DISCHARGE"] - mean_discharge).abs() / mean_discharge * 100
        df["FLAG_RSD"] = (df["PERCENT_DEV"] > RSD_THRESHOLD_PCT) & (df["DISCHARGE"] != 0)

    df["FLAG_Discharge"] = df["DISCHARGE"] > discharge_95th_percentile
    df["FLAG_IQR"] = (df["DISCHARGE"] < q1 - 1.5 * iqr) | (df["DISCHARGE"] > q3 + 1.5 * iqr)
    # NOTE(review): the day-to-day change is compared with the 95th percentile
    # of the discharge itself, not of the change -- confirm this is intended.
    df["FLAG_RoC"] = df["RATE_OF_CHANGE"] > discharge_95th_percentile

    df["FLAGGED"] = df[FLAG_COLUMNS].any(axis=1)
    return df


def label_season(dates):
    """Label each datetime as "Irrigation" (1 April through 1 October) or "Non-Irrigation".

    ``dates`` must be a datetime-typed Series. The previous mask only matched
    April and 1 October, so May-September landed in "Non-Irrigation".
    """
    month = dates.dt.month
    in_irrigation = month.between(4, 9) | ((month == 10) & (dates.dt.day == 1))
    return np.where(in_irrigation, "Irrigation", "Non-Irrigation")


def summarize_station(station_id, flagged):
    """Collapse one station's flagged records into a single summary row (dict).

    ``flagged`` needs the flag columns plus a ``SEASON`` column. The row holds
    the overall totals, each criterion's share of the flagged records in
    percent (0 when nothing is flagged), and per-season counts named
    ``IRRIGATION_<column>`` / ``NON_IRRIGATION_<column>``.
    """
    total_flagged = int(flagged["FLAGGED"].sum())
    counts = {column: int(flagged[column].sum()) for column, _, _ in SUMMARY_FIELDS}

    summary = {"STATION_ID": station_id, "TOTAL_RECORDS": len(flagged)}
    for column, total_key, _ in SUMMARY_FIELDS:
        summary[total_key] = counts[column]
    summary["TOTAL_FLAGGED"] = total_flagged

    # Ratios
    for column, _, ratio_key in SUMMARY_FIELDS:
        summary[ratio_key] = (counts[column] / total_flagged) * 100 if total_flagged else 0

    # Per-season counts; a season with no records contributes zeros.
    seasonal_counts = flagged.groupby("SEASON")[SEASONAL_COLUMNS].sum()
    for season, prefix in SEASON_PREFIXES.items():
        for column in SEASONAL_COLUMNS:
            present = season in seasonal_counts.index
            summary[f"{prefix}_{column}"] = (
                int(seasonal_counts.at[season, column]) if present else 0
            )
    return summary


# A notebook kernel executes cells with __name__ == "__main__", so the run below
# still happens when the cell is executed; the guard only keeps the helpers
# importable without touching the file system or the network.
if __name__ == "__main__":
    # Load metadata CSV (STATION_ID kept as a string)
    metadata_df = pd.read_csv(metadata_path, dtype={"STATION_ID": str})
    filtered_metadata = metadata_df.head(MAX_STATIONS).copy()
    site_ids = filtered_metadata["STATION_ID"].tolist()

    # Same end date for every station, even if the run crosses midnight.
    end_date = datetime.today().strftime("%Y-%m-%d")

    start_time = time.time()

    # Collect one dict per station and build the frame once at the end;
    # concatenating inside the loop copies the growing frame every iteration.
    station_summaries = []
    for DivrtID in tqdm(site_ids, desc="Processing Stations", unit="station"):
        station_records = fetch_station_discharge(DivrtID, end_date)
        if station_records is None:
            continue
        flagged_records = flag_discharge_records(station_records)
        flagged_records["Date"] = pd.to_datetime(flagged_records["Date"])
        flagged_records["SEASON"] = label_season(flagged_records["Date"])
        station_summaries.append(summarize_station(DivrtID, flagged_records))

    if station_summaries:
        df_results = pd.DataFrame(station_summaries)
    else:
        # Keep the join key so the merge below still works when every station failed.
        df_results = pd.DataFrame({"STATION_ID": pd.Series(dtype=str)})

    # Merge metadata with results on STATION_ID
    merged_df = pd.merge(filtered_metadata, df_results, on="STATION_ID", how="left")

    # Save the final merged CSV
    merged_df.to_csv(output_filename, index=False)

    # Time taken for processing
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Total Time Taken: {elapsed_time:.2f} seconds.")

Processing Stations: 100%|██████████| 10/10 [00:09<00:00,  1.00station/s]

Total Time Taken: 9.98 seconds.





In [1]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import IsolationForest
import time
from tqdm import tqdm  # Progress bar

# === Configuration === #
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
metadata_path = "C:\\Users\\pbenko\\Documents\\SQL saved scripts\\meta_data_for_all_active_systems_for_calculation_20250401(CSV).csv"
output_filename = "merged_seasonal_flagging_results.csv"

MAX_STATIONS = 10         # process at most this many leading metadata rows
API_URL = "https://www.waterrights.utah.gov/dvrtdb/daily-chart.asp"
REQUEST_TIMEOUT_S = 30    # seconds; without a timeout a single stalled request blocks the run
MIN_REPEAT_RUN = 4        # consecutive identical positive readings that count as "repeated"
IF_CONTAMINATION = 0.05   # contamination passed to IsolationForest
RSD_THRESHOLD_PCT = 1000  # percent deviation from the mean treated as extreme

# Every per-record criterion that feeds the combined FLAGGED column.
FLAG_COLUMNS = [
    "FLAG_NEGATIVE", "FLAG_ZERO", "FLAG_REPEATED", "FLAG_IQR",
    "OUTLIER_IF", "FLAG_Discharge", "FLAG_RoC", "FLAG_RSD",
]


def fetch_station_discharge(station_id, end_date, timeout=REQUEST_TIMEOUT_S):
    """Download the daily discharge series for one station.

    Parameters
    ----------
    station_id : str
        Value sent as the ``station_id`` query parameter.
    end_date : str
        Value sent as the ``end_date`` query parameter (YYYY-MM-DD).
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``Date``, ``DISCHARGE`` (numeric, unparseable values become NaN)
        and ``STATION_ID``. ``None`` when the request fails, the status is not
        200, the body is not JSON, or the payload has no ``"data"`` entry.
    """
    query = {"station_id": station_id, "end_date": end_date, "f": "json"}
    try:
        response = requests.get(API_URL, params=query, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable station must not abort the whole run.
        tqdm.write(f"Station {station_id}: request failed ({exc}); skipped")
        return None
    if response.status_code != 200:
        return None

    try:
        payload = response.json()
    except ValueError:
        tqdm.write(f"Station {station_id}: response is not valid JSON; skipped")
        return None
    if not isinstance(payload, dict) or "data" not in payload:
        return None

    records = pd.DataFrame(payload["data"], columns=["date", "value"]).rename(
        columns={"date": "Date", "value": "DISCHARGE"}
    )
    records["STATION_ID"] = station_id
    records["DISCHARGE"] = pd.to_numeric(records["DISCHARGE"], errors="coerce")
    return records


def flag_discharge_records(records):
    """Return a copy of ``records`` with the quality-control flag columns added.

    Adds ``FLAG_NEGATIVE``, ``FLAG_ZERO``, ``RATE_OF_CHANGE``, ``FLAG_REPEATED``,
    ``OUTLIER_IF``, ``PERCENT_DEV``, ``FLAG_RSD``, ``FLAG_Discharge``,
    ``FLAG_IQR``, ``FLAG_RoC`` and the combined ``FLAGGED`` column.

    All statistics (quartiles, 95th percentile, mean, Isolation Forest, run
    lengths, rate of change) are computed on strictly positive readings only.
    When a station has no positive reading, the thresholds fall back to 0 and
    the statistical flags to False, so nothing is carried over from a
    previously processed station.
    """
    # A fresh RangeIndex guarantees the index-aligned assignments below are 1:1.
    df = records.reset_index(drop=True)

    df["FLAG_NEGATIVE"] = df["DISCHARGE"] < 0
    df["FLAG_ZERO"] = df["DISCHARGE"] == 0

    positive = df.loc[df["DISCHARGE"] > 0, "DISCHARGE"]

    if positive.empty:
        q1 = q3 = iqr = discharge_95th_percentile = 0.0
        df["RATE_OF_CHANGE"] = np.nan
        df["OUTLIER_IF"] = False
        df["FLAG_REPEATED"] = False
        df["PERCENT_DEV"] = np.nan
        df["FLAG_RSD"] = False
    else:
        q1, q3 = positive.quantile([0.25, 0.75])
        iqr = q3 - q1
        discharge_95th_percentile = np.percentile(positive, 95)

        # Results are written back by index rather than by merging on "Date":
        # a merge duplicates rows whenever a station reports a date twice.
        df["RATE_OF_CHANGE"] = positive.diff().abs()

        # Label each run of identical consecutive positive readings, then
        # flag every member of a run that is long enough.
        run_id = (positive != positive.shift()).cumsum()
        repeated = positive.groupby(run_id).transform("count") >= MIN_REPEAT_RUN
        df["FLAG_REPEATED"] = repeated.reindex(df.index, fill_value=False)

        model = IsolationForest(contamination=IF_CONTAMINATION, random_state=42)
        is_outlier = pd.Series(
            model.fit_predict(positive.to_frame()) == -1, index=positive.index
        )
        df["OUTLIER_IF"] = is_outlier.reindex(df.index, fill_value=False)

        mean_discharge = positive.mean()
        df["PERCENT_DEV"] = (df["DISCHARGE"] - mean_discharge).abs() / mean_discharge * 100
        df["FLAG_RSD"] = (df["PERCENT_DEV"] > RSD_THRESHOLD_PCT) & (df["DISCHARGE"] != 0)

    df["FLAG_Discharge"] = df["DISCHARGE"] > discharge_95th_percentile
    df["FLAG_IQR"] = (df["DISCHARGE"] < q1 - 1.5 * iqr) | (df["DISCHARGE"] > q3 + 1.5 * iqr)
    # NOTE(review): the day-to-day change is compared with the 95th percentile
    # of the discharge itself, not of the change -- confirm this is intended.
    df["FLAG_RoC"] = df["RATE_OF_CHANGE"] > discharge_95th_percentile

    df["FLAGGED"] = df[FLAG_COLUMNS].any(axis=1)
    return df


def label_season(dates):
    """Label each datetime "Irrigation Season" (1 April through 1 October) or "Non-Irrigation Season".

    ``dates`` must be a datetime-typed Series.
    """
    month = dates.dt.month
    in_irrigation = month.between(4, 9) | ((month == 10) & (dates.dt.day == 1))
    return np.where(in_irrigation, "Irrigation Season", "Non-Irrigation Season")


def summarize_station(station_id, flagged):
    """Collapse one station's flagged records into a single summary row (dict).

    ``flagged`` needs the ``FLAGGED`` and ``SEASON`` columns. A season with no
    records is reported as 0 flagged.
    """
    flagged_by_season = flagged.groupby("SEASON")["FLAGGED"].sum().to_dict()
    return {
        "STATION_ID": station_id,
        "TOTAL_RECORDS": len(flagged),
        "TOTAL_FLAGGED": int(flagged["FLAGGED"].sum()),
        "TOTAL_FLAGGED_IRRIGATION": int(flagged_by_season.get("Irrigation Season", 0)),
        "TOTAL_FLAGGED_NON_IRRIGATION": int(flagged_by_season.get("Non-Irrigation Season", 0)),
    }


# A notebook kernel executes cells with __name__ == "__main__", so the run below
# still happens when the cell is executed; the guard only keeps the helpers
# importable without touching the file system or the network.
if __name__ == "__main__":
    # Load metadata CSV (STATION_ID kept as a string)
    metadata_df = pd.read_csv(metadata_path, dtype={"STATION_ID": str})
    filtered_metadata = metadata_df.head(MAX_STATIONS).copy()
    site_ids = filtered_metadata["STATION_ID"].tolist()

    # Same end date for every station, even if the run crosses midnight.
    end_date = datetime.today().strftime("%Y-%m-%d")

    # Collect one dict per station and build the frame once at the end;
    # concatenating inside the loop copies the growing frame every iteration.
    station_summaries = []
    for DivrtID in tqdm(site_ids, desc="Processing Stations", unit="station"):
        station_records = fetch_station_discharge(DivrtID, end_date)
        if station_records is None:
            continue
        station_records["Date"] = pd.to_datetime(station_records["Date"])
        station_records["SEASON"] = label_season(station_records["Date"])
        station_summaries.append(
            summarize_station(DivrtID, flag_discharge_records(station_records))
        )

    if station_summaries:
        df_results = pd.DataFrame(station_summaries)
    else:
        # Keep the join key so the merge below still works when every station failed.
        df_results = pd.DataFrame({"STATION_ID": pd.Series(dtype=str)})

    merged_df = pd.merge(filtered_metadata, df_results, on="STATION_ID", how="left")
    merged_df.to_csv(output_filename, index=False)

    print(f"Final merged data saved as {output_filename}")

Processing Stations: 100%|██████████| 10/10 [00:09<00:00,  1.10station/s]

Final merged data saved as merged_seasonal_flagging_results.csv



