In [1]:
pip install sqlalchemy pyodbc

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
from getpass import getpass
from urllib.parse import quote

import numpy as np
import pandas as pd
import pyodbc
from sqlalchemy import create_engine
from tqdm import tqdm  # Standard tqdm for text-based progress bar

# Purpose: load dbo.DAILY_RECORDS, compute Q1 / Q3 / IQR of every daily "RV_*"
# column for each (STATION_ID, RECORD_YEAR) group, and write the results plus a
# log of groups without usable data to CSV files in `output_directory`.

# Define the connection parameters.
# Non-secret values keep their previous literals as defaults and can be
# overridden through environment variables.
server = os.environ.get("DVRT_SQL_SERVER", "wrt-sql-prod")
database = os.environ.get("DVRT_SQL_DATABASE", "dvrtDB")
username = os.environ.get("DVRT_SQL_USERNAME", "wrtsqlq")
# The password is not stored in the notebook: it is read from the environment,
# falling back to an interactive prompt when the variable is unset or empty.
password = os.environ.get("DVRT_SQL_PASSWORD") or getpass(
    f"Password for {username}@{server}: "
)

# Create SQLAlchemy engine.
# User name and password are percent-encoded so that characters such as
# '@', ':' or '/' cannot corrupt the connection URL.
connection_string = (
    f"mssql+pyodbc://{quote(username, safe='')}:{quote(password, safe='')}"
    f"@{server}/{database}?driver=ODBC+Driver+17+for+SQL+Server"
)
engine = create_engine(connection_string)

# Directory to save the results (override with IQR_OUTPUT_DIR)
output_directory = os.environ.get(
    "IQR_OUTPUT_DIR",
    r"C:\Users\pbenko\Documents\20250213_distribution_data\data",
)
print(f"Output directory: {output_directory}")  # Debugging print

# Create the output directory if it doesn't exist.
# exist_ok=True keeps this safe if the directory appears between check and create.
if not os.path.exists(output_directory):
    print("Creating output directory...")  # Debugging print
os.makedirs(output_directory, exist_ok=True)

# List to store invalid data (empty groups/columns)
invalid_data = []

try:
    # SQL Query to load all data
    query = "SELECT * FROM dbo.DAILY_RECORDS"

    # Load the full dataset
    print("✅ Trying to load data from SQL...")
    df = pd.read_sql(query, engine)
    print(f"✅ Data loaded successfully. Rows: {df.shape[0]}, Columns: {df.shape[1]}")

    # Identify columns related to days (RV_0101, RV_0102, ..., RV_1231)
    daily_cols = [col for col in df.columns if col.startswith("RV_")]
    print(f"✅ Identified {len(daily_cols)} daily columns.")

    # Initialize a list for storing IQR results
    iqr_results = []

    # Start processing with a progress bar (text-based)
    print("📊 Starting IQR calculation...")

    # Build the grouping once; it provides both the iterator and the group
    # count used as the progress-bar total.
    # NOTE(review): the recorded run shows 74,788 rows in 74,677 groups, so
    # most station-years hold a single row, for which Q1 == Q3 and the IQR is
    # always 0 -- confirm this grouping is the intended one.
    grouped = df.groupby(["STATION_ID", "RECORD_YEAR"])

    for (station, year), group in tqdm(grouped, desc="Processing Stations & Years", total=grouped.ngroups):
        # Skip empty group (no data for the station-year combination)
        if group.empty:
            invalid_data.append({"STATION_ID": station, "RECORD_YEAR": year, "reason": "No data"})
            continue

        # Quartiles per day column. Each column is sliced and NaN-filtered
        # once, and both percentiles come from a single np.percentile call.
        day_stats = {}
        for col in daily_cols:
            valid_data = group[col].dropna()
            if valid_data.size == 0:
                # Day column holds only NaN for this station-year
                continue

            Q1, Q3 = np.percentile(valid_data, [25, 75])  # 25th / 75th percentile
            IQR = Q3 - Q1  # Interquartile Range
            day_stats[col] = {"Q1": Q1, "Q3": Q3, "IQR": IQR}

        if not day_stats:
            # Skip if no valid day columns for this station-year
            invalid_data.append({"STATION_ID": station, "RECORD_YEAR": year, "reason": "No valid day data"})
            continue

        # Append results for this station and year (key columns first)
        iqr_for_station_year = {"STATION_ID": station, "RECORD_YEAR": year}
        iqr_for_station_year.update(day_stats)
        iqr_results.append(iqr_for_station_year)

    # Convert the list of dictionaries into a DataFrame for easier analysis
    iqr_df = pd.DataFrame(iqr_results)

    # Print the first few rows of IQR results for debugging
    print("Sample of IQR Results (first few rows):")
    print(iqr_df.head())  # Debugging print

    # Save the IQR results to CSV
    # NOTE(review): every RV_* cell holds a dict, so the CSV stores its string
    # representation; flattening to <col>_Q1 / <col>_Q3 / <col>_IQR columns
    # would be easier to read back -- confirm with consumers before changing.
    output_file = os.path.join(output_directory, "combined_iqr_results.csv")
    print(f"Saving IQR results to: {output_file}")  # Debugging print
    iqr_df.to_csv(output_file, index=False)

    print(f"✅ IQR results saved to '{output_file}'.")

    # If there is invalid data (no valid day columns for a station-year), save it to a separate CSV
    if invalid_data:
        invalid_df = pd.DataFrame(invalid_data)
        invalid_data_file = os.path.join(output_directory, "invalid_data_log.csv")
        print(f"Saving invalid data to: {invalid_data_file}")  # Debugging print
        invalid_df.to_csv(invalid_data_file, index=False)
        print(f"⚠️ Invalid data saved to '{invalid_data_file}'.")

except Exception as e:
    print(f"❌ Error: {e}")
    # Re-raise so the traceback is shown and the cell is marked as failed
    # instead of silently appearing to succeed.
    raise

Output directory: C:\Users\pbenko\Documents\20250213_distribution_data\data
✅ Trying to load data from SQL...
✅ Data loaded successfully. Rows: 74788, Columns: 374
✅ Identified 372 daily columns.
📊 Starting IQR calculation...


Processing Stations & Years: 100%|██████████| 74677/74677 [3:17:43<00:00,  6.29it/s]  


Sample of IQR Results (first few rows):
   STATION_ID RECORD_YEAR                               RV_0401  \
0           1        1950    {'Q1': 0.0, 'Q3': 0.0, 'IQR': 0.0}   
1           1        1951    {'Q1': 0.0, 'Q3': 0.0, 'IQR': 0.0}   
2           1        1952                                   NaN   
3           1        1953  {'Q1': 36.0, 'Q3': 36.0, 'IQR': 0.0}   
4           1        1954    {'Q1': 0.0, 'Q3': 0.0, 'IQR': 0.0}   

                                RV_0402                               RV_0403  \
0    {'Q1': 0.0, 'Q3': 0.0, 'IQR': 0.0}    {'Q1': 0.0, 'Q3': 0.0, 'IQR': 0.0}   
1    {'Q1': 0.0, 'Q3': 0.0, 'IQR': 0.0}    {'Q1': 0.0, 'Q3': 0.0, 'IQR': 0.0}   
2                                   NaN                                   NaN   
3  {'Q1': 31.0, 'Q3': 31.0, 'IQR': 0.0}  {'Q1': 36.0, 'Q3': 36.0, 'IQR': 0.0}   
4    {'Q1': 0.0, 'Q3': 0.0, 'IQR': 0.0}    {'Q1': 0.0, 'Q3': 0.0, 'IQR': 0.0}   

                                RV_0404                               