# load data

In [None]:
import pandas as pd

In [None]:
pip install openpyxl

In [None]:
# Read the reference / non-reference workbook and preview its first rows.
gage_class = pd.read_excel(io='data/GAGES-II_ref_non_ref.xlsx')
gage_class.head(5)

In [None]:
# Read the gage list CSV and preview its first rows.
gage_in_gslb = pd.read_csv(filepath_or_buffer='shp/gsl_nwm_gage.csv')
gage_in_gslb.head(5)

In [None]:
import pandas as pd

# Reload the classification workbook and report what was loaded.
print("Loading gage_class from Excel file...")
try:
    gage_class = pd.read_excel('data/GAGES-II_ref_non_ref.xlsx')
except Exception as e:
    # Report and re-raise: carrying on would either fail with a NameError on
    # the lines below or silently describe a stale gage_class left over from
    # an earlier cell.
    print(f"Error loading gage_class: {e}")
    raise
else:
    print("Successfully loaded gage_class as DataFrame")

# Check the type and value of gage_class
print(f"Type of gage_class after loading: {type(gage_class)}")
print(gage_class.head())

# Ensure gage_class is not overwritten
# Add similar checks throughout your code where gage_class is used


In [None]:
# Attach the CLASS column of gage_class to every row of gage_in_gslb and
# summarise how many gages fall in each class.

# Both join keys must be strings so the merge compares like with like
gage_class['STAID'] = gage_class['STAID'].astype(str)
gage_in_gslb['id'] = gage_in_gslb['id'].astype(str)

# Left join keeps every gage_in_gslb row; gages with no match get a NaN class.
# The duplicate STAID key is dropped and CLASS is renamed to lowercase.
gage_in_gslb_with_class = (
    gage_in_gslb.merge(
        gage_class[['STAID', 'CLASS']],
        left_on='id',
        right_on='STAID',
        how='left'
    )
    .drop(columns='STAID')
    .rename(columns={'CLASS': 'class'})
)

# Calculate the total number of gages
total_gages = len(gage_in_gslb_with_class)

# Count the number of gages in each class (NaN classes are not counted)
class_counts = gage_in_gslb_with_class['class'].value_counts()

# Percentage of all gages (matched or not) that fall in each class
class_percentages = (class_counts / total_gages) * 100

# Display the results
print("Number of gages in each class:")
print(class_counts)
print("\nPercentage of gages in each class:")
# Format each value with its own unit instead of printing one stray "%"
# after the Series repr
print(class_percentages.round(2).map('{:.2f} %'.format).to_string())

# Unmatched gages explain why the percentages above may not add up to 100
unclassified_gages = int(gage_in_gslb_with_class['class'].isna().sum())
if unclassified_gages:
    print(f"\nGages without a class (no matching STAID): {unclassified_gages}")


In [None]:
# Keep only the rows whose class is 'Ref'
is_reference = gage_in_gslb_with_class['class'].eq('Ref')
ref_gages_in_gslb = gage_in_gslb_with_class.loc[is_reference]

# Show the descriptive columns for those gages
detail_columns = ['id', 'name', 'River', 'latitude', 'longitude', 'state', 'class']
ref_gages_in_gslb[detail_columns]

# plots

In [None]:
# Table with one row per delta_q / delta_wte pair, keyed by gage_id
delta_pairs_csv = 'downstream/all/q_buffer2_pair_delta_30m.csv'
final_result_cleaned = pd.read_csv(delta_pairs_csv)

In [None]:
# Both join keys are cast to strings before merging
final_result_cleaned['gage_id'] = final_result_cleaned['gage_id'].astype(str)
gage_class['STAID'] = gage_class['STAID'].astype(str)

# Left-join the CLASS column, then discard the duplicate key and lowercase
# the new column name
final_result_cleaned = (
    final_result_cleaned
    .merge(
        gage_class[['STAID', 'CLASS']],
        how='left',
        left_on='gage_id',
        right_on='STAID',
    )
    .drop(columns='STAID')
    .rename(columns={'CLASS': 'class'})
)


In [None]:
# Count the number of distinct Non-ref and Ref gages.
# final_result_cleaned holds many rows per gage (it is grouped by gage_id for
# the regressions), so counting rows per class would report measurements
# rather than gages; count unique gage_id values per class instead.
gage_counts = final_result_cleaned.groupby('class')['gage_id'].nunique()

# Display the counts
print("Number of Non-ref gages:", gage_counts.get('Non-ref', 0))
print("Number of Ref gages:", gage_counts.get('Ref', 0))


In [None]:
# Preview the first rows of final_result_cleaned
final_result_cleaned.head()

In [None]:
# Print column names, dtypes and non-null counts of final_result_cleaned
final_result_cleaned.info()

## Monthly

### time series

In [None]:
# Preview the first rows of final_result_cleaned before the monthly analysis
final_result_cleaned.head()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import linregress
import os

# Monthly time series: for every gage and calendar month, plot delta_q (top
# panel) and delta_wte (bottom panel) against date, fit a linear trend to each
# series, save the figure, and record the fit statistics for CSV export.
import matplotlib.dates as mdates

# Ensure the plots directory exists
os.makedirs('downstream/all/monthly/subplot_q_delta_wte_monthly', exist_ok=True)

# One record per gage/month for which the regressions could be fitted
stats_data = []


def _add_stats_box(ax, text):
    """Draw *text* in a rounded, semi-transparent box at the top-left of *ax*."""
    ax.text(
        0.05, 0.95,
        text,
        transform=ax.transAxes,
        fontsize=10,
        verticalalignment='top',
        bbox=dict(boxstyle='round', facecolor='white', alpha=0.5)
    )


# Assuming final_result_cleaned is already loaded as a DataFrame
grouped = final_result_cleaned.groupby('gage_id')

for gage_id, group in grouped:
    # Work on an independent copy so the column assignments below never
    # trigger chained-assignment warnings against the parent DataFrame
    group = group.dropna(subset=['delta_wte', 'date', 'delta_q']).copy()

    group['date'] = pd.to_datetime(group['date'])
    group['month'] = group['date'].dt.month
    group['month_name'] = group['date'].dt.strftime('%B')

    # Sort by date so the fitted lines are drawn left to right
    group = group.sort_values('date')

    for month in sorted(group['month'].unique()):
        monthly_data = group[group['month'] == month].copy()

        # Skip if insufficient data for this month
        if len(monthly_data) < 2:
            continue

        month_name = monthly_data['month_name'].iloc[0]
        n_points = len(monthly_data)
        n_wells = monthly_data['well_id'].nunique()

        # Both series share the date axis, so whether a trend can be fitted
        # (at least two distinct dates) is decided once for both panels
        can_fit = len(monthly_data['date'].unique()) > 1

        # Create subplots (Delta Q on top, Delta WTE on bottom)
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)

        ax1.scatter(monthly_data['date'], monthly_data['delta_q'],
                    color='orange', label='Delta Q (cfs)', alpha=0.6)
        ax2.scatter(monthly_data['date'], monthly_data['delta_wte'],
                    color='blue', label='Delta WTE', alpha=0.6)

        if can_fit:
            # Regress against the proleptic ordinal day number, so slopes are
            # expressed per day
            monthly_data['date_numeric'] = monthly_data['date'].map(pd.Timestamp.toordinal)

            slope_delta_q, intercept_delta_q, r_value_delta_q, p_value_delta_q, _ = linregress(
                monthly_data['date_numeric'], monthly_data['delta_q']
            )
            ax1.plot(monthly_data['date'],
                     intercept_delta_q + slope_delta_q * monthly_data['date_numeric'],
                     'r', label='Fitted line for Delta Q')

            slope_wte, intercept_wte, r_value_wte, p_value_wte, _ = linregress(
                monthly_data['date_numeric'], monthly_data['delta_wte']
            )
            ax2.plot(monthly_data['date'],
                     intercept_wte + slope_wte * monthly_data['date_numeric'],
                     'r', label='Fitted line for Delta WTE')

            stats_data.append({
                'gage_id': gage_id,
                'month': month,
                'month_name': month_name,
                'num_wells': n_wells,
                'num_measurements': n_points,
                'slope_delta_q': slope_delta_q,
                'r_squared_delta_q': r_value_delta_q ** 2,
                'p_value_delta_q': p_value_delta_q,
                'slope_wte': slope_wte,
                'r_squared_wte': r_value_wte ** 2,
                'p_value_wte': p_value_wte
            })

            legend_text_delta_q = (
                f"Delta Q Points: {n_points}\n"
                f"Slope Delta Q: {slope_delta_q:.6f}\n"
                f"R² Delta Q: {r_value_delta_q ** 2:.2f}\n"
                f"P-value Delta Q: {p_value_delta_q:.4f}"
            )
            legend_text_wte = (
                f"Wells: {n_wells}\n"
                f"Measurements: {n_points}\n"
                f"Slope WTE: {slope_wte:.6f}\n"
                f"R² WTE: {r_value_wte ** 2:.2f}\n"
                f"P-value WTE: {p_value_wte:.4f}"
            )
        else:
            legend_text_delta_q = (
                f"Delta Q Points: {n_points}\n"
                "Insufficient data for regression"
            )
            legend_text_wte = (
                f"Wells: {n_wells}\n"
                f"Measurements: {n_points}\n"
                "Insufficient data for regression"
            )

        ax1.set_ylabel('Delta Q (cfs)')
        ax1.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
        _add_stats_box(ax1, legend_text_delta_q)

        ax2.set_ylabel('Delta WTE (ft)')
        ax2.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
        _add_stats_box(ax2, legend_text_wte)

        # Set a single title for both subplots, closer to the plots
        fig.suptitle(f'Gage ID: {gage_id} - {month_name} - Delta Q and Delta WTE vs. Time',
                     fontsize=16, y=0.92)

        # Format the shared date axis (ax2 is the current axes for plt.xticks)
        ax2.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        ax2.xaxis.set_major_locator(mdates.AutoDateLocator())
        plt.xticks(rotation=45)

        fig.savefig(f'downstream/all/monthly/subplot_q_delta_wte_monthly/gage_{gage_id}_{month:02d}_{month_name}.png',
                    bbox_inches='tight')
        # Close this figure explicitly so figures do not accumulate in memory
        plt.close(fig)

# Export statistics to CSV
if stats_data:
    stats_df = pd.DataFrame(stats_data)
    stats_df.to_csv('downstream/all/monthly/delta_q_delta_wte_monthly_statistics.csv', index=False)

    print("Monthly Summary Statistics:")
    # Group on the numeric month so the summary runs in calendar order
    # (sorting the month names would list April, August, December, ...)
    for _, month_data in stats_df.groupby('month', sort=True):
        month_name = month_data['month_name'].iloc[0]

        print(f"\n{month_name}:")
        print(f"  Number of gages: {len(month_data)}")
        print(f"  Average R² Delta Q: {month_data['r_squared_delta_q'].mean():.3f}")
        print(f"  Average R² WTE: {month_data['r_squared_wte'].mean():.3f}")
        print(f"  Positive slope Delta Q: {(month_data['slope_delta_q'] > 0).mean() * 100:.1f}%")
        print(f"  Positive slope WTE: {(month_data['slope_wte'] > 0).mean() * 100:.1f}%")

    # Overall share of positive trends across all gage-month combinations
    overall_positive_delta_q = (stats_df['slope_delta_q'] > 0).mean() * 100
    overall_positive_wte = (stats_df['slope_wte'] > 0).mean() * 100

    print(f"\nOverall Statistics:")
    print(f"Total gage-month combinations: {len(stats_df)}")
    print(f"Percentage of positive slopes for Delta Q: {overall_positive_delta_q:.2f}%")
    print(f"Percentage of positive slopes for Delta WTE: {overall_positive_wte:.2f}%")

else:
    print("No statistical data collected - insufficient data for regression analysis")


### scatter plots

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import linregress
import os

# Monthly scatter plots: one figure per gage with one delta_q-vs-delta_wte
# panel per calendar month, a linear fit per panel where possible, and a CSV
# of the fit statistics.

# Ensure output directory exists
os.makedirs('downstream/all/monthly/scatter_plots_delta_q_vs_delta_wte', exist_ok=True)

# Load data
final_result_cleaned = pd.read_csv('downstream/all/q_buffer2_pair_delta_30m.csv')
gage_class = pd.read_excel('data/GAGES-II_ref_non_ref.xlsx')

# Merge gage classification information (join keys compared as strings)
final_result_cleaned['gage_id'] = final_result_cleaned['gage_id'].astype(str)
gage_class['STAID'] = gage_class['STAID'].astype(str)

final_result_cleaned = (
    final_result_cleaned.merge(
        gage_class[['STAID', 'CLASS']],
        left_on='gage_id',
        right_on='STAID',
        how='left'
    )
    .drop(columns='STAID')
    .rename(columns={'CLASS': 'class'})
)

# Add month information
final_result_cleaned['date'] = pd.to_datetime(final_result_cleaned['date'])
final_result_cleaned['month'] = final_result_cleaned['date'].dt.month

# Month name mapping
month_names = {
    1: 'January', 2: 'February', 3: 'March', 4: 'April',
    5: 'May', 6: 'June', 7: 'July', 8: 'August',
    9: 'September', 10: 'October', 11: 'November', 12: 'December'
}

# Prepare statistics collection
stats_data = []

# Number of gages for which a figure was actually written
plotted_gages = 0

# Group by gage_id
grouped = final_result_cleaned.groupby('gage_id')

# Create monthly scatter plots for each gage
for gage_id, group in grouped:
    group = group.dropna(subset=['delta_wte', 'delta_q'])

    # Check if enough data is available
    if len(group) < 2:
        continue

    # Get available months for this gage
    available_months = sorted(group['month'].unique())

    # Pick the smallest grid (never below 2x2) that holds every month
    n_months = len(available_months)
    if n_months <= 4:
        rows, cols = 2, 2
    elif n_months <= 6:
        rows, cols = 2, 3
    elif n_months <= 9:
        rows, cols = 3, 3
    else:
        rows, cols = 3, 4

    # Only this figure is created per gage; an additional empty plt.figure()
    # here would never be closed and would leak one figure per gage.
    fig, axes = plt.subplots(rows, cols, figsize=(15, 12))
    axes = axes.flatten()

    # Map class display names
    # NOTE(review): in GAGES-II, 'Ref' denotes reference (least-disturbed)
    # gages, so labelling 'Ref' as 'Regulated' and 'Non-ref' as 'Unregulated'
    # looks inverted - confirm the intended labels before publishing.
    if 'class' in group.columns and not group['class'].isna().all():
        class_display = group['class'].iloc[0]
        if class_display == 'Non-ref':
            class_display = 'Unregulated'
        elif class_display == 'Ref':
            class_display = 'Regulated'
    else:
        class_display = 'Unknown'

    # Create scatter plots for each month
    for idx, month in enumerate(available_months):
        if idx >= len(axes):
            break

        ax = axes[idx]
        month_data = group[group['month'] == month]

        # A NaN month (missing date) never equals itself, so it selects no rows
        if len(month_data) == 0:
            ax.set_visible(False)
            continue

        n_wells = month_data['well_id'].nunique()
        n_points = len(month_data)
        distinct_wte = month_data['delta_wte'].unique()

        # Plot scatter with different colors for each well
        sns.scatterplot(
            data=month_data,
            x='delta_wte',
            y='delta_q',
            hue='well_id',
            palette='viridis',
            edgecolor='none',
            legend=False,
            ax=ax
        )

        # A regression needs at least two points with distinct x values
        if n_points >= 2 and len(distinct_wte) > 1:
            slope, intercept, r_value, p_value, std_err = linregress(
                month_data['delta_wte'], month_data['delta_q']
            )

            # Plot regression line over the sorted distinct x values
            delta_wte_range = pd.Series(distinct_wte).sort_values()
            ax.plot(delta_wte_range, intercept + slope * delta_wte_range, 'r', linewidth=2)

            # Collect statistics
            stats_data.append({
                'gage_id': gage_id,
                'month': month,
                'month_name': month_names[month],
                'num_wells': n_wells,
                'num_measurements': n_points,
                'slope': slope,
                'intercept': intercept,
                'r_squared': r_value ** 2,
                'p_value': p_value,
                'class': class_display
            })

            legend_text = (
                f"Wells: {n_wells}\n"
                f"Measurements: {n_points}\n"
                f"Slope: {slope:.2f}\n"
                f"R²: {r_value ** 2:.2f}\n"
                f"p-value: {p_value:.4f}"
            )
        elif len(distinct_wte) == 1:
            legend_text = (
                f"Wells: {n_wells}\n"
                f"Measurements: {n_points}\n"
                "All delta_wte identical\nNo regression"
            )
        else:
            legend_text = (
                f"Wells: {n_wells}\n"
                f"Measurements: {n_points}\n"
                "Not enough data"
            )

        # Set subplot labels and title
        ax.set_title(f'{month_names[month]}', fontsize=12, fontweight='bold')
        ax.set_xlabel('Delta WTE (ft)', fontsize=10)
        ax.set_ylabel('Delta Q (cfs)', fontsize=10)

        # Horizontal grid lines only, on a white background
        ax.yaxis.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
        ax.xaxis.grid(False)
        ax.set_facecolor('white')

        # Add statistics text box
        ax.text(
            0.98, 0.95,
            legend_text,
            transform=ax.transAxes,
            fontsize=8,
            ha='right',
            va='top',
            linespacing=1.2,
            bbox=dict(
                facecolor='white',
                edgecolor='black',
                alpha=0.75,
                boxstyle='square,pad=0.3'
            )
        )

    # Hide unused subplots
    for idx in range(len(available_months), len(axes)):
        axes[idx].set_visible(False)

    # Set main title
    fig.suptitle(f'Gage ID: {gage_id} - Class: {class_display}\nDelta Q vs Delta WTE by Month',
                 fontsize=16, fontweight='bold', y=0.95)

    # Adjust layout and save plot
    fig.tight_layout()
    fig.subplots_adjust(top=0.90)
    fig.savefig(f'downstream/all/monthly/scatter_plots_delta_q_vs_delta_wte/gage_{gage_id}_monthly.png',
                bbox_inches='tight', dpi=300)
    plt.close(fig)
    plotted_gages += 1

# Export statistics
if stats_data:
    stats_df = pd.DataFrame(stats_data)
    stats_df.to_csv('downstream/all/monthly/scatter_delta_q_vs_delta_wte_monthly_statistics.csv', index=False)

    # Report only the gages that produced a figure (skipped gages excluded)
    print(f"Generated monthly scatter plots for {plotted_gages} gages")
    print(f"Total monthly analyses: {len(stats_data)}")

    # Monthly statistics, keyed by month number first so rows are in
    # calendar order rather than alphabetical order of the month names
    monthly_summary = stats_df.groupby(['month', 'month_name']).agg({
        'gage_id': 'count',
        'slope': ['mean', 'std'],
        'r_squared': ['mean', 'std'],
        'p_value': lambda x: (x < 0.05).sum()
    }).round(4)

    print("\nMonthly Summary:")
    print(monthly_summary)

    # Class statistics
    if 'class' in stats_df.columns:
        class_summary = stats_df.groupby('class').agg({
            'gage_id': 'count',
            'slope': ['mean', 'std'],
            'r_squared': ['mean', 'std'],
            'p_value': lambda x: (x < 0.05).sum()
        }).round(4)

        print("\nClass Summary:")
        print(class_summary)
else:
    print("No valid data found for analysis")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import linregress
import os

# Ensure required packages are installed
!pip install openpyxl

In [None]:
# Read the classification workbook with STAID cast to text for joining
gage_class = pd.read_excel('data/GAGES-II_ref_non_ref.xlsx').astype({'STAID': str})

In [None]:
# Read the gage list with its id column cast to text for joining
gage_in_gslb = pd.read_csv('shp/gsl_nwm_gage.csv').astype({'id': str})

In [None]:
# Left-join CLASS onto the gage list, drop the duplicate key and expose the
# classification as a lowercase 'class' column
class_lookup = gage_class[['STAID', 'CLASS']]
merged_gages = gage_in_gslb.merge(
    class_lookup,
    how='left',
    left_on='id',
    right_on='STAID',
)
gage_in_gslb_with_class = merged_gages.drop(columns='STAID').rename(columns={'CLASS': 'class'})

In [None]:
# Read the delta table and convert its date column to datetime in one pass
final_result_cleaned = (
    pd.read_csv('downstream/all/q_buffer2_pair_delta_30m.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [None]:
# Make sure both monthly plot folders exist before anything is saved
for plot_dir in (
    'downstream/all/monthly/subplot_q_delta_wte_monthly',
    'downstream/all/monthly/scatter_plots_delta_q_vs_delta_wte',
):
    os.makedirs(plot_dir, exist_ok=True)

In [None]:
# Month number (1-12) -> English month name, in calendar order
MONTH_NAMES = dict(enumerate(
    [
        'January', 'February', 'March', 'April',
        'May', 'June', 'July', 'August',
        'September', 'October', 'November', 'December',
    ],
    start=1,
))

In [None]:
# Simple monthly statistical report - Positive slope percentage and R²
def generate_simple_monthly_report(
    stats_df,
    output_path='downstream/all/monthly/simple_monthly_report.csv',
):
    """
    Print and export a per-month summary of regression slopes and R² values.

    For every month name present in ``stats_df['month_name']`` (visited in
    calendar order) the number of records, the percentage of records with a
    positive ``slope`` and the mean ``r_squared`` are printed and collected.
    An overall line across all records is printed last.

    Parameters
    ----------
    stats_df : pandas.DataFrame
        Must provide the columns 'month_name', 'slope' and 'r_squared'.
    output_path : str or None, optional
        CSV file the summary is written to. Missing parent directories are
        created. Pass None to skip writing the file.

    Returns
    -------
    pandas.DataFrame
        One row per month found, with columns 'Month', 'Total_Records',
        'Positive_Slopes_Percentage' and 'Mean_R_Squared'. The columns are
        present even when no month matched.
    """
    print("MONTHLY STATISTICAL REPORT")
    print("=" * 60)

    # Month order for proper chronological display
    month_order = ['January', 'February', 'March', 'April', 'May', 'June',
                   'July', 'August', 'September', 'October', 'November', 'December']
    report_columns = ['Month', 'Total_Records', 'Positive_Slopes_Percentage', 'Mean_R_Squared']

    monthly_results = []

    for month in month_order:
        month_data = stats_df[stats_df['month_name'] == month]
        if month_data.empty:
            continue

        total_records = len(month_data)
        positive_slopes = (month_data['slope'] > 0).sum()
        positive_percentage = (positive_slopes / total_records) * 100
        mean_r_squared = month_data['r_squared'].mean()

        monthly_results.append({
            'Month': month,
            'Total_Records': total_records,
            'Positive_Slopes_Percentage': positive_percentage,
            'Mean_R_Squared': mean_r_squared
        })

        print(f"{month:12} | Records: {total_records:3d} | Positive Slopes: {positive_percentage:5.1f}% | Mean R²: {mean_r_squared:.3f}")

    # Overall statistics; an empty input reports NaN instead of dividing by zero
    total_all = len(stats_df)
    positive_all = (stats_df['slope'] > 0).sum()
    positive_pct_all = (positive_all / total_all) * 100 if total_all else float('nan')
    mean_r2_all = stats_df['r_squared'].mean()

    print("-" * 60)
    print(f"{'OVERALL':12} | Records: {total_all:3d} | Positive Slopes: {positive_pct_all:5.1f}% | Mean R²: {mean_r2_all:.3f}")

    # Explicit columns keep the header intact even when no month was found
    results_df = pd.DataFrame(monthly_results, columns=report_columns)

    # Export to CSV
    if output_path is not None:
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        results_df.to_csv(output_path, index=False)

    return results_df

# Read back the statistics CSV written by the monthly scatter analysis and
# summarise it per month
monthly_stats_csv = 'downstream/all/monthly/scatter_delta_q_vs_delta_wte_monthly_statistics.csv'
monthly_stats = pd.read_csv(monthly_stats_csv)
simple_report = generate_simple_monthly_report(monthly_stats)


## Seasonal

### time series

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import linregress
import os

# Create the folder that receives the seasonal time-series figures
seasonal_plot_dir = 'downstream/all/seasonal/subplot_delta_q_delta_wte_seasonal'
os.makedirs(seasonal_plot_dir, exist_ok=True)

# Regression statistics gathered while plotting, exported to CSV afterwards
stats_data = list()

# Map a month number to its meteorological season
def get_season(month):
    """Return the season name for *month*, or None when it is not 1-12.

    December-February map to 'Winter', March-May to 'Spring', June-August to
    'Summer' and September-November to 'Fall'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Fall', (9, 10, 11)),
    )
    for season_name, months in season_months:
        if month in months:
            return season_name
    return None

# Seasonal time series: for every gage and season, plot delta_q (top panel)
# and delta_wte (bottom panel) against date, fit a linear trend to each
# series, save the figure, and record the fit statistics for CSV export.
import matplotlib.dates as mdates


def _add_stats_box(ax, text):
    """Draw *text* in a rounded, semi-transparent box at the top-left of *ax*."""
    ax.text(
        0.05, 0.95,
        text,
        transform=ax.transAxes,
        fontsize=10,
        verticalalignment='top',
        bbox=dict(boxstyle='round', facecolor='white', alpha=0.5)
    )


# Assuming final_result_cleaned is already loaded as a DataFrame
grouped = final_result_cleaned.groupby('gage_id')

for gage_id, group in grouped:
    # Work on an independent copy so the column assignments below never
    # trigger chained-assignment warnings against the parent DataFrame
    group = group.dropna(subset=['delta_wte', 'date', 'delta_q']).copy()

    group['date'] = pd.to_datetime(group['date'])
    group['month'] = group['date'].dt.month
    group['season'] = group['month'].apply(get_season)

    # Sort by date so the fitted lines are drawn left to right
    group = group.sort_values('date')

    for season in sorted(group['season'].unique()):
        seasonal_data = group[group['season'] == season].copy()

        # Skip if insufficient data for this season
        if len(seasonal_data) < 2:
            continue

        n_points = len(seasonal_data)
        n_wells = seasonal_data['well_id'].nunique()

        # Both series share the date axis, so whether a trend can be fitted
        # (at least two distinct dates) is decided once for both panels
        can_fit = len(seasonal_data['date'].unique()) > 1

        # Create subplots (Delta Q on top, Delta WTE on bottom)
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)

        ax1.scatter(seasonal_data['date'], seasonal_data['delta_q'],
                    color='orange', label='Delta Q (cfs)', alpha=0.6)
        ax2.scatter(seasonal_data['date'], seasonal_data['delta_wte'],
                    color='blue', label='Delta WTE', alpha=0.6)

        if can_fit:
            # Regress against the proleptic ordinal day number, so slopes are
            # expressed per day
            seasonal_data['date_numeric'] = seasonal_data['date'].map(pd.Timestamp.toordinal)

            slope_delta_q, intercept_delta_q, r_value_delta_q, p_value_delta_q, _ = linregress(
                seasonal_data['date_numeric'], seasonal_data['delta_q']
            )
            ax1.plot(seasonal_data['date'],
                     intercept_delta_q + slope_delta_q * seasonal_data['date_numeric'],
                     'r', label='Fitted line for Delta Q')

            slope_wte, intercept_wte, r_value_wte, p_value_wte, _ = linregress(
                seasonal_data['date_numeric'], seasonal_data['delta_wte']
            )
            ax2.plot(seasonal_data['date'],
                     intercept_wte + slope_wte * seasonal_data['date_numeric'],
                     'r', label='Fitted line for Delta WTE')

            stats_data.append({
                'gage_id': gage_id,
                'season': season,
                'num_wells': n_wells,
                'num_measurements': n_points,
                'slope_delta_q': slope_delta_q,
                'r_squared_delta_q': r_value_delta_q ** 2,
                'p_value_delta_q': p_value_delta_q,
                'slope_wte': slope_wte,
                'r_squared_wte': r_value_wte ** 2,
                'p_value_wte': p_value_wte
            })

            legend_text_delta_q = (
                f"Delta Q Points: {n_points}\n"
                f"Slope Delta Q: {slope_delta_q:.6f}\n"
                f"R² Delta Q: {r_value_delta_q ** 2:.2f}\n"
                f"P-value Delta Q: {p_value_delta_q:.4f}"
            )
            legend_text_wte = (
                f"Wells: {n_wells}\n"
                f"Measurements: {n_points}\n"
                f"Slope WTE: {slope_wte:.6f}\n"
                f"R² WTE: {r_value_wte ** 2:.2f}\n"
                f"P-value WTE: {p_value_wte:.4f}"
            )
        else:
            legend_text_delta_q = (
                f"Delta Q Points: {n_points}\n"
                "Insufficient data for regression"
            )
            legend_text_wte = (
                f"Wells: {n_wells}\n"
                f"Measurements: {n_points}\n"
                "Insufficient data for regression"
            )

        ax1.set_ylabel('Delta Q (cfs)')
        ax1.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
        _add_stats_box(ax1, legend_text_delta_q)

        ax2.set_ylabel('Delta WTE (ft)')
        ax2.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
        _add_stats_box(ax2, legend_text_wte)

        # Set a single title for both subplots, closer to the plots
        fig.suptitle(f'Gage ID: {gage_id} - {season} - Delta Q and Delta WTE vs. Time',
                     fontsize=16, y=0.92)

        # Format the shared date axis (ax2 is the current axes for plt.xticks)
        ax2.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        ax2.xaxis.set_major_locator(mdates.AutoDateLocator())
        plt.xticks(rotation=45)

        fig.savefig(f'downstream/all/seasonal/subplot_delta_q_delta_wte_seasonal/gage_{gage_id}_{season}.png',
                    bbox_inches='tight')
        # Close this figure explicitly so figures do not accumulate in memory
        plt.close(fig)

# Export statistics to CSV
if stats_data:
    stats_df = pd.DataFrame(stats_data)
    stats_df.to_csv('downstream/all/seasonal/delta_q_delta_wte_seasonal_statistics.csv', index=False)

    print("Seasonal Summary Statistics:")
    # Report seasons in the order of the year rather than alphabetically
    for season in ['Winter', 'Spring', 'Summer', 'Fall']:
        season_data = stats_df[stats_df['season'] == season]
        if season_data.empty:
            continue

        print(f"\n{season}:")
        print(f"  Number of gages: {len(season_data)}")
        print(f"  Average R² Delta Q: {season_data['r_squared_delta_q'].mean():.3f}")
        print(f"  Average R² WTE: {season_data['r_squared_wte'].mean():.3f}")
        print(f"  Positive slope Delta Q: {(season_data['slope_delta_q'] > 0).mean() * 100:.1f}%")
        print(f"  Positive slope WTE: {(season_data['slope_wte'] > 0).mean() * 100:.1f}%")

    # Overall share of positive trends across all gage-season combinations
    overall_positive_delta_q = (stats_df['slope_delta_q'] > 0).mean() * 100
    overall_positive_wte = (stats_df['slope_wte'] > 0).mean() * 100

    print(f"\nOverall Statistics:")
    print(f"Total gage-season combinations: {len(stats_df)}")
    print(f"Percentage of positive slopes for Delta Q: {overall_positive_delta_q:.2f}%")
    print(f"Percentage of positive slopes for Delta WTE: {overall_positive_wte:.2f}%")

else:
    print("No statistical data collected - insufficient data for regression analysis")


### scatter plots

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import linregress
import os

# Create the folder that receives the seasonal scatter plots
seasonal_scatter_dir = 'downstream/all/seasonal/delta_q_vs_delta_wte_by_gage_seasonal'
os.makedirs(seasonal_scatter_dir, exist_ok=True)

# Map a month number to its meteorological season
def get_season(month):
    """Return the season name for *month*, or None when it is not 1-12.

    December-February map to 'Winter', March-May to 'Spring', June-August to
    'Summer' and September-November to 'Fall'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Fall', (9, 10, 11)),
    )
    for season_name, months in season_months:
        if month in months:
            return season_name
    return None

# Read the paired delta-Q / delta-WTE records
final_result_cleaned = pd.read_csv('downstream/all/q_buffer2_pair_delta_30m.csv')

# Both join keys are cast to strings so the merge compares like with like
final_result_cleaned['gage_id'] = final_result_cleaned['gage_id'].astype(str)
gage_class['STAID'] = gage_class['STAID'].astype(str)

# Attach each gage's CLASS (left join keeps every record), then drop the
# redundant key column and expose the class under the lowercase name 'class'
final_result_cleaned = (
    final_result_cleaned
    .merge(
        gage_class[['STAID', 'CLASS']],
        how='left',
        left_on='gage_id',
        right_on='STAID',
    )
    .drop(columns='STAID')
    .rename(columns={'CLASS': 'class'})
)

# Parse the dates, then derive the month number and its season label;
# each step reads the column produced by the step before it
final_result_cleaned = final_result_cleaned.assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    month=lambda frame: frame['date'].dt.month,
    season=lambda frame: frame['month'].apply(get_season),
)

# Accumulator for the per-gage, per-season regression rows exported later
stats_data = []

# One group of records per gage
grouped = final_result_cleaned.groupby('gage_id')

# GAGES-II CLASS code -> label shown in the plot title. In GAGES-II, 'Ref'
# marks reference (least-disturbed) gages and 'Non-ref' the human-influenced
# ones, so 'Ref' is the unregulated class (the labels were previously swapped).
class_display_labels = {'Ref': 'Unregulated', 'Non-ref': 'Regulated'}

# Iterate over each gage
for gage_id, gage_group in grouped:
    # One scatter plot per season that has data. groupby yields the seasons in
    # sorted order and skips rows whose season is missing (e.g. rows without a
    # date), which would otherwise make sorting mixed None/str labels fail.
    for season, season_rows in gage_group.groupby('season'):
        # Keep only rows where both deltas are present
        seasonal_data = season_rows.dropna(subset=['delta_wte', 'delta_q'])

        # Skip if not enough data points
        if len(seasonal_data) < 2:
            continue

        # Scale delta_q by 365.25 for the y-axis.
        # NOTE(review): the axis is labelled acre-ft/year, which presumes
        # delta_q is stored in acre-ft/day - confirm against the input file.
        seasonal_data = seasonal_data.assign(
            delta_q_acre_ft_year=seasonal_data['delta_q'] * 365.25
        )
        num_wells = seasonal_data['well_id'].nunique()
        num_measurements = len(seasonal_data)

        # Explicit figure/axes so every call below targets this plot and the
        # figure can be closed by handle once it is saved
        fig, ax = plt.subplots(figsize=(10, 6))

        # Scatter plot with different colors for different wells
        sns.scatterplot(
            x='delta_wte',
            y='delta_q_acre_ft_year',
            data=seasonal_data,
            hue='well_id',
            palette='viridis',
            alpha=0.6,
            s=50,
            legend=False,  # one legend entry per well would swamp the plot
            ax=ax,
        )

        # A regression needs at least two distinct x values
        if seasonal_data['delta_wte'].nunique() > 1:
            slope, intercept, r_value, p_value, std_err = linregress(
                seasonal_data['delta_wte'], seasonal_data['delta_q_acre_ft_year']
            )
            r_squared = r_value ** 2

            # Plot the regression line
            sns.regplot(
                x='delta_wte',
                y='delta_q_acre_ft_year',
                data=seasonal_data,
                scatter=False,
                color='red',
                line_kws={'linewidth': 2},
                ax=ax,
            )

            # Collect statistics (one row per gage-season with a regression)
            stats_data.append({
                'gage_id': gage_id,
                'season': season,
                'num_wells': num_wells,
                'num_measurements': num_measurements,
                'slope': slope,
                'intercept': intercept,
                'r_squared': r_squared,
                'p_value': p_value,
                'std_err': std_err,
                'class': seasonal_data['class'].iloc[0] if 'class' in seasonal_data.columns else 'Unknown'
            })

            # Statistical text
            stats_text = (
                f"Wells: {num_wells}\n"
                f"Measurements: {num_measurements}\n"
                f"Slope: {slope:.2f} (acre-ft/year)/ft\n"
                f"R²: {r_squared:.3f}\n"
                f"P-value: {p_value:.4f}"
            )
        else:
            stats_text = (
                f"Wells: {num_wells}\n"
                f"Measurements: {num_measurements}\n"
                "All delta_wte values identical\nNo regression line"
            )

        # Map class codes to display labels; codes outside the table are shown
        # unchanged, and a missing class becomes 'Unknown'
        if 'class' in seasonal_data.columns and not seasonal_data['class'].isna().all():
            raw_class = seasonal_data['class'].iloc[0]
            class_display = class_display_labels.get(raw_class, raw_class)
        else:
            class_display = 'Unknown'

        # Add title and labels
        ax.set_title(f'Gage {gage_id} - {season} - Class: {class_display}\nDelta Q (acre-ft/year) vs Delta WTE', fontsize=14)
        ax.set_xlabel('Delta WTE (ft)', fontsize=12)
        ax.set_ylabel('Delta Q (acre-ft/year)', fontsize=12)
        ax.grid(True, alpha=0.3)

        # Add statistical text box in the upper-left corner (axes coordinates)
        ax.text(
            0.05, 0.95,
            stats_text,
            transform=ax.transAxes,
            fontsize=10,
            verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.8, edgecolor='black')
        )

        # Save the plot, then release the figure
        fig.savefig(f'downstream/all/seasonal/delta_q_vs_delta_wte_by_gage_seasonal/gage_{gage_id}_{season}_delta_q_vs_delta_wte.png',
                    bbox_inches='tight', dpi=300)
        plt.close(fig)

# Write the regression rows gathered by the plotting loop to CSV and print a
# digest of them; when no regression was possible, only report that.
if not stats_data:
    print("No valid data for regression analysis")
else:
    stats_df = pd.DataFrame(stats_data)
    stats_df.to_csv(
        'downstream/all/seasonal/delta_q_vs_delta_wte_seasonal_statistics.csv',
        index=False,
    )

    # Per-season digest, seasons visited in sorted (alphabetical) order
    print("Seasonal Summary Statistics:")
    for season in sorted(stats_df['season'].unique()):
        season_data = stats_df.loc[stats_df['season'] == season]

        season_r2 = season_data['r_squared']
        season_slopes = season_data['slope']
        # The mean of a boolean mask is the fraction of True entries
        pct_positive = season_slopes.gt(0).mean() * 100

        print("\n".join([
            f"\n{season}:",
            f"  Number of gage-season combinations: {len(season_data)}",
            f"  Average R²: {season_r2.mean():.3f}",
            f"  Median R²: {season_r2.median():.3f}",
            f"  Positive slope percentage: {pct_positive:.1f}%",
            f"  Average slope: {season_slopes.mean():.2f} (acre-ft/year)/ft",
        ]))

    # Digest across every gage-season row
    overall_pct_positive = stats_df['slope'].gt(0).mean() * 100
    print("\n".join([
        "\nOverall Statistics:",
        f"Total gage-season combinations: {len(stats_df)}",
        f"Average R²: {stats_df['r_squared'].mean():.3f}",
        f"Percentage with positive slope: {overall_pct_positive:.1f}%",
        f"Number of unique gages: {stats_df['gage_id'].nunique()}",
        f"Number of seasons represented: {stats_df['season'].nunique()}",
    ]))


In [None]:
# Simple seasonal statistical report - Positive slope percentage and R²
def _summarize_slopes(frame, slope_col, r_squared_col):
    """Return (percent of rows with a positive slope, mean R²) for *frame*.

    Both values are NaN for an empty frame, which avoids a division by zero.
    """
    n_rows = len(frame)
    if n_rows == 0:
        return float('nan'), float('nan')
    n_positive = (frame[slope_col] > 0).sum()
    return (n_positive / n_rows) * 100, frame[r_squared_col].mean()


def generate_simple_seasonal_report(
    stats_df,
    output_path='downstream/all/seasonal/simple_seasonal_delta_q_wte_report.csv',
):
    """Print and export the share of positive slopes and the mean R² per season.

    Args:
        stats_df: DataFrame with the columns 'slope_delta_q',
            'r_squared_delta_q', 'slope_wte' and 'r_squared_wte', plus either
            a 'season' column or a 'month_name' column (full English month
            names) from which the season is derived. The frame passed in is
            not modified.
        output_path: Where the per-season table is written as CSV. Pass None
            to skip the export.

    Returns:
        DataFrame with one row per season present in the data, in the order
        Spring, Summer, Fall, Winter. Rows whose season is none of these four
        count towards the printed OVERALL line only.

    Raises:
        KeyError: If stats_df has neither a 'season' nor a 'month_name' column.
    """
    if 'season' not in stats_df.columns:
        if 'month_name' not in stats_df.columns:
            raise KeyError(
                "stats_df needs a 'season' or 'month_name' column to build the seasonal report"
            )
        month_to_season = {
            'December': 'Winter', 'January': 'Winter', 'February': 'Winter',
            'March': 'Spring', 'April': 'Spring', 'May': 'Spring',
            'June': 'Summer', 'July': 'Summer', 'August': 'Summer',
            'September': 'Fall', 'October': 'Fall', 'November': 'Fall',
        }
        # assign() builds a new frame, so the caller's DataFrame stays untouched;
        # unrecognised month names fall back to 'Unknown'
        stats_df = stats_df.assign(
            season=stats_df['month_name'].map(month_to_season).fillna('Unknown')
        )

    print("SEASONAL STATISTICAL REPORT")
    print("=" * 60)

    # Season order for proper display
    season_order = ['Spring', 'Summer', 'Fall', 'Winter']
    report_columns = [
        'Season',
        'Total_Records',
        'Positive_Slopes_Delta_Q_Percentage',
        'Mean_R_Squared_Delta_Q',
        'Positive_Slopes_WTE_Percentage',
        'Mean_R_Squared_WTE',
    ]

    # Create summary for each season that actually occurs in the data
    seasonal_results = []
    for season in season_order:
        season_data = stats_df[stats_df['season'] == season]
        if season_data.empty:
            continue

        total_records = len(season_data)
        positive_percentage_delta_q, mean_r_squared_delta_q = _summarize_slopes(
            season_data, 'slope_delta_q', 'r_squared_delta_q'
        )
        positive_percentage_wte, mean_r_squared_wte = _summarize_slopes(
            season_data, 'slope_wte', 'r_squared_wte'
        )

        seasonal_results.append({
            'Season': season,
            'Total_Records': total_records,
            'Positive_Slopes_Delta_Q_Percentage': positive_percentage_delta_q,
            'Mean_R_Squared_Delta_Q': mean_r_squared_delta_q,
            'Positive_Slopes_WTE_Percentage': positive_percentage_wte,
            'Mean_R_Squared_WTE': mean_r_squared_wte
        })

        print(f"{season:12} | Records: {total_records:3d} | Positive Slopes Delta Q: {positive_percentage_delta_q:5.1f}% | Mean R² Delta Q: {mean_r_squared_delta_q:.3f}")
        print(f"{'':12} | {'':13} | Positive Slopes WTE: {positive_percentage_wte:5.1f}% | Mean R² WTE: {mean_r_squared_wte:.3f}")
        print("-" * 60)

    # Overall statistics (every row, whatever its season label)
    total_all = len(stats_df)
    positive_pct_all_delta_q, mean_r2_all_delta_q = _summarize_slopes(
        stats_df, 'slope_delta_q', 'r_squared_delta_q'
    )
    positive_pct_all_wte, mean_r2_all_wte = _summarize_slopes(
        stats_df, 'slope_wte', 'r_squared_wte'
    )

    print(f"{'OVERALL':12} | Records: {total_all:3d} | Positive Slopes Delta Q: {positive_pct_all_delta_q:5.1f}% | Mean R² Delta Q: {mean_r2_all_delta_q:.3f}")
    print(f"{'':12} | {'':13} | Positive Slopes WTE: {positive_pct_all_wte:5.1f}% | Mean R² WTE: {mean_r2_all_wte:.3f}")

    # Fixed columns keep the header in the CSV even when no season matched
    results_df = pd.DataFrame(seasonal_results, columns=report_columns)
    if output_path is not None:
        results_df.to_csv(output_path, index=False)

    return results_df

# Read back the exported per-gage seasonal regression statistics and
# summarise them per season
seasonal_stats = pd.read_csv(
    filepath_or_buffer='downstream/all/seasonal/delta_q_delta_wte_seasonal_statistics.csv'
)
simple_seasonal_report = generate_simple_seasonal_report(stats_df=seasonal_stats)
