# Load Streamflow ML results

## filter q (bfd=1)

In [14]:
import pandas as pd
import os

# from main_jupyter import final_measurements_delta

# Define the directory containing the CSV files
directory = '../data/raw/streamflow/GSLB_ML'

# Initialize an empty DataFrame to store the results
compiled_data = pd.DataFrame(columns=['gage_id', 'date', 'q', 'bfd'])

# Iterate over each file in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(directory, filename)

        # Read the CSV file
        df = pd.read_csv(file_path)

        # Filter rows where ML_BFD is 1
        filtered_df = df[df['ML_BFD'] == 1]

        # Extract gage_id from the filename (assuming filename is the gage_id)
        gage_id = os.path.splitext(filename)[0]

        # Add a new column for gage_id
        filtered_df['gage_id'] = gage_id

        # Select and rename the necessary columns
        filtered_df = filtered_df[['gage_id', 'date','Q', 'ML_BFD']]
        filtered_df.columns = ['gage_id', 'date', 'q', 'bfd']

        # Append to the compiled DataFrame
        compiled_data = pd.concat([compiled_data, filtered_df], ignore_index=True)

# Display the compiled DataFrame
compiled_data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['gage_id'] = gage_id
  compiled_data = pd.concat([compiled_data, filtered_df], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['gage_id'] = gage_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['gage_id'] = gage_id
A value is trying to be set o

Unnamed: 0,gage_id,date,q,bfd
0,10015900,1958-04-01,0.0,1.0
1,10015900,1958-04-02,0.0,1.0
2,10015900,1958-04-03,0.0,1.0
3,10015900,1958-04-04,0.0,1.0
4,10015900,1958-04-05,0.0,1.0
...,...,...,...,...
900051,10058600,1986-09-10,35.0,1.0
900052,10058600,1986-09-14,33.6,1.0
900053,10058600,1986-09-24,32.9,1.0
900054,10058600,1986-09-25,34.4,1.0


## streamflow outliers

In [15]:
# Streamflow data preparation - no outlier detection needed
# Flow variations are natural and should not be treated as outliers

print("Processing streamflow data...")
print(f"Total streamflow records: {len(compiled_data):,}")
print(f"Number of unique gages: {compiled_data['gage_id'].nunique()}")

# Simply use the compiled data as clean data since flow variations are natural
clean_data = compiled_data.copy()

# Display basic statistics by gage
gage_stats = compiled_data.groupby('gage_id').agg({
    'q': ['count', 'min', 'max', 'mean', 'std'],
    'date': ['min', 'max']
}).round(2)

gage_stats.columns = ['record_count', 'min_flow', 'max_flow', 'mean_flow', 'std_flow', 'start_date', 'end_date']

print(f"\nStreamflow statistics by gage:")
print(gage_stats)

print(f"\nStreamflow data ready for use - no outlier removal applied")
print(f"Final record count: {len(clean_data):,}")

Processing streamflow data...
Total streamflow records: 900,056
Number of unique gages: 70

Streamflow statistics by gage:
          record_count  min_flow  max_flow  mean_flow  std_flow  start_date  \
gage_id                                                                       
10011500         21729     13.00     236.0      55.71     26.26  1942-07-28   
10015700         11305      0.00      37.0       4.74      5.23  1957-10-01   
10015900          6805      0.00      49.0       1.84      4.07  1958-04-01   
10016900          6868      3.20     177.0      48.85     24.38  1984-08-11   
10020100         12810      0.00     200.0      36.85     27.73  1961-10-01   
...                ...       ...       ...        ...       ...         ...   
10168500         15408      4.50      98.0      29.15     10.63  1930-10-01   
10171000         23782      0.00     292.0     132.99     39.94  1942-12-01   
10172700         20071      0.51       5.2       2.42      0.93  1958-06-26   
10172860

In [16]:
# Display streamflow data summary - no outlier detection applied
print("Streamflow data summary:")
print(f"Total records: {len(clean_data):,}")
print(f"Date range: {clean_data['date'].min()} to {clean_data['date'].max()}")
print(f"Flow range: {clean_data['q'].min():.2f} to {clean_data['q'].max():.2f} cfs")
print(f"Number of gages: {clean_data['gage_id'].nunique()}")

# Show sample data
print(f"\nSample of streamflow data:")
print(clean_data.head(10))

Streamflow data summary:
Total records: 900,056
Date range: 1902-10-01 to 2025-01-28
Flow range: -0.00 to 1890.00 cfs
Number of gages: 70

Sample of streamflow data:
    gage_id        date    q  bfd
0  10015900  1958-04-01  0.0  1.0
1  10015900  1958-04-02  0.0  1.0
2  10015900  1958-04-03  0.0  1.0
3  10015900  1958-04-04  0.0  1.0
4  10015900  1958-04-05  0.0  1.0
5  10015900  1958-04-06  0.0  1.0
6  10015900  1958-04-07  0.0  1.0
7  10015900  1958-04-08  0.0  1.0
8  10015900  1958-04-09  0.0  1.0
9  10015900  1958-04-10  0.0  1.0


In [17]:
clean_data=clean_data[['gage_id','date','q','bfd']]

In [18]:
compiled_data=clean_data.copy()
compiled_data.head()

Unnamed: 0,gage_id,date,q,bfd
0,10015900,1958-04-01,0.0,1.0
1,10015900,1958-04-02,0.0,1.0
2,10015900,1958-04-03,0.0,1.0
3,10015900,1958-04-04,0.0,1.0
4,10015900,1958-04-05,0.0,1.0


In [19]:
# Define the output directory
output_directory = '../data/processed/streamflow'
os.makedirs(output_directory, exist_ok=True)

# Define the output file path
output_file_path = os.path.join(output_directory, 'q_bfd_1.csv')

# Save the streamflow DataFrame (no outlier removal applied)
clean_data.to_csv(output_file_path, index=False)

print(f"Saved streamflow data to: {output_file_path}")
print(f"Records saved: {len(clean_data):,}")
print("Note: No outlier detection applied to streamflow data - flow variations are natural")

# Display the path where the file is saved
output_file_path

Saved streamflow data to: ../data/processed/streamflow/q_bfd_1.csv
Records saved: 900,056
Note: No outlier detection applied to streamflow data - flow variations are natural


'../data/processed/streamflow/q_bfd_1.csv'

# PCHIP well wte

In [None]:
import pandas as pd

# Load and examine the original groundwater time series data
well_ts = pd.read_csv('../data/raw/groundwater/GSLB_1900-2023_TS_with_aquifers.csv')
well_ts.columns = well_ts.columns.str.lower()

print("=== ORIGINAL GROUNDWATER DATA VALIDATION ===")
print(f"Data shape: {well_ts.shape}")
print(f"Columns: {well_ts.columns.tolist()}")

# Critical: Examine the original date format
print(f"\n=== DATE FORMAT ANALYSIS ===")
print("Sample date values (original format):")
print(well_ts['date'].head(20).tolist())

print(f"\nDate column data type: {well_ts['date'].dtype}")
print(f"Unique date count: {well_ts['date'].nunique():,}")

# Try to parse dates and check year distribution
print(f"\n=== DATETIME CONVERSION TEST ===")
try:
    test_dates = pd.to_datetime(well_ts['date'], errors='coerce')
    valid_dates = test_dates.dropna()
    
    print(f"Successfully parsed dates: {len(valid_dates):,} / {len(well_ts):,}")
    print(f"Failed to parse: {test_dates.isna().sum():,}")
    
    if len(valid_dates) > 0:
        years = valid_dates.dt.year
        print(f"\nYear range: {years.min()} to {years.max()}")
        print(f"Year distribution (top 10):")
        year_counts = years.value_counts().sort_index().head(10)
        for year, count in year_counts.items():
            print(f"  {year}: {count:,} records")
        
        print(f"\nYear distribution (bottom 10):")
        year_counts_bottom = years.value_counts().sort_index().tail(10)
        for year, count in year_counts_bottom.items():
            print(f"  {year}: {count:,} records")
        
        # Check for potential year issues
        problematic_years = years[(years < 1900) | (years > 2030)]
        if len(problematic_years) > 0:
            print(f"\n*** WARNING: Found {len(problematic_years)} records with problematic years ***")
            print(f"Problematic year range: {problematic_years.min()} to {problematic_years.max()}")
        else:
            print(f"\n✓ All years appear reasonable (1900-2030)")
            
except Exception as e:
    print(f"Error parsing dates: {e}")

# Display basic data info
print(f"\n=== BASIC DATA INFO ===")
well_ts.info()

In [22]:
import pandas as pd


def analyze_well_time_spans(well_ts):
    """
    Analyze well time spans and calculate key statistics
    Args:
        well_ts: Dictionary with well data containing start_date, end_date and data_points
    Returns:
        DataFrame with time span statistics
    """
    data = []
    for well_id, well_data in well_ts.items():
        start_date = pd.to_datetime(well_data['start_date'])
        end_date = pd.to_datetime(well_data['end_date'])
        time_span = (end_date - start_date).days / 365.0
        density = well_data['data_points'] / time_span if time_span > 0 else 0
        data.append({
            "well_id": well_id,
            "time_span_years": time_span,
            "data_density": density,
        })
    return pd.DataFrame(data)


def generate_time_span_report(df):
    """
    Generate summary report from well statistics DataFrame
    Args:
        df: DataFrame with well statistics
    """
    print(f"\nSummary Statistics:")
    print(f"- Total wells: {len(df):,}")
    if 'time_span_years' in df:
        print(f"- Time span range: {df['time_span_years'].min():.1f} - {df['time_span_years'].max():.1f} years")
        print(f"- Average time span: {df['time_span_years'].mean():.1f} years")
    if 'data_density' in df:
        print(f"- Data density range: {df['data_density'].min():.1f} - {df['data_density'].max():.1f} points/year")


# Example usage
well_ts = {
    "well_1": {"start_date": "2015-01-01", "end_date": "2020-01-01", "data_points": 50},
    "well_2": {"start_date": "2010-06-15", "end_date": "2022-06-15", "data_points": 100}
}

stats_df = analyze_well_time_spans(well_ts)
generate_time_span_report(stats_df)



Summary Statistics:
- Total wells: 2
- Time span range: 5.0 - 12.0 years
- Average time span: 8.5 years
- Data density range: 8.3 - 10.0 points/year


## detect outliers

In [23]:
import pandas as pd
import numpy as np
from scipy import stats
import warnings

warnings.filterwarnings('ignore')


class SimpleGroundwaterOutlierDetector:
    """Simplified groundwater data outlier detection class - for interpolation preparation"""
    DATE_COLUMN = 'date'
    WELL_ID_COLUMN = 'well_id'

    def __init__(self, data):
        self.data = data.copy()
        self._validate_columns()
        # Convert date column during initialization
        self.data[self.DATE_COLUMN] = pd.to_datetime(self.data[self.DATE_COLUMN], errors='coerce')
        # Remove rows with invalid dates
        self.data = self.data.dropna(subset=[self.DATE_COLUMN])
        self.results = None

    def _validate_columns(self):
        """Validate that required columns exist in the data"""
        required_columns = [self.DATE_COLUMN, self.WELL_ID_COLUMN, 'wte']
        missing_columns = [col for col in required_columns if col not in self.data.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")

    def detect_outliers(self, min_points=5, zscore_threshold=3.0, iqr_multiplier=2):
        """
        Detect outliers using statistical methods only
        
        Args:
            min_points: Minimum points needed for statistical tests 
            zscore_threshold: Z-score threshold for outlier detection
            iqr_multiplier: IQR multiplier for outlier detection
        """
        # Sort by well_id and date
        self.data = self.data.sort_values([self.WELL_ID_COLUMN, self.DATE_COLUMN]).reset_index(drop=True)
        results = []
        well_stats = []

        print(f"Processing {self.data[self.WELL_ID_COLUMN].nunique()} wells for outlier detection...")

        for well_id in self.data[self.WELL_ID_COLUMN].unique():
            well_data = self.data[self.data[self.WELL_ID_COLUMN] == well_id].copy()
            n_points = len(well_data)

            # Initialize outlier flags
            well_data['is_outlier_zscore'] = False
            well_data['is_outlier_iqr'] = False
            well_data['is_outlier'] = False

            if n_points >= min_points:
                wte_values = well_data['wte'].values

                # 1. Statistical outlier detection (Z-score method)
                try:
                    z_scores = np.abs(stats.zscore(wte_values, nan_policy='omit'))
                    is_zscore_outlier = z_scores > zscore_threshold
                    well_data['is_outlier_zscore'] = is_zscore_outlier
                except:
                    pass

                # 2. Statistical outlier detection (IQR method)
                try:
                    Q1, Q3 = np.nanpercentile(wte_values, [25, 75])
                    IQR = Q3 - Q1
                    if IQR > 0:
                        lower_bound = Q1 - iqr_multiplier * IQR
                        upper_bound = Q3 + iqr_multiplier * IQR
                        is_iqr_outlier = (wte_values < lower_bound) | (wte_values > upper_bound)
                        well_data['is_outlier_iqr'] = is_iqr_outlier
                except:
                    pass

                # Combine outlier detection methods (conservative approach)
                # Flag as outlier only if detected by both methods
                well_data['is_outlier'] = well_data['is_outlier_zscore'] & well_data['is_outlier_iqr']

                # Collect statistics for this well
                well_stats.append({
                    'well_id': well_id,
                    'total_points': n_points,
                    'zscore_outliers': well_data['is_outlier_zscore'].sum(),
                    'iqr_outliers': well_data['is_outlier_iqr'].sum(),
                    'total_outliers': well_data['is_outlier'].sum(),
                    'outlier_percentage': (well_data['is_outlier'].sum() / n_points * 100),
                    'date_range_days': (well_data[self.DATE_COLUMN].max() - 
                                      well_data[self.DATE_COLUMN].min()).days,
                    'wte_range': well_data['wte'].max() - well_data['wte'].min()
                })

            else:
                # For wells with insufficient data, don't flag any outliers
                well_data['is_outlier_zscore'] = False
                well_data['is_outlier_iqr'] = False
                well_data['is_outlier'] = False

            results.append(well_data)

        if results:
            self.results = pd.concat(results, ignore_index=True)
            
            # Display summary statistics
            stats_df = pd.DataFrame(well_stats)
            if len(stats_df) > 0:
                print(f"\nWell outlier detection summary:")
                print(f"- Wells processed: {len(stats_df)}")
                print(f"- Total outliers detected: {self.results['is_outlier'].sum():,}")
                print(f"- By Z-score method: {self.results['is_outlier_zscore'].sum():,}")
                print(f"- By IQR method: {self.results['is_outlier_iqr'].sum():,}")
                
                print(f"\nOutlier percentage statistics:")
                print(f"- Mean outlier percentage per well: {stats_df['outlier_percentage'].mean():.2f}%")
                print(f"- Max outlier percentage per well: {stats_df['outlier_percentage'].max():.2f}%")
                print(f"- Wells with >10% outliers: {(stats_df['outlier_percentage'] > 10).sum()}")

        return self.results

    def get_clean_data(self):
        """Get clean data suitable for interpolation"""
        if self.results is None:
            return None

        clean_data = self.results[~self.results['is_outlier']].copy()

        # Print interpolation readiness stats
        well_stats = clean_data.groupby(self.WELL_ID_COLUMN).size()
        print(f"\nInterpolation readiness summary:")
        print(f"- Wells with no data after cleaning: {(well_stats == 0).sum()}")
        print(f"- Wells with 1-2 points: {((well_stats >= 1) & (well_stats <= 2)).sum()}")
        print(f"- Wells with 3+ points: {(well_stats >= 3).sum()} (suitable for PCHIP)")

        return clean_data


def clean_well_data_for_interpolation(well_ts, min_points=5):
    """Main function to clean groundwater data for interpolation"""
    detector = SimpleGroundwaterOutlierDetector(well_ts)
    detector.detect_outliers(min_points=min_points)
    return detector.get_clean_data()

In [24]:
# Reload the well_ts DataFrame since it was overwritten by the dictionary example
well_ts = pd.read_csv('../data/raw/groundwater/GSLB_1900-2023_TS_with_aquifers.csv')
well_ts.columns = well_ts.columns.str.lower()

# Now use the actual well_ts DataFrame for outlier detection
clean_data = clean_well_data_for_interpolation(well_ts)

Processing 8602 wells for outlier detection...

Well outlier detection summary:
- Wells processed: 2338
- Total outliers detected: 911
- By Z-score method: 1,049
- By IQR method: 3,247

Outlier percentage statistics:
- Mean outlier percentage per well: 0.57%
- Max outlier percentage per well: 9.09%
- Wells with >10% outliers: 0

Interpolation readiness summary:
- Wells with no data after cleaning: 0
- Wells with 1-2 points: 5664
- Wells with 3+ points: 2938 (suitable for PCHIP)


## PCHIP wte

In [25]:
import pandas as pd
from scipy.interpolate import PchipInterpolator

# Convert all column names to lowercase
clean_data.columns = clean_data.columns.str.lower()

# Convert date column to datetime format
clean_data['date'] = pd.to_datetime(clean_data['date'])

# Count data points for each well
well_counts = clean_data['well_id'].value_counts()
total_wells = len(well_counts)
wells_with_one_point = (well_counts == 1).sum()
wells_with_two_points = (well_counts == 2).sum()

# Print summary statistics
print(f"Total number of wells: {total_wells}")
print(f"Number of wells with only one data point: {wells_with_one_point}")
print(f"Number of wells with only two data points: {wells_with_two_points}")


Total number of wells: 8602
Number of wells with only one data point: 4837
Number of wells with only two data points: 827


In [26]:
import pandas as pd
import numpy as np
from scipy.interpolate import PchipInterpolator


def interpolate_daily_pchip(well_ts):
    """
    Perform daily PCHIP interpolation on groundwater well time series data
    
    Args:
        well_ts: DataFrame containing well_id, date, and wte (water table elevation) columns
        
    Returns:
        DataFrame with daily interpolated values for each well
    """
    well_ts = well_ts.copy()
    
    # Critical: Proper datetime conversion and validation
    print("Converting dates to datetime format...")
    well_ts['date'] = pd.to_datetime(well_ts['date'], errors='coerce')
    
    # Remove rows with invalid dates
    invalid_dates = well_ts['date'].isna()
    if invalid_dates.sum() > 0:
        print(f"Warning: {invalid_dates.sum()} rows with invalid dates removed")
        well_ts = well_ts.dropna(subset=['date'])
    
    # Validate date ranges to ensure reasonable years
    min_year = well_ts['date'].dt.year.min()
    max_year = well_ts['date'].dt.year.max()
    print(f"Date range: {min_year} to {max_year}")
    
    # Flag potential year issues
    if min_year < 1900 or max_year > 2030:
        print(f"WARNING: Potentially problematic date range detected!")
        print(f"Please verify the year information is correct")
    
    # Critical: Sort by well_id and date to ensure proper chronological order
    well_ts = well_ts.sort_values(['well_id', 'date']).reset_index(drop=True)
    print(f"Processing {well_ts['well_id'].nunique()} wells for PCHIP interpolation...")

    interpolated_list = []
    skipped_wells = 0

    for well_id, group in well_ts.groupby('well_id'):
        # Skip wells with less than 2 observations (minimum required for interpolation)
        if len(group) < 2:
            skipped_wells += 1
            continue

        # Ensure group is sorted by date (critical for proper interpolation)
        group = group.sort_values('date').reset_index(drop=True)
        
        # Check for duplicate dates and handle them
        if group['date'].duplicated().any():
            print(f"Warning: Well {well_id} has duplicate dates, averaging WTE values")
            group = group.groupby('date')['wte'].mean().reset_index()
            group['well_id'] = well_id

        # Get date range
        start_date = group['date'].min()
        end_date = group['date'].max()
        
        # Skip wells with very short time ranges (less than 1 day)
        if (end_date - start_date).days < 1:
            skipped_wells += 1
            continue

        # Generate daily date sequence 
        full_dates = pd.date_range(start=start_date, end=end_date, freq='D')

        # Convert dates to ordinal numbers for interpolation
        # This preserves the exact datetime information
        x_obs = group['date'].map(pd.Timestamp.toordinal)
        y_obs = group['wte'].values

        # Verify we have valid data for interpolation
        if len(x_obs) != len(y_obs) or len(x_obs) < 2:
            skipped_wells += 1
            continue

        # Perform PCHIP interpolation
        try:
            interpolator = PchipInterpolator(x_obs, y_obs)
            x_new = full_dates.map(pd.Timestamp.toordinal)
            y_new = interpolator(x_new)

            # Create interpolated DataFrame 
            df_interp = pd.DataFrame({
                'well_id': well_id,
                'date': full_dates,
                'wte': y_new
            })

            interpolated_list.append(df_interp)
            
        except Exception as e:
            print(f"Error interpolating well {well_id}: {e}")
            skipped_wells += 1
            continue

    print(f"Successfully interpolated {len(interpolated_list)} wells")
    print(f"Skipped {skipped_wells} wells due to insufficient data or errors")

    if interpolated_list:
        interpolated_df = pd.concat(interpolated_list, ignore_index=True)
        
        # Final validation of interpolated data
        print(f"\nInterpolation results:")
        print(f"- Total interpolated records: {len(interpolated_df):,}")
        print(f"- Date range: {interpolated_df['date'].min()} to {interpolated_df['date'].max()}")
        print(f"- WTE range: {interpolated_df['wte'].min():.2f} to {interpolated_df['wte'].max():.2f}")
        
        return interpolated_df
    else:
        print("No wells could be interpolated!")
        return pd.DataFrame()

In [27]:
well_info = pd.read_csv('../data/raw/groundwater/GSLB_1900-2023_wells_with_aquifers.csv')
well_info.columns = well_info.columns.str.lower()


In [28]:
well_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8752 entries, 0 to 8751
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   well_id       8752 non-null   int64  
 1   well_name     8752 non-null   object 
 2   lat_dec       8752 non-null   float64
 3   long_dec      8752 non-null   float64
 4   gse           8752 non-null   float64
 5   aquiferid     8752 non-null   int64  
 6   aquifer_name  8752 non-null   object 
 7   state         8752 non-null   object 
dtypes: float64(3), int64(2), object(3)
memory usage: 547.1+ KB


In [29]:
daily_interp_df = interpolate_daily_pchip(clean_data)
# Merge well information with well_info data
merged_data = pd.merge(daily_interp_df, well_info, on='well_id', how='left')

# Rename columns
merged_data.rename(columns={'lat_dec': 'well_lat', 'long_dec': 'well_lon'}, inplace=True)
merged_data = merged_data[['well_id', 'date', 'wte', 'well_lat', 'well_lon', 'gse']]
# Display merged data
merged_data.head()


Converting dates to datetime format...
Date range: 1906 to 2023
Processing 8602 wells for PCHIP interpolation...
Successfully interpolated 3765 wells
Skipped 4837 wells due to insufficient data or errors

Interpolation results:
- Total interpolated records: 26,260,962
- Date range: 1915-08-01 00:00:00 to 2023-12-21 00:00:00
- WTE range: 4072.00 to 7845.81


Unnamed: 0,well_id,date,wte,well_lat,well_lon,gse
0,382113113435401,2008-09-03,5395.95,38.353571,-113.732473,5775.0
1,382113113435401,2008-09-04,5395.950539,38.353571,-113.732473,5775.0
2,382113113435401,2008-09-05,5395.951113,38.353571,-113.732473,5775.0
3,382113113435401,2008-09-06,5395.951721,38.353571,-113.732473,5775.0
4,382113113435401,2008-09-07,5395.952363,38.353571,-113.732473,5775.0


In [30]:
merged_data.to_csv('../data/processed/well_pchip.csv', index=False)