# Load Streamflow ML results

## filter q (bfd=1)

In [1]:
import pandas as pd
import os

# from main_jupyter import final_measurements_delta

# Define the directory containing the CSV files
directory = '../data/raw/streamflow/GSLB_ML'

# Initialize an empty DataFrame to store the results
compiled_data = pd.DataFrame(columns=['gage_id', 'date', 'q', 'bfd'])

# Iterate over each file in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(directory, filename)

        # Read the CSV file
        df = pd.read_csv(file_path)

        # Filter rows where ML_BFD is 1
        filtered_df = df[df['ML_BFD'] == 1]

        # Extract gage_id from the filename (assuming filename is the gage_id)
        gage_id = os.path.splitext(filename)[0]

        # Add a new column for gage_id
        filtered_df['gage_id'] = gage_id

        # Select and rename the necessary columns
        filtered_df = filtered_df[['gage_id', 'date','Q', 'ML_BFD']]
        filtered_df.columns = ['gage_id', 'date', 'q', 'bfd']

        # Append to the compiled DataFrame
        compiled_data = pd.concat([compiled_data, filtered_df], ignore_index=True)

# Display the compiled DataFrame
compiled_data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['gage_id'] = gage_id
  compiled_data = pd.concat([compiled_data, filtered_df], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['gage_id'] = gage_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['gage_id'] = gage_id
A value is trying to be set o

Unnamed: 0,gage_id,date,q,bfd
0,10015900,1958-04-01,0.0,1.0
1,10015900,1958-04-02,0.0,1.0
2,10015900,1958-04-03,0.0,1.0
3,10015900,1958-04-04,0.0,1.0
4,10015900,1958-04-05,0.0,1.0
...,...,...,...,...
900051,10058600,1986-09-10,35.0,1.0
900052,10058600,1986-09-14,33.6,1.0
900053,10058600,1986-09-24,32.9,1.0
900054,10058600,1986-09-25,34.4,1.0


## streamflow outliers

In [8]:
import pandas as pd
import numpy as np
from scipy import stats


class SimpleOutlierDetector:
    """Simplified outlier detection class"""

    def __init__(self, data, column):
        self.data = data.copy()
        self.column = column
        self.results = None

    def detect_outliers(self, zscore_threshold=3.0, iqr_multiplier=1.5):
        """Detect outliers"""
        data = self.data.copy()
        # Initialize outlier flag columns
        data['is_outlier_zscore'] = False
        data['is_outlier_iqr'] = False

        # 1. Z-score method
        try:
            z_scores = np.abs(stats.zscore(data[self.column], nan_policy='omit'))
            data['is_outlier_zscore'] = z_scores > zscore_threshold
        except:
            pass

        # 2. IQR method
        try:
            Q1 = np.nanpercentile(data[self.column], 25)
            Q3 = np.nanpercentile(data[self.column], 75)
            IQR = Q3 - Q1
            lower_bound = Q1 - iqr_multiplier * IQR
            upper_bound = Q3 + iqr_multiplier * IQR
            data['is_outlier_iqr'] = (data[self.column] < lower_bound) | (data[self.column] > upper_bound)
        except:
            pass

        # Combined outlier detection
        data['is_outlier_any'] = data[['is_outlier_zscore', 'is_outlier_iqr']].any(axis=1)

        self.results = data
        return self.results

    def get_clean_data(self):
        """Get clean data"""
        if self.results is None:
            raise ValueError("Please run detect_outliers() method first")
        return self.results[~self.results['is_outlier_any']].copy()


# Usage example
detector = SimpleOutlierDetector(compiled_data, 'q')
outlier_results = detector.detect_outliers()
clean_data = detector.get_clean_data()

# Display clean data
print(clean_data.head())




    gage_id        date    q  bfd  is_outlier_zscore  is_outlier_iqr  \
0  10015900  1958-04-01  0.0  1.0              False           False   
1  10015900  1958-04-02  0.0  1.0              False           False   
2  10015900  1958-04-03  0.0  1.0              False           False   
3  10015900  1958-04-04  0.0  1.0              False           False   
4  10015900  1958-04-05  0.0  1.0              False           False   

   is_outlier_any  
0           False  
1           False  
2           False  
3           False  
4           False  


In [9]:
# Get detection results
outlier_results = detector.detect_outliers()

# Display total number of outliers detected
total_outliers = outlier_results['is_outlier_any'].sum()
print(f"Total number of outliers detected: {total_outliers}")

# Display number of outliers detected by each method
zscore_outliers = outlier_results['is_outlier_zscore'].sum()
iqr_outliers = outlier_results['is_outlier_iqr'].sum()

print(f"Number of outliers detected by Z-score method: {zscore_outliers}")
print(f"Number of outliers detected by IQR method: {iqr_outliers}")

# Mark records detected as outliers by both methods
outlier_results['is_outlier_both'] = outlier_results['is_outlier_zscore'] & outlier_results['is_outlier_iqr']

# Remove records that are outliers by both methods
clean_data = outlier_results[~outlier_results['is_outlier_both']].copy()

# Display summary of data cleaning
removed_count = outlier_results['is_outlier_both'].sum()
print(f"Number of outliers removed (detected by both methods): {removed_count}")
print(f"Number of records after cleaning: {len(clean_data)}")


Total number of outliers detected: 35377
Number of outliers detected by Z-score method: 14340
Number of outliers detected by IQR method: 35377
Number of outliers removed (detected by both methods): 14340
Number of records after cleaning: 885716


In [10]:
clean_data=clean_data[['gage_id','date','q','bfd']]

In [12]:
compiled_data=clean_data.copy()
compiled_data.head()

Unnamed: 0,gage_id,date,q,bfd
0,10015900,1958-04-01,0.0,1.0
1,10015900,1958-04-02,0.0,1.0
2,10015900,1958-04-03,0.0,1.0
3,10015900,1958-04-04,0.0,1.0
4,10015900,1958-04-05,0.0,1.0


In [13]:
# Define the output directory
output_directory = '../data/processed/streamflow'
os.makedirs(output_directory, exist_ok=True)

# Define the output file path
output_file_path = os.path.join(output_directory, 'q_bfd_1.csv')

# Save the compiled DataFrame to a CSV file
compiled_data.to_csv(output_file_path, index=False)

# Display the path where the file is saved
output_file_path


'../data/processed/streamflow/q_bfd_1.csv'