In [1]:
import pandas as pd
import numpy as np
import os
import glob
from joblib import Parallel, delayed


In [2]:
def calculate_z_scaled_fdc(df: pd.DataFrame) -> np.ndarray:
    """
    Calculate the Z-scaled Flow Duration Curve (FDC) for a given DataFrame.

    The streamflow column is coerced to numeric and every entry that is
    non-numeric, NaN, infinite or negative is discarded. The 100th down to
    the 0th percentile (in steps of 1) of the remaining values are computed
    and then standardised to zero mean and unit (population) standard
    deviation.

    The input DataFrame is left untouched; cleaning is done on a copy of the
    column values.

    Parameters:
    - df: pandas DataFrame with 'Streamflow (m3/s)' column.

    Returns:
    - z_scaled: numpy array with 101 Z-scaled streamflow values, ordered from
      the 100th percentile (index 0) to the 0th percentile (index 100). Every
      value is NaN when no valid data remain after cleaning, or when the
      curve is flat (all percentiles equal) and the Z-score is undefined.

    Raises:
    - KeyError: if df has no 'Streamflow (m3/s)' column.
    """
    column = 'Streamflow (m3/s)'

    # Percentiles 100, 99, ..., 0 (101 values).
    percentiles = np.arange(100, -1, -1)

    # Coerce into a fresh float array so the caller's DataFrame is never
    # modified; unparsable entries become NaN.
    flows = pd.to_numeric(df[column], errors='coerce').to_numpy(dtype=float)

    # Drop NaN / +-inf first, then negative flows. Infinite values would
    # otherwise turn the mean and standard deviation into inf/NaN.
    flows = flows[np.isfinite(flows)]
    flows = flows[flows >= 0]

    if flows.size == 0:
        # Nothing usable is left: return an all-NaN curve of the usual length.
        return np.full(percentiles.size, np.nan)

    streamflow_percentiles = np.percentile(flows, percentiles)

    # The percentiles are monotonic, so equal end points mean a flat curve.
    # Its standard deviation is zero and the Z-score is undefined; return NaN
    # explicitly rather than dividing by zero (which also avoids rounding in
    # the mean turning 0/0 into spurious +-1 values).
    if streamflow_percentiles[0] == streamflow_percentiles[-1]:
        return np.full(percentiles.size, np.nan)

    # Z-scale the data (mean 0, std 1)
    z_scaled = (streamflow_percentiles - streamflow_percentiles.mean()) / streamflow_percentiles.std()

    return z_scaled

def process_single_csv_file(csv_file_path: str) -> pd.Series:
    """
    Process a single CSV file: read, compute Z-scaled FDC, and return a Series.

    Processing is best-effort: any error while reading the file or computing
    the curve is printed and reported as a row of NaN values, so one bad file
    does not abort a batch run and the failed file stays identifiable.

    Parameters:
    - csv_file_path: Path to the input CSV file.

    Returns:
    - A pandas Series with a default positional index. Element 0 is the file
      name without its extension; it is followed by the Z-scaled FDC values
      returned by calculate_z_scaled_fdc. On failure the values are 101 NaNs.
    """
    # Length of the NaN row written for a failed file; 101 matches the
    # number of value columns in the summary table.
    n_values = 101

    # Fallback label, used only if the path itself cannot be parsed.
    file_name = str(csv_file_path)

    try:
        # Extract the file name without extension
        file_name = os.path.splitext(os.path.basename(csv_file_path))[0]

        # Read the CSV file and calculate its Z-scaled FDC
        df = pd.read_csv(csv_file_path)
        fdc_values = calculate_z_scaled_fdc(df).tolist()

    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a batch job.
        print(f"Error processing file {csv_file_path}: {e}")
        fdc_values = [np.nan] * n_values

    # Create a Series with the filename and the Z-scaled FDC values
    return pd.Series([file_name] + fdc_values)

def process_all_csv_files_in_folder(folder_path: str, save_path: str, n_jobs: int = -1) -> None:
    """
    Process all CSV files in the given folder in parallel, computing Z-scaled FDCs and saving a summary file.

    Files are processed in sorted path order so the rows of the summary are
    reproducible between runs. Results that come back empty (a file that
    could not be processed) are left out of the summary and counted in a
    final message. Nothing is written when the folder has no CSV files or
    when no file produced a result.

    Parameters:
    - folder_path: Path to the folder containing input CSV files.
    - save_path: Path to save the summary CSV file.
    - n_jobs: Number of parallel jobs. Default is -1 (use all available cores).

    Returns:
    - None. The summary is written to save_path with a 'File' column followed
      by columns labelled 0 to 100.
    """
    # Find all CSV files in the specified folder. glob returns matches in
    # arbitrary order, so sort them for a deterministic row order.
    csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))

    if not csv_files:
        print(f"No CSV files found in the folder {folder_path}.")
        return

    print(f"Found {len(csv_files)} CSV files in {folder_path}. Starting processing...")

    # Process files in parallel and collect results
    results = Parallel(n_jobs=n_jobs)(
        delayed(process_single_csv_file)(csv_file_path)
        for csv_file_path in csv_files
    )

    # An empty Series carries neither a file name nor values; keeping it would
    # add an anonymous all-NaN row, or break the column assignment below when
    # every result is empty.
    rows = [result for result in results if len(result) > 0]
    n_skipped = len(results) - len(rows)

    if not rows:
        print(f"None of the {len(csv_files)} files could be processed. No summary was written.")
        return

    # Convert the list of Series to a DataFrame
    summary_df = pd.DataFrame(rows)

    # Define column names: first column is 'File', others are 0, 1, 2, ..., 100
    summary_df.columns = ['File'] + list(range(101))

    # Save the summary DataFrame to a CSV file
    summary_df.to_csv(save_path, index=False)

    print(f"Finished processing {len(csv_files)} files. Summary saved to {save_path}.")
    if n_skipped:
        print(f"{n_skipped} file(s) could not be processed and were left out of the summary.")

# Script entry point: summarise every gauge CSV in one folder.
if __name__ == '__main__':
    # Folder containing the input CSV files (replace with your own path).
    folder_path = '/Users/yubinbaaniya/Documents/WORLD BIAS/saber workdir/gauge_data'
    # Destination of the summary CSV file (replace with your own path).
    save_path = '/Users/yubinbaaniya/Documents/WORLD BIAS/saber workdir/z_scaled_gauge_2nd_iteration.csv'

    # Run the batch on all available cores and write the summary.
    process_all_csv_files_in_folder(
        folder_path=folder_path,
        save_path=save_path,
        n_jobs=-1,
    )


Found 20553 CSV files in /Users/yubinbaaniya/Documents/WORLD BIAS/saber workdir/gauge_data. Starting processing...
Finished processing 20553 files. Summary saved to /Users/yubinbaaniya/Documents/WORLD BIAS/saber workdir/z_scaled_gauge_2nd_iteration.csv.
