Compute summary statistics for every station CSV file in the `gauge_data` folder.

In [None]:
import os
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm

# Function to process a single CSV file
def process_file(file_path):
    """Summarise one station CSV file as a flat dictionary of statistics.

    The file is expected to provide a ``Datetime`` column and a
    ``Streamflow (m3/s)`` column. Records with a negative or missing flow,
    or with a missing timestamp, are ignored.

    Args:
        file_path: Path of the CSV file to summarise.

    Returns:
        A dictionary with the file name (without extension), the first and
        last measurement timestamps, the number of calendar days spanned by
        them, the number of distinct days holding at least one valid record,
        the percentage of days without a record, the mean/min/max flow, and
        per-month record counts (all records, and records with flow > 0).
        ``None`` is returned when the file has no usable record or cannot be
        processed; in the latter case the error is printed.
    """
    flow_column = 'Streamflow (m3/s)'
    month_names = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ]

    try:
        df = pd.read_csv(file_path)

        # The >= 0 comparison is False for NaN, so missing flows are dropped too.
        df = df[df[flow_column] >= 0]
        if df.empty:
            return None

        # Parse the timestamps once and keep them as a separate Series so that
        # no column has to be assigned on the filtered slice of the frame.
        timestamps = pd.to_datetime(df['Datetime'])

        # A record without a timestamp cannot be placed in time; counting it
        # would distort the coverage figures below.
        has_timestamp = timestamps.notna()
        timestamps = timestamps[has_timestamp]
        flows = df.loc[has_timestamp, flow_column]
        if flows.empty:
            return None

        first_date = timestamps.min()
        last_date = timestamps.max()

        # Coverage is measured in calendar days. Counting rows instead would
        # yield a negative "% Missing" whenever a day holds several records.
        days = timestamps.dt.normalize()
        num_possible_days = (days.max() - days.min()).days + 1
        num_daily_measurements = int(days.nunique())
        percent_missing = ((num_possible_days - num_daily_measurements) / num_possible_days) * 100

        # Monthly counts are per record, pooled across all years.
        months = timestamps.dt.month_name()
        monthly_counts = months.value_counts().reindex(month_names, fill_value=0)
        non_zero_monthly_counts = months[flows > 0].value_counts().reindex(month_names, fill_value=0)

        filename = os.path.splitext(os.path.basename(file_path))[0]

        return {
            "Filename": filename,
            "First Measurement Date": first_date,
            "Last Measurement Date": last_date,
            "Number of Possible Days": num_possible_days,
            "Number of Daily Measurements": num_daily_measurements,
            "% Missing": percent_missing,
            "Average Flow (m3/s)": flows.mean(),
            "Min Flow (m3/s)": flows.min(),
            "Max Flow (m3/s)": flows.max(),
            **monthly_counts.add_suffix(" (Total)").to_dict(),
            **non_zero_monthly_counts.add_suffix(" (Non-Zero)").to_dict()
        }
    except Exception as e:
        # Best effort: one unreadable file must not abort a whole batch run.
        print(f"Error processing {file_path}: {e}")
        return None

# Main function to process all files in a folder
def process_all_files(folder_path, output_file):
    """Summarise every ``.csv`` file in a folder and save one row per file.

    Each file is handed to ``process_file`` in a joblib worker (all cores,
    ``n_jobs=-1``); files for which it returns ``None`` are left out of the
    summary.

    Args:
        folder_path: Folder containing the station CSV files.
        output_file: Path of the summary CSV to write.

    Returns:
        None. The summary is written to ``output_file``; nothing is written
        when no file yields a result.
    """
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # of the summary reproducible between runs and machines.
    csv_files = sorted(
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.endswith('.csv')
    )

    # tqdm wraps the task generator, so the bar advances as tasks are dispatched.
    results = Parallel(n_jobs=-1)(
        delayed(process_file)(file) for file in tqdm(csv_files, desc="Processing Files")
    )

    # Drop the files that could not be summarised.
    results = [res for res in results if res is not None]

    if not results:
        # Avoid writing a blank CSV and reporting it as a saved summary.
        print("No valid data to process.")
        return

    summary_df = pd.DataFrame(results)
    summary_df.to_csv(output_file, index=False)
    print(f"Summary file saved to {output_file}")

# Input folder holding one CSV file per station, and the path of the summary
# CSV to create. Both are machine-specific absolute paths; edit before running.
folder_path = "/Users/yubinbaaniya/Documents/WORLD BIAS/saber workdir/gauge_data"
output_file = "/Users/yubinbaaniya/Documents/GAUGE REVIEW/summary_statistics.csv"

# Summarise every CSV file in folder_path and write the result to output_file.
process_all_files(folder_path, output_file)


Read and process only the stations listed in a gauge-table CSV file (for example, the stations of a particular country).

In [None]:
import os
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm

# Function to process a single CSV file
def process_specific_file(file_path):
    """Summarise one gauge CSV file as a flat dictionary of statistics.

    The file is expected to provide a ``Datetime`` column and a
    ``Streamflow (m3/s)`` column. Records with a negative or missing flow,
    or with a missing timestamp, are ignored. The file name without its
    extension is reported as the gauge ID.

    Args:
        file_path: Path of the CSV file to summarise.

    Returns:
        A dictionary with the gauge ID, the first and last measurement
        timestamps, the number of calendar days spanned by them, the number
        of distinct days holding at least one valid record, the percentage of
        days without a record, the mean/min/max flow, and per-month record
        counts (all records, and records with flow > 0). ``None`` is returned
        when the file has no usable record or cannot be processed; in the
        latter case the error is printed.
    """
    flow_column = 'Streamflow (m3/s)'
    month_names = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ]

    try:
        df = pd.read_csv(file_path)

        # The >= 0 comparison is False for NaN, so missing flows are dropped too.
        df = df[df[flow_column] >= 0]
        if df.empty:
            return None

        # Parse the timestamps once and keep them as a separate Series so that
        # no column has to be assigned on the filtered slice of the frame.
        timestamps = pd.to_datetime(df['Datetime'])

        # A record without a timestamp cannot be placed in time; counting it
        # would distort the coverage figures below.
        has_timestamp = timestamps.notna()
        timestamps = timestamps[has_timestamp]
        flows = df.loc[has_timestamp, flow_column]
        if flows.empty:
            return None

        first_date = timestamps.min()
        last_date = timestamps.max()

        # Coverage is measured in calendar days. Counting rows instead would
        # yield a negative "% Missing" whenever a day holds several records.
        days = timestamps.dt.normalize()
        num_possible_days = (days.max() - days.min()).days + 1
        num_daily_measurements = int(days.nunique())
        percent_missing = ((num_possible_days - num_daily_measurements) / num_possible_days) * 100

        # Monthly counts are per record, pooled across all years.
        months = timestamps.dt.month_name()
        monthly_counts = months.value_counts().reindex(month_names, fill_value=0)
        non_zero_monthly_counts = months[flows > 0].value_counts().reindex(month_names, fill_value=0)

        gauge_id = os.path.splitext(os.path.basename(file_path))[0]

        summary = {
            "Gauge ID": gauge_id,
            "First Measurement Date": first_date,
            "Last Measurement Date": last_date,
            "Number of Possible Days": num_possible_days,
            "Number of Daily Measurements": num_daily_measurements,
            "% Missing": percent_missing,
            "Average Flow (m3/s)": flows.mean(),
            "Min Flow (m3/s)": flows.min(),
            "Max Flow (m3/s)": flows.max(),
            **monthly_counts.add_suffix(" (Total)").to_dict(),
            **non_zero_monthly_counts.add_suffix(" (Non-Zero)").to_dict()
        }

        return summary

    except Exception as e:
        # Best effort: one unreadable file must not abort a whole batch run.
        print(f"Error processing {file_path}: {e}")
        return None

# Main function to process specific files based on gauge IDs and save the result
def analyze_files_by_gauge_ids(folder_path, gauge_file, output_file):
    """Summarise only the station files whose name matches a listed gauge ID.

    A file is selected when its name without the ``.csv`` extension equals a
    value of the ``gauge_id`` column of ``gauge_file``. Each selected file is
    handed to ``process_specific_file`` in a joblib worker (all cores,
    ``n_jobs=-1``); files for which it returns ``None`` are left out.

    Args:
        folder_path: Folder containing the station CSV files.
        gauge_file: CSV file with a ``gauge_id`` column listing the stations
            to process.
        output_file: Path of the summary CSV to write.

    Returns:
        None. The summary is written to ``output_file`` when at least one
        file yields a result. Any error is printed rather than raised.
    """
    try:
        # Read the IDs as text. With an inferred numeric dtype, leading zeros
        # are lost ("00123" -> "123") and an integer column containing blanks
        # becomes float ("123.0"); neither form matches a file name.
        gauge_data = pd.read_csv(gauge_file, dtype={'gauge_id': str})

        # Blank cells are dropped so they do not turn into the literal "nan".
        gauge_ids = set(gauge_data['gauge_id'].dropna().astype(str).str.strip())

        # os.listdir returns names in arbitrary order; sorting makes the row
        # order of the summary reproducible.
        csv_files = sorted(
            os.path.join(folder_path, name)
            for name in os.listdir(folder_path)
            if name.endswith('.csv')
        )

        # Keep the files whose stem is one of the requested gauge IDs.
        matching_files = [
            file for file in csv_files
            if os.path.splitext(os.path.basename(file))[0] in gauge_ids
        ]

        # tqdm wraps the task generator, so the bar advances as tasks are dispatched.
        results = Parallel(n_jobs=-1)(
            delayed(process_specific_file)(file) for file in tqdm(matching_files, desc="Processing Files")
        )

        # Drop the files that could not be summarised.
        results = [res for res in results if res is not None]

        if results:
            summary_df = pd.DataFrame(results)
            summary_df.to_csv(output_file, index=False)
            print(f"Summary file saved to {output_file}")
        else:
            print("No valid data to process.")
    except Exception as e:
        # Top-level boundary of the script: report the failure instead of raising.
        print(f"Error during processing: {e}")

# Machine-specific absolute paths; edit all three before running.
folder_path = "/Users/yubinbaaniya/Documents/WORLD BIAS/saber workdir/gauge_data"  # Folder holding one CSV file per station
gauge_file = "/Users/yubinbaaniya/Documents/WORLD BIAS/saber workdir/gauge_table_2nd_iteration_deDuplicated.csv"  # CSV whose gauge_id column lists the stations to process
output_file = "/Users/yubinbaaniya/Documents/GAUGE REVIEW/summary_only_gauge_used.csv"  # Summary CSV to create

# Summarise only the station files named after a gauge ID found in gauge_file.
analyze_files_by_gauge_ids(folder_path, gauge_file, output_file)
