In [2]:
import os
import pandas as pd
from pathlib import Path

# Years to process, stored as strings because they are joined into paths and file names
years = list(map(str, range(2017, 2018)))

# Maps each pressure variable to the suffix used in its parquet file name
pressure_file_naming = dict(
    geopotential="geopotential",
    relative_humidity="relative_humidity",
    temperature="temperature",
    u_component_of_wind="u",
    v_component_of_wind="v",
    vertical_velocity="w",
    vorticity="vo",
)

# Maps each pressure variable to the data column name passed to process_pressure_file
pressure_variable_abbreviations = dict(
    geopotential="z",
    relative_humidity="r",
    temperature="t",
    u_component_of_wind="u",
    v_component_of_wind="v",
    vertical_velocity="w",
    vorticity="vo",
)

# Input root and the statistics output folder located directly beneath it
BASE_PATH = r"Z:\Thesis\Data\Met\ERA5_parquet_test"
OUTPUT_PATH = BASE_PATH + "\\Pressure_Monthly_Stats"

def process_pressure_file(file, variable_name):
    """
    Process an individual pressure file and calculate monthly statistics.
    """
    df = pd.read_parquet(file)

    # Ensure time column is datetime
    df["time"] = pd.to_datetime(df["time"])

    # Group by time, longitude, latitude, and level
    grouped = df.groupby([pd.Grouper(key="time", freq="ME"), "longitude", "latitude", "level"])
    monthly_stats = grouped.agg({
        variable_name: ["min", "max", "mean"]
    }).reset_index()

    # Flatten column names
    monthly_stats.columns = [
        col[0] if col[1] == "" else f"{col[0]}_{col[1]}" for col in monthly_stats.columns
    ]
    return monthly_stats

def calculate_pressure_monthly_statistics(year, country):
    """
    Calculate monthly statistics for pressure variables and save them as one table.

    For every entry of ``pressure_file_naming`` the file
    ``BASE_PATH/<year>/<country>/pressure/<year>_<country>_pressure_<suffix>.parquet``
    is summarised with ``process_pressure_file``.  The per-variable tables are
    joined on (time, longitude, latitude, level) so each output row carries the
    statistics of every processed variable, and the result is written to
    ``OUTPUT_PATH/<year>/<country>_<year>_pressure_monthly_stats.parquet``.

    Parameters
    ----------
    year : str
        Year, used as a directory name and inside the file names.
    country : str
        Country, used as a directory name and inside the file names.

    Returns
    -------
    None
        The result is written to disk; progress and problems are printed.
    """
    # Columns shared by every per-variable table (the grouping keys of process_pressure_file)
    key_columns = ["time", "longitude", "latitude", "level"]

    country_path = os.path.join(BASE_PATH, year, country)
    output_dir = os.path.join(OUTPUT_PATH, year)
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    monthly_stats_list = []
    missing_files = []

    for variable, file_name in pressure_file_naming.items():
        abbreviation = pressure_variable_abbreviations[variable]
        file_path = os.path.join(country_path, "pressure", f"{year}_{country}_pressure_{file_name}.parquet")
        if not os.path.exists(file_path):
            missing_files.append(file_path)
            continue
        try:
            stats = process_pressure_file(file_path, abbreviation)
        except Exception as e:
            # Best effort: one unreadable variable must not abort the remaining ones
            print(f"Error processing {file_path}: {e}")
            continue
        # Skip variables that produced nothing so they cannot disturb the join below
        if stats is not None and not stats.empty:
            monthly_stats_list.append(stats)

    # Combine and save results
    if monthly_stats_list:
        # Join column-wise on the shared keys.  Stacking the tables row-wise would
        # leave every row with one variable's statistics and NaN for all the others.
        combined_stats = monthly_stats_list[0]
        for stats in monthly_stats_list[1:]:
            combined_stats = combined_stats.merge(stats, on=key_columns, how="outer")
        output_file = os.path.join(output_dir, f"{country}_{year}_pressure_monthly_stats.parquet")
        combined_stats.to_parquet(output_file, index=False)
        print(f"Saved pressure monthly statistics for {country} in {year} to {output_file}")
    else:
        print(f"No data processed for {country} in {year}.")

    # Always report missing inputs, so a partially complete result is not mistaken for a full one
    for file in missing_files:
        print(f"  Missing file: {file}")

if __name__ == "__main__":
    # Only Bahrain is active; the remaining countries are commented out
    countries = [
        "Bahrain",
        # "Saudi_Arabia",
        # "Oman",
        # "Qatar",
        # "United_Arab_Emirates",
        # "Kuwait",
        # "Yemen"
    ]
    # Visit every (year, country) combination, with years as the outer loop
    for year, country in [(y, c) for y in years for c in countries]:
        calculate_pressure_monthly_statistics(year, country)


Saved pressure monthly statistics for Bahrain in 2017 to Z:\Thesis\Data\Met\ERA5_parquet_test\Pressure_Monthly_Stats\2017\Bahrain_2017_pressure_monthly_stats.parquet


In [7]:
import os
import pandas as pd
from pathlib import Path

# Years to process, as strings because they become directory and file-name components
years = [f"{year}" for year in range(2017, 2018)]

# Pressure file naming dictionary: each variable's file suffix is the variable name itself
pressure_file_naming = {
    name: name
    for name in (
        "geopotential",
        "relative_humidity",
        "temperature",
        "u_component_of_wind",
        "v_component_of_wind",
        "vertical_velocity",
        "vorticity",
    )
}

# Variable abbreviations for pressure data (the data column name passed to process_pressure_file)
pressure_variable_abbreviations = dict(
    geopotential="z",
    relative_humidity="r",
    temperature="t",
    u_component_of_wind="u",
    v_component_of_wind="v",
    vertical_velocity="w",
    vorticity="vo",
)

# File paths: the input root and the statistics folder directly beneath it
BASE_PATH = r"Z:\Thesis\Data\Met\ERA5_parquet_test"
OUTPUT_PATH = BASE_PATH + "\\Pressure_Monthly_Stats"

def process_pressure_file(file, variable_name):
    """
    Process an individual pressure file and calculate monthly statistics.
    """
    df = pd.read_parquet(file)

    # Ensure time column is datetime and aligned to month-end
    df["time"] = pd.to_datetime(df["time"]).dt.to_period("M").dt.to_timestamp("M")

    # Check if the variable has valid data
    if df[variable_name].isna().all():
        print(f"Warning: All values are NaN for variable '{variable_name}' in file: {file}")
        return None

    # Group by time, longitude, latitude, and level
    grouped = df.groupby([pd.Grouper(key="time", freq="ME"), "longitude", "latitude", "level"])
    
    # Debug: Inspect grouped data
    print(f"Processing '{variable_name}' from file: {file}")
    print(f"Total groups: {len(grouped)}")
    print(f"Example group keys: {list(grouped.groups.keys())[:5]}")

    # Skip groups with no valid data
    filtered_groups = grouped.filter(lambda x: not x[variable_name].isna().all())

    # Aggregate statistics
    monthly_stats = filtered_groups.groupby([pd.Grouper(key="time", freq="ME"), "longitude", "latitude", "level"]).agg({
        variable_name: ["min", "max", "mean"]
    }).reset_index()

    # Flatten column names
    monthly_stats.columns = [
        col[0] if col[1] == "" else f"{col[0]}_{col[1]}" for col in monthly_stats.columns
    ]

    # Debug: Inspect aggregated results
    print(f"Aggregated data for '{variable_name}':")
    print(monthly_stats.head())

    return monthly_stats


def calculate_pressure_monthly_statistics(year, country):
    """
    Calculate monthly statistics for pressure variables and save them as one table.

    For every entry of ``pressure_file_naming`` the file
    ``BASE_PATH/<year>/<country>/pressure/<year>_<country>_pressure_<suffix>.parquet``
    is summarised with ``process_pressure_file``.  The per-variable tables are
    joined on (time, longitude, latitude, level) so each output row carries the
    statistics of every processed variable, and the result is written to
    ``OUTPUT_PATH/<year>/<country>_<year>_pressure_monthly_stats.parquet``.

    Parameters
    ----------
    year : str
        Year, used as a directory name and inside the file names.
    country : str
        Country, used as a directory name and inside the file names.

    Returns
    -------
    None
        The result is written to disk; progress and problems are printed.
    """
    # Columns shared by every per-variable table (the grouping keys of process_pressure_file)
    key_columns = ["time", "longitude", "latitude", "level"]

    country_path = os.path.join(BASE_PATH, year, country)
    output_dir = os.path.join(OUTPUT_PATH, year)
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    monthly_stats_list = []
    missing_files = []

    for variable, file_name in pressure_file_naming.items():
        abbreviation = pressure_variable_abbreviations[variable]
        file_path = os.path.join(country_path, "pressure", f"{year}_{country}_pressure_{file_name}.parquet")
        if not os.path.exists(file_path):
            missing_files.append(file_path)
            continue
        try:
            stats = process_pressure_file(file_path, abbreviation)
        except Exception as e:
            # Best effort: one unreadable variable must not abort the remaining ones
            print(f"Error processing {file_path}: {e}")
            continue
        # process_pressure_file returns None when a variable has no valid data;
        # empty tables are skipped too so they cannot disturb the join below
        if stats is not None and not stats.empty:
            monthly_stats_list.append(stats)

    # Combine and save results
    if monthly_stats_list:
        # Join column-wise on the shared keys.  Stacking the tables row-wise would
        # leave every row with one variable's statistics and NaN for all the others.
        combined_stats = monthly_stats_list[0]
        for stats in monthly_stats_list[1:]:
            combined_stats = combined_stats.merge(stats, on=key_columns, how="outer")
        output_file = os.path.join(output_dir, f"{country}_{year}_pressure_monthly_stats.parquet")
        combined_stats.to_parquet(output_file, index=False)
        print(f"Saved pressure monthly statistics for {country} in {year} to {output_file}")
    else:
        print(f"No data processed for {country} in {year}.")

    # Always report missing inputs, so a partially complete result is not mistaken for a full one
    for file in missing_files:
        print(f"  Missing file: {file}")

if __name__ == "__main__":
    # Countries to process in this run
    countries = ["Bahrain"]
    # Walk every (year, country) pair, with years as the outer loop
    for year, country in ((y, c) for y in years for c in countries):
        calculate_pressure_monthly_statistics(year, country)


Processing 'z' from file: Z:\Thesis\Data\Met\ERA5_parquet_test\2017\Bahrain\pressure\2017_Bahrain_pressure_geopotential.parquet
Total groups: 41
Example group keys: [(Timestamp('2017-01-31 00:00:00'), 50.45399856567383, 26.03499984741211, 850), (Timestamp('2017-01-31 00:00:00'), 50.45399856567383, 26.03499984741211, 925), (Timestamp('2017-01-31 00:00:00'), 50.45399856567383, 26.03499984741211, 1000), (Timestamp('2017-02-28 00:00:00'), 50.45399856567383, 26.03499984741211, 500), (Timestamp('2017-02-28 00:00:00'), 50.45399856567383, 26.03499984741211, 700)]
Aggregated data for 'z':
        time  longitude  latitude  level          z_min          z_max  \
0 2017-01-31  50.453999    25.535     10  299981.802251  303586.425101   
1 2017-01-31  50.453999    25.535     50  200015.788400  201876.845307   
2 2017-01-31  50.453999    25.535    100  159932.570302  162371.682765   
3 2017-01-31  50.453999    25.535    200  118007.093852  121386.134046   
4 2017-01-31  50.453999    25.535    300   