In [2]:
import os
import pandas as pd
from pathlib import Path

# Years to process, as strings because they are used as path components.
# The range end is exclusive, so this currently selects 2017 only.
years = list(map(str, range(2017, 2018)))

# Maps each surface variable name to the name fragment used in its parquet
# file name. Every variable currently maps to itself, so the mapping is built
# from a single ordered tuple of names (insertion order is preserved).
surface_file_naming = {
    variable: variable
    for variable in (
        "2m_temperature",
        "2m_dewpoint_temperature",
        "10m_u_component_of_wind",
        "10m_v_component_of_wind",
        "surface_solar_radiation_downwards",
        "surface_thermal_radiation_downwards",
        "evaporation",
        "potential_evaporation",
        "sea_surface_temperature",
        "soil_temperature_level_1",
        "soil_temperature_level_2",
        "soil_temperature_level_3",
        "soil_temperature_level_4",
        "volumetric_soil_water_layer_1",
        "volumetric_soil_water_layer_2",
        "volumetric_soil_water_layer_3",
        "volumetric_soil_water_layer_4",
        "leaf_area_index_high_vegetation",
        "leaf_area_index_low_vegetation",
        "surface_pressure",
        "mean_sea_level_pressure",
        "convective_available_potential_energy",
        "geopotential",
        "instantaneous_10m_wind_gust",
        "total_precipitation",
        "k_index",
    )
}

# Maps each surface variable name to its abbreviation. The abbreviation is
# what gets passed to process_surface_file as the data column to aggregate.
# Entry order matches surface_file_naming.
surface_variable_abbreviations = dict(
    [
        # Near-surface temperature and wind
        ("2m_temperature", "t2m"),
        ("2m_dewpoint_temperature", "d2m"),
        ("10m_u_component_of_wind", "u10"),
        ("10m_v_component_of_wind", "v10"),
        # Radiation and evaporation
        ("surface_solar_radiation_downwards", "ssrd"),
        ("surface_thermal_radiation_downwards", "strd"),
        ("evaporation", "e"),
        ("potential_evaporation", "pev"),
        # Sea surface and soil
        ("sea_surface_temperature", "sst"),
        ("soil_temperature_level_1", "stl1"),
        ("soil_temperature_level_2", "stl2"),
        ("soil_temperature_level_3", "stl3"),
        ("soil_temperature_level_4", "stl4"),
        ("volumetric_soil_water_layer_1", "swvl1"),
        ("volumetric_soil_water_layer_2", "swvl2"),
        ("volumetric_soil_water_layer_3", "swvl3"),
        ("volumetric_soil_water_layer_4", "swvl4"),
        # Vegetation
        ("leaf_area_index_high_vegetation", "lai_hv"),
        ("leaf_area_index_low_vegetation", "lai_lv"),
        # Pressure, stability and remaining fields
        ("surface_pressure", "sp"),
        ("mean_sea_level_pressure", "msl"),
        ("convective_available_potential_energy", "cape"),
        ("geopotential", "z"),
        ("instantaneous_10m_wind_gust", "i10fg"),
        ("total_precipitation", "tp"),
        ("k_index", "kx"),
    ]
)

# Input root (per-year, per-country parquet files) and the output folder for
# the monthly statistics, which lives directly under the input root.
BASE_PATH = r"Z:\Thesis\Data\Met\ERA5_parquet_test"
OUTPUT_PATH = BASE_PATH + r"\Surface_Monthly_Stats"

def process_surface_file(file, variable_name):
    """
    Process an individual surface file and calculate monthly statistics.
    """
    df = pd.read_parquet(file)

    # Rename 'pressure_level' to 'level' if present
    if "pressure_level" in df.columns:
        df.rename(columns={"pressure_level": "level"}, inplace=True)
    else:
        df["level"] = 1013.0  # Add default level for surface data

    # Convert time to datetime
    df["time"] = pd.to_datetime(df["time"])

    # Group by time, longitude, latitude, and level
    grouped = df.groupby([pd.Grouper(key="time", freq="ME"), "longitude", "latitude", "level"])
    monthly_stats = grouped.agg({
        variable_name: ["min", "max", "mean"]
    }).reset_index()

    # Flatten column names
    monthly_stats.columns = [
        col[0] if col[1] == "" else f"{col[0]}_{col[1]}" for col in monthly_stats.columns
    ]
    return monthly_stats

def calculate_surface_monthly_statistics(year, country):
    """
    Calculate monthly statistics for all surface variables of one country/year.

    For every entry in ``surface_file_naming`` the file
    ``<BASE_PATH>/<year>/<country>/surface/<year>_<country>_surface_<name>.parquet``
    is reduced with ``process_surface_file``. The per-variable results are
    joined on their shared key columns into one wide table (one row per
    month / grid cell / level, one min/max/mean column triple per variable)
    and written to
    ``<OUTPUT_PATH>/<year>/<country>_<year>_surface_monthly_stats.parquet``.

    Files that do not exist are skipped and always listed at the end; files
    that fail to process are reported and skipped.

    Args:
        year: Year as a string; used as a directory and file-name component.
        country: Country name; used as a directory and file-name component.

    Returns:
        None. The result is written to disk and progress is printed.
    """
    # Columns shared by every frame returned from process_surface_file.
    key_columns = ["time", "longitude", "latitude", "level"]

    country_path = os.path.join(BASE_PATH, year, country)
    output_dir = os.path.join(OUTPUT_PATH, year)
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    monthly_stats_list = []
    missing_files = []

    for variable, file_name in surface_file_naming.items():
        abbreviation = surface_variable_abbreviations[variable]
        file_path = os.path.join(country_path, "surface", f"{year}_{country}_surface_{file_name}.parquet")
        if not os.path.exists(file_path):
            missing_files.append(file_path)
            continue
        try:
            stats = process_surface_file(file_path, abbreviation)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"Error processing {file_path}: {e}")
        else:
            monthly_stats_list.append(stats)

    # Combine and save results
    if monthly_stats_list:
        # Join the variables side by side on the key columns. Stacking the
        # frames row-wise would give each row only one variable's statistics
        # and NaN for every other variable. An outer join keeps key
        # combinations that only some variables have.
        combined_stats = monthly_stats_list[0]
        for stats in monthly_stats_list[1:]:
            combined_stats = combined_stats.merge(stats, on=key_columns, how="outer")
        output_file = os.path.join(output_dir, f"{country}_{year}_surface_monthly_stats.parquet")
        combined_stats.to_parquet(output_file, index=False)
        print(f"Saved surface monthly statistics for {country} in {year} to {output_file}")
    else:
        print(f"No data processed for {country} in {year}.")

    # Report missing inputs even when a partial result was saved, so that an
    # incomplete output file is not mistaken for a complete one.
    for file in missing_files:
        print(f"  Missing file: {file}")

if __name__ == "__main__":
    # Countries to process; uncomment an entry to include it in the run.
    countries = [
        "Bahrain",
        # "Saudi_Arabia",
        # "Oman",
        # "Qatar",
        # "United_Arab_Emirates",
        # "Kuwait",
        # "Yemen",
    ]
    # Year-major, country-minor order: every country is processed for one
    # year before moving on to the next year.
    for year, country in [(y, c) for y in years for c in countries]:
        calculate_surface_monthly_statistics(year, country)


  grouped = df.groupby([pd.Grouper(key="time", freq="M"), "longitude", "latitude", "level"])
  grouped = df.groupby([pd.Grouper(key="time", freq="M"), "longitude", "latitude", "level"])
  grouped = df.groupby([pd.Grouper(key="time", freq="M"), "longitude", "latitude", "level"])
  grouped = df.groupby([pd.Grouper(key="time", freq="M"), "longitude", "latitude", "level"])
  grouped = df.groupby([pd.Grouper(key="time", freq="M"), "longitude", "latitude", "level"])
  grouped = df.groupby([pd.Grouper(key="time", freq="M"), "longitude", "latitude", "level"])
  grouped = df.groupby([pd.Grouper(key="time", freq="M"), "longitude", "latitude", "level"])
  grouped = df.groupby([pd.Grouper(key="time", freq="M"), "longitude", "latitude", "level"])
  grouped = df.groupby([pd.Grouper(key="time", freq="M"), "longitude", "latitude", "level"])
  grouped = df.groupby([pd.Grouper(key="time", freq="M"), "longitude", "latitude", "level"])
  grouped = df.groupby([pd.Grouper(key="time", freq="M"), "longitude", "latitude", "level"])

Saved surface monthly statistics for Bahrain in 2017 to Z:\Thesis\Data\Met\ERA5_parquet_test\Surface_Monthly_Stats\2017\Bahrain_2017_surface_monthly_stats.parquet
