In [1]:
#########################################################################################
# Saudi Arabia Chunked Processing                                                       #
#########################################################################################

import os
import pandas as pd
from pathlib import Path

# Define the years to process
years = [str(year) for year in range(2013, 2015)]

# Variable abbreviations for accessing data
variable_abbreviations = {
    "2m_temperature": "t2m",
    "2m_dewpoint_temperature": "d2m",
    "10m_u_component_of_wind": "u10",
    "10m_v_component_of_wind": "v10",
    "surface_solar_radiation_downwards": "ssrd",
    "surface_thermal_radiation_downwards": "strd",
    "evaporation": "e",
    "potential_evaporation": "pev",
    "sea_surface_temperature": "sst",
    "soil_temperature_level_1": "stl1",
    "soil_temperature_level_2": "stl2",
    "soil_temperature_level_3": "stl3",
    "soil_temperature_level_4": "stl4",
    "volumetric_soil_water_layer_1": "swvl1",
    "volumetric_soil_water_layer_2": "swvl2",
    "volumetric_soil_water_layer_3": "swvl3",
    "volumetric_soil_water_layer_4": "swvl4",
    "leaf_area_index_high_vegetation": "lai_hv",
    "leaf_area_index_low_vegetation": "lai_lv",
    "surface_pressure": "sp",
    "mean_sea_level_pressure": "msl",
    "convective_available_potential_energy": "cape",
    "geopotential": "z",
    "instantaneous_10m_wind_gust": "i10fg",
    "total_precipitation": "tp",
    "k_index": "kx",
    "relative_humidity": "r",
    "temperature": "t",
    "u_component_of_wind": "u",
    "v_component_of_wind": "v",
    "vertical_velocity": "w",
    "vorticity": "vo",
}

# File paths
BASE_PATH = r"Z:\Thesis\Data\Met\ERA5_parquet_test"
OUTPUT_PATH = r"Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats"

def process_file(file_path, variable_name, include_level=False):
    """Processes the dataset month-by-month to reduce memory usage."""
    if not os.path.exists(file_path):
        print(f"Missing file: {file_path}")
        return None

    # Convert variable name to its abbreviation
    var_abbr = variable_abbreviations.get(variable_name, variable_name)  # Ensure correct name mapping

    # Initialize empty list to store monthly results
    monthly_results = []

    # Read the file in smaller time-based chunks
    df = pd.read_parquet(file_path, columns=["longitude", "latitude", "time", var_abbr] + (["level"] if include_level else []))
    df["time"] = pd.to_datetime(df["time"])  # Ensure time is in datetime format

    for month in range(1, 13):  # Process each month separately
        monthly_df = df[df["time"].dt.month == month]

        if monthly_df.empty:
            continue  # Skip if no data for this month

        # Define grouping columns
        group_by_columns = ["longitude", "latitude"]
        if include_level:
            group_by_columns.append("level")

        # Group by month and calculate statistics
        grouped = monthly_df.groupby(group_by_columns)
        monthly_stats = grouped[var_abbr].agg(["min", "max", "mean"]).reset_index()

        # Add month column to distinguish data
        monthly_stats["month"] = month

        monthly_results.append(monthly_stats)

    # Concatenate and return all monthly results
    return pd.concat(monthly_results, ignore_index=True) if monthly_results else None

def calculate_monthly_statistics(year, country):
    """Calculates monthly statistics for surface and pressure variables."""
    country_path = os.path.join(BASE_PATH, year, country)
    output_dir = os.path.join(OUTPUT_PATH, year)
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    surface_results = []
    pressure_results = []

    # Process surface variables
    for variable in variable_abbreviations.keys():
        file_path = os.path.join(country_path, "surface", f"{year}_{country}_surface_{variable}.parquet")
        result = process_file(file_path, variable, include_level=False)
        if result is not None:
            surface_results.append(result)

    # Process pressure variables
    for variable in ["geopotential", "relative_humidity", "temperature", "u_component_of_wind", "v_component_of_wind", "vertical_velocity", "vorticity"]:
        file_path = os.path.join(country_path, "pressure", f"{year}_{country}_pressure_{variable}.parquet")
        result = process_file(file_path, variable, include_level=True)
        if result is not None:
            pressure_results.append(result)

    # Save surface data
    if surface_results:
        combined_surface_stats = pd.concat(surface_results, axis=1)
        combined_surface_stats = combined_surface_stats.loc[:, ~combined_surface_stats.columns.duplicated()]
        surface_output_file = os.path.join(output_dir, f"{country}_{year}_surface_monthly_stats.parquet")
        combined_surface_stats.to_parquet(surface_output_file, index=False)
        print(f"Saved surface monthly statistics for {country} in {year} to {surface_output_file}")

    # Save pressure data
    if pressure_results:
        combined_pressure_stats = pd.concat(pressure_results, axis=1)
        combined_pressure_stats = combined_pressure_stats.loc[:, ~combined_pressure_stats.columns.duplicated()]
        pressure_output_file = os.path.join(output_dir, f"{country}_{year}_pressure_monthly_stats.parquet")
        combined_pressure_stats.to_parquet(pressure_output_file, index=False)
        print(f"Saved pressure monthly statistics for {country} in {year} to {pressure_output_file}")

if __name__ == "__main__":
    countries = ["Saudi_Arabia"]

    for year in years:
        for country in countries:
            calculate_monthly_statistics(year, country)


Missing file: Z:\Thesis\Data\Met\ERA5_parquet_test\1989\Saudi_Arabia\surface\1989_Saudi_Arabia_surface_relative_humidity.parquet
Missing file: Z:\Thesis\Data\Met\ERA5_parquet_test\1989\Saudi_Arabia\surface\1989_Saudi_Arabia_surface_temperature.parquet
Missing file: Z:\Thesis\Data\Met\ERA5_parquet_test\1989\Saudi_Arabia\surface\1989_Saudi_Arabia_surface_u_component_of_wind.parquet
Missing file: Z:\Thesis\Data\Met\ERA5_parquet_test\1989\Saudi_Arabia\surface\1989_Saudi_Arabia_surface_v_component_of_wind.parquet
Missing file: Z:\Thesis\Data\Met\ERA5_parquet_test\1989\Saudi_Arabia\surface\1989_Saudi_Arabia_surface_vertical_velocity.parquet
Missing file: Z:\Thesis\Data\Met\ERA5_parquet_test\1989\Saudi_Arabia\surface\1989_Saudi_Arabia_surface_vorticity.parquet
Saved surface monthly statistics for Saudi_Arabia in 1989 to Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats\1989\Saudi_Arabia_1989_surface_monthly_stats.parquet
Saved pressure monthly statistics for Saudi_Arabia in 1989 to Z:\Thesis

In [None]:
import os
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import glob

# Define the surface and pressure level variables
surface_variables = [
    "2m_temperature", "2m_dewpoint_temperature", "10m_u_component_of_wind",
    "10m_v_component_of_wind", "surface_solar_radiation_downwards",
    "surface_thermal_radiation_downwards", "evaporation", "potential_evaporation",
    "sea_surface_temperature", "soil_temperature_level_1", "soil_temperature_level_2",
    "soil_temperature_level_3", "soil_temperature_level_4", "volumetric_soil_water_layer_1",
    "volumetric_soil_water_layer_2", "volumetric_soil_water_layer_3",
    "volumetric_soil_water_layer_4", "leaf_area_index_high_vegetation",
    "leaf_area_index_low_vegetation", "surface_pressure", "mean_sea_level_pressure",
    "convective_available_potential_energy", "geopotential",
    "instantaneous_10m_wind_gust", "total_precipitation", "k_index"
]

pressure_level_variables = [
    "geopotential", "relative_humidity", "temperature", "u_component_of_wind",
    "v_component_of_wind", "vertical_velocity", "vorticity"
]

# File paths
BASE_PATH = r"Z:\Thesis\Data\Met\ERA5_parquet_test\2017"
OUTPUT_PATH = r"Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats"

def calculate_monthly_statistics(country, surface_variables, pressure_level_variables):
    """
    Calculate monthly statistics for both surface and pressure variables and save the result as a single Parquet file.
    """
    country_path = os.path.join(BASE_PATH, country)
    surface_path = os.path.join(country_path, "surface")
    pressure_path = os.path.join(country_path, "pressure")
    Path(OUTPUT_PATH).mkdir(parents=True, exist_ok=True)

    # Combine results from all variables
    monthly_stats_list = []

    def process_file(file, variable_name, is_pressure):
        """
        Process an individual file, calculate monthly statistics, and append to the results list.
        """
        df = pd.read_parquet(file)

        # Handle column renaming for pressure files
        if is_pressure and "level" in df.columns:
            df.rename(columns={"level": "pressure_level"}, inplace=True)

        # Group by month and calculate statistics
        df["time"] = pd.to_datetime(df["time"])
        grouped = df.groupby([pd.Grouper(key="time", freq="M"), "longitude", "latitude"])
        monthly_stats = grouped.agg({
            variable_name: ["min", "max", "mean"]
        }).reset_index()

        # Flatten column names
        monthly_stats.columns = [
            "time" if col[0] == "time" else "_".join(col).strip("_")
            for col in monthly_stats.columns
        ]
        return monthly_stats

    # Process surface variables
    for variable in tqdm(surface_variables, desc=f"Processing surface variables for {country}"):
        file_path = os.path.join(surface_path, f"2017_{country}_surface_{variable}.parquet")
        if os.path.exists(file_path):
            monthly_stats = process_file(file_path, variable, is_pressure=False)
            monthly_stats_list.append(monthly_stats)

    # Process pressure variables
    for variable in tqdm(pressure_level_variables, desc=f"Processing pressure variables for {country}"):
        file_path = os.path.join(pressure_path, f"2017_{country}_pressure_{variable}.parquet")
        if os.path.exists(file_path):
            monthly_stats = process_file(file_path, variable, is_pressure=True)
            monthly_stats_list.append(monthly_stats)

    # Merge all statistics into a single DataFrame
    if monthly_stats_list:
        combined_stats = pd.concat(monthly_stats_list, axis=1)
        combined_stats = combined_stats.loc[:, ~combined_stats.columns.duplicated()]  # Remove duplicate columns

        # Save to a single Parquet file
        output_file = os.path.join(OUTPUT_PATH, f"{country}_2017_monthly_stats.parquet")
        combined_stats.to_parquet(output_file, index=False)
        print(f"Saved combined monthly statistics for {country} to {output_file}")
    else:
        print(f"No data processed for {country}.")

if __name__ == "__main__":
    # List of countries to process
    countries = ["Bahrain", "Saudi_Arabia", "Oman", "Qatar", "United_Arab_Emirates", "Kuwait", "Yemen"]

    for country in countries:
        calculate_monthly_statistics(country, surface_variables, pressure_level_variables)


In [3]:
import os
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# Define the years to process
years = [str(year) for year in range(2017, 2018)]

# Dictionaries for surface and pressure variables
surface_file_naming = {
    "2m_temperature": "2m_temperature",
    "2m_dewpoint_temperature": "2m_dewpoint_temperature",
    "10m_u_component_of_wind": "10m_u_component_of_wind",
    "10m_v_component_of_wind": "10m_v_component_of_wind",
    "surface_solar_radiation_downwards": "surface_solar_radiation_downwards",
    "surface_thermal_radiation_downwards": "surface_thermal_radiation_downwards",
    "evaporation": "evaporation",
    "potential_evaporation": "potential_evaporation",
    "sea_surface_temperature": "sea_surface_temperature",
    "soil_temperature_level_1": "soil_temperature_level_1",
    "soil_temperature_level_2": "soil_temperature_level_2",
    "soil_temperature_level_3": "soil_temperature_level_3",
    "soil_temperature_level_4": "soil_temperature_level_4",
    "volumetric_soil_water_layer_1": "volumetric_soil_water_layer_1",
    "volumetric_soil_water_layer_2": "volumetric_soil_water_layer_2",
    "volumetric_soil_water_layer_3": "volumetric_soil_water_layer_3",
    "volumetric_soil_water_layer_4": "volumetric_soil_water_layer_4",
    "leaf_area_index_high_vegetation": "leaf_area_index_high_vegetation",
    "leaf_area_index_low_vegetation": "leaf_area_index_low_vegetation",
    "surface_pressure": "surface_pressure",
    "mean_sea_level_pressure": "mean_sea_level_pressure",
    "convective_available_potential_energy": "convective_available_potential_energy",
    "geopotential": "geopotential",
    "instantaneous_10m_wind_gust": "instantaneous_10m_wind_gust",
    "total_precipitation": "total_precipitation",
    "k_index": "k_index",
}

pressure_file_naming = {
    "geopotential": "geopotential",
    "relative_humidity": "relative_humidity",
    "temperature": "temperature",
    "u_component_of_wind": "u_component_of_wind",
    "v_component_of_wind": "v_component_of_wind",
    "vertical_velocity": "vertical_velocity",
    "vorticity": "vorticity",
}

# Variable abbreviations for accessing data
variable_abbreviations = {
    "2m_temperature": "t2m",
    "2m_dewpoint_temperature": "d2m",
    "10m_u_component_of_wind": "u10",
    "10m_v_component_of_wind": "v10",
    "surface_solar_radiation_downwards": "ssrd",
    "surface_thermal_radiation_downwards": "strd",
    "evaporation": "e",
    "potential_evaporation": "pev",
    "sea_surface_temperature": "sst",
    "soil_temperature_level_1": "stl1",
    "soil_temperature_level_2": "stl2",
    "soil_temperature_level_3": "stl3",
    "soil_temperature_level_4": "stl4",
    "volumetric_soil_water_layer_1": "swvl1",
    "volumetric_soil_water_layer_2": "swvl2",
    "volumetric_soil_water_layer_3": "swvl3",
    "volumetric_soil_water_layer_4": "swvl4",
    "leaf_area_index_high_vegetation": "lai_hv",
    "leaf_area_index_low_vegetation": "lai_lv",
    "surface_pressure": "sp",
    "mean_sea_level_pressure": "msl",
    "convective_available_potential_energy": "cape",
    "geopotential": "z",
    "instantaneous_10m_wind_gust": "i10fg",
    "total_precipitation": "tp",
    "k_index": "kx",
    "relative_humidity": "r",
    "temperature": "t",
    "u_component_of_wind": "u",
    "v_component_of_wind": "v",
    "vertical_velocity": "w",
    "vorticity": "vo",
}

# File paths
BASE_PATH = r"Z:\Thesis\Data\Met\ERA5_parquet_test"
OUTPUT_PATH = r"Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats"

def calculate_monthly_statistics(year, country, surface_file_naming, pressure_file_naming, variable_abbreviations):
    """
    Calculate monthly statistics for all variables and save the result as a single Parquet file.
    """
    country_path = os.path.join(BASE_PATH, year, country)  # Folder names have spaces
    #normalized_country = country.replace(" ", "_")  # File names use underscores
    output_dir = os.path.join(OUTPUT_PATH, year)
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Combine results from all variables
    monthly_stats_list = []
    missing_files = []

    def process_file(file, column_name):
        """
        Process an individual file, calculate monthly statistics, and append to the results list.
        """
        df = pd.read_parquet(file)

        # Handle column renaming for pressure files
        if "pressure_level" in df.columns:
            df.rename(columns={"pressure_level": "level"}, inplace=True)

        # Group by month and calculate statistics
        df["time"] = pd.to_datetime(df["time"])
        grouped = df.groupby([pd.Grouper(key="time", freq="ME"), "longitude", "latitude"])
        monthly_stats = grouped.agg({
            column_name: ["min", "max", "mean"]
        }).reset_index()

        # Flatten column names
        monthly_stats.columns = [
            "time" if col[0] == "time" else "_".join(col).strip("_")
            for col in monthly_stats.columns
        ]
        return monthly_stats

    # Process surface variables
    for variable, file_name in surface_file_naming.items():
        abbreviation = variable_abbreviations[variable]
        file_path = os.path.join(country_path, "surface", f"{year}_{normalized_country}_surface_{file_name}.parquet")
        if os.path.exists(file_path):
            monthly_stats_list.append(process_file(file_path, abbreviation))
        else:
            missing_files.append(file_path)

    # Process pressure variables
    for variable, file_name in pressure_file_naming.items():
        abbreviation = variable_abbreviations[variable]
        file_path = os.path.join(country_path, "pressure", f"{year}_{normalized_country}_pressure_{file_name}.parquet")
        if os.path.exists(file_path):
            monthly_stats_list.append(process_file(file_path, abbreviation))
        else:
            missing_files.append(file_path)

    # Combine and save results
    if monthly_stats_list:
        combined_stats = pd.concat(monthly_stats_list, axis=1)
        combined_stats = combined_stats.loc[:, ~combined_stats.columns.duplicated()]  # Remove duplicate columns
        output_file = os.path.join(output_dir, f"{normalized_country}_{year}_monthly_stats.parquet")
        combined_stats.to_parquet(output_file, index=False)
        print(f"Saved combined monthly statistics for {country} in {year} to {output_file}")
    else:
        print(f"No data processed for {country} in {year}.")
        for file in missing_files:
            print(f"  Missing file: {file}")

if __name__ == "__main__":
    # List of countries to process
    countries = ["Bahrain",
                 "Saudi_Arabia",
                 "Oman",
                 "Qatar",
                 "United_Arab_Emirates",
                 "Kuwait",
                 "Yemen"
                ]

    for year in years:
        for country in countries:
            calculate_monthly_statistics(year, country, surface_file_naming, pressure_file_naming, variable_abbreviations)


Saved combined monthly statistics for Bahrain in 2017 to Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats\2017\Bahrain_2017_monthly_stats.parquet
Saved combined monthly statistics for Saudi Arabia in 2017 to Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats\2017\Saudi_Arabia_2017_monthly_stats.parquet
Saved combined monthly statistics for Oman in 2017 to Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats\2017\Oman_2017_monthly_stats.parquet
Saved combined monthly statistics for Qatar in 2017 to Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats\2017\Qatar_2017_monthly_stats.parquet
No data processed for United Arab Emirates in 2017.
  Missing file: Z:\Thesis\Data\Met\ERA5_parquet_test\2017\United Arab Emirates\surface\2017_United_Arab_Emirates_surface_2m_temperature.parquet
  Missing file: Z:\Thesis\Data\Met\ERA5_parquet_test\2017\United Arab Emirates\surface\2017_United_Arab_Emirates_surface_2m_dewpoint_temperature.parquet
  Missing file: Z:\Thesis\Data\Met\ERA5_parquet_test\2017\U

In [1]:
import os
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# Define the years to process
years = [str(year) for year in range(1989, 2001)]

# Dictionaries for surface and pressure variables
surface_file_naming = {
    "2m_temperature": "2m_temperature",
    "2m_dewpoint_temperature": "2m_dewpoint_temperature",
    "10m_u_component_of_wind": "10m_u_component_of_wind",
    "10m_v_component_of_wind": "10m_v_component_of_wind",
    "surface_solar_radiation_downwards": "surface_solar_radiation_downwards",
    "surface_thermal_radiation_downwards": "surface_thermal_radiation_downwards",
    "evaporation": "evaporation",
    "potential_evaporation": "potential_evaporation",
    "sea_surface_temperature": "sea_surface_temperature",
    "soil_temperature_level_1": "soil_temperature_level_1",
    "soil_temperature_level_2": "soil_temperature_level_2",
    "soil_temperature_level_3": "soil_temperature_level_3",
    "soil_temperature_level_4": "soil_temperature_level_4",
    "volumetric_soil_water_layer_1": "volumetric_soil_water_layer_1",
    "volumetric_soil_water_layer_2": "volumetric_soil_water_layer_2",
    "volumetric_soil_water_layer_3": "volumetric_soil_water_layer_3",
    "volumetric_soil_water_layer_4": "volumetric_soil_water_layer_4",
    "leaf_area_index_high_vegetation": "leaf_area_index_high_vegetation",
    "leaf_area_index_low_vegetation": "leaf_area_index_low_vegetation",
    "surface_pressure": "surface_pressure",
    "mean_sea_level_pressure": "mean_sea_level_pressure",
    "convective_available_potential_energy": "convective_available_potential_energy",
    "geopotential": "geopotential",
    "instantaneous_10m_wind_gust": "instantaneous_10m_wind_gust",
    "total_precipitation": "total_precipitation",
    "k_index": "k_index",
}

pressure_file_naming = {
    "geopotential": "geopotential",
    "relative_humidity": "relative_humidity",
    "temperature": "temperature",
    "u_component_of_wind": "u_component_of_wind",
    "v_component_of_wind": "v_component_of_wind",
    "vertical_velocity": "vertical_velocity",
    "vorticity": "vorticity",
}

# Variable abbreviations for accessing data
variable_abbreviations = {
    "2m_temperature": "t2m",
    "2m_dewpoint_temperature": "d2m",
    "10m_u_component_of_wind": "u10",
    "10m_v_component_of_wind": "v10",
    "surface_solar_radiation_downwards": "ssrd",
    "surface_thermal_radiation_downwards": "strd",
    "evaporation": "e",
    "potential_evaporation": "pev",
    "sea_surface_temperature": "sst",
    "soil_temperature_level_1": "stl1",
    "soil_temperature_level_2": "stl2",
    "soil_temperature_level_3": "stl3",
    "soil_temperature_level_4": "stl4",
    "volumetric_soil_water_layer_1": "swvl1",
    "volumetric_soil_water_layer_2": "swvl2",
    "volumetric_soil_water_layer_3": "swvl3",
    "volumetric_soil_water_layer_4": "swvl4",
    "leaf_area_index_high_vegetation": "lai_hv",
    "leaf_area_index_low_vegetation": "lai_lv",
    "surface_pressure": "sp",
    "mean_sea_level_pressure": "msl",
    "convective_available_potential_energy": "cape",
    "geopotential": "z",
    "instantaneous_10m_wind_gust": "i10fg",
    "total_precipitation": "tp",
    "k_index": "kx",
    "relative_humidity": "r",
    "temperature": "t",
    "u_component_of_wind": "u",
    "v_component_of_wind": "v",
    "vertical_velocity": "w",
    "vorticity": "vo",
}

# File paths
BASE_PATH = r"Z:\Thesis\Data\Met\ERA5_parquet_test"
OUTPUT_PATH = r"Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats"

def calculate_monthly_statistics(year, country, surface_file_naming, pressure_file_naming, variable_abbreviations):
    """
    Calculate monthly statistics for all variables and save the result as a single Parquet file.
    """
    country_path = os.path.join(BASE_PATH, year, country)
    output_dir = os.path.join(OUTPUT_PATH, year)
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Combine results from all variables
    monthly_stats_list = []
    missing_files = []

    def process_file(file, column_name):
        """
        Process an individual file, calculate monthly statistics, and append to the results list.
        """
        df = pd.read_parquet(file)

        # Handle column renaming for pressure files
        if "pressure_level" in df.columns:
            df.rename(columns={"pressure_level": "level"}, inplace=True)

        # Group by month and calculate statistics
        df["time"] = pd.to_datetime(df["time"])
        grouped = df.groupby([pd.Grouper(key="time", freq="ME"), "longitude", "latitude"])
        monthly_stats = grouped.agg({
            column_name: ["min", "max", "mean"]
        }).reset_index()

        # Flatten column names
        monthly_stats.columns = [
            "time" if col[0] == "time" else "_".join(col).strip("_")
            for col in monthly_stats.columns
        ]
        return monthly_stats

    # Process surface variables
    for variable, file_name in surface_file_naming.items():
        abbreviation = variable_abbreviations[variable]
        file_path = os.path.join(country_path, "surface", f"{year}_{country}_surface_{file_name}.parquet")
        if os.path.exists(file_path):
            monthly_stats_list.append(process_file(file_path, abbreviation))
        else:
            missing_files.append(file_path)

    # Process pressure variables
    for variable, file_name in pressure_file_naming.items():
        abbreviation = variable_abbreviations[variable]
        file_path = os.path.join(country_path, "pressure", f"{year}_{country}_pressure_{file_name}.parquet")
        if os.path.exists(file_path):
            monthly_stats_list.append(process_file(file_path, abbreviation))
        else:
            missing_files.append(file_path)

    # Combine and save results
    if monthly_stats_list:
        combined_stats = pd.concat(monthly_stats_list, axis=1)
        combined_stats = combined_stats.loc[:, ~combined_stats.columns.duplicated()]  # Remove duplicate columns
        output_file = os.path.join(output_dir, f"{country}_{year}_monthly_stats.parquet")
        combined_stats.to_parquet(output_file, index=False)
        print(f"Saved combined monthly statistics for {country} in {year} to {output_file}")
    else:
        print(f"No data processed for {country} in {year}.")
        for file in missing_files:
            print(f"  Missing file: {file}")

if __name__ == "__main__":
    # List of countries to process
    countries = [#"Bahrain",
                 "Saudi_Arabia",
                 "Oman",
                 "Qatar",
                 "United_Arab_Emirates",
                 "Kuwait",
                 "Yemen"
                ]

    for year in years:
        for country in countries:
            calculate_monthly_statistics(year, country, surface_file_naming, pressure_file_naming, variable_abbreviations)


MemoryError: Unable to allocate 1.18 GiB for an array with shape (1, 158236000) and data type object

In [None]:
import os
import pandas as pd
from pathlib import Path

# Define the years to process
years = [str(year) for year in range(1989, 2001)]

# Dictionaries for surface and pressure variables
surface_file_naming = {
    "2m_temperature": "2m_temperature",
    "2m_dewpoint_temperature": "2m_dewpoint_temperature",
    "10m_u_component_of_wind": "10m_u_component_of_wind",
    "10m_v_component_of_wind": "10m_v_component_of_wind",
    "surface_solar_radiation_downwards": "surface_solar_radiation_downwards",
    "surface_thermal_radiation_downwards": "surface_thermal_radiation_downwards",
    "evaporation": "evaporation",
    "potential_evaporation": "potential_evaporation",
    "sea_surface_temperature": "sea_surface_temperature",
    "soil_temperature_level_1": "soil_temperature_level_1",
    "soil_temperature_level_2": "soil_temperature_level_2",
    "soil_temperature_level_3": "soil_temperature_level_3",
    "soil_temperature_level_4": "soil_temperature_level_4",
    "volumetric_soil_water_layer_1": "volumetric_soil_water_layer_1",
    "volumetric_soil_water_layer_2": "volumetric_soil_water_layer_2",
    "volumetric_soil_water_layer_3": "volumetric_soil_water_layer_3",
    "volumetric_soil_water_layer_4": "volumetric_soil_water_layer_4",
    "leaf_area_index_high_vegetation": "leaf_area_index_high_vegetation",
    "leaf_area_index_low_vegetation": "leaf_area_index_low_vegetation",
    "surface_pressure": "surface_pressure",
    "mean_sea_level_pressure": "mean_sea_level_pressure",
    "convective_available_potential_energy": "convective_available_potential_energy",
    "geopotential": "geopotential",
    "instantaneous_10m_wind_gust": "instantaneous_10m_wind_gust",
    "total_precipitation": "total_precipitation",
    "k_index": "k_index",
}

pressure_file_naming = {
    "geopotential": "geopotential",
    "relative_humidity": "relative_humidity",
    "temperature": "temperature",
    "u_component_of_wind": "u_component_of_wind",
    "v_component_of_wind": "v_component_of_wind",
    "vertical_velocity": "vertical_velocity",
    "vorticity": "vorticity",
}

# Variable abbreviations for accessing data
variable_abbreviations = {
    "2m_temperature": "t2m",
    "2m_dewpoint_temperature": "d2m",
    "10m_u_component_of_wind": "u10",
    "10m_v_component_of_wind": "v10",
    "surface_solar_radiation_downwards": "ssrd",
    "surface_thermal_radiation_downwards": "strd",
    "evaporation": "e",
    "potential_evaporation": "pev",
    "sea_surface_temperature": "sst",
    "soil_temperature_level_1": "stl1",
    "soil_temperature_level_2": "stl2",
    "soil_temperature_level_3": "stl3",
    "soil_temperature_level_4": "stl4",
    "volumetric_soil_water_layer_1": "swvl1",
    "volumetric_soil_water_layer_2": "swvl2",
    "volumetric_soil_water_layer_3": "swvl3",
    "volumetric_soil_water_layer_4": "swvl4",
    "leaf_area_index_high_vegetation": "lai_hv",
    "leaf_area_index_low_vegetation": "lai_lv",
    "surface_pressure": "sp",
    "mean_sea_level_pressure": "msl",
    "convective_available_potential_energy": "cape",
    "geopotential": "z",
    "instantaneous_10m_wind_gust": "i10fg",
    "total_precipitation": "tp",
    "k_index": "kx",
    "relative_humidity": "r",
    "temperature": "t",
    "u_component_of_wind": "u",
    "v_component_of_wind": "v",
    "vertical_velocity": "w",
    "vorticity": "vo",
}

# File paths
BASE_PATH = r"Z:\Thesis\Data\Met\ERA5_parquet_test"
OUTPUT_PATH = r"Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats"

def calculate_monthly_statistics(year, country, surface_file_naming, pressure_file_naming, variable_abbreviations):
    """
    Calculate monthly statistics for all variables and save the results as separate Parquet files for surface and pressure data.
    """
    country_path = os.path.join(BASE_PATH, year, country)
    output_dir = os.path.join(OUTPUT_PATH, year)
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Separate results for surface and pressure data
    surface_stats_list = []
    pressure_stats_list = []
    missing_files = []

    def process_file(file, column_name, include_level=False):
        """
        Process an individual file, calculate monthly statistics, and append to the results list.
        """
        df = pd.read_parquet(file)

        # Define grouping columns based on whether `level` should be included
        group_by_columns = ["longitude", "latitude", pd.Grouper(key="time", freq="ME")]
        if include_level and "level" in df.columns:
            group_by_columns.append("level")

        # Group by month and calculate statistics
        grouped = df.groupby(group_by_columns)
        monthly_stats = grouped.agg({
            column_name: ["min", "max", "mean"]
        }).reset_index()

        # Flatten column names
        monthly_stats.columns = [
            col if isinstance(col, str) else "_".join(col).strip("_")
            for col in monthly_stats.columns
        ]
        return monthly_stats

    # Process surface variables (no dynamic levels)
    for variable, file_name in surface_file_naming.items():
        abbreviation = variable_abbreviations[variable]
        file_path = os.path.join(country_path, "surface", f"{year}_{country}_surface_{file_name}.parquet")
        if os.path.exists(file_path):
            surface_stats_list.append(process_file(file_path, abbreviation, include_level=True))
        else:
            missing_files.append(file_path)

    # Process pressure variables (include dynamic levels)
    for variable, file_name in pressure_file_naming.items():
        abbreviation = variable_abbreviations[variable]
        file_path = os.path.join(country_path, "pressure", f"{year}_{country}_pressure_{file_name}.parquet")
        if os.path.exists(file_path):
            pressure_stats_list.append(process_file(file_path, abbreviation, include_level=True))
        else:
            missing_files.append(file_path)

    # Save surface data
    if surface_stats_list:
        combined_surface_stats = pd.concat(surface_stats_list, axis=1)
        combined_surface_stats = combined_surface_stats.loc[:, ~combined_surface_stats.columns.duplicated()]  # Remove duplicate columns
        surface_output_file = os.path.join(output_dir, f"{country}_{year}_surface_monthly_stats.parquet")
        combined_surface_stats.to_parquet(surface_output_file, index=False)
        print(f"Saved surface monthly statistics for {country} in {year} to {surface_output_file}")
    else:
        print(f"No surface data processed for {country} in {year}.")
    
    # Save pressure data
    if pressure_stats_list:
        combined_pressure_stats = pd.concat(pressure_stats_list, axis=1)
        combined_pressure_stats = combined_pressure_stats.loc[:, ~combined_pressure_stats.columns.duplicated()]  # Remove duplicate columns
        pressure_output_file = os.path.join(output_dir, f"{country}_{year}_pressure_monthly_stats.parquet")
        combined_pressure_stats.to_parquet(pressure_output_file, index=False)
        print(f"Saved pressure monthly statistics for {country} in {year} to {pressure_output_file}")
    else:
        print(f"No pressure data processed for {country} in {year}.")

    # Log missing files
    if missing_files:
        print(f"Missing files for {country} in {year}:")
        for file in missing_files:
            print(f"  {file}")



if __name__ == "__main__":
    # List of countries to process
    countries = ["Bahrain",
                 #"Saudi_Arabia",
                 #"Oman",
                 #"Qatar",
                 #"United_Arab_Emirates",
                 #"Kuwait",
                 #"Yemen"
                ]

    for year in years:
        for country in countries:
            calculate_monthly_statistics(year, country, surface_file_naming, pressure_file_naming, variable_abbreviations)


Saved surface monthly statistics for Bahrain in 1989 to Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats\1989\Bahrain_1989_surface_monthly_stats.parquet
Saved pressure monthly statistics for Bahrain in 1989 to Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats\1989\Bahrain_1989_pressure_monthly_stats.parquet
Saved surface monthly statistics for Bahrain in 1990 to Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats\1990\Bahrain_1990_surface_monthly_stats.parquet
Saved pressure monthly statistics for Bahrain in 1990 to Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats\1990\Bahrain_1990_pressure_monthly_stats.parquet
