In [1]:
import os
import pandas as pd

# ------------------------- CONFIGURATION -------------------------
# Input root (the loader looks in one sub-folder per year) and the folder
# that receives the summary CSV.
base_path = r"Z:\Thesis\Data\ML_Data\AP_training2"
output_dir = r"Z:\Thesis\Data\stat_analysis\AP_means"
os.makedirs(output_dir, exist_ok=True)  # no-op when the folder already exists

# Two training periods: 1980-1988 and 2016-2021 (both ends inclusive).
train_years = [*range(1980, 1989), *range(2016, 2022)]

# Country names as they are spelled in the parquet file names.
countries = [
    "Yemen", "Bahrain", "Kuwait", "Oman",
    "Qatar", "Saudi_Arabia", "United_Arab_Emirates",
]

# Columns whose overall mean is reported: the "<variable>_mean" column of
# every variable named below.
selected_columns = [
    f"{variable}_mean"
    for variable in (
        "z", "r", "t", "u", "v", "w", "vo",
        "DUCMASS", "DUCMASS25", "DUFLUXU", "DUFLUXV",
    )
]

# Filled by the loading step with one DataFrame per file that could be read.
all_dfs = []

# ------------------------- DATA LOADING & COMBINING -------------------------
# Walk every country/year combination (countries outermost) and collect each
# file that exists and can be parsed; problems are reported, not raised.
for country, yr in [(c, y) for c in countries for y in train_years]:
    file_name = f"{country}_{yr}_pressure_monthly_stats_merged.parquet"
    file_path = os.path.join(base_path, str(yr), file_name)

    # Guard clause: nothing to read for this combination.
    if not os.path.exists(file_path):
        print(f"File does not exist: {file_path}")
        continue

    try:
        df = pd.read_parquet(file_path)

        # Narrow to the configured columns only when the file has all of
        # them; otherwise the frame is kept exactly as it was read.
        if set(selected_columns).issubset(df.columns):
            df = df[selected_columns]

        all_dfs.append(df)
    except Exception as e:
        print(f"Could not read {file_path} due to error: {e}")

# ------------------------- COMPUTE MEANS & SAVE -------------------------
# Pools every loaded row, averages each configured column once, and writes
# the result as a one-column CSV (row label = column name).
if not all_dfs:
    print("No files were loaded. Please check your paths and configuration.")
else:
    # Stack all loaded files into one table; the per-file row index carries
    # no meaning after stacking, so it is discarded.
    combined_df = pd.concat(all_dfs, ignore_index=True)

    # Select only configured columns that at least one file provided, and
    # report the rest, instead of failing with a KeyError on the selection.
    available_columns = [col for col in selected_columns if col in combined_df.columns]
    missing_columns = [col for col in selected_columns if col not in combined_df.columns]
    if missing_columns:
        print(f"Columns not found in any loaded file (skipped): {missing_columns}")

    # One mean per configured column, computed a single time (the earlier
    # whole-frame mean was overwritten immediately and has been removed).
    # NOTE(review): all rows are pooled regardless of any grouping column the
    # files may carry (e.g. a pressure level) - confirm that a single overall
    # mean per variable is what is wanted here.
    means_series = combined_df[available_columns].mean()

    # Give the value column an explicit header for the CSV.
    means_df = means_series.to_frame(name="Mean_Value")

    # Save the results as a CSV
    output_file = os.path.join(output_dir, "AP_UA_means.csv")
    means_df.to_csv(output_file)

    print(f"Means saved to {output_file}")


Means saved to Z:\Thesis\Data\stat_analysis\AP_means\AP_UA_means.csv


In [3]:
import os
import math
import pandas as pd

# ------------------------- CONFIGURATION -------------------------
# Input root (the loader looks in one sub-folder per year) and the folder
# that receives the summary CSV.
base_path = r"Z:\Thesis\Data\ML_Data\AP_training2"
output_dir = r"Z:\Thesis\Data\stat_analysis\AP_means"
os.makedirs(output_dir, exist_ok=True)  # no-op when the folder already exists

# Two training periods: 1980-1988 and 2016-2021.
train_years = [
    1980, 1981, 1982, 1983, 1984,
    1985, 1986, 1987, 1988, 2016,
    2017, 2018, 2019, 2020, 2021
]

# Country names as they are spelled in the parquet file names.
countries = [
    "Yemen",
    "Bahrain",
    "Kuwait",
    "Oman",
    "Qatar",
    "Saudi_Arabia",
    "United_Arab_Emirates"
]

# Pressure levels to include. Kept as strings because they are written to
# the CSV verbatim; the compute step converts them with int() when matching.
pressure_levels = ['1000', '925', '850', '700', '500', '300', '200', '100', '50', '10']

# Selected columns to average ("level" is the grouping key, not a variable)
selected_columns = [
    "level",       # Pressure level
    "z_mean",      # Geopotential
    "r_mean",      # Relative Humidity
    "t_mean",      # Temperature
    "u_mean",      # U-component of wind
    "v_mean",      # V-component of wind
    "w_mean",      # Vertical Velocity
    "vo_mean",     # Vorticity
    "DUCMASS_mean",  # Dust column mass density
    "DUCMASS25_mean",  # Dust column mass density - PM2.5
    "DUFLUXU_mean",  # Dust column U-wind flux
    "DUFLUXV_mean"   # Dust column V-wind flux
]

# Units for each variable, written to the "Units" column of the CSV.
# The column mass densities are mass per unit AREA (kg/m²), not per volume;
# they were previously mislabelled as kg/m³.
param_units = {
    "z_mean":        "m²/s²",
    "r_mean":        "%",
    "t_mean":        "K",
    "u_mean":        "m/s",
    "v_mean":        "m/s",
    "w_mean":        "Pa/s",
    "vo_mean":       "s⁻¹",
    "DUCMASS_mean":  "kg/m²",
    "DUCMASS25_mean": "kg/m²",
    "DUFLUXU_mean":  "kg/(m·s)",
    "DUFLUXV_mean":  "kg/(m·s)",
    "wind_speed":    "m/s",
    "wind_dir":      "deg_from_N",
    "dust_flux_speed": "kg/(m·s)",
    "dust_flux_dir": "deg_from_N"
}

# Temperature variables to convert from Kelvin to Celsius
temp_vars = ["t_mean"]

# Filled by the loading step with one DataFrame per file that could be read.
all_dfs = []

# ------------------------- DATA LOADING & COMBINING -------------------------
# Reads one parquet file per country/year and appends it to all_dfs.
# Missing or unreadable files are reported and skipped.
for country in countries:
    for yr in train_years:
        file_path = os.path.join(
            base_path,
            str(yr),
            f"{country}_{yr}_pressure_monthly_stats_merged.parquet"
        )

        if not os.path.exists(file_path):
            print(f"File does not exist: {file_path}")
            continue

        try:
            df = pd.read_parquet(file_path)
        except Exception as e:
            print(f"Could not read {file_path} due to error: {e}")
            continue

        # Keep the selected columns this file actually has. Previously a
        # single missing column made the whole file fall back to ALL of its
        # columns, which then leaked into the per-level means downstream.
        present_columns = [col for col in selected_columns if col in df.columns]
        missing_columns = [col for col in selected_columns if col not in df.columns]
        if missing_columns:
            print(f"{file_path} is missing selected columns: {missing_columns}")
        if not present_columns:
            # Nothing usable in this file; skip it rather than add empty rows.
            continue

        all_dfs.append(df[present_columns])

# ------------------------- COMPUTE MEANS & SAVE -------------------------
def _magnitude_and_from_direction(x_comp, y_comp):
    """Return ``(magnitude, direction)`` for a horizontal vector.

    Parameters
    ----------
    x_comp, y_comp : float
        The two components of the vector (passed here as the u/v means).

    Returns
    -------
    tuple of float
        ``magnitude`` is the Euclidean length. ``direction`` is
        ``(270 - atan2(y, x) in degrees) mod 360``: a bearing in [0, 360)
        measured clockwise from north that points to where the vector comes
        FROM (x>0, y=0 gives 270; x=0, y>0 gives 180).
    """
    magnitude = math.hypot(x_comp, y_comp)
    direction = (270.0 - math.degrees(math.atan2(y_comp, x_comp))) % 360.0
    return magnitude, direction


if not all_dfs:
    print("No files were loaded. Please check your paths and configuration.")
else:
    combined_df = pd.concat(all_dfs, ignore_index=True)

    # Variables to summarise: the configured columns that were loaded, minus
    # the grouping key "level" (averaging the key only produced a redundant
    # row), and minus any unselected columns a file may have carried along.
    value_columns = [
        col for col in selected_columns
        if col != "level" and col in combined_df.columns
    ]

    # Compare levels numerically so the match works whether the files store
    # the level as int, float, or text; unparseable entries become NaN and
    # never match.
    level_values = pd.to_numeric(combined_df["level"], errors="coerce")

    results = []

    # Loop over pressure levels to compute averages and derived metrics
    for level in pressure_levels:
        level_df = combined_df.loc[level_values == int(level), value_columns]
        if level_df.empty:
            continue

        # Mean of every selected variable at this level
        means_series = level_df.mean(numeric_only=True)

        # Rows are collected as (variable, mean value, units) and turned into
        # a frame in one go. Adding derived rows with .loc enlargement left
        # their Pressure_Level empty in the CSV.
        records = []
        for name, value in means_series.items():
            if name in temp_vars:
                # Convert temperature from Kelvin to Celsius
                records.append((name, value - 273.15, "°C"))
            else:
                # Variables without a configured unit keep an empty Units cell
                records.append((name, value, param_units.get(name)))

        # Wind speed and direction, derived from the MEAN components: this is
        # the resultant (vector-mean) wind, not the mean of per-row speeds.
        if "u_mean" in means_series.index and "v_mean" in means_series.index:
            wind_speed, wind_dir = _magnitude_and_from_direction(
                means_series["u_mean"], means_series["v_mean"]
            )
            records.append(("wind_speed", wind_speed, "m/s"))
            records.append(("wind_dir", wind_dir, "deg_from_N"))

        # Dust flux magnitude and direction, same construction as the wind
        if "DUFLUXU_mean" in means_series.index and "DUFLUXV_mean" in means_series.index:
            dust_flux_speed, dust_flux_dir = _magnitude_and_from_direction(
                means_series["DUFLUXU_mean"], means_series["DUFLUXV_mean"]
            )
            records.append(("dust_flux_speed", dust_flux_speed, "kg/(m·s)"))
            records.append(("dust_flux_dir", dust_flux_dir, "deg_from_N"))

        means_df = pd.DataFrame(
            records, columns=["Variable", "Mean_Value", "Units"]
        ).set_index("Variable")
        # Same column order as before (Mean_Value, Pressure_Level, Units);
        # the level now labels EVERY row, derived ones included.
        means_df.insert(1, "Pressure_Level", level)

        # Append the results for this level
        results.append(means_df)

    if not results:
        # pd.concat([]) raises ValueError; report the situation instead.
        print("None of the requested pressure levels were found in the loaded data; nothing was saved.")
    else:
        # Concatenate all results
        final_df = pd.concat(results)

        # Save the results as CSV
        final_output_file = os.path.join(output_dir, "AP_UA_means_with_calc.csv")
        final_df.to_csv(final_output_file, index_label="Variable")
        print(f"Pressure-level means (with derived metrics) saved to {final_output_file}")


Pressure-level means (with derived metrics) saved to Z:\Thesis\Data\stat_analysis\AP_pressure_means\AP_Pressure_Means_with_Derived.csv
