In [2]:
import os
import pandas as pd

# ------------------------- CONFIGURATION -------------------------
# Input root: the loader below expects <base_path>/<year>/<country>_<year>_surface_monthly_stats_merged.parquet.
# NOTE(review): both paths are absolute and tied to a mapped Z: drive, so this
# cell only runs on a machine with that drive mounted.
base_path = r"Z:\Thesis\Data\ML_Data\AP_training2"
# Folder that receives the CSV of means; created here if it does not exist yet.
output_dir = r"Z:\Thesis\Data\stat_analysis\AP_means"
os.makedirs(output_dir, exist_ok=True)

# Years pooled into the means: two contiguous spans, 1980-1988 and 2016-2021
# (both inclusive).
train_years = [*range(1980, 1989), *range(2016, 2022)]

# Country names, spelled exactly as they appear in the parquet file names.
countries = [
    "Yemen", "Bahrain", "Kuwait", "Oman",
    "Qatar", "Saudi_Arabia", "United_Arab_Emirates",
]

# Variable stems; the column that gets averaged for each one is "<stem>_mean".
# NOTE(review): the files presumably also carry "<stem>_min" / "<stem>_max"
# columns (an earlier version of this list had them commented out) -- confirm
# before adding them here.
_variable_stems = [
    "t2m", "d2m", "u10", "v10",
    "ssrd", "strd",
    "e", "pev",
    "sst", "stl1", "stl2", "stl3", "stl4",
    "swvl1", "swvl2", "swvl3", "swvl4",
    "lai_hv", "lai_lv",
    "sp", "msl", "cape", "z", "i10fg", "tp", "kx",
    "DUSMASS", "DUSMASS25", "DUFLUXU", "DUFLUXV",
]
selected_columns = [f"{stem}_mean" for stem in _variable_stems]

all_dfs = []

# ------------------------- DATA LOADING & COMBINING -------------------------
# Read one parquet file per (country, year) pair. Missing or unreadable files
# are reported and skipped so that a single gap does not abort the whole run.
for country in countries:
    for yr in train_years:
        file_path = os.path.join(
            base_path,
            str(yr),
            f"{country}_{yr}_surface_monthly_stats_merged.parquet"
        )

        if not os.path.exists(file_path):
            print(f"File does not exist: {file_path}")
            continue

        try:
            df = pd.read_parquet(file_path)

            # Always restrict the frame to the selected columns. Previously a
            # file lacking even one of them was appended unfiltered, which
            # silently dragged every other column into the combined frame.
            # reindex() keeps the selected columns in a fixed order and fills
            # any that are absent with NaN, which mean() skips later on.
            missing_cols = [col for col in selected_columns if col not in df.columns]
            if missing_cols:
                print(f"Warning: {file_path} is missing columns {missing_cols}; filled with NaN")

            all_dfs.append(df.reindex(columns=selected_columns))
        except Exception as e:
            # Deliberate best-effort: report the problem and keep going.
            print(f"Could not read {file_path} due to error: {e}")

# ------------------------- COMPUTE MEANS & SAVE -------------------------
if not all_dfs:
    print("No files were loaded. Please check your paths and configuration.")
else:
    # Pool the rows of every loaded file into one frame.
    combined_df = pd.concat(all_dfs, ignore_index=True)

    # Column-wise mean over all pooled rows (NaN entries are skipped).
    # reindex() is used instead of combined_df[selected_columns] so that a
    # column absent from every loaded file yields a NaN mean rather than a
    # KeyError. The earlier all-columns mean was dropped: its result was
    # overwritten immediately and never used.
    absent_cols = [col for col in selected_columns if col not in combined_df.columns]
    if absent_cols:
        print(f"Warning: columns missing from every loaded file (mean will be NaN): {absent_cols}")
    means_series = combined_df.reindex(columns=selected_columns).mean()

    # One-column frame so the CSV gets an explicit "Mean_Value" header.
    means_df = means_series.to_frame(name="Mean_Value")

    # Save the results as a CSV (index = column name, value = mean).
    output_file = os.path.join(output_dir, "AP_SFC_means.csv")
    means_df.to_csv(output_file)

    print(f"Means saved to {output_file}")


Means saved to Z:\Thesis\Data\stat_analysis\AP_means\AP_means.csv


In [2]:
import os
import math
import pandas as pd

# ------------------------- CONFIGURATION -------------------------
# Input root: the loader below expects <base_path>/<year>/<country>_<year>_surface_monthly_stats_merged.parquet.
# NOTE(review): both paths are absolute and tied to a mapped Z: drive, so this
# cell only runs on a machine with that drive mounted.
base_path = r"Z:\Thesis\Data\ML_Data\AP_ML_training_data"
# Folder that receives the CSV of means; created here if it does not exist yet.
output_dir = r"Z:\Thesis\Data\stat_analysis\AP_means"
os.makedirs(output_dir, exist_ok=True)

# Years pooled into the means: two contiguous spans, 1980-2000 and 2013-2023
# (both inclusive).
train_years = [*range(1980, 2001), *range(2013, 2024)]

# Country names, spelled exactly as they appear in the parquet file names.
countries = [
    "Yemen", "Bahrain", "Kuwait", "Oman",
    "Qatar", "Saudi_Arabia", "United_Arab_Emirates",
]

# Variable stems; the column that gets averaged for each one is "<stem>_mean".
_variable_stems = [
    "t2m", "d2m", "u10", "v10",
    "ssrd", "strd",
    "e", "pev",
    "sst", "stl1", "stl2", "stl3", "stl4",
    "swvl1", "swvl2", "swvl3", "swvl4",
    "lai_hv", "lai_lv",
    "sp", "msl", "cape", "z", "i10fg", "tp", "kx",
    "DUSMASS", "DUSMASS25", "DUFLUXU", "DUFLUXV",
]
selected_columns = [f"{stem}_mean" for stem in _variable_stems]

############################################################################
# 1) Units attached to each averaged column (adjust if your data differs).
#
# NOTE(review): the original banner called these "typical MERRA-2 parameter
# units", but most short names (t2m, d2m, ssrd, swvl1, ...) look like ERA5
# variables; only the DU* columns look like MERRA-2 -- confirm the sources.
############################################################################

# Temperature columns: stored in Kelvin, converted to Celsius further down.
temp_vars = [
    "t2m_mean", "d2m_mean", "sst_mean",
    "stl1_mean", "stl2_mean", "stl3_mean", "stl4_mean"
]

# (unit, columns) groups, listed in the order the columns should appear in
# the mapping. A unit may occur in more than one group.
_units_by_group = [
    ("K", temp_vars),                                   # temperatures
    ("m/s", ["u10_mean", "v10_mean"]),                  # 10-m wind components
    ("J/m^2", ["ssrd_mean", "strd_mean"]),              # accumulated radiation
    ("m of water", ["e_mean", "pev_mean"]),             # evaporation
    ("m^3/m^3", ["swvl1_mean", "swvl2_mean",
                 "swvl3_mean", "swvl4_mean"]),          # soil moisture
    ("m^2/m^2", ["lai_hv_mean", "lai_lv_mean"]),        # leaf area index
    ("Pa", ["sp_mean", "msl_mean"]),                    # pressure
    ("J/kg", ["cape_mean"]),
    # NOTE(review): ERA5 geopotential `z` is natively in m^2/s^2; "m" assumes
    # it was already divided by g upstream -- confirm.
    ("m", ["z_mean"]),
    ("m/s", ["i10fg_mean"]),                            # 10-m wind gust
    ("m of water", ["tp_mean"]),                        # total precipitation
    ("K", ["kx_mean"]),                                 # K-index, reported in K
    ("kg/m^3", ["DUSMASS_mean", "DUSMASS25_mean"]),     # dust mass concentration
    ("kg/(m·s)", ["DUFLUXU_mean", "DUFLUXV_mean"]),     # dust flux components
]

# Flatten the groups into the column -> unit lookup used when labelling means.
param_units = {
    column: unit
    for unit, columns in _units_by_group
    for column in columns
}

all_dfs = []

# ------------------------- DATA LOADING & COMBINING -------------------------
# Read one parquet file per (country, year) pair. Missing or unreadable files
# are reported and skipped so that a single gap does not abort the whole run.
for country in countries:
    for yr in train_years:
        file_path = os.path.join(
            base_path,
            str(yr),
            f"{country}_{yr}_surface_monthly_stats_merged.parquet"
        )

        if not os.path.exists(file_path):
            print(f"File does not exist: {file_path}")
            continue

        try:
            df = pd.read_parquet(file_path)

            # Always restrict the frame to the selected columns. Previously a
            # file lacking even one of them was appended unfiltered, which
            # silently dragged every other column into the combined frame.
            # reindex() keeps the selected columns in a fixed order and fills
            # any that are absent with NaN, which mean() skips later on.
            missing_cols = [col for col in selected_columns if col not in df.columns]
            if missing_cols:
                print(f"Warning: {file_path} is missing columns {missing_cols}; filled with NaN")

            all_dfs.append(df.reindex(columns=selected_columns))
        except Exception as e:
            # Deliberate best-effort: report the problem and keep going.
            print(f"Could not read {file_path} due to error: {e}")

# ------------------------- COMPUTE MEANS & SAVE -------------------------
def _magnitude_and_from_direction(x_comp, y_comp):
    """Return (magnitude, direction) of the 2-D vector (x_comp, y_comp).

    x_comp is the eastward component and y_comp the northward component.
    The direction follows the meteorological "from" convention: degrees
    clockwise from north, naming where the vector comes from, in [0, 360).
    NaN inputs propagate to NaN outputs without raising.
    """
    magnitude = math.sqrt(x_comp**2 + y_comp**2)
    theta_math_deg = math.degrees(math.atan2(y_comp, x_comp))  # standard atan2(y, x)
    return magnitude, (270.0 - theta_math_deg) % 360.0


if not all_dfs:
    print("No files were loaded. Please check your paths and configuration.")
else:
    # Pool the rows of every loaded file into one frame.
    combined_df = pd.concat(all_dfs, ignore_index=True)

    # reindex() instead of combined_df[selected_columns]: a column absent from
    # every loaded file then yields a NaN mean instead of raising KeyError.
    absent_cols = [col for col in selected_columns if col not in combined_df.columns]
    if absent_cols:
        print(f"Warning: columns missing from every loaded file (mean will be NaN): {absent_cols}")
    means_series = combined_df.reindex(columns=selected_columns).mean(numeric_only=True)

    # One-column frame indexed by column name.
    means_df = means_series.to_frame(name="Mean_Value")

    # -------------- 1) Convert Kelvin -> Celsius --------------
    is_temp = means_df.index.isin(temp_vars)
    means_df.loc[is_temp, "Mean_Value"] -= 273.15

    # -------------- 2) Attach units --------------
    # Temperature rows were just converted, so they are labelled in Celsius;
    # every other row takes its unit from param_units (None when unlisted).
    means_df["Units"] = [
        "°C" if param in temp_vars else param_units.get(param)
        for param in means_df.index
    ]

    # -------------- 3) Derived magnitude/direction rows --------------
    # Each entry: (x column, y column, magnitude row, magnitude unit, direction row).
    # NOTE: these are computed from the *mean components*, so e.g.
    # wind_speed_10m is the magnitude of the mean wind vector, which is less
    # than or equal to the mean of the individual wind speeds.
    derived_vectors = [
        ("u10_mean", "v10_mean", "wind_speed_10m", "m/s", "wind_dir_10m"),
        ("DUFLUXU_mean", "DUFLUXV_mean", "dust_flux_speed", "kg/(m·s)", "dust_flux_dir"),
    ]
    for x_name, y_name, mag_name, mag_units, dir_name in derived_vectors:
        if x_name in means_df.index and y_name in means_df.index:
            magnitude, direction = _magnitude_and_from_direction(
                means_df.loc[x_name, "Mean_Value"],
                means_df.loc[y_name, "Mean_Value"],
            )

            means_df.loc[mag_name, "Mean_Value"] = magnitude
            means_df.loc[mag_name, "Units"] = mag_units

            means_df.loc[dir_name, "Mean_Value"] = direction
            means_df.loc[dir_name, "Units"] = "deg_from_N"

    # -------------- 4) Save the results as CSV --------------
    # Explicit column order: Mean_Value first, Units second.
    means_df = means_df[["Mean_Value", "Units"]]

    output_file = os.path.join(output_dir, "AP_SFC_means_with_calcs_feb25.csv")
    means_df.to_csv(output_file)
    print(f"Means (with derived parameters) saved to {output_file}")


Means (with derived parameters) saved to Z:\Thesis\Data\stat_analysis\AP_means\AP_SFC_means_with_calcs_feb25.csv


In [None]:
# Only calculates averages for the lowest ~5,000 ft of the atmosphere (pressure levels 1000, 925, and 850 hPa)

