In [7]:
import os
import pandas as pd
import h3

# Define the year range
start_year = 2013
end_year = 2023  # inclusive

# Countries for ERA5 data
countries = [
    "Saudi_Arabia",
    "United_Arab_Emirates",
    "Yemen",
    "Bahrain",
    "Kuwait",
    "Oman",
    "Qatar"
]

# Base directories
merra2_base = r"Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats"
era5_base = r"Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats"

# Compile a list of file paths
file_paths = []

for year in range(start_year, end_year+1):
    # MERRA2 path for the given year
    merra2_path = os.path.join(merra2_base, str(year), f"MERRA2_{year}_monthly_stats.parquet")
    file_paths.append(merra2_path)

    # ERA5 paths for the given year and each country
    for country in countries:
        pressure_path = os.path.join(era5_base, str(year), f"{country}_{year}_pressure_monthly_stats.parquet")
        surface_path = os.path.join(era5_base, str(year), f"{country}_{year}_surface_monthly_stats.parquet")
        file_paths.append(pressure_path)
        file_paths.append(surface_path)

# Define which H3 resolutions we want
h3_resolutions = [3, 4, 5, 6]

for file_path in file_paths:
    if not os.path.exists(file_path):
        print(f"File does not exist, skipping: {file_path}")
        continue

    print(f"Processing {file_path}")

    # Load the parquet file into a DataFrame
    df = pd.read_parquet(file_path)

    # Determine if file is MERRA2 or ERA5 based on file path
    if "MERRA2" in file_path:
        # MERRA2 uses 'lat' and 'lon'
        # Check if the columns exist as expected
        if 'lat' not in df.columns or 'lon' not in df.columns:
            print(f"lat/lon columns not found for MERRA2 file: {file_path}, skipping.")
            continue
    else:
        # ERA5 uses 'latitude' and 'longitude', so we rename them to 'lat' and 'lon'
        if 'latitude' in df.columns and 'longitude' in df.columns:
            df.rename(columns={'latitude': 'lat', 'longitude': 'lon'}, inplace=True)
        # After attempted rename, check if 'lat' and 'lon' now exist
        if 'lat' not in df.columns or 'lon' not in df.columns:
            print(f"lat/lon columns not found for ERA5 file: {file_path}, skipping.")
            continue


    # Compute H3 indices at desired resolutions
    for res in h3_resolutions:
        h3_col_name = f"h3_res_{res}"
        # Compute the H3 string index
        df[h3_col_name] = df.apply(lambda row: h3.geo_to_h3(row['lat'], row['lon'], res), axis=1)
        
        # Convert the H3 string to a numeric index
        h3_int_col = f"h3_res_{res}_int"
        df[h3_int_col] = df[h3_col_name].apply(lambda x: int(x, 16))

    # Save the updated DataFrame back to parquet (overwriting)
    df.to_parquet(file_path, index=False)
    print(f"Updated file saved: {file_path}")


Processing Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2013\MERRA2_2013_monthly_stats.parquet
Updated file saved: Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2013\MERRA2_2013_monthly_stats.parquet
Processing Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats\2013\Saudi_Arabia_2013_pressure_monthly_stats.parquet
Updated file saved: Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats\2013\Saudi_Arabia_2013_pressure_monthly_stats.parquet
Processing Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats\2013\Saudi_Arabia_2013_surface_monthly_stats.parquet
Updated file saved: Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats\2013\Saudi_Arabia_2013_surface_monthly_stats.parquet
Processing Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats\2013\United_Arab_Emirates_2013_pressure_monthly_stats.parquet
Updated file saved: Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats\2013\United_Arab_Emirates_2013_pressure_monthly_stats.parquet
Processing Z:\Thesis\Data\Met\ERA5

In [8]:
import pandas as pd
import os

# Base paths
era5_base_path = r"Z:\Thesis\Data\Met\ERA5_parquet_test\Monthly_Stats"
merra2_base_path = r"Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats"
output_base_path = r"Z:\Thesis\Data\ML_Data\AP_ML_training_data"

# Columns to merge for surface and pressure data
surface_aod_cols = [
    "DUSMASS_min", "DUSMASS_max", "DUSMASS_mean",
    "DUSMASS25_min", "DUSMASS25_max", "DUSMASS25_mean",
    "DUFLUXU_min", "DUFLUXU_max", "DUFLUXU_mean",
    "DUFLUXV_min", "DUFLUXV_max", "DUFLUXV_mean"
]

pressure_aod_cols = [
    "DUCMASS_min", "DUCMASS_max", "DUCMASS_mean",
    "DUCMASS25_min", "DUCMASS25_max", "DUCMASS25_mean",
    "DUFLUXU_min", "DUFLUXU_max", "DUFLUXU_mean",
    "DUFLUXV_min", "DUFLUXV_max", "DUFLUXV_mean"
]

# Process each year
for year in range(2013, 2024):
    print(f"Processing year: {year}")

    # Paths for the current year
    merra2_file = os.path.join(merra2_base_path, str(year), f"MERRA2_{year}_monthly_stats.parquet")
    era5_year_path = os.path.join(era5_base_path, str(year))

    if not os.path.exists(merra2_file):
        print(f"Skipping year {year}: MERRA2 file not found.")
        continue
    if not os.path.exists(era5_year_path):
        print(f"Skipping year {year}: ERA5 directory not found.")
        continue

    # Load MERRA2 data and convert the time column to datetime
    merra2_df = pd.read_parquet(merra2_file)
    merra2_df['time'] = pd.to_datetime(merra2_df['time'])

    # Process each ERA5 file in the year's directory
    for era5_file in os.listdir(era5_year_path):
        era5_file_path = os.path.join(era5_year_path, era5_file)

        # Determine if this file is surface or pressure based on its name
        filename = os.path.basename(era5_file_path).lower()
        if "surface" in filename:
            aod_cols = surface_aod_cols
        elif "pressure" in filename:
            aod_cols = pressure_aod_cols
        else:
            print(f"Skipping {era5_file_path}: Unable to determine type.")
            continue

        # Read ERA5 data and convert the time column to datetime
        era5_df = pd.read_parquet(era5_file_path)
        era5_df['time'] = pd.to_datetime(era5_df['time'])

        # Define join keys
        join_keys = ['time', 'h3_res_3']
        
        # Subset and aggregate MERRA2 data by taking the mean for each time and h3_res_3
        merra2_subset = merra2_df[join_keys + aod_cols]
        merra2_agg = merra2_subset.groupby(join_keys, as_index=False).mean()

        # Merge ERA5 with aggregated MERRA2 data
        merged_df = pd.merge(era5_df, merra2_agg, on=join_keys, how='left')

        # Drop rows that do not have a match in the aggregated MERRA2 data
        merged_df = merged_df.dropna(subset=aod_cols)

        # Save the merged output
        output_dir = os.path.join(output_base_path, str(year))
        os.makedirs(output_dir, exist_ok=True)
        output_file = os.path.join(output_dir, os.path.splitext(era5_file)[0] + "_merged.parquet")
        merged_df.to_parquet(output_file, index=False)
        print(f"Saved merged file: {output_file}")


Processing year: 2013
Saved merged file: Z:\Thesis\Data\ML_Data\AP_ML_training_data\2013\Bahrain_2013_pressure_monthly_stats_merged.parquet
Saved merged file: Z:\Thesis\Data\ML_Data\AP_ML_training_data\2013\Bahrain_2013_surface_monthly_stats_merged.parquet
Saved merged file: Z:\Thesis\Data\ML_Data\AP_ML_training_data\2013\Kuwait_2013_pressure_monthly_stats_merged.parquet
Saved merged file: Z:\Thesis\Data\ML_Data\AP_ML_training_data\2013\Kuwait_2013_surface_monthly_stats_merged.parquet
Saved merged file: Z:\Thesis\Data\ML_Data\AP_ML_training_data\2013\Oman_2013_pressure_monthly_stats_merged.parquet
Saved merged file: Z:\Thesis\Data\ML_Data\AP_ML_training_data\2013\Oman_2013_surface_monthly_stats_merged.parquet
Saved merged file: Z:\Thesis\Data\ML_Data\AP_ML_training_data\2013\Qatar_2013_pressure_monthly_stats_merged.parquet
Saved merged file: Z:\Thesis\Data\ML_Data\AP_ML_training_data\2013\Qatar_2013_surface_monthly_stats_merged.parquet
Saved merged file: Z:\Thesis\Data\ML_Data\AP_ML_tr

In [9]:
import pandas as pd
import os
from glob import glob

# Paths to DMI CSV files (using raw strings to avoid escape issues)
dmi_csv_files = {
    "DMI_EAST_HadISST1.1": r"Z:\Thesis\Data\Met\DMI\dmieast.had.long.csv",
    "DMI_HadISST1.1": r"Z:\Thesis\Data\Met\DMI\dmi.had.long.csv"
}

# Paths to parquet files
data_directory = r"Z:\Thesis\Data\ML_Data\AP_ML_training_data"
output_directory = r"Z:\Thesis\Data\ML_Data\AP_ML_training_data"
years = list(range(2013, 2024))  # Adjust the range of years as needed

print("Paths defined")

def load_dmi_data(file_paths):
    """
    Loads DMI CSV files, converts the date string to a datetime,
    creates 'year' and 'month' columns, and then merges the datasets
    on these two columns.
    """
    dmi_data_final = None
    for column_name, file_path in file_paths.items():
        # Use comma as the delimiter
        dmi_data = pd.read_csv(file_path, sep=',')
        # Remove any accidental spaces in the column names
        dmi_data.columns = dmi_data.columns.str.strip()
        # Convert the Date column (e.g. "1/1/1980") to datetime
        dmi_data['Date'] = pd.to_datetime(dmi_data['Date'], format='%m/%d/%Y', errors='coerce')
        # Create separate year and month columns
        dmi_data['year'] = dmi_data['Date'].dt.year
        dmi_data['month'] = dmi_data['Date'].dt.month
        # Keep only the year, month, and the desired DMI column
        dmi_data = dmi_data[['year', 'month', column_name]]
        
        # Merge this dataframe with the accumulated data
        if dmi_data_final is None:
            dmi_data_final = dmi_data
        else:
            dmi_data_final = pd.merge(dmi_data_final, dmi_data, on=['year', 'month'], how='outer')
            
    return dmi_data_final

dmi_data = load_dmi_data(dmi_csv_files)
print("DMI data loaded and processed:")
print(dmi_data.head())

def process_and_merge_parquet_files(base_dir, years, dmi_data, output_dir):
    """
    For each parquet file in each year's directory:
      - Drop the existing (blank) DMI columns from the parquet data,
      - Convert the 'time' column to datetime and create 'year' and 'month' columns,
      - Merge with the DMI data on ['year', 'month'],
      - Save the merged dataset.
    """
    for year in years:
        year_path = os.path.join(base_dir, str(year))
        print(f"\nProcessing directory: {year_path}")
        parquet_files = glob(os.path.join(year_path, "*.parquet"))
        print(f"Found {len(parquet_files)} parquet files.")
        
        if not parquet_files:
            print(f"No parquet files found for {year}")
            continue

        for parquet_file in parquet_files:
            try:
                parquet_data = pd.read_parquet(parquet_file)
            except Exception as e:
                print(f"Error reading {parquet_file}: {e}")
                continue

            # Drop the blank DMI columns from the parquet file (if they exist)
            for col in ["DMI_EAST_HadISST1.1", "DMI_HadISST1.1"]:
                if col in parquet_data.columns:
                    parquet_data.drop(columns=[col], inplace=True)

            # Convert the 'time' column to datetime (format like "1/31/1980")
            parquet_data['time'] = pd.to_datetime(parquet_data['time'], format='%m/%d/%Y', errors='coerce')
            # Create year and month columns for merging
            parquet_data['year'] = parquet_data['time'].dt.year
            parquet_data['month'] = parquet_data['time'].dt.month

            # Merge on year and month
            merged_data = parquet_data.merge(dmi_data, on=['year', 'month'], how='left')

            # Build the output file path and ensure the directory exists
            relative_path = os.path.relpath(parquet_file, base_dir)
            output_file = os.path.join(output_dir, relative_path)
            os.makedirs(os.path.dirname(output_file), exist_ok=True)
            merged_data.to_parquet(output_file, index=False)

            print(f"Merged data saved to {output_file}")

process_and_merge_parquet_files(data_directory, years, dmi_data, output_directory)
print("\nProcess completed")


Paths defined
DMI data loaded and processed:
     year  month  DMI_EAST_HadISST1.1  DMI_HadISST1.1
0  1900.0    1.0               -0.226          -0.403
1  1900.0    2.0               -0.105          -0.213
2  1900.0    3.0               -0.163          -0.364
3  1900.0    4.0               -0.097          -0.244
4  1900.0    5.0               -0.304          -0.080

Processing directory: Z:\Thesis\Data\ML_Data\AP_ML_training_data\2013
Found 14 parquet files.
Merged data saved to Z:\Thesis\Data\ML_Data\AP_ML_training_data\2013\Bahrain_2013_pressure_monthly_stats_merged.parquet
Merged data saved to Z:\Thesis\Data\ML_Data\AP_ML_training_data\2013\Bahrain_2013_surface_monthly_stats_merged.parquet
Merged data saved to Z:\Thesis\Data\ML_Data\AP_ML_training_data\2013\Kuwait_2013_pressure_monthly_stats_merged.parquet
Merged data saved to Z:\Thesis\Data\ML_Data\AP_ML_training_data\2013\Kuwait_2013_surface_monthly_stats_merged.parquet
Merged data saved to Z:\Thesis\Data\ML_Data\AP_ML_training_d