In [8]:
import pandas as pd
import os
from glob import glob

# Paths to DMI CSV files
dmi_csv_files = {
    "DMI_EAST_HadISST1.1": r"Z:\Thesis\Data\Met\DMI\dmieast.had.long.csv",
    "DMI_HadISST1.1": r"Z:\Thesis\Data\Met\DMI\dmieast.had.long.csv"
}

# Paths to parquet files
data_directory = "Z:\\Thesis\\Data\\ML_Data\\AP__ML_training_data"
output_directory = "Z:\\Thesis\\Data\\ML_Data\\AP__ML_training_data"
years = list(range(1980, 1982)) #+ list(range(2016, 2024))

print(f"paths defined")

# Read and preprocess DMI data
def load_dmi_data(file_paths):
    dmi_frames = []
    for column_name, file_path in file_paths.items():
        dmi_data = pd.read_csv(file_path)
        # Parse dates and set them to the last day of the month
        dmi_data['Date'] = pd.to_datetime(dmi_data['Date'], format='%m/%d/%Y', errors='coerce')
        dmi_data['Date'] = dmi_data['Date'] + pd.offsets.MonthEnd(0)
        # Rename the value column to match the key for clarity
        dmi_data = dmi_data[['Date', column_name]]
        dmi_frames.append(dmi_data)
    combined_dmi_data = pd.concat(dmi_frames, axis=1)
    # Remove duplicate 'Date' columns, keeping only one
    combined_dmi_data = combined_dmi_data.loc[:, ~combined_dmi_data.columns.duplicated()]
    return combined_dmi_data

dmi_data = load_dmi_data(dmi_csv_files)

def process_and_merge_parquet_files(base_dir, years, dmi_data, output_dir):
    for year in years:
        year_path = os.path.join(base_dir, str(year))
        parquet_files = glob(os.path.join(year_path, "*.parquet"))

        for parquet_file in parquet_files:
            # Load parquet file
            parquet_data = pd.read_parquet(parquet_file)

            # Ensure the 'time' column is in datetime format and set to the last day of the month
            parquet_data['time'] = pd.to_datetime(parquet_data['time'], errors='coerce')
            parquet_data['time'] = parquet_data['time'] + pd.offsets.MonthEnd(0)

            # Merge with DMI data
            merged_data = parquet_data.merge(
                dmi_data,
                left_on='time',
                right_on='Date',
                how='left'
            )

            # Drop unnecessary 'Date' column from the output
            merged_data = merged_data.drop(columns=['Date'])

            # Save merged data to a new parquet file in the output directory
            relative_path = os.path.relpath(parquet_file, base_dir)
            output_file = os.path.join(output_dir, relative_path)
            os.makedirs(os.path.dirname(output_file), exist_ok=True)
            merged_data.to_parquet(output_file, index=False)

            print(f"Merged data saved to {output_file}")

# Run the merging process
process_and_merge_parquet_files(data_directory, years, dmi_data, output_directory)

print(f"process completed")

paths defined


  "DMI_EAST_HadISST1.1": "Z:\Thesis\Data\Met\DMI\dmieast.had.long.csv",
  "DMI_HadISST1.1": "Z:\Thesis\Data\Met\DMI\dmieast.had.long.csv"
  "DMI_EAST_HadISST1.1": "Z:\Thesis\Data\Met\DMI\dmieast.had.long.csv",
  "DMI_HadISST1.1": "Z:\Thesis\Data\Met\DMI\dmieast.had.long.csv"


KeyError: "['DMI_HadISST1.1'] not in index"

In [17]:
import pandas as pd
import os
from glob import glob

# Paths to DMI CSV files (using raw strings to avoid escape issues)
dmi_csv_files = {
    "DMI_EAST_HadISST1.1": r"Z:\Thesis\Data\Met\DMI\dmieast.had.long.csv",
    "DMI_HadISST1.1": r"Z:\Thesis\Data\Met\DMI\dmi.had.long.csv"
}

# Paths to parquet files
data_directory = r"Z:\Thesis\Data\ML_Data\AP_ML_training_data"
output_directory = r"Z:\Thesis\Data\ML_Data\AP_ML_training_data"
years = list(range(1980, 2000))  # Adjust the range of years as needed

print("Paths defined")

def load_dmi_data(file_paths):
    """
    Loads DMI CSV files, converts the date string to a datetime,
    creates 'year' and 'month' columns, and then merges the datasets
    on these two columns.
    """
    dmi_data_final = None
    for column_name, file_path in file_paths.items():
        # Use comma as the delimiter
        dmi_data = pd.read_csv(file_path, sep=',')
        # Remove any accidental spaces in the column names
        dmi_data.columns = dmi_data.columns.str.strip()
        # Convert the Date column (e.g. "1/1/1980") to datetime
        dmi_data['Date'] = pd.to_datetime(dmi_data['Date'], format='%m/%d/%Y', errors='coerce')
        # Create separate year and month columns
        dmi_data['year'] = dmi_data['Date'].dt.year
        dmi_data['month'] = dmi_data['Date'].dt.month
        # Keep only the year, month, and the desired DMI column
        dmi_data = dmi_data[['year', 'month', column_name]]
        
        # Merge this dataframe with the accumulated data
        if dmi_data_final is None:
            dmi_data_final = dmi_data
        else:
            dmi_data_final = pd.merge(dmi_data_final, dmi_data, on=['year', 'month'], how='outer')
            
    return dmi_data_final

dmi_data = load_dmi_data(dmi_csv_files)
print("DMI data loaded and processed:")
print(dmi_data.head())

def process_and_merge_parquet_files(base_dir, years, dmi_data, output_dir):
    """
    For each parquet file in each year's directory:
      - Drop the existing (blank) DMI columns from the parquet data,
      - Convert the 'time' column to datetime and create 'year' and 'month' columns,
      - Merge with the DMI data on ['year', 'month'],
      - Save the merged dataset.
    """
    for year in years:
        year_path = os.path.join(base_dir, str(year))
        print(f"\nProcessing directory: {year_path}")
        parquet_files = glob(os.path.join(year_path, "*.parquet"))
        print(f"Found {len(parquet_files)} parquet files.")
        
        if not parquet_files:
            print(f"No parquet files found for {year}")
            continue

        for parquet_file in parquet_files:
            try:
                parquet_data = pd.read_parquet(parquet_file)
            except Exception as e:
                print(f"Error reading {parquet_file}: {e}")
                continue

            # Drop the blank DMI columns from the parquet file (if they exist)
            for col in ["DMI_EAST_HadISST1.1", "DMI_HadISST1.1"]:
                if col in parquet_data.columns:
                    parquet_data.drop(columns=[col], inplace=True)

            # Convert the 'time' column to datetime (format like "1/31/1980")
            parquet_data['time'] = pd.to_datetime(parquet_data['time'], format='%m/%d/%Y', errors='coerce')
            # Create year and month columns for merging
            parquet_data['year'] = parquet_data['time'].dt.year
            parquet_data['month'] = parquet_data['time'].dt.month

            # Merge on year and month
            merged_data = parquet_data.merge(dmi_data, on=['year', 'month'], how='left')

            # Build the output file path and ensure the directory exists
            relative_path = os.path.relpath(parquet_file, base_dir)
            output_file = os.path.join(output_dir, relative_path)
            os.makedirs(os.path.dirname(output_file), exist_ok=True)
            merged_data.to_parquet(output_file, index=False)

            print(f"Merged data saved to {output_file}")

process_and_merge_parquet_files(data_directory, years, dmi_data, output_directory)
print("\nProcess completed")


Paths defined
DMI data loaded and processed:
     year  month  DMI_EAST_HadISST1.1  DMI_HadISST1.1
0  1900.0    1.0               -0.226          -0.403
1  1900.0    2.0               -0.105          -0.213
2  1900.0    3.0               -0.163          -0.364
3  1900.0    4.0               -0.097          -0.244
4  1900.0    5.0               -0.304          -0.080

Processing directory: Z:\Thesis\Data\ML_Data\AP_ML_training_data\1980
Found 14 parquet files.
Merged data saved to Z:\Thesis\Data\ML_Data\AP_ML_training_data\1980\Bahrain_1980_pressure_monthly_stats_merged.parquet
Merged data saved to Z:\Thesis\Data\ML_Data\AP_ML_training_data\1980\Bahrain_1980_surface_monthly_stats_merged.parquet
Merged data saved to Z:\Thesis\Data\ML_Data\AP_ML_training_data\1980\Kuwait_1980_pressure_monthly_stats_merged.parquet
Merged data saved to Z:\Thesis\Data\ML_Data\AP_ML_training_data\1980\Kuwait_1980_surface_monthly_stats_merged.parquet
Merged data saved to Z:\Thesis\Data\ML_Data\AP_ML_training_d