In [5]:
import os
import pandas as pd

# Input: row-level predictions CSV; output: CSV of per-location level averages.
input_file = r"Z:\Thesis\Data\test\DustCast\UA\DC_v0063\ua_row_level_predictions_2023_v0063_H3res4.csv"
output_file = r"Z:\Thesis\Data\test\DustCast\UA\DC_v0063\ua_level_averages_2023_v0063_H3res4_test.csv"

# Levels to retain, held as strings because the "level" column is compared
# in its string form further down.
levels_to_include = [str(level) for level in (1000, 925, 850, 700, 500, 300, 200, 100, 50, 10)]

# Read the whole input CSV into memory and report its size and header.
print("Loading CSV file...")
df = pd.read_csv(input_file)
initial_shape = df.shape
print(f"Initial data shape: {initial_shape}")
print("Columns:", list(df.columns))

# Normalise "level" to an integer-valued string (so "10.0" becomes "10") so it
# can be matched against the string entries of levels_to_include.
# The column is mandatory: fail fast with a clear message instead of printing
# an error and then crashing on the next access to df['level'].
if 'level' not in df.columns:
    raise KeyError("'level' column not found.")

# Values that cannot be parsed as numbers are coerced to missing; they match
# none of the requested levels and are therefore removed by the filter below.
df['level'] = pd.to_numeric(df['level'], errors='coerce').astype('Int64').astype(str)
print("Unique levels before filtering:", df['level'].unique())

# Filter rows to only include desired levels
df = df[df['level'].isin(levels_to_include)]
print(f"Data shape after filtering levels: {df.shape}")
print("Unique levels after filtering:", df['level'].unique())
if df.empty:
    # Everything downstream would silently produce an empty output file.
    print("Warning: no rows match the requested levels:", levels_to_include)

# Derive 'Month' (formatted "YYYY-MM") from 'time' unless the data already
# carries a 'Month' column, in which case that column is used unchanged.
if 'Month' not in df.columns:
    if 'time' not in df.columns:
        # 'Month' is a grouping key further down; without it nothing can be
        # aggregated, so stop here with an explicit error.
        raise KeyError("Neither 'Month' nor 'time' column found.")
    print("Deriving 'Month' from 'time' column...")
    parsed_time = pd.to_datetime(df['time'], errors='coerce')
    n_unparsed = int(parsed_time.isna().sum())
    if n_unparsed:
        # These rows get a missing 'Month'; a group-by on 'Month' drops
        # missing keys by default, so report them rather than lose them silently.
        print(f"Warning: {n_unparsed} rows have a missing or unparseable 'time' value.")
    # assign() returns a new DataFrame, so this never writes into a filtered
    # slice of another frame (no SettingWithCopyWarning).
    df = df.assign(time=parsed_time, Month=parsed_time.dt.strftime('%Y-%m'))
print("Unique Months:", df['Month'].unique())

# Rows are grouped by (lon, lat, Month); every level within a group is
# collapsed into a single averaged row.
group_keys = ['lon', 'lat', 'Month']
print("Grouping keys:", group_keys)

# Columns whose group-wise mean is computed.
# The first part is the min/max/mean triple for each variable prefix.
cols_to_avg = [
    f"{var}_{stat}"
    for var in ('z', 'r', 't', 'u', 'v', 'w', 'vo')
    for stat in ('min', 'max', 'mean')
]
# Remaining columns, listed explicitly.
cols_to_avg += [
    'DUCMASS_mean',
    'DUFLUXU_min', 'DUFLUXU_max', 'DUFLUXU_mean',
    'DUFLUXV_min', 'DUFLUXV_max', 'DUFLUXV_mean',
    'lag_1',
    'ensemble_predictions',
    'squared_error_ensemble',
    'lr_predictions',
    'knn_predictions',
    'dt_predictions',
    'rf_predictions',
]

# Validate the grouping keys up front so a missing key is reported by name.
missing_keys = [key for key in group_keys if key not in df.columns]
if missing_keys:
    raise KeyError(f"Grouping column(s) not found: {missing_keys}")

# Only average the columns that are actually present: asking .agg() for an
# absent column raises, whereas the column-ordering step later on already
# treats columns as optional.
present_cols = set(df.columns)
missing_avg_cols = [col for col in cols_to_avg if col not in present_cols]
if missing_avg_cols:
    print("Warning: columns to average not found in data (skipped):", missing_avg_cols)
avg_cols_present = [col for col in cols_to_avg if col in present_cols]

# All other columns (carried through unchanged) are those not in cols_to_avg
# and not in group_keys. 'level' is left out on purpose: the groups span
# several levels, so a single carried-over level value would be misleading.
cols_to_keep = [
    col for col in df.columns
    if col not in cols_to_avg and col not in group_keys and col != 'level'
]
print("Columns to average:", avg_cols_present)
print("Other columns to keep:", cols_to_keep)

# Build aggregation dictionary: mean for the averaged columns; for the others
# take the group's first value.
agg_dict = {col: 'mean' for col in avg_cols_present}
agg_dict.update({col: 'first' for col in cols_to_keep})
print("Aggregation dictionary:")
print(agg_dict)

print("Grouping data by keys:", group_keys)
grouped = df.groupby(group_keys, as_index=False).agg(agg_dict)
print(f"Grouped data shape: {grouped.shape}")
print("Grouped data columns:", grouped.columns.tolist())

# Preferred layout of the output file. "level" is deliberately absent because
# the rows have been averaged across levels.
desired_order = [
    'lon',
    'lat',
    'time',
    'Month',
    'z_min', 'z_max', 'z_mean',
    'r_min', 'r_max', 'r_mean',
    't_min', 't_max', 't_mean',
    'u_min', 'u_max', 'u_mean',
    'v_min', 'v_max', 'v_mean',
    'w_min', 'w_max', 'w_mean',
    'vo_min', 'vo_max', 'vo_mean',
    'DUCMASS_mean',
    'DUFLUXU_min', 'DUFLUXU_max', 'DUFLUXU_mean',
    'DUFLUXV_min', 'DUFLUXV_max', 'DUFLUXV_mean',
    # Optional columns: kept only when the data has them.
    'DMI_EAST_HadISST1.1',
    'DMI_HadISST1.1',
    'lag_1',
    'ensemble_predictions',
    'squared_error_ensemble',
    'lr_predictions',
    'knn_predictions',
    'dt_predictions',
    'rf_predictions',
    'h3_res_3',
    'h3_res_4',
]

# Keep the preferred order but drop names the aggregated frame does not have.
available_columns = set(grouped.columns)
desired_order = [name for name in desired_order if name in available_columns]
print("Desired column order:", desired_order)
grouped = grouped.loc[:, desired_order]

# Write the aggregated table without the index column.
grouped.to_csv(output_file, index=False)
print(f"Saved level averages to {output_file}")


Loading CSV file...
Initial data shape: (1031520, 44)
Columns: ['lon', 'lat', 'time', 'level', 'z_min', 'z_max', 'z_mean', 'r_min', 'r_max', 'r_mean', 't_min', 't_max', 't_mean', 'u_min', 'u_max', 'u_mean', 'v_min', 'v_max', 'v_mean', 'w_min', 'w_max', 'w_mean', 'vo_min', 'vo_max', 'vo_mean', 'DUCMASS_mean', 'DUFLUXU_min', 'DUFLUXU_max', 'DUFLUXU_mean', 'DUFLUXV_min', 'DUFLUXV_max', 'DUFLUXV_mean', 'DMI_EAST_HadISST1.1', 'DMI_HadISST1.1', 'Month', 'lag_1', 'ensemble_predictions', 'squared_error_ensemble', 'lr_predictions', 'knn_predictions', 'dt_predictions', 'rf_predictions', 'h3_res_3', 'h3_res_4']
Unique levels before filtering: ['10' '50' '100' '200' '300' '500' '700' '850' '925' '1000']
Data shape after filtering levels: (1031520, 44)
Unique levels after filtering: ['10' '50' '100' '200' '300' '500' '700' '850' '925' '1000']
Unique Months: ['2023-01' '2023-02' '2023-03' '2023-04' '2023-05' '2023-06' '2023-07'
 '2023-08' '2023-09' '2023-10' '2023-11' '2023-12']
Grouping keys: ['lon