In [1]:
#---------------------------------------------------------------------------------------------------------------
# Welcome to DustCast V0.0.6.3 SFC! 
# This version trains on 1980-2000 and 2013-2022 Surface data and predicts DUSMASS_mean for the year 2023 by month. 
# A one-month lag is integrated so that each 2023 prediction uses the previous month’s value.
#---------------------------------------------------------------------------------------------------------------

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

# Additional metrics
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

def mse(y_true, y_pred):
    """Helper function to compute Mean Squared Error."""
    return mean_squared_error(y_true, y_pred)

def train_models(regressors, X_train, y_train):
    """Train each regressor on the training data and return the trained models."""
    models = []
    for name, model in regressors:
        model.fit(X_train, y_train)
        models.append((name, model))
    return models

def compute_metrics_and_weights(models, X_val, y_val):
    """
    Compute each model's MSE, MAE, R^2, and weight = 1/(MSE + 1e-20).

    Returns: (model_names, mse_list, mae_list, r2_list, weights)
    """
    model_names = []
    mse_list = []
    mae_list = []
    r2_list = []
    weights = []

    for name, model in models:
        y_pred_val = model.predict(X_val)
        curr_mse = mse(y_val, y_pred_val)
        mae_val = mean_absolute_error(y_val, y_pred_val)
        r2_val = r2_score(y_val, y_pred_val)

        # Compute weight as inverse of MSE
        w = 1 / (curr_mse + 1e-20)

        model_names.append(name)
        mse_list.append(curr_mse)
        mae_list.append(mae_val)
        r2_list.append(r2_val)
        weights.append(w)

    # Normalize weights (so they sum to 1)
    weights = np.array(weights)
    weights /= np.sum(weights)

    return model_names, mse_list, mae_list, r2_list, weights

def predict_weighted(models, X_val, y_val, X_test):
    """
    Compute weights for each model (inverse of MSE) on (X_val, y_val),
    and make weighted ensemble predictions on X_test.

    Returns:
      final_prediction (ndarray): Weighted ensemble predictions on X_test.
      weights (ndarray): The normalized weights for each model.
    """
    weights = []
    predictions = []

    # Compute MSE on validation set -> weights (inverse MSE)
    for name, model in models:
        y_pred_val = model.predict(X_val)
        curr_mse = mse(y_val, y_pred_val)
        weights.append(1 / (curr_mse + 1e-20))
        predictions.append(model.predict(X_test))

    # Normalize weights
    weights = np.array(weights)
    weights /= np.sum(weights)

    # Weighted average of predictions
    predictions = np.array(predictions)  # shape: (num_models, num_samples)
    final_prediction = np.dot(weights, predictions)

    return final_prediction, weights

# ------------------------- CONFIGURATION -------------------------
base_path = r"Z:\Thesis\Data\ML_Data\AP_ML_training_data"
output_dir = r"Z:\\Thesis\\Data\\test\\DustCast\\SFC\\DC_v0063"
os.makedirs(output_dir, exist_ok=True)

train_years = [
    1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989,
    1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
    2000,
    2013, 2014, 2015, 2016, 2017, 2018, 2019, 
    2020, 2021, 2022
]
test_year = 2023

countries = [
    "Bahrain",
    "Yemen", 
    "Kuwait", 
    "Oman", 
    "Qatar", 
    "Saudi_Arabia", 
    "United_Arab_Emirates"
]

# Define regressors
regressors = [
    ('lr', LinearRegression(copy_X=True, fit_intercept=True)),
    ('knn', KNeighborsRegressor(algorithm='ball_tree', leaf_size=5,
                                n_neighbors=5, p=2, weights='distance')),
    ('dt', DecisionTreeRegressor(max_depth=None, max_features=20,
                                 min_samples_split=10, random_state=13)),
    ('rf', RandomForestRegressor(max_depth=13, n_estimators=50, 
                                 max_features=20, min_samples_split=10, random_state=23))   
]

target_column = 'DUSMASS_mean'

cols_to_drop = [
    'sst_min', 'sst_max', 'sst_mean',
    'DUSMASS25_min', 'DUSMASS25_max', 'DUSMASS25_mean',
    'DUSMASS_min', 'DUSMASS_max','h3_res_5', 'h3_res_6', 
    'h3_res_5_int', 'h3_res_6_int', 'h3_res_3', 'h3_res_3_int', 
    'h3_res_4', 'h3_res_4_int', 'year', 'month'
]

# ------------------------- OUTPUT STORAGES -------------------------
all_feature_importances = []
predictions_2023_all_countries = []
monthly_ensemble_predictions_2023 = []
monthly_model_predictions_2023 = {name: [] for name, _ in regressors}
all_model_metrics = []  # Store each country's model metrics (MSE, MAE, R2, Weight), plus ensemble metrics

# ------------------------- MAIN LOOP -------------------------
for country in countries:
    try:
        # 1) Load and combine training data
        train_frames = []
        for yr in train_years:
            file_path = os.path.join(base_path, str(yr), f"{country}_{yr}_surface_monthly_stats_merged.parquet")
            if not os.path.exists(file_path):
                print(f"Training file not found: {file_path}")
                continue
            df_temp = pd.read_parquet(file_path)
            train_frames.append(df_temp)
        
        if len(train_frames) == 0:
            print(f"No training data found for {country}. Skipping...")
            continue

        train_df = pd.concat(train_frames, ignore_index=True)
        print(f"DEBUG: [{country}] After concatenation: train_df.shape = {train_df.shape}")

        train_df['time'] = pd.to_datetime(train_df['time'])

        # Drop columns
        train_df.drop(columns=[c for c in cols_to_drop if c in train_df.columns], 
                      inplace=True, errors='ignore')
        print(f"DEBUG: [{country}] After dropping columns: train_df.shape = {train_df.shape}")

        # Create a 1-month lag of the target
        train_df = train_df.sort_values(by=['lat', 'lon', 'time'])
        print(f"DEBUG: [{country}] After sorting: train_df.shape = {train_df.shape}")
        train_df['lag_1'] = train_df.groupby(['lat', 'lon'])[target_column].shift(1)
        print(f"DEBUG: [{country}] After creating 'lag_1': train_df.shape = {train_df.shape}")

        # Drop NaNs for target or lag_1
        train_df = train_df.dropna(subset=[target_column, 'lag_1'])
        print(f"DEBUG: [{country}] After dropping NaNs for target and lag_1: train_df.shape = {train_df.shape}")

        # 2) Features / Target
        X_train_full = train_df.drop(columns=[target_column, 'time'], errors='ignore')
        y_train_full = train_df[target_column]
        print(f"DEBUG: [{country}] X_train_full.shape = {X_train_full.shape}, y_train_full.shape = {y_train_full.shape}")

        # 3) Train-Validation Split
        X_train_part, X_val_part, y_train_part, y_val_part = train_test_split(
            X_train_full,
            y_train_full,
            test_size=0.3,
            random_state=42
        )
        print(f"DEBUG: [{country}] Split -> X_train_part={X_train_part.shape}, X_val_part={X_val_part.shape}")

        # Impute
        imputer = SimpleImputer(strategy='mean')
        X_train_part_imp = imputer.fit_transform(X_train_part)
        X_val_part_imp   = imputer.transform(X_val_part)

        X_train_part = pd.DataFrame(X_train_part_imp, columns=X_train_part.columns, index=X_train_part.index)
        X_val_part   = pd.DataFrame(X_val_part_imp,   columns=X_val_part.columns,   index=X_val_part.index)

        # 4) Train Models
        models = train_models(regressors, X_train_part, y_train_part)
        print(f"DEBUG: [{country}] Finished training models.")

        # Define output directory for decision boundary plots
        decision_boundary_dir = os.path.join(output_dir, "decision_boundaries")
        os.makedirs(decision_boundary_dir, exist_ok=True)  # Ensure the directory exists

        # 4.1) Plot Decision Boundaries (if X_train_part has exactly 2 features)
        if X_train_part.shape[1] == 2:
            import matplotlib.pyplot as plt  # ensure matplotlib is imported
            fig, axes = plt.subplots(2, 3, figsize=(15, 10))
            axes = axes.flatten()
    
            for i, (name, model) in enumerate(models):
                if i >= len(axes):
                    break
                # Your decision boundary plotting function should be defined elsewhere.
                plot_decision_boundary(axes[i], model, X_train_part.to_numpy(), y_train_part.to_numpy(), f"{country} - {name}")

            # Save decision boundary plots
            fig.tight_layout()
            decision_boundary_path = os.path.join(decision_boundary_dir, f"{country}_decision_boundaries.png")
            plt.savefig(decision_boundary_path, dpi=300, bbox_inches='tight')
            plt.close()
    
            print(f"DEBUG: [{country}] Decision boundary plots saved to '{decision_boundary_path}'")
        else:
            print(f"DEBUG: [{country}] Skipping decision boundary plot (requires 2 features).")

        # 4.2) Feature Importances
        feature_importances = pd.DataFrame(index=X_train_part.columns)
        for name, model in models:
            if hasattr(model, 'feature_importances_'):
                feature_importances[name] = model.feature_importances_

        if not feature_importances.empty:
            feature_importances.reset_index(inplace=True)
            feature_importances.rename(columns={'index': 'Feature'}, inplace=True)
            feature_importances['Country'] = country
            all_feature_importances.append(feature_importances)

        # 4.3) Compute model-level metrics
        model_names, mse_list, mae_list, r2_list, weights_array = compute_metrics_and_weights(models, X_val_part, y_val_part)
        metrics_df = pd.DataFrame({
            'Country': country,
            'Model': model_names,
            'MSE': mse_list,
            'MAE': mae_list,
            'R2': r2_list,
            'Weight': weights_array
        })

        # 4.4) Ensemble metrics on validation
        ensemble_val_preds, _ = predict_weighted(models, X_val_part, y_val_part, X_val_part)
        ensemble_mse_val = mse(y_val_part, ensemble_val_preds)
        ensemble_mae_val  = mean_absolute_error(y_val_part, ensemble_val_preds)
        ensemble_r2_val   = r2_score(y_val_part, ensemble_val_preds)

        ensemble_row = {
            'Country': country,
            'Model': 'ensemble',
            'MSE': ensemble_mse_val,
            'MAE': ensemble_mae_val,
            'R2': ensemble_r2_val,
            'Weight': 1.0
        }
        ensemble_df = pd.DataFrame([ensemble_row])
        metrics_df = pd.concat([metrics_df, ensemble_df], ignore_index=True)

        all_model_metrics.append(metrics_df)

        # 5) Prepare Test Data for 2023 Month-by-Month
        test_file_path = os.path.join(base_path, str(test_year), f"{country}_{test_year}_surface_monthly_stats_merged.parquet")
        if not os.path.exists(test_file_path):
            print(f"Test file (2023) not found for {country}. Skipping...")
            continue

        test_df_2023 = pd.read_parquet(test_file_path)
        print(f"DEBUG: [{country}] Loaded test_df_2023: shape = {test_df_2023.shape}")

        test_df_2023['time'] = pd.to_datetime(test_df_2023['time'])
        test_df_2023.drop(columns=[c for c in cols_to_drop if c in test_df_2023.columns], inplace=True, errors='ignore')
        print(f"DEBUG: [{country}] After dropping columns in test_df_2023: {test_df_2023.shape}")

        test_df_2023 = test_df_2023.sort_values(by=['lat', 'lon', 'time'])
        test_df_2023['Month'] = test_df_2023['time'].dt.to_period('M')
        print(f"DEBUG: [{country}] After sorting test_df_2023: shape={test_df_2023.shape}")

        # Dec 2022 data for seeding Jan 2023
        dec_2022 = train_df.loc[train_df['time'].dt.to_period('M') == pd.Period('2022-12')]
        dec_2022 = dec_2022[['lat', 'lon', target_column]].rename(columns={target_column: 'lag_1'})
        dec_2022['Month'] = pd.Period('2023-01')

        month_preds_list = []
        unique_months_2023 = sorted(test_df_2023['Month'].unique())
        print(f"DEBUG: [{country}] unique_months_2023 = {unique_months_2023}")

        for i, mon in enumerate(unique_months_2023):
            this_month_df = test_df_2023[test_df_2023['Month'] == mon].copy()
            print(f"DEBUG: [{country}] Month={mon}, initial this_month_df.shape = {this_month_df.shape}")

            if i == 0:
                print(f"DEBUG: [{country}] Merging with dec_2022: dec_2022.shape = {dec_2022.shape}")
                this_month_df = pd.merge(
                    this_month_df,
                    dec_2022[['lat', 'lon', 'lag_1']],
                    on=['lat', 'lon'],
                    how='left'
                )
                print(f"DEBUG: [{country}] After merge (Jan 2023): this_month_df.shape = {this_month_df.shape}")
            else:
                prev_month_data = month_preds_list[-1][['lat', 'lon', 'ensemble_predictions']].copy()
                prev_month_data.rename(columns={'ensemble_predictions': 'lag_1'}, inplace=True)
                prev_month_data['Month'] = mon

                print(f"DEBUG: [{country}] prev_month_data.shape = {prev_month_data.shape}")
                this_month_df = pd.merge(
                    this_month_df,
                    prev_month_data[['lat', 'lon', 'lag_1']],
                    on=['lat', 'lon'],
                    how='left'
                )
                print(f"DEBUG: [{country}] After merge (Month={mon}): this_month_df.shape = {this_month_df.shape}")

            # Drop rows w/o valid lag_1
            this_month_df.dropna(subset=['lag_1'], inplace=True)
            print(f"DEBUG: [{country}] After dropping NA lag_1: this_month_df.shape = {this_month_df.shape}")

            # Prepare features by dropping unwanted columns
            X_cols_to_drop = [target_column, 'time', 'Month']
            X_this_month = this_month_df.drop(columns=[c for c in X_cols_to_drop if c in this_month_df.columns],
                                              errors='ignore')
            print(f"DEBUG: [{country}] X_this_month.shape before reindex = {X_this_month.shape}")

            # Reindex X_this_month so that it has all features used in training
            expected_features = X_train_part.columns  # Columns from training
            X_this_month = X_this_month.reindex(columns=expected_features, fill_value=np.nan)
            print(f"DEBUG: [{country}] X_this_month.shape after reindex = {X_this_month.shape}")

            # Impute missing values
            X_this_month_imputed = imputer.transform(X_this_month)
            X_this_month = pd.DataFrame(X_this_month_imputed, columns=expected_features, index=X_this_month.index)

            # Weighted ensemble prediction
            y_ensemble_2023, _ = predict_weighted(models, X_val_part, y_val_part, X_this_month)
            this_month_df['ensemble_predictions'] = y_ensemble_2023

            # Individual models predictions
            for name, model in models:
                this_month_df[f"{name}_predictions"] = model.predict(X_this_month)

            month_preds_list.append(this_month_df)

        if len(month_preds_list) > 0:
            test_df_2023_preds = pd.concat(month_preds_list, ignore_index=True)
            print(f"DEBUG: [{country}] test_df_2023_preds.shape (after concat months) = {test_df_2023_preds.shape}")
        else:
            test_df_2023_preds = pd.DataFrame()

        # 6) Store row-level predictions
        if not test_df_2023_preds.empty:
            # Add 'Country' column to the row-level predictions before appending
            test_df_2023_preds['Country'] = country
            predictions_2023_all_countries.append(test_df_2023_preds.copy())

            # 7) Monthly Aggregation
            monthly_ensemble = (
                test_df_2023_preds
                .groupby(['Month', 'lat', 'lon'])['ensemble_predictions']
                .mean()
                .reset_index()
            )
            monthly_ensemble['Country'] = country  # Ensure 'Country' is present
            monthly_ensemble_predictions_2023.append(monthly_ensemble)

            for name, _ in models:
                col_name = f"{name}_predictions"
                monthly_model = (
                    test_df_2023_preds
                    .groupby(['Month', 'lat', 'lon'])[col_name]
                    .mean()
                    .reset_index()
                )
                monthly_model.rename(columns={col_name: 'DUSMASS_Mean_Predictions'}, inplace=True)
        
                # Add 'Country' column explicitly to the monthly model data
                monthly_model['Country'] = country  
        
                monthly_model_predictions_2023[name].append(monthly_model)

    except Exception as e:
        print(f"Error processing {country}: {e}")

print("Main loop complete! Saving Results...")

# ------------------------- SAVE RESULTS -------------------------

# 1) Feature Importances
if all_feature_importances:
    all_feature_importances_df = pd.concat(all_feature_importances, ignore_index=True)
else:
    all_feature_importances_df = pd.DataFrame()
fi_output_path = os.path.join(output_dir, "sfc_feature_importances_2023_v0063.csv")
all_feature_importances_df.to_csv(fi_output_path, index=False)
print(f"Feature importances saved to '{fi_output_path}'")

# 2) Row-Level 2023 Predictions
if len(predictions_2023_all_countries) > 0:
    full_predictions_2023 = pd.concat(predictions_2023_all_countries, ignore_index=True)
    print(f"DEBUG: [ALL] full_predictions_2023.shape = {full_predictions_2023.shape}")
    output_2023_path = os.path.join(output_dir, "sfc_row_level_predictions_2023_v0063.csv")
    full_predictions_2023.to_csv(output_2023_path, index=False)
    print(f"Row-level predictions for 2023 (ensemble + individual models) saved to '{output_2023_path}'")
else:
    print("No row-level predictions available for 2023.")

# 3) Monthly 2023 Predictions: ENSEMBLE
if len(monthly_ensemble_predictions_2023) > 0:
    monthly_ensemble_df_2023 = pd.concat(monthly_ensemble_predictions_2023, ignore_index=True)
    monthly_ensemble_df_2023['Year'] = 2023
    print(f"DEBUG: [ALL] monthly_ensemble_df_2023.shape = {monthly_ensemble_df_2023.shape}")
    ensemble_monthly_output_path = os.path.join(output_dir, "sfc_monthly_ensemble_predictions_2023_v0063.csv")
    monthly_ensemble_df_2023.to_csv(ensemble_monthly_output_path, index=False)
    print(f"Monthly ensemble predictions for 2023 saved to '{ensemble_monthly_output_path}'")
else:
    print("No monthly ensemble predictions available for 2023.")

# 4) Monthly 2023 Predictions: INDIVIDUAL MODELS
for name in monthly_model_predictions_2023:
    if len(monthly_model_predictions_2023[name]) > 0:
        model_monthly_df = pd.concat(monthly_model_predictions_2023[name], ignore_index=True)
        model_monthly_df['Year'] = 2023
        print(f"DEBUG: [ALL] {name} monthly dataframe shape = {model_monthly_df.shape}")
        model_output_path = os.path.join(output_dir, f"sfc_monthly_predictions_2023_{name}_v0063.csv")
        model_monthly_df.to_csv(model_output_path, index=False)
        print(f"Monthly predictions for 2023 for model '{name}' saved to '{model_output_path}'")
    else:
        print(f"No monthly predictions available for 2023 for model '{name}'.")

# 5) Model Metrics (Including Ensemble)
if len(all_model_metrics) > 0:
    final_metrics_df = pd.concat(all_model_metrics, ignore_index=True)
    metrics_output_path = os.path.join(output_dir, "sfc_model_metrics_2023_v0063.csv")
    final_metrics_df.to_csv(metrics_output_path, index=False)
    print(f"Model metrics (including Ensemble) saved to '{metrics_output_path}'")
else:
    print("No model metrics to save.")


DEBUG: [Bahrain] After concatenation: train_df.shape = (1116, 106)
DEBUG: [Bahrain] After dropping columns: train_df.shape = (1116, 88)
DEBUG: [Bahrain] After sorting: train_df.shape = (1116, 88)
DEBUG: [Bahrain] After creating 'lag_1': train_df.shape = (1116, 89)
DEBUG: [Bahrain] After dropping NaNs for target and lag_1: train_df.shape = (1110, 89)
DEBUG: [Bahrain] X_train_full.shape = (1110, 87), y_train_full.shape = (1110,)
DEBUG: [Bahrain] Split -> X_train_part=(777, 87), X_val_part=(333, 87)
DEBUG: [Bahrain] Finished training models.
DEBUG: [Bahrain] Skipping decision boundary plot (requires 2 features).
DEBUG: [Bahrain] Loaded test_df_2023: shape = (36, 106)
DEBUG: [Bahrain] After dropping columns in test_df_2023: (36, 88)
DEBUG: [Bahrain] After sorting test_df_2023: shape=(36, 89)
DEBUG: [Bahrain] unique_months_2023 = [Period('2023-01', 'M'), Period('2023-02', 'M'), Period('2023-03', 'M'), Period('2023-04', 'M'), Period('2023-05', 'M'), Period('2023-06', 'M'), Period('2023-07', 

DEBUG: [Yemen] Month=2023-05, initial this_month_df.shape = (1424, 89)
DEBUG: [Yemen] prev_month_data.shape = (1424, 4)
DEBUG: [Yemen] After merge (Month=2023-05): this_month_df.shape = (1424, 90)
DEBUG: [Yemen] After dropping NA lag_1: this_month_df.shape = (1424, 90)
DEBUG: [Yemen] X_this_month.shape before reindex = (1424, 87)
DEBUG: [Yemen] X_this_month.shape after reindex = (1424, 87)
DEBUG: [Yemen] Month=2023-06, initial this_month_df.shape = (1424, 89)
DEBUG: [Yemen] prev_month_data.shape = (1424, 4)
DEBUG: [Yemen] After merge (Month=2023-06): this_month_df.shape = (1424, 90)
DEBUG: [Yemen] After dropping NA lag_1: this_month_df.shape = (1424, 90)
DEBUG: [Yemen] X_this_month.shape before reindex = (1424, 87)
DEBUG: [Yemen] X_this_month.shape after reindex = (1424, 87)
DEBUG: [Yemen] Month=2023-07, initial this_month_df.shape = (1424, 89)
DEBUG: [Yemen] prev_month_data.shape = (1424, 4)
DEBUG: [Yemen] After merge (Month=2023-07): this_month_df.shape = (1424, 90)
DEBUG: [Yemen] Af

DEBUG: [Kuwait] Month=2023-11, initial this_month_df.shape = (56, 89)
DEBUG: [Kuwait] prev_month_data.shape = (56, 4)
DEBUG: [Kuwait] After merge (Month=2023-11): this_month_df.shape = (56, 90)
DEBUG: [Kuwait] After dropping NA lag_1: this_month_df.shape = (56, 90)
DEBUG: [Kuwait] X_this_month.shape before reindex = (56, 87)
DEBUG: [Kuwait] X_this_month.shape after reindex = (56, 87)
DEBUG: [Kuwait] Month=2023-12, initial this_month_df.shape = (56, 89)
DEBUG: [Kuwait] prev_month_data.shape = (56, 4)
DEBUG: [Kuwait] After merge (Month=2023-12): this_month_df.shape = (56, 90)
DEBUG: [Kuwait] After dropping NA lag_1: this_month_df.shape = (56, 90)
DEBUG: [Kuwait] X_this_month.shape before reindex = (56, 87)
DEBUG: [Kuwait] X_this_month.shape after reindex = (56, 87)
DEBUG: [Kuwait] test_df_2023_preds.shape (after concat months) = (672, 95)
DEBUG: [Oman] After concatenation: train_df.shape = (476160, 106)
DEBUG: [Oman] After dropping columns: train_df.shape = (476160, 88)
DEBUG: [Oman] Aft

DEBUG: [Qatar] Month=2023-02, initial this_month_df.shape = (28, 89)
DEBUG: [Qatar] prev_month_data.shape = (28, 4)
DEBUG: [Qatar] After merge (Month=2023-02): this_month_df.shape = (28, 90)
DEBUG: [Qatar] After dropping NA lag_1: this_month_df.shape = (28, 90)
DEBUG: [Qatar] X_this_month.shape before reindex = (28, 87)
DEBUG: [Qatar] X_this_month.shape after reindex = (28, 87)
DEBUG: [Qatar] Month=2023-03, initial this_month_df.shape = (28, 89)
DEBUG: [Qatar] prev_month_data.shape = (28, 4)
DEBUG: [Qatar] After merge (Month=2023-03): this_month_df.shape = (28, 90)
DEBUG: [Qatar] After dropping NA lag_1: this_month_df.shape = (28, 90)
DEBUG: [Qatar] X_this_month.shape before reindex = (28, 87)
DEBUG: [Qatar] X_this_month.shape after reindex = (28, 87)
DEBUG: [Qatar] Month=2023-04, initial this_month_df.shape = (28, 89)
DEBUG: [Qatar] prev_month_data.shape = (28, 4)
DEBUG: [Qatar] After merge (Month=2023-04): this_month_df.shape = (28, 90)
DEBUG: [Qatar] After dropping NA lag_1: this_mo

DEBUG: [Saudi_Arabia] Month=2023-08, initial this_month_df.shape = (5525, 89)
DEBUG: [Saudi_Arabia] prev_month_data.shape = (5525, 4)
DEBUG: [Saudi_Arabia] After merge (Month=2023-08): this_month_df.shape = (5525, 90)
DEBUG: [Saudi_Arabia] After dropping NA lag_1: this_month_df.shape = (5525, 90)
DEBUG: [Saudi_Arabia] X_this_month.shape before reindex = (5525, 87)
DEBUG: [Saudi_Arabia] X_this_month.shape after reindex = (5525, 87)
DEBUG: [Saudi_Arabia] Month=2023-09, initial this_month_df.shape = (5525, 89)
DEBUG: [Saudi_Arabia] prev_month_data.shape = (5525, 4)
DEBUG: [Saudi_Arabia] After merge (Month=2023-09): this_month_df.shape = (5525, 90)
DEBUG: [Saudi_Arabia] After dropping NA lag_1: this_month_df.shape = (5525, 90)
DEBUG: [Saudi_Arabia] X_this_month.shape before reindex = (5525, 87)
DEBUG: [Saudi_Arabia] X_this_month.shape after reindex = (5525, 87)
DEBUG: [Saudi_Arabia] Month=2023-10, initial this_month_df.shape = (5525, 89)
DEBUG: [Saudi_Arabia] prev_month_data.shape = (5525,

DEBUG: [United_Arab_Emirates] Month=2023-11, initial this_month_df.shape = (280, 89)
DEBUG: [United_Arab_Emirates] prev_month_data.shape = (280, 4)
DEBUG: [United_Arab_Emirates] After merge (Month=2023-11): this_month_df.shape = (280, 90)
DEBUG: [United_Arab_Emirates] After dropping NA lag_1: this_month_df.shape = (280, 90)
DEBUG: [United_Arab_Emirates] X_this_month.shape before reindex = (280, 87)
DEBUG: [United_Arab_Emirates] X_this_month.shape after reindex = (280, 87)
DEBUG: [United_Arab_Emirates] Month=2023-12, initial this_month_df.shape = (280, 89)
DEBUG: [United_Arab_Emirates] prev_month_data.shape = (280, 4)
DEBUG: [United_Arab_Emirates] After merge (Month=2023-12): this_month_df.shape = (280, 90)
DEBUG: [United_Arab_Emirates] After dropping NA lag_1: this_month_df.shape = (280, 90)
DEBUG: [United_Arab_Emirates] X_this_month.shape before reindex = (280, 87)
DEBUG: [United_Arab_Emirates] X_this_month.shape after reindex = (280, 87)
DEBUG: [United_Arab_Emirates] test_df_2023_pre

In [2]:
#---------------------------------------------------------------------------------------------------------------
# Welcome to DustCast V0.0.6.3 UA! 
# This version trains on 1980-2000 and 2013-2022 Surface data and predicts DUSMASS_mean for the year 2023 by month. 
# A one-month lag is integrated so that each 2023 prediction uses the previous month’s value.
#---------------------------------------------------------------------------------------------------------------

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

# Additional metrics
from sklearn.metrics import (
    mean_absolute_error,
    r2_score,
    mean_squared_error
)

def mse(y_true, y_pred):
    """Helper function to compute MSE using scikit-learn's mean_squared_error."""
    return mean_squared_error(y_true, y_pred)

def train_models(regressors, X_train, y_train):
    """Train each regressor on the training data and return the trained models."""
    models = []
    for name, model in regressors:
        model.fit(X_train, y_train)
        models.append((name, model))
    return models

def predict_weighted(models, X_val, y_val, X_test):
    """
    Compute weights for each model (inverse of MSE) on (X_val, y_val),
    and make weighted ensemble predictions on X_test.
    """
    weights = []
    predictions = []

    for name, model in models:
        y_pred_val = model.predict(X_val)
        curr_mse = mse(y_val, y_pred_val)
        # Inverse of (MSE + small constant)
        weights.append(1 / (curr_mse + 1e-10))
        predictions.append(model.predict(X_test))

    # Normalize weights
    weights = np.array(weights)
    weights /= np.sum(weights)

    # Weighted average of predictions
    predictions = np.array(predictions)  # shape: (num_models, num_samples)
    final_prediction = np.dot(weights, predictions)

    return final_prediction, weights

def compute_metrics_and_weights(models, X_val, y_val):
    """
    Compute each model's MSE, MAE, R^2, and weight = 1/(MSE + 1e-10).
    """
    model_names, mse_list, mae_list, r2_list, weights = [], [], [], [], []

    for name, model in models:
        y_pred_val = model.predict(X_val)
        curr_mse = mse(y_val, y_pred_val)
        mae_val = mean_absolute_error(y_val, y_pred_val)
        r2_val = r2_score(y_val, y_pred_val)
        weight = 1 / (curr_mse + 1e-10)

        model_names.append(name)
        mse_list.append(curr_mse)
        mae_list.append(mae_val)
        r2_list.append(r2_val)
        weights.append(weight)

    # Normalize weights
    weights = np.array(weights)
    weights /= np.sum(weights)

    return model_names, mse_list, mae_list, r2_list, weights

# ------------------------- CONFIGURATION -------------------------
base_path = r"Z:\Thesis\Data\ML_Data\AP_ML_training_data"
output_dir = r"Z:\\Thesis\\Data\\test\\DustCast\\UA\\DC_v0063"
os.makedirs(output_dir, exist_ok=True)

train_years = [
    1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989,
    1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
    2000,
    2013, 2014, 2015, 2016, 2017, 2018, 2019, 
    2020, 2021, 2022
]
test_year = 2023

countries = [
    "Yemen", 
    "Bahrain", 
    "Kuwait", 
    "Oman", 
    "Qatar", 
    "Saudi_Arabia", 
    "United_Arab_Emirates"
]

pressure_levels = [10, 50, 100, 200, 300, 500, 700, 850, 925, 1000]
regressors = [
    ('lr', LinearRegression(copy_X=True, fit_intercept=True)),
    ('knn', KNeighborsRegressor(algorithm='ball_tree', leaf_size=5,
                                n_neighbors=5, p=2, weights='distance')),
    ('dt', DecisionTreeRegressor(max_depth=None, max_features=None,
                                 min_samples_split=10, random_state=13)),
    ('rf', RandomForestRegressor(max_depth=25, n_estimators=200, 
                                 max_features=0.25, min_samples_split=5, random_state=23))   
]

target_column = 'DUCMASS_mean'
cols_to_drop = [
    'sst_min', 'sst_max', 'sst_mean',
    'DUCMASS25_min', 'DUCMASS25_max', 'DUCMASS25_mean',
    'DUCMASS_min', 'DUCMASS_max', 'year', 'month',
    'h3_res_5', 'h3_res_6', 'h3_res_5_int', 'h3_res_6_int',
    'h3_res_3', 'h3_res_3_int', 'h3_res_4', 'h3_res_4_int'
]

# ------------------------- OUTPUT STORAGES -------------------------
all_model_metrics = []
all_feature_importances = []

# For monthly predictions across all countries/levels
monthly_ensemble_predictions_all = []
monthly_model_predictions_all = {name: [] for name, _ in regressors}

# For 2023 predictions (row-level, monthly, etc.)
predictions_2023_all_countries = []  # row-level 2023 predictions
monthly_ensemble_predictions_2023 = []  # monthly ensemble for 2023
monthly_model_predictions_2023 = {name: [] for name, _ in regressors}  # monthly predictions per model for 2023

# For averaged ensemble across levels
predictions_all_countries_across_levels = []

# ------------------------- MAIN LOOP -------------------------
for country in countries:
    print(f"\n--- Processing Country: {country} ---")

    # Will store the final test predictions across all levels for this country
    predictions_all_levels_country = []

    for level in pressure_levels:
        print(f"   >> Processing Level: {level} hPa")
        try:
            # 1) LOAD & COMBINE TRAIN DATA
            train_frames = []
            for yr in train_years:
                file_path = os.path.join(base_path, str(yr), f"{country}_{yr}_pressure_monthly_stats_merged.parquet")
                if not os.path.exists(file_path):
                    print(f"Training file not found: {file_path}")
                    continue

                df_temp = pd.read_parquet(file_path)
                df_temp['time'] = pd.to_datetime(df_temp['time'])

                # Keep only rows for the current pressure level
                df_temp = df_temp[df_temp['level'] == level]
                if df_temp.empty:
                    continue

                train_frames.append(df_temp)

            if not train_frames:
                print(f"       No training data for {country}, level {level}. Skipping...")
                continue

            train_df = pd.concat(train_frames, ignore_index=True)
            train_df['Country'] = country

            # Drop unnecessary columns
            train_df.drop(columns=[c for c in cols_to_drop if c in train_df.columns],
                          inplace=True, errors='ignore')
            print(f"DEBUG: [{country}, level={level}] After dropping columns: {train_df.shape}")

            # Sort & create lag_1 by grouping on (lat, lon, level)
            train_df = train_df.sort_values(by=['lat', 'lon', 'level', 'time'])
            train_df['lag_1'] = train_df.groupby(['lat', 'lon', 'level'])[target_column].shift(1)
            train_df.dropna(subset=[target_column, 'lag_1'], inplace=True)
            print(f"DEBUG: [{country}, level={level}] After lag_1: shape={train_df.shape}")

            # 2) PREPARE TRAIN FEATURES/TARGET
            X_train_full = train_df.drop(columns=[target_column, 'time', 'Country'], errors='ignore')
            y_train_full = train_df[target_column]

            # 3) Train-Validation Split
            X_train_part, X_val_part, y_train_part, y_val_part = train_test_split(
                X_train_full, y_train_full, test_size=0.3, random_state=42
            )

            imputer = SimpleImputer(strategy='mean')
            X_train_part_imp = imputer.fit_transform(X_train_part)
            X_val_part_imp   = imputer.transform(X_val_part)

            X_train_part = pd.DataFrame(X_train_part_imp, columns=X_train_part.columns, index=X_train_part.index)
            X_val_part   = pd.DataFrame(X_val_part_imp,   columns=X_val_part.columns,   index=X_val_part.index)

            # 4) Train Models
            models = train_models(regressors, X_train_part, y_train_part)

            # 4.1) Compute & Store Model Metrics
            model_names, mse_list, mae_list, r2_list, weights_array = compute_metrics_and_weights(
                models, X_val_part, y_val_part
            )
            metrics_df = pd.DataFrame({
                'Country': country,
                'Level': level,
                'Model': model_names,
                'MSE': mse_list,
                'MAE': mae_list,
                'R2': r2_list,
                'Weight': weights_array
            })

            # Ensemble metrics
            ensemble_val_preds, _ = predict_weighted(models, X_val_part, y_val_part, X_val_part)
            ensemble_mse_val = mse(y_val_part, ensemble_val_preds)
            ensemble_mae_val  = mean_absolute_error(y_val_part, ensemble_val_preds)
            ensemble_r2_val   = r2_score(y_val_part, ensemble_val_preds)

            ensemble_row = {
                'Country': country,
                'Level': level,
                'Model': 'ensemble',
                'MSE': ensemble_mse_val,
                'MAE': ensemble_mae_val,
                'R2': ensemble_r2_val,
                'Weight': 1.0
            }
            ensemble_df = pd.DataFrame([ensemble_row])
            metrics_df = pd.concat([metrics_df, ensemble_df], ignore_index=True)
            all_model_metrics.append(metrics_df)

            # 5) Prepare Test Data for 2023 (Monthly Approach)
            test_file_path = os.path.join(
                base_path, str(test_year), f"{country}_{test_year}_pressure_monthly_stats_merged.parquet"
            )
            if not os.path.exists(test_file_path):
                print(f"       Test file not found for {country}, level {level}. Skipping...")
                continue

            test_df_2023 = pd.read_parquet(test_file_path)
            test_df_2023['time'] = pd.to_datetime(test_df_2023['time'])
            test_df_2023 = test_df_2023[test_df_2023['level'] == level]
            if test_df_2023.empty:
                print(f"       No 2023 data for {country}, level {level}. Skipping...")
                continue

            test_df_2023.drop(columns=[c for c in cols_to_drop if c in test_df_2023.columns],
                              inplace=True, errors='ignore')
            test_df_2023 = test_df_2023.sort_values(by=['lat', 'lon', 'level', 'time'])
            test_df_2023['Month'] = test_df_2023['time'].dt.to_period('M')

            # Dec 2022 data to seed Jan 2023
            dec_2022 = train_df.loc[train_df['time'].dt.to_period('M') == pd.Period('2022-12')]
            dec_2022 = dec_2022[['lat', 'lon', 'level', target_column]]\
                .rename(columns={target_column: 'lag_1'})
            dec_2022['Month'] = pd.Period('2023-01')

            # We'll hold month-by-month predictions here
            month_preds_list = []
            unique_months_2023 = sorted(test_df_2023['Month'].unique())

            for i, mon in enumerate(unique_months_2023):
                this_month_df = test_df_2023[test_df_2023['Month'] == mon].copy()

                if i == 0:
                    this_month_df = pd.merge(
                        this_month_df,
                        dec_2022[['lat', 'lon', 'level', 'lag_1']],
                        on=['lat', 'lon', 'level'],
                        how='left'
                    )
                else:
                    # Merge in the previous month's ensemble predictions as 'lag_1'
                    prev_data = month_preds_list[-1][['lat', 'lon', 'level', 'ensemble_predictions']]\
                        .rename(columns={'ensemble_predictions': 'lag_1'})
                    prev_data['Month'] = mon

                    this_month_df = pd.merge(
                        this_month_df,
                        prev_data[['lat', 'lon', 'level', 'lag_1']],
                        on=['lat', 'lon', 'level'],
                        how='left'
                    )

                # Drop rows w/o valid lag_1
                this_month_df.dropna(subset=['lag_1'], inplace=True)

                # Prepare X by dropping target, time, Country, Month
                X_cols_drop = [target_column, 'time', 'Country', 'Month']
                X_this_month = this_month_df.drop(
                    columns=[c for c in X_cols_drop if c in this_month_df.columns],
                    errors='ignore'
                )

                # === HadISST1.1 Handling: Reindex to match training features ===
                expected_features = X_train_part.columns
                X_this_month = X_this_month.reindex(columns=expected_features, fill_value=np.nan)

                # Impute missing values
                X_this_month_imp = imputer.transform(X_this_month)
                X_this_month = pd.DataFrame(X_this_month_imp, columns=expected_features, index=X_this_month.index)

                # Weighted ensemble prediction
                y_ensemble, _ = predict_weighted(models, X_val_part, y_val_part, X_this_month)
                this_month_df['ensemble_predictions'] = y_ensemble

                # === Row-level MSE Calculation ===
                if target_column in this_month_df.columns:
                    this_month_df['squared_error_ensemble'] = (this_month_df[target_column] - y_ensemble)**2

                # Individual model predictions
                for name, model in models:
                    this_month_df[f"{name}_predictions"] = model.predict(X_this_month)

                month_preds_list.append(this_month_df)

            # Combine monthly predictions for this level
            if month_preds_list:
                test_2023_preds = pd.concat(month_preds_list, ignore_index=True)
            else:
                test_2023_preds = pd.DataFrame()

            if not test_2023_preds.empty:
                # Store row-level predictions for this level
                predictions_all_levels_country.append(test_2023_preds.copy())
                predictions_2023_all_countries.append(test_2023_preds.copy())

                # Monthly ensemble predictions for this level
                monthly_ensemble = (
                    test_2023_preds
                    .groupby(['Month', 'lat', 'lon', 'level'])['ensemble_predictions']
                    .mean()
                    .reset_index()
                )
                monthly_ensemble_predictions_all.append(monthly_ensemble)
                monthly_ensemble_predictions_2023.append(monthly_ensemble.copy())

                # Monthly predictions per model for this level
                for name, _ in models:
                    col_name = f"{name}_predictions"
                    monthly_model = (
                        test_2023_preds
                        .groupby(['Month', 'lat', 'lon', 'level'])[col_name]
                        .mean()
                        .reset_index()
                    )
                    monthly_model.rename(columns={col_name: 'DUCMASS_Mean_Predictions'}, inplace=True)

                    monthly_model_predictions_all[name].append(monthly_model)
                    monthly_model_predictions_2023[name].append(monthly_model.copy())

            # 6) Feature Importances
            feature_importances = pd.DataFrame(index=X_train_part.columns)
            for name, model in models:
                if hasattr(model, 'feature_importances_'):
                    feature_importances[name] = model.feature_importances_

            if not feature_importances.empty:
                feature_importances.reset_index(inplace=True)
                feature_importances.rename(columns={'index': 'Feature'}, inplace=True)
                feature_importances['Country'] = country
                feature_importances['Level']   = level
                all_feature_importances.append(feature_importances)
            else:
                print(f"Warning: No feature importances for {country}, level {level}.")

        except Exception as e:
            print(f"       Error processing {country}, level {level}: {e}")

    # -----------------------------------------------------------------
    # AFTER processing all levels for this country, combine & average
    # -----------------------------------------------------------------
    if predictions_all_levels_country:
        # 1) Concatenate data across all pressure levels
        combined_across_levels = pd.concat(predictions_all_levels_country, ignore_index=True)

        # 2) Calculate mean of 'ensemble_predictions' across levels
        average_ensemble = (
            combined_across_levels
            .groupby(['lat', 'lon', 'Month'], as_index=False)['ensemble_predictions']
            .mean()
        )
        average_ensemble.rename(
            columns={'ensemble_predictions': 'ensemble_prediction_across_levels'},
            inplace=True
        )

        # Store for final "averaged" results across levels
        predictions_all_countries_across_levels.append(average_ensemble)
        print(f">> [Averaged] {country} has {average_ensemble.shape[0]} rows of final monthly predictions.")

print("\nMain loop complete! Saving Results...")

# ------------------------- SAVE RESULTS -------------------------

# 1) Feature Importances
if all_feature_importances:
    all_feature_importances_df = pd.concat(all_feature_importances, ignore_index=True)
else:
    all_feature_importances_df = pd.DataFrame()

fi_output_path = os.path.join(output_dir, "ua_feature_importances_2023_v0063.csv")
all_feature_importances_df.to_csv(fi_output_path, index=False)
print(f"Feature importances saved to '{fi_output_path}'")

# 2) Row-Level 2023 Predictions
if len(predictions_2023_all_countries) > 0:
    full_predictions_2023 = pd.concat(predictions_2023_all_countries, ignore_index=True)
    print(f"DEBUG: [ALL] full_predictions_2023.shape = {full_predictions_2023.shape}")
    output_2023_path = os.path.join(output_dir, "ua_row_level_predictions_2023_v0063.csv")
    full_predictions_2023.to_csv(output_2023_path, index=False)
    print(f"Row-level predictions for 2023 (ensemble + individual models) saved to '{output_2023_path}'")
else:
    print("No row-level predictions available for 2023.")

# 3) Monthly 2023 Predictions: ENSEMBLE
if len(monthly_ensemble_predictions_2023) > 0:
    monthly_ensemble_df_2023 = pd.concat(monthly_ensemble_predictions_2023, ignore_index=True)
    monthly_ensemble_df_2023['Year'] = 2023
    print(f"DEBUG: [ALL] monthly_ensemble_df_2023.shape = {monthly_ensemble_df_2023.shape}")
    ensemble_monthly_output_path = os.path.join(output_dir, "ua_monthly_ensemble_predictions_2023_v0063.csv")
    monthly_ensemble_df_2023.to_csv(ensemble_monthly_output_path, index=False)
    print(f"Monthly ensemble predictions for 2023 saved to '{ensemble_monthly_output_path}'")
else:
    print("No monthly ensemble predictions available for 2023.")

# 4) Monthly 2023 Predictions: INDIVIDUAL MODELS
for name in monthly_model_predictions_2023:
    if len(monthly_model_predictions_2023[name]) > 0:
        model_monthly_df = pd.concat(monthly_model_predictions_2023[name], ignore_index=True)
        model_monthly_df['Year'] = 2023
        print(f"DEBUG: [ALL] {name} monthly dataframe shape = {model_monthly_df.shape}")
        model_output_path = os.path.join(output_dir, f"ua_monthly_predictions_2023_{name}_v0063.csv")
        model_monthly_df.to_csv(model_output_path, index=False)
        print(f"Monthly predictions for 2023 for model '{name}' saved to '{model_output_path}'")
    else:
        print(f"No monthly predictions available for 2023 for model '{name}'.")

# 5) Model Metrics (Including Ensemble)
if len(all_model_metrics) > 0:
    final_metrics_df = pd.concat(all_model_metrics, ignore_index=True)
    metrics_output_path = os.path.join(output_dir, "ua_model_metrics_2023_v0063.csv")
    final_metrics_df.to_csv(metrics_output_path, index=False)
    print(f"Model metrics (including Ensemble) saved to '{metrics_output_path}'")
else:
    print("No model metrics to save.")

# 6) Monthly 2023 Predictions: AVERAGED ACROSS LEVELS
if len(predictions_all_countries_across_levels) > 0:
    final_averages_df = pd.concat(predictions_all_countries_across_levels, ignore_index=True)
    final_averages_df['Year'] = 2023

    final_averages_output_path = os.path.join(
        output_dir,
        "ua_monthly_ensemble_predictions_2023_averaged_across_levels_v0063.csv"
    )
    final_averages_df.to_csv(final_averages_output_path, index=False)
    print(f"Averaged ensemble predictions across levels saved to '{final_averages_output_path}'")
else:
    print("No averaged ensemble predictions across levels available.")



--- Processing Country: Yemen ---
   >> Processing Level: 10 hPa
DEBUG: [Yemen, level=10] After dropping columns: (529728, 35)
DEBUG: [Yemen, level=10] After lag_1: shape=(526880, 36)
   >> Processing Level: 50 hPa
DEBUG: [Yemen, level=50] After dropping columns: (529728, 35)
DEBUG: [Yemen, level=50] After lag_1: shape=(526880, 36)
   >> Processing Level: 100 hPa
DEBUG: [Yemen, level=100] After dropping columns: (529728, 35)
DEBUG: [Yemen, level=100] After lag_1: shape=(526880, 36)
   >> Processing Level: 200 hPa
DEBUG: [Yemen, level=200] After dropping columns: (529728, 35)
DEBUG: [Yemen, level=200] After lag_1: shape=(526880, 36)
   >> Processing Level: 300 hPa
DEBUG: [Yemen, level=300] After dropping columns: (529728, 35)
DEBUG: [Yemen, level=300] After lag_1: shape=(526880, 36)
   >> Processing Level: 500 hPa
DEBUG: [Yemen, level=500] After dropping columns: (529728, 35)
DEBUG: [Yemen, level=500] After lag_1: shape=(526880, 36)
   >> Processing Level: 700 hPa
DEBUG: [Yemen, level=

DEBUG: [Saudi_Arabia, level=10] After lag_1: shape=(2049775, 36)
   >> Processing Level: 50 hPa
DEBUG: [Saudi_Arabia, level=50] After dropping columns: (2055300, 35)
DEBUG: [Saudi_Arabia, level=50] After lag_1: shape=(2049775, 36)
   >> Processing Level: 100 hPa
DEBUG: [Saudi_Arabia, level=100] After dropping columns: (2055300, 35)
DEBUG: [Saudi_Arabia, level=100] After lag_1: shape=(2049775, 36)
   >> Processing Level: 200 hPa
DEBUG: [Saudi_Arabia, level=200] After dropping columns: (2055300, 35)
DEBUG: [Saudi_Arabia, level=200] After lag_1: shape=(2049775, 36)
   >> Processing Level: 300 hPa
DEBUG: [Saudi_Arabia, level=300] After dropping columns: (2055300, 35)
DEBUG: [Saudi_Arabia, level=300] After lag_1: shape=(2049775, 36)
   >> Processing Level: 500 hPa
DEBUG: [Saudi_Arabia, level=500] After dropping columns: (2055300, 35)
DEBUG: [Saudi_Arabia, level=500] After lag_1: shape=(2049775, 36)
   >> Processing Level: 700 hPa
DEBUG: [Saudi_Arabia, level=700] After dropping columns: (205