In [2]:

import os
import numpy as np
import pandas as pd

# Scikit-learn
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

# Additional metrics
from sklearn.metrics import (
    mean_absolute_error,
    r2_score,
    mean_squared_error
)

def rmse(y_true, y_pred):
    """Helper function to compute RMSE using scikit-learn's root_mean_squared_error with squared=False."""
    return mean_squared_error(y_true, y_pred)

# ---------------------------------------------
# 1) Define Parameter *Distributions* for Each Regressor
#    (Same dictionary as before; now used for random search)
# ---------------------------------------------
param_grids = {
    'lr': {
        'fit_intercept': [True, False],
        'copy_X': [True, False],
    },
    'svr': {
        'kernel': ['rbf', 'poly', 'sigmoid'],
        'gamma': ['scale', 'auto', 0.01, 0.1, 0.25, 0.5, 1.0],
        'C': [0.1, 0.5, 1.0, 10, 100],
        'epsilon': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5], 
        'max_iter': [-1]
    },
    'knn': {
        'n_neighbors': [3, 5, 7, 9, 13],
        'weights': ['uniform', 'distance'], 
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'leaf_size': [5, 10, 15, 20, 25, 30, 35, 50],
        'p': [2]
    },
    'dt': {
        'max_depth': [2, 3, 5, 7, 13, None],
        'min_samples_split': [3, 5, 10, 13],
        'max_features': [0.25, 0.5, 1.0, 2, 3, 5, 10, 15, 20, None],
        'random_state': [1, 4, 13, 19, 23, 42]
    },
    'rf': {
        'n_estimators': [10, 25, 50, 100, 150, 200],
        'max_depth': [2, 3, 5, 7, 13, 25, None],
        'min_samples_split': [3, 5, 10, 13],
        'max_features': [0.25, 0.5, 1.0, 2, 3, 5, 10, 15, 20, None],
        'random_state': [1, 4, 13, 19, 23, 42], 
    }
}


# -------------------------------------------------
# 2) Train Models Using RandomizedSearchCV
# -------------------------------------------------
def train_models(regressors, X_train, y_train, param_grids):
    """
    Train (and tune) each regressor on the training data using RandomizedSearchCV 
    and return the best models found.
    """
    models = []
    for name, model in regressors:
        print(f"Training and tuning model: {name}")
        
        # Retrieve param distributions for this model
        if name in param_grids:
            param_distribution = param_grids[name]
        else:
            # If no param grid is defined, just fit the default model
            param_distribution = {}
        
        # Set up RandomizedSearchCV
        rs = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_distribution,
            scoring='neg_mean_squared_error',  # Common scoring for regression
            cv=3,                              # 3-fold cross-validation
            n_jobs=-1,                         # Use all CPU cores
            verbose=1,                         # Prints progress messages
            n_iter=20,                         # Number of parameter settings to sample
            random_state=42                    # For reproducible results
        )
        
        # Fit the model
        rs.fit(X_train, y_train)
        
        # Get the best estimator
        best_model = rs.best_estimator_
        print(f"  Best Params for {name}: {rs.best_params_}\n")
        
        models.append((name, best_model))
    return models


# -------------------------------------------------
# 3) Weighted Ensemble Prediction
# -------------------------------------------------
def predict_weighted(models, X_val, y_val, X_test):
    """
    Compute weights for each model (inverse of RMSE) on (X_val, y_val),
    and make weighted ensemble predictions on X_test.

    Returns:
      final_prediction (ndarray): Weighted ensemble predictions on X_test.
      weights (ndarray): The normalized weights for each model (same order as 'models').
    """
    weights = []
    predictions = []

    # Compute RMSE on validation set -> weights (inverse RMSE)
    for name, model in models:
        y_pred_val = model.predict(X_val)
        curr_rmse = rmse(y_val, y_pred_val)
        weights.append(1 / (curr_rmse + 1e-10))
        predictions.append(model.predict(X_test))

    # Normalize weights
    weights = np.array(weights)
    weights /= np.sum(weights)

    # Weighted average of predictions
    predictions = np.array(predictions)  # shape: (num_models, num_samples)
    final_prediction = np.dot(weights, predictions)

    return final_prediction, weights

def compute_metrics_and_weights(models, X_val, y_val):
    """
    Compute each model's RMSE, MAE, R^2, and weight = 1/(RMSE + 1e-10).

    Returns: (model_names, rmse_list, mae_list, r2_list, weights)
    """
    from sklearn.metrics import mean_absolute_error, r2_score

    model_names = []
    rmse_list   = []
    mae_list    = []
    r2_list     = []
    weights     = []

    for name, model in models:
        y_pred_val = model.predict(X_val)
        curr_rmse  = rmse(y_val, y_pred_val)
        mae_val    = mean_absolute_error(y_val, y_pred_val)
        r2_val     = r2_score(y_val, y_pred_val)

        w = 1 / (curr_rmse + 1e-10)

        model_names.append(name)
        rmse_list.append(curr_rmse)
        mae_list.append(mae_val)
        r2_list.append(r2_val)
        weights.append(w)

    # Normalize weights
    weights = np.array(weights)
    weights /= weights.sum()

    return model_names, rmse_list, mae_list, r2_list, weights


# -------------------------------------------------
# 4) MAIN SCRIPT CONFIGURATION
# -------------------------------------------------
if __name__ == "__main__":
    # ------------------------- CONFIGURATION -------------------------
    base_path = r"Z:\\Thesis\\Data\\ML_Data\\AP_training2"
    output_dir = r"Z:\\Thesis\\Data\\test\\DustCast\\SFC\\DC_v0050_RS"
    os.makedirs(output_dir, exist_ok=True)

    train_years = [1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 2016, 2017, 2018, 2019, 2020]
    test_year = 2021

    countries = [
        "Yemen", 
        "Bahrain", 
        "Kuwait", 
        "Oman", 
        "Qatar", 
        "Saudi_Arabia", 
        "United_Arab_Emirates"
    ]

    # Define regressors (initial models)
    regressors = [
        ('lr', LinearRegression()),
        ('svr', SVR()),
        ('knn', KNeighborsRegressor()),
        ('dt', DecisionTreeRegressor()),
        ('rf', RandomForestRegressor()),
    ]

    target_column = 'DUSMASS_mean'

    cols_to_drop = [
        'sst_min', 'sst_max', 'sst_mean',
        'DUSMASS25_min', 'DUSMASS25_max', 'DUSMASS25_mean',
        'DUSMASS_min', 'DUSMASS_max',
        'h3_res_5', 'h3_res_6', 'h3_res_5_int', 'h3_res_6_int',
        'h3_res_3', 'h3_res_3_int', 'h3_res_4', 'h3_res_4_int'
    ]

    # ------------------------- OUTPUT STORAGES -------------------------
    all_feature_importances = []
    predictions_2021_all_countries = []
    monthly_ensemble_predictions_2021 = []
    monthly_model_predictions_2021 = {name: [] for name, _ in regressors}

    all_model_metrics = []  # Store each country's model metrics (RMSE, MAE, R2, Weight), plus ensemble metrics

    # ------------------------- MAIN LOOP -------------------------
    for country in countries:
        try:
            # 1) Load and combine training data
            train_frames = []
            for yr in train_years:
                file_path = os.path.join(base_path, str(yr), f"{country}_{yr}_surface_monthly_stats_merged.parquet")
                if not os.path.exists(file_path):
                    print(f"Training file not found: {file_path}")
                    continue
                df_temp = pd.read_parquet(file_path)
                train_frames.append(df_temp)
            
            if len(train_frames) == 0:
                print(f"No training data found for {country}. Skipping...")
                continue

            train_df = pd.concat(train_frames, ignore_index=True)
            print(f"DEBUG: [{country}] After concatenation: train_df.shape = {train_df.shape}")

            train_df['time'] = pd.to_datetime(train_df['time'])

            # Drop columns
            train_df.drop(columns=[c for c in cols_to_drop if c in train_df.columns], 
                          inplace=True, errors='ignore')
            print(f"DEBUG: [{country}] After dropping columns: train_df.shape = {train_df.shape}")

            # Create a 1-month lag of the target
            train_df = train_df.sort_values(by=['lat', 'lon', 'time'])
            print(f"DEBUG: [{country}] After sorting: train_df.shape = {train_df.shape}")
            train_df['lag_1'] = train_df.groupby(['lat', 'lon'])[target_column].shift(1)
            print(f"DEBUG: [{country}] After creating 'lag_1': train_df.shape = {train_df.shape}")

            # Drop NaNs for target or lag_1
            train_df = train_df.dropna(subset=[target_column, 'lag_1'])
            print(f"DEBUG: [{country}] After dropping NaNs for target and lag_1: train_df.shape = {train_df.shape}")

            # 2) Features / Target
            X_train_full = train_df.drop(columns=[target_column, 'time'], errors='ignore')
            y_train_full = train_df[target_column]
            print(f"DEBUG: [{country}] X_train_full.shape = {X_train_full.shape}, y_train_full.shape = {y_train_full.shape}")

            # 3) Train-Validation Split
            X_train_part, X_val_part, y_train_part, y_val_part = train_test_split(
                X_train_full,
                y_train_full,
                test_size=0.3,
                random_state=42
            )
            print(f"DEBUG: [{country}] Split -> X_train_part={X_train_part.shape}, X_val_part={X_val_part.shape}")

            # Impute
            imputer = SimpleImputer(strategy='mean')
            X_train_part_imp = imputer.fit_transform(X_train_part)
            X_val_part_imp   = imputer.transform(X_val_part)

            X_train_part = pd.DataFrame(X_train_part_imp, columns=X_train_part.columns, index=X_train_part.index)
            X_val_part   = pd.DataFrame(X_val_part_imp,   columns=X_val_part.columns,   index=X_val_part.index)

            # 4) Train Models with RandomizedSearchCV
            models = train_models(regressors, X_train_part, y_train_part, param_grids)
            print(f"DEBUG: [{country}] Finished training (and tuning) models.")

            # 4.1) Feature Importances
            feature_importances = pd.DataFrame(index=X_train_part.columns)
            for name, model in models:
                if hasattr(model, 'feature_importances_'):
                    feature_importances[name] = model.feature_importances_

            if not feature_importances.empty:
                feature_importances.reset_index(inplace=True)
                feature_importances.rename(columns={'index': 'Feature'}, inplace=True)
                feature_importances['Country'] = country
                all_feature_importances.append(feature_importances)

            # 4.2) Compute model-level metrics
            model_names, rmse_list, mae_list, r2_list, weights_array = compute_metrics_and_weights(models, X_val_part, y_val_part)
            metrics_df = pd.DataFrame({
                'Country': country,
                'Model': model_names,
                'RMSE': rmse_list,
                'MAE': mae_list,
                'R2': r2_list,
                'Weight': weights_array
            })

            # 4.3) Ensemble metrics on validation
            ensemble_val_preds, _ = predict_weighted(models, X_val_part, y_val_part, X_val_part)
            ensemble_rmse_val = rmse(y_val_part, ensemble_val_preds)
            ensemble_mae_val  = mean_absolute_error(y_val_part, ensemble_val_preds)
            ensemble_r2_val   = r2_score(y_val_part, ensemble_val_preds)

            # Instead of .append(), use pd.concat + a 1-row DataFrame
            ensemble_row = {
                'Country': country,
                'Model': 'ensemble',
                'RMSE': ensemble_rmse_val,
                'MAE': ensemble_mae_val,
                'R2': ensemble_r2_val,
                'Weight': 1.0
            }
            ensemble_df = pd.DataFrame([ensemble_row])
            metrics_df = pd.concat([metrics_df, ensemble_df], ignore_index=True)

            all_model_metrics.append(metrics_df)

            # 5) Prepare Test Data for 2021 Month-by-Month
            test_file_path = os.path.join(base_path, str(test_year), f"{country}_{test_year}_surface_monthly_stats_merged.parquet")
            if not os.path.exists(test_file_path):
                print(f"Test file (2021) not found for {country}. Skipping...")
                continue

            test_df_2021 = pd.read_parquet(test_file_path)
            print(f"DEBUG: [{country}] Loaded test_df_2021: shape = {test_df_2021.shape}")

            test_df_2021['time'] = pd.to_datetime(test_df_2021['time'])
            test_df_2021.drop(columns=[c for c in cols_to_drop if c in test_df_2021.columns], inplace=True, errors='ignore')
            print(f"DEBUG: [{country}] After dropping columns in test_df_2021: {test_df_2021.shape}")

            test_df_2021 = test_df_2021.sort_values(by=['lat', 'lon', 'time'])
            test_df_2021['Month'] = test_df_2021['time'].dt.to_period('M')
            print(f"DEBUG: [{country}] After sorting test_df_2021: shape={test_df_2021.shape}")

            # Dec 2020 data for seeding Jan 2021
            dec_2020 = train_df.loc[train_df['time'].dt.to_period('M') == pd.Period('2020-12')]
            dec_2020 = dec_2020[['lat', 'lon', target_column]].rename(columns={target_column: 'lag_1'})
            dec_2020['Month'] = pd.Period('2021-01')

            month_preds_list = []
            unique_months_2021 = sorted(test_df_2021['Month'].unique())
            print(f"DEBUG: [{country}] unique_months_2021 = {unique_months_2021}")

            for i, mon in enumerate(unique_months_2021):
                this_month_df = test_df_2021[test_df_2021['Month'] == mon].copy()
                print(f"DEBUG: [{country}] Month={mon}, initial this_month_df.shape = {this_month_df.shape}")

                if i == 0:
                    print(f"DEBUG: [{country}] Merging with dec_2020: dec_2020.shape = {dec_2020.shape}")
                    this_month_df = pd.merge(
                        this_month_df,
                        dec_2020[['lat', 'lon', 'lag_1']],
                        on=['lat', 'lon'],
                        how='left'
                    )
                    print(f"DEBUG: [{country}] After merge (Jan 2021): this_month_df.shape = {this_month_df.shape}")
                else:
                    prev_month_data = month_preds_list[-1][['lat', 'lon', 'ensemble_predictions']].copy()
                    prev_month_data.rename(columns={'ensemble_predictions': 'lag_1'}, inplace=True)
                    prev_month_data['Month'] = mon

                    print(f"DEBUG: [{country}] prev_month_data.shape = {prev_month_data.shape}")
                    this_month_df = pd.merge(
                        this_month_df,
                        prev_month_data[['lat', 'lon', 'lag_1']],
                        on=['lat', 'lon'],
                        how='left'
                    )
                    print(f"DEBUG: [{country}] After merge (Month={mon}): this_month_df.shape = {this_month_df.shape}")

                # Drop rows w/o valid lag_1
                this_month_df.dropna(subset=['lag_1'], inplace=True)
                print(f"DEBUG: [{country}] After dropping NA lag_1: this_month_df.shape = {this_month_df.shape}")

                # Prepare X
                X_cols_to_drop = [target_column, 'time', 'Month']
                X_this_month = this_month_df.drop(columns=[c for c in X_cols_to_drop if c in this_month_df.columns],
                                                  errors='ignore')
                print(f"DEBUG: [{country}] X_this_month.shape = {X_this_month.shape}")

                # Impute
                X_this_month_imputed = imputer.transform(X_this_month)
                X_this_month = pd.DataFrame(X_this_month_imputed, columns=X_this_month.columns, index=X_this_month.index)

                # Weighted ensemble
                y_ensemble_2021, _ = predict_weighted(models, X_val_part, y_val_part, X_this_month)
                this_month_df['ensemble_predictions'] = y_ensemble_2021

                # Individual models
                for name, model in models:
                    this_month_df[f"{name}_predictions"] = model.predict(X_this_month)

                month_preds_list.append(this_month_df)

            if len(month_preds_list) > 0:
                test_df_2021_preds = pd.concat(month_preds_list, ignore_index=True)
                print(f"DEBUG: [{country}] test_df_2021_preds.shape (after concat months) = {test_df_2021_preds.shape}")
            else:
                test_df_2021_preds = pd.DataFrame()

            # 6) Store row-level predictions
            if not test_df_2021_preds.empty:
                predictions_2021_all_countries.append(test_df_2021_preds.copy())

                # 7) Monthly Aggregation
                monthly_ensemble = (
                    test_df_2021_preds
                    .groupby(['Month', 'lat', 'lon'])['ensemble_predictions']
                    .mean()
                    .reset_index()
                )
                monthly_ensemble['Country'] = country
                monthly_ensemble_predictions_2021.append(monthly_ensemble)

                for name, _ in models:
                    col_name = f"{name}_predictions"
                    monthly_model = (
                        test_df_2021_preds
                        .groupby(['Month', 'lat', 'lon'])[col_name]
                        .mean()
                        .reset_index()
                    )
                    monthly_model.rename(columns={col_name: 'DUSMASS_Mean_Predictions'}, inplace=True)
                    monthly_model['Country'] = country
                    monthly_model_predictions_2021[name].append(monthly_model)

        except Exception as e:
            print(f"Error processing {country}: {e}")


    print("Main loop complete! Saving Results...")

    # ------------------------- SAVE RESULTS -------------------------

    # 1) Feature Importances
    if all_feature_importances:
        all_feature_importances_df = pd.concat(all_feature_importances, ignore_index=True)
    else:
        all_feature_importances_df = pd.DataFrame()
    fi_output_path = os.path.join(output_dir, "sfc_feature_importances_2016_2020.csv")
    all_feature_importances_df.to_csv(fi_output_path, index=False)
    print(f"Feature importances saved to '{fi_output_path}'")

    # 2) Row-Level 2021 Predictions
    if len(predictions_2021_all_countries) > 0:
        full_predictions_2021 = pd.concat(predictions_2021_all_countries, ignore_index=True)
        print(f"DEBUG: [ALL] full_predictions_2021.shape = {full_predictions_2021.shape}")
        output_2021_path = os.path.join(output_dir, "sfc_row_level_predictions_2021.csv")
        full_predictions_2021.to_csv(output_2021_path, index=False)
        print(f"Row-level predictions for 2021 (ensemble + individual models) saved to '{output_2021_path}'")
    else:
        print("No row-level predictions available for 2021.")

    # 3) Monthly 2021 Predictions: ENSEMBLE
    if len(monthly_ensemble_predictions_2021) > 0:
        monthly_ensemble_df_2021 = pd.concat(monthly_ensemble_predictions_2021, ignore_index=True)
        monthly_ensemble_df_2021['Year'] = 2021
        print(f"DEBUG: [ALL] monthly_ensemble_df_2021.shape = {monthly_ensemble_df_2021.shape}")
        ensemble_monthly_output_path = os.path.join(output_dir, "sfc_monthly_ensemble_predictions_2021.csv")
        monthly_ensemble_df_2021.to_csv(ensemble_monthly_output_path, index=False)
        print(f"Monthly ensemble predictions for 2021 saved to '{ensemble_monthly_output_path}'")
    else:
        print("No monthly ensemble predictions available for 2021.")

    # 4) Monthly 2021 Predictions: INDIVIDUAL MODELS
    for name in monthly_model_predictions_2021:
        if len(monthly_model_predictions_2021[name]) > 0:
            model_monthly_df = pd.concat(monthly_model_predictions_2021[name], ignore_index=True)
            model_monthly_df['Year'] = 2021
            print(f"DEBUG: [ALL] {name} monthly dataframe shape = {model_monthly_df.shape}")
            model_output_path = os.path.join(output_dir, f"sfc_monthly_predictions_2021_{name}.csv")
            model_monthly_df.to_csv(model_output_path, index=False)
            print(f"Monthly predictions for 2021 for model '{name}' saved to '{model_output_path}'")
        else:
            print(f"No monthly predictions available for 2021 for model '{name}'.")

    # 5) Model Metrics (Including Ensemble)
    if len(all_model_metrics) > 0:
        final_metrics_df = pd.concat(all_model_metrics, ignore_index=True)
        metrics_output_path = os.path.join(output_dir, "sfc_model_metrics_2016_2020.csv")
        final_metrics_df.to_csv(metrics_output_path, index=False)
        print(f"Model metrics (including Ensemble) saved to '{metrics_output_path}'")
    else:
        print("No model metrics to save.")


DEBUG: [Yemen] After concatenation: train_df.shape = (96432, 104)
DEBUG: [Yemen] After dropping columns: train_df.shape = (96432, 88)
DEBUG: [Yemen] After sorting: train_df.shape = (96432, 88)
DEBUG: [Yemen] After creating 'lag_1': train_df.shape = (96432, 89)
DEBUG: [Yemen] After dropping NaNs for target and lag_1: train_df.shape = (95284, 89)
DEBUG: [Yemen] X_train_full.shape = (95284, 87), y_train_full.shape = (95284,)
DEBUG: [Yemen] Split -> X_train_part=(66698, 87), X_val_part=(28586, 87)
Training and tuning model: lr
Fitting 3 folds for each of 4 candidates, totalling 12 fits




  Best Params for lr: {'fit_intercept': True, 'copy_X': True}

Training and tuning model: svr
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for svr: {'max_iter': -1, 'kernel': 'sigmoid', 'gamma': 0.25, 'epsilon': 1.5, 'C': 10}

Training and tuning model: knn
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for knn: {'weights': 'distance', 'p': 2, 'n_neighbors': 5, 'leaf_size': 35, 'algorithm': 'ball_tree'}

Training and tuning model: dt
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for dt: {'random_state': 42, 'min_samples_split': 13, 'max_features': 0.25, 'max_depth': None}

Training and tuning model: rf
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for rf: {'random_state': 4, 'n_estimators': 10, 'min_samples_split': 5, 'max_features': 20, 'max_depth': 13}

DEBUG: [Yemen] Finished training (and tuning) models.
DEBUG: [Yemen] Loaded test_df_2021: shape = (6888, 104)
DEBUG: [



  Best Params for lr: {'fit_intercept': True, 'copy_X': True}

Training and tuning model: svr
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for svr: {'max_iter': -1, 'kernel': 'sigmoid', 'gamma': 0.25, 'epsilon': 1.5, 'C': 10}

Training and tuning model: knn
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for knn: {'weights': 'distance', 'p': 2, 'n_neighbors': 7, 'leaf_size': 15, 'algorithm': 'auto'}

Training and tuning model: dt
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for dt: {'random_state': 1, 'min_samples_split': 3, 'max_features': 15, 'max_depth': 2}

Training and tuning model: rf
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for rf: {'random_state': 4, 'n_estimators': 100, 'min_samples_split': 10, 'max_features': 0.5, 'max_depth': 7}

DEBUG: [Bahrain] Finished training (and tuning) models.
DEBUG: [Bahrain] Loaded test_df_2021: shape = (24, 104)
DEBUG: [Bahrain]



  Best Params for svr: {'max_iter': -1, 'kernel': 'sigmoid', 'gamma': 0.25, 'epsilon': 1.5, 'C': 10}

Training and tuning model: knn
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for knn: {'weights': 'distance', 'p': 2, 'n_neighbors': 5, 'leaf_size': 35, 'algorithm': 'ball_tree'}

Training and tuning model: dt
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for dt: {'random_state': 13, 'min_samples_split': 13, 'max_features': 1.0, 'max_depth': 5}

Training and tuning model: rf
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for rf: {'random_state': 23, 'n_estimators': 200, 'min_samples_split': 5, 'max_features': 0.25, 'max_depth': 25}

DEBUG: [Kuwait] Finished training (and tuning) models.
DEBUG: [Kuwait] Loaded test_df_2021: shape = (372, 104)
DEBUG: [Kuwait] After dropping columns in test_df_2021: (372, 88)
DEBUG: [Kuwait] After sorting test_df_2021: shape=(372, 89)
DEBUG: [Kuwait] unique_months_2021 =



  Best Params for lr: {'fit_intercept': True, 'copy_X': True}

Training and tuning model: svr
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for svr: {'max_iter': -1, 'kernel': 'sigmoid', 'gamma': 0.25, 'epsilon': 1.5, 'C': 10}

Training and tuning model: knn
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for knn: {'weights': 'distance', 'p': 2, 'n_neighbors': 5, 'leaf_size': 35, 'algorithm': 'ball_tree'}

Training and tuning model: dt
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for dt: {'random_state': 42, 'min_samples_split': 13, 'max_features': None, 'max_depth': 13}

Training and tuning model: rf
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for rf: {'random_state': 23, 'n_estimators': 200, 'min_samples_split': 5, 'max_features': 0.25, 'max_depth': 25}

DEBUG: [Oman] Finished training (and tuning) models.
DEBUG: [Oman] Loaded test_df_2021: shape = (6192, 104)
DEBUG: [



  Best Params for lr: {'fit_intercept': True, 'copy_X': True}

Training and tuning model: svr
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for svr: {'max_iter': -1, 'kernel': 'sigmoid', 'gamma': 0.25, 'epsilon': 1.5, 'C': 10}

Training and tuning model: knn
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for knn: {'weights': 'distance', 'p': 2, 'n_neighbors': 5, 'leaf_size': 35, 'algorithm': 'auto'}

Training and tuning model: dt
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for dt: {'random_state': 42, 'min_samples_split': 13, 'max_features': 0.25, 'max_depth': None}

Training and tuning model: rf
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for rf: {'random_state': 23, 'n_estimators': 200, 'min_samples_split': 10, 'max_features': 20, 'max_depth': 7}

DEBUG: [Qatar] Finished training (and tuning) models.
DEBUG: [Qatar] Loaded test_df_2021: shape = (156, 104)
DEBUG: [Qata



  Best Params for lr: {'fit_intercept': False, 'copy_X': True}

Training and tuning model: svr
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for svr: {'max_iter': -1, 'kernel': 'sigmoid', 'gamma': 0.25, 'epsilon': 1.5, 'C': 10}

Training and tuning model: knn
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for knn: {'weights': 'distance', 'p': 2, 'n_neighbors': 5, 'leaf_size': 35, 'algorithm': 'ball_tree'}

Training and tuning model: dt
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for dt: {'random_state': 42, 'min_samples_split': 13, 'max_features': 0.25, 'max_depth': None}

Training and tuning model: rf
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for rf: {'random_state': 23, 'n_estimators': 200, 'min_samples_split': 5, 'max_features': 0.25, 'max_depth': 25}

DEBUG: [Saudi_Arabia] Finished training (and tuning) models.
DEBUG: [Saudi_Arabia] Loaded test_df_2021: shape = (



  Best Params for lr: {'fit_intercept': True, 'copy_X': True}

Training and tuning model: svr
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for svr: {'max_iter': -1, 'kernel': 'sigmoid', 'gamma': 0.25, 'epsilon': 1.5, 'C': 10}

Training and tuning model: knn
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for knn: {'weights': 'distance', 'p': 2, 'n_neighbors': 5, 'leaf_size': 35, 'algorithm': 'ball_tree'}

Training and tuning model: dt
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for dt: {'random_state': 23, 'min_samples_split': 3, 'max_features': 0.25, 'max_depth': 13}

Training and tuning model: rf
Fitting 3 folds for each of 20 candidates, totalling 60 fits
  Best Params for rf: {'random_state': 23, 'n_estimators': 200, 'min_samples_split': 5, 'max_features': 0.25, 'max_depth': 25}

DEBUG: [United_Arab_Emirates] Finished training (and tuning) models.
DEBUG: [United_Arab_Emirates] Loaded test_df_202