In [1]:
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns
import itertools as it
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import mutual_info_regression as mir
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder as onehot
from sklearn import linear_model
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from statsmodels.tsa.deterministic import DeterministicProcess
from sklearn.metrics import mean_absolute_percentage_error as mape
from statsmodels.tsa.stattools import pacf
import xgboost as xgb
from itertools import product
from scipy import signal
from scipy import stats
from statsmodels.tsa.deterministic import Fourier
from sklearn.model_selection import TimeSeriesSplit

## Data Preprocessing

In [2]:
X_train = pd.read_csv('../challenge_data/X_train.csv')
Y_train = pd.read_csv('../challenge_data/Y_train.csv')
X_test = pd.read_csv('../challenge_data/X_test.csv')

In [3]:
from Util.tools import *
X_train_clean = fill_missing_with_average(X_train)
X_test_clean = fill_missing_with_average(X_test)

#drop those cols according to EDA
X_train_clean=X_train_clean.drop(["DE_FR_EXCHANGE","DE_NET_IMPORT","FR_NET_IMPORT"],axis=1)
X_test_clean=X_test_clean.drop(["DE_FR_EXCHANGE","DE_NET_IMPORT","FR_NET_IMPORT"],axis=1)

#Split into DE & FR
X_train_de = X_train_clean[X_train_clean['COUNTRY'] == 'DE']
X_test_de = X_test_clean[X_test_clean['COUNTRY'] == 'DE']

X_train_fr = X_train_clean[X_train_clean['COUNTRY'] == 'FR']
X_test_fr = X_test_clean[X_test_clean['COUNTRY'] == 'FR']

# merge TARGET
X_train_de = pd.merge(X_train_de, Y_train, on='ID', how='inner').sort_values('DAY_ID')
X_train_fr = pd.merge(X_train_fr, Y_train, on='ID', how='inner').sort_values('DAY_ID')

## Feature Selections

In [4]:
# select correlation bigger than 0.05 
def get_sorted_correlations_and_features(X_train, threshold=0.05):
    correlations = {}
    for column in X_train.columns:
        if column == 'TARGET':  
            continue
        corr, _ = spearmanr(X_train[column], X_train['TARGET'])
        correlations[column] = corr

    corr_df = pd.DataFrame(list(correlations.items()), columns=['Feature', 'Correlation'])
    corr_df['Absolute_Correlation'] = corr_df['Correlation'].abs()
    sorted_corr_df = corr_df.sort_values('Absolute_Correlation', ascending=False)
    
    selected_features = sorted_corr_df[sorted_corr_df['Absolute_Correlation'] >= threshold]['Feature']
    features_selected = selected_features.tolist()
    
    return sorted_corr_df, features_selected


In [5]:
sorted_corr_df_de, features_selected_de = get_sorted_correlations_and_features(X_train_de)
print("Selected Features for DE:")
print(features_selected_de)

sorted_corr_df_fr, features_selected_fr = get_sorted_correlations_and_features(X_train_fr)
print("Selected Features for FR:")
print(features_selected_fr)

Selected Features for DE:
['DE_RESIDUAL_LOAD', 'DE_NET_EXPORT', 'DE_WINDPOW', 'DE_GAS', 'DE_HYDRO', 'FR_WINDPOW', 'DE_COAL', 'DE_WIND', 'DE_LIGNITE', 'FR_DE_EXCHANGE', 'FR_WIND', 'FR_GAS', 'DE_CONSUMPTION', 'FR_RAIN', 'FR_HYDRO']
Selected Features for FR:
['CARBON_RET', 'GAS_RET', 'FR_WINDPOW', 'DE_HYDRO', 'DE_WINDPOW', 'DE_NET_EXPORT', 'FR_HYDRO', 'FR_COAL', 'DE_RAIN', 'COAL_RET', 'DE_RESIDUAL_LOAD', 'DE_CONSUMPTION']




In [6]:
# Split the data

X_trainde, X_testde, Y_trainde, Y_testde = train_test_split(X_train_de[features_selected_de], X_train_de['TARGET'], test_size=0.2, random_state=42)
X_trainfr, X_testfr, Y_trainfr, Y_testfr = train_test_split(X_train_fr[features_selected_fr], X_train_fr['TARGET'], test_size=0.2, random_state=42)

## Bagging Models

In [44]:
from scipy.stats import spearmanr
import numpy as np

from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

def metric_train(output, truth):
    return spearmanr(output, truth).correlation


def get_model(model_name, best_param=None):
    if model_name == 'dt':
        model = DecisionTreeRegressor(**(best_param if best_param else {}))
    elif model_name == 'bagging_ridge':
        base_model = Ridge(**(best_param['base_model'] if best_param and 'base_model' in best_param else {}))
        model = BaggingRegressor(base_estimator=base_model, n_estimators=10, random_state=42, **(best_param['model'] if best_param and 'model' in best_param else {}))
    elif model_name == 'extra_trees':
        model = ExtraTreesRegressor(**(best_param if best_param else {}))
    elif model_name == 'rf':
        model = RandomForestRegressor(**(best_param if best_param else {}))
    elif model_name == 'bagging_knn':
        base_model = KNeighborsRegressor(**(best_param['base_model'] if best_param and 'base_model' in best_param else {}))
        model = BaggingRegressor(base_estimator=base_model, n_estimators=10, random_state=42, **(best_param['model'] if best_param and 'model' in best_param else {}))
    elif model_name == 'bagging_svr':
        base_model = SVR(**(best_param['base_model'] if best_param and 'base_model' in best_param else {}))
        model = BaggingRegressor(base_estimator=base_model, n_estimators=10, random_state=42, **(best_param['model'] if best_param and 'model' in best_param else {}))
    elif model_name == 'bagging_linear':
        base_model = LinearRegression(**(best_param['base_model'] if best_param and 'base_model' in best_param else {}))
        model = BaggingRegressor(base_estimator=base_model, n_estimators=10, random_state=42, **(best_param['model'] if best_param and 'model' in best_param else {}))
    elif model_name == 'adaboost':
        model = AdaBoostRegressor(**(best_param if best_param else {}))
    elif model_name == 'gradient_boosting':
        model = GradientBoostingRegressor(**(best_param if best_param else {}))
    else:
        raise ValueError('Unknown Model')
    return model
        
scorer_train = make_scorer(metric_train)

In [None]:
model_names = [
    'dt',  # Decision Tree Regressor
    'bagging_ridge',  # Bagging model based on Ridge regression
    'extra_trees',  # Extra Trees Regressor
    'rf',  # Random Forest Regressor
    'bagging_knn',  # Bagging model based on KNN regression
    'bagging_svr',  # Bagging model based on SVR
    'bagging_linear',  # Bagging model based on Linear regression
    'adaboost',  # AdaBoost Regressor
    'gradient_boosting'  # Gradient Boosting Regressor
]

# Train and evaluate models
results = []

for model_name in model_names:
    model = get_model(model_name)
    
    # Train on DE dataset
    model.fit(X_trainde, Y_trainde)
    predictions_de = model.predict(X_testde)
    score_de = metric_train(predictions_de, Y_testde)
    
    # Train on FR dataset
    model.fit(X_trainfr, Y_trainfr)
    predictions_fr = model.predict(X_testfr)  
    score_fr = metric_train(predictions_fr, Y_testfr)
    
    # Overall Score
    predictions_overall = np.concatenate((predictions_de, predictions_fr))
    truth_overall = np.concatenate((Y_testde, Y_testfr))
    score_overall = metric_train(predictions_overall, truth_overall)
    
    results.append({
        'Model': model_name,
        'DE_Train_Score': score_de,
        'FR_Train_Score': score_fr,
        'Overall_Score': score_overall  
    })

# Convert results to DataFrame for better visualization
import pandas as pd
results_df = pd.DataFrame(results)

In [46]:
print(results_df)

               Model  DE_Train_Score  FR_Train_Score  Overall_Score
0                 dt        0.166270        0.066704       0.106568
1      bagging_ridge        0.491419        0.158929       0.313878
2        extra_trees        0.189837        0.211448       0.201050
3                 rf        0.338914        0.208304       0.262706
4        bagging_knn        0.134548        0.102892       0.107535
5        bagging_svr        0.412729        0.245307       0.320654
6     bagging_linear        0.491704        0.156625       0.314120
7           adaboost        0.364300        0.097936       0.201350
8  gradient_boosting        0.296987        0.265450       0.271008


## Tuning

In [23]:
#Tune only 1.bagging_ridge 2,extra_trees 3,random_forest 4,bagging_svr 5,bagging_linear

### Random_Forest

In [None]:
import optuna

def rf_hyperparameter_optimization(X, y, cv=5, n_trials=10, seed=42):
    def objective(trial):
        param = {
            "n_estimators": 100,  
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 64),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 2, 64),
            "max_features": trial.suggest_float("max_features", 0.2, 1.0),
            "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 64),
            "min_impurity_decrease": trial.suggest_float("min_impurity_decrease", 1e-2, 1.0, log=True),
            "max_samples": trial.suggest_float("max_samples", 0.5, 1.0),
            "ccp_alpha": trial.suggest_float("ccp_alpha", 1e-2, 1.0, log=True),
        }

        model = RandomForestRegressor(random_state=seed, **param)
        
        scores = cross_val_score(model, X, y, cv=cv, scoring="neg_mean_squared_error")
        rmse_scores = np.sqrt(-scores)
        return np.mean(rmse_scores)  

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    return study.best_params

rf_best_paramsde = rf_hyperparameter_optimization(X_trainde, Y_trainde, cv=5, n_trials=10, seed=42)
rf_best_paramsfr = rf_hyperparameter_optimization(X_trainfr, Y_trainfr, cv=5, n_trials=10, seed=42)

In [25]:
print("rf_best_paramsde", rf_best_paramsde)
print("rf_best_paramsfr", rf_best_paramsfr)


rf_best_paramsde {'max_depth': 12, 'min_samples_split': 7, 'min_samples_leaf': 36, 'max_features': 0.7677065026437544, 'max_leaf_nodes': 15, 'min_impurity_decrease': 0.02324164438742622, 'max_samples': 0.8636723851254833, 'ccp_alpha': 0.04849481120754413}
rf_best_paramsfr {'max_depth': 11, 'min_samples_split': 46, 'min_samples_leaf': 19, 'max_features': 0.7589229311075041, 'max_leaf_nodes': 36, 'min_impurity_decrease': 0.3197690514647084, 'max_samples': 0.6611917747614966, 'ccp_alpha': 0.013952638388272324}


### Bagging_Ridge

In [None]:
def bagging_ridge_hyperparameter_optimization(X, y, cv=5, n_trials=10, seed=42):
    def objective(trial):

        ridge_param = {
            "alpha": trial.suggest_float("alpha", 0.1, 5.0, log=True),
            "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
        }
        ridge_model = Ridge(**ridge_param)
        

        bagging_param = {
            "n_estimators": trial.suggest_int("n_estimators", 5, 50),
            "max_samples": trial.suggest_float("max_samples", 0.5, 1.0),
            "max_features": trial.suggest_float("max_features", 0.5, 1.0),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
            "bootstrap_features": trial.suggest_categorical("bootstrap_features", [True, False]),
        }
        model = BaggingRegressor(base_estimator=ridge_model, random_state=seed, **bagging_param)
        
        scores = cross_val_score(model, X, y, cv=cv, scoring="neg_mean_squared_error")
        rmse_scores = np.sqrt(-scores)
        return np.mean(rmse_scores)  


    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    return study.best_params


bagging_ridge_best_paramsde = bagging_ridge_hyperparameter_optimization(X_trainde, Y_trainde, cv=5, n_trials=10, seed=42)
bagging_ridge_best_paramsfr = bagging_ridge_hyperparameter_optimization(X_trainfr, Y_trainfr, cv=5, n_trials=10, seed=42)


In [28]:
print("bagging_ridge_best_paramsde", bagging_ridge_best_paramsde)
print("bagging_ridge_best_paramsfr", bagging_ridge_best_paramsfr)

bagging_ridge_best_paramsde {'alpha': 0.24530935689241493, 'fit_intercept': False, 'n_estimators': 32, 'max_samples': 0.7836691725672775, 'max_features': 0.7028963498535951, 'bootstrap': False, 'bootstrap_features': True}
bagging_ridge_best_paramsfr {'alpha': 4.342477384102951, 'fit_intercept': False, 'n_estimators': 19, 'max_samples': 0.5047195405691205, 'max_features': 0.9636551412132504, 'bootstrap': True, 'bootstrap_features': True}


### Extra_Trees


In [None]:
def extra_trees_hyperparameter_optimization(X, y, cv=5, n_trials=10, seed=42):
    def objective(trial):
        param = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 5, 30),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 14),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 14),
            "max_features": trial.suggest_categorical("max_features", ["auto", "sqrt", "log2"]),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        }
        
        model = ExtraTreesRegressor(random_state=seed, **param)
        
        scores = cross_val_score(model, X, y, cv=cv, scoring="neg_mean_squared_error")
        rmse_scores = np.sqrt(-scores)
        return np.mean(rmse_scores)  

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    return study.best_params

extra_trees_best_paramsde = extra_trees_hyperparameter_optimization(X_trainde, Y_trainde, cv=5, n_trials=10, seed=42)
extra_trees_best_paramsfr = extra_trees_hyperparameter_optimization(X_trainfr, Y_trainfr, cv=5, n_trials=10, seed=42)

In [30]:
print("extra_trees_best_paramsde", extra_trees_best_paramsde)
print("extra_trees_best_paramsfr", extra_trees_best_paramsfr)

extra_trees_best_paramsde {'n_estimators': 347, 'max_depth': 15, 'min_samples_split': 8, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True}
extra_trees_best_paramsfr {'n_estimators': 803, 'max_depth': 29, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': True}


### bagging_svr

In [None]:
def bagging_svr_hyperparameter_optimization(X, y, cv=5, n_trials=10, seed=42):
    def objective(trial):
        svr_param = {
            "C": trial.suggest_float("C", 0.5, 10.0, log=True),
            "epsilon": trial.suggest_float("epsilon", 0.05, 1.0, log=True),
            "kernel": trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"]),
        }
        svr_model = SVR(**svr_param)
        
        bagging_param = {
            "n_estimators": trial.suggest_int("n_estimators", 5, 50),
            "max_samples": trial.suggest_float("max_samples", 0.5, 1.0),
            "max_features": trial.suggest_float("max_features", 0.5, 1.0),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
            "bootstrap_features": trial.suggest_categorical("bootstrap_features", [True, False]),
        }
        model = BaggingRegressor(base_estimator=svr_model, random_state=seed, **bagging_param)

        scores = cross_val_score(model, X, y, cv=cv, scoring="neg_mean_squared_error")
        rmse_scores = np.sqrt(-scores)
        return np.mean(rmse_scores) 

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    return study.best_params


bagging_svr_best_paramsde = bagging_svr_hyperparameter_optimization(X_trainde, Y_trainde, cv=5, n_trials=10, seed=42)
bagging_svr_best_paramsfr = bagging_svr_hyperparameter_optimization(X_trainfr, Y_trainfr, cv=5, n_trials=10, seed=42)


In [32]:
print("bagging_svr_best_paramsde", bagging_svr_best_paramsde)
print("bagging_svr_best_paramsfr", bagging_svr_best_paramsfr)

bagging_svr_best_paramsde {'C': 2.073111784706136, 'epsilon': 0.3380017802887977, 'kernel': 'linear', 'n_estimators': 46, 'max_samples': 0.9738641495774639, 'max_features': 0.9776745869238563, 'bootstrap': False, 'bootstrap_features': True}
bagging_svr_best_paramsfr {'C': 1.893184146184135, 'epsilon': 0.16610288223819752, 'kernel': 'linear', 'n_estimators': 29, 'max_samples': 0.7607259342411905, 'max_features': 0.6270018221375047, 'bootstrap': False, 'bootstrap_features': True}


## bagging_linear

In [None]:
def bagging_linear_hyperparameter_optimization(X, y, cv=5, n_trials=10, seed=42):
    def objective(trial):
        bagging_param = {
            "n_estimators": trial.suggest_int("n_estimators", 5, 100),
            "max_samples": trial.suggest_float("max_samples", 0.5, 1.0),
            "max_features": trial.suggest_float("max_features", 0.5, 1.0),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
            "bootstrap_features": trial.suggest_categorical("bootstrap_features", [True, False]),
        }
        base_model = LinearRegression()
        model = BaggingRegressor(base_estimator=base_model, random_state=seed, **bagging_param)
        
        scores = cross_val_score(model, X, y, cv=cv, scoring="neg_mean_squared_error")
        rmse_scores = np.sqrt(-scores)
        return np.mean(rmse_scores)  

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    return study.best_params

bagging_linear_best_paramsde = bagging_linear_hyperparameter_optimization(X_trainde, Y_trainde, cv=5, n_trials=10, seed=42)
bagging_linear_best_paramsfr = bagging_linear_hyperparameter_optimization(X_trainfr, Y_trainfr, cv=5, n_trials=10, seed=42)



In [34]:
print("bagging_linear_best_paramsde:", bagging_linear_best_paramsde)
print("bagging_linear_best_paramsfr:", bagging_linear_best_paramsfr)

bagging_linear_best_paramsde: {'n_estimators': 98, 'max_samples': 0.9884336579539678, 'max_features': 0.5972669840012728, 'bootstrap': False, 'bootstrap_features': True}
bagging_linear_best_paramsfr: {'n_estimators': 76, 'max_samples': 0.7151840983643662, 'max_features': 0.6525844739910632, 'bootstrap': False, 'bootstrap_features': True}


In [None]:
model = RandomForestRegressor(random_state=42)
model.set_params(**rf_best_paramsde)  
    
model.fit(X_trainde, Y_trainde)
predictions_de = model.predict(X_testde)  
score_de = metric_train(predictions_fr, Y_testfr)
score_de

In [54]:
results = []
def apply_best_params(model, best_params):
    # This function assumes best_params is a dictionary of parameters to be set
    model.set_params(**best_params)
    return model

for model_name, best_params_de, best_params_fr in [
    # Model names and best_params placeholders
]:
    # Initialize the model
    model_de = get_model(model_name)
    model_fr = get_model(model_name)
    
    # Apply best parameters - this step is conceptual
    # You would need to ensure that best_params_* variables are dictionaries of parameters
    model_de = apply_best_params(model_de, best_params_de)
    model_fr = apply_best_params(model_fr, best_params_fr)


for model_name, best_params_de, best_params_fr in [
    ('rf', rf_best_paramsde, rf_best_paramsfr),
    ('bagging_ridge', bagging_ridge_best_paramsde, bagging_ridge_best_paramsfr),
    ('extra_trees', extra_trees_best_paramsde, extra_trees_best_paramsfr),
    ('bagging_svr', bagging_svr_best_paramsde, bagging_svr_best_paramsfr),
    ('bagging_linear', bagging_linear_best_paramsde, bagging_linear_best_paramsfr),
]:

    model_de = get_model(model_name, best_params_de)
    model_de.fit(X_trainde, Y_trainde)
    predictions_de = model_de.predict(X_testde)
    score_de = metric_train(predictions_de, Y_testde)
    
    model_fr = get_model(model_name, best_params_fr)
    model_fr.fit(X_trainfr, Y_trainfr)
    predictions_fr = model_fr.predict(X_testfr)
    score_fr = metric_train(predictions_fr, Y_testfr)

    predictions_overall = np.concatenate((predictions_de, predictions_fr))
    truth_overall = np.concatenate((Y_testde, Y_testfr))
    score_overall = metric_train(predictions_overall, truth_overall)
    
    results.append({
        'Model': model_name,
        'DE_Train_Score': score_de,
        'FR_Train_Score': score_fr,
        'Overall_Score': score_overall,
    })

for result in results:
    print(result)




{'Model': 'rf', 'DE_Train_Score': 0.44697180573913853, 'FR_Train_Score': nan, 'Overall_Score': 0.22149684435088007}
{'Model': 'bagging_ridge', 'DE_Train_Score': 0.4914188282647586, 'FR_Train_Score': 0.15892912856697147, 'Overall_Score': 0.31387770975233054}
{'Model': 'extra_trees', 'DE_Train_Score': 0.3921511627906977, 'FR_Train_Score': 0.18879928960568318, 'Overall_Score': 0.26966255180613113}
{'Model': 'bagging_svr', 'DE_Train_Score': 0.4127292039355993, 'FR_Train_Score': 0.24530683754529967, 'Overall_Score': 0.32065422949143874}
{'Model': 'bagging_linear', 'DE_Train_Score': 0.4917039355992845, 'FR_Train_Score': 0.15662514699882402, 'Overall_Score': 0.31411993466594074}


