In [1]:
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns
import itertools as it
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import mutual_info_regression as mir
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder as onehot
from sklearn import linear_model
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from statsmodels.tsa.deterministic import DeterministicProcess
from sklearn.metrics import mean_absolute_percentage_error as mape
from statsmodels.tsa.stattools import pacf
import xgboost as xgb
from itertools import product
from scipy import signal
from scipy import stats
from statsmodels.tsa.deterministic import Fourier
from sklearn.model_selection import TimeSeriesSplit

## Data Preprocessing

In [2]:
X_train = pd.read_csv('../challenge_data/X_train.csv')
Y_train = pd.read_csv('../challenge_data/Y_train.csv')
X_test = pd.read_csv('../challenge_data/X_test.csv')

In [3]:
from Util.tools import *
X_train_clean = fill_missing_with_average(X_train)
X_test_clean = fill_missing_with_average(X_test)

#drop those cols according to EDA
X_train_clean=X_train_clean.drop(["DE_FR_EXCHANGE","DE_NET_IMPORT","FR_NET_IMPORT"],axis=1)
X_test_clean=X_test_clean.drop(["DE_FR_EXCHANGE","DE_NET_IMPORT","FR_NET_IMPORT"],axis=1)

#Split into DE & FR
X_train_de = X_train_clean[X_train_clean['COUNTRY'] == 'DE']
X_test_de = X_test_clean[X_test_clean['COUNTRY'] == 'DE']

X_train_fr = X_train_clean[X_train_clean['COUNTRY'] == 'FR']
X_test_fr = X_test_clean[X_test_clean['COUNTRY'] == 'FR']

# merge TARGET
X_train_de = pd.merge(X_train_de, Y_train, on='ID', how='inner').sort_values('DAY_ID')
X_train_fr = pd.merge(X_train_fr, Y_train, on='ID', how='inner').sort_values('DAY_ID')

## Feature Selections

In [4]:
# select correlation bigger than 0.05 
def get_sorted_correlations_and_features(X_train, threshold=0.05):
    correlations = {}
    for column in X_train.columns:
        if column == 'TARGET':  
            continue
        corr, _ = spearmanr(X_train[column], X_train['TARGET'])
        correlations[column] = corr

    corr_df = pd.DataFrame(list(correlations.items()), columns=['Feature', 'Correlation'])
    corr_df['Absolute_Correlation'] = corr_df['Correlation'].abs()
    sorted_corr_df = corr_df.sort_values('Absolute_Correlation', ascending=False)
    
    selected_features = sorted_corr_df[sorted_corr_df['Absolute_Correlation'] >= threshold]['Feature']
    features_selected = selected_features.tolist()
    
    return sorted_corr_df, features_selected


In [5]:
sorted_corr_df_de, features_selected_de = get_sorted_correlations_and_features(X_train_de)
print("Selected Features for DE:")
print(features_selected_de)

sorted_corr_df_fr, features_selected_fr = get_sorted_correlations_and_features(X_train_fr)
print("Selected Features for FR:")
print(features_selected_fr)

Selected Features for DE:
['DE_RESIDUAL_LOAD', 'DE_NET_EXPORT', 'DE_WINDPOW', 'DE_GAS', 'DE_HYDRO', 'FR_WINDPOW', 'DE_COAL', 'DE_WIND', 'DE_LIGNITE', 'FR_DE_EXCHANGE', 'FR_WIND', 'FR_GAS', 'DE_CONSUMPTION', 'FR_RAIN', 'FR_HYDRO']
Selected Features for FR:
['CARBON_RET', 'GAS_RET', 'FR_WINDPOW', 'DE_HYDRO', 'DE_WINDPOW', 'DE_NET_EXPORT', 'FR_HYDRO', 'FR_COAL', 'DE_RAIN', 'COAL_RET', 'DE_RESIDUAL_LOAD', 'DE_CONSUMPTION']




In [6]:
# Split the data

X_trainde, X_testde, Y_trainde, Y_testde = train_test_split(X_train_de[features_selected_de], X_train_de['TARGET'], test_size=0.2, random_state=42)
X_trainfr, X_testfr, Y_trainfr, Y_testfr = train_test_split(X_train_fr[features_selected_fr], X_train_fr['TARGET'], test_size=0.2, random_state=42)

## Bagging Models

In [7]:
from scipy.stats import spearmanr
import numpy as np

from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

def metric_train(output, truth):
    return spearmanr(output, truth).correlation

def get_model(model_name):
    if model_name == 'dt':
        return DecisionTreeRegressor()
    elif model_name == 'bagging_ridge':
        base_model = Ridge()  # Use Ridge regression as the base model
        return BaggingRegressor(base_estimator=base_model, n_estimators=10, random_state=42)
    elif model_name == 'extra_trees':
        return ExtraTreesRegressor()
    elif model_name == 'rf':
        return RandomForestRegressor()
    elif model_name == 'bagging_knn':
        base_model = KNeighborsRegressor()  # Use KNN regression as the base model
        return BaggingRegressor(base_estimator=base_model, n_estimators=10, random_state=42)
    elif model_name == 'bagging_svr':
        base_model = SVR()  # Use Support Vector Machine regression as the base model
        return BaggingRegressor(base_estimator=base_model, n_estimators=10, random_state=42)
    elif model_name == 'bagging_linear':
        base_model = LinearRegression()  # Use Linear regression as the base model
        return BaggingRegressor(base_estimator=base_model, n_estimators=10, random_state=42)
    elif model_name == 'adaboost':
        return AdaBoostRegressor()
    elif model_name == 'gradient_boosting':
        return GradientBoostingRegressor()
    else:
        raise ValueError('Unknown Model')
        
scorer_train = make_scorer(metric_train)

In [None]:
model_names = [
    'dt',  # Decision Tree Regressor
    'bagging_ridge',  # Bagging model based on Ridge regression
    'extra_trees',  # Extra Trees Regressor
    'rf',  # Random Forest Regressor
    'bagging_knn',  # Bagging model based on KNN regression
    'bagging_svr',  # Bagging model based on SVR
    'bagging_linear',  # Bagging model based on Linear regression
    'adaboost',  # AdaBoost Regressor
    'gradient_boosting'  # Gradient Boosting Regressor
]

# Train and evaluate models
results = []

for model_name in model_names:
    model = get_model(model_name)
    
    # Train on DE dataset
    model.fit(X_trainde, Y_trainde)
    predictions_de = model.predict(X_testde)
    score_de = metric_train(predictions_de, Y_testde)
    
    # Train on FR dataset
    model.fit(X_trainfr, Y_trainfr)
    predictions_fr = model.predict(X_testfr)  
    score_fr = metric_train(predictions_fr, Y_testfr)
    
    results.append({
        'Model': model_name,
        'DE_Train_Score': score_de,
        'FR_Train_Score': score_fr
    })

# Convert results to DataFrame for better visualization
import pandas as pd
results_df = pd.DataFrame(results)

In [10]:
print(results_df)

               Model  DE_Train_Score  FR_Train_Score
0                 dt        0.190454        0.116455
1      bagging_ridge        0.491419        0.158929
2        extra_trees        0.198083        0.157024
3                 rf        0.265921        0.189728
4        bagging_knn        0.134548        0.102892
5        bagging_svr        0.412729        0.245307
6     bagging_linear        0.491704        0.156625
7           adaboost        0.286776        0.099414
8  gradient_boosting        0.279215        0.278748


## Tuning

In [None]:
#Tune only 1.bagging_ridge 2,extra_trees 3,random_forest 4,bagging_svr 5,bagging_linear

### Random_Forest

In [17]:
import optuna

def rf_hyperparameter_optimization(X, y, cv=5, n_trials=10, seed=42):
    def objective(trial):
        param = {
            "n_estimators": 100,  
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 64),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 2, 64),
            "max_features": trial.suggest_float("max_features", 0.2, 1.0),
            "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 64),
            "min_impurity_decrease": trial.suggest_float("min_impurity_decrease", 1e-2, 1.0, log=True),
            "max_samples": trial.suggest_float("max_samples", 0.5, 1.0),
            "ccp_alpha": trial.suggest_float("ccp_alpha", 1e-2, 1.0, log=True),
        }

        model = RandomForestRegressor(random_state=seed, **param)
        
        scores = cross_val_score(model, X, y, cv=cv, scoring="neg_mean_squared_error")
        rmse_scores = np.sqrt(-scores)
        return np.mean(rmse_scores)  

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    return study.best_params



In [27]:
rf_best_params = rf_hyperparameter_optimization(X_trainde, Y_trainde, cv=5, n_trials=10, seed=42)
print("best_params", rf_best_params)


[I 2024-02-18 03:36:18,144] A new study created in memory with name: no-name-0ec983ed-529b-4b4b-96e3-5b2f433f086f
[I 2024-02-18 03:36:19,173] Trial 0 finished with value: 1.0066674896558196 and parameters: {'max_depth': 8, 'min_samples_split': 15, 'min_samples_leaf': 43, 'max_features': 0.8185697975514876, 'max_leaf_nodes': 35, 'min_impurity_decrease': 0.19174870540761627, 'max_samples': 0.998922289931578, 'ccp_alpha': 0.07771970662659705}. Best is trial 0 with value: 1.0066674896558196.
[I 2024-02-18 03:36:20,074] Trial 1 finished with value: 1.0063175953113985 and parameters: {'max_depth': 7, 'min_samples_split': 59, 'min_samples_leaf': 25, 'max_features': 0.2493305504944348, 'max_leaf_nodes': 59, 'min_impurity_decrease': 0.0961114127571237, 'max_samples': 0.5970929434316429, 'ccp_alpha': 0.5921555508316173}. Best is trial 1 with value: 1.0063175953113985.
[I 2024-02-18 03:36:21,179] Trial 2 finished with value: 0.9802936640426765 and parameters: {'max_depth': 4, 'min_samples_split':

best_params {'max_depth': 4, 'min_samples_split': 25, 'min_samples_leaf': 28, 'max_features': 0.3157989158094407, 'max_leaf_nodes': 20, 'min_impurity_decrease': 0.014231628952015142, 'max_samples': 0.8042184913920745, 'ccp_alpha': 0.014411968858996087}


In [28]:
model = RandomForestRegressor(random_state=42)
model.set_params(**rf_best_params)  
    
# 训练模型
model.fit(X_trainde, Y_trainde)
predictions_de = model.predict(X_testde)  
score_de = metric_train(predictions_fr, Y_testfr)
score_de

0.2787481700146399

### Bagging_Ridge

In [None]:
def bagging_ridge_hyperparameter_optimization(X, y, cv=5, n_trials=10, seed=42):
    def objective(trial):

        ridge_param = {
            "alpha": trial.suggest_float("alpha", 0.1, 5.0, log=True),
            "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
        }
        ridge_model = Ridge(**ridge_param)
        

        bagging_param = {
            "n_estimators": trial.suggest_int("n_estimators", 5, 50),
            "max_samples": trial.suggest_float("max_samples", 0.5, 1.0),
            "max_features": trial.suggest_float("max_features", 0.5, 1.0),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
            "bootstrap_features": trial.suggest_categorical("bootstrap_features", [True, False]),
        }
        model = BaggingRegressor(base_estimator=ridge_model, random_state=seed, **bagging_param)
        
        scores = cross_val_score(model, X, y, cv=cv, scoring="neg_mean_squared_error")
        rmse_scores = np.sqrt(-scores)
        return np.mean(rmse_scores)  


    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    return study.best_params


bagging_ridge_best_params = bagging_ridge_hyperparameter_optimization(X_trainde, Y_trainde, cv=5, n_trials=10, seed=42)


In [20]:
print("bagging_ridge_best_params", bagging_ridge_best_params)

最佳超参数： {'alpha': 1.0461108737828706, 'fit_intercept': False, 'n_estimators': 17, 'max_samples': 0.5542504452666337, 'max_features': 0.5992117149215204, 'bootstrap': True, 'bootstrap_features': True}


### Extra_Trees


In [21]:
def extra_trees_hyperparameter_optimization(X, y, cv=5, n_trials=10, seed=42):
    def objective(trial):
        param = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 5, 30),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 14),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 14),
            "max_features": trial.suggest_categorical("max_features", ["auto", "sqrt", "log2"]),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        }
        
        model = ExtraTreesRegressor(random_state=seed, **param)
        
        scores = cross_val_score(model, X, y, cv=cv, scoring="neg_mean_squared_error")
        rmse_scores = np.sqrt(-scores)
        return np.mean(rmse_scores)  

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    return study.best_params

extra_trees_best_params = extra_trees_hyperparameter_optimization(X_trainde, Y_trainde, cv=5, n_trials=10, seed=42)

In [None]:
print("extra_trees_best_params", extra_trees_best_params)

### bagging_svr

In [None]:
def bagging_svr_hyperparameter_optimization(X, y, cv=5, n_trials=10, seed=42):
    def objective(trial):
        svr_param = {
            "C": trial.suggest_float("C", 0.5, 10.0, log=True),
            "epsilon": trial.suggest_float("epsilon", 0.05, 1.0, log=True),
            "kernel": trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"]),
        }
        svr_model = SVR(**svr_param)
        
        bagging_param = {
            "n_estimators": trial.suggest_int("n_estimators", 5, 50),
            "max_samples": trial.suggest_float("max_samples", 0.5, 1.0),
            "max_features": trial.suggest_float("max_features", 0.5, 1.0),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
            "bootstrap_features": trial.suggest_categorical("bootstrap_features", [True, False]),
        }
        model = BaggingRegressor(base_estimator=svr_model, random_state=seed, **bagging_param)

        scores = cross_val_score(model, X, y, cv=cv, scoring="neg_mean_squared_error")
        rmse_scores = np.sqrt(-scores)
        return np.mean(rmse_scores) 

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    return study.best_params


bagging_svr_best_params = bagging_svr_hyperparameter_optimization(X_trainde, Y_trainde, cv=5, n_trials=10, seed=42)


In [25]:
print("bagging_svr_best_params", bagging_svr_best_params)

bagging_svr_best_params {'C': 0.9100753035718627, 'epsilon': 0.7212504348863487, 'kernel': 'linear', 'n_estimators': 31, 'max_samples': 0.7818924468563151, 'max_features': 0.7231084111243953, 'bootstrap': False, 'bootstrap_features': True}
