In [1]:
import pandas as pd
import numpy as np
from Util.tools import *

In [2]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
import lightgbm as lgb

from sklearn.model_selection import train_test_split

In [3]:
# Raw challenge tables, read from the shared data folder.
X_train = pd.read_csv('../challenge_data/X_train.csv')
Y_train = pd.read_csv('../challenge_data/Y_train.csv')
X_test = pd.read_csv('../challenge_data/X_test.csv')

# Feature matrix: drop the COUNTRY column, run the project's preprocessing
# helper with normalisation and PCA switched on, and wrap the result in a
# DataFrame whose column labels are strings.
X_train_clean = pd.DataFrame(
    preprocessing(X_train.drop(columns=['COUNTRY']), norm=True, pca=True)
)
X_train_clean.columns = X_train_clean.columns.astype(str)

# Regression target as a 1-D Series.
Y_train_clean = Y_train['TARGET']

In [4]:
def training(model, cv=5, random_state=None):
    """Score ``model`` by repeated random hold-out and print the mean Spearman correlation.

    The notebook-level ``X_train_clean`` / ``Y_train_clean`` are split ``cv``
    times into 80% fitting rows and 20% held-out rows. The model is re-fitted
    on every split and scored on the held-out part, so the reported number is
    a validation score even though the printed label says "train set".

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Re-fitted in place on every split; after the call it holds the fit
        from the last split.
    cv : int, default 5
        Number of random splits to average over.
    random_state : int or None, default None
        Seed for the generator that draws the per-split seeds. ``None`` keeps
        the historical behaviour (seeds taken from NumPy's global generator,
        hence different on every call); an integer makes the score
        reproducible.

    Returns
    -------
    float
        Mean Spearman correlation over the ``cv`` held-out splits.
    """
    # A dedicated generator is only created when a seed is requested, so the
    # default path consumes the global generator exactly as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    scores = []
    for _ in range(cv):
        split_seed = int(rng.randint(1, 100))
        X_train, X_test, Y_train, Y_test = train_test_split(
            X_train_clean, Y_train_clean, test_size=0.2, random_state=split_seed
        )
        model.fit(X_train, Y_train)
        y_pred = model.predict(X_test)
        scores.append(spearmanr(Y_test, y_pred).correlation)

    metric = np.mean(scores)

    print('Spearman correlation for the train set: {:.1f}%'.format(100 * metric ))

    return metric

In [5]:
def grid_search_ensemble(n_estimators, learning_rate, estimator, X, Y, cv=5, random_state=None):
    """Exhaustively search ``n_estimators`` x ``learning_rate`` for one boosting family.

    Every candidate is fitted and scored (Spearman correlation) on the same
    ``cv`` random 80/20 splits of ``X`` / ``Y``. Sharing the splits makes the
    comparison between candidates a paired one, so the winner is not decided
    by which candidate happened to draw the easier splits.

    Parameters
    ----------
    n_estimators : iterable of int
        Candidate ensemble sizes.
    learning_rate : iterable of float
        Candidate learning rates.
    estimator : str
        ``'AdaBoost'``, ``'GradientBoost'`` or ``'XGBoost'``; any other value
        selects LightGBM (kept as the fallback for backward compatibility).
    X, Y : array-like
        Features and target passed to ``train_test_split``.
    cv : int, default 5
        Number of random splits per candidate.
    random_state : int or None, default None
        Seed for the generator that draws the split seeds. ``None`` draws
        them from NumPy's global generator (not reproducible across calls).

    Returns
    -------
    tuple
        ``(best_n, best_learning_rate, best_model, best_result)``.
        ``best_model`` is the winning estimator instance as fitted on the
        last split (it is not re-fitted on all of ``X``); ``best_result`` is
        its mean score. If either grid is empty the initial values
        ``(0, 0, None, -inf)`` are returned.
    """

    def _build(n, rate):
        # One constructor per supported family; note that only the AdaBoost
        # and LightGBM variants constrain the tree depth here.
        if estimator == 'AdaBoost':
            return AdaBoostRegressor(DecisionTreeRegressor(max_depth=3), n_estimators=n, learning_rate=rate, random_state=1)
        if estimator == 'GradientBoost':
            return GradientBoostingRegressor(n_estimators=n, learning_rate=rate, random_state=1)
        if estimator == 'XGBoost':
            return XGBRegressor(n_estimators=n, learning_rate=rate, random_state=1)
        return lgb.LGBMRegressor(max_depth=2, n_estimators=n, learning_rate=rate, random_state=1, min_child_samples=20)

    # Draw the splits once: they do not depend on the candidate, and reusing
    # them keeps the candidates comparable.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    splits = [
        train_test_split(X, Y, test_size=0.2, random_state=int(rng.randint(1, 100)))
        for _ in range(cv)
    ]

    best_n = 0
    best_learning_rate = 0
    best_result = float('-inf')
    best_model = None

    for i in n_estimators:
        for j in learning_rate:
            model = _build(i, j)

            scores = []
            for X_train, X_test, Y_train, Y_test in splits:
                model.fit(X_train, Y_train)
                y_pred = model.predict(X_test)
                scores.append(spearmanr(Y_test, y_pred).correlation)

            # Calculate the mean score
            mean_score = np.mean(scores)

            # Strict '>' keeps the first candidate on ties
            if mean_score > best_result:
                best_result = mean_score
                best_n = i
                best_learning_rate = j
                best_model = model

    print(f'The best parameters values are: learning_rate: {best_learning_rate}, n_estimators: {best_n}')    

    return best_n, best_learning_rate, best_model, best_result

In [5]:
# Baseline AdaBoost over depth-3 regression trees, scored with training().
# The base tree is passed unfitted: AdaBoostRegressor clones it for every
# boosting round, so fitting it beforehand has no effect on the ensemble.
clf = DecisionTreeRegressor(max_depth=3)
ada = AdaBoostRegressor(clf, n_estimators=500, learning_rate=0.1, random_state=1)
result = training(ada)

Spearman correlation for the train set: 20.0%


In [8]:
# Hyper-parameter grid for AdaBoost: ensemble sizes 100, 130, ..., 580 and
# five learning rates, i.e. every pair is fitted cv times by the search.
n_estimators = np.arange(100, 600, 30)
learning_rate = [0.01, 0.05, 0.1, 0.5, 1]
# The winning pair is printed by the search; the returned values are kept in
# notebook-level variables for inspection.
best_n, best_learning_rate, best_model, best_result = grid_search_ensemble(n_estimators, learning_rate, 'AdaBoost', X_train_clean, Y_train_clean)

The best parameters values are: learning_rate: 0.1, n_estimators: 310


In [37]:
# Re-score AdaBoost with the pair printed by the grid search
# (n_estimators=310, learning_rate=0.1).
# The base tree is passed unfitted: AdaBoostRegressor clones it for every
# boosting round, so fitting it beforehand has no effect on the ensemble.
clf = DecisionTreeRegressor(max_depth=3)
ada_best = AdaBoostRegressor(clf, n_estimators=310, learning_rate=0.1, random_state=1)
result = training(ada_best)

Spearman correlation for the train set: 16.7%


In [10]:
# Baseline gradient boosting with depth-2 trees, scored with training()
# (mean Spearman correlation over random 80/20 splits).
gb = GradientBoostingRegressor(max_depth=2, n_estimators=500, learning_rate=0.1, random_state=1)
result = training(gb)

Spearman correlation for the train set: 11.8%


In [11]:
# Same grid as for AdaBoost: ensemble sizes 100, 130, ..., 580 and five
# learning rates.
n_estimators = np.arange(100, 600, 30)
learning_rate = [0.01, 0.05, 0.1, 0.5, 1]
# NOTE(review): grid_search_ensemble builds GradientBoostingRegressor without
# max_depth, so the search tunes the library-default depth rather than the
# max_depth=2 used by the other gradient-boosting cells.
best_n, best_learning_rate, best_model, best_result = grid_search_ensemble(n_estimators, learning_rate, 'GradientBoost', X_train_clean, Y_train_clean)

The best parameters values are: learning_rate: 0.01, n_estimators: 280


In [34]:
# Gradient boosting with the pair printed by the grid search
# (n_estimators=280, learning_rate=0.01).
# NOTE(review): max_depth=2 is set here, whereas the grid search did not set
# max_depth, so this is not exactly the configuration that was selected.
gb_best = GradientBoostingRegressor(max_depth=2, n_estimators=280, learning_rate=0.01, random_state=1)

# training() re-fits gb_best on each random split, so after this call the
# model holds the fit from the last 80% split, not from the full training set.
result = training(gb_best)

Spearman correlation for the train set: 19.1%


In [14]:
# Reload the test features and drop COUNTRY, mirroring the training pipeline.
X_test = pd.read_csv('../challenge_data/X_test.csv')
X_test = X_test.drop(['COUNTRY'], axis=1)
# NOTE(review): preprocessing() is called on the test frame independently of
# the training frame. If it fits the normalisation / PCA on whatever frame it
# receives, the test features are not in the same space the models were
# trained on -- confirm against Util.tools.
# Unlike X_train_clean, the result is not wrapped in a DataFrame with string
# column labels here.
X_test_clean = preprocessing(X_test, norm=True, pca=True)

In [15]:
# Hand the test frames and the tuned gradient-boosting model to submission().
# submission() is not defined in this notebook (presumably it comes from the
# Util.tools star import and writes a file tagged 'gb_best' -- TODO confirm).
submission(X_test, X_test_clean, gb_best, 'gb_best')



In [24]:
# Baseline XGBoost with depth-2 trees, scored with training().
# No random_state is passed to the regressor here, unlike the other baselines.
xgb = XGBRegressor(max_depth=2, n_estimators=500, learning_rate=0.1)
result = training(xgb)

Spearman correlation for the train set: 12.7%


In [25]:
# Same grid as for the other boosters: ensemble sizes 100, 130, ..., 580 and
# five learning rates.
n_estimators = np.arange(100, 600, 30)
learning_rate = [0.01, 0.05, 0.1, 0.5, 1]
# NOTE(review): grid_search_ensemble builds XGBRegressor without max_depth, so
# the search tunes the library-default depth rather than the max_depth=2 used
# by the other XGBoost cells.
best_n, best_learning_rate, best_model, best_result = grid_search_ensemble(n_estimators, learning_rate, 'XGBoost', X_train_clean, Y_train_clean)

The best parameters values are: learning_rate: 0.01, n_estimators: 190


In [31]:
# XGBoost with the pair printed by the grid search
# (n_estimators=190, learning_rate=0.01).
# NOTE(review): max_depth=2 is set here and random_state is not, whereas the
# grid search used random_state=1 and no max_depth, so this is not exactly
# the configuration that was selected.
xgb_best = XGBRegressor(max_depth=2, n_estimators=190, learning_rate=0.01)

result = training(xgb_best)

Spearman correlation for the train set: 18.1%


In [10]:
# Load the datasets, indexed and ordered by ID
df_train = pd.read_csv('../challenge_data/X_train.csv').set_index('ID').sort_index()
y_train = pd.read_csv('../challenge_data/Y_train.csv').set_index('ID').sort_index()
df_test = pd.read_csv('../challenge_data/X_test.csv').set_index('ID').sort_index()

# Per-country training rows and the targets that share their IDs
de_train = df_train[df_train['COUNTRY'] == 'DE']
fr_train = df_train[df_train['COUNTRY'] == 'FR']

y_de_train = y_train[y_train.index.isin(de_train.index)]
y_fr_train = y_train[y_train.index.isin(fr_train.index)]

window_size = 50  # Example window size, can be adjusted


def _build_country_frame(train_part, test_part, window):
    """Stack one country's train and test rows and engineer its features.

    The rows are ordered by ID, COUNTRY is dropped, every column gets a
    ``<name>_prev_day`` companion holding the value of the preceding row, and
    missing values are replaced by the trailing rolling mean (``window`` rows,
    at least one observation) of the same column. Cells whose whole window is
    missing stay NaN and are handled later by forward/backward filling.
    """
    full = pd.concat([train_part, test_part]).sort_index().drop(columns='COUNTRY')
    full = full.join(full.shift(1), rsuffix='_prev_day')

    columns_with_gaps = [name for name in full.columns if full[name].isnull().any()]
    for name in columns_with_gaps:
        rolling_mean = full[name].rolling(window=window, min_periods=1).mean()
        # Plain assignment instead of a chained in-place fillna: the chained
        # form silently does nothing under pandas Copy-on-Write.
        full[name] = full[name].fillna(rolling_mean)
    return full


def _select_and_fill(full, day_ids, unused_columns):
    """Keep the rows whose DAY_ID is in ``day_ids``, drop ``unused_columns``
    (names that are absent are ignored) and forward- then backward-fill the
    remaining gaps."""
    subset = full[full['DAY_ID'].isin(day_ids)]
    return subset.drop(columns=unused_columns, errors='ignore').ffill().bfill()


de_full = _build_country_frame(de_train, df_test[df_test['COUNTRY'] == 'DE'], window_size)
fr_full = _build_country_frame(fr_train, df_test[df_test['COUNTRY'] == 'FR'], window_size)

columns_to_remove = [
    "DAY_ID", "FR_CONSUMPTION", "FR_DE_EXCHANGE", "FR_NET_EXPORT",
    "DE_NET_IMPORT", "FR_NET_IMPORT", "FR_GAS", "FR_COAL", "FR_HYDRO", 
    "FR_NUCLEAR", "FR_SOLAR", "FR_WINDPOW", "FR_RESIDUAL_LOAD", 
    "FR_RAIN", "FR_WIND", "FR_TEMP"
]
# Add columns with '_prev_day' suffix
columns_to_remove += [col + '_prev_day' for col in columns_to_remove]
# Split the German frame back into its train / test rows without these columns
de_test_final = _select_and_fill(de_full, df_test[df_test['COUNTRY'] == 'DE']['DAY_ID'], columns_to_remove)
de_train_final = _select_and_fill(de_full, df_train[df_train['COUNTRY'] == 'DE']['DAY_ID'], columns_to_remove)

columns_to_remove = [
    "DAY_ID", "DE_CONSUMPTION", "DE_FR_EXCHANGE", "DE_NET_EXPORT",
    "FR_NET_IMPORT", "DE_NET_IMPORT", "DE_GAS", "DE_COAL", "DE_HYDRO", 
    "DE_NUCLEAR", "DE_SOLAR", "DE_WINDPOW", "DE_RESIDUAL_LOAD", 
    "DE_RAIN", "DE_WIND", "DE_TEMP"
]
# Add columns with '_prev_day' suffix
columns_to_remove += [col + '_prev_day' for col in columns_to_remove]
# Split the French frame back into its train / test rows without these columns
fr_test_final = _select_and_fill(fr_full, df_test[df_test['COUNTRY'] == 'FR']['DAY_ID'], columns_to_remove)
fr_train_final = _select_and_fill(fr_full, df_train[df_train['COUNTRY'] == 'FR']['DAY_ID'], columns_to_remove)

  de_train_final.fillna(method='ffill', inplace=True)  # Forward fill
  de_train_final.fillna(method='bfill', inplace=True)  # Backward fill
  de_test_final.fillna(method='ffill', inplace=True)
  de_test_final.fillna(method='bfill', inplace=True)
  fr_train_final.fillna(method='ffill', inplace=True)
  fr_train_final.fillna(method='bfill', inplace=True)
  fr_test_final.fillna(method='ffill', inplace=True)
  fr_test_final.fillna(method='bfill', inplace=True)


In [11]:
##########################################################
### Training process -- XGB
##########################################################
# One model per country: fit on 70% of the country's training rows, report the
# Spearman correlation on the remaining 30%, then predict every training and
# test row so both countries can be stitched back together by ID.

# --- Germany ---
# The target is passed as a 1-D Series (the TARGET column), not as a
# one-column DataFrame.
X = de_train_final
y = y_de_train['TARGET']

# Split the data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

# Train the model
xgb_model = XGBRegressor(max_depth=2, n_estimators=500, learning_rate=0.1)
xgb_model.fit(x_train, y_train)

# Predict on the held-out rows
y_pred_lr = xgb_model.predict(x_test)

# Calculate Spearman Correlation (the label now names the model actually used)
spearman_corr_lr = spearmanr(y_pred_lr, y_test).correlation
print(f"Spearman Correlation for XGBoost (DE, 30% hold-out): {spearman_corr_lr:.1%}")

# Predictions for every German training row and test row, keyed by ID
de_train_pred = xgb_model.predict(X)
de_train = de_train_final.reset_index()
de_train['TARGET'] = de_train_pred
de_train = de_train[['ID', 'TARGET']]
de_test = xgb_model.predict(de_test_final)
de_test_pred = de_test_final.reset_index()
de_test_pred['TARGET'] = de_test
de_test_pred = de_test_pred[['ID', 'TARGET']]

# --- France ---
X = fr_train_final
y = y_fr_train['TARGET']

# Split the data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

# Train the model
xgb_model = XGBRegressor(max_depth=2, n_estimators=500, learning_rate=0.1)
xgb_model.fit(x_train, y_train)

# Predict on the held-out rows
y_pred_lr = xgb_model.predict(x_test)

# Calculate Spearman Correlation
spearman_corr_lr = spearmanr(y_pred_lr, y_test).correlation
print(f"Spearman Correlation for XGBoost (FR, 30% hold-out): {spearman_corr_lr:.1%}")

# Predictions for every French training row and test row, keyed by ID
fr_train_pred = xgb_model.predict(X)
fr_result = fr_train_final.reset_index()
fr_result['TARGET'] = fr_train_pred
fr_result = fr_result[['ID', 'TARGET']]
fr_test = xgb_model.predict(fr_test_final)
fr_test_pred = fr_test_final.reset_index()
fr_test_pred['TARGET'] = fr_test
fr_test_pred = fr_test_pred[['ID', 'TARGET']]

# --- Combined training predictions, in the row order of Y_train.csv ---
y_train = pd.read_csv('../challenge_data/Y_train.csv')
train_pred = pd.DataFrame()
train_pred['ID'] = y_train['ID']

# Each left merge contributes one country's predictions; an ID has a value in
# TARGET_x (DE) or TARGET_y (FR), and combine_first picks whichever is present.
train_pred = train_pred.merge(de_train[['ID', 'TARGET']], on='ID', how='left')
train_pred = train_pred.merge(fr_result[['ID', 'TARGET']], on='ID', how='left')

train_pred['TARGET'] = train_pred['TARGET_x'].combine_first(train_pred['TARGET_y'])

train_pred = train_pred.drop(['TARGET_x', 'TARGET_y'], axis=1)

# This score covers every training row, including the 70% each model was
# fitted on, so it is optimistic compared with the hold-out scores above.
spearman_corr_lr = spearmanr(train_pred['TARGET'], y_train['TARGET']).correlation
print(f"Spearman Correlation for XGBoost (DE+FR, all training rows): {spearman_corr_lr:.1%}")

# SUBMISSION
df_test = pd.read_csv('../challenge_data/X_test.csv')
test_pred = pd.DataFrame()
test_pred['ID'] = df_test['ID']
test_pred = test_pred.merge(de_test_pred[['ID', 'TARGET']], on='ID', how='left')
test_pred = test_pred.merge(fr_test_pred[['ID', 'TARGET']], on='ID', how='left')
test_pred['TARGET'] = test_pred['TARGET_x'].combine_first(test_pred['TARGET_y'])
test_pred = test_pred.drop(['TARGET_x', 'TARGET_y'], axis=1)
test_pred.to_csv('./Submission/' + 'xgb_final' + '.csv', index=False)

Spearman Correlation for Linear Regression: 65.0%
Spearman Correlation for Linear Regression: 16.2%
Spearman Correlation for Linear Regression: 74.5%


In [12]:
# Persist the combined DE+FR training-set predictions currently held in
# train_pred (expected to come from the XGB cell) without the row index.
train_pred.to_csv('./Train submission/xgb_train.csv', index = False)

In [21]:
##########################################################
### Training process -- GradientBoosting
##########################################################
# One model per country: fit on 70% of the country's training rows, report the
# Spearman correlation on the remaining 30%, then predict every training and
# test row so both countries can be stitched back together by ID.

# --- Germany ---
# The target is passed as a 1-D Series (the TARGET column); handing
# scikit-learn a one-column DataFrame triggers its column_or_1d warning.
X = de_train_final
y = y_de_train['TARGET']

# Split the data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

# Train the model
model = GradientBoostingRegressor(max_depth=2, n_estimators=280, learning_rate=0.01, random_state=1)
model.fit(x_train, y_train)

# Predict on the held-out rows
y_pred_lr = model.predict(x_test)

# Calculate Spearman Correlation (the label now names the model actually used)
spearman_corr_lr = spearmanr(y_pred_lr, y_test).correlation
print(f"Spearman Correlation for Gradient Boosting (DE, 30% hold-out): {spearman_corr_lr:.1%}")

# Predictions for every German training row and test row, keyed by ID
de_train_pred = model.predict(X)
de_train = de_train_final.reset_index()
de_train['TARGET'] = de_train_pred
de_train = de_train[['ID', 'TARGET']]
de_test = model.predict(de_test_final)
de_test_pred = de_test_final.reset_index()
de_test_pred['TARGET'] = de_test
de_test_pred = de_test_pred[['ID', 'TARGET']]

# --- France ---
X = fr_train_final
y = y_fr_train['TARGET']

# Split the data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

# Train the model
model = GradientBoostingRegressor(max_depth=2, n_estimators=280, learning_rate=0.01, random_state=1)
model.fit(x_train, y_train)

# Predict on the held-out rows
y_pred_lr = model.predict(x_test)

# Calculate Spearman Correlation
spearman_corr_lr = spearmanr(y_pred_lr, y_test).correlation
print(f"Spearman Correlation for Gradient Boosting (FR, 30% hold-out): {spearman_corr_lr:.1%}")

# Predictions for every French training row and test row, keyed by ID
fr_train_pred = model.predict(X)
fr_result = fr_train_final.reset_index()
fr_result['TARGET'] = fr_train_pred
fr_result = fr_result[['ID', 'TARGET']]
fr_test = model.predict(fr_test_final)
fr_test_pred = fr_test_final.reset_index()
fr_test_pred['TARGET'] = fr_test
fr_test_pred = fr_test_pred[['ID', 'TARGET']]

# --- Combined training predictions, in the row order of Y_train.csv ---
y_train = pd.read_csv('../challenge_data/Y_train.csv')
train_pred = pd.DataFrame()
train_pred['ID'] = y_train['ID']

# Each left merge contributes one country's predictions; an ID has a value in
# TARGET_x (DE) or TARGET_y (FR), and combine_first picks whichever is present.
train_pred = train_pred.merge(de_train[['ID', 'TARGET']], on='ID', how='left')
train_pred = train_pred.merge(fr_result[['ID', 'TARGET']], on='ID', how='left')

train_pred['TARGET'] = train_pred['TARGET_x'].combine_first(train_pred['TARGET_y'])

train_pred = train_pred.drop(['TARGET_x', 'TARGET_y'], axis=1)

# This score covers every training row, including the 70% each model was
# fitted on, so it is optimistic compared with the hold-out scores above.
spearman_corr_lr = spearmanr(train_pred['TARGET'], y_train['TARGET']).correlation
print(f"Spearman Correlation for Gradient Boosting (DE+FR, all training rows): {spearman_corr_lr:.1%}")

# SUBMISSION
df_test = pd.read_csv('../challenge_data/X_test.csv')
test_pred = pd.DataFrame()
test_pred['ID'] = df_test['ID']
test_pred = test_pred.merge(de_test_pred[['ID', 'TARGET']], on='ID', how='left')
test_pred = test_pred.merge(fr_test_pred[['ID', 'TARGET']], on='ID', how='left')
test_pred['TARGET'] = test_pred['TARGET_x'].combine_first(test_pred['TARGET_y'])
test_pred = test_pred.drop(['TARGET_x', 'TARGET_y'], axis=1)
test_pred.to_csv('./Submission/' + 'gb_final' + '.csv', index=False)

  y = column_or_1d(y, warn=True)


Spearman Correlation for Linear Regression: 58.7%


  y = column_or_1d(y, warn=True)


Spearman Correlation for Linear Regression: 23.3%
Spearman Correlation for Linear Regression: 52.8%


In [6]:
# Persist the combined DE+FR training-set predictions currently held in
# train_pred (expected to come from the GradientBoosting cell) without the
# row index.
train_pred.to_csv('./Train submission/gb_train.csv', index = False)

In [7]:
##########################################################
### Training process -- AdaBoosting
##########################################################
# One model per country: fit on 70% of the country's training rows, report the
# Spearman correlation on the remaining 30%, then predict every training and
# test row so both countries can be stitched back together by ID.

# --- Germany ---
# The target is passed as a 1-D Series (the TARGET column); handing
# scikit-learn a one-column DataFrame triggers its column_or_1d warning.
X = de_train_final
y = y_de_train['TARGET']

# Split the data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

# Train the model. The base tree is passed unfitted: AdaBoostRegressor clones
# it for every boosting round, so fitting it beforehand has no effect.
clf = DecisionTreeRegressor(max_depth=3)
model = AdaBoostRegressor(clf, n_estimators=500, learning_rate=0.1, random_state=1)
model.fit(x_train, y_train)

# Predict on the held-out rows
y_pred_lr = model.predict(x_test)

# Calculate Spearman Correlation (the label now names the model actually used)
spearman_corr_lr = spearmanr(y_pred_lr, y_test).correlation
print(f"Spearman Correlation for AdaBoost (DE, 30% hold-out): {spearman_corr_lr:.1%}")

# Predictions for every German training row and test row, keyed by ID
de_train_pred = model.predict(X)
de_train = de_train_final.reset_index()
de_train['TARGET'] = de_train_pred
de_train = de_train[['ID', 'TARGET']]
de_test = model.predict(de_test_final)
de_test_pred = de_test_final.reset_index()
de_test_pred['TARGET'] = de_test
de_test_pred = de_test_pred[['ID', 'TARGET']]

# --- France ---
X = fr_train_final
y = y_fr_train['TARGET']

# Split the data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

# Train the model (base tree passed unfitted, as above)
clf = DecisionTreeRegressor(max_depth=3)
model = AdaBoostRegressor(clf, n_estimators=500, learning_rate=0.1, random_state=1)
model.fit(x_train, y_train)

# Predict on the held-out rows
y_pred_lr = model.predict(x_test)

# Calculate Spearman Correlation
spearman_corr_lr = spearmanr(y_pred_lr, y_test).correlation
print(f"Spearman Correlation for AdaBoost (FR, 30% hold-out): {spearman_corr_lr:.1%}")

# Predictions for every French training row and test row, keyed by ID
fr_train_pred = model.predict(X)
fr_result = fr_train_final.reset_index()
fr_result['TARGET'] = fr_train_pred
fr_result = fr_result[['ID', 'TARGET']]
fr_test = model.predict(fr_test_final)
fr_test_pred = fr_test_final.reset_index()
fr_test_pred['TARGET'] = fr_test
fr_test_pred = fr_test_pred[['ID', 'TARGET']]

# --- Combined training predictions, in the row order of Y_train.csv ---
y_train = pd.read_csv('../challenge_data/Y_train.csv')
train_pred = pd.DataFrame()
train_pred['ID'] = y_train['ID']

# Each left merge contributes one country's predictions; an ID has a value in
# TARGET_x (DE) or TARGET_y (FR), and combine_first picks whichever is present.
train_pred = train_pred.merge(de_train[['ID', 'TARGET']], on='ID', how='left')
train_pred = train_pred.merge(fr_result[['ID', 'TARGET']], on='ID', how='left')

train_pred['TARGET'] = train_pred['TARGET_x'].combine_first(train_pred['TARGET_y'])

train_pred = train_pred.drop(['TARGET_x', 'TARGET_y'], axis=1)

# This score covers every training row, including the 70% each model was
# fitted on, so it is optimistic compared with the hold-out scores above.
spearman_corr_lr = spearmanr(train_pred['TARGET'], y_train['TARGET']).correlation
print(f"Spearman Correlation for AdaBoost (DE+FR, all training rows): {spearman_corr_lr:.1%}")

# SUBMISSION
df_test = pd.read_csv('../challenge_data/X_test.csv')
test_pred = pd.DataFrame()
test_pred['ID'] = df_test['ID']
test_pred = test_pred.merge(de_test_pred[['ID', 'TARGET']], on='ID', how='left')
test_pred = test_pred.merge(fr_test_pred[['ID', 'TARGET']], on='ID', how='left')
test_pred['TARGET'] = test_pred['TARGET_x'].combine_first(test_pred['TARGET_y'])
test_pred = test_pred.drop(['TARGET_x', 'TARGET_y'], axis=1)
test_pred.to_csv('./Submission/' + 'adab_final' + '.csv', index=False)

  y = column_or_1d(y, warn=True)


Spearman Correlation for Linear Regression: 56.6%


  y = column_or_1d(y, warn=True)


Spearman Correlation for Linear Regression: 7.2%
Spearman Correlation for Linear Regression: 38.2%


In [8]:
# Persist the combined DE+FR training-set predictions currently held in
# train_pred (expected to come from the AdaBoosting cell) without the row
# index. Note the file stem is 'adb_train', while the matching submission
# file written by the AdaBoosting cell uses 'adab_final'.
train_pred.to_csv('./Train submission/adb_train.csv', index = False)