# Import Data

In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr



In [3]:
x_train=pd.read_csv("/Users/Yiru/Desktop/Big Data/CW/electricity/X_train_NHkHMNU.csv")
y_train=pd.read_csv("/Users/Yiru/Desktop/Big Data/CW/electricity/y_train_ZAN5mwg.csv")
x_test=pd.read_csv("/Users/Yiru/Desktop/Big Data/CW/electricity/X_test_final.csv")

In [9]:
x_train.head()


Unnamed: 0,ID,DAY_ID,COUNTRY,DE_CONSUMPTION,FR_CONSUMPTION,DE_FR_EXCHANGE,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_NET_IMPORT,...,FR_RESIDUAL_LOAD,DE_RAIN,FR_RAIN,DE_WIND,FR_WIND,DE_TEMP,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET
0,1054,206,FR,0.210099,-0.427458,-0.606523,0.606523,,0.69286,,...,-0.444661,-0.17268,-0.556356,-0.790823,-0.28316,-1.06907,-0.063404,0.339041,0.124552,-0.002445
1,2049,501,FR,-0.022399,-1.003452,-0.022063,0.022063,-0.57352,-1.130838,0.57352,...,-1.183194,-1.2403,-0.770457,1.522331,0.828412,0.437419,1.831241,-0.659091,0.047114,-0.490365
2,1924,687,FR,1.395035,1.978665,1.021305,-1.021305,-0.622021,-1.682587,0.622021,...,1.947273,-0.4807,-0.313338,0.431134,0.487608,0.684884,0.114836,0.535974,0.743338,0.204952
3,297,720,DE,-0.983324,-0.849198,-0.839586,0.839586,-0.27087,0.56323,0.27087,...,-0.976974,-1.114838,-0.50757,-0.499409,-0.236249,0.350938,-0.417514,0.911652,-0.296168,1.073948
4,1101,818,FR,0.143807,-0.617038,-0.92499,0.92499,,0.990324,,...,-0.526267,-0.541465,-0.42455,-1.088158,-1.01156,0.614338,0.729495,0.245109,1.526606,2.614378


In [10]:
y_train.head()

Unnamed: 0,ID,TARGET
0,1054,0.028313
1,2049,-0.112516
2,1924,-0.18084
3,297,-0.260356
4,1101,-0.071733


# Data Preparation
There are some missing testues in the dataset, so here we use the linear interpolation method to fill the missing value.

In [4]:
x_train.interpolate(method='polynomial',order=3, inplace=True)
x_train_clean = x_train.drop(['COUNTRY'], axis=1).fillna(0)

x_test.interpolate(method='polynomial',order=3, inplace=True)
x_test_clean = x_test.drop(['COUNTRY'], axis=1).fillna(0)

y_train_clean = y_train['TARGET']



missing_values_x_train_clean = x_train_clean.isnull().sum()
print(missing_values_x_train_clean)
print("========================================")

missing_values_y_train_clean = y_train_clean.isnull().sum()
print(missing_values_y_train_clean)


print("\nDataFrame Info:\n", x_train.info())

shape = x_train.shape


print("DataFrame Shape:", shape)


ID                  0
DAY_ID              0
DE_CONSUMPTION      0
FR_CONSUMPTION      0
DE_FR_EXCHANGE      0
FR_DE_EXCHANGE      0
DE_NET_EXPORT       0
FR_NET_EXPORT       0
DE_NET_IMPORT       0
FR_NET_IMPORT       0
DE_GAS              0
FR_GAS              0
DE_COAL             0
FR_COAL             0
DE_HYDRO            0
FR_HYDRO            0
DE_NUCLEAR          0
FR_NUCLEAR          0
DE_SOLAR            0
FR_SOLAR            0
DE_WINDPOW          0
FR_WINDPOW          0
DE_LIGNITE          0
DE_RESIDUAL_LOAD    0
FR_RESIDUAL_LOAD    0
DE_RAIN             0
FR_RAIN             0
DE_WIND             0
FR_WIND             0
DE_TEMP             0
FR_TEMP             0
GAS_RET             0
COAL_RET            0
CARBON_RET          0
dtype: int64
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1494 entries, 0 to 1493
Data columns (total 35 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                1494 non-null

# LightGBM

In this part, we will use the LightGBM method to model electricity price. 


In [21]:
# X_train, X_test, y_train, y_test = train_test_split(x_train_clean, y_train_clean, test_size=0.2, random_state=42)

# Define LightGBM model
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
}

# Define K-fold cross validation
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Model training and assessment (by mse and sp_correlation)
mse_scores = []
SP_scores = []

for train_index, test_index in kf.split(x_train_clean):
    X_train_fold, X_test_fold = x_train_clean.iloc[train_index, 1:], x_train_clean.iloc[test_index, 1:]
    y_train_fold, y_test_fold = y_train_clean.iloc[train_index], y_train_clean.iloc[test_index]
    
    lgb_train = lgb.Dataset(X_train_fold, y_train_fold)
    lgb_test = lgb.Dataset(X_test_fold, y_test_fold, reference=lgb_train)
    
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=1000,
                    valid_sets=[lgb_train, lgb_test],
                    #early_stopping_rounds=50,
                    #verbose_eval=100
                    )
    
    y_pred = gbm.predict(X_test_fold, num_iteration=gbm.best_iteration)
    mse = mean_squared_error(y_test_fold, y_pred)
    mse_scores.append(mse)

    SP_correlation = spearmanr(y_pred, y_test_fold).correlation
    SP_scores.append(SP_correlation)

# calculate the mean mse and mean sp_correlation
mean_mse = np.mean(mse_scores)
print("Mean MSE:", mean_mse)

mean_SP = np.mean(SP_scores)
print("Mean Spearman Correlation: ", mean_SP)

# calculate the target 
y_pred_test = gbm.predict(x_test_clean.iloc[:, 1:], num_iteration=gbm.best_iteration)

y_test_submission = x_test[['ID']].copy()
y_test_submission['TARGET'] = y_pred_test
y_test_submission.to_csv('lightgbm.csv', index=False)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8404
[LightGBM] [Info] Number of data points in the train set: 1195, number of used features: 33
[LightGBM] [Info] Start training from score 0.083871
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000679 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8405
[LightGBM] [Info] Number of data points in the train set: 1195, number of used features: 33
[LightGBM] [Info] Start training from score 0.083566
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000811 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8407
[LightGBM] [Info] Number of data points in the train set: 1195, number of used features: 33
[LightGBM] [Info] Start traini

# LightGBM_grid

In [6]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_predict


# define parameter space
param_grid = {
    'num_leaves': [15, 31, 50],  
    'learning_rate': [0.05, 0.1, 0.2], 
    'feature_fraction': [0.6, 0.8, 0.9],
    'bagging_fraction': [0.6, 0.8, 0.9],
    'bagging_freq': [3, 5, 7]
}

# build the LightGBM model
lgbm = LGBMRegressor(boosting_type='gbdt', objective='regression', metric='mse')

# build the GridSearchCV on LightGBM
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# grid search
grid_search.fit(x_train_clean.iloc[:, 1:], y_train_clean)

print("Best parameters found: ", grid_search.best_params_)

print("Best mse: ", -1 * grid_search.best_score_)

# find the best model
best_model = grid_search.best_estimator_

# calculate Spearman correlation
y_pred_train = cross_val_predict(best_model, x_train_clean.iloc[:, 1:], y_train_clean, cv=5)
spearman_corr = spearmanr(y_pred_train, y_train_clean).correlation

print("Best Spearman correlation:", spearman_corr)

# calculate the target test data
y_pred_test = best_model.predict(x_test_clean.iloc[:, 1:])

y_test_submission = x_test[['ID']].copy()
y_test_submission['TARGET'] = y_pred_test
y_test_submission.to_csv('lightgbm_grid.csv', index=False)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000591 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8406
[LightGBM] [Info] Number of data points in the train set: 1195, number of used features: 33
[LightGBM] [Info] Start training from score 0.108013
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000578 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8405
[LightGBM] [Info] Number of data points in the train set: 1195, number of used features: 33
[LightGBM] [Info] Start training from score 0.074386
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000713 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8406
[LightGBM] [Info] Number of data points in the train set: 1195, number of used features: 33
[LightGBM] [Info] Start traini