In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, mean_absolute_percentage_error, r2_score
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from lightgbm import plot_importance
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch import nn
import copy
import joblib
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler

In [310]:
# Read the workbook and rebuild the row index in one step.
data = pd.read_excel('data.xlsx').reset_index()
# Column labels: every space is replaced with an underscore.
data.columns = data.columns.str.replace(' ', '_')
# Remove the 'index' column that reset_index() added above.
data = data.drop(columns='index')

In [389]:
# Frame that later cells fill with model predictions and selected features.
data_g = pd.DataFrame()
# Predictors are taken by position: columns 1 through 290 of the sheet.
data_x = data.iloc[:, slice(1, 291)]
# Column 0 is bound as the target here.
data_y = data.iloc[:, 0]

In [390]:
def remove_highly_correlated_features(data, threshold=0.9):
    """Drop one feature out of every pair that is too strongly correlated.

    The absolute pairwise correlation is computed over the numeric and
    boolean columns of ``data``. Columns of any other dtype do not take part
    in the correlation and are always kept, so a frame that contains text
    columns no longer makes ``corr()`` raise.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table; it is not modified.
    threshold : float, default 0.9
        A column is dropped when its absolute correlation with any column
        positioned to its left is strictly greater than this value.

    Returns
    -------
    tuple
        ``(data_reduced, to_drop)``: a copy of ``data`` without the dropped
        columns, and the list of dropped column labels in column order.
    """
    correlatable = data.select_dtypes(include=['number', 'bool'])
    corr_matrix = correlatable.corr().abs()

    # Keep only the strict upper triangle: the diagonal is skipped and each
    # pair is looked at once, so the earlier column of a pair is the one kept.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # Masked cells are NaN and NaN > threshold is False, so they never count.
    exceeds = (upper > threshold).any(axis=0)
    to_drop = exceeds[exceeds].index.tolist()

    data_reduced = data.drop(columns=to_drop)
    return data_reduced, to_drop

# Prune the predictors with the function's default threshold; the labels of
# the removed columns are kept in dropped_features.
data_x, dropped_features = remove_highly_correlated_features(data=data_x)

In [391]:
# Target for this model: the column at position 329.
data_y = data.iloc[:, 329]

# 80/20 hold-out split with a fixed seed.
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.2,
    random_state=23,
)

# Baseline regressor: library defaults apart from the seed.
xgb_best_model = xgb.XGBRegressor(random_state=23)
xgb_best_model.fit(data_X_train, data_y_train)

# R2 on the held-out rows.
y_pred = xgb_best_model.predict(data_X_test)
r2 = r2_score(y_true=data_y_test, y_pred=y_pred)
print("R-squared Score (R2): {:.4f}".format(r2))

R-squared Score (R2): 0.7978


In [392]:

def xgb_cv(n_estimators, max_depth, learning_rate):
    """Objective for the Bayesian search: mean 5-fold CV score of one candidate.

    ``n_estimators`` and ``max_depth`` are truncated to ``int`` before use.
    The score is scikit-learn's ``neg_root_mean_squared_error``, so larger
    values are better. Training data is read from the notebook-level
    ``data_X_train`` / ``data_y_train`` at call time.
    """
    candidate = xgb.XGBRegressor(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        learning_rate=learning_rate,
    )
    fold_scores = cross_val_score(
        candidate,
        data_X_train,
        data_y_train,
        scoring='neg_root_mean_squared_error',
        cv=5,
    )
    return fold_scores.mean()

# Lower/upper bounds of the search space handed to the optimiser.
param_bounds = {
    'n_estimators': (100, 1000),
    'max_depth': (2, 7),
    'learning_rate': (0.01, 0.3)
}

# The optimiser is seeded so that the sampled points, and therefore the
# selected hyper-parameters, are the same on every run of this cell.
optimizer = BayesianOptimization(
    f=xgb_cv,
    pbounds=param_bounds,
    random_state=23,
    verbose=2
)
optimizer.maximize(init_points=3, n_iter=7)

print("Best Parameters Found:", optimizer.max)

# Work on a copy so the optimiser's own record is not edited. The search is
# over real numbers, so the two integer parameters are truncated here the
# same way xgb_cv truncates them.
best_params = dict(optimizer.max['params'])
best_params['n_estimators'] = int(best_params['n_estimators'])
best_params['max_depth'] = int(best_params['max_depth'])

# Refit on the full training split with the selected parameters, using the
# same seed as the baseline model.
xgb_best_model = xgb.XGBRegressor(random_state=23, **best_params)
xgb_best_model.fit(data_X_train, data_y_train)

# R2 of the tuned model on the held-out rows.
y_pred = xgb_best_model.predict(data_X_test)
r2 = r2_score(data_y_test, y_pred)
print("R-squared Score (R2): {:.4f}".format(r2))

|   iter    |  target   | learni... | max_depth | n_esti... |
-------------------------------------------------------------
| [0m1        [0m | [0m-2.972e+0[0m | [0m0.2625   [0m | [0m5.358    [0m | [0m739.9    [0m |
| [0m2        [0m | [0m-3.306e+0[0m | [0m0.2288   [0m | [0m3.615    [0m | [0m126.4    [0m |
| [0m3        [0m | [0m-3.427e+0[0m | [0m0.2046   [0m | [0m2.102    [0m | [0m992.6    [0m |
| [0m4        [0m | [0m-3.094e+0[0m | [0m0.2405   [0m | [0m4.818    [0m | [0m738.9    [0m |
| [95m5        [0m | [95m-2.955e+0[0m | [95m0.04549  [0m | [95m5.807    [0m | [95m742.0    [0m |
| [0m6        [0m | [0m-3.301e+0[0m | [0m0.05908  [0m | [0m2.21     [0m | [0m742.8    [0m |
| [0m7        [0m | [0m-3.039e+0[0m | [0m0.05946  [0m | [0m6.963    [0m | [0m741.1    [0m |
| [0m8        [0m | [0m-3.01e+03[0m | [0m0.2649   [0m | [0m6.887    [0m | [0m745.2    [0m |
| [0m9        [0m | [0m-2.979e+0[0m | [0m0.06153  

In [393]:
# Predictions for every row of data_x (rows used for fitting included) are
# stored as the 'BTUNGPLHEAT' column.
data_g = data_g.assign(BTUNGPLHEAT=xgb_best_model.predict(data_x))

In [394]:
# Target for this model: the column at position 330.
data_y_i = data.iloc[:, 330]

# Same 80/20 seeded split as before, now against the new target.
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(
    data_x,
    data_y_i,
    test_size=0.2,
    random_state=23,
)

# Baseline regressor: library defaults apart from the seed.
xgb_best_model = xgb.XGBRegressor(random_state=23)
xgb_best_model.fit(data_X_train, data_y_train)

# R2 on the held-out rows.
y_pred = xgb_best_model.predict(data_X_test)
r2 = r2_score(y_true=data_y_test, y_pred=y_pred)
print("R-squared Score (R2): {:.4f}".format(r2))

R-squared Score (R2): 0.5079


In [395]:
def xgb_cv(n_estimators, max_depth, learning_rate):
    """Score one hyper-parameter candidate with 5-fold cross-validation.

    The two tree-shape arguments arrive as floats and are truncated with
    ``int``. Returns the mean ``neg_root_mean_squared_error`` over the folds
    (higher is better), computed on the notebook-level ``data_X_train`` and
    ``data_y_train`` as they are bound when the function is called.
    """
    trees = int(n_estimators)
    depth = int(max_depth)
    candidate = xgb.XGBRegressor(
        n_estimators=trees,
        max_depth=depth,
        learning_rate=learning_rate,
    )
    fold_scores = cross_val_score(
        candidate,
        data_X_train,
        data_y_train,
        scoring='neg_root_mean_squared_error',
        cv=5,
    )
    return fold_scores.mean()


# Lower/upper bounds of the search space handed to the optimiser.
param_bounds = {
    'n_estimators': (100, 1000),
    'max_depth': (2, 7),
    'learning_rate': (0.01, 0.3)
}

# The optimiser is seeded so that the sampled points, and therefore the
# selected hyper-parameters, are the same on every run of this cell.
optimizer = BayesianOptimization(
    f=xgb_cv,
    pbounds=param_bounds,
    random_state=23,
    verbose=2
)
optimizer.maximize(init_points=3, n_iter=7)

print("Best Parameters Found:", optimizer.max)

# Work on a copy so the optimiser's own record is not edited. The search is
# over real numbers, so the two integer parameters are truncated here the
# same way xgb_cv truncates them.
best_params = dict(optimizer.max['params'])
best_params['n_estimators'] = int(best_params['n_estimators'])
best_params['max_depth'] = int(best_params['max_depth'])

# Refit on the full training split with the selected parameters, using the
# same seed as the baseline model.
xgb_best_model = xgb.XGBRegressor(random_state=23, **best_params)
xgb_best_model.fit(data_X_train, data_y_train)

# R2 of the tuned model on the held-out rows.
y_pred = xgb_best_model.predict(data_X_test)
r2 = r2_score(data_y_test, y_pred)
print("R-squared Score (R2): {:.4f}".format(r2))

|   iter    |  target   | learni... | max_depth | n_esti... |
-------------------------------------------------------------
| [0m1        [0m | [0m-2.599e+0[0m | [0m0.1979   [0m | [0m6.668    [0m | [0m866.0    [0m |
| [95m2        [0m | [95m-2.57e+03[0m | [95m0.02835  [0m | [95m5.401    [0m | [95m182.4    [0m |
| [95m3        [0m | [95m-2.511e+0[0m | [95m0.06348  [0m | [95m3.965    [0m | [95m429.7    [0m |
| [95m4        [0m | [95m-2.51e+03[0m | [95m0.102    [0m | [95m4.861    [0m | [95m430.4    [0m |
| [0m5        [0m | [0m-2.679e+0[0m | [0m0.3      [0m | [0m7.0      [0m | [0m487.3    [0m |
| [0m6        [0m | [0m-2.679e+0[0m | [0m0.3      [0m | [0m7.0      [0m | [0m391.3    [0m |
| [0m7        [0m | [0m-2.514e+0[0m | [0m0.07505  [0m | [0m2.032    [0m | [0m448.1    [0m |
| [0m8        [0m | [0m-2.557e+0[0m | [0m0.2984   [0m | [0m3.221    [0m | [0m121.2    [0m |
| [0m9        [0m | [0m-2.543e+0[0m | [0

In [396]:
# Predictions for every row of data_x (rows used for fitting included) are
# stored as the 'KWHOTH' column.
data_g = data_g.assign(KWHOTH=xgb_best_model.predict(data_x))

In [397]:
# Target for this model: the column at position 331.
data_y_i = data.iloc[:, 331]

# Same 80/20 seeded split as before, now against the new target.
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(
    data_x,
    data_y_i,
    test_size=0.2,
    random_state=23,
)

# Baseline regressor: library defaults apart from the seed.
xgb_best_model = xgb.XGBRegressor(random_state=23)
xgb_best_model.fit(data_X_train, data_y_train)

# R2 on the held-out rows.
y_pred = xgb_best_model.predict(data_X_test)
r2 = r2_score(y_true=data_y_test, y_pred=y_pred)
print("R-squared Score (R2): {:.4f}".format(r2))

R-squared Score (R2): 0.6001


In [398]:
def xgb_cv(n_estimators, max_depth, learning_rate):
    """Cross-validated objective maximised by the Bayesian search.

    Builds an XGBoost regressor from the given values (``n_estimators`` and
    ``max_depth`` truncated to ``int``), scores it with 5-fold
    ``neg_root_mean_squared_error`` on the notebook-level ``data_X_train`` /
    ``data_y_train``, and returns the mean of the fold scores.
    """
    settings = dict(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        learning_rate=learning_rate,
    )
    candidate = xgb.XGBRegressor(**settings)
    fold_scores = cross_val_score(
        candidate,
        data_X_train,
        data_y_train,
        scoring='neg_root_mean_squared_error',
        cv=5,
    )
    return fold_scores.mean()


# Lower/upper bounds of the search space handed to the optimiser.
param_bounds = {
    'n_estimators': (100, 1000),
    'max_depth': (2, 7),
    'learning_rate': (0.01, 0.3)
}

# The optimiser is seeded so that the sampled points, and therefore the
# selected hyper-parameters, are the same on every run of this cell.
optimizer = BayesianOptimization(
    f=xgb_cv,
    pbounds=param_bounds,
    random_state=23,
    verbose=2
)
optimizer.maximize(init_points=3, n_iter=7)

print("Best Parameters Found:", optimizer.max)

# Work on a copy so the optimiser's own record is not edited. The search is
# over real numbers, so the two integer parameters are truncated here the
# same way xgb_cv truncates them.
best_params = dict(optimizer.max['params'])
best_params['n_estimators'] = int(best_params['n_estimators'])
best_params['max_depth'] = int(best_params['max_depth'])

# Refit on the full training split with the selected parameters, using the
# same seed as the baseline model.
xgb_best_model = xgb.XGBRegressor(random_state=23, **best_params)
xgb_best_model.fit(data_X_train, data_y_train)

# R2 of the tuned model on the held-out rows.
y_pred = xgb_best_model.predict(data_X_test)
r2 = r2_score(data_y_test, y_pred)
print("R-squared Score (R2): {:.4f}".format(r2))

|   iter    |  target   | learni... | max_depth | n_esti... |
-------------------------------------------------------------
| [0m1        [0m | [0m-1.765e+0[0m | [0m0.1484   [0m | [0m6.383    [0m | [0m411.8    [0m |
| [0m2        [0m | [0m-1.797e+0[0m | [0m0.193    [0m | [0m5.502    [0m | [0m992.0    [0m |
| [0m3        [0m | [0m-1.817e+0[0m | [0m0.2722   [0m | [0m6.989    [0m | [0m499.4    [0m |
| [0m4        [0m | [0m-1.845e+0[0m | [0m0.2915   [0m | [0m6.041    [0m | [0m413.0    [0m |
| [95m5        [0m | [95m-1.756e+0[0m | [95m0.1108   [0m | [95m5.411    [0m | [95m992.0    [0m |
| [0m6        [0m | [0m-1.818e+0[0m | [0m0.02814  [0m | [0m3.281    [0m | [0m318.6    [0m |
| [95m7        [0m | [95m-1.737e+0[0m | [95m0.03622  [0m | [95m6.041    [0m | [95m411.7    [0m |
| [0m8        [0m | [0m-1.825e+0[0m | [0m0.2843   [0m | [0m6.347    [0m | [0m793.6    [0m |
| [0m9        [0m | [0m-1.812e+0[0m | [0m0.27

In [399]:
# Predictions for every row of data_x (rows used for fitting included) are
# stored as the 'KWHSPH' column.
data_g = data_g.assign(KWHSPH=xgb_best_model.predict(data_x))

In [400]:
# Target for this model: the column at position 332.
data_y_i = data.iloc[:, 332]

# Same 80/20 seeded split as before, now against the new target.
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(
    data_x,
    data_y_i,
    test_size=0.2,
    random_state=23,
)

# Regressor with library defaults apart from the seed; no search is run for
# this target in the following cells.
xgb_best_model = xgb.XGBRegressor(random_state=23)
xgb_best_model.fit(data_X_train, data_y_train)

# R2 on the held-out rows.
y_pred = xgb_best_model.predict(data_X_test)
r2 = r2_score(y_true=data_y_test, y_pred=y_pred)
print("R-squared Score (R2): {:.4f}".format(r2))

R-squared Score (R2): 0.5916


In [401]:
# Predictions for every row of data_x (rows used for fitting included) are
# stored as the 'KWHCOL' column.
data_g = data_g.assign(KWHCOL=xgb_best_model.predict(data_x))

In [402]:
# Target for this model: the column at position 333.
data_y_i = data.iloc[:, 333]

# Same 80/20 seeded split as before, now against the new target.
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(
    data_x,
    data_y_i,
    test_size=0.2,
    random_state=23,
)

# Regressor with library defaults apart from the seed; no search is run for
# this target in the following cells.
xgb_best_model = xgb.XGBRegressor(random_state=23)
xgb_best_model.fit(data_X_train, data_y_train)

# R2 on the held-out rows.
y_pred = xgb_best_model.predict(data_X_test)
r2 = r2_score(y_true=data_y_test, y_pred=y_pred)
print("R-squared Score (R2): {:.4f}".format(r2))

R-squared Score (R2): 0.6621


In [403]:
# Predictions for every row of data_x (rows used for fitting included) are
# stored as the 'KWHWTH' column.
data_g = data_g.assign(KWHWTH=xgb_best_model.predict(data_x))

In [404]:
# Rebind the predictors to the full positional slice (columns 1 through 290)
# and the target to column 0.
data_x = data.iloc[:, slice(1, 291)]
data_y = data.iloc[:, 0]

# 80/20 hold-out split with a fixed seed.
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.2,
    random_state=23,
)

In [405]:
# Shallow random forest, fitted only to rank the predictors.
rf = RandomForestRegressor(n_estimators=50, max_depth=3, random_state=23)
rf.fit(data_X_train, data_y_train)

feature_names = data_x.columns
feature_importance = rf.feature_importances_

# One row per predictor, highest importance first.
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

# The five highest-ranked predictors are joined to data_g row by row (index
# against index).
top_features = importance_df.head(5)['Feature'].tolist()
data_select = data_x[top_features]
data_g = data_g.merge(
    data_select,
    how='inner',
    left_index=True,
    right_index=True,
)

In [406]:
# XGBoost with library defaults, fitted only to rank the predictors.
xgb_best_model = xgb.XGBRegressor()
xgb_best_model.fit(data_X_train, data_y_train)

feature_importance = xgb_best_model.feature_importances_
feature_names = data_x.columns

# One row per predictor, highest importance first.
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

top_features = importance_df.head(5)['Feature'].tolist()
data_select = data_x[top_features]

# Merge only the columns data_g does not hold yet. Merging a column that is
# already present would make pandas rename both copies with '_x' / '_y'
# suffixes instead of keeping a single column under its original name.
new_features = [name for name in top_features if name not in data_g.columns]
data_g = data_g.merge(
    data_select[new_features],
    how='inner',
    left_index=True,
    right_index=True,
)

In [407]:
# Attach the current target series as the 'kwh' column (aligned on the index).
data_g = data_g.assign(kwh=data_y)

In [408]:
# Write the assembled frame to disk; with the default settings the row index
# is written as the first column.
data_g.to_csv(path_or_buf='data_g.csv')