# **Model for the Business Solar Energy Production**

The aim of this notebook is to create a baseline model for the deterministc part of the solar energy production time series, treated here as a cross-sectional problem. Furthermore, we will de-trend and de-seasonal the data.

## 0 - Set up

### 0.1 - Loading relevant packages

In [1]:
#for handling the data
import numpy as np
import pandas as pd

#for visualisation
import matplotlib.pyplot as plt
import seaborn as sns

sns.set() # set seaborn as default style


#for the modelling
from catboost import CatBoostRegressor, Pool

#for hyperparameter tunning
import optuna

#for model evaluation
from sklearn.metrics import mean_absolute_error


#model serilasation
import pickle as pk

#miscellania
import json
import warnings
warnings.filterwarnings("ignore")

RSEED = 42

### 0.2 - User-defined functions

#### Optuna objective function

In [2]:
def objective(trial):
    
    fixed_ignored_feats = ['shortwave_radiation', 'installed_capacity', 
                           "is_school_holiday", "is_holiday", "is_weekend", 
                           "cloudcover_high", 'highest_price_per_mwh', "snowfall",
                           'lowest_price_per_mwh', 'euros_per_mwh', 'prediction_unit_id'
                          ]
    
    ignored_feats = [fixed_ignored_feats, 
                     fixed_ignored_feats + ['is_population_over_100k'], 
                     fixed_ignored_feats + ['windspeed_10m', "winddirection_10m"],
                     fixed_ignored_feats + ['is_population_over_100k', 'windspeed_10m', "winddirection_10m"],
                     fixed_ignored_feats + ['highest_price_per_mwh_21_weeks_lag', 'lowest_price_per_mwh_21_weeks_lag', 
                                            'highest_price_per_mwh_23_weeks_lag', 'lowest_price_per_mwh_23_weeks_lag']
                    ]    
    
    idx = trial.suggest_categorical('idx', [0, 1, 2, 3, 4])
    
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 500, 1200),
        "task_type" : 'CPU',
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 3, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "ignored_features" : ignored_feats[idx],
        "loss_function" : 'RMSE',
        "eval_metric" : MAE_EXP(),
        "use_best_model" : True,
        "random_seed" : RSEED,
        "cat_features" : cat_feats
    }


    weight_train = np.array(X_train.installed_capacity)
    weight_val = np.array(X_val.installed_capacity)

    train_data = Pool(data = X_train, 
                      label = y_train.log1p_target_per_installed_capacity, 
                      weight = weight_train, 
                      cat_features = cat_feats
                     )


    eval_data = Pool(data = X_val, 
                     label = y_val.log1p_target_per_installed_capacity, 
                     weight = weight_val, 
                     cat_features = cat_feats
                    )


    model = CatBoostRegressor(**params, silent=True)
    model.fit(train_data, eval_set = eval_data)
    predictions = model.predict(X_val)
    
    y_true = np.array(y_val.target)
    y_pred = np.expm1(np.array(predictions))*weight_val
    
    mae_score = mean_absolute_error(y_true, y_pred)
    
    return mae_score

#### MAE with exponentials

In [3]:
class MAE_EXP:
   
    def is_max_optimal(self):
        return False # lower is better

    def evaluate(self, approxes, target, weight):
 
        y_true = np.exp(np.array(target))
        y_pred = np.exp(np.array(approxes))
        w = np.array(weight)
        
        
        score = np.mean(np.abs(y_true - y_pred)*w)
        return score, 1

    def get_final_error(self, error, weight):
        return error

#### Get targets and features

In [4]:
def get_feats_and_target(df):
    
    datetime_feats = df.select_dtypes(include=["datetime64"]).columns.to_list()
    cat_feats = df.select_dtypes(include=["category"]).columns.to_list()
    num_cols = df.select_dtypes(include=["float64", "int64"]).columns.to_list()
    
    num_feats = [elem for elem in num_cols if "target" not in elem]
    target = [elem for elem in num_cols if elem not in num_feats]
    
    cat_num_feats = cat_feats + num_feats
        
    return cat_feats, num_feats, cat_num_feats, target

#### Train/validation split

In [5]:
def my_train_val_split(df, date):

    train_cond = pd.to_datetime(df['date']) < pd.to_datetime(date)
    val_cond = pd.to_datetime(df['date']) >= pd.to_datetime(date)
    
    df_train = df[train_cond]
    df_val = df[val_cond]
    
    return df_train, df_val

## 1 - Loading the data

In [6]:
data_prod_df = pd.read_csv('../data/data_production.csv', parse_dates = ["datetime", "date"])

In [7]:
cat_cols = [
    "county",
    "is_business",
    "product_type",
    "hour",
    "month",
    "is_weekend",
    "is_holiday",
    "is_school_holiday",
    "is_population_over_100k"
]

data_prod_df[cat_cols] = data_prod_df[cat_cols].astype("category")

In [8]:
data_prod_df.head()

Unnamed: 0,county,is_business,product_type,prediction_unit_id,datetime,date,solar_azimuth,solar_altitude,hour,month,...,cloudcover_high,windspeed_10m,winddirection_10m,shortwave_radiation,direct_solar_radiation,diffuse_radiation,is_population_over_100k,is_holiday,is_school_holiday,target
0,0,0,1,0,2022-02-09,2022-02-09,167.90799,-45.119922,0,2,...,0.0,4.69,154.0,0.0,0.0,0.0,1,0,0,0.212
1,0,0,2,1,2022-02-09,2022-02-09,167.90799,-45.119922,0,2,...,0.0,4.69,154.0,0.0,0.0,0.0,1,0,0,0.0
2,0,0,3,2,2022-02-09,2022-02-09,167.90799,-45.119922,0,2,...,0.0,4.69,154.0,0.0,0.0,0.0,1,0,0,0.005
3,0,1,0,3,2022-02-09,2022-02-09,167.90799,-45.119922,0,2,...,0.0,4.69,154.0,0.0,0.0,0.0,1,0,0,0.0
4,0,1,1,4,2022-02-09,2022-02-09,167.90799,-45.119922,0,2,...,0.0,4.69,154.0,0.0,0.0,0.0,1,0,0,0.0


## 2 - Features and target

We separate the different types of features we have in our dataset, namely *datetime*, *categorical* and *numerical features*.

In [9]:
data_prod_df["log1p_target_per_installed_capacity"] = np.log1p(data_prod_df.target/data_prod_df.installed_capacity)

In [10]:
cat_feats, num_feats, cat_num_feats, target = get_feats_and_target(data_prod_df)

In [11]:
data_df = data_prod_df[data_prod_df['is_business'] == 1]
data_train_df, data_val_df = my_train_val_split(data_df, date = '2023-05-24')

In [12]:
X_train = data_train_df[cat_num_feats].copy()
y_train = data_train_df[['datetime'] + target].copy()

X_val = data_val_df[cat_num_feats].copy()
y_val = data_val_df[['datetime'] + target].copy()

##  3 - Hyperparameter tuning

In [13]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=60)

[I 2024-04-20 16:27:17,424] A new study created in memory with name: no-name-ec9c49dd-9954-49fe-93d4-d75aed9a9318
[I 2024-04-20 16:32:00,074] Trial 0 finished with value: 87.44178080656717 and parameters: {'idx': 4, 'n_estimators': 1178, 'learning_rate': 0.07592279806927878, 'depth': 7, 'min_data_in_leaf': 42}. Best is trial 0 with value: 87.44178080656717.
[I 2024-04-20 16:34:22,956] Trial 1 finished with value: 130.87197810953182 and parameters: {'idx': 0, 'n_estimators': 957, 'learning_rate': 0.006692709283172387, 'depth': 5, 'min_data_in_leaf': 47}. Best is trial 0 with value: 87.44178080656717.
[I 2024-04-20 16:37:38,550] Trial 2 finished with value: 140.5626433420545 and parameters: {'idx': 1, 'n_estimators': 793, 'learning_rate': 0.0030765984771863055, 'depth': 9, 'min_data_in_leaf': 32}. Best is trial 0 with value: 87.44178080656717.
[I 2024-04-20 16:41:02,046] Trial 3 finished with value: 131.2767259189026 and parameters: {'idx': 3, 'n_estimators': 853, 'learning_rate': 0.0045

In [14]:
fixed_ignored_feats = ['shortwave_radiation', 'installed_capacity', 
                       "is_school_holiday", "is_holiday", "is_weekend", 
                       "cloudcover_high", 'highest_price_per_mwh', "snowfall",
                       'lowest_price_per_mwh', 'euros_per_mwh', 'prediction_unit_id'
                      ]
    
ignored_feats = [fixed_ignored_feats, 
                 fixed_ignored_feats + ['is_population_over_100k'], 
                 fixed_ignored_feats + ['windspeed_10m', "winddirection_10m"],
                 fixed_ignored_feats + ['is_population_over_100k', 'windspeed_10m', "winddirection_10m"], 
                 fixed_ignored_feats + ['highest_price_per_mwh_21_weeks_lag', 'lowest_price_per_mwh_21_weeks_lag', 
                                        'highest_price_per_mwh_23_weeks_lag', 'lowest_price_per_mwh_23_weeks_lag']
                ] 

business_prod_best_params = study.best_params
print('Best MAE for validation set:', study.best_value)

idx = business_prod_best_params.get('idx')
del business_prod_best_params['idx']
business_prod_best_params.update({'ignored_features' : ignored_feats[idx], 
                                 'cat_features' : cat_feats, 
                                 'MAE' : study.best_value})

with open("best_parameters_business_production_final.json", "w") as outfile: 
    json.dump(business_prod_best_params, outfile)

Best MAE for validation set: 84.12082501533493


In [15]:
cont_overall_mae = study.best_value*data_df['target'].size/(2*data_prod_df['target'].size)
print('Contribution to the overall MAE:', cont_overall_mae)

Contribution to the overall MAE: 22.855054632096973
