In [1]:
# Offline install of deeptables 0.2.5: --no-index disables the package index and
# --find-links resolves the wheels from the attached Kaggle dataset directory.
!pip install --no-index -U --find-links=/kaggle/input/deeptables-dependecies deeptables==0.2.5

Looking in links: /kaggle/input/deeptables-dependecies
Processing /kaggle/input/deeptables-dependecies/deeptables-0.2.5-py3-none-any.whl
Processing /kaggle/input/deeptables-dependecies/hypernets-0.3.0-py3-none-any.whl (from deeptables==0.2.5)
Processing /kaggle/input/deeptables-dependecies/XlsxWriter-3.1.9-py3-none-any.whl (from hypernets>=0.2.5.1->deeptables==0.2.5)
Processing /kaggle/input/deeptables-dependecies/paramiko-3.3.1-py3-none-any.whl (from hypernets>=0.2.5.1->deeptables==0.2.5)
Processing /kaggle/input/deeptables-dependecies/bcrypt-4.0.1-cp36-abi3-manylinux_2_28_x86_64.whl (from paramiko->hypernets>=0.2.5.1->deeptables==0.2.5)
Processing /kaggle/input/deeptables-dependecies/PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (from paramiko->hypernets>=0.2.5.1->deeptables==0.2.5)
Installing collected packages: XlsxWriter, bcrypt, pynacl, paramiko, hypernets, deeptables
Successfully installed XlsxWriter-3.1.9 bcrypt-4.0.1 deeptables-0.2

In [96]:
import warnings
warnings.filterwarnings("ignore")

import os
import math
import numpy as np 
import pandas as pd 
import polars as pl 
import matplotlib.pyplot as plt 
import seaborn as sns
import holidays
import lightgbm as lgb
import tensorflow as tf, deeptables as dt
from tensorflow.keras.utils import plot_model
from tensorflow_addons.optimizers import AdamW
from tensorflow.python.keras import backend as K
from deeptables.models import DeepTable, ModelConfig
from deeptables.models import deepnets
import joblib

print('Tensorflow version:', tf.__version__)
print('DeepTables version:', dt.__version__)

# large number of warnings in data processing step
import warnings
warnings.filterwarnings("ignore")


# fast ai libraries
from fastai.tabular.all import *

# constants
SEED = 2024 # global seed for notebook
BATCH_SIZE = 1024 # batch size handed to the fastai dataloaders in convert_2_dataloader
EPOCHS = 20 # presumably the number of training epochs; not used in the cells shown here - TODO confirm

#library from Yelim
from statsmodels.tsa.stattools import acf, pacf
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


Tensorflow version: 2.13.0
DeepTables version: 0.2.5


In [97]:
import warnings
warnings.filterwarnings("ignore")

import os
import math
import numpy as np 
import pandas as pd 
import polars as pl 
import matplotlib.pyplot as plt 
import holidays


from datetime import timedelta

In [98]:
class DataStorage:
    """
    Load the raw competition tables into polars frames and keep them up to date.

    This class was copied out from:
    https://www.kaggle.com/code/vitalykudelya/enefit-object-oriented-gbdt

    The constructor reads every CSV under ``root``, ``update_with_new_data``
    appends newly provided rows, and ``run`` applies the optional cleaning
    steps that were added on top of the copied class.
    """

    root = "/kaggle/input/predict-energy-behavior-of-prosumers"

    # Column subsets read from each CSV; columns not listed are never loaded.
    data_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
        "row_id",
        "prediction_unit_id",
    ]
    client_cols = [
        "product_type",
        "county",
        "eic_count",
        "installed_capacity",
        "is_business",
        "date",
    ]
    gas_prices_cols = ["forecast_date", "lowest_price_per_mwh", "highest_price_per_mwh"]
    electricity_prices_cols = ["forecast_date", "euros_per_mwh"]
    forecast_weather_cols = [
        "latitude",
        "longitude",
        "hours_ahead",
        "temperature",
        "dewpoint",
        "cloudcover_high",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_total",
        "10_metre_u_wind_component",
        "10_metre_v_wind_component",
        "forecast_datetime",
        "direct_solar_radiation",
        "surface_solar_radiation_downwards",
        "snowfall",
        "total_precipitation",
    ]
    historical_weather_cols = [
        "datetime",
        "temperature",
        "dewpoint",
        "rain",
        "snowfall",
        "surface_pressure",
        "cloudcover_total",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_high",
        "windspeed_10m",
        "winddirection_10m",
        "shortwave_radiation",
        "direct_solar_radiation",
        "diffuse_radiation",
        "latitude",
        "longitude",
    ]
    location_cols = ["longitude", "latitude", "county"]
    target_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
    ]

    def __init__(self):
        """Read all CSVs, keep train rows from 2022-01-01 onwards and record each schema."""
        self.df_data = self._read_csv("train.csv", self.data_cols)
        self.df_client = self._read_csv("client.csv", self.client_cols)
        self.df_gas_prices = self._read_csv("gas_prices.csv", self.gas_prices_cols)
        self.df_electricity_prices = self._read_csv(
            "electricity_prices.csv", self.electricity_prices_cols
        )
        self.df_forecast_weather = self._read_csv(
            "forecast_weather.csv", self.forecast_weather_cols
        )
        self.df_historical_weather = self._read_csv(
            "historical_weather.csv", self.historical_weather_cols
        )
        self.df_weather_station_to_county_mapping = self._read_csv(
            "weather_station_to_county_mapping.csv", self.location_cols
        )

        # Only rows from 2022 onwards are kept for training.
        self.df_data = self.df_data.filter(
            pl.col("datetime") >= pd.to_datetime("2022-01-01")
        )
        self.df_target = self.df_data.select(self.target_cols)

        # Schemas are stored so that frames built from pandas later
        # (update_with_new_data / preprocess_test) get the same dtypes.
        self.schema_data = self.df_data.schema
        self.schema_client = self.df_client.schema
        self.schema_gas_prices = self.df_gas_prices.schema
        self.schema_electricity_prices = self.df_electricity_prices.schema
        self.schema_forecast_weather = self.df_forecast_weather.schema
        self.schema_historical_weather = self.df_historical_weather.schema
        self.schema_target = self.df_target.schema

        # Coordinates are cast to Float32 so they match the Float32 cast applied
        # to the weather frames before joining on (longitude, latitude).
        self.df_weather_station_to_county_mapping = (
            self.df_weather_station_to_county_mapping.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
        )

    def _read_csv(self, file_name, columns):
        """Read ``file_name`` from ``root``, loading only ``columns`` and parsing dates."""
        return pl.read_csv(
            os.path.join(self.root, file_name),
            columns=columns,
            try_parse_dates=True,
        )

    def run(self): # roll -1hr, for only T & T_d
        """
        Apply the optional cleaning steps to the stored frames.

        Steps: interpolate missing targets, fill missing forecast radiation,
        insert the forecast hours that are absent, derive forecast ``rain``,
        add ``exp_hours_ahead`` and convert historical snowfall.

        Returns:
            This ``DataStorage`` instance, with the cleaned frames stored on it.

        Notes:
            The original body called ``self.histRoll`` (not defined in this
            class) and returned ``self.df`` (never assigned), so it could not
            run. ``histRoll`` is now applied only when a subclass or the caller
            provides it. NOTE(review): per the inline comment it was meant to
            roll temperature and dewpoint by -1 hour - implement or drop it.

            NOTE(review): ``separateTP`` / ``expForecastHr`` add columns that
            frames passed to ``update_with_new_data`` do not have, so the
            concatenation there will presumably fail after ``run`` - confirm
            the intended call order.
        """
        self.df_data = self.fill_target(self.df_data)

        forecast = self.df_forecast_weather
        for step in (
            self.fill_radiation,
            self.fill_summertime,
            self.separateTP,
            self.expForecastHr,
        ):
            forecast = step(forecast)
        self.df_forecast_weather = forecast

        historical = self.snow2water(self.df_historical_weather)
        hist_roll = getattr(self, "histRoll", None)
        if callable(hist_roll):
            historical = hist_roll(historical)
        self.df_historical_weather = historical
        return self

    def _interpolate_group(self, group):
        """Linearly interpolate the ``target`` column of one pandas group (in place) and return it."""
        group['target'] = group['target'].interpolate(method='linear')
        return group

    def fill_target(self, df):
        """
        Linearly interpolate missing targets within each
        (prediction_unit_id, is_consumption) series.

        The row order of ``df`` is preserved. Returns a new polars frame.
        """
        frame = df.to_pandas()
        # transform keeps the original index/order, unlike a grouped apply.
        frame['target'] = frame.groupby(['prediction_unit_id', 'is_consumption'])['target'].transform(
            lambda series: series.interpolate(method='linear')
        )
        return pl.from_pandas(frame)

    def fill_radiation(self, df):
        """
        Fill null ``surface_solar_radiation_downwards`` values in forecast weather.

        For every null row, the rows at the same location, same hour of day and
        same ``hours_ahead`` within +/- 2 days are selected. The fill value is the
        middle row's direct radiation divided by the mean direct/surface ratio of
        the first and third selected rows.

        NOTE(review): this assumes the filter returns exactly three rows in
        chronological order (previous day, the null row, next day) - confirm;
        fewer rows raise an out-of-bounds error and a 0/0 ratio yields NaN.
        """
        column = 'surface_solar_radiation_downwards'
        idx = df[column].is_null().to_numpy().nonzero()[0]
        if len(idx) == 0:
            return df

        rad = df.to_pandas()[column].to_numpy(copy=True)
        null_rows = df[idx]  # selected once instead of once per iteration
        for i, ind in enumerate(idx):
            tmp = null_rows[i]
            df_b1 = df.filter(
                pl.col('latitude')==tmp['latitude'], pl.col('longitude')==tmp['longitude'],
                abs(pl.col('forecast_datetime') - tmp['forecast_datetime']) < timedelta(days=2),
                pl.col('forecast_datetime').dt.hour() == tmp['forecast_datetime'].dt.hour(),
                pl.col('hours_ahead') == tmp['hours_ahead'])
            neighbour_ratio = (
                np.divide(df_b1['direct_solar_radiation'][0], df_b1[column][0])
                + np.divide(df_b1['direct_solar_radiation'][2], df_b1[column][2])
            ) / 2
            rad[ind] = df_b1['direct_solar_radiation'][1] / neighbour_ratio
        # with_columns replaces the same-named column and returns a new frame.
        return df.with_columns(pl.Series(column, rad))

    def fill_summertime(self, df):
        """
        Add rows for hourly ``forecast_datetime`` values absent from ``df``.

        For each missing hour the rows within +/- 2 hours having
        ``hours_ahead`` <= 2 are paired (hours_ahead 1 then 2, sorted by
        location) and averaged column-wise; the new row gets the missing
        timestamp and ``hours_ahead`` = 2.

        NOTE(review): the method name suggests the gaps come from daylight
        saving time changes - confirm.
        """
        df_pd = df.to_pandas()
        missingDate = list(set(pd.date_range('2021-09-01', '2023-06-02', freq='h')[3:-22]) - set(df_pd['forecast_datetime'].unique()))
        hrs_ahead = 2
        new_rows = []  # collected and concatenated once at the end
        for date in missingDate:
            tmp = df.filter(abs(pl.col('forecast_datetime') - date) < timedelta(hours=2),
                            pl.col('hours_ahead') <= 2).sort('latitude', 'longitude').to_pandas()
            for _, row in tmp.iterrows():
                if row['hours_ahead'] == 1:
                    values_1 = row
                elif row['hours_ahead'] == 2:
                    values_2 = row
                    average_values = pd.Series([(v1+v2)/2 if c != 'forecast_datetime' else date for (v1,v2,c) in zip(values_1,values_2,values_2.keys())],
                                            index=values_2.keys())
                    average_values['hours_ahead'] = hrs_ahead
                    new_rows.append(average_values.to_frame().T)
        return pl.DataFrame(pd.concat([df_pd, *new_rows]).reset_index(drop=True))

    def snow2water(self, df): # for historical: [cm]/7->[mm]
        """
        Rescale historical ``snowfall`` by 1/7, as described by the inline comment.

        The previous body was a copy of ``separateTP`` and read
        ``total_precipitation``, a column that ``historical_weather_cols`` never
        loads. NOTE(review): implemented from the "[cm]/7->[mm]" comment -
        confirm this is the intended conversion.
        """
        return df.with_columns((pl.col('snowfall') / 7).alias('snowfall'))

    def separateTP(self, df):
        """
        Add a ``rain`` column to forecast weather:
        ``total_precipitation - snowfall / 100``.

        NOTE(review): the /100 factor presumably reconciles the units of the
        two columns - confirm.
        """
        return df.with_columns(
            (pl.col('total_precipitation') - pl.col('snowfall') / 100).alias('rain')
        )

    def expForecastHr(self, df):
        """
        Add ``exp_hours_ahead`` = exp(hours_ahead - 48), i.e. exp(hours_ahead) / exp(48).

        Computed as a polars expression instead of a per-element Python
        callback; subtracting before exponentiating also avoids dividing two
        very large numbers.
        """
        return df.with_columns(
            (pl.col('hours_ahead').cast(pl.Float64) - 48).exp().alias('exp_hours_ahead')
        )

    def update_with_new_data(
        self,
        df_new_client,
        df_new_gas_prices,
        df_new_electricity_prices,
        df_new_forecast_weather,
        df_new_historical_weather,
        df_new_target,
    ):
        """
        Append newly provided pandas frames to the stored polars frames.

        Each incoming frame is restricted to the configured columns, converted
        with the schema recorded in ``__init__``, concatenated to the stored
        frame and de-duplicated on that table's key columns.
        """
        df_new_client = pl.from_pandas(
            df_new_client[self.client_cols], schema_overrides=self.schema_client
        )
        df_new_gas_prices = pl.from_pandas(
            df_new_gas_prices[self.gas_prices_cols],
            schema_overrides=self.schema_gas_prices,
        )
        df_new_electricity_prices = pl.from_pandas(
            df_new_electricity_prices[self.electricity_prices_cols],
            schema_overrides=self.schema_electricity_prices,
        )
        df_new_forecast_weather = pl.from_pandas(
            df_new_forecast_weather[self.forecast_weather_cols],
            schema_overrides=self.schema_forecast_weather,
        )
        df_new_historical_weather = pl.from_pandas(
            df_new_historical_weather[self.historical_weather_cols],
            schema_overrides=self.schema_historical_weather,
        )
        df_new_target = pl.from_pandas(
            df_new_target[self.target_cols], schema_overrides=self.schema_target
        )

        self.df_client = pl.concat([self.df_client, df_new_client]).unique(
            ["date", "county", "is_business", "product_type"]
        )
        self.df_gas_prices = pl.concat([self.df_gas_prices, df_new_gas_prices]).unique(
            ["forecast_date"]
        )
        self.df_electricity_prices = pl.concat(
            [self.df_electricity_prices, df_new_electricity_prices]
        ).unique(["forecast_date"])
        self.df_forecast_weather = pl.concat(
            [self.df_forecast_weather, df_new_forecast_weather]
        ).unique(["forecast_datetime", "latitude", "longitude", "hours_ahead"])
        self.df_historical_weather = pl.concat(
            [self.df_historical_weather, df_new_historical_weather]
        ).unique(["datetime", "latitude", "longitude"])
        self.df_target = pl.concat([self.df_target, df_new_target]).unique(
            ["datetime", "county", "is_business", "product_type", "is_consumption"]
        )

    def preprocess_test(self, df_test):
        """
        Convert a pandas test frame to polars with the training dtypes.

        ``prediction_datetime`` is renamed to ``datetime`` and every data column
        except the first one (``target``) is selected.
        """
        df_test = df_test.rename(columns={"prediction_datetime": "datetime"})
        df_test = pl.from_pandas(
            df_test[self.data_cols[1:]], schema_overrides=self.schema_data
        )
        return df_test

In [99]:
class FeaturesGenerator:
    """
    Build the feature matrix by joining calendar, client, weather and
    lagged-target information onto the rows to be predicted.

    This class was copied out from:
    https://www.kaggle.com/code/vitalykudelya/enefit-object-oriented-gbdt
    """

    def __init__(self, data_storage):
        # The storage object is read by every feature step.
        self.data_storage = data_storage

    @staticmethod
    def _shift_datetime(df, hours_lag):
        """Return ``df`` with its ``datetime`` column moved forward by ``hours_lag`` hours."""
        return df.with_columns(pl.col("datetime") + pl.duration(hours=hours_lag))

    def _attach_county(self, df_weather):
        """Replace the (longitude, latitude) pair of a weather frame by its county."""
        station_to_county = self.data_storage.df_weather_station_to_county_mapping
        as_float32 = df_weather.with_columns(
            pl.col("latitude").cast(pl.datatypes.Float32),
            pl.col("longitude").cast(pl.datatypes.Float32),
        )
        with_county = as_float32.join(
            station_to_county,
            how="left",
            on=["longitude", "latitude"],
        )
        return with_county.drop("longitude", "latitude")

    def _add_general_features(self, df_features):
        """Add calendar parts, the ``segment`` key and sine/cosine encodings of day-of-year and hour."""
        timestamp = pl.col("datetime")
        calendar_parts = [
            timestamp.dt.ordinal_day().alias("dayofyear"),
            timestamp.dt.hour().alias("hour"),
            timestamp.dt.day().alias("day"),
            timestamp.dt.weekday().alias("weekday"),
            timestamp.dt.month().alias("month"),
            timestamp.dt.year().alias("year"),
        ]
        segment_key = pl.concat_str(
            "county",
            "is_business",
            "product_type",
            "is_consumption",
            separator="_",
        ).alias("segment")
        cyclic_parts = [
            (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
            (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
            (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
            (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
        ]

        with_calendar = df_features.with_columns(calendar_parts)
        with_segment = with_calendar.with_columns(segment_key)
        return with_segment.with_columns(cyclic_parts)

    def _add_client_features(self, df_features):
        """Left-join client data whose ``date`` has been moved two days forward."""
        client_shifted = self.data_storage.df_client.with_columns(
            (pl.col("date") + pl.duration(days=2)).cast(pl.Date)
        )
        return df_features.join(
            client_shifted,
            on=["county", "is_business", "product_type", "date"],
            how="left",
        )

    def _add_forecast_weather_features(self, df_features):
        """
        Join forecast weather averaged over all stations and per county,
        for lags of 0 and 168 hours.

        Columns only receive the join suffix when their name already exists in
        ``df_features``, so the join order below determines the final names.
        """
        #.filter((pl.col("hours_ahead") >= 22) & pl.col("hours_ahead") <= 45)
        forecast = self._attach_county(
            self.data_storage.df_forecast_weather
            .rename({"forecast_datetime": "datetime"})
            .drop("hours_ahead")
        )

        overall_mean = forecast.group_by("datetime").mean().drop("county")
        county_mean = (
            forecast.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [0, 7 * 24]:
            df_features = df_features.join(
                self._shift_datetime(overall_mean, hours_lag),
                on="datetime",
                how="left",
                suffix=f"_forecast_{hours_lag}h",
            )
            df_features = df_features.join(
                self._shift_datetime(county_mean, hours_lag),
                on=["county", "datetime"],
                how="left",
                suffix=f"_forecast_local_{hours_lag}h",
            )

        return df_features

    def _add_historical_weather_features(self, df_features):
        """
        Join historical weather averaged over all stations and per county for
        lags of 48 and 168 hours, plus the all-station average at 24 hours.
        """
        historical = self._attach_county(self.data_storage.df_historical_weather)

        overall_mean = historical.group_by("datetime").mean().drop("county")
        county_mean = (
            historical.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [2 * 24, 7 * 24]:
            df_features = df_features.join(
                self._shift_datetime(overall_mean, hours_lag),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )
            df_features = df_features.join(
                self._shift_datetime(county_mean, hours_lag),
                on=["county", "datetime"],
                how="left",
                suffix=f"_historical_local_{hours_lag}h",
            )

        for hours_lag in [1 * 24]:
            # The temporary "hour" column only serves the disabled filter below.
            day_ago = (
                overall_mean.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag),
                    pl.col("datetime").dt.hour().alias("hour"),
                )
                #.filter(pl.col("hour") <= 10)
                .drop("hour")
            )
            df_features = df_features.join(
                day_ago,
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )

        return df_features

    def _add_target_features(self, df_features):
        """
        Join lagged targets (per segment and aggregated), then add the row-wise
        mean/std of the 48-120 hour lags and ratios between selected lags.
        """
        df_target = self.data_storage.df_target

        # Targets summed over product types, and over product types and counties.
        sum_over_types = (
            df_target.group_by(["datetime", "county", "is_business", "is_consumption"])
            .sum()
            .drop("product_type")
        )
        sum_over_counties_and_types = (
            df_target.group_by(["datetime", "is_business", "is_consumption"])
            .sum()
            .drop("product_type", "county")
        )

        segment_keys = [
            "county",
            "is_business",
            "product_type",
            "is_consumption",
            "datetime",
        ]
        individual_lags = [days * 24 for days in range(2, 15)]
        individual_lags += [6, 12, 84, 3096]  ### add juwon
        for hours_lag in individual_lags:
            lagged_target = self._shift_datetime(df_target, hours_lag).rename(
                {"target": f"target_{hours_lag}h"}
            )
            df_features = df_features.join(lagged_target, on=segment_keys, how="left")

        for hours_lag in [2 * 24, 3 * 24, 7 * 24, 14 * 24]:
            lagged_type_sum = self._shift_datetime(sum_over_types, hours_lag).rename(
                {"target": f"target_all_type_sum_{hours_lag}h"}
            )
            df_features = df_features.join(
                lagged_type_sum,
                on=["county", "is_business", "is_consumption", "datetime"],
                how="left",
            )

            lagged_county_sum = self._shift_datetime(
                sum_over_counties_and_types, hours_lag
            ).rename({"target": f"target_all_county_type_sum_{hours_lag}h"})
            df_features = df_features.join(
                lagged_county_sum,
                on=["is_business", "is_consumption", "datetime"],
                how="left",
                suffix=f"_all_county_type_sum_{hours_lag}h",
            )

        cols_for_stats = [
            f"target_{hours_lag}h" for hours_lag in [2 * 24, 3 * 24, 4 * 24, 5 * 24]
        ]
        stats_input = df_features.select(cols_for_stats)
        row_mean = stats_input.mean(axis=1).alias("target_mean")
        # Row-wise std is obtained by transposing, taking column std, and transposing back.
        row_std = stats_input.transpose().std().transpose().to_series().alias("target_std")
        df_features = df_features.with_columns(row_mean, row_std)

        ratio_specs = [
            ("target", 24 * 7, 24 * 14),
            ("target", 24 * 2, 24 * 9),
            ("target", 24 * 3, 24 * 10),
            ("target", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 7, 24 * 14),
            ("target_all_county_type_sum", 24 * 2, 24 * 3),
            ("target_all_county_type_sum", 24 * 7, 24 * 14),
        ]
        # 1e-3 in the denominator keeps the ratio finite when the lagged value is zero.
        ratio_columns = [
            (
                pl.col(f"{prefix}_{numerator_lag}h")
                / (pl.col(f"{prefix}_{denominator_lag}h") + 1e-3)
            ).alias(f"{prefix}_ratio_{numerator_lag}_{denominator_lag}")
            for prefix, numerator_lag, denominator_lag in ratio_specs
        ]
        return df_features.with_columns(ratio_columns)

    def _reduce_memory_usage(self, df_features):
        """Downcast every Float64 column to Float32."""
        return df_features.with_columns(pl.col(pl.Float64).cast(pl.Float32))

    def _drop_columns(self, df_features):
        """Remove helper columns that are not used as features."""
        return df_features.drop("datetime", "hour", "dayofyear")

    def _to_pandas(self, df_features, y):
        """
        Convert to pandas, append the target column when ``y`` is given, index
        by ``row_id`` and cast the identifier columns to ``category``.
        """
        categorical = [
            "county",
            "is_business",
            "product_type",
            "is_consumption",
            "segment",
        ]

        frame = df_features.to_pandas()
        if y is not None:
            frame = pd.concat([frame, y.to_pandas()], axis=1)

        frame = frame.set_index("row_id")
        frame[categorical] = frame[categorical].astype("category")
        return frame

    def generate_features(self, df_prediction_items):
        """
        Run the full feature pipeline on ``df_prediction_items``.

        A ``target`` column, when present, is split off before feature
        generation and re-attached in the returned pandas frame.
        """
        has_target = "target" in df_prediction_items.columns
        y = df_prediction_items.select("target") if has_target else None
        if has_target:
            df_prediction_items = df_prediction_items.drop("target")

        df_features = df_prediction_items.with_columns(
            pl.col("datetime").cast(pl.Date).alias("date"),
        )

        pipeline = (
            self._add_general_features,
            self._add_client_features,
            self._add_forecast_weather_features,
            self._add_historical_weather_features,
            self._add_target_features,
            self._reduce_memory_usage,
            self._drop_columns,
        )
        for step in pipeline:
            df_features = step(df_features)

        return self._to_pandas(df_features, y)

In [92]:
def convert_2_dataloader(df, _seed):
    """
    Turn the engineered pandas frame into fastai tabular dataloaders.

    The ``target`` column of ``df`` is replaced in place by ``log1p(target)``
    before a random 80/20 train/validation split seeded with ``_seed``.

    Returns the dataloaders created with batch size ``BATCH_SIZE``.
    """

    def _columns_containing(fragment):
        # Column names of df that contain the given substring, in frame order.
        return [name for name in df.columns if fragment in name]

    def _columns_matching(pattern):
        # Column names of df selected by a regular expression.
        return df.filter(regex=pattern).columns.tolist()

    # Categorical inputs: identifiers from train.csv, calendar encodings derived
    # from the datetime column, and the holiday flag from
    # https://www.kaggle.com/code/albansteff/enefit-estonian-holidays-lb-65-79
    cat_names = [
        "county", "is_business", "product_type", "is_consumption", "segment",
        "weekday", "month", 'sin(dayofyear)', 'cos(dayofyear)', 'sin(hour)', 'cos(hour)',
        'country_holiday',
    ]

    # Continuous inputs start with the client.csv columns ...
    cont_names = ["installed_capacity", "eic_count"]

    # ... followed by weather columns selected by their join suffix
    # (forecast 0h/24h, then historical 24h/48h) ...
    weather_suffixes = (
        "_forecast_0h",
        "_forecast_local_0h",
        "_forecast_24h",
        "_forecast_local_24h",
        "_historical_24h",
        "_historical_48h",
        "_historical_local_48h",
    )
    for fragment in weather_suffixes:
        cont_names.extend(_columns_containing(fragment))

    # ... the lagged targets and their row statistics ...
    cont_names.extend(_columns_matching("target_.[0-9]*h"))
    cont_names.extend(['target_mean', 'target_std'])

    # ... the aggregated targets ...
    cont_names.extend(_columns_containing("target_all_"))

    # ... and the ratios between lagged targets.
    cont_names.extend(_columns_matching("target_ratio_.[0-9]"))

    # Log-transform the target directly on the caller's frame.
    df.loc[:, 'target'] = np.log1p(df['target'])

    # Build the fastai tabular object, following
    # https://docs.fast.ai/tabular.learner.html
    splits = RandomSplitter(valid_pct=0.2, seed = _seed)(df)
    model_columns = cat_names + cont_names + ["target"]
    to = TabularPandas(
        df[model_columns],
        procs = [Categorify, FillMissing, Normalize],
        cat_names = cat_names,
        cont_names = cont_names,
        y_names = ["target"],
        splits=splits,
    )

    # Use the GPU when one is available.
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return to.dataloaders(BATCH_SIZE, device = torch.device(device_name))

def add_custom_features(df):
    """
    Add the Estonian holiday flag and log-transform the target-derived columns.

    Function inspired by notebook:
    https://www.kaggle.com/code/albansteff/enefit-estonian-holidays-lb-65-79

    ``df`` is modified in place (the ``date`` column is removed) and returned.
    """
    # Holiday dates for 2021-2025, converted to timestamps for the isin lookup.
    holiday_calendar = holidays.country_holidays('EE', years=range(2021, 2026))
    holiday_stamps = [pd.to_datetime(day) for day in holiday_calendar.keys()]

    # Flag rows whose date is a holiday (1) or not (0); pop() removes `date`.
    holiday_flag = df.pop('date').isin(holiday_stamps) * 1
    df['country_holiday'] = holiday_flag

    # log1p for the lagged targets first, then for the aggregated targets.
    lagged_target_cols = df.filter(regex=("target_.[0-9]*h")).columns.tolist()
    aggregated_target_cols = [name for name in df.columns if "target_all_" in name]
    for name in lagged_target_cols + aggregated_target_cols:
        df.loc[:, name] = np.log1p(df[name])

    return df

In [100]:
class DataTransformer:
    """
    Add hand-crafted calendar, weather and target-derived columns to a pandas frame.

    The frame given to the constructor is modified in place; ``transform``
    returns that same object.
    """

    def __init__(self, df):
        self.df = df

    def transform(self):
        """Run every feature step in a fixed order and return the mutated frame."""
        self.add_season()
        self.add_daypart_with_sin_hour()
        self.add_feels_like_temperature()
        self.add_energy_usage_trend()
        self.add_temp_change()
        self.add_prec_change()
        self.add_autocorr_features()
        self.add_energy_price_volatility_and_trend()
        #self.perform_clustering()
        self.analyze_transit_and_charging_access()
        return self.df

    def add_season(self):
        """Add ``season``: 1 spring (Mar-May), 2 summer (Jun-Aug), 3 fall (Sep-Nov), 4 winter (anything else)."""
        month = self.df['month']
        self.df['season'] = np.select(
            [month.isin([3, 4, 5]), month.isin([6, 7, 8]), month.isin([9, 10, 11])],
            [1, 2, 3],
            default=4,
        )

    def add_daypart_with_sin_hour(self):
        """Add ``daypart``: 1 where ``sin(hour)`` is positive, 2 otherwise."""
        self.df['daypart'] = np.where(self.df['sin(hour)'] > 0, 1, 2)

    def add_feels_like_temperature(self):
        """
        Add ``feels_like_temp`` from temperature and the two wind components.

        Below a wind speed of 4.8 the temperature is used as is; otherwise the
        wind-chill polynomial is applied. Computed on whole columns instead of
        a row-wise ``DataFrame.apply``.

        NOTE(review): these coefficients look like the km/h wind-chill formula;
        confirm the units of the wind component columns.
        """
        temperature = self.df['temperature'].astype('float64')
        u_wind = self.df['10_metre_u_wind_component'].astype('float64')
        v_wind = self.df['10_metre_v_wind_component'].astype('float64')

        wind_speed = (u_wind ** 2 + v_wind ** 2) ** 0.5
        wind_factor = wind_speed ** 0.16
        wind_chill = (
            13.12
            + 0.6215 * temperature
            - 11.37 * wind_factor
            + 0.3965 * temperature * wind_factor
        )
        # Keep the plain temperature where the wind is calm, wind chill elsewhere.
        self.df['feels_like_temp'] = temperature.where(wind_speed < 4.8, wind_chill)

    def add_energy_usage_trend(self, period = 7):
        """
        Add ``energy_trend``: rolling mean of ``target`` over ``period`` rows.

        NOTE(review): the window runs over rows in frame order and includes the
        current row's target - confirm that this is intended.
        """
        self.df['energy_trend'] = self.df['target'].rolling(window = period).mean()

    def add_temp_change(self, interval = 24):
        """Add ``temp_change``: difference of ``temperature`` to the row ``interval`` rows earlier."""
        self.df['temp_change'] = self.df['temperature'].diff(periods = interval)

    def add_prec_change(self):
        """Add ``precipitation_change``: difference of ``total_precipitation`` to the previous row."""
        self.df['precipitation_change'] = self.df['total_precipitation'].diff()

    def add_autocorr_features(self, lags = 10):
        """
        Add the ACF/PACF values of the whole ``target`` column for lags 0..``lags``.

        Each value is a single number, so every new column is constant.
        NOTE(review): the PACF columns are named ``pacf_lag{i}`` (no underscore
        before the index), unlike ``acf_lag_{i}``; kept as is because later
        code may refer to these names.
        """
        target = self.df['target']
        acf_values = acf(target, nlags = lags)
        pacf_values = pacf(target, nlags = lags)
        for i in range(lags+1):
            self.df[f'acf_lag_{i}'] = acf_values[i]
            self.df[f'pacf_lag{i}'] = pacf_values[i]

    def add_energy_price_volatility_and_trend(self, window = 7):
        """
        Add ``energy_price_volatility``: rolling standard deviation of ``target``
        over ``window`` rows (despite the name, no price column is read).
        """
        self.df['energy_price_volatility'] = self.df['target'].rolling(window = window).std()

    # def perform_clustering(self, n_clusters = 3, features = None):
    #     if features is None:
    #         features = ['target_24h', 'target_48h', 'temperature', 'cloudcover_total']
    #     scaler = StandardScaler()
    #     scaled_data = scaler.fit_transform(self.df[features])

    #     kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    #     self.df['cluster'] = kmeans.fit_predict(scaled_data)

    def analyze_transit_and_charging_access(self):
        """
        Add four constant columns, two per ``is_consumption`` value (0 and 1):

        * ``transit_usage_estimate_{status}``: mean target of business rows
          minus mean target of non-business rows.
        * ``charging_access_estimate_{status}``: mean target of rows above the
          median installed capacity minus mean target of the remaining rows.
        """
        results = []
        for consumption_status in [0, 1]:
            subset = self.df[self.df['is_consumption'] == consumption_status]

            # "Public transit usage" proxy: business vs non-business mean target.
            business_mean = subset[subset['is_business'] == 1]['target'].mean()
            non_business_mean = subset[subset['is_business'] == 0]['target'].mean()
            transit_usage_estimate = business_mean - non_business_mean

            # "EV charging access" proxy: high vs low installed capacity mean target.
            median_capacity = subset['installed_capacity'].median()
            above_median = subset['installed_capacity'] > median_capacity
            high_capacity_mean = subset[above_median]['target'].mean()
            low_capacity_mean = subset[subset['installed_capacity'] <= median_capacity]['target'].mean()
            charging_access_estimate = high_capacity_mean - low_capacity_mean

            results.append((consumption_status, transit_usage_estimate, charging_access_estimate))

        # Broadcast each estimate into its own column.
        for consumption_status, transit_estimate, charging_estimate in results:
            self.df[f'transit_usage_estimate_{consumption_status}'] = transit_estimate
            self.df[f'charging_access_estimate_{consumption_status}'] = charging_estimate

In [101]:
import numpy as np #wind  
import pandas as pd

# Transform train_dataset columns (weekday -> is_weekend flag; wind speed/direction -> U10, V10)
class TrainDataTransform:
    """
    Add a weekend flag and U/V wind components to a pandas frame.

    The frame given to the constructor is modified in place; ``transform``
    returns that same object.
    """

    def __init__(self, df):
        self.df = df

    def transform(self):
        """Apply both steps and return the mutated frame."""
        self.is_weekend()
        self.wind_data_to_UV()
        return self.df

    def is_weekend(self):
        """
        Add ``is_weekend``: 1 for Saturday and Sunday, 0 otherwise.

        ``weekday`` is produced by polars ``dt.weekday()`` in
        ``FeaturesGenerator._add_general_features``, which numbers days
        Monday = 1 ... Sunday = 7, so the weekend is ``weekday > 5``. The
        previous ``> 4`` threshold also flagged Friday.
        """
        self.df['is_weekend'] = np.where(self.df['weekday'] > 5, 1, 0)

    def wind_data_to_UV(self):
        """
        Add ``U10`` and ``V10`` from ``windspeed_10m`` and ``winddirection_10m``
        (direction in degrees), using the angle ``270 - direction``.
        """
        angle = np.radians(270 - self.df['winddirection_10m'])
        speed = self.df['windspeed_10m']
        self.df['U10'] = speed * np.cos(angle)
        self.df['V10'] = speed * np.sin(angle)
        

In [102]:
# Shared data container and the feature generator built around it
# (DataStorage and FeaturesGenerator are defined in earlier cells).
data_storage = DataStorage()
features_generator = FeaturesGenerator(data_storage=data_storage)

In [103]:
# Generate features for the stored data, keep only rows that have a target
# value, then add the custom features (Estonian holidays).
train_dataset = add_custom_features(
    features_generator
    .generate_features(data_storage.df_data)
    .loc[lambda frame: frame['target'].notnull()]
)

In [None]:
# Plain-list snapshot of the feature column names.
clist = list(train_dataset.columns)
print(clist)

In [104]:
# Feature transforms added by Yelim
train_dataset = DataTransformer(train_dataset).transform()

# Feature transforms added by Joonyong
train_dataset = TrainDataTransform(train_dataset).transform()

In [105]:
# Preview the first rows of the transformed training frame.
train_dataset.head()

Unnamed: 0_level_0,county,is_business,product_type,is_consumption,prediction_unit_id,day,weekday,month,year,segment,sin(dayofyear),cos(dayofyear),sin(hour),cos(hour),eic_count,installed_capacity,temperature,dewpoint,cloudcover_high,cloudcover_low,cloudcover_mid,cloudcover_total,10_metre_u_wind_component,10_metre_v_wind_component,direct_solar_radiation,surface_solar_radiation_downwards,snowfall,total_precipitation,temperature_forecast_local_0h,dewpoint_forecast_local_0h,cloudcover_high_forecast_local_0h,cloudcover_low_forecast_local_0h,cloudcover_mid_forecast_local_0h,cloudcover_total_forecast_local_0h,10_metre_u_wind_component_forecast_local_0h,10_metre_v_wind_component_forecast_local_0h,direct_solar_radiation_forecast_local_0h,surface_solar_radiation_downwards_forecast_local_0h,snowfall_forecast_local_0h,total_precipitation_forecast_local_0h,temperature_forecast_168h,dewpoint_forecast_168h,cloudcover_high_forecast_168h,cloudcover_low_forecast_168h,cloudcover_mid_forecast_168h,cloudcover_total_forecast_168h,10_metre_u_wind_component_forecast_168h,10_metre_v_wind_component_forecast_168h,direct_solar_radiation_forecast_168h,surface_solar_radiation_downwards_forecast_168h,snowfall_forecast_168h,total_precipitation_forecast_168h,temperature_forecast_local_168h,dewpoint_forecast_local_168h,cloudcover_high_forecast_local_168h,cloudcover_low_forecast_local_168h,cloudcover_mid_forecast_local_168h,cloudcover_total_forecast_local_168h,10_metre_u_wind_component_forecast_local_168h,10_metre_v_wind_component_forecast_local_168h,direct_solar_radiation_forecast_local_168h,surface_solar_radiation_downwards_forecast_local_168h,snowfall_forecast_local_168h,total_precipitation_forecast_local_168h,temperature_historical_48h,dewpoint_historical_48h,rain,snowfall_historical_48h,surface_pressure,cloudcover_total_historical_48h,cloudcover_low_historical_48h,cloudcover_mid_historical_48h,cloudcover_high_historical_48h,windspeed_10m,winddirection_10m,shortwave_radiation,direct_solar_radiatio
n_historical_48h,diffuse_radiation,temperature_historical_local_48h,dewpoint_historical_local_48h,rain_historical_local_48h,snowfall_historical_local_48h,surface_pressure_historical_local_48h,cloudcover_total_historical_local_48h,cloudcover_low_historical_local_48h,cloudcover_mid_historical_local_48h,cloudcover_high_historical_local_48h,windspeed_10m_historical_local_48h,winddirection_10m_historical_local_48h,shortwave_radiation_historical_local_48h,direct_solar_radiation_historical_local_48h,diffuse_radiation_historical_local_48h,temperature_historical_168h,dewpoint_historical_168h,rain_historical_168h,snowfall_historical_168h,surface_pressure_historical_168h,cloudcover_total_historical_168h,cloudcover_low_historical_168h,cloudcover_mid_historical_168h,cloudcover_high_historical_168h,windspeed_10m_historical_168h,winddirection_10m_historical_168h,shortwave_radiation_historical_168h,direct_solar_radiation_historical_168h,diffuse_radiation_historical_168h,temperature_historical_local_168h,dewpoint_historical_local_168h,rain_historical_local_168h,snowfall_historical_local_168h,surface_pressure_historical_local_168h,cloudcover_total_historical_local_168h,cloudcover_low_historical_local_168h,cloudcover_mid_historical_local_168h,cloudcover_high_historical_local_168h,windspeed_10m_historical_local_168h,winddirection_10m_historical_local_168h,shortwave_radiation_historical_local_168h,direct_solar_radiation_historical_local_168h,diffuse_radiation_historical_local_168h,temperature_historical_24h,dewpoint_historical_24h,rain_historical_24h,snowfall_historical_24h,surface_pressure_historical_24h,cloudcover_total_historical_24h,cloudcover_low_historical_24h,cloudcover_mid_historical_24h,cloudcover_high_historical_24h,windspeed_10m_historical_24h,winddirection_10m_historical_24h,shortwave_radiation_historical_24h,direct_solar_radiation_historical_24h,diffuse_radiation_historical_24h,target_48h,target_72h,target_96h,target_120h,target_144h,target_168h,target_192h,target_216h,targ
et_240h,target_264h,target_288h,target_312h,target_336h,target_6h,target_12h,target_84h,target_3096h,target_all_type_sum_48h,target_all_county_type_sum_48h,target_all_type_sum_72h,target_all_county_type_sum_72h,target_all_type_sum_168h,target_all_county_type_sum_168h,target_all_type_sum_336h,target_all_county_type_sum_336h,target_mean,target_std,target_ratio_168_336,target_ratio_48_216,target_ratio_72_240,target_ratio_48_72,target_all_type_sum_ratio_48_72,target_all_type_sum_ratio_168_336,target_all_county_type_sum_ratio_48_72,target_all_county_type_sum_ratio_168_336,target,country_holiday,season,daypart,feels_like_temp,energy_trend,temp_change,precipitation_change,acf_lag_0,pacf_lag0,acf_lag_1,pacf_lag1,acf_lag_2,pacf_lag2,acf_lag_3,pacf_lag3,acf_lag_4,pacf_lag4,acf_lag_5,pacf_lag5,acf_lag_6,pacf_lag6,acf_lag_7,pacf_lag7,acf_lag_8,pacf_lag8,acf_lag_9,pacf_lag9,acf_lag_10,pacf_lag10,energy_price_volatility,transit_usage_estimate_0,charging_access_estimate_0,transit_usage_estimate_1,charging_access_estimate_1,is_weekend,U10,V10
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 
195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1
366048,0,0,1,0,0,1,6,1,2022,0_0_1_0,0.017166,0.999853,0.0,1.0,148.0,1345.689941,-0.246805,-0.777223,0.367615,0.93042,0.591207,0.995787,0.617221,-0.222493,0.0,0.0,1.6e-05,6.5e-05,-4.124589,-4.592993,0.749156,0.923236,0.957959,0.999995,0.002135,0.237211,0.0,0.0,8.940697e-07,7.763738e-07,-7.310143,-10.651634,0.003465,0.534964,0.304186,0.719171,3.110685,-4.115274,0.0,0.0,5e-05,5e-05,-8.073828,-10.480595,0.0,0.795092,0.324187,0.922786,2.260051,-3.338259,0.0,0.0,6.1e-05,6.1e-05,-5.085714,-7.095536,0.003571,0.109375,1006.891052,99.70536,94.321426,97.33036,85.60714,4.588789,178.910721,4.6875,1.348214,3.339286,-6.283333,-8.066667,0.0,0.198333,1005.150024,100.0,97.166664,100.0,97.666664,4.407407,168.666672,0.0,0.0,0.0,-8.976786,-11.715178,0.0,0.016875,985.258911,71.20536,55.07143,35.705357,2.232143,5.225695,320.589294,7.321429,4.178571,3.142857,-11.15,-13.433333,0.0,0.035,983.799988,71.666664,79.666664,0.166667,0.0,4.884259,343.833344,0.0,0.0,0.0,-0.446429,-1.522321,0.079464,0.031875,996.402649,98.678574,96.553574,76.79464,72.73214,6.972718,203.258926,9.321428,6.375,2.946429,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1,4,2,-0.246805,,,,1.0,1.0,-0.026534,-0.026534,0.004919,0.004218,-0.066948,-0.066755,0.09216,0.089088,-0.055721,-0.051705,0.06068,0.054661,-0.068467,-0.055862,0.124669,0.110848,-0.007858,0.010192,0.005509,-0.01286,,-23.837394,156.68901,606.576062,732.791935,1,-0.087235,4.58796
366049,0,0,1,1,0,1,6,1,2022,0_0_1_1,0.017166,0.999853,0.0,1.0,148.0,1345.689941,-0.246805,-0.777223,0.367615,0.93042,0.591207,0.995787,0.617221,-0.222493,0.0,0.0,1.6e-05,6.5e-05,-4.124589,-4.592993,0.749156,0.923236,0.957959,0.999995,0.002135,0.237211,0.0,0.0,8.940697e-07,7.763738e-07,-7.310143,-10.651634,0.003465,0.534964,0.304186,0.719171,3.110685,-4.115274,0.0,0.0,5e-05,5e-05,-8.073828,-10.480595,0.0,0.795092,0.324187,0.922786,2.260051,-3.338259,0.0,0.0,6.1e-05,6.1e-05,-5.085714,-7.095536,0.003571,0.109375,1006.891052,99.70536,94.321426,97.33036,85.60714,4.588789,178.910721,4.6875,1.348214,3.339286,-6.283333,-8.066667,0.0,0.198333,1005.150024,100.0,97.166664,100.0,97.666664,4.407407,168.666672,0.0,0.0,0.0,-8.976786,-11.715178,0.0,0.016875,985.258911,71.20536,55.07143,35.705357,2.232143,5.225695,320.589294,7.321429,4.178571,3.142857,-11.15,-13.433333,0.0,0.035,983.799988,71.666664,79.666664,0.166667,0.0,4.884259,343.833344,0.0,0.0,0.0,-0.446429,-1.522321,0.079464,0.031875,996.402649,98.678574,96.553574,76.79464,72.73214,6.972718,203.258926,9.321428,6.375,2.946429,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,442.226,1,4,2,-0.246805,,,0.0,1.0,1.0,-0.026534,-0.026534,0.004919,0.004218,-0.066948,-0.066755,0.09216,0.089088,-0.055721,-0.051705,0.06068,0.054661,-0.068467,-0.055862,0.124669,0.110848,-0.007858,0.010192,0.005509,-0.01286,,-23.837394,156.68901,606.576062,732.791935,1,-0.087235,4.58796
366050,0,0,2,0,1,1,6,1,2022,0_0_2_0,0.017166,0.999853,0.0,1.0,16.0,153.699997,-0.246805,-0.777223,0.367615,0.93042,0.591207,0.995787,0.617221,-0.222493,0.0,0.0,1.6e-05,6.5e-05,-4.124589,-4.592993,0.749156,0.923236,0.957959,0.999995,0.002135,0.237211,0.0,0.0,8.940697e-07,7.763738e-07,-7.310143,-10.651634,0.003465,0.534964,0.304186,0.719171,3.110685,-4.115274,0.0,0.0,5e-05,5e-05,-8.073828,-10.480595,0.0,0.795092,0.324187,0.922786,2.260051,-3.338259,0.0,0.0,6.1e-05,6.1e-05,-5.085714,-7.095536,0.003571,0.109375,1006.891052,99.70536,94.321426,97.33036,85.60714,4.588789,178.910721,4.6875,1.348214,3.339286,-6.283333,-8.066667,0.0,0.198333,1005.150024,100.0,97.166664,100.0,97.666664,4.407407,168.666672,0.0,0.0,0.0,-8.976786,-11.715178,0.0,0.016875,985.258911,71.20536,55.07143,35.705357,2.232143,5.225695,320.589294,7.321429,4.178571,3.142857,-11.15,-13.433333,0.0,0.035,983.799988,71.666664,79.666664,0.166667,0.0,4.884259,343.833344,0.0,0.0,0.0,-0.446429,-1.522321,0.079464,0.031875,996.402649,98.678574,96.553574,76.79464,72.73214,6.972718,203.258926,9.321428,6.375,2.946429,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1,4,2,-0.246805,,,0.0,1.0,1.0,-0.026534,-0.026534,0.004919,0.004218,-0.066948,-0.066755,0.09216,0.089088,-0.055721,-0.051705,0.06068,0.054661,-0.068467,-0.055862,0.124669,0.110848,-0.007858,0.010192,0.005509,-0.01286,,-23.837394,156.68901,606.576062,732.791935,1,-0.087235,4.58796
366051,0,0,2,1,1,1,6,1,2022,0_0_2_1,0.017166,0.999853,0.0,1.0,16.0,153.699997,-0.246805,-0.777223,0.367615,0.93042,0.591207,0.995787,0.617221,-0.222493,0.0,0.0,1.6e-05,6.5e-05,-4.124589,-4.592993,0.749156,0.923236,0.957959,0.999995,0.002135,0.237211,0.0,0.0,8.940697e-07,7.763738e-07,-7.310143,-10.651634,0.003465,0.534964,0.304186,0.719171,3.110685,-4.115274,0.0,0.0,5e-05,5e-05,-8.073828,-10.480595,0.0,0.795092,0.324187,0.922786,2.260051,-3.338259,0.0,0.0,6.1e-05,6.1e-05,-5.085714,-7.095536,0.003571,0.109375,1006.891052,99.70536,94.321426,97.33036,85.60714,4.588789,178.910721,4.6875,1.348214,3.339286,-6.283333,-8.066667,0.0,0.198333,1005.150024,100.0,97.166664,100.0,97.666664,4.407407,168.666672,0.0,0.0,0.0,-8.976786,-11.715178,0.0,0.016875,985.258911,71.20536,55.07143,35.705357,2.232143,5.225695,320.589294,7.321429,4.178571,3.142857,-11.15,-13.433333,0.0,0.035,983.799988,71.666664,79.666664,0.166667,0.0,4.884259,343.833344,0.0,0.0,0.0,-0.446429,-1.522321,0.079464,0.031875,996.402649,98.678574,96.553574,76.79464,72.73214,6.972718,203.258926,9.321428,6.375,2.946429,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,44.899,1,4,2,-0.246805,,,0.0,1.0,1.0,-0.026534,-0.026534,0.004919,0.004218,-0.066948,-0.066755,0.09216,0.089088,-0.055721,-0.051705,0.06068,0.054661,-0.068467,-0.055862,0.124669,0.110848,-0.007858,0.010192,0.005509,-0.01286,,-23.837394,156.68901,606.576062,732.791935,1,-0.087235,4.58796
366052,0,0,3,0,2,1,6,1,2022,0_0_3_0,0.017166,0.999853,0.0,1.0,739.0,7638.620117,-0.246805,-0.777223,0.367615,0.93042,0.591207,0.995787,0.617221,-0.222493,0.0,0.0,1.6e-05,6.5e-05,-4.124589,-4.592993,0.749156,0.923236,0.957959,0.999995,0.002135,0.237211,0.0,0.0,8.940697e-07,7.763738e-07,-7.310143,-10.651634,0.003465,0.534964,0.304186,0.719171,3.110685,-4.115274,0.0,0.0,5e-05,5e-05,-8.073828,-10.480595,0.0,0.795092,0.324187,0.922786,2.260051,-3.338259,0.0,0.0,6.1e-05,6.1e-05,-5.085714,-7.095536,0.003571,0.109375,1006.891052,99.70536,94.321426,97.33036,85.60714,4.588789,178.910721,4.6875,1.348214,3.339286,-6.283333,-8.066667,0.0,0.198333,1005.150024,100.0,97.166664,100.0,97.666664,4.407407,168.666672,0.0,0.0,0.0,-8.976786,-11.715178,0.0,0.016875,985.258911,71.20536,55.07143,35.705357,2.232143,5.225695,320.589294,7.321429,4.178571,3.142857,-11.15,-13.433333,0.0,0.035,983.799988,71.666664,79.666664,0.166667,0.0,4.884259,343.833344,0.0,0.0,0.0,-0.446429,-1.522321,0.079464,0.031875,996.402649,98.678574,96.553574,76.79464,72.73214,6.972718,203.258926,9.321428,6.375,2.946429,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.015,1,4,2,-0.246805,,,0.0,1.0,1.0,-0.026534,-0.026534,0.004919,0.004218,-0.066948,-0.066755,0.09216,0.089088,-0.055721,-0.051705,0.06068,0.054661,-0.068467,-0.055862,0.124669,0.110848,-0.007858,0.010192,0.005509,-0.01286,,-23.837394,156.68901,606.576062,732.791935,1,-0.087235,4.58796


In [None]:
# Show every column name as a plain list.
list(train_dataset.columns)

In [None]:
# Distinct forecast horizons that fall inside the 22-45 hours-ahead window.
data_storage.df_forecast_weather.filter(
    (pl.col("hours_ahead") >= 22) & (pl.col("hours_ahead") <= 45)
)['hours_ahead'].unique()


In [None]:
# Report every column that still contains missing values, with its count.
for column_name in train_dataset.columns:
    n_missing = train_dataset[column_name].isnull().sum()
    if n_missing > 0:
        print(column_name, n_missing)

In [None]:
def pd_to_polars(df):
    """Return a Polars DataFrame built from a copy of a pandas DataFrame.

    Categorical columns whose categories are integers or floats are cast to
    plain ``int`` / ``float`` first (one message is printed per cast column).
    Categorical columns with any other category dtype are passed through
    unchanged. The input frame itself is not modified.
    """
    converted = df.copy()
    for col, dtype in converted.dtypes.items():
        if not isinstance(dtype, pd.CategoricalDtype):
            continue
        categories_dtype = dtype.categories.dtype
        if pd.api.types.is_integer_dtype(categories_dtype):
            converted[col] = converted[col].astype(int)
            print(f"Column [{col}] cast to int")
        elif pd.api.types.is_float_dtype(categories_dtype):
            converted[col] = converted[col].astype(float)
            print(f"Column [{col}] cast to float")

    return pl.from_pandas(converted)


# Polars copy of the training frame, used for ad-hoc inspection.
test2 = pd_to_polars(train_dataset)


In [None]:
# Consumption rows of prediction unit 21 for 1-9 February 2022.
test2.filter(
    (pl.col('prediction_unit_id') == 21)
    & (pl.col('year') == 2022)
    & (pl.col('month') == 2)
    & (pl.col('day') < 10)
    & (pl.col('is_consumption') == 1)
).to_pandas()

In [None]:
from datetime import datetime

In [None]:
# Client records of county 5 dated 2022-02-01.
data_storage.df_client.filter(
    (pl.col('date') == datetime(2022, 2, 1)) & (pl.col('county') == 5)
)

In [None]:
# Row count for every distinct (eic_count, installed_capacity) pair.
train_dataset.groupby(['eic_count', 'installed_capacity']).size().reset_index()

In [None]:
# Value distribution of every column, restricted to rows without an eic_count.
test = train_dataset[train_dataset['eic_count'].isnull()]
for _, column in test.items():
    print(column.value_counts())
    print('\n')

In [None]:
# Value counts of every client column except eic_count.
for column_name in (
    name for name in data_storage.df_client.columns if name != 'eic_count'
):
    print(column_name, data_storage.df_client[column_name].value_counts())

In [None]:
#train_dataset.to_csv('/Data/home/limkim/Enefit/Kaggle2024_Enefit/baseline.csv', index = False)

# Modeling

In [None]:
# Training frame: only rows with a known target.
has_target = train_dataset['target'].notnull()
df_train_features = train_dataset[has_target]
df_train_features.info()

In [None]:
# Learning-rate schedule parameters: linear ramp-up, plateau, cosine decay.
LR_START = 1e-7
LR_MAX = 1e-3
LR_MIN = 1e-7
LR_RAMPUP_EPOCHS = 2
LR_SUSTAIN_EPOCHS = 2
EPOCHS = 20

def lrfn(epoch):
    """Return the learning rate for a zero-based ``epoch`` index.

    Schedule:
      * ``epoch < LR_RAMPUP_EPOCHS``: linear ramp from ``LR_START`` towards ``LR_MAX``.
      * the next ``LR_SUSTAIN_EPOCHS`` epochs: constant ``LR_MAX``.
      * afterwards: cosine decay from ``LR_MAX`` to ``LR_MIN``, reaching
        ``LR_MIN`` at epoch ``EPOCHS - 1``.

    Epochs past ``EPOCHS - 1`` stay at ``LR_MIN`` instead of following the
    cosine back up. If no epochs are left for a decay curve, ``LR_MIN`` is
    returned instead of dividing by zero.
    """
    if epoch < LR_RAMPUP_EPOCHS:
        return (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS * epoch + LR_START
    if epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        return LR_MAX

    decay_total_epochs = EPOCHS - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS - 1
    if decay_total_epochs <= 0:
        # No room for a decay curve: go straight to the floor.
        return LR_MIN

    # Clamp so the phase never exceeds pi (cos would start rising again).
    decay_epoch_index = min(
        epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS, decay_total_epochs
    )
    phase = math.pi * decay_epoch_index / decay_total_epochs
    cosine_decay = 0.5 * (1 + math.cos(phase))
    return (LR_MAX - LR_MIN) * cosine_decay + LR_MIN

# Evaluate the schedule for EPOCHS epochs and plot it. Note that EPOCHS is
# independent of CFG.epochs, which is what the DeepTable fit calls use.
rng = list(range(EPOCHS))
lr_y = [lrfn(epoch) for epoch in rng]
plt.figure(figsize=(10, 4))
plt.plot(rng, lr_y, '-o')
plt.xlabel('Epoch')
plt.ylabel('LR')
# All three values use 3 significant digits; the original '{:3g}' set a
# minimum field width instead of a precision.
print('Learning Rate schedule : {:.3g} to {:.3g} to {:.3g}'.format(
    lr_y[0], max(lr_y), lr_y[-1]))
# Keras callback wrapping the schedule; it only takes effect if it is passed
# to fit() via the callbacks list.
LR = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=True)

In [None]:
%%time
class CFG:
    """Run-wide switches and hyper-parameters read by ``Model``."""
    nn = True  # train / use the DeepTables neural-network models
    lgb = True  # train / use the LightGBM models
    ens_weights = {'nn': 0.5, 'lgb': 0.5}  # blend weights used by Model.predict when both families are on
    epochs = 10  # passed as `epochs` to the DeepTable fit calls
    batch_size = 512  # passed as `batch_size` to the DeepTable fit calls
    valid_size = 5e-2  # passed as `validation_split` to the DeepTable fit calls
    LR_Scheduler = []  # [LR]  (passed as `callbacks`; the empty list leaves the LR schedule unused)
    # Created once, at class-definition time; the same instance is handed to
    # ModelConfig and is therefore shared by every DeepTable built from it.
    optimizer = AdamW(learning_rate=1e-3, weight_decay=9e-7)
     
class Model:
    """Paired DeepTables and LightGBM regressors for consumption and production.

    Every model is trained on the residual ``target - target_48h`` (a missing
    ``target_48h`` counts as 0), separately for rows with
    ``is_consumption == 1`` and ``is_consumption == 0``. ``predict`` adds
    ``target_48h`` back, combines the enabled model families and clips the
    result at zero. ``CFG.nn`` / ``CFG.lgb`` select the active families.
    """

    def __init__(self):
        self.conf = ModelConfig(auto_imputation=True,
                                auto_discrete=True,
                                auto_discard_unique=True,
                                categorical_columns='auto',
                                fixed_embedding_dim=False,
                                embeddings_output_dim=4,
                                embedding_dropout=0.3,
                                nets=['dnn_nets'],
                                dnn_params={
                                    'hidden_units': ((512, 0.3, True),
                                                     (256, 0.3, True)),
                                    'dnn_activation': 'relu',
                                },
                                stacking_op='add',
                                output_use_bias=False,
                                optimizer=CFG.optimizer,
                                task='regression',
                                loss='MeanAbsoluteError',
                                metrics='MeanAbsoluteError',
                                earlystopping_patience=1,
                                )

        self.lgb_params = {"n_estimators": 2500,
                           "learning_rate": 0.06,
                           "max_depth": 16,
                           "num_leaves": 500,
                           "reg_alpha": 3.5,
                           "reg_lambda": 1.5,
                           "colsample_bytree": 0.9,
                           "colsample_bynode": 0.6,
                           "min_child_samples": 50,
                           "random_state": 0,
                           "objective": "regression_l1",
                           "device": "gpu",
                           "n_jobs": 4,
                           "verbose": -1,
                           }

        self.nn_model_consumption = DeepTable(config=self.conf)
        self.nn_model_production = DeepTable(config=self.conf)

        self.lgb_model_consumption = lgb.LGBMRegressor(**self.lgb_params)
        self.lgb_model_production = lgb.LGBMRegressor(**self.lgb_params)

    @staticmethod
    def _training_pair(df_train_features, is_consumption):
        """Return ``(X, y)`` for one segment; ``y`` is ``target - target_48h``."""
        segment = df_train_features[df_train_features["is_consumption"] == is_consumption]
        features = segment.drop(columns=["target"])
        residual = segment["target"] - segment["target_48h"].fillna(0)
        return features, residual

    def _fit_nn(self, nn_model, df_train_features, is_consumption):
        """Fit one DeepTable on one segment with the settings from ``CFG``."""
        features, residual = self._training_pair(df_train_features, is_consumption)
        nn_model.fit(
            X=features,
            y=residual,
            validation_split=CFG.valid_size, shuffle=False,
            batch_size=CFG.batch_size, epochs=CFG.epochs, verbose=2,
            callbacks=CFG.LR_Scheduler
        )

    def _fit_lgb(self, lgb_model, df_train_features, is_consumption):
        """Fit one LightGBM regressor on one segment."""
        features, residual = self._training_pair(df_train_features, is_consumption)
        lgb_model.fit(X=features, y=residual)

    def fit(self, df_train_features):
        """Train every enabled model on ``df_train_features``.

        Args:
            df_train_features: pandas DataFrame containing at least ``target``,
                ``target_48h`` and ``is_consumption``; all columns except
                ``target`` are used as model inputs.
        """
        print('nn = '+str(CFG.nn))
        print('lgb = '+str(CFG.lgb))

        if CFG.nn:
            print('\n',"nn model consumption training.",'\n')
            self._fit_nn(self.nn_model_consumption, df_train_features, 1)

            # Avoid saving error: the optimizer instance is shared through CFG,
            # so its weight variables are re-created under fresh names before
            # the production network is rebuilt around it.
            with K.name_scope(CFG.optimizer.__class__.__name__):
                for i, var in enumerate(CFG.optimizer.weights):
                    name = 'variable{}'.format(i)
                    CFG.optimizer.weights[i] = tf.Variable(var, name=name)
            self.conf = self.conf._replace(optimizer=CFG.optimizer)
            self.nn_model_production = DeepTable(config=self.conf)

            print('\n',"nn model production training.",'\n')
            self._fit_nn(self.nn_model_production, df_train_features, 0)

        if CFG.lgb:
            print('\n',"lgb model consumption training.")
            self._fit_lgb(self.lgb_model_consumption, df_train_features, 1)

            print('\n',"lgb model production training.",'\n')
            self._fit_lgb(self.lgb_model_production, df_train_features, 0)

    def plot_nn_model(self):
        """Return the Keras plot of the consumption network, or None if ``CFG.nn`` is off."""
        if CFG.nn:
            return plot_model(self.nn_model_consumption.get_model().model)

    def _predict_segment(self, segment, nn_model, lgb_model):
        """Predict one segment: ``target_48h`` plus the model residual(s), clipped at 0.

        With both families enabled the residuals are weighted by
        ``CFG.ens_weights``; with a single family its residual is added as is.
        """
        blend = CFG.nn and CFG.lgb
        total = segment["target_48h"].fillna(0).values
        if CFG.nn:
            # DeepTable returns a 2-D array; column 0 holds the regression output.
            nn_residual = nn_model.predict(segment)[:, 0]
            total = total + (CFG.ens_weights['nn'] * nn_residual if blend else nn_residual)
        if CFG.lgb:
            lgb_residual = lgb_model.predict(segment)
            total = total + (CFG.ens_weights['lgb'] * lgb_residual if blend else lgb_residual)
        return np.clip(total, 0, np.inf)

    def predict(self, df_features):
        """Predict the target for every row of ``df_features``.

        Args:
            df_features: pandas DataFrame with ``is_consumption``,
                ``target_48h`` and the feature columns the models were fitted on.

        Returns:
            1-D ``numpy.ndarray`` of non-negative predictions, positionally
            aligned with the rows of ``df_features``.

        Raises:
            ValueError: if neither ``CFG.nn`` nor ``CFG.lgb`` is enabled.
        """
        if not (CFG.nn or CFG.lgb):
            raise ValueError("No models has been trained.")

        predictions = np.zeros(len(df_features))
        label = " & ".join(
            name for name, enabled in (("nn", CFG.nn), ("lgb", CFG.lgb)) if enabled
        )
        segments = (
            ("consumption", 1, self.nn_model_consumption, self.lgb_model_consumption),
            ("production", 0, self.nn_model_production, self.lgb_model_production),
        )
        for segment_name, flag, nn_model, lgb_model in segments:
            print('\n', f"{label} model {segment_name} prediction.", '\n')
            mask = df_features["is_consumption"] == flag
            if not mask.any():
                # Nothing of this kind in the batch: do not call the models
                # on an empty frame.
                continue
            predictions[mask.values] = self._predict_segment(
                df_features[mask], nn_model, lgb_model
            )

        return predictions
    
    
# Build the model bundle and train every family enabled in CFG.
model = Model()
model.fit(df_train_features)

# Persist the two LightGBM regressors into the working directory.
joblib.dump(model.lgb_model_consumption, 'lgb_model_consumption.joblib')
joblib.dump(model.lgb_model_production, 'lgb_model_production.joblib')

# Save the Keras models wrapped by the DeepTable objects as HDF5 files.
# NOTE(review): these lines run even when CFG.nn is False, in which case the
# DeepTable objects were never fitted -- confirm get_model() works then.
nn_model_consumption = model.nn_model_consumption.get_model().model
nn_model_consumption.save('nn_model_consumption.h5')

nn_model_production = model.nn_model_production.get_model().model
nn_model_production.save('nn_model_production.h5')


In [None]:
# # When using previously saved models

# lgb_model_consumption = joblib.load('lgb_model_consumption.joblib')
# lgb_model_production = joblib.load('lgb_model_production.joblib')

# from tensorflow.keras.models import load_model
# model_file = 'nn_model_production.h5'
# loaded_model = load_model(model_file)

In [None]:
# Render the graph of the consumption network (None is returned when CFG.nn is off).
model.plot_nn_model()

# Submit API

import enefit

# Create the competition environment and the iterator over its test batches.
env = enefit.make_env()
iter_test = env.iter_test()

In [None]:
%%time
# Submission loop: for every batch served by the API, fold the newly revealed
# data into data_storage, rebuild the features, predict and submit.
for (
    df_test, 
    df_new_target, 
    df_new_client, 
    df_new_historical_weather,
    df_new_forecast_weather, 
    df_new_electricity_prices, 
    df_new_gas_prices, 
    df_sample_prediction
) in iter_test:

    # Extend the stored history first so the features can use the new data.
    data_storage.update_with_new_data(
        df_new_client=df_new_client,
        df_new_gas_prices=df_new_gas_prices,
        df_new_electricity_prices=df_new_electricity_prices,
        df_new_forecast_weather=df_new_forecast_weather,
        df_new_historical_weather=df_new_historical_weather,
        df_new_target=df_new_target
    )
    df_test = data_storage.preprocess_test(df_test)
    
    df_test_features = features_generator.generate_features(df_test)

    #add yelim
    # NOTE(review): the transformer methods visible in this notebook read a
    # `target` column; if they belong to DataTransformer, confirm that
    # df_test_features actually carries one at inference time.
    test_dataset = DataTransformer(df_test_features)
    test_dataset = test_dataset.transform()

    #add joonyong
    test_dataset = TrainDataTransform(test_dataset)
    test_dataset = test_dataset.transform()

    # NOTE(review): DataStorageTransform is not applied to train_dataset in the
    # training cells -- confirm train and test end up with the same features.
    test_dataset = DataStorageTransform(test_dataset)
    df_test_features = test_dataset.transform()
    
    # Fill the sample frame with predictions and hand it back to the API.
    df_sample_prediction["target"] = model.predict(df_test_features)
    
    env.predict(df_sample_prediction)