In [None]:
# Debug switch: when True the training set is subsampled right after loading
# and the enefit test iterator is re-initialised before the submission loop.
debug = False
# Number of rows drawn from train.csv when `debug` is True.
n_rows_debug = 100000

In [None]:
import pandas as pd
import numpy as np
from datetime import date, datetime, timedelta
import matplotlib.pyplot as plt
import math
import collections
import time
from copy import deepcopy
import seaborn as sns
import os, gc
from tqdm import tqdm
import re

from statsmodels.graphics.tsaplots import plot_acf

""" SCIKIT-LEARN """
from sklearn import preprocessing
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, confusion_matrix, accuracy_score, mean_squared_error, roc_auc_score, roc_curve
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold, TimeSeriesSplit
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor

from collections import Counter, defaultdict

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb
import joblib

pd.set_option('display.float_format', lambda x: '%.5f' % x) # No scientific notation (fixed 5 decimals)
pd.set_option('display.max_columns', 100) # Show up to 100 columns when displaying a DataFrame

In [None]:
import polars as pl

def timestamp_UTC_conversion(train, date_col) :
    """Parse a string datetime column with polars and add `date` / `time` columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame holding `date_col` as strings.
    date_col : str
        Name of the column to parse.

    Returns
    -------
    pandas.DataFrame
        A new frame with two extra columns: `date` (calendar date) and `time`
        (time of day), both derived from `date_col`.

    Notes
    -----
    Despite the function name, no time-zone conversion is applied: values are
    parsed exactly as written.
    """

    # A source column literally named 'date' would clash with the new 'date'
    # alias, so it is moved aside to 'date_' first.
    source_col = date_col
    frame = train
    if source_col == 'date' :
        frame = frame.rename(columns = {'date' : 'date_'})
        source_col = 'date_'

    # Single parsing expression reused for both derived columns
    parsed = pl.col(source_col).str.to_datetime()
    derived = [
        parsed.dt.date().alias('date'), # Calendar date
        parsed.dt.time().alias('time'), # Time of day
    ]

    # pandas -> polars -> pandas round trip
    polars_frame = pl.from_pandas(frame).with_columns(derived)
    return polars_frame.to_pandas()

In [None]:
def process_date(train, date_col, prefixe = '', extract_features = False) :
    """Normalise a datetime column and optionally derive calendar features.

    Parameters
    ----------
    train : pandas.DataFrame
        Input frame. Note that `train[date_col]` is cast to `str` on the
        caller's object before a new frame is built.
    date_col : str
        Name of the datetime column to process.
    prefixe : str, default ''
        When non-empty, the year/month/day/hour/minutes/seconds columns are
        renamed to `f"{prefixe}_{name}"` (`dayofweek` is left unprefixed).
    extract_features : bool, default False
        When False only `date`, `time` and the rewritten `date_col` are added.

    Returns
    -------
    pandas.DataFrame
        Frame with string `date` and `time` columns, `date_col` rewritten as
        `"<date> <time>"`, plus the calendar features when requested.
    """

    # Force to str (to avoid datetime error)
    train[date_col] = train[date_col].astype(str)

    # Parse with polars and add the `date` / `time` columns
    train = timestamp_UTC_conversion(train, date_col)

    # Convert to string and rebuild a uniformly formatted datetime string
    train['date'] = train['date'].astype(str)
    train['time'] = train['time'].astype(str)
    train[date_col] = train['date'] + ' ' + train['time']

    # -----------------------------------------------------
    # FEATURES EXTRACTION

    if not extract_features :
        return train

    # Date features (positions in 'YYYY-MM-DD')
    train['year']  = train['date'].str[:4].astype(int)
    train['month'] = train['date'].str[5:7].astype(int)
    train['day']   = train['date'].str[8:10].astype(int)

    # Day of week (Monday = 0), assembled from the year/month/day columns
    # instead of building one datetime object per row.
    train['dayofweek'] = pd.to_datetime(train[['year', 'month', 'day']]).dt.weekday.astype(int)

    # Time features (positions in 'HH:MM:SS')
    train['hour']    = train['time'].str[:2].astype(int)
    train['minutes'] = train['time'].str[3:5].astype(int)
    # Both digits of the seconds field: the slice [6:7] kept only the tens digit
    train['seconds'] = train['time'].str[6:8].astype(int)

    # Rename
    if len(prefixe) > 0 :
        cols = ['year', 'month', 'day', 'hour', 'minutes', 'seconds']
        train = train.rename(columns = {k : f"{prefixe}_{k}" for k in cols})

    # Return
    return train

In [None]:
def reduce_memory_usage(df, print_info=False):
    """Downcast the columns of `df` in place to reduce memory usage.

    * object columns become `category`;
    * signed numpy integer columns are cast to the smallest of
      int8/int16/int32/int64 whose range strictly contains [min, max];
    * numpy float columns are cast to float16, else float32, else float64
      (float16/float32 lose precision);
    * every other dtype (bool, unsigned int, datetime, category, extension
      dtypes, ...) is left untouched, so calling the function again on its own
      output is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compact; it is modified in place.
    print_info : bool, default False
        Print memory usage before/after and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.
    """

    if print_info :
        print('*'*50)
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory before : {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy signed-int / float columns are downcast. min()/max()
        # or the range comparison would raise on category and datetime columns,
        # and bool / unsigned columns must not be turned into floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN (comparisons are False)
                df[col] = df[col].astype(np.float64)

    if print_info :
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory after  : {:.2f} MB'.format(end_mem))
        print('Decreased by  : {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
        print('*'*50 + '\n')

    return df

In [None]:
# Load the training set from the Kaggle competition input directory
train = pd.read_csv("/kaggle/input/predict-energy-behavior-of-prosumers/train.csv")
# In debug mode keep a reproducible random subset of `n_rows_debug` rows
# (sampling raises if the file holds fewer rows than requested)
if debug :
    print("DEBUG IS ON !")
    train = train.sample(n=n_rows_debug, random_state=12)

# Show
print(train.shape)
train.head(2)

In [None]:
# Wide view of the target: one row per datetime, one column per
# (county, product_type, is_business, is_consumption) combination, holding the
# mean target of that combination.
pivot_train = pd.pivot_table(
    train,
    values='target',
    index='datetime',
    columns=['county', 'product_type', 'is_business', 'is_consumption'],
    aggfunc='mean',
)

# Collapse the 4-level column index into a single readable label per series
pivot_train.columns = [
    'county{}_productType{}_isBusiness{}_isConsumption{}'.format(*key)
    for key in pivot_train.columns
]
# Datetime index so the frame can be resampled / sliced by period
pivot_train.index = pd.to_datetime(pivot_train.index)
pivot_train.head(2)

In [None]:
# Min-max scale every series independently so they share a common [0, 1] range
df_plot = pivot_train.copy()
df_plot = df_plot.sub(df_plot.min()).div(df_plot.max() - df_plot.min())
# Daily means of the scaled series
df_plot_resampled_D = df_plot.resample('D').mean()

# All series from July 2022 onwards, drawn faintly (alpha=0.1) in gray
df_plot_resampled_D.loc['2022-7':].plot(figsize=(15, 6), color='gray', alpha=0.1, legend=False)

In [None]:
# Column labels of the production (isConsumption0) and consumption (isConsumption1) series
columns_consumption_0 = df_plot_resampled_D.filter(like='isConsumption0').columns
columns_consumption_1 = df_plot_resampled_D.filter(like='isConsumption1').columns

# Empty proxy lines give the legend exactly one entry per category
plt.figure(figsize=(15, 6))
plt.plot([], color='blue', label='is_Consumption = 1')
plt.plot([], color='green', label='is_Consumption = 0')
plt.legend()

# Draw every series from July 2022 onwards: is_Consumption = 0 in green first,
# then is_Consumption = 1 in blue
for colour, columns in (('green', columns_consumption_0), ('blue', columns_consumption_1)):
    for column in columns:
        df_plot_resampled_D.loc['2022-7':, column].plot(alpha=0.1, color=colour, legend=False)

# Show the plot
plt.show()

In [None]:
# Select the consumption_1 data (hourly, min-max scaled)
consumption_1 = df_plot.loc[:, df_plot.columns.str.contains('isConsumption1')]

# Keep a single month: August 2022
consumption_1_filtered = consumption_1.loc['2022-8']

# Mean across all consumption series at each timestamp
average_values = consumption_1_filtered.mean(axis=1)

# Create a figure and axis for the plot
fig, ax = plt.subplots(figsize=(15, 6))

# Plot the consumption data with alpha=0.1 in blue (no per-column legend entries)
consumption_1_filtered.plot(alpha=0.1, color='blue', ax=ax, legend=False)

# Plot the average values as a black bold line
average_values.plot(color='black', linewidth=2, ax=ax)

# Build the legend from explicit handles. Passing only a list of labels assigns
# them to the first artists on the axes, which attached 'Average' to the second
# blue series instead of the black mean line (drawn last).
lines = ax.get_lines()
ax.legend([lines[0], lines[-1]], ['Consumption Data', 'Average'])

# Show the plot
plt.show()

In [None]:
# Mean of all (scaled) consumption series at each timestamp, full history
average_values = consumption_1.mean(axis=1)

# Dedicated figure / axes for the correlogram
fig, ax = plt.subplots(figsize=(15, 6))

# Autocorrelation of the mean series for lags 0..170
plot_acf(average_values, lags=170, ax=ax)
ax.set_xlabel('Lag')
ax.set_ylabel('Autocorrelation')
ax.set_title('Autocorrelation Plot')
plt.show()


In [None]:
%%time

# Date processing + features extraction:
# rewrites `datetime` as a '<date> <time>' string and adds `date`, `time`,
# year / month / day / dayofweek / hour / minutes / seconds columns
for date_col in ['datetime'] :
    train = process_date(train, date_col, extract_features = True)

# Show
print(train.shape)
train.head(2)

In [None]:
# Number of rows per datetime in the train dataset, in chronological order (2021-09-01 -> 2023-05-31)
train['datetime'].value_counts().to_frame().sort_index()

In [None]:
# Show target distribution: summary statistics, then a 100-bin histogram
display(train['target'].describe())
train['target'].hist(bins=100)

In [None]:
# Lagged targets: for every row, the target of the same segment
# (county / is_business / product_type / is_consumption) at the same time of
# day, `day_shift` data blocks earlier (2 to 7).
for day_shift in range(2, 8) :

    # Add previous targets: shifting data_block_id forward by `day_shift`
    # aligns an old target with the block in which it is used as a feature
    train['data_block_id_shifted'] = train['data_block_id'] + day_shift
    train = pd.merge(train,
                     train[["county", "is_business", "product_type", "is_consumption", "time", "data_block_id_shifted", "target"]].rename(columns = {"data_block_id_shifted":"data_block_id",
                                                                                                                                                     "target" : f"target_revealed_{day_shift}days_ago",
                                                                                                                                                    }),
                     how = 'left',
                     on = ["county", "is_business", "product_type", "is_consumption", "time", "data_block_id"],
                    )

# Fill missing lags WITHOUT looking at the current target. Filling them with
# `target` itself leaks the label into the features; instead use the same
# fallback as `add_previous_targets` at inference time: the 2-day lag, then the
# constant 0.5.
for day_shift in range(2, 8) :
    mask = train[f"target_revealed_{day_shift}days_ago"].isna()
    train.loc[mask, f"target_revealed_{day_shift}days_ago"] = train.loc[mask, "target_revealed_2days_ago"]
    train[f"target_revealed_{day_shift}days_ago"] = train[f"target_revealed_{day_shift}days_ago"].fillna(0.5)

# Drop useless column
train.drop(columns = ['data_block_id_shifted'], inplace=True)

# Show
print(train.shape)
train.tail(2)

In [None]:
# Identifiers of one example segment; they are only referenced by the
# commented-out inspection snippet below.
county = 15
is_business = 1
product_type = 3
is_consumption = 1
#mask = (train['county'] == county) & (train['is_business'] == is_business) & (train['product_type'] == product_type) & (train['is_consumption'] == is_consumption)
#train.loc[mask, ['date', 'time', 'data_block_id', 'target', 'target_revealed']].tail(30)

In [None]:
# Working copy of the training frame; the auxiliary tables are joined onto `df` below
df = train.copy()

# Show
print(df.shape)
df.head(2)

In [None]:
# Load the electricity price table
electricity_prices = pd.read_csv("/kaggle/input/predict-energy-behavior-of-prosumers/electricity_prices.csv")

# Date processing (no feature extraction). Each call overwrites the `date` /
# `time` columns, so after the loop they describe `origin_date`.
for date_col in ['forecast_date', 'origin_date'] :
    electricity_prices = process_date(electricity_prices, date_col)
    
# Show
print(electricity_prices.shape)
electricity_prices.head(2)

In [None]:
%%time

# Shift data_block_id (during the submission phase, the data are available 2 days ago)
electricity_prices['data_block_id_shifted'] = electricity_prices['data_block_id'] + 2

# Join on time of day + (shifted) data block, so each row gets the price of the
# same hour two data blocks earlier
df = pd.merge(df,
              electricity_prices[["time", "data_block_id_shifted", "euros_per_mwh"]].rename(columns = {"data_block_id_shifted":"data_block_id"}),
              how = 'left',
              on = ["time", "data_block_id"],
             )

# Show
print(df.shape)
df.head(2)

In [None]:
# Load the gas price table
gas_prices = pd.read_csv("/kaggle/input/predict-energy-behavior-of-prosumers/gas_prices.csv")

# Date processing (no feature extraction). Each call overwrites the `date` /
# `time` columns, so after the loop they describe `origin_date`.
for date_col in ['forecast_date', 'origin_date'] :
    gas_prices = process_date(gas_prices, date_col)
    
# Show
print(gas_prices.shape)
gas_prices.head(2)

In [None]:
%%time

# Shift data_block_id (during the submission phase, the data are available 2 days ago)
gas_prices['data_block_id_shifted'] = gas_prices['data_block_id'] + 2

# Join on the data block only: don't merge on time ! Gas prices are quoted per
# day, so their `time` is always 00:00:00 and a join on `time` would leave every
# other hour of the day without a price.
# drop_duplicates guarantees at most one price row per block, so the left join
# can never multiply the rows of `df`.
cols = ['data_block_id_shifted', 'lowest_price_per_mwh', 'highest_price_per_mwh']
df = df.merge(gas_prices[cols].rename(columns = {'data_block_id_shifted' : 'data_block_id'}).drop_duplicates(subset = ['data_block_id']),
             how='left',
             on=["data_block_id"],
             )

# Show
print(df.shape)
df.head(2)

In [None]:
# Load the client table
client = pd.read_csv("/kaggle/input/predict-energy-behavior-of-prosumers/client.csv")

# Rename so the date column is processed like the other `datetime` columns
client = client.rename(columns = {'date' : 'datetime'})

# Date processing (no feature extraction): adds `date` / `time` columns
for date_col in ['datetime'] :
    client = process_date(client, date_col)
    
# Fill NaN with 0 in every column
client.fillna(0, inplace=True)

# Show
print(client.shape)
client.head(2)

In [None]:
%%time

# Shift data_block_id (during the submission phase, the data are available 2 days ago)
client['data_block_id_shifted'] = client['data_block_id'] + 2

# Join the remaining client columns on (shifted) data block + segment keys
df = df.merge(client.drop(columns = ['date', 'datetime', 'data_block_id', 'time']).rename(columns = {'data_block_id_shifted' : 'data_block_id'}),
             how='left',
             on=['data_block_id', 'county', 'is_business', 'product_type'], # don't merge on time ! Client data are only available at 00:00:00
             )

# Show
print(df.shape)
df.head(2)

In [None]:
# Mapping from (latitude, longitude) to county, from an external Kaggle dataset
location = pd.read_csv("/kaggle/input/fabiendaniels-mapping-locations-and-county-codes/county_lon_lats.csv").drop(columns = ["Unnamed: 0"])

# Convert to int (tenths of a degree) so coordinates can be used as exact join keys.
# NOTE(review): astype(int) truncates rather than rounds; this must stay in sync
# with the identical conversion in `process_weather_info`.
for k in ['latitude', 'longitude'] :
    location[k] = (10*location[k]).astype(int)

# Show
print(location.shape)
location.head(2)

In [None]:
def process_weather_info(h, location=location) :
    """Attach a `county` code to each weather row via its coordinates.

    Parameters
    ----------
    h : pandas.DataFrame
        Weather table with `latitude` and `longitude` columns.
    location : pandas.DataFrame
        Coordinate-to-county mapping whose `latitude` / `longitude` are already
        expressed as integer tenths of a degree. Defaults to the module-level
        `location` table as it exists when this function is defined.

    Returns
    -------
    pandas.DataFrame
        De-duplicated copy of `h` with integer coordinates and an integer
        `county` column (-1 where no mapping exists).
    """

    # Work on a de-duplicated copy with a clean index
    weather = h.drop_duplicates().reset_index(drop=True)

    # Same integer encoding as the `location` table (tenths of a degree)
    for coord in ('latitude', 'longitude') :
        weather[coord] = (10*weather[coord]).astype(int)

    # Look up the county of every grid point
    weather = weather.merge(location, how='left', on=['latitude', 'longitude'])

    # Unmapped grid points get county -1; force an integer dtype
    weather['county'] = weather['county'].fillna(-1).astype(int)

    return weather

In [None]:
%%time

# Load the historical (observed) weather table
historical_weather = pd.read_csv("/kaggle/input/predict-energy-behavior-of-prosumers/historical_weather.csv")

# Date processing (no feature extraction): adds `date` / `time` columns
for date_col in ['datetime'] :
    historical_weather = process_date(historical_weather, date_col)
    
# Reduce memory usage to avoid OOM (Out OF Memory error) -- currently disabled
#historical_weather = reduce_memory_usage(historical_weather, print_info=True)

# Show
print(historical_weather.shape)
historical_weather.head(2)

In [None]:
# Add location: de-duplicate rows, integer-encode coordinates and attach `county`
historical_weather = process_weather_info(historical_weather)

# Show
print(historical_weather.shape)
historical_weather.head(2)

In [None]:
# Aggregate information over latitude/longitude: every measurement is summarised
# by its min / mean / max / std over the grid points of a county
dict_agg = {
    measurement : ['min', 'mean', 'max', 'std']
    for measurement in ('temperature',
                        'dewpoint',
                        'rain',
                        'snowfall',
                        'surface_pressure',
                        'cloudcover_total',
                        'cloudcover_low',
                        'cloudcover_mid',
                        'cloudcover_high',
                        'windspeed_10m',
                        'winddirection_10m',
                        'shortwave_radiation',
                        'direct_solar_radiation',
                        'diffuse_radiation',
                       )
}

# One row per county and timestamp
keys = ['county', 'datetime']
historical_weather = historical_weather.groupby(keys).agg(dict_agg).reset_index()

# Flatten the two-level column names ('temperature', 'min') -> 'temperature_min',
# then tag every non-key column with the '_h' (historical) suffix
historical_weather.columns = ['_'.join(part for part in name if len(part) > 0) for name in historical_weather.columns]
historical_weather.columns = [name if name in keys else name + '_h' for name in historical_weather.columns]

# Show
print(historical_weather.shape)
historical_weather.head(2)

In [None]:
%%time

# Shift datetime: an observation is attached to the row 1 day and 13 hours later
historical_weather['datetime_shifted'] = (pd.to_datetime(historical_weather['datetime'].astype(str)) + pd.Timedelta(days=1, hours=13)).astype(str)

# Join on county + shifted datetime (string keys on both sides)
df = df.merge(historical_weather.drop(columns = ['datetime']).rename(columns = {'datetime_shifted' : 'datetime'}),
              how='left',
              on=['county', 'datetime'],
             )

# Show
print(df.shape)
df.head(2)

In [None]:
%%time

# Load the weather forecast table
forecast_weather = pd.read_csv("/kaggle/input/predict-energy-behavior-of-prosumers/forecast_weather.csv")

# Date processing (no feature extraction): adds `date` / `time` columns
for date_col in ['forecast_datetime'] :
    forecast_weather = process_date(forecast_weather, date_col)

# Reduce memory usage to avoid OOM (Out OF Memory error) -- currently disabled
#forecast_weather = reduce_memory_usage(forecast_weather, print_info=True)

# Show
print(forecast_weather.shape)
forecast_weather.head(2)

In [None]:
# Add location: de-duplicate rows, integer-encode coordinates and attach `county`
forecast_weather = process_weather_info(forecast_weather)

# Show
print(forecast_weather.shape)
forecast_weather.head(2)

In [None]:
# Aggregate information over latitude/longitude: every forecast variable is
# summarised by its min / mean / max / std over the grid points of a county
dict_agg = {
    measurement : ['min', 'mean', 'max', 'std']
    for measurement in ('temperature',
                        'dewpoint',
                        'cloudcover_high',
                        'cloudcover_low',
                        'cloudcover_mid',
                        'cloudcover_total',
                        '10_metre_u_wind_component',
                        '10_metre_v_wind_component',
                        'direct_solar_radiation',
                        'surface_solar_radiation_downwards',
                        'snowfall',
                        'total_precipitation',
                       )
}

# One row per county and forecast timestamp
keys = ['county', 'forecast_datetime']
forecast_weather = forecast_weather.groupby(keys).agg(dict_agg).reset_index()

# Flatten the two-level column names ('temperature', 'min') -> 'temperature_min',
# then tag every non-key column with the '_f' (forecast) suffix
forecast_weather.columns = ['_'.join(part for part in name if len(part) > 0) for name in forecast_weather.columns]
forecast_weather.columns = [name if name in keys else name + '_f' for name in forecast_weather.columns]

# Show
print(forecast_weather.shape)
forecast_weather.head(2)

In [None]:
%%time

# Join the forecast valid at the row's own datetime (no shift applied)
df = df.merge(forecast_weather.rename(columns = {'forecast_datetime' : 'datetime'}),
              how='left',
              on=['county', 'datetime'],
             )

# Fill NaN in the forecast columns with the previous (then next) row in
# chronological order.
# NOTE(review): rows are sorted by datetime only, so a gap may be filled with
# the value of a different county -- confirm this is intended.
df = df.sort_values(by=['datetime']).reset_index(drop=True)
for k in df :
    if k.endswith('_f') :
        df[k] = df[k].ffill().bfill()

# Show
print(df.shape)
df.head(2)

In [None]:
# Sort chronologically
df = df.sort_values(by=['datetime']).reset_index(drop=True)

# Fill every remaining NaN (any column, target included) with 0
df.fillna(0, inplace=True)

# Show
print(df.shape)
df.head(2)

In [None]:
# Free the aggregated weather tables (already merged into `df`) and reclaim memory
del historical_weather, forecast_weather
_ = gc.collect()

In [None]:
def _aggregate_weather(weather, keys, measurements, suffix) :
    """Summarise `measurements` by min/mean/max/std per `keys` and flatten the column names with `suffix`."""
    dict_agg = {name : ['min', 'mean', 'max', 'std'] for name in measurements}
    out = weather.groupby(keys).agg(dict_agg).reset_index()

    # ('temperature', 'min') -> 'temperature_min', key columns keep their name
    out.columns = ['_'.join([xx for xx in x if len(xx)>0]) for x in out.columns]
    out.columns = [x + suffix if x not in keys else x for x in out.columns]
    return out


def create_df(df, client, historical_weather, forecast_weather,
              electricity_prices, gas_prices, sample_prediction) :
    """Build the inference feature frame for one iteration of the test API.

    Mirrors the joins applied to the training frame, without the
    `data_block_id` shift (each table passed in is already the one available
    for the rows of `df`).

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to predict; needs `county`, `is_business`, `product_type`,
        `datetime` and `time` columns.
    client : pandas.DataFrame
        Client table with `datetime`, `date`, `time` and the segment keys.
    historical_weather, forecast_weather : pandas.DataFrame
        Weather tables already passed through `process_weather_info`
        (they need a `county` column).
    electricity_prices : pandas.DataFrame
        Needs `time` and `euros_per_mwh`.
    gas_prices : pandas.DataFrame
        Needs `lowest_price_per_mwh` and `highest_price_per_mwh`.
    sample_prediction : pandas.DataFrame
        Unused; kept so the signature matches the test iterator's tuple.

    Returns
    -------
    pandas.DataFrame
        `df` with all features joined, sorted by `datetime`, NaN filled with 0.
    """

    #################################################
    # ⚡ ELECTRICITY FEATURES ⚡
    #################################################

    # Join on the time of day
    df = pd.merge(df,
                  electricity_prices[["time", "euros_per_mwh"]],
                  how = 'left',
                  on = ["time"],
                 )

    #################################################
    # 🛢️ GAS FEATURES 🛢️
    #################################################

    # Gas prices are quoted per day (their `time` is 00:00:00), so joining on
    # `time` only filled the midnight rows. Broadcast the price to every row.
    # NOTE(review): assumes the last row of `gas_prices` is the relevant quote
    # when several rows are supplied.
    for k in ['lowest_price_per_mwh', 'highest_price_per_mwh'] :
        df[k] = gas_prices[k].iloc[-1] if len(gas_prices) > 0 else np.nan

    #################################################
    # 🧑 CLIENT FEATURES 🧑
    #################################################

    # Join on the segment keys (don't merge on time, client data are daily)
    df = df.merge(client.drop(columns = ['datetime', 'date', 'time']),
                 how='left',
                 on=['county', 'is_business', 'product_type'],
                 )

    #################################################
    # 🌤️ HISTORICAL WEATHER FEATURES 🌤️
    #################################################

    # Aggregate information over latitude/longitude
    historical_weather = _aggregate_weather(historical_weather,
                                            keys = ['county', 'datetime'],
                                            measurements = ['temperature',
                                                            'dewpoint',
                                                            'rain',
                                                            'snowfall',
                                                            'surface_pressure',
                                                            'cloudcover_total',
                                                            'cloudcover_low',
                                                            'cloudcover_mid',
                                                            'cloudcover_high',
                                                            'windspeed_10m',
                                                            'winddirection_10m',
                                                            'shortwave_radiation',
                                                            'direct_solar_radiation',
                                                            'diffuse_radiation',
                                                           ],
                                            suffix = '_h',
                                           )

    # Shift datetime: an observation is attached to the row 1 day and 13 hours later
    historical_weather['datetime'] = (pd.to_datetime(historical_weather['datetime'].astype(str)) + pd.Timedelta(days=1, hours=13)).astype(str)

    # Join
    df = df.merge(historical_weather,
                  how='left',
                  on=['county', 'datetime'],
                 )

    #################################################
    # 🌤️ FORECAST WEATHER FEATURES 🌤️
    #################################################

    # Aggregate information over latitude/longitude
    forecast_weather = _aggregate_weather(forecast_weather,
                                          keys = ['county', 'forecast_datetime'],
                                          measurements = ['temperature',
                                                          'dewpoint',
                                                          'cloudcover_high',
                                                          'cloudcover_low',
                                                          'cloudcover_mid',
                                                          'cloudcover_total',
                                                          '10_metre_u_wind_component',
                                                          '10_metre_v_wind_component',
                                                          'direct_solar_radiation',
                                                          'surface_solar_radiation_downwards',
                                                          'snowfall',
                                                          'total_precipitation',
                                                         ],
                                          suffix = '_f',
                                         )

    # Join
    df = df.merge(forecast_weather.rename(columns = {'forecast_datetime' : 'datetime'}),
                  how='left',
                  on=['county', 'datetime'],
                 )

    # Fill NaN in the forecast columns with the previous (then next) row in
    # chronological order, as done on the training frame
    df = df.sort_values(by=['datetime']).reset_index(drop=True)
    for k in df :
        if k.endswith('_f') :
            df[k] = df[k].ffill().bfill()

    #################################################
    # ⚙️ FINAL PROCESSING ⚙️
    #################################################

    # Fill NaN
    df.fillna(0, inplace=True)

    # Return
    return df

In [None]:
# Columns that must never be used as model inputs: the label, identifiers /
# raw dates, and features judged useless
forbidden_cols = ['target',
                  'datetime',
                  'date',
                  'row_id',
                  'data_block_id',
                  'prediction_unit_id',
                  
                  # Useless feats
                  'minutes',
                  'snowfall_min_h',
                  'rain_min_h',
                  'seconds',
                  'highest_price_per_mwh',
                  'lowest_price_per_mwh',
                  'snowfall_max_h',
                  'snowfall_min_f',
                  'cloudcover_mid_min_f',
                  'cloudcover_high_min_f',
                 ]
# Only numeric columns can be fed to the model (string columns are excluded here)
numeric_cols   = df.select_dtypes(include=np.number).columns.tolist()

# Final feature list: numeric columns minus the forbidden ones
feats = [x for x in numeric_cols if x not in forbidden_cols]

print(f"{len(feats)} features.")
df[feats].head(1)

In [None]:
# Optional reduced feature set. The branch is disabled (`if False`), so `feats`
# keeps the full list computed in the previous cell.
if False :
    # Fewer feats
    feats = ['day',
             'month',
             'euros_per_mwh',
             'hour',
             'dayofweek',
             'county',
             'target_revealed_6days_ago',
             'target_revealed_5days_ago',
             'target_revealed_4days_ago',
             'target_revealed_3days_ago',
             'target_revealed_2days_ago',
             'year',
             'is_consumption',
             'is_business',
             'product_type',
             'eic_count',
             'installed_capacity',
             ]


print(f"{len(feats)} features.")
df[feats].head(1)

In [None]:
# Hyper-parameters passed to lgb.LGBMRegressor.
# `n_estimators` is an upper bound during validation (early stopping is used)
# and is overwritten with the selected iteration count before the final fit.
lgbm_params = {'boosting_type': 'gbdt',
               'objective': 'regression',
               'metric' : 'mean_absolute_error',
               'importance_type': 'split',
               'learning_rate': 0.08,
               'n_estimators': 1000,
               'max_depth': -1,
               'min_child_samples': 120,
               'num_leaves': 250,
               'colsample_bytree': 0.85,
               'subsample': 0.85,
               'reg_alpha': 2,
               'reg_lambda': 1,
               'n_jobs': 2,
               'random_state': 12,
              }

# Keyword arguments for `model.fit`; `eval_set` is filled in by the training
# cells because it depends on the current train/test split.
fit_params = {"eval_metric" : 'mean_absolute_error',
              #"eval_set" : [(xtr, ytr), (xte, yte)],
              "eval_names": ['train', 'test'],
              "categorical_feature": 'auto'}

In [None]:
%%time

# Best iteration found by early stopping for each temporal split
Iterations = []

# Expanding-window validation: train on everything up to `date_limit`,
# evaluate on everything after it
for date_limit in ['2023-01-01 00:00:00',
                   '2023-02-01 00:00:00',
                   '2023-03-01 00:00:00',
                   '2023-04-01 00:00:00',
                   #'2023-05-01 00:00:00',
                  ] :

    # Train and test (`datetime` holds strings, so this is a string comparison)
    mask_train = (df['datetime'] <= date_limit)
    xtr, ytr = df.loc[mask_train, feats], df.loc[mask_train, 'target']
    xte, yte = df.loc[~mask_train, feats], df.loc[~mask_train, 'target']
    
    # Fit model, stopping after 15 rounds without improvement; evaluation logging is silenced
    model = lgb.LGBMRegressor(**lgbm_params)
    fit_params['eval_set'] = [(xtr, ytr), (xte, yte)]
    model.fit(xtr, ytr, **fit_params, callbacks = [lgb.early_stopping(15, verbose=False),
                                                   lgb.log_evaluation(0)])

    # Add number of iterations
    Iterations.append(model.best_iteration_)
    
    # Predictions, clipped to [0, 15000]
    preds = np.clip(model.predict(xte), 0, 15000)
    
    # Print score (mean absolute error on the held-out period)
    print(f"{date_limit[:10]} :")
    print(f"Train/Test : {len(xtr)}/{len(xte)} rows.")
    print(f"Iterations : {Iterations[-1]}.")
    print(f"Score      : {round(mean_absolute_error(yte, preds), 2)}.")
    print()

In [None]:
# The number of iterations we will use for our training (trained on all data):
# the median of the best iterations found on the temporal splits
print(Iterations)
best_iter = int(np.median(Iterations))
best_iter

In [None]:
%%time

# Fixed number of trees for the final model (no early stopping here)
lgbm_params['n_estimators'] = best_iter

Feature_Imp = None

# Train on the whole dataset (there is no held-out set)
xtr, ytr = df[feats], df['target']
print(f"Train : {len(xtr)} rows.")

# Fit model; the only eval set is the training data, logged every 50 iterations
model = lgb.LGBMRegressor(**lgbm_params)
fit_params['eval_set'] = [(xtr, ytr)]
model.fit(xtr, ytr, **fit_params, callbacks = [lgb.log_evaluation(50)])

# Save model
joblib.dump(model, f"lgbm_model.pkl")

# Features Importance, rescaled so the most important feature is 100, sorted descending
Feature_Imp = pd.DataFrame(sorted(zip(model.feature_importances_, feats)), columns=['Value','Feature'])
Feature_Imp['Value'] = 100* (Feature_Imp['Value'] / Feature_Imp['Value'].max()) # Normalisation
Feature_Imp = Feature_Imp.sort_values(by='Value', ascending=False).reset_index(drop=True)

In [None]:
# Features Importance: the figure height grows with the number of features
if len(Feature_Imp) > 90 : plt.figure(figsize=(7, 15))
elif len(Feature_Imp) > 60 : plt.figure(figsize=(7, 12))
elif len(Feature_Imp) > 30 : plt.figure(figsize=(7, 10))
else :
    plt.figure(figsize=(5, 5))
# Horizontal bars for the (at most) 100 most important features
sns.barplot(x="Value", y="Feature", data=Feature_Imp.head(100))
plt.title('Features Importance')
plt.show()

In [None]:
# The 20 most important features
Feature_Imp.Feature.tolist()[:20]

In [None]:
# The 10 least important features (least important first)
Feature_Imp.Feature.tolist()[::-1][:10]

In [None]:
# Competition API: create the environment and the iterator consumed by the submission loop
import enefit
env = enefit.make_env()
iter_test = env.iter_test()

In [None]:
def add_previous_targets(test, PREVIOUS_TARGET_REVEALED, max_day_shift=7) :
    """Add the `target_revealed_{n}days_ago` lag features to `test`.

    Parameters
    ----------
    test : pandas.DataFrame
        Rows to predict; needs `county`, `is_business`, `product_type`,
        `is_consumption` and `time` columns.
    PREVIOUS_TARGET_REVEALED : list of pandas.DataFrame
        Revealed-target frames, most recent first: the frame at index `i`
        provides the lag `i + 2` days. Each needs the join keys and `target`.
    max_day_shift : int, default 7
        Largest lag guaranteed to exist and to be NaN-free in the result.
        The default matches the lags built on the training set (2..7).

    Returns
    -------
    pandas.DataFrame
        `test` with one lag column per supplied frame, plus any missing lag up
        to `max_day_shift`. Gaps in lags 2..`max_day_shift` are filled with the
        2-day lag, then with 0.5.
    """

    keys = ["county", "is_business", "product_type", "is_consumption", "time"]

    for i, revealed_targets in enumerate(PREVIOUS_TARGET_REVEALED) :
        day_shift = i + 2
        lag_col = f"target_revealed_{day_shift}days_ago"

        # Rename
        revealed_targets = revealed_targets.rename(columns = {"target" : lag_col})

        # Add the target revealed `day_shift` days ago (same segment, same time of day)
        test = pd.merge(test,
                        revealed_targets[keys + [lag_col]],
                        how = 'left',
                        on = keys,
                       )

    # With no history at all the fallback column does not exist yet; create it
    # on a new frame so the caller's object is not modified.
    fallback = "target_revealed_2days_ago"
    if fallback not in test :
        test = test.assign(**{fallback : np.nan})

    # Fill NaN with last revealed, covering every lag the model was trained on
    for day_shift in range(2, max_day_shift + 1) :
        k = f"target_revealed_{day_shift}days_ago"
        if k not in test :
            test[k] = test[fallback].copy()
        else :
            mask = (test[k].isna())
            test.loc[mask, k] = test.loc[mask, fallback]
        test[k] = test[k].fillna(0.5)

    # Return
    return test

#add_target_one_day_ago(test, revealed_targets)

In [None]:
# Reload enefit environment (only in debug mode, otherwise the submission will fail)
if debug :
    enefit.make_env.__called__ = False
    type(env)._state = type(type(env)._state).__dict__['INIT']
    iter_test = env.iter_test()
# -------------------------------------------------------

# List of target_revealed dataframes, most recent first (index 0 = latest)
PREVIOUS_TARGET_REVEALED = []

# Iterate and submit: each iteration yields the rows to predict together with
# the auxiliary tables for that step
for (test, revealed_targets, client,
     historical_weather, forecast_weather,
     electricity_prices, gas_prices, sample_prediction) in iter_test:

    # Rename so the column names match the ones used on the training set
    client = client.rename(columns = {'date' : 'datetime'})
    if 'datetime' not in test :
        test   = test.rename(columns = {'prediction_datetime' : 'datetime'})
        
    # Boolean -> Int
    client['is_business'] = client['is_business'].astype(int)
    for k in ['is_business', 'is_consumption'] :
        test[k] = test[k].astype(int)
        revealed_targets[k] = revealed_targets[k].astype(int)
    
    # Date processing + features extraction (calendar features only for `test`)
    for date_col in ['datetime'] :
        test = process_date(test, date_col, extract_features = True)
        client = process_date(client, date_col)
        historical_weather = process_date(historical_weather, date_col)
        revealed_targets = process_date(revealed_targets, date_col)
    for date_col in ['forecast_date', 'origin_date'] :
        electricity_prices = process_date(electricity_prices, date_col)
        gas_prices = process_date(gas_prices, date_col)
    for date_col in ['forecast_datetime'] :
        forecast_weather = process_date(forecast_weather, date_col)
        
    # Process weather info (attach `county` to every weather row)
    historical_weather = process_weather_info(historical_weather)
    forecast_weather   = process_weather_info(forecast_weather)
    
    # Create df: join prices, client and weather features onto `test`
    test = create_df(test, client, historical_weather, forecast_weather,
                     electricity_prices, gas_prices, sample_prediction)
    
    # Store revealed_targets at the front and keep at most the 7 latest frames
    PREVIOUS_TARGET_REVEALED.insert(0, revealed_targets)
    while len(PREVIOUS_TARGET_REVEALED) > 7 :
        PREVIOUS_TARGET_REVEALED.pop()
    
    # Add previous targets (lag features)
    test = add_previous_targets(test, PREVIOUS_TARGET_REVEALED)
        
    # In case of missing feats, we create them (filled with 0)
    missing_feats = [x for x in feats if x not in test]
    if len(missing_feats) > 0 :
        test = pd.concat([test,
                          pd.DataFrame(0, index=np.arange(len(test)), columns=missing_feats)
                         ],
                         axis=1,
                        )

    # Inference: predictions are clipped to [0, 15000]
    sample_prediction.drop(columns = ['target'], inplace=True)
    test['target'] = np.clip(model.predict(test[feats]), 0, 15000)
    
    #test['target'] = 0
    #for model in MODELS :
    #    test['target'] += np.clip(model.predict(test[feats]), 0, 15000) / len(MODELS)
        
    # Add target to sample_prediction (joined on row_id, keeping sample_prediction's row order)
    sample_prediction = pd.merge(sample_prediction, test[['row_id', 'target']], on='row_id', how='left')
    
    # Send predictions
    env.predict(sample_prediction)

In [None]:
# Show sample_prediction (best effort: the variable does not exist if the
# submission loop never ran). Only ordinary exceptions are caught, so a kernel
# interrupt still propagates, and the cause is reported instead of hidden.
try :
    print(sample_prediction.shape)
    display(sample_prediction.head())
except Exception as err :
    print("Oops, something went wrong !")
    print(repr(err))