### The feature engineering functions and the baseline are taken from [this notebook](https://www.kaggle.com/code/ahmedabdulwahab/pandas-data-description-and-starters-guide)

## Importing Libraries

In [161]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, date, timedelta
import enefit
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor

import warnings
warnings.filterwarnings('ignore')


In [162]:
# Raw competition tables, read from the Kaggle input directory.
# train: one row per datetime / prediction unit / consumption flag, with the target.
train= pd.read_csv('/kaggle/input/predict-energy-behavior-of-prosumers/train.csv')
# Gas and electricity prices; both are joined to train through data_block_id.
gas_df= pd.read_csv('/kaggle/input/predict-energy-behavior-of-prosumers/gas_prices.csv')
electricity_df= pd.read_csv('/kaggle/input/predict-energy-behavior-of-prosumers/electricity_prices.csv')
# Client attributes, joined on data_block_id, county, is_business and product_type.
client_df= pd.read_csv('/kaggle/input/predict-energy-behavior-of-prosumers/client.csv')
# Forecast (fw) and historical (hw) weather, keyed by latitude / longitude.
fw_df= pd.read_csv('/kaggle/input/predict-energy-behavior-of-prosumers/forecast_weather.csv')
hw_df= pd.read_csv('/kaggle/input/predict-energy-behavior-of-prosumers/historical_weather.csv')
# Custom lookup table that maps (longitude, latitude) pairs to a county id.
locations= pd.read_csv('/kaggle/input/locations/county_lon_lats.csv')

## Feature Engineering functions

In [163]:
def feat_eng_train(data, client, hist_weather,forecast_weather, electricity, gas, locations):
    """Assemble the training feature table from the raw competition frames.

    Rows of ``data`` without a target are discarded. The remaining rows get
    calendar features, client attributes, gas and electricity prices, and
    hourly means of the forecast and historical weather (once over all
    stations, once per county).

    The function works on copies of ``client``, ``hist_weather`` and
    ``forecast_weather``; none of the arguments is modified, so calling it
    twice with the same frames gives the same result.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows; must contain ``target``, ``datetime``, ``row_id``,
        ``data_block_id`` and the client join keys.
    client : pandas.DataFrame
        Client table with ``date``, ``data_block_id``, ``county``,
        ``is_business`` and ``product_type``.
    hist_weather, forecast_weather : pandas.DataFrame
        Weather tables keyed by ``latitude`` / ``longitude``.
    electricity, gas : pandas.DataFrame
        Price tables carrying ``data_block_id``.
    locations : pandas.DataFrame
        Lookup with ``Unnamed: 0``, ``longitude``, ``latitude``, ``county``.

    Returns
    -------
    pandas.DataFrame
        Feature table ordered by (year, day, hour) group with a fresh
        ``RangeIndex``; ``row_id`` and ``data_block_id`` are removed.
    """
    group_cols = ['year', 'day', 'hour']
    coord_cols = ['latitude', 'longitude']

    # Copy the filtered slice so the column assignments below act on an
    # independent frame instead of a view of the caller's data.
    data = data[data['target'].notnull()].copy()
    data['datetime'] = pd.to_datetime(data['datetime'], utc=True)

    electricity = electricity.rename(columns={'forecast_date': 'datetime'})
    electricity['datetime'] = pd.to_datetime(electricity['datetime'], utc=True)

    # The client data_block_id is 2 steps ahead of train's data_block_id.
    # Shift a copy: shifting the caller's frame would move it by another
    # 2 blocks on every call.
    client = client.copy()
    client['data_block_id'] -= 2

    # locations replaces the (latitude, longitude) pair of every weather row
    # with the county it belongs to.
    locations = locations.drop('Unnamed: 0', axis=1)

    # ---- Forecast weather: attach county, then average per hour ----------
    forecast_weather = forecast_weather.copy()
    forecast_weather[coord_cols] = forecast_weather[coord_cols].astype(float).round(1)
    forecast_weather = forecast_weather.merge(locations, how='left', on=['longitude', 'latitude'])
    # Rows with any missing value (including coordinates that matched no county) are removed.
    forecast_weather.dropna(axis=0, inplace=True)
    forecast_weather['county'] = forecast_weather['county'].astype('int64')
    forecast_weather.drop(['origin_datetime', 'latitude', 'longitude', 'hours_ahead', 'data_block_id'], axis=1, inplace=True)
    forecast_weather.rename(columns={'forecast_datetime': 'datetime'}, inplace=True)
    forecast_weather['datetime'] = pd.to_datetime(forecast_weather['datetime'], utc=True)

    forecast_cols = list(forecast_weather.drop(['county', 'datetime'], axis=1).columns)
    forecast_hour = forecast_weather['datetime'].dt.to_period('h')

    # Mean of every forecast column per hour (all stations together).
    forecast_weather_datetime = forecast_weather.groupby([forecast_hour])[forecast_cols].mean().reset_index()
    forecast_weather_datetime['datetime'] = pd.to_datetime(forecast_weather_datetime['datetime'].dt.to_timestamp(), utc=True)

    # Mean of every forecast column per hour and county.
    forecast_weather_datetime_county = forecast_weather.groupby(['county', forecast_hour])[forecast_cols].mean().reset_index()
    forecast_weather_datetime_county['datetime'] = pd.to_datetime(forecast_weather_datetime_county['datetime'].dt.to_timestamp(), utc=True)

    # ---- Historical weather: attach county, then average per hour --------
    hist_weather = hist_weather.copy()
    hist_weather[coord_cols] = hist_weather[coord_cols].astype(float).round(1)
    hist_weather = hist_weather.merge(locations, how='left', on=['longitude', 'latitude'])
    hist_weather.dropna(axis=0, inplace=True)
    hist_weather.drop(['latitude', 'longitude'], axis=1, inplace=True)
    hist_weather['county'] = hist_weather['county'].astype('int64')
    hist_weather['datetime'] = pd.to_datetime(hist_weather['datetime'], utc=True)

    hist_cols = list(hist_weather.drop(['county', 'datetime', 'data_block_id'], axis=1).columns)
    hist_hour = hist_weather['datetime'].dt.to_period('h')
    # One row per distinct (datetime, data_block_id) pair. Merging against the
    # raw table instead would repeat every hourly mean once per weather
    # station before the duplicates are dropped again.
    hist_blocks = hist_weather[['datetime', 'data_block_id']].drop_duplicates()

    # Mean of every historical column per hour (all stations together).
    hist_weather_datetime = hist_weather.groupby([hist_hour])[hist_cols].mean().reset_index()
    hist_weather_datetime['datetime'] = pd.to_datetime(hist_weather_datetime['datetime'].dt.to_timestamp(), utc=True)
    hist_weather_datetime = hist_weather_datetime.merge(hist_blocks, how='left', on='datetime')

    # Mean of every historical column per hour and county.
    hist_weather_datetime_county = hist_weather.groupby(['county', hist_hour])[hist_cols].mean().reset_index()
    hist_weather_datetime_county['datetime'] = pd.to_datetime(hist_weather_datetime_county['datetime'].dt.to_timestamp(), utc=True)
    hist_weather_datetime_county = hist_weather_datetime_county.merge(hist_blocks, how='left', on='datetime')

    # ---- Calendar features ------------------------------------------------
    data['year'] = data['datetime'].dt.year
    data['month'] = data['datetime'].dt.month
    data['day'] = data['datetime'].dt.day
    data['hour'] = data['datetime'].dt.hour
    data['dayofweek'] = data['datetime'].dt.dayofweek
    data['dayofyear'] = data['datetime'].dt.dayofyear

    electricity['hour'] = electricity['datetime'].dt.hour

    # ---- Joins --------------------------------------------------------------
    data = data.merge(client.drop(columns=['date']), how='left', on=['data_block_id', 'county', 'is_business', 'product_type'])
    data = data.merge(gas[['data_block_id', 'lowest_price_per_mwh', 'highest_price_per_mwh']], how='left', on='data_block_id')
    data = data.merge(electricity[['euros_per_mwh', 'hour', 'data_block_id']], how='left', on=['hour', 'data_block_id'])
    data = data.merge(forecast_weather_datetime, how='left', on=['datetime'])
    data = data.merge(forecast_weather_datetime_county, how='left', on=['datetime', 'county'], suffixes=('_fcast_mean', '_fcast_mean_by_county'))

    # Historical weather is joined through (data_block_id, hour) rather than
    # the timestamp, so the timestamp column is dropped once the hour is known.
    hist_weather_datetime['hour'] = hist_weather_datetime['datetime'].dt.hour
    hist_weather_datetime_county['hour'] = hist_weather_datetime_county['datetime'].dt.hour
    hist_weather_datetime.drop_duplicates(inplace=True)
    hist_weather_datetime_county.drop_duplicates(inplace=True)
    hist_weather_datetime.drop('datetime', axis=1, inplace=True)
    hist_weather_datetime_county.drop('datetime', axis=1, inplace=True)

    data = data.merge(hist_weather_datetime, how='left', on=['data_block_id', 'hour'])
    data = data.merge(hist_weather_datetime_county, how='left', on=['data_block_id', 'county', 'hour'], suffixes=('_hist_mean', '_hist_mean_by_county'))

    # ---- Fill gaps inside every (year, day, hour) group --------------------
    # Rows are put in group order (original order kept inside a group) and
    # rows whose group key is missing are discarded, which is what
    # groupby(...).apply(...) produced. Using GroupBy.ffill / bfill avoids
    # relying on how apply() labels its result index.
    # NOTE(review): the groups ignore the month, so a gap can be filled from a
    # row of another month with the same day-of-month and hour - confirm that
    # this is intended.
    group_ids = data.groupby(group_cols).ngroup().to_numpy()
    order = np.argsort(group_ids, kind='stable')
    order = order[group_ids[order] >= 0]
    data = data.iloc[order].reset_index(drop=True)

    value_cols = [col for col in data.columns if col not in group_cols]
    data[value_cols] = data.groupby(group_cols)[value_cols].ffill()
    data[value_cols] = data.groupby(group_cols)[value_cols].bfill()

    data = data.drop(columns=['row_id', 'data_block_id'])

    return data

In [164]:
def create_revealed_targets_train(data, N_day_lags):
    """Append lagged targets ``target_{k}_days_ago`` for k = 2 .. N_day_lags.

    For every lag the target table is re-stamped ``k`` days later and
    left-joined back on (datetime, prediction_unit_id, is_consumption), so a
    row receives the target its unit had ``k`` days earlier, or NaN when no
    such row exists.
    """
    join_keys = ['datetime', 'prediction_unit_id', 'is_consumption']

    # Timestamps as they are before any shifting; every lag starts from these.
    base_timestamps = data['datetime']
    lagged = data[join_keys + ['target']].copy()

    day_lag = 2
    while day_lag <= N_day_lags:
        lagged['datetime'] = base_timestamps + pd.DateOffset(day_lag)
        # The left frame keeps the plain "target"; the joined one gets the lag suffix.
        data = pd.merge(
            data,
            lagged,
            how='left',
            on=join_keys,
            suffixes=('', f'_{day_lag}_days_ago'),
        )
        day_lag += 1

    return data

In [165]:
# Apply the feature engineering function to the training set. It joins the client table, the historical (hw) and
# forecast (fw) weather, the electricity and gas prices, and the county lookup onto the training rows.
train = feat_eng_train(train, client_df, hw_df, fw_df, electricity_df, gas_df, locations)

# Add the lagged targets target_2_days_ago .. target_7_days_ago (lags run from 2 up to N_day_lags days).
N_day_lags = 7
train = create_revealed_targets_train(train, N_day_lags=N_day_lags)

# Convert the 'datetime' column to int64 so that it can be fed to the models as a numeric feature.
# NOTE: this cell overwrites `train`; re-running it applies the functions to the already engineered frame.
train['datetime'] = train['datetime'].astype('int64')




## Data Transformation

В этом коде используется преобразование временных признаков, таких как час и день в году, в синусоидальные и косинусоидальные значения, чтобы лучше отобразить их циклическую природу. Кроме того, вычисляются статистические характеристики целевой переменной, такие как среднее значение, стандартное отклонение и дисперсия, на основе исторических данных.


In [166]:
# Sine / cosine features derived from the hour of the day.
# NOTE(review): the expression evaluates pi * sin(hour) / 12, i.e. the raw hour is used as an angle in radians,
# which is not periodic over 24 hours; sin(2 * pi * hour / 24) was presumably intended. The submission loop
# computes the same expression, so any change has to be made in both places and the models retrained.
train['sin_hour'] = (np.pi * np.sin(train['hour']) / 12)  # Sine feature of the hour of the day
train['cos_hour'] = (np.pi * np.cos(train['hour']) / 12)  # Cosine feature of the hour of the day

# Sine / cosine features derived from the day of the year (same caveat as for the hour features above).
train['sin_dayofyear'] = (np.pi * np.sin(train['dayofyear']) / 183)  # Sine feature of the day of the year
train['cos_dayofyear'] = (np.pi * np.cos(train['dayofyear']) / 183)  # Cosine feature of the day of the year

# Row-wise statistics over the lagged targets target_2_days_ago .. target_{N_day_lags}_days_ago
train['target_mean'] = train[[f'target_{i}_days_ago' for i in range(2, N_day_lags+1)]].mean(1)  # Mean of the lagged targets
train['target_std'] = train[[f'target_{i}_days_ago' for i in range(2, N_day_lags+1)]].std(1)  # Standard deviation of the lagged targets
train['target_var'] = train[[f'target_{i}_days_ago' for i in range(2, N_day_lags+1)]].var(1)  # Variance of the lagged targets


In [167]:
# Columns with outliers that receive an additional log-transformed copy.
# The same list is read again by the submission loop.
to_log = [
    'installed_capacity',
    'euros_per_mwh',
    'temperature_fcast_mean',
    'dewpoint_fcast_mean',
    'cloudcover_high_fcast_mean',
    'cloudcover_low_fcast_mean',
    'cloudcover_mid_fcast_mean',
    'cloudcover_total_fcast_mean',
    '10_metre_u_wind_component_fcast_mean',
    '10_metre_v_wind_component_fcast_mean',
    'direct_solar_radiation_fcast_mean',
    'snowfall_fcast_mean',
    'total_precipitation_fcast_mean',
    'temperature_fcast_mean_by_county',
    'dewpoint_fcast_mean_by_county',
    'cloudcover_high_fcast_mean_by_county',
    'cloudcover_low_fcast_mean_by_county',
    'cloudcover_mid_fcast_mean_by_county',
    'cloudcover_total_fcast_mean_by_county',
    '10_metre_u_wind_component_fcast_mean_by_county',
    '10_metre_v_wind_component_fcast_mean_by_county',
    'surface_solar_radiation_downwards_fcast_mean_by_county',
    'snowfall_fcast_mean_by_county',
    'total_precipitation_fcast_mean_by_county',
    'rain_hist_mean',
    'snowfall_hist_mean',
    'windspeed_10m_hist_mean_by_county',
    'target_2_days_ago',
    'target_3_days_ago',
    'target_4_days_ago',
    'target_5_days_ago',
    'target_6_days_ago',
    'target_7_days_ago',
    'target_mean',
    'target_std',
]

# log_<column> is 0 where the value is exactly 0 and log(value) elsewhere.
# np.log returns NaN for negative inputs, so negative values end up as NaN.
for feature in to_log:
    train[f"log_{feature}"] = np.where(train[feature] != 0, np.log(train[feature]), 0)

In [168]:
# Keep only the rows dated 2022 or later.
train = train.loc[train['year'] >= 2022]

## Training columns

In [169]:
# Rows whose is_consumption flag is 0 (the production rows); they get a model of their own.
is_production = train['is_consumption'] == 0

# Every column except the target becomes a feature, stored as a numpy array.
X = train.drop(columns=['target']).values
y = train['target']

# Same layout restricted to the production rows.
X2 = train[is_production].drop(columns=['target']).values
y2 = train.loc[is_production, 'target']

## Training models

### MOD1 Params

In [170]:
# p1={'n_estimators':4000,'verbose': -1,'random_state':73,'objective':'tweedie','learning_rate': 0.030339736147758608, 'colsample_bytree': 0.9263063801074632, 'colsample_bynode': 0.4527058263857967, 'reg_alpha': 3.62802063709343, 'reg_lambda': 1.6506819544194185, 'min_data_in_leaf': 201, 'max_depth': 15, 'device':'gpu',"n_jobs" : 4,'num_leaves': 455}
# p2={'n_estimators':4000,'verbose': -1,'random_state':73,'objective':'tweedie','learning_rate': 0.033090718804096083, 'colsample_bytree': 0.9499770953943448, 'colsample_bynode': 0.4670163857441046, 'reg_aplha': 3.9694606555680705, 'reg_lambda': 1.925712107567988, 'min_data_in_leaf': 223, 'max_depth': 18, 'device':'gpu',"n_jobs" : 4,'num_leaves': 465}
# p3={'n_estimators':4000,'verbose': -1,'random_state':73,'objective':'tweedie','learning_rate': 0.035559490612977255, 'colsample_bytree': 0.9682791614810814, 'colsample_bynode': 0.4722023075509447, 'reg_aplha': 4.1562458539834125, 'reg_lambda': 2.265053303366992, 'min_data_in_leaf': 254, 'max_depth': 19, 'device':'gpu',"n_jobs" : 4,'num_leaves': 475}
# p4={'n_estimators':4000,'verbose': -1,'random_state':73,'objective':'tweedie','learning_rate': 0.038908744594789185, 'colsample_bytree': 0.9864875442500248, 'colsample_bynode': 0.4832525869590394, 'reg_aplha': 4.358459131925572, 'reg_lambda': 2.355521088983217, 'min_data_in_leaf': 289, 'max_depth': 21, 'device':'gpu',"n_jobs" : 4,'num_leaves': 485}

In [171]:
# Hyper-parameters of the four LightGBM regressors (MAE objective, GPU device).
# The L1 penalty key is spelled 'reg_alpha' in all four sets; it used to be
# misspelled 'reg_aplha' in p2, p3 and p4, a name LightGBM does not recognise,
# so those three models were trained without the tuned L1 regularisation.
p1 = {
    'n_estimators': 4000,
    'verbose': -1,
    'random_state': 73,
    'objective': 'mae',
    'learning_rate': 0.030339736147758608,
    'colsample_bytree': 0.9263063801074632,
    'colsample_bynode': 0.4527058263857967,
    'reg_alpha': 3.62802063709343,
    'reg_lambda': 1.6506819544194185,
    'min_data_in_leaf': 201,
    'max_depth': 15,
    'device': 'gpu',
    'n_jobs': 4,
    'num_leaves': 455,
}
p2 = {
    'n_estimators': 4000,
    'verbose': -1,
    'random_state': 73,
    'objective': 'mae',
    'learning_rate': 0.033090718804096083,
    'colsample_bytree': 0.9499770953943448,
    'colsample_bynode': 0.4670163857441046,
    'reg_alpha': 3.9694606555680705,
    'reg_lambda': 1.925712107567988,
    'min_data_in_leaf': 223,
    'max_depth': 18,
    'device': 'gpu',
    'n_jobs': 4,
    'num_leaves': 465,
}
p3 = {
    'n_estimators': 4000,
    'verbose': -1,
    'random_state': 73,
    'objective': 'mae',
    'learning_rate': 0.035559490612977255,
    'colsample_bytree': 0.9682791614810814,
    'colsample_bynode': 0.4722023075509447,
    'reg_alpha': 4.1562458539834125,
    'reg_lambda': 2.265053303366992,
    'min_data_in_leaf': 254,
    'max_depth': 19,
    'device': 'gpu',
    'n_jobs': 4,
    'num_leaves': 475,
}
p4 = {
    'n_estimators': 4000,
    'verbose': -1,
    'random_state': 73,
    'objective': 'mae',
    'learning_rate': 0.038908744594789185,
    'colsample_bytree': 0.9864875442500248,
    'colsample_bynode': 0.4832525869590394,
    'reg_alpha': 4.358459131925572,
    'reg_lambda': 2.355521088983217,
    'min_data_in_leaf': 289,
    'max_depth': 21,
    'device': 'gpu',
    'n_jobs': 4,
    'num_leaves': 485,
}

In [172]:
# # #LGB
# p1={'n_estimators': 1000,'verbose': -1,'objective': 'l2','learning_rate': 0.06258413085998576, 'colsample_bytree': 0.6527661140701613, 'colsample_bynode': 0.8106858631408332, 'lambda_l1': 5.065645378814257, 'lambda_l2': 9.81159370218779, 'min_data_in_leaf': 192, 'max_depth': 10, 'max_bin': 1800}
# p2={'n_estimators': 1000,'verbose': -1,'objective': 'l2','learning_rate': 0.0632167263149817, 'colsample_bytree': 0.6958033941948067, 'colsample_bynode': 0.6030801666196094, 'lambda_l1': 7.137580620471935, 'lambda_l2': 9.348169401713742, 'min_data_in_leaf': 74, 'max_depth': 11, 'max_bin': 530}
# p3={'n_estimators': 1000,'verbose': -1,'objective': 'l2','learning_rate': 0.061236402165228264, 'colsample_bytree': 0.81427095118471, 'colsample_bynode': 0.6097376843527067, 'lambda_l1': 6.360490880385201, 'lambda_l2': 9.954136008333839, 'min_data_in_leaf': 238, 'max_depth': 13, 'max_bin': 649}
# p4={'n_estimators': 1000,'verbose': -1,'objective': 'l2','learning_rate': 0.06753282378023663, 'colsample_bytree': 0.7508715107428325, 'colsample_bynode': 0.6831819500325418, 'lambda_l1': 8.679353563755722, 'lambda_l2': 6.105008696961338, 'min_data_in_leaf': 198, 'max_depth': 15, 'max_bin': 835}
# p5={'n_estimators': 1000,'verbose': -1,'objective': 'l2','learning_rate': 0.05129380742257108, 'colsample_bytree': 0.5101576947777211, 'colsample_bynode': 0.8052639518604396, 'lambda_l1': 8.087311995794915, 'lambda_l2': 5.067361158677095, 'min_data_in_leaf': 222, 'max_depth': 8, 'max_bin': 97}
# p6={'n_estimators': 900,'verbose': -1,'objective': 'l2','learning_rate': 0.05689066836106983,'colsample_bytree': 0.8915976762048253,'colsample_bynode': 0.5942203285139224,'lambda_l1': 3.6277555139102864,'lambda_l2': 1.6591278779517808,'min_data_in_leaf' : 186,'max_depth': 9,'max_bin': 813,}

# p7={'n_estimators': 1483,'verbose': -1,'objective': 'l2','learning_rate': 0.047463300970785334,'colsample_bytree': 0.5765687465069222,'colsample_bynode': 0.745770069784652,'lambda_l1': 5.569745853175777,'lambda_l2': 0.9051759724463506,'min_data_in_leaf' : 186,'max_depth': 11,'max_bin': 738,}
# # p7={'min_data_in_leaf': 138,   'num_leaves': 68, 'feature_fraction': 0.8935903906747225, 'bagging_fraction': 0.9688930607563444, 'bagging_freq': 1}

In [173]:
# One LightGBM regressor per parameter set p1 .. p4.
lgbp1, lgbp2, lgbp3, lgbp4 = (LGBMRegressor(**params) for params in (p1, p2, p3, p4))

### MOD2 params

In [174]:
# n1={'n_iter':4000,'verbose': -1,'random_state':73,'objective':'tweedie','learning_rate': 0.010339736147758608, 'colsample_bytree': 0.8893063801074632, 'colsample_bynode': 0.4527058263857967, 'lambda_l1': 3.62802063709343, 'lambda_l2': 1.6506819544194185, 'min_data_in_leaf': 63, 'max_depth': 12, 'device':'gpu', 'min_data_per_groups': 59,'num_leaves': 455,"n_jobs" : 4}
# n2={'n_iter':4000,'verbose': -1,'random_state':73,'objective':'tweedie','learning_rate': 0.012090718804096083, 'colsample_bytree': 0.9099770953943448, 'colsample_bynode': 0.4670163857441046, 'lambda_l1': 3.8694606555680705, 'lambda_l2': 1.925712107567988, 'min_data_in_leaf': 68, 'max_depth': 14, 'device':'gpu', 'min_data_per_groups': 69,'num_leaves': 465,"n_jobs" : 4}
# n3={'n_iter':4000,'verbose': -1,'random_state':73,'objective':'tweedie','learning_rate': 0.014559490612977255, 'colsample_bytree': 0.9282791614810814, 'colsample_bynode': 0.4722023075509447, 'lambda_l1': 4.0562458539834125, 'lambda_l2': 2.265053303366992, 'min_data_in_leaf': 73, 'max_depth': 17, 'device':'gpu', 'min_data_per_groups': 79,'num_leaves': 475,"n_jobs" : 4}
# n4={'n_iter':4000,'verbose': -1,'random_state':73,'objective':'tweedie','learning_rate': 0.016908744594789185, 'colsample_bytree': 0.9534875442500248, 'colsample_bynode': 0.4832525869590394, 'lambda_l1': 4.258459131925572, 'lambda_l2': 2.355521088983217, 'min_data_in_leaf': 78, 'max_depth': 21, 'device':'gpu', 'min_data_per_groups': 89,'num_leaves': 485,"n_jobs" : 4}

In [175]:
# !pip install catboost[gpu]

In [176]:
# !pip install catboost-cuda


In [177]:
import catboost as cb  # Gradient boosting library

In [178]:
# Hyper-parameters of the four CatBoost regressors. All of them run for
# 1800 iterations and report MAE as evaluation metric.
c1 = dict(
    iterations=1800,
    eval_metric='MAE',
    learning_rate=0.04094387403153919,
    depth=9,
    l2_leaf_reg=3,
    border_count=160,
    random_strength=2,
    bagging_temperature=0.30238224812191056,
)
c2 = dict(
    iterations=1800,
    eval_metric='MAE',
    learning_rate=0.06054396656784583,
    depth=11,
    l2_leaf_reg=6,
    border_count=134,
    random_strength=9,
    bagging_temperature=0.29607411641626996,
)
c3 = dict(
    learning_rate=0.12358952478027072,
    depth=11,
    l2_leaf_reg=8,
    border_count=191,
    random_strength=3,
    bagging_temperature=0.41774414265586035,
    iterations=1800,
    eval_metric='MAE',
)
c4 = dict(
    learning_rate=0.06258413085998576,
    depth=10,
    l2_leaf_reg=8,
    border_count=211,
    random_strength=6,
    bagging_temperature=0.13029094645654574,
    iterations=1800,
    eval_metric='MAE',
)

In [179]:
# c1={'task_type': 'gpu','learning_rate': 0.04094387403153919, 'depth': 9, 'l2_leaf_reg': 3, 'border_count': 160, 'random_strength': 2, 'bagging_temperature': 0.30238224812191056}
# c2={'task_type': 'gpu','learning_rate': 0.06054396656784583, 'depth': 11, 'l2_leaf_reg': 6, 'border_count': 134, 'random_strength': 9, 'bagging_temperature': 0.29607411641626996}
# c3 = {
#     'learning_rate': 0.12358952478027072,
#     'depth': 11,
#     'l2_leaf_reg': 8,
#     'border_count': 191,
#     'random_strength': 3,
#     'bagging_temperature': 0.41774414265586035,
#     'iterations': 1800,
#     'eval_metric': 'MAE',
#     'task_type': 'gpu',# Добавление поддержки GPU
#     'cat_features': ['county', 'is_business', 'product_type', 'is_consumption', 'category_1']
# }

# c4 = {
#     'learning_rate': 0.06258413085998576,
#     'depth': 10,
#     'l2_leaf_reg': 8,
#     'border_count': 211,
#     'random_strength': 6,
#     'bagging_temperature': 0.13029094645654574,
#     'iterations': 1800,
#     'eval_metric': 'MAE',
#     'task_type': 'gpu',
#     'cat_features': ['county', 'is_business', 'product_type', 'is_consumption', 'category_1']
# }

In [180]:
# One CatBoost regressor per parameter set c1 .. c4, all seeded with random_state=42.
cat1, cat2, cat3, cat4 = (
    cb.CatBoostRegressor(**params, random_state=42) for params in (c1, c2, c3, c4)
)

        

In [181]:
# lgbn1=LGBMRegressor(**n1)
# lgbn2=LGBMRegressor(**n2)
# lgbn3=LGBMRegressor(**n3)
# lgbn4=LGBMRegressor(**n4)

In [182]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Hold out 20% of the rows for early stopping: (X, y) feeds the LightGBM models,
# (X2, y2) - the production rows only - feeds the CatBoost models.
# NOTE(review): the split is shuffled, so validation rows are interleaved in time with training rows
# while the features contain lagged targets; the early-stopping score may be optimistic. Consider a
# time-based hold-out.
# Split your data
Xtr, Xval, ytr, yval = train_test_split(X, y, test_size=0.2, random_state=73,shuffle=True)
X2tr, X2val, y2tr, y2val = train_test_split(X2, y2, test_size=0.2, random_state=73,shuffle=True)

In [183]:
# Fit every LightGBM regressor on the full training split. Training stops once the
# validation score has not improved for 100 rounds; progress is logged every 100 rounds.
for regressor in (lgbp1, lgbp2, lgbp3, lgbp4):
    print('_______________________________________________________')
    print('Start')
    # Fresh callback objects for every model.
    stop_early = lgb.callback.early_stopping(stopping_rounds=100)
    log_progress = lgb.callback.log_evaluation(period=100)
    regressor.fit(
        Xtr,
        ytr,
        eval_set=[(Xval, yval)],
        callbacks=[stop_early, log_progress],
    )

_______________________________________________________
Start
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 65.1581


KeyboardInterrupt: 

In [None]:
# Run a garbage collection pass after the LightGBM training.
import gc
gc.collect()

In [None]:
# for lgbm_model in [lgbn1, lgbn2, lgbn3, lgbn4]:
#     print('_______________________________________________________')
#     print('Start')
#     lgbm_model.fit(X2tr, y2tr, eval_set=[(X2val, y2val)], callbacks=[
#             lgb.callback.early_stopping(stopping_rounds=100),
#             lgb.callback.log_evaluation(period=100),
#         ],)

In [None]:

# Fit every CatBoost regressor on the production-only split (X2 / y2).
catboost_fit_options = dict(
    eval_set=[(X2val, y2val)],
    early_stopping_rounds=100,
    verbose=100,  # report progress every 100 iterations
)
for regressor in (cat1, cat2, cat3, cat4):
    print('_______________________________________________________')
    print('Start')

    # Train the CatBoost model
    regressor.fit(X2tr, y2tr, **catboost_fit_options)

In [None]:
# Run a garbage collection pass after the CatBoost training.
gc.collect()

## Feature Engineering for Test Data

In [None]:
def feat_eng_test(data, client, hist_weather, forecast_weather, electricity, gas, locations):
    """Assemble the feature table for one batch of test rows.

    Mirrors the training feature engineering: calendar features, client
    attributes, gas and electricity prices, and hourly means of the forecast
    and historical weather (over all stations and per county). Unlike the
    training version, no rows are filtered on a target and the client
    ``data_block_id`` is not shifted.

    None of the arguments is modified; the weather frames are copied before
    their coordinates are rounded.

    Parameters
    ----------
    data : pandas.DataFrame
        Test rows; the timestamp column may be named ``prediction_datetime``
        or ``datetime``. Must contain ``row_id`` and ``data_block_id``.
    client, hist_weather, forecast_weather, electricity, gas : pandas.DataFrame
        Companion tables of the batch, each carrying ``data_block_id``.
    locations : pandas.DataFrame
        Lookup with ``Unnamed: 0``, ``longitude``, ``latitude``, ``county``.

    Returns
    -------
    pandas.DataFrame
        Feature table ordered by (year, day, hour) group with a fresh
        ``RangeIndex``; ``row_id`` and ``data_block_id`` are removed.
    """
    group_cols = ['year', 'day', 'hour']
    coord_cols = ['latitude', 'longitude']

    # rename() returns a new frame, so the assignments below never touch the caller's data.
    data = data.rename(columns={'prediction_datetime': 'datetime'})
    data['datetime'] = pd.to_datetime(data['datetime'], utc=True)

    electricity = electricity.rename(columns={'forecast_date': 'datetime'})
    electricity['datetime'] = pd.to_datetime(electricity['datetime'], utc=True)

    locations = locations.drop('Unnamed: 0', axis=1)

    # ---- Forecast weather: attach county, then average per hour ----------
    # Work on a copy so the caller's coordinates are not rounded in place.
    forecast_weather = forecast_weather.copy()
    forecast_weather[coord_cols] = forecast_weather[coord_cols].astype(float).round(1)
    forecast_weather = forecast_weather.merge(locations, how='left', on=['longitude', 'latitude'])
    # Rows with any missing value (including coordinates that matched no county) are removed.
    forecast_weather.dropna(axis=0, inplace=True)
    forecast_weather['county'] = forecast_weather['county'].astype('int64')
    forecast_weather.drop(['origin_datetime', 'latitude', 'longitude', 'hours_ahead', 'data_block_id'], axis=1, inplace=True)
    forecast_weather.rename(columns={'forecast_datetime': 'datetime'}, inplace=True)
    forecast_weather['datetime'] = pd.to_datetime(forecast_weather['datetime'], utc=True)

    forecast_cols = list(forecast_weather.drop(['county', 'datetime'], axis=1).columns)
    forecast_hour = forecast_weather['datetime'].dt.to_period('h')

    # Mean of every forecast column per hour (all stations together).
    forecast_weather_datetime = forecast_weather.groupby([forecast_hour])[forecast_cols].mean().reset_index()
    forecast_weather_datetime['datetime'] = pd.to_datetime(forecast_weather_datetime['datetime'].dt.to_timestamp(), utc=True)

    # Mean of every forecast column per hour and county.
    forecast_weather_datetime_county = forecast_weather.groupby(['county', forecast_hour])[forecast_cols].mean().reset_index()
    forecast_weather_datetime_county['datetime'] = pd.to_datetime(forecast_weather_datetime_county['datetime'].dt.to_timestamp(), utc=True)

    # ---- Historical weather: attach county, then average per hour --------
    hist_weather = hist_weather.copy()
    hist_weather[coord_cols] = hist_weather[coord_cols].astype(float).round(1)
    hist_weather = hist_weather.merge(locations, how='left', on=['longitude', 'latitude'])
    hist_weather.dropna(axis=0, inplace=True)
    hist_weather.drop(['latitude', 'longitude'], axis=1, inplace=True)
    hist_weather['county'] = hist_weather['county'].astype('int64')
    hist_weather['datetime'] = pd.to_datetime(hist_weather['datetime'], utc=True)

    hist_cols = list(hist_weather.drop(['county', 'datetime', 'data_block_id'], axis=1).columns)
    hist_hour = hist_weather['datetime'].dt.to_period('h')
    # One row per distinct (datetime, data_block_id) pair. Merging against the
    # raw table instead would repeat every hourly mean once per weather
    # station before the duplicates are dropped again.
    hist_blocks = hist_weather[['datetime', 'data_block_id']].drop_duplicates()

    # Mean of every historical column per hour (all stations together).
    hist_weather_datetime = hist_weather.groupby([hist_hour])[hist_cols].mean().reset_index()
    hist_weather_datetime['datetime'] = pd.to_datetime(hist_weather_datetime['datetime'].dt.to_timestamp(), utc=True)
    hist_weather_datetime = hist_weather_datetime.merge(hist_blocks, how='left', on='datetime')

    # Mean of every historical column per hour and county.
    hist_weather_datetime_county = hist_weather.groupby(['county', hist_hour])[hist_cols].mean().reset_index()
    hist_weather_datetime_county['datetime'] = pd.to_datetime(hist_weather_datetime_county['datetime'].dt.to_timestamp(), utc=True)
    hist_weather_datetime_county = hist_weather_datetime_county.merge(hist_blocks, how='left', on='datetime')

    # ---- Calendar features ------------------------------------------------
    data['year'] = data['datetime'].dt.year
    data['month'] = data['datetime'].dt.month
    data['day'] = data['datetime'].dt.day
    data['hour'] = data['datetime'].dt.hour
    data['dayofweek'] = data['datetime'].dt.dayofweek
    data['dayofyear'] = data['datetime'].dt.dayofyear

    electricity['hour'] = electricity['datetime'].dt.hour

    # ---- Joins --------------------------------------------------------------
    data = data.merge(client.drop(columns=['date']), how='left', on=['data_block_id', 'county', 'is_business', 'product_type'])
    data = data.merge(gas[['data_block_id', 'lowest_price_per_mwh', 'highest_price_per_mwh']], how='left', on='data_block_id')
    data = data.merge(electricity[['euros_per_mwh', 'hour', 'data_block_id']], how='left', on=['hour', 'data_block_id'])
    data = data.merge(forecast_weather_datetime, how='left', on=['datetime'])
    data = data.merge(forecast_weather_datetime_county, how='left', on=['datetime', 'county'], suffixes=('_fcast_mean', '_fcast_mean_by_county'))

    # Historical weather is joined through (data_block_id, hour) rather than
    # the timestamp, so the timestamp column is dropped once the hour is known.
    hist_weather_datetime['hour'] = hist_weather_datetime['datetime'].dt.hour
    hist_weather_datetime_county['hour'] = hist_weather_datetime_county['datetime'].dt.hour
    hist_weather_datetime.drop_duplicates(inplace=True)
    hist_weather_datetime_county.drop_duplicates(inplace=True)
    hist_weather_datetime.drop('datetime', axis=1, inplace=True)
    hist_weather_datetime_county.drop('datetime', axis=1, inplace=True)

    data = data.merge(hist_weather_datetime, how='left', on=['data_block_id', 'hour'])
    data = data.merge(hist_weather_datetime_county, how='left', on=['data_block_id', 'county', 'hour'], suffixes=('_hist_mean', '_hist_mean_by_county'))

    # ---- Fill gaps inside every (year, day, hour) group --------------------
    # Rows are put in group order (original order kept inside a group) and
    # rows whose group key is missing are discarded, which is what
    # groupby(...).apply(...) produced. Using GroupBy.ffill / bfill avoids
    # relying on how apply() labels its result index.
    group_ids = data.groupby(group_cols).ngroup().to_numpy()
    order = np.argsort(group_ids, kind='stable')
    order = order[group_ids[order] >= 0]
    data = data.iloc[order].reset_index(drop=True)

    value_cols = [col for col in data.columns if col not in group_cols]
    data[value_cols] = data.groupby(group_cols)[value_cols].ffill()
    data[value_cols] = data.groupby(group_cols)[value_cols].bfill()

    data = data.drop(columns=['row_id', 'data_block_id'])

    return data

In [None]:
def create_revealed_targets_test(data, previous_revealed_targets, N_day_lags):
    """Attach the lagged targets ``target_{k}_days_ago`` to a test batch.

    ``previous_revealed_targets`` holds the revealed-target frames, most
    recent first: the frame at position ``i`` supplies the lag
    ``i + 2`` and is left-joined on (hour, prediction_unit_id,
    is_consumption). Lags up to ``N_day_lags`` for which no frame is
    available are added as all-NaN columns.

    The lag columns that have to be created are appended in ascending lag
    order, so the column layout is the same on every call; the models are fed
    positional arrays and depend on it. The frames in
    ``previous_revealed_targets`` are not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Engineered test rows containing ``hour``, ``prediction_unit_id`` and
        ``is_consumption``.
    previous_revealed_targets : list of pandas.DataFrame
        Frames with ``datetime``, ``prediction_unit_id``, ``is_consumption``
        and ``target``.
    N_day_lags : int
        Largest lag (in days) for which a column must exist.

    Returns
    -------
    pandas.DataFrame
        ``data`` with one ``target_{k}_days_ago`` column per lag.
    """
    join_keys = ['hour', 'prediction_unit_id', 'is_consumption']

    for count, revealed_targets in enumerate(previous_revealed_targets):
        day_lag = count + 2
        # assign() builds a new frame, so no 'hour' column is written into
        # the frame kept by the caller.
        lagged = revealed_targets.assign(
            hour=pd.to_datetime(revealed_targets['datetime'], utc=True).dt.hour
        )
        lagged = lagged[join_keys + ['target']]
        lagged = lagged.rename(columns={"target": f"target_{day_lag}_days_ago"})
        data = pd.merge(data, lagged, how='left', on=join_keys)

    # Walk the lags in ascending order instead of iterating a set, whose
    # order is arbitrary and would shuffle the feature columns.
    for day_lag in range(2, N_day_lags + 1):
        lag_column = f"target_{day_lag}_days_ago"
        if lag_column not in data.columns:
            data[lag_column] = np.nan

    return data

## Submission

In [None]:
# Blend weights, one per model, in the order the models are listed below.
lgbm_weights = (0.3, 0.27, 0.23, 0.2)
catboost_weights = (0.25, 0.25, 0.25, 0.25)

# Lagged-target columns used for the row-wise statistics.
lag_columns = [f'target_{i}_days_ago' for i in range(2, N_day_lags+1)]

# Revealed-target frames of the previous iterations, most recent first.
previous_revealed_targets = []
env = enefit.make_env()
iter_test = env.iter_test()
for (test, revealed_targets, client_test, historical_weather_test,
     forecast_weather_test, electricity_test, gas_test,sample_prediction) in iter_test:

    # Rename test set to make consistent with train
    test = test.rename(columns = {'prediction_datetime': 'datetime'})

    # Every frame of one iteration gets the same data_block_id so that the
    # block-based joins in feat_eng_test line up.
    id_column = 'data_block_id'
    for batch_frame in (test, gas_test, electricity_test, historical_weather_test,
                        forecast_weather_test, client_test, revealed_targets):
        batch_frame[id_column] = 0

    data_test = feat_eng_test(test, client_test, historical_weather_test,forecast_weather_test, electricity_test, gas_test, locations)

    data_test['datetime']= pd.to_datetime(data_test['datetime'], utc= True).astype('int64')

    # Keep at most N_day_lags - 1 frames: position i supplies the lag i + 2.
    previous_revealed_targets.insert(0, revealed_targets)
    if len(previous_revealed_targets) == N_day_lags:
        previous_revealed_targets.pop()

    df_test = create_revealed_targets_test(data = data_test.copy(),previous_revealed_targets = previous_revealed_targets.copy(), N_day_lags = N_day_lags)

    # Data transformation - must stay identical to the expressions applied to the training frame.
    # NOTE(review): pi * sin(hour) / 12 treats the raw hour as an angle in radians and is not periodic
    # over 24 hours; it is kept because the models were trained on exactly these features.
    df_test['sin_hour']= (np.pi * np.sin(df_test['hour']) / 12)
    df_test['cos_hour']= (np.pi * np.cos(df_test['hour']) / 12)
    df_test['sin_dayofyear']= (np.pi * np.sin(df_test['dayofyear']) / 183)
    df_test['cos_dayofyear']= (np.pi * np.cos(df_test['dayofyear']) / 183)
    df_test['target_mean']= df_test[lag_columns].mean(1)
    df_test['target_std']= df_test[lag_columns].std(1)
    df_test['target_var']= df_test[lag_columns].var(1)
    for i in to_log:
        df_test[f"log_{i}"]= np.where((df_test[i])!= 0, np.log(df_test[i]),0)
    X_test = df_test.drop('currently_scored', axis= 1).values

    # LightGBM ensemble (fitted on all training rows): every prediction is
    # clipped at 0 and the four models are blended with lgbm_weights.
    target_list = [mod.predict(X_test).clip(0) for mod in [lgbp1, lgbp2, lgbp3, lgbp4]]
    pred = sum(prediction * weight for prediction, weight in zip(target_list, lgbm_weights))
    test['target'] = pred

    # CatBoost ensemble (fitted on the production rows only): same procedure
    # with equal weights. An earlier variant used a second LightGBM ensemble here.
    tsolar_list = [model.predict(X_test).clip(0) for model in [cat1, cat2, cat3, cat4]]
    pred_solar = sum(prediction * weight for prediction, weight in zip(tsolar_list, catboost_weights))
    test['target_solar'] = pred_solar

    gc.collect()

    # Production rows (is_consumption == 0) take the CatBoost prediction,
    # all other rows keep the LightGBM prediction.
    test.loc[test['is_consumption']==0, "target"] = test.loc[test['is_consumption']==0, "target_solar"]
    sample_prediction["target"] = test['target']

    #Sending predictions to the API
    env.predict(sample_prediction)