In [None]:
import os, glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Root directory of the competition dataset; every CSV below is read relative to it.
PATH_DATASET = "/kaggle/input/predict-energy-behavior-of-prosumers"
# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Read the training table, report its row count and preview the first rows.
df_train = pd.read_csv(os.path.join(PATH_DATASET, "train.csv"))
print(f"data size: {df_train.shape[0]}")
display(df_train.head())

In [None]:
# Count missing values per column, drop incomplete rows in place, then show
# the per-column counts before and after side by side.
train_nan_ = df_train.isna().sum()
df_train.dropna(inplace=True)
train_nan = pd.DataFrame({"before": train_nan_, "after": df_train.isna().sum()})
display(train_nan.T)

In [None]:
def extend_datetime_features(df, col_dt="datetime", datetime_offset=None):
    """Add calendar and time-of-day feature columns derived from ``df[col_dt]``.

    The frame is modified in place: ``col_dt`` is converted to datetime and the
    columns ``date``, ``date_year``, ``date_month``, ``date_day``,
    ``date_dayofyear``, ``date_dayofweek``, ``date_weekday``, ``time_hour``,
    ``time_minute``, ``date_delta`` and ``time_delta`` are added.

    Args:
        df: DataFrame holding the timestamp column; mutated in place.
        col_dt: Name of the timestamp column.
        datetime_offset: Reference point for ``date_delta``. Anything accepted
            by ``pd.Timestamp`` (Timestamp, datetime, ISO string). When None,
            the earliest value of ``df[col_dt]`` is used.

    Returns:
        The reference timestamp actually used, so the same offset can be
        applied to other frames later.
    """
    df[col_dt] = pd.to_datetime(df[col_dt])
    # Explicit None check instead of truthiness; normalise everything else so
    # that strings/datetimes can be subtracted from the datetime column.
    if datetime_offset is None:
        datetime_offset = df[col_dt].min()
    else:
        datetime_offset = pd.Timestamp(datetime_offset)

    dt = df[col_dt].dt
    df["date"] = dt.date
    df["date_year"] = dt.year
    df["date_month"] = dt.month
    df["date_day"] = dt.day
    df["date_dayofyear"] = dt.dayofyear
    df["date_dayofweek"] = dt.dayofweek
    # Same values as date_dayofweek; kept because it is part of the produced schema.
    df["date_weekday"] = dt.weekday
    df["time_hour"] = dt.hour
    df["time_minute"] = dt.minute
    # Whole days elapsed since the reference timestamp.
    df["date_delta"] = (df[col_dt] - datetime_offset).dt.days
    # Time of day as fractional hours.
    df["time_delta"] = dt.hour + (dt.minute / 60.)
    return datetime_offset

# Add the calendar features to the training frame (in place) and keep the
# reference timestamp - the earliest training datetime - for reuse on test frames.
datetime_offset = extend_datetime_features(df_train)
display(df_train.head())

In [None]:
# Distribution of the target, with counts on a logarithmic axis.
fig, ax = plt.subplots(figsize=(10, 2.5))
df_train["target"].plot.hist(ax=ax, bins=90, logy=True, xlabel="target", grid=True)

In [None]:
# Daily mean target per (product_type, county), drawn on a log-scaled y axis.
df_train_ = (
    df_train
    .groupby(["date", "product_type", "county"])["target"]
    .mean()
    .reset_index(name="target")
)

fig, ax = plt.subplots(figsize=(12, 4))
sns.lineplot(data=df_train_, x='date', y='target', style="product_type", hue="county", ax=ax)
ax.grid()
ax.set_yscale('log')
ax.legend(loc='center right', bbox_to_anchor=(1.15, 0.5))

In [None]:
# One log-count histogram per derived calendar/time feature column.
for col in [c for c in df_train.columns if c.startswith(("date_", "time_"))]:
    fig, ax = plt.subplots(figsize=(10, 1.5))
    df_train[col].plot.hist(ax=ax, bins=90, logy=True, xlabel=col, grid=True)

In [None]:
# Value counts per identifier-like column: a pie chart when there are fewer
# than 9 distinct values, a bar chart otherwise.
for col in ('county', 'is_business', 'product_type', 'is_consumption', 'data_block_id', 'prediction_unit_id'):
    lbs, counts = np.unique(df_train[col].values, return_counts=True)
    if len(lbs) >= 9:
        fig, ax = plt.subplots(figsize=(10, 2))
        ax.bar(lbs, counts)
        ax.set_xlabel(col)
        ax.grid()
        continue
    fig, ax = plt.subplots(figsize=(6, 3))
    ax.pie(counts, labels=lbs, autopct='%.0f%%')
    ax.set_ylabel(col)

In [None]:
# Read the client table, report its row count and preview the first rows.
df_client = pd.read_csv(os.path.join(PATH_DATASET, "client.csv"))
print(f"data size: {df_client.shape[0]}")
display(df_client.head())

In [None]:
# Read the electricity price table, report its row count and preview it.
df_elect_prices = pd.read_csv(os.path.join(PATH_DATASET, "electricity_prices.csv"))
print(f"data size: {df_elect_prices.shape[0]}")
display(df_elect_prices.head())

In [None]:
# Electricity price over time; the y axis is limited to the range 0..1100.
df_elect_prices['forecast_date'] = pd.to_datetime(df_elect_prices['forecast_date'])
fig, ax = plt.subplots(figsize=(12, 4))
sns.lineplot(data=df_elect_prices, x='forecast_date', y='euros_per_mwh', ax=ax)
ax.set_ylim([0, 1100]), ax.grid()

In [None]:
# Read the gas price table, report its row count and preview it.
df_gas_prices = pd.read_csv(os.path.join(PATH_DATASET, "gas_prices.csv"))
print(f"data size: {df_gas_prices.shape[0]}")
display(df_gas_prices.head())

In [None]:
# Lowest and highest gas price over time, both curves on one shared axis.
df_gas_prices['forecast_date'] = pd.to_datetime(df_gas_prices['forecast_date'])
fig, ax = plt.subplots(figsize=(12, 4))
for col in ('lowest_price_per_mwh', 'highest_price_per_mwh'):
    sns.lineplot(data=df_gas_prices, x='forecast_date', y=col, label=col, ax=ax)
ax.grid()

In [None]:
def expand_price_features(df, df_elect_prices, df_gas_prices):
    """Left-join electricity and gas prices onto ``df`` by its ``date`` column.

    None of the three input frames is modified; the join keys are built on
    copies.

    Args:
        df: Frame with a ``date`` column (anything ``pd.to_datetime`` accepts).
        df_elect_prices: Frame with ``forecast_date`` and ``euros_per_mwh``.
        df_gas_prices: Frame with ``forecast_date``, ``lowest_price_per_mwh``
            and ``highest_price_per_mwh``.

    Returns:
        A new frame: ``df`` with ``date`` converted to datetime plus the three
        price columns (NaN where no price row has an equal ``date``).
    """
    # NOTE(review): the join is an exact timestamp match. If a price table's
    # forecast_date carries a time-of-day component, only rows whose timestamp
    # equals df's date value will match - confirm the granularity of both tables.
    elect = df_elect_prices.assign(date=pd.to_datetime(df_elect_prices['forecast_date']))
    gas = df_gas_prices.assign(date=pd.to_datetime(df_gas_prices['forecast_date']))
    merged = df.assign(date=pd.to_datetime(df['date']))
    merged = merged.merge(elect[['date', 'euros_per_mwh']], how="left", on="date")
    merged = merged.merge(
        gas[['date', 'lowest_price_per_mwh', 'highest_price_per_mwh']], how="left", on="date")
    return merged

# Attach the electricity and gas price columns to the training frame.
df_train = expand_price_features(df_train, df_elect_prices, df_gas_prices)
display(df_train.head())

In [None]:
# Load the forecast and historical weather tables, report their sizes, and
# stack them row-wise into one frame used by the exploratory plots.
df_weather_forecat = pd.read_csv(os.path.join(PATH_DATASET, "forecast_weather.csv"))
print(f"forecast size: {df_weather_forecat.shape[0]}")
df_weather_hist = pd.read_csv(os.path.join(PATH_DATASET, "historical_weather.csv"))
print(f"histirical size: {df_weather_hist.shape[0]}")
df_weather = pd.concat([df_weather_forecat, df_weather_hist])
display(df_weather)

In [None]:
# Number of weather rows per (latitude, longitude) pair, drawn as a scatter
# plot whose marker size encodes the row count.
df_weather_locations = (
    df_weather
    .groupby(["latitude", "longitude"])
    .size()
    .reset_index(name="count")
)
sns.scatterplot(data=df_weather_locations, x="longitude", y="latitude", size="count")
plt.legend(loc='center right', bbox_to_anchor=(1.25, 0.5))

In [None]:
# Log-count histograms for a few columns of the stacked weather frame.
for col in ("hours_ahead", "temperature", "dewpoint"):
    fig, ax = plt.subplots(figsize=(10, 1.5))
    df_weather[col].plot.hist(ax=ax, bins=90, logy=True, xlabel=col, grid=True)

In [None]:
def _weather_keys_and_features(df_weather, col_dt, cols_features):
    """Return ``cols_features`` plus hourly key columns for one weather frame.

    The key columns (``date_year``, ``date_month``, ``date_day``, ``time_hour``)
    are derived from ``df_weather[col_dt]``; the input frame is left untouched.
    """
    stamps = pd.to_datetime(df_weather[col_dt])
    # Plain arrays are assigned so no index alignment is involved.
    return df_weather[cols_features].assign(
        date_year=stamps.dt.year.to_numpy(),
        date_month=stamps.dt.month.to_numpy(),
        date_day=stamps.dt.day.to_numpy(),
        time_hour=stamps.dt.hour.to_numpy(),
    )


def expand_weather_features(df, df_weather_forecat, df_weather_hist):
    """Left-join hourly mean weather features onto ``df``.

    Forecast and historical rows are stacked and averaged per
    (year, month, day, hour); all rows sharing an hour are averaged together,
    no location matching is done. The weather frames passed in are used
    as given (nothing is re-read from disk) and are not modified.

    Args:
        df: Frame with ``date_year``, ``date_month``, ``date_day`` and
            ``time_hour`` columns.
        df_weather_forecat: Forecast weather with a ``forecast_datetime``
            column and the feature columns listed below.
        df_weather_hist: Historical weather with a ``datetime`` column and
            the same feature columns.

    Returns:
        A new frame: ``df`` plus ``temperature``, ``dewpoint``,
        ``cloudcover_mid``, ``cloudcover_total`` and ``snowfall`` (NaN for
        hours without any weather row).
    """
    print(f"forecast size: {len(df_weather_forecat)}")
    print(f"histirical size: {len(df_weather_hist)}")
    cols_datetime = ['date_year', 'date_month', 'date_day', 'time_hour']
    cols_features = ["temperature", "dewpoint", "cloudcover_mid", "cloudcover_total", "snowfall"]
    df_weather = pd.concat([
        _weather_keys_and_features(df_weather_forecat, "forecast_datetime", cols_features),
        _weather_keys_and_features(df_weather_hist, "datetime", cols_features),
    ]).groupby(cols_datetime, as_index=False).mean()
    return df.merge(df_weather, how="left", on=cols_datetime)

# Attach the hourly weather feature columns to the training frame.
df_train = expand_weather_features(df_train, df_weather_forecat, df_weather_hist)
display(df_train.head())

In [None]:
# Missing values per column after the left joins above (unmatched rows yield NaN).
df_train.isnull().sum()

In [None]:
# Correlation heatmap over every column except the two raw date columns.
cols_ = [c for c in df_train.columns if c != "datetime" and c != "date"]
train_corr = df_train[cols_].corr()
_ = sns.heatmap(train_corr)

In [None]:
# Read the example test table, report its row count and preview it.
df_test = pd.read_csv(os.path.join(PATH_DATASET, "example_test_files", "test.csv"))
print(f"data size: {df_test.shape[0]}")
display(df_test.head())

In [None]:
# Peek at the first lines of the example submission file.
!head /kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/sample_submission.csv

In [None]:
# Columns fed to the model. Names left in trailing comments are candidate
# features that are currently switched off.
TRAIN_FEATURES = [
    'county', 'is_business', 'product_type', 'is_consumption',  # 'prediction_unit_id',
    'date_year', 'date_month', 'date_day', 'date_dayofyear', 'date_dayofweek',  # 'date_weekday', 'date_delta',
    'time_hour',  #  'time_minute','time_delta',
    # 'euros_per_mwh', 'lowest_price_per_mwh', 'highest_price_per_mwh',  <- missing in test dataset
    "temperature", "dewpoint", "cloudcover_mid", "cloudcover_total", "snowfall",
]
# Subset of TRAIN_FEATURES that is cast to the pandas "category" dtype before fitting.
COLS_CATEGORY = [
    'county', 'is_business', 'product_type', 'is_consumption',  # 'prediction_unit_id',
    #'date_year', 'date_month', 'date_day', 'date_dayofyear', 'date_dayofweek', 'date_weekday',
]
# Name of the column the model is trained to predict.
TRAIN_TARGET = "target"

In [None]:
# Build the model inputs as an independent frame with the categorical columns
# cast in one step. Assigning into a column slice of df_train instead would
# trigger pandas' SettingWithCopyWarning.
X_all = df_train[TRAIN_FEATURES].astype({col: "category" for col in COLS_CATEGORY})
y_all = df_train[TRAIN_TARGET].values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation.
# NOTE(review): this is a random row split; since the rows carry timestamps,
# a time-based split may give a more realistic validation score - confirm.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42)
display(X_train.head())
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would additionally render "None".
X_train.info()

In [None]:
# Upgrade xgboost from the local package directory given by -f; --no-index
# disables the package index lookup.
!pip install -U -q xgboost -f /kaggle/input/xgboost-python-package --no-index

In [None]:
import time
import xgboost as xgb

print(xgb.__version__)

model = xgb.XGBRegressor(
    device="cuda",
    tree_method="hist",
    enable_categorical=True,
    objective='reg:absoluteerror',
    eval_metric='mae',
    # Stop once the validation metric has not improved for 100 rounds. Given
    # here rather than to fit(): the fit() keyword is deprecated and is no
    # longer accepted by newer XGBoost releases.
    early_stopping_rounds=100,
    # Maximum depth of each tree.
    max_depth=20,
    # Fraction of training rows sampled for each tree.
    subsample=1,
    # Fraction of features sampled for each tree.
    colsample_bytree=1,
    # Upper bound on the number of boosting rounds.
    n_estimators=800,
    # Maximum number of leaves per tree; 0 would mean no limit.
    max_leaves=512,
    # L1 regularization term on weights (xgb's alpha).
    reg_alpha=2.5,
    # L2 regularization term on weights (xgb's lambda).
    reg_lambda=9.5,
    # Importance measure reported by the feature_importances_ property.
    importance_type="total_gain",
    random_state=42,
)

start = time.time()
# Train on the training split, monitoring MAE on the validation split.
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=True)
elapsed = time.time() - start
print(f"training took: {elapsed / 60.} min")
print(model)

# Early stopping requires an eval_set on every fit(). Clear the setting so a
# later refit without validation data keeps working; the best iteration found
# above stays recorded on the fitted booster.
model.set_params(early_stopping_rounds=None)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Predict on the held-out validation split.
predictions = model.predict(X_valid)

# Report the mean absolute error on that split.
valid_mae = mean_absolute_error(y_valid, predictions)
print("Mean Absolute Error:", valid_mae)

In [None]:
# Refit the same estimator on all training rows. No eval_set is passed here,
# so nothing is held out for monitoring during this fit.
model.fit(X_all, y_all)


In [None]:
# NOTE(review): no cell above this one writes submission.csv; presumably it is
# produced by env.predict() in the loop further down, so on a fresh run this
# shows nothing useful - confirm or remove.
!head submission.csv

In [None]:
import enefit
# Create the competition environment and the iterator that yields the test
# batches consumed by the prediction loop below.
env = enefit.make_env()
iter_test = env.iter_test()

In [None]:
from tqdm.auto import tqdm

# Reuse the exact category sets seen in training. Re-deriving categories from
# each test batch can assign different integer codes when a batch does not
# contain every level, and the model consumes those codes.
# NOTE(review): assumes the test columns hold the same raw value types as in
# training; values unseen in training become NaN (missing) - confirm.
category_dtypes = {
    col: pd.CategoricalDtype(categories=X_all[col].cat.categories)
    for col in COLS_CATEGORY
}

counter = 0
for (test, revealed_targets, client, historical_weather,
     forecast_weather, electricity_prices, gas_prices, sample_prediction) in tqdm(iter_test):
    # Same feature pipeline as for training, anchored to the training offset.
    df_test = test.merge(sample_prediction, how="left", on="row_id")
    extend_datetime_features(
        df_test, col_dt="prediction_datetime", datetime_offset=datetime_offset)
    df_test = expand_price_features(df_test, electricity_prices, gas_prices)
    df_test = expand_weather_features(df_test, forecast_weather, historical_weather)
    # astype() returns a new frame, so nothing is assigned into a slice of df_test.
    X_test = df_test[TRAIN_FEATURES].astype(category_dtypes)

    # The joins above must neither reorder nor duplicate rows.
    assert df_test["row_id"].tolist() == sample_prediction["row_id"].tolist()
    # Negative predictions are clipped to zero.
    sample_prediction["target"] = model.predict(X_test).clip(0)
    # Show the first three batches only.
    if counter < 3:
        display(df_test.head())
        display(sample_prediction.head())

    env.predict(sample_prediction)
    counter += 1

In [None]:
# Peek at the first lines of submission.csv after the prediction loop has run.
!head submission.csv