In [1]:
from IPython.display import display
import pandas as pd
import numpy as np
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from pathlib import Path
from pymongo import MongoClient
from datetime import datetime

from utils.dataset import MarketData, DataPreprocessor
from utils.data_processing import prepare_data
from utils.plotting import (display_performance_metrics, display_trade_log, plot_equity_curve, 
                            plot_model_results_with_trades, plot_recent_predictions, plot_ensemble_statistics, plot_ensemble_predictions_realtime)
from utils.model_utils import create_model, train_model, generate_model_predictions, train_ensemble_models
from utils.mongodb_utils import get_stored_predictions, setup_mongodb_connection, save_recent_predictions
from utils.backtesting import backtest_model_with_metrics

%load_ext autoreload
%autoreload 2

In [2]:
def load_and_preprocess_data():
    """Load the latest market data and build the frames used for modelling.

    Reads the five frames returned by ``MarketData.latest(Path('data'))``,
    puts the auction frame on a daily grid, forward-fills the auction columns
    and passes the result to ``DataPreprocessor.engineer_auction_features``.

    Returns:
        tuple: ``(merged_df, options_df)``. ``merged_df`` is the output of
        ``engineer_auction_features``; ``options_df`` is the options frame
        left-merged with the ``Date`` / ``Auc Price`` columns of ``merged_df``
        and then back-filled.
    """
    # Only the auction and options frames feed the result; the COT, TA and
    # fundamentals frames are unpacked but not used here.
    _cot_df, auction_df, options_df, _ta_df, _fundamentals_df = MarketData.latest(Path('data'))

    # One row per calendar day; days without an auction become NaN rows.
    auction_df = auction_df.set_index('Date').resample('D').mean().reset_index()

    # Skip the first 7 daily rows (presumably a warm-up period -- TODO confirm).
    # The explicit copy makes the assignment below write to an independent
    # frame instead of a slice of the resampled one (no SettingWithCopyWarning).
    auction_df = auction_df.iloc[7:].copy()

    auc_cols = ['Auc Price', 'Median Price', 'Cover Ratio', 'Spot Value',
                'Auction Spot Diff', 'Median Spot Diff', 'Premium/discount-settle']
    # Carry the last observed auction values forward over the NaN days.
    auction_df.loc[:, auc_cols] = auction_df[auc_cols].ffill()

    merged_df = DataPreprocessor.engineer_auction_features(auction_df)

    # Attach the auction price to the options frame; the merge uses the
    # columns the two frames have in common (pandas default when `on` is omitted).
    auc_df = merged_df[['Date', 'Auc Price']].copy()
    options_df = options_df.merge(auc_df, how='left')
    options_df = options_df.bfill()
    return merged_df, options_df

def prepare_data_and_train_model(merged_df):
    """Split the data, fit a model on it and produce predictions.

    Args:
        merged_df: Frame handed unchanged to ``prepare_data``.

    Returns:
        tuple: ``(model, preprocessor, test_df, predictions_df, recent_preds,
        trend)`` -- the fitted model, the preprocessor and test split returned
        by ``prepare_data``, and the three values returned by
        ``generate_model_predictions``.
    """
    OUT_STEPS = 7

    train_df, test_df, val_df, preprocessor = prepare_data(merged_df)

    # Echo the train and test splits for a quick visual check.
    print("Prepared Data\n\n")
    for split in (train_df, test_df):
        print(f"{split}")

    # The model is sized from the number of columns in the test split.
    feature_count = len(test_df.columns)
    model = create_model(feature_count, OUT_STEPS)

    # The training history is not part of the return value.
    train_model(model, train_df, val_df, test_df, preprocessor)

    predictions_df, recent_preds, trend = generate_model_predictions(model, test_df)
    outcome = (model, preprocessor, test_df, predictions_df, recent_preds, trend)
    return outcome



In [None]:
# Build the engineered auction frame (merged_df) and the options frame
# (options_df) via load_and_preprocess_data().
merged_df, options_df = load_and_preprocess_data()

In [None]:
# Count +/-inf entries per column of merged_df and show the first 20 columns.
infinite_cells = merged_df.isin([np.inf, -np.inf])
inf_counts = infinite_cells.sum()
print("Number of inf values in each column FILLED DF4:", inf_counts.head(20), sep="\n")

### Prepare Data and Train Model

In [None]:

# Chronological train / validation / test split followed by z-score scaling.
FEATURES = merged_df.columns.tolist()
LABEL_COLS = ['Auc Price']
# The preprocessor is built from the full column list, which still contains
# 'Date' at this point; 'Date' is removed from FEATURES only afterwards.
preprocessor = DataPreprocessor(features=FEATURES, label_columns=LABEL_COLS, input_width=7, label_width=7, shift=1)
FEATURES = [feature for feature in FEATURES if feature != 'Date']

# Split boundaries: train < VAL_START <= validation < TEST_START <= test.
VAL_START = "2024-01-01"
TEST_START = "2024-04-01"

dates = merged_df['Date']
# The training window stops where validation begins, so the validation rows
# are never part of the data the model is fitted on.
train_df = merged_df[dates < VAL_START].set_index('Date')[FEATURES].copy()
val_df = merged_df[(dates >= VAL_START) & (dates < TEST_START)].set_index('Date')[FEATURES].copy()
test_df = merged_df[dates >= TEST_START].set_index('Date')[FEATURES].copy()


# NORMALIZATION - statistics come from the training split only.
# +/-inf is masked first: a single inf would otherwise make the column mean
# inf and the scaled column NaN.
train_finite = train_df.replace([np.inf, -np.inf], np.nan)
train_mean = train_finite.mean()
# A constant column has std 0; dividing by 1 instead keeps it at 0 after
# centring rather than turning it into NaN/inf.
train_std = train_finite.std().replace(0, 1.0)

train_df = (train_df - train_mean) / train_std
val_df = (val_df - train_mean) / train_std
test_df = (test_df - train_mean) / train_std


# print("Prepared Data\n\n")
# print(f"{train_df}")
# print(f"{test_df}")

In [51]:
# Mask +/-inf as NaN so the per-column mean skips those entries.
train_df_clean = train_df.replace(to_replace=[np.inf, -np.inf], value=np.nan)
train_mean = train_df_clean.mean()

In [54]:
# Sheet names inside the Excel workbook.
COT_SHEET_NAME: str = "COT-G362"
AUCTION_SHEET_NAME: str = "Auction"
OPTIONS_SHEET_NAME: str = "EUA option-G363"
TA_SHEET_NAME: str = "TA"
FUNDAMENTALS_SHEET_NAME: str = "power generation-G355"
ICE_SHEET_NAME: str = "ICE Value"

path = Path('data') / "data_sheet_latest.xlsx"


print("\n\nLOADING AUCTIONS DATA\n")
auction_df = pd.read_excel(path, sheet_name=AUCTION_SHEET_NAME)
cols = ['date', 'auction price', 'median price', 'cover ratio', 'Spot.value',
        'Auction.Spot.diff', 'Median.Spot.diff', 'Premium/discount-settle']
# Copy so the column rename and the Date assignment below act on an
# independent frame, not on a selection of the raw sheet.
auction_df = auction_df[cols].copy()
auction_df.columns = ['Date', 'Auc Price', 'Median Price', 'Cover Ratio',
                    'Spot Value', 'Auction Spot Diff', 'Median Spot Diff',
                    'Premium/discount-settle']

# Parse dates BEFORE touching the .dt accessor: .dt raises on a column that
# is not datetime-like, so the conversion has to come first.
auction_df['Date'] = pd.to_datetime(auction_df['Date'])
auction_df = auction_df[~auction_df['Date'].isna()]
auction_df = auction_df[auction_df['Date'].dt.year >= 2020].copy()


# Per-column count of +/-inf AND NaN cells (the printed label only says "inf").
inf_counts = auction_df.isin([np.inf, -np.inf, np.nan]).sum()
print("Number of inf values in each column FILLED DF 1:")
print(inf_counts.head(20))

# Order chronologically, then put the data on a daily grid; several rows on
# the same day are averaged and days without data become NaN rows.
auction_df = auction_df.sort_values(by='Date').reset_index(drop=True)
auction_df = auction_df.set_index('Date').resample('D').mean().reset_index()



LOADING AUCTIONS DATA

Number of inf values in each column FILLED DF 1:
Date                       0
Auc Price                  0
Median Price               0
Cover Ratio                0
Spot Value                 2
Auction Spot Diff          2
Median Spot Diff           0
Premium/discount-settle    2
dtype: int64


In [55]:
# Load the ICE sheet (first 4 spreadsheet rows skipped) and keep the date
# column plus the OHLC columns. The date column has no header in the sheet,
# hence the pandas-generated name 'Unnamed: 11'.
ice_df = pd.read_excel(path, sheet_name=ICE_SHEET_NAME, skiprows=4)
# Drop the first data row and copy, so the assignments below operate on an
# independent frame (avoids SettingWithCopyWarning on a slice).
ice_df = ice_df[['Unnamed: 11', 'High', 'Open', 'Low', 'Close']].iloc[1:].copy()
ice_df.columns = ['Date', 'High', 'Open', 'Low', 'Close']

ice_df['Date'] = pd.to_datetime(ice_df['Date'])
# Keep rows dated 2000 or later; rows whose date is NaT fail the comparison
# and are dropped as well.
ice_df = ice_df[ice_df['Date'].dt.year >= 2000].copy()
# Strip the time-of-day so the keys line up with the daily auction dates.
ice_df['Date'] = ice_df['Date'].dt.normalize()
# Drop the first remaining row, as the original cell did.
ice_df = ice_df.iloc[1:]

# Outer join: keep every date that appears in either frame.
merged_df = pd.merge(auction_df, ice_df, on='Date', how='outer')

In [59]:
# Fill gaps in the auction columns from the ICE columns and dataset-wide
# averages. Expects merged_df to be the auction/ICE merge: the High, Low and
# Close columns are read below.
df = merged_df.copy()

# Treat +/-inf as missing in the price columns used for the statistics.
for price_col in ['Spot Value', 'Close', 'Auc Price']:
    df[price_col] = df[price_col].replace([np.inf, -np.inf], np.nan)

# Rows where both Spot Value and Close are present. inf was mapped to NaN
# above, so a plain not-NaN test is sufficient.
mask = df['Spot Value'].notna() & df['Close'].notna()
filtered_df = df.loc[mask]

# Spot/Close ratio, restricted to a non-zero Close so no division by zero occurs.
mask2 = filtered_df['Close'] != 0
ratio = filtered_df.loc[mask2, 'Spot Value'] / filtered_df.loc[mask2, 'Close']

# Calculate average differences and ratios
df['Spot_Close_Diff'] = df['Spot Value'] - df['Close']
df['Auc_Spot_Diff'] = df['Auc Price'] - df['Spot Value']
# A zero Spot Value produces +/-inf here; map it to NaN so that the mean
# below is not dragged to inf by a single row.
df['Auc_Spot_Ratio'] = (df['Auc Price'] / df['Spot Value']).replace([np.inf, -np.inf], np.nan)

stats = {
        'Spot_Close_Diff_Mean': df['Spot_Close_Diff'].mean(),
        'Spot_Close_Diff_Median': df['Spot_Close_Diff'].median(),
        'Spot_Close_Ratio_Mean': ratio.mean(),
        'Auc_Spot_Diff_Mean': df['Auc_Spot_Diff'].mean(),
        'Auc_Spot_Diff_Median': df['Auc_Spot_Diff'].median(),
        'Auc_Spot_Ratio_Mean': df['Auc_Spot_Ratio'].mean(),
    }


filled_df = df.copy()

# Missing Spot Value: estimate from Close scaled by the mean Spot/Close ratio.
mask = filled_df['Spot Value'].isna() & filled_df['Close'].notna()
filled_df.loc[mask, 'Spot Value'] = (
    filled_df.loc[mask, 'Close'] * stats['Spot_Close_Ratio_Mean']
)

# Missing Auc Price: Spot Value shifted by the mean auction-minus-spot difference.
mask = filled_df['Auc Price'].isna() & filled_df['Spot Value'].notna()
filled_df.loc[mask, 'Auc Price'] = (
    filled_df.loc[mask, 'Spot Value'] + stats['Auc_Spot_Diff_Mean']
)

# Missing Median Price: midpoint of the ICE High and Low.
mask = filled_df['Median Price'].isna()
filled_df.loc[mask, 'Median Price'] = (
    (filled_df.loc[mask, 'High'] + filled_df.loc[mask, 'Low']) / 2
)

# Recompute the derived columns so they agree with the filled prices.
filled_df['Median Spot Diff'] = filled_df['Median Price'] - filled_df['Spot Value']
filled_df['Auction Spot Diff'] = filled_df['Auc Price'] - filled_df['Spot Value']

filled_df['Premium/discount-settle'] = np.where(
    filled_df['Spot Value'] != 0,
    filled_df['Auction Spot Diff'] / filled_df['Spot Value'],
    0  # a zero Spot Value gets 0 instead of +/-inf
)

# Missing Cover Ratio: median over the trailing 30 rows.
filled_df['Cover Ratio'] = filled_df['Cover Ratio'].fillna(
    filled_df['Cover Ratio'].rolling(window=30, min_periods=1).median()
)

# Keep only the auction columns; copy so the write below targets an
# independent frame rather than a selection of the wider one.
filled_df = filled_df[auction_df.columns].copy()

# Replace negative auction prices with the trailing 30-row mean of the valid
# prices. Negative values are masked out before averaging so the replacement
# is not pulled down by the very values being replaced; a window that holds
# no valid price yields NaN.
mask = filled_df['Auc Price'] < 0
valid_auc_price = filled_df['Auc Price'].mask(mask)
rolling_valid_mean = valid_auc_price.rolling(window=30, min_periods=1).mean()
filled_df.loc[mask, 'Auc Price'] = rolling_valid_mean[mask]



In [73]:
# Rebuild the daily auction frame from the gap-filled data.
auction_df = filled_df.copy()
auction_df = auction_df.set_index('Date').resample('D').mean().reset_index()
# Skip the first 7 daily rows (presumably a warm-up period -- TODO confirm).
# The explicit copy makes the assignment below write to an independent frame
# instead of a slice of the resampled one (no SettingWithCopyWarning).
auction_df = auction_df.iloc[7:].copy()
auc_cols = ['Auc Price', 'Median Price', 'Cover Ratio', 'Spot Value',
            'Auction Spot Diff', 'Median Spot Diff', 'Premium/discount-settle']
# Carry the last observed values forward over days that are still NaN.
auction_df.loc[:, auc_cols] = auction_df[auc_cols].ffill()

In [74]:
# Run the project's feature engineering on the daily, forward-filled auction frame.
# NOTE: this rebinds merged_df, which an earlier cell used for the auction/ICE
# merge; cells that expect the High/Low/Close columns must run before this one.
merged_df = DataPreprocessor.engineer_auction_features(auction_df)

In [81]:
# Count +/-inf and NaN cells per column of merged_df; print the first 20 columns.
non_finite_cells = merged_df.isin([np.inf, -np.inf, np.nan])
inf_counts = non_finite_cells.sum()
print("Number of inf values in each column FILLED DF 1:", inf_counts.head(20), sep="\n")

Number of inf values in each column FILLED DF 1:
Date                       0
Auc Price                  0
Median Price               0
Cover Ratio                0
Spot Value                 0
Auction Spot Diff          0
Median Spot Diff           0
Premium/discount-settle    0
DayOfWeek                  0
Month                      0
Quarter                    0
Auc Price_7d_MA            0
Auc Price_30d_MA           0
Auc Price_7d_std           0
Auc Price_30d_std          0
Auc Price_7d_EMA           0
Auc Price_30d_EMA          0
Median Price_7d_MA         0
Median Price_30d_MA        0
Median Price_7d_std        0
dtype: int64


In [94]:
# Split and normalise through the project preprocessor, then keep only the
# feature columns that are complete in every split.
FEATURES = merged_df.columns.tolist()
LABEL_COLS = ['Auc Price']

preprocessor = DataPreprocessor(features=FEATURES, label_columns=LABEL_COLS, input_width=7, label_width=7, shift=1)
train_df, test_df, val_df = preprocessor.train_test_data(merged_df)
train_df, test_df, val_df = preprocessor.normalize(train_df, test_df, val_df)


# Dropping NaN columns from each split independently can leave the splits
# with different feature sets (a column that is NaN only in one split would
# vanish from that split alone). Keep the columns that are NaN-free in all
# three, in the training split's column order.
complete_in_test = set(test_df.dropna(axis=1).columns)
complete_in_val = set(val_df.dropna(axis=1).columns)
shared_cols = [
    col for col in train_df.dropna(axis=1).columns
    if col in complete_in_test and col in complete_in_val
]

train_df = train_df[shared_cols]
test_df = test_df[shared_cols]
val_df = val_df[shared_cols]

  sqr = _ensure_numeric((avg - values) ** 2)


In [96]:
# Size the model from the test split's column count and the OUT_STEPS setting.
OUT_STEPS = 7
num_features = test_df.shape[1]
model = create_model(num_features, OUT_STEPS)

In [97]:
# Fit the model on the prepared splits; the return value is kept in `history`.
history = train_model(model, train_df, val_df, test_df, preprocessor)
# Generate predictions from the test split with the trained model.
predictions_df, recent_preds, trend = generate_model_predictions(model, test_df)

Epoch 1/40
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6020 - mean_absolute_error: 0.5008 - val_loss: 0.2170 - val_mean_absolute_error: 0.2587
Epoch 2/40
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.3008 - mean_absolute_error: 0.2516 - val_loss: 0.1858 - val_mean_absolute_error: 0.2237
Epoch 3/40
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2762 - mean_absolute_error: 0.2277 - val_loss: 0.1703 - val_mean_absolute_error: 0.2074
Epoch 4/40
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2473 - mean_absolute_error: 0.2107 - val_loss: 0.1616 - val_mean_absolute_error: 0.1964
Epoch 5/40
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2458 - mean_absolute_error: 0.2112 - val_loss: 0.1577 - val_mean_absolute_error: 0.1933
Epoch 6/40
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss

In [10]:
%%javascript
// Set the output area's auto-scroll threshold to 10.
// NOTE(review): relies on the `IPython` JavaScript global being exposed by
// the notebook front end -- confirm it exists in the environment in use.
IPython.OutputArea.auto_scroll_threshold = 10;

<IPython.core.display.Javascript object>

In [80]:
# Let pandas show up to 200 rows before truncating displayed frames.
pd.set_option("display.max_rows", 200)

In [14]:
# Preview the first 20 rows of merged_df.
display(merged_df.head(20))

Unnamed: 0,Date,Auc Price,Median Price,Cover Ratio,Spot Value,Auction Spot Diff,Median Spot Diff,Premium/discount-settle,High,Open,Low,Close
0,2020-01-07,24.06,24.15,1.52,24.23,-0.17,-0.08,-0.007016,,,,
1,2020-01-08,,,,,,,,,,,
2,2020-01-09,24.01,24.01,1.87,24.07,-0.06,-0.06,-0.002493,,,,
3,2020-01-10,24.48,24.47,1.84,24.56,-0.08,-0.09,-0.003257,,,,
4,2020-01-11,,,,,,,,,,,
5,2020-01-12,,,,,,,,,,,
6,2020-01-13,24.03,23.91,2.22,24.14,-0.11,-0.23,-0.004557,,,,
7,2020-01-14,23.74,23.65,1.96,23.77,-0.03,-0.12,-0.001262,,,,
8,2020-01-15,24.2,24.03,1.76,24.09,0.11,-0.06,0.004566,,,,
9,2020-01-16,24.85,24.72,2.08,24.78,0.07,-0.06,0.002825,,,,


In [41]:
# Display the summary-statistics dict (mean/median differences and ratios).
stats

{'Spot_Close_Diff_Mean': 3.422245175936436,
 'Spot_Close_Diff_Median': -0.75,
 'Spot_Close_Ratio_Mean': 0.9866106467713301,
 'Auc_Spot_Diff_Mean': -0.06383165599268065,
 'Auc_Spot_Diff_Median': -0.07000000000000028,
 'Auc_Spot_Ratio_Mean': 0.9988930710074821}

In [52]:
# Count +/-inf entries per column of filled_df and show the first 20 columns.
infinite_cells = filled_df.isin([np.inf, -np.inf])
inf_counts = infinite_cells.sum()
print("Number of inf values in each column FILLED DF:", inf_counts.head(20), sep="\n")

Number of inf values in each column FILLED DF:
Date                       0
Auc Price                  0
Median Price               0
Cover Ratio                0
Spot Value                 0
Auction Spot Diff          0
Median Spot Diff           0
Premium/discount-settle    0
High                       0
Open                       0
Low                        0
Close                      0
Spot_Close_Diff            0
Auc_Spot_Diff              0
Auc_Spot_Ratio             0
dtype: int64


In [None]:
# from IPython.display import display, HTML

# pd.set_option("display.max_rows", None)

# # display(HTML(auction_df.to_html()))
# # 

# display(auction_df)

In [None]:

# NOTE(review): this cell looks like the tail of a classmethod body pasted into
# the notebook: it uses `cls`, which no visible cell defines, and ends in a
# top-level `return`, so it cannot run as a cell in this form -- confirm
# whether it should be deleted or wrapped in its original definition.
filled_df = cls.fill_missing_values(merged_df)
# print(f"Merged DF: {merged_df.head()} | SHAPE: {merged_df.shape}")

# Keep only the columns that auction_df has, after filling.
filled_df = filled_df[auction_df.columns]

# print(f"Filled DF: {filled_df.head()} | SHAPE: {filled_df.shape}")
# Replace negative Auc Price values with the trailing 30-row rolling mean.
# NOTE(review): the rolling mean is taken over the unmodified column, so the
# negative values being replaced are part of their own replacement average.
mask = filled_df['Auc Price'] < 0
filled_df.loc[mask, 'Auc Price'] = filled_df['Auc Price'].rolling(window=30, min_periods=1).mean()


# print(f"Filled DF: {filled_df.head()} | SHAPE: {filled_df.shape}")
return filled_df