In [None]:
import os
import pandas as pd
import numpy as np
from datetime import timedelta
from neuralforecast import NeuralForecast
import logging
import asyncio
import concurrent.futures
import glob
from functools import partial

# Predicting volume
- Generates volume predictions with the trained TFT model for all tickers over the optimization and simulation periods (3 months each, 6 months in total).
- The prediction loop still needs to be optimized for speed.
- Status: unfinished.

In [None]:
# Base directory and loading tickers
# NOTE(review): absolute, user-specific path; the notebook only runs where this directory exists.
base_dir = "/home/jupyter-kohv04@vse.cz/kohv04/backtesting_final/"
metadata_dir = os.path.join(base_dir, "metadata")
# The [17:] slice drops the first 17 tickers of the JSON list.
# NOTE(review): presumably a resume point after a partial earlier run; confirm before a full run.
nasdaq100_tickers = pd.read_json(os.path.join(metadata_dir, "nasdaq100_ticker_dataset.json"))["Ticker"].tolist()[17:]

# Logging: records of level INFO and above are written only to prediction_process.log
# (no console handler is configured); each record includes the process id.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - [%(levelname)s] - [%(process)d] - %(message)s',
    handlers=[
        logging.FileHandler('prediction_process.log')
    ]
)
logger = logging.getLogger(__name__)

# Time parameters: time-of-day bounds used for the is_trading flag (both ends are compared inclusively).
trading_start = pd.to_datetime("09:30:00").time()
trading_end = pd.to_datetime("16:00:00").time()

In [None]:
def create_volume_prediction_dir(ticker):
    """Ensure the ticker's ``volume_prediction`` folder exists and return its path.

    The folder is located under ``base_dir`` inside the ticker's
    ``ticker=<ticker>_standardized`` directory. Missing parent directories are
    created as well; an already existing folder is left untouched.
    """
    relative_path = f"ticker={ticker}_standardized/volume_prediction"
    target_dir = os.path.join(base_dir, relative_path)
    os.makedirs(target_dir, exist_ok=True)
    return target_dir

def preprocess_ticker_data(ticker, period, n_lags=15):
    """Load the bars of one ticker/period together with its fitted preprocessing artifacts.

    Args:
        ticker: Ticker symbol; selects the ``ticker=<ticker>_standardized`` and
            ``ticker=<ticker>`` directories under ``base_dir``.
        period: Sub-directory of the standardized ticker folder holding the
            ``part.*.parquet`` files (the notebook uses "optimization" and "simulation").
        n_lags: Accepted for signature compatibility; not used by this function.

    Returns:
        Tuple ``(df, ticker_regime, scaler, log_volume_scaler, regime_encoders)``:
        the concatenated bars sorted by ``timestamp`` with a fresh 0..n-1 index,
        the regime table with ``date`` converted to ``datetime.date``, the two
        unpickled scalers, and a dict of unpickled encoders keyed by regime column.

    Raises:
        FileNotFoundError: If the period directory contains no ``part.*.parquet`` file.

    Side effects:
        Creates the ticker's ``volume_prediction`` directory if it is missing
        (via ``create_volume_prediction_dir``).
    """
    period_dir = os.path.join(base_dir, f"ticker={ticker}_standardized/{period}")

    # glob returns matches in arbitrary order; sorting keeps the concatenation
    # order (and therefore the order of rows with equal timestamps) reproducible.
    part_files = sorted(glob.glob(os.path.join(period_dir, "part.*.parquet")))
    if not part_files:
        # Fail with the offending directory instead of pandas' generic
        # "No objects to concatenate" error.
        raise FileNotFoundError(f"No 'part.*.parquet' files found in {period_dir}")

    df = pd.concat([pd.read_parquet(f) for f in part_files], ignore_index=True)
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    # Stable sort so ties keep their file order; reset the index so labels match positions.
    df = df.sort_values("timestamp", kind="mergesort").reset_index(drop=True)

    # Regime data
    ticker_regime = pd.read_csv(os.path.join(base_dir, f"ticker={ticker}/{ticker}_regimes.csv"))
    ticker_regime["date"] = pd.to_datetime(ticker_regime["date"]).dt.date

    # Loading scalers and encoders
    # SECURITY: unpickling executes arbitrary code. These files are expected to be
    # artifacts written by this project's own training step; never load untrusted pickles.
    volume_pred_dir = create_volume_prediction_dir(ticker)
    scaler = pd.read_pickle(os.path.join(volume_pred_dir, "scaler.pkl"))
    log_volume_scaler = pd.read_pickle(os.path.join(volume_pred_dir, "log_volume_scaler.pkl"))
    regime_encoders = {
        col: pd.read_pickle(os.path.join(volume_pred_dir, f"{col}_encoder.pkl"))
        for col in ["volatility_regime", "trend_regime", "liquidity_regime"]
    }

    return df, ticker_regime, scaler, log_volume_scaler, regime_encoders

def preprocess_row(window, ticker_regime, scaler, log_volume_scaler, regime_encoders, n_lags=15):
    """Build the one-row NeuralForecast input for the last bar of ``window``.

    The window is copied, enriched with time features, daily regimes, lagged
    volume/close, scaled continuous columns, a transformed log-volume target,
    encoded regimes and returns; only the final row is returned.

    Args:
        window: Bars ordered by ``timestamp``; must contain ``ticker``, ``timestamp``
            and the continuous columns listed in ``base_cont_cols`` below. Not modified.
        ticker_regime: Table with ``date``, the three regime columns and ``news_impact``;
            left-joined on the bar's calendar date.
        scaler: Object whose ``transform`` is applied to the continuous columns
            followed by all volume lags and then all close lags.
        log_volume_scaler: Object whose ``transform`` is applied to ``log_volume``.
        regime_encoders: Mapping regime column name -> object with ``transform``.
        n_lags: Number of lagged volume/close columns to create.

    Returns:
        DataFrame with a single row: ``unique_id``, ``ds``, ``y`` followed by the
        exogenous feature columns.

    Reads the module-level ``trading_start`` / ``trading_end`` bounds.
    """
    regime_cols = ["volatility_regime", "trend_regime", "liquidity_regime"]
    base_cont_cols = ["open", "high", "low", "close", "volume", "estimated_obd", "estimated_bid_ask_spread",
                      "prev_session_high", "prev_session_low", "50_day_sma"]
    lag_steps = list(range(1, n_lags + 1))
    volume_lag_cols = [f"volume_lag_{step}" for step in lag_steps]
    close_lag_cols = [f"close_lag_{step}" for step in lag_steps]

    frame = window.copy()
    stamps = frame["timestamp"]

    # Minutes elapsed since 09:30 of the bar's own day (negative before the open).
    minutes_from_open = (stamps - stamps.dt.floor("D")
                         - pd.Timedelta(hours=9, minutes=30)).dt.total_seconds() / 60
    time_of_day = stamps.dt.time

    # Temporal features
    frame["hour"] = stamps.dt.hour
    frame["day_of_week"] = stamps.dt.dayofweek
    frame["minute"] = stamps.dt.minute
    frame["time_since_open"] = minutes_from_open.clip(lower=0)
    frame["is_trading"] = ((time_of_day >= trading_start) & (time_of_day <= trading_end)).astype(int)
    frame["date"] = stamps.dt.date
    frame["intraday_minute"] = minutes_from_open.astype(int) + 1

    # Daily regimes: dates absent from ticker_regime yield NaN regime values.
    regime_view = ticker_regime[["date", "volatility_regime", "trend_regime", "liquidity_regime", "news_impact"]]
    frame = frame.merge(regime_view, on="date", how="left")
    frame = frame.rename(columns={"news_impact": "news_impact_regime"})

    # Lagged features (leading rows are NaN until imputed further down)
    for step, vol_col, cls_col in zip(lag_steps, volume_lag_cols, close_lag_cols):
        frame[vol_col] = frame["volume"].shift(step)
        frame[cls_col] = frame["close"].shift(step)

    # Scaling continuous features
    cont_cols = base_cont_cols + volume_lag_cols + close_lag_cols
    frame[cont_cols] = scaler.transform(frame[cont_cols])

    # Target: log1p of volume, then the dedicated scaler, then x -> 2x - 1.
    # NOTE(review): "volume" has already been scaled at this point, so the log is taken of the
    # scaled value - confirm this matches how the model was trained.
    frame["log_volume"] = np.log1p(frame["volume"])
    frame["log_volume"] = log_volume_scaler.transform(frame[["log_volume"]]) * 2 - 1

    # Encoding regime data (values are stringified first, so a missing regime becomes "nan")
    for col in regime_cols:
        frame[col] = regime_encoders[col].transform(frame[col].astype(str))

    # Returns of the (already scaled) close; the first row has no predecessor and gets 0.
    frame["returns"] = frame["close"].pct_change().fillna(0)

    # Imputing NaNs for lag features: a window starting before the open is filled with the
    # first bar's value, otherwise each row is filled with its own current value.
    if lag_steps:
        starts_before_open = frame["timestamp"].dt.time.iloc[0] < trading_start
        volume_fill = frame["volume"].iloc[0] if starts_before_open else frame["volume"]
        close_fill = frame["close"].iloc[0] if starts_before_open else frame["close"]
        for vol_col, cls_col in zip(volume_lag_cols, close_lag_cols):
            frame[vol_col] = frame[vol_col].fillna(volume_fill)
            frame[cls_col] = frame[cls_col].fillna(close_fill)

    # Preparing NeuralForecast data for the current (last) row only.
    # NOTE(review): the earlier rows of the window are dropped here, so the caller hands the
    # model a single observation as history - confirm this is intended.
    last_row = frame.iloc[-1:].copy()
    target_part = last_row[["ticker", "timestamp", "log_volume"]].rename(
        columns={"ticker": "unique_id", "timestamp": "ds", "log_volume": "y"})
    exogenous_cols = (
        base_cont_cols
        + ["hour", "day_of_week", "minute", "time_since_open", "is_trading"]
        + regime_cols
        + ["news_impact_regime", "returns", "intraday_minute"]
        + volume_lag_cols
        + close_lag_cols
    )
    return target_part.join(last_row[exogenous_cols])

def _build_future_frame(ticker, last_timestamp, ticker_regime, regime_encoders, horizon):
    """Build the future-exogenous frame for the ``horizon`` minutes following ``last_timestamp``."""
    futr_timestamps = pd.date_range(start=last_timestamp + pd.Timedelta(minutes=1),
                                    periods=horizon, freq="1min")
    futr_data = pd.DataFrame({"ds": futr_timestamps, "unique_id": ticker})
    futr_data["hour"] = futr_data["ds"].dt.hour
    futr_data["day_of_week"] = futr_data["ds"].dt.dayofweek
    futr_data["minute"] = futr_data["ds"].dt.minute
    futr_data["time_since_open"] = ((futr_data["ds"] - futr_data["ds"].dt.floor("D") -
                                    pd.Timedelta(hours=9, minutes=30)).dt.total_seconds() / 60).clip(lower=0)
    futr_data["is_trading"] = ((futr_data["ds"].dt.time >= trading_start) &
                               (futr_data["ds"].dt.time <= trading_end)).astype(int)
    futr_data["date"] = futr_data["ds"].dt.date
    futr_data = futr_data.merge(ticker_regime[["date", "volatility_regime", "trend_regime",
                                               "liquidity_regime", "news_impact"]],
                                on="date", how="left").rename(columns={"news_impact": "news_impact_regime"})

    # Encoding future regime data
    for col in ["volatility_regime", "trend_regime", "liquidity_regime"]:
        futr_data[col] = regime_encoders[col].transform(futr_data[col].astype(str))
    return futr_data


def process_ticker_period(ticker, period, n_lags=15, input_size=60):
    """Predict volume 15 minutes ahead for every in-session bar of one ticker/period.

    Loads the ticker's saved model and data, walks over the bars with a rolling
    window of ``input_size`` rows, asks the model for a forecast at each bar whose
    time of day lies within ``trading_start``..``trading_end``, converts the
    forecasts back from the scaled log space, and writes the original data plus a
    ``pred_volume_15_tft`` column to ``<period>_tft/data.parquet``.

    Args:
        ticker: Ticker symbol.
        period: Data period sub-directory (the notebook uses "optimization" and "simulation").
        n_lags: Number of lag features, forwarded to the preprocessing helpers.
        input_size: Number of rows in each rolling window.

    Returns:
        None. Returns early (after logging) when the model cannot be loaded or
        when no prediction succeeded; in both cases no parquet file is written.

    Notes:
        Individual prediction failures are logged as warnings and skipped.
        Errors raised while loading or preprocessing data are not caught.
    """
    # Forecast step (in minutes) that is extracted and stored; matches the "15" in the
    # output column names. NOTE(review): presumably equal to the horizon the saved
    # model was trained with - confirm.
    horizon = 15

    logger.info(f"Starting processing for {ticker} in {period}")
    volume_pred_dir = create_volume_prediction_dir(ticker)

    # Loading the trained TFT model
    try:
        nf = NeuralForecast.load(path=os.path.join(volume_pred_dir, f"models/{ticker}_model"))
    except Exception as e:
        logger.error(f"Failed to load model for {ticker}: {e}")
        return

    output_dir = os.path.join(base_dir, f"ticker={ticker}_standardized/{period}_tft")
    os.makedirs(output_dir, exist_ok=True)

    # Loading data and preprocessing artifacts
    df, ticker_regime, scaler, log_volume_scaler, regime_encoders = preprocess_ticker_data(ticker, period, n_lags=n_lags)

    # Session mask computed once for all bars. It uses the same comparison as the
    # "is_trading" feature, so bars outside trading hours are skipped *before* the
    # expensive per-window preprocessing instead of after it.
    bar_times = df["timestamp"].dt.time
    in_session = ((bar_times >= trading_start) & (bar_times <= trading_end)).to_numpy()

    # Generating predictions using a rolling window. Starting at input_size - 1
    # guarantees a full window, and stopping `horizon` rows before the end keeps
    # the last `horizon` bars out of the loop.
    predictions_list = []
    for idx in range(input_size - 1, len(df) - horizon):
        # if not in trading hours -> Skip
        if not in_session[idx]:
            continue

        current_ts = df["timestamp"].iloc[idx]

        # Window up to and including the current timestamp (preprocess_row copies it).
        window = df.iloc[max(0, idx - input_size + 1):idx + 1]

        # NOTE(review): preprocess_row returns only the last row of the window, so the
        # model receives a single observation as history - confirm this is intended.
        nf_data = preprocess_row(window, ticker_regime, scaler, log_volume_scaler, regime_encoders, n_lags=n_lags)

        # Preparing future data
        futr_data = _build_future_frame(ticker, current_ts, ticker_regime, regime_encoders, horizon)

        # PREDICTIONS
        try:
            pred = nf.predict(df=nf_data, futr_df=futr_data)
            # Keep only the forecast for the last future timestamp.
            pred_at_horizon = pred[pred["ds"] == futr_data["ds"].iloc[-1]]["TFT"].values[0]
            predictions_list.append({"timestamp": current_ts,
                                     "pred_volume_15_tft_scaled": pred_at_horizon})
        except Exception as e:
            # Best effort: one failed bar must not abort the whole ticker.
            logger.warning(f"Prediction failed at index {idx} for {ticker}: {e}")
            continue

    if not predictions_list:
        logger.warning(f"No predictions generated for {ticker} in {period}")
        return

    # Processing predictions: undo x -> 2x - 1, then the log-volume scaler, then log1p.
    predictions_df = pd.DataFrame(predictions_list)
    predictions_df["pred_volume_15_tft"] = np.expm1(log_volume_scaler.inverse_transform(
        ((predictions_df["pred_volume_15_tft_scaled"] + 1) / 2).values.reshape(-1, 1)).flatten())

    # Merging with original dataset (bars without a prediction get NaN)
    df = df.merge(predictions_df[["timestamp", "pred_volume_15_tft"]], on="timestamp", how="left")

    # Saving to new subfolder
    df.to_parquet(os.path.join(output_dir, "data.parquet"), index=False)
    logger.info(f"Saved updated {period} data for {ticker} to {output_dir}")

async def run_ticker_period(ticker, period, executor, n_lags=15, input_size=60):
    """Run ``process_ticker_period`` for one ticker/period in ``executor`` and await it.

    Args:
        ticker: Ticker symbol passed through to ``process_ticker_period``.
        period: Data period passed through to ``process_ticker_period``.
        executor: ``concurrent.futures`` executor that runs the blocking work.
        n_lags: Passed through positionally to ``process_ticker_period``.
        input_size: Passed through positionally to ``process_ticker_period``.

    Returns:
        None; the worker's return value is discarded. An exception raised by the
        worker propagates to the awaiting caller.
    """
    # Inside a coroutine a loop is always running; get_running_loop() is the
    # recommended accessor there and never creates a loop implicitly.
    loop = asyncio.get_running_loop()
    job = partial(process_ticker_period, ticker, period, n_lags, input_size)
    await loop.run_in_executor(executor, job)

async def process_all_tickers():
    """Process every ticker for both periods in parallel worker processes.

    One job is scheduled per (ticker, period) pair, with the periods
    "optimization" and "simulation". Jobs run in a ``ProcessPoolExecutor``.

    Returns:
        None. Returns immediately (after a warning) when there are no tickers.

    Raises:
        The exception of the first failed job (in scheduling order), after all
        jobs have finished and every failure has been written to the log.
    """
    jobs = [(ticker, period)
            for ticker in nasdaq100_tickers
            for period in ["optimization", "simulation"]]
    if not jobs:
        # ProcessPoolExecutor rejects max_workers=0, so bail out before building a pool.
        logger.warning("No tickers to process; nothing to do")
        return

    # os.cpu_count() may return None when the count cannot be determined.
    cpu_total = os.cpu_count() or 1
    max_workers = min(len(jobs), cpu_total * 2)  # Adjust based on CPU cores
    logger.info(f"Using {max_workers} workers for parallel processing")

    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        # return_exceptions=True lets every job finish and report, instead of
        # surfacing only the first failure and silently dropping the rest.
        outcomes = await asyncio.gather(
            *(run_ticker_period(ticker, period, executor) for ticker, period in jobs),
            return_exceptions=True,
        )

    failures = []
    for (ticker, period), outcome in zip(jobs, outcomes):
        if isinstance(outcome, BaseException):
            logger.error(f"Processing failed for {ticker} in {period}: {outcome!r}")
            failures.append(outcome)

    if failures:
        logger.error(f"{len(failures)} of {len(jobs)} jobs failed")
        # Still raise, as before, so the caller notices that the run was incomplete.
        raise failures[0]

In [None]:
def run_coroutine_blocking(coro):
    """Run ``coro`` to completion and return its result, with or without a running loop.

    ``asyncio.run`` raises ``RuntimeError`` when an event loop is already running
    in the current thread (as is typically the case inside a Jupyter/IPython
    kernel). In that situation the coroutine is executed on a fresh event loop in
    a helper thread while this thread waits for the result.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop in this thread (plain script execution).
        return asyncio.run(coro)

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()


if __name__ == "__main__":
    # Running the asynchronous processing
    run_coroutine_blocking(process_all_tickers())