In [43]:
#%pip install neuralforecast
#%pip install dask[dataframe]
#%pip install dask
#%pip install pandas
#%pip install numpy
#%pip install pyarrow
#%pip install ray
#%pip install optuna
#%pip install scikit-learn
#%pip install torch
#%pip install matplotlib
#%pip install neuralforecast
#%pip install --upgrade optree

### Environment checks (added because I changed servers mid-work)

In [None]:
import platform
import sys
import os
import psutil
import jupyter_core
import subprocess
import pkg_resources
import pandas as pd
import numpy as np
import dask.dataframe as dd
import pyarrow.parquet as pq
from neuralforecast import NeuralForecast
from neuralforecast.models import TFT, PatchTST, Informer, Autoformer, FEDformer, iTransformer, NHITS
from neuralforecast.auto import AutoTFT, AutoPatchTST, AutoInformer, AutoAutoformer, AutoFEDformer, AutoiTransformer, AutoNHITS, AutoTimeXer, AutoVanillaTransformer, AutoTimesNet
from neuralforecast.losses.pytorch import MAE, RMSE
from ray import tune
from ray.tune.integration.pytorch_lightning import TuneReportCheckpointCallback
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
import optuna
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.metrics import r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import torch
from contextlib import redirect_stdout, redirect_stderr
from io import StringIO
import traceback
import pickle
import logging
import glob

In [None]:
def generate_directory_tree(startpath):
    """Render the directory hierarchy below ``startpath`` as indented text.

    The first line is a header naming ``startpath`` exactly as given. Every
    directory found by ``os.walk`` then contributes one line ending in ``/``,
    followed by one line per file it contains. Entries are indented by four
    spaces per nesting level, with files one level deeper than their directory.

    Args:
        startpath: Directory to walk. A trailing path separator is tolerated.

    Returns:
        The tree as a single newline-joined string. If ``startpath`` cannot be
        walked (``os.walk`` yields nothing), only the header line is returned.
    """
    separators = os.sep + (os.altsep or "")
    output = [f"Directory tree starting from: {startpath}"]
    for root, dirs, files in os.walk(startpath):
        # Sorting in place makes os.walk descend in a stable order, so the
        # rendered tree does not depend on filesystem enumeration order.
        dirs.sort()
        # Depth comes from the path relative to the start directory. Stripping
        # the start path with str.replace would remove *every* occurrence of it
        # and miscount when startpath ends with a separator.
        rel = os.path.relpath(root, startpath)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        indent = ' ' * 4 * level
        # Drop trailing separators first; basename("dir/") would be empty.
        dir_name = os.path.basename(root.rstrip(separators))
        output.append(f"{indent}{dir_name}/")
        for f in sorted(files):
            output.append(f"{indent}    {f}")
    return '\n'.join(output)

# Root of the user's workspace whose layout is dumped for reference.
root_path = "/home/jupyter-kohv04@vse.cz/kohv04"

if os.path.exists(root_path):
    # Render the tree first, then persist it next to the notebook
    # (relative path, so it lands in the current working directory).
    tree_output = generate_directory_tree(root_path)
    output_file = "directory_tree.txt"

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(tree_output)

    print(f"Directory tree saved to {output_file}")
else:
    # Nothing to walk: report it and carry on with the other checks.
    print(f"Directory {root_path} does not exist.")

# Environment report: prints interpreter, OS, Jupyter, hardware, virtualenv,
# container and installed-package details so two servers can be compared.

# --- Interpreter and operating system ---
print("Python Version:", sys.version)
print("Operating System:", platform.system(), platform.release())
print("Platform:", platform.platform())
print("Architecture:", platform.architecture())

# --- Jupyter ---
# The "Notebook Directory" is simply the current working directory, and the
# "Config Directory" is the expanded ~/.jupyter path; neither is queried from
# Jupyter itself.
print("\nJupyter Information:")
print("Jupyter Core Version:", jupyter_core.__version__)
print("Jupyter Notebook Directory:", os.getcwd())
print("Jupyter Config Directory:", os.path.expanduser("~/.jupyter"))
print("Jupyter Data Directory:", os.environ.get("JUPYTER_DATA_DIR", "Not set"))
print("Jupyter Runtime Directory:", os.environ.get("JUPYTER_RUNTIME_DIR", "Not set"))

# --- Hardware ---
print("\nHardware Information:")
print("CPU Cores:", psutil.cpu_count(logical=True))
print("Total Memory:", f"{psutil.virtual_memory().total / (1024**3):.2f} GB")
try:
    result = subprocess.run(['nvidia-smi', '--query-gpu=name', '--format=csv'], capture_output=True, text=True)
except FileNotFoundError:
    # The nvidia-smi executable is not on PATH.
    print("GPU Info: No NVIDIA GPU detected or nvidia-smi not installed")
else:
    if result.returncode == 0:
        print("GPU Info:\n", result.stdout)
    else:
        # nvidia-smi exists but could not query a GPU; show its own message
        # instead of an empty "GPU Info" section.
        failure_detail = (result.stderr or result.stdout).strip()
        print(f"GPU Info: nvidia-smi failed (exit code {result.returncode}): {failure_detail}")

# --- Virtual environment ---
# sys.real_prefix is set by legacy virtualenv; for venv, base_prefix differs
# from prefix.
print("\nVirtual Environment:")
if hasattr(sys, 'real_prefix') or (hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix):
    print("Running in a virtual environment")
    print("Virtual Environment Path:", sys.prefix)
else:
    print("Not running in a virtual environment")

# --- Docker ---
# Either signal is sufficient on its own: the /.dockerenv marker file, or the
# word "docker" in this process's cgroup listing. The cgroup file is only read
# when it exists, and a marker file is no longer overruled by a cgroup listing
# that happens not to mention docker.
print("\nDocker Check:")
cgroup_path = "/proc/self/cgroup"
in_docker = os.path.exists("/.dockerenv")
if not in_docker and os.path.isfile(cgroup_path):
    try:
        with open(cgroup_path, "r") as f:
            in_docker = "docker" in f.read()
    except OSError:
        # Unreadable cgroup info: fall back to "not detected".
        in_docker = False

if in_docker:
    print("Running inside a Docker container")
else:
    print("Not running inside a Docker container")

# --- Installed packages ---
installed_packages = [(d.project_name, d.version) for d in pkg_resources.working_set]
print("Installed Python libraries:")
for name, version in sorted(installed_packages):
    print(f"{name}: {version}")

### Config

In [None]:
# Directory layout: everything lives below base_dir.
base_dir = "/home/jupyter-kohv04@vse.cz/kohv04/backtesting_final/"
test_dir = os.path.join(base_dir, "volume_prediction_test")
metadata_dir = os.path.join(base_dir, "metadata")
metrics_dir = os.path.join(test_dir, "metrics")

# Create every output directory up front (no error if it already exists).
for required_dir in (
    test_dir,
    os.path.join(test_dir, "models"),
    os.path.join(test_dir, "predictions"),
    metrics_dir,
    metadata_dir,
):
    os.makedirs(required_dir, exist_ok=True)

# The ticker metadata must be supplied by the user; fail early if it is absent.
metadata_file = os.path.join(metadata_dir, "nasdaq100_ticker_dataset.json")
if not os.path.exists(metadata_file):
    raise FileNotFoundError(
        f"Metadata file {metadata_file} not found. It should be placed in {metadata_dir}"
    )

# Notebook-wide logger: INFO level, timestamped messages.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Volume prediction modelling
- I evaluated several models from neuralforecast on a subset of AAPL data and chose the TFT (Temporal Fusion Transformer) as the best performer.
- The TFT uses a 60-minute lookback window on the time series and is trained on a 15-minute forecast horizon; it can be used for multi-horizon forecasting.
- The model is evaluated within cross-validation (CV).
- Refer to volume_prediction.py

### Processed DataFrame used for modelling

**Schema**:

| Column Name                | Data Type       | Description                                                                 |
|----------------------------|-----------------|-----------------------------------------------------------------------------|
| `ticker`                   | string          | Stock ticker (example: "AAPL")                                                       |
| `timestamp`                | timestamp[ns]   | Timestamp of the 1-minute interval                                          |
| `open`                     | float64         | Normalized opening price (z-score)                                           |
| `high`                     | float64         | Normalized highest price (z-score)                                           |
| `low`                      | float64         | Normalized lowest price (z-score)                                            |
| `close`                    | float64         | Normalized closing price (z-score)                                           |
| `volume`                   | float64         | Normalized trading volume (z-score)                                          |
| `prev_session_high`        | float32         | Normalized previous session high price (z-score)                             |
| `prev_session_low`         | float32         | Normalized previous session low price (z-score)                              |
| `estimated_bid_ask_spread` | float32         | Normalized estimated bid-ask spread (z-score)                                |
| `estimated_obd`            | float32         | Normalized estimated order book depth (z-score)                              |
| `50_day_sma`               | float32         | Normalized 50-day simple moving average (z-score)                            |
| `news_impact`              | int8            | News impact score from standardized data                                     |
| `hour`                     | int64           | Hour of the day (9–16, based on trading hours 9:30 AM–4:00 PM ET)           |
| `day_of_week`              | int64           | Day of the week (0=Monday, 6=Sunday)                                        |
| `minute`                   | int64           | Minute of the hour (0–59)                                                   |
| `time_since_open`          | float64         | Minutes since market open (9:30 AM ET)                                       |
| `is_trading`               | int64           | Binary indicator (1, as data is filtered to trading hours)                   |
| `date`                     | object          | Date extracted from timestamp (for merging regimes)                          |
| `volatility_regime`        | int64           | Label-encoded volatility regime (e.g., 0, 1, 2 for different regimes)        |
| `trend_regime`             | int64           | Label-encoded trend regime (e.g., 0, 1, 2 for different regimes)             |
| `liquidity_regime`         | int64           | Label-encoded liquidity regime (e.g., 0, 1, 2 for different regimes)         |
| `news_impact_regime`       | int8            | News impact score from regimes data      |
| `volume_lag_1` to `volume_lag_15` | float64 | Normalized lagged volume (t-1 to t-60 minutes, z-score)                      |
| `close_lag_1` to `close_lag_15`   | float64 | Normalized lagged closing price (t-1 to t-60 minutes, z-score)               |
| `log_volume`               | float64         | Log-transformed raw volume (np.log1p(raw_volume)) scaled to [-1, 1]          |
| `returns`                  | float64         | Percentage change in closing price ((close_t - close_{t-1}) / close_{t-1})  |

In [None]:
# Run the volume-prediction pipeline as a child process.
#
# The child's stdout and stderr are captured through pipes, so nothing it
# prints reaches the notebook; no redirection of sys.stdout/sys.stderr is
# needed for that. A failing run is reported through logging rather than
# swallowed, but it does not raise, so the notebook keeps running.

# Environment variables meant to quiet TensorFlow and other library output.
# They are set on this process and therefore inherited by the child.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["PYTHONWARNINGS"] = "ignore"
os.environ["NEURALFORECAST_LOG_LEVEL"] = "ERROR"

try:
    # Main
    result = subprocess.run(
        [sys.executable, "/home/jupyter-kohv04@vse.cz/kohv04/volume_prediction.py"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        check=True
    )
except subprocess.CalledProcessError as e:
    # check=True raises on a non-zero exit status. Surface the exit code and
    # the end of the captured stderr so the cause is visible here instead of
    # showing up later as a missing metrics file.
    stderr_tail = "\n".join((e.stderr or "").splitlines()[-20:])
    logging.getLogger(__name__).error(
        "volume_prediction.py exited with code %s; last stderr lines:\n%s",
        e.returncode,
        stderr_tail,
    )

### Filtering aggregated metrics

In [None]:
# Locations of the metrics CSVs; all of them live in the metadata directory.
base_dir = "/home/jupyter-kohv04@vse.cz/kohv04/backtesting_final/"
metadata_dir = os.path.join(base_dir, "metadata")
aggregated_metrics_file, tft_horizon15_file, median_output_file = (
    os.path.join(metadata_dir, csv_name)
    for csv_name in (
        "aggregated_metrics.csv",               # input: all models / horizons
        "tft_horizon15_scaled_metrics.csv",     # output of the filter step
        "tft_horizon15_median_metrics.csv",     # output of the median step
    )
)

def filter_aggregated_metrics(input_file=None, output_file=None, model="TFT", horizon=15):
    """Filter aggregated metrics down to one model/horizon and its scaled metrics.

    Reads the aggregated metrics CSV, keeps the rows whose ``model`` and
    ``horizon`` columns equal the requested values, keeps only the columns
    ``ticker``, ``config``, ``scaled_MAE``, ``scaled_RMSE`` and ``scaled_R2``,
    and writes the result to a CSV without the index.

    Args:
        input_file: CSV to read. Defaults to the module-level
            ``aggregated_metrics_file``.
        output_file: CSV to write. Defaults to the module-level
            ``tft_horizon15_file``.
        model: Value the ``model`` column must equal (default ``"TFT"``).
        horizon: Value the ``horizon`` column must equal (default ``15``).

    Returns:
        The filtered ``pandas.DataFrame``. It may be empty; a warning is
        logged in that case and the CSV is still written.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
    """
    log = logging.getLogger(__name__)
    # Resolve defaults at call time so later changes to the module-level
    # paths are honoured.
    if input_file is None:
        input_file = aggregated_metrics_file
    if output_file is None:
        output_file = tft_horizon15_file

    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Aggregated metrics file {input_file} not found")

    # Loading aggregated metrics
    metrics_df = pd.read_csv(input_file)
    log.info("Loaded %d rows from %s", len(metrics_df), input_file)

    # Filtering for the requested model and horizon, keeping required columns
    selected_rows = (metrics_df["model"] == model) & (metrics_df["horizon"] == horizon)
    filtered_df = metrics_df.loc[
        selected_rows,
        ["ticker", "config", "scaled_MAE", "scaled_RMSE", "scaled_R2"],
    ]

    if filtered_df.empty:
        # An empty result usually means the upstream run produced no matching
        # rows; flag it rather than silently writing a header-only file.
        log.warning(
            "No rows matched model=%r and horizon=%r in %s",
            model, horizon, input_file,
        )

    # Saving filtered metrics
    filtered_df.to_csv(output_file, index=False)
    log.info("Filtered metrics saved to %s with %d rows", output_file, len(filtered_df))

    return filtered_df

def compute_median_metrics(input_file=None, output_file=None):
    """Compute the median of each scaled metric and save it as a one-row CSV.

    Reads the filtered metrics CSV, takes the column-wise median of
    ``scaled_MAE``, ``scaled_RMSE`` and ``scaled_R2``, and writes a single-row
    CSV (no index) with those three columns.

    Args:
        input_file: CSV to read. Defaults to the module-level
            ``tft_horizon15_file``.
        output_file: CSV to write. Defaults to the module-level
            ``median_output_file``.

    Returns:
        A one-row ``pandas.DataFrame`` holding the three medians.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
    """
    log = logging.getLogger(__name__)
    # Resolve defaults at call time so later changes to the module-level
    # paths are honoured.
    if input_file is None:
        input_file = tft_horizon15_file
    if output_file is None:
        output_file = median_output_file

    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file {input_file} not found")

    # Loading the filtered metrics
    metrics_df = pd.read_csv(input_file)
    log.info("Loaded %d rows from %s", len(metrics_df), input_file)

    if metrics_df.empty:
        # A median over zero rows carries no information; make that visible.
        log.warning("%s contains no rows; median metrics are not meaningful", input_file)

    # Computing median for each metric
    median_metrics = metrics_df[["scaled_MAE", "scaled_RMSE", "scaled_R2"]].median().to_dict()
    median_df = pd.DataFrame([median_metrics])

    # Saving median metrics
    median_df.to_csv(output_file, index=False)
    log.info("Median metrics saved to %s", output_file)

    return median_df

# Execute the two-step metrics pipeline. In a notebook kernel __name__ is
# "__main__", so this guard is true when the cell runs.
if __name__ == "__main__":
    # Step 1: write the filtered CSV and keep the frame for inspection.
    filtered_metrics = filter_aggregated_metrics()
    print(f"Generated filtered metrics CSV with {len(filtered_metrics)} rows")

    # Step 2: reads the CSV written by step 1, so it must run after it.
    median_metrics = compute_median_metrics()
    print(f"Generated median metrics CSV with columns: {list(median_metrics.columns)}")