In [None]:
#!/usr/bin/env python3
"""
import_raw_lob_ms.py
────────────────────
Loads a DOM CSV whose first column is *Unix-epoch milliseconds*
and saves it as a NumPy array:
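    • ts  → float64 seconds (millisecond resolution preserved)
    • all price/size columns → rounded to float32 precision; note that the
      saved array itself is float64, because stacking these columns next to
      the float64 timestamp column upcasts the whole matrix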
"""

import numpy as np
import pandas as pd
from pathlib import Path

# ───────── configuration ─────────
N_LEVELS = 10
CSV_PATH = Path("../data/raw-sierra/dom-datatick.csv")
OUT_DIR  = Path("../data/processed/my_nq")

# Column order assigned to the CSV: the millisecond timestamp first, then
# N_LEVELS columns each of bid prices, bid sizes, ask prices and ask sizes.
COLS = ["ts_ms"] + [
    f"{side}_{field}{level}"
    for side in ("bid", "ask")
    for field in ("px", "sz")
    for level in range(N_LEVELS)
]

# The output folder is created before the CSV is read, so it exists even if
# the read below fails.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ───────── load ─────────
print("📥  reading CSV …")
# Passing `names` without `header` makes pandas treat every row of the file,
# including the first one, as data.
df = pd.read_csv(CSV_PATH, names=COLS)

# ───────── convert ─────────
# Timestamp: milliseconds → seconds, held in float64.
ts_sec = df["ts_ms"].astype(np.float64).div(1_000.0).to_numpy()
# Every remaining column is rounded to float32.
features = df[COLS[1:]].astype(np.float32).to_numpy()

# Joining a float64 column with float32 columns promotes the result to
# float64; the matrix has one timestamp column plus 4 * N_LEVELS features.
raw = np.concatenate((ts_sec[:, np.newaxis], features), axis=1)

# ───────── save ─────────
out_path = OUT_DIR / "raw_lob.npy"
np.save(out_path, raw)
n_rows, n_cols = raw.shape
print(f"✅ saved {out_path}   {n_rows:,} rows × {n_cols} cols")


📥  reading CSV …


FileNotFoundError: [Errno 2] No such file or directory: 'data\\raw-sierra\\dom-datatick.csv'

In [None]:
#!/usr/bin/env python3
"""
resample_raw_lob_1s.py
──────────────────────
Load **raw_lob.npy** (millisecond-resolved seconds in col-0)  
and down-sample to an even 1-second grid.

Output → raw_lob_1s.npy  
layout unchanged: ts (float64 sec) + 40 × float32 LOB features.
"""

from pathlib import Path

import numpy as np
import pandas as pd

# ───────── paths ─────────
IN_PATH  = Path("../data/processed/my_nq/raw_lob.npy")
OUT_DIR  = IN_PATH.parent
OUT_PATH = OUT_DIR / "raw_lob_1s.npy"

# ───────── column names ─────────
N_LEVELS = 10
COLS = (["ts_sec"] +
        [f"bid_px{i}" for i in range(N_LEVELS)] +
        [f"bid_sz{i}" for i in range(N_LEVELS)] +
        [f"ask_px{i}" for i in range(N_LEVELS)] +
        [f"ask_sz{i}" for i in range(N_LEVELS)])  # total = 41

# Reference instant used to turn the resampled index back into epoch seconds.
_EPOCH_UTC = pd.Timestamp("1970-01-01", tz="UTC")


def lob_array_to_frame(raw, columns=None):
    """Wrap a raw LOB matrix in a DataFrame indexed by UTC timestamps.

    Parameters
    ----------
    raw : 2-D array whose first column holds epoch seconds (fractions allowed)
    columns : column names, timestamp column first; defaults to ``COLS``

    Returns
    -------
    DataFrame without the timestamp column, sorted chronologically.
    """
    names = COLS if columns is None else list(columns)
    ts_name = names[0]
    df = pd.DataFrame(raw, columns=names)
    df.index = pd.to_datetime(df[ts_name], unit="s", utc=True)
    df = df.drop(columns=ts_name)
    # A stable sort keeps rows that share a timestamp in their input order,
    # so the later `.last()` picks the same row on every run.
    return df.sort_index(kind="mergesort")


def resample_last_1s(df):
    """Down-sample to a 1-second grid.

    Each second takes the last row observed inside it; seconds without any
    row are forward-filled from the previous second.
    """
    # Lower-case "s" is the seconds alias; the upper-case "S" spelling is
    # deprecated since pandas 2.2.
    return df.resample("1s").last().ffill()


def frame_to_lob_array(df_1s):
    """Rebuild the ``ts | features`` matrix from a resampled frame.

    Returns
    -------
    2-D float64 array: epoch seconds in column 0, followed by the feature
    columns after rounding them to float32 precision.
    """
    # Whole seconds since the epoch, computed as a time difference so the
    # result does not depend on the index's internal resolution (an
    # `astype("int64") // 1e9` conversion is only right for nanoseconds).
    ts_sec = ((df_1s.index - _EPOCH_UTC) // pd.Timedelta(seconds=1)).to_numpy(
        dtype=np.float64)
    features = df_1s.astype(np.float32).to_numpy()
    return np.column_stack([ts_sec, features])           # (rows, 41)


if __name__ == "__main__":
    # ───────── load ─────────
    print("📥  loading raw_lob.npy …")
    raw = np.load(IN_PATH)                                   # (rows, 41)
    df = lob_array_to_frame(raw)

    # ─── 1-second resample ───
    print("⏱️   resampling to 1-second grid …")
    df_1s = resample_last_1s(df)

    # ─── rebuild NumPy array ───
    raw_1s = frame_to_lob_array(df_1s)

    # ─── save ───
    np.save(OUT_PATH, raw_1s)
    print(f"✅ saved {OUT_PATH}   {raw_1s.shape[0]:,} rows × {raw_1s.shape[1]} cols")


📥  loading raw_lob.npy …


FileNotFoundError: [Errno 2] No such file or directory: 'data\\processed\\my_nq\\raw_lob.npy'

In [10]:
#!/usr/bin/env python3
"""
make_labels_30s_fast.py
───────────────────────
Create binary labels for NQ DOM snapshots:

 label = 1  ⇐  within the next 30 s
                 • mid-price falls ≥ 4 ticks  (4 × 0.25 = 1.00 pt)
                 • AND never rises > 2 ticks (2 × 0.25 = 0.50 pt)
 label = 0  otherwise

Runs ~20–50× faster than a naive loop by using two monotone deques and Numba.
"""

import numpy as np
from pathlib import Path
from numba import njit
from tqdm import tqdm

# ───────────────────────────────────────────────────────────
# 1. CONFIGURATION
# ───────────────────────────────────────────────────────────
# Input / output locations (relative paths, resolved against the working dir).
DATA_DIR   = Path("../data/processed/my_nq")         # adjust if needed
RAW_FILE   = DATA_DIR / "raw_lob_100ms.npy"          # snapshot matrix; column 0 is read as the timestamp
LABEL_FILE = DATA_DIR / "labels_30s.npy"             # destination of the label vector

# N_LEVELS is used to locate the best-ask column in the snapshot matrix; the
# remaining four values are passed unchanged to label_drop_rule().
N_LEVELS   = 10           # depth recorded in each side of book
TICK_SIZE  = 0.25         # CME NQ
DROP_TICKS = 4            # ≥ 4 ticks down → 1.00 pt
RISE_TICKS = 2            # ≤ 2 ticks up   → 0.50 pt
HORIZON_S  = 30.0         # 30-second horizon


# ───────────────────────────────────────────────────────────
# 2. FAST LABELLER (Numba)
# ───────────────────────────────────────────────────────────
@njit(cache=True)
def label_drop_rule(ts, mid,
                    tick_size, drop_ticks, rise_ticks, horizon):
    """
    Mark each sample whose look-ahead window satisfies the drop rule.

    The series is scanned from the last sample back to the first while two
    monotone index queues track the minimum and maximum mid price among the
    later samples that lie at most `horizon` after the current one.  The
    current sample itself is not part of its own window; when the window is
    empty the sample's own price stands in for both extremes, so it is never
    labelled.

    Parameters
    ----------
    ts         : 1-D float64, seconds (ascending)
    mid        : 1-D float32/64, mid prices aligned with `ts`
    tick_size  : price value of one tick
    drop_ticks : label requires `mid[i] - window_min >= drop_ticks * tick_size`
    rise_ticks : label requires `window_max - mid[i] <= rise_ticks * tick_size`
    horizon    : look-ahead length, in the same unit as `ts`

    Returns
    -------
    labels : uint8 (0/1), same length as ts
    """
    n_obs = ts.shape[0]
    labels = np.zeros(n_obs, np.uint8)

    # price thresholds are loop invariants
    drop_thr = drop_ticks * tick_size
    rise_thr = rise_ticks * tick_size

    # array-backed queues of sample indices; the live part is [head, tail)
    lo_q = np.empty(n_obs, np.int64)     # front = index of the window minimum
    hi_q = np.empty(n_obs, np.int64)     # front = index of the window maximum
    lo_head = 0
    lo_tail = 0
    hi_head = 0
    hi_tail = 0

    i = n_obs - 1
    while i >= 0:                        # latest sample first
        now = ts[i]
        px = mid[i]

        # the front holds the sample furthest in the future: evict it once
        # it lies beyond the horizon
        while lo_head < lo_tail and ts[lo_q[lo_head]] - now > horizon:
            lo_head += 1
        while hi_head < hi_tail and ts[hi_q[hi_head]] - now > horizon:
            hi_head += 1

        # extremes of the look-ahead window, defaulting to the current price
        future_lo = px
        if lo_head < lo_tail:
            future_lo = mid[lo_q[lo_head]]
        future_hi = px
        if hi_head < hi_tail:
            future_hi = mid[hi_q[hi_head]]

        # both conditions of the rule must hold
        if px - future_lo >= drop_thr:
            if future_hi - px <= rise_thr:
                labels[i] = 1

        # enqueue i for the minimum: entries at the back that are not
        # strictly lower than px can never be the minimum again
        while lo_tail > lo_head and mid[lo_q[lo_tail - 1]] >= px:
            lo_tail -= 1
        lo_q[lo_tail] = i
        lo_tail += 1

        # enqueue i for the maximum: same idea with the comparison flipped
        while hi_tail > hi_head and mid[hi_q[hi_tail - 1]] <= px:
            hi_tail -= 1
        hi_q[hi_tail] = i
        hi_tail += 1

        i -= 1

    return labels


# ───────────────────────────────────────────────────────────
# 3. LOAD RAW SNAPSHOTS
# ───────────────────────────────────────────────────────────
print("📥  Loading raw_lob_100ms.npy …")
# Memory-map the snapshot file instead of reading it all at once.
raw = np.load(RAW_FILE, mmap_mode="r")                  # (N, 41)

# Column 0 is the timestamp; the best bid price follows directly and the best
# ask price sits 2 * N_LEVELS columns further on.
bid0_col, ask0_col = 1, 2 * N_LEVELS + 1

ts  = raw[:, 0].astype(np.float64)                      # float-seconds (ms kept)
mid = np.add(raw[:, bid0_col], raw[:, ask0_col]) / 2.0  # mid-price series

print(f"   → {ts.shape[0]:,} snapshots")

# ───────────────────────────────────────────────────────────
# 4. GENERATE LABELS (JIT + RUN)
# ───────────────────────────────────────────────────────────
print("⚙️  Compiling & running fast labeller …")
rule_args = (TICK_SIZE, DROP_TICKS, RISE_TICKS, HORIZON_S)

# A ten-row call first, so JIT compilation happens on a tiny input.
_ = label_drop_rule(ts[:10], mid[:10], *rule_args)

# Then the whole series.
labels = label_drop_rule(ts, mid, *rule_args)

# ───────────────────────────────────────────────────────────
# 5. SAVE
# ───────────────────────────────────────────────────────────
np.save(LABEL_FILE, labels)
# Labels are 0/1, so counting the non-zero entries equals summing them.
pos = int(np.count_nonzero(labels))
pos_share = pos / len(labels)
print(f"✅ Saved {LABEL_FILE}\n"
      f"   positives: {pos:,}  ({pos_share:.2%})")


📥  Loading raw_lob_100ms.npy …
   → 9,551,796 snapshots
⚙️  Compiling & running fast labeller …
✅ Saved ..\data\processed\my_nq\labels_30s.npy
   positives: 135,410  (1.42%)


In [11]:
#!/usr/bin/env python3
"""
make_bucketed.py
────────────────
TLOB-style tokenisation for the resampled NQ DOM:

    • price tokens  : relative tick distance from best bid / ask
    • size  tokens  : log-compressed volumes, quantile-bucketed

Outputs
-------
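bucketed_lob.npy  (int16)   ts | 2×N_LEVELS price_tok | 2×N_LEVELS size_tok
                            NOTE: epoch seconds do not fit in int16, so the ts
                            column only holds the wrapped low 16 bits of the
                            truncated timestamp and cannot be used as a time.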
size_edges.npy    (float32) 129 boundaries used for the volume buckets
"""

import numpy as np
from pathlib import Path

# ───────────────────────────────────────────────────────────
# 1. paths & hyper-params
# ───────────────────────────────────────────────────────────
DATA_DIR     = Path("../data/processed/my_nq")
RAW_FILE     = DATA_DIR / "raw_lob_100ms.npy"
BUCKET_FILE  = DATA_DIR / "bucketed_lob.npy"
EDGES_FILE   = DATA_DIR / "size_edges.npy"
TS_FILE      = DATA_DIR / "bucketed_ts.npy"   # float64 timestamps, row-aligned with BUCKET_FILE

N_LEVELS   = 10           # depth recorded
TICK_SIZE  = 0.25         # CME NQ
MAX_TICKS  = 50           # clip ±50 → 0-100 token range
SZ_BUCKETS = 128          # number of volume buckets 0-127

# ───────────────────────────────────────────────────────────
# 2. column layout of the float snapshots: ts | bid_px | bid_sz | ask_px | ask_sz
# ───────────────────────────────────────────────────────────
bid_px_idx  = np.arange(1, 1 + N_LEVELS)
bid_sz_idx  = np.arange(1 + N_LEVELS, 1 + 2*N_LEVELS)
ask_px_idx  = np.arange(1 + 2*N_LEVELS, 1 + 3*N_LEVELS)
ask_sz_idx  = np.arange(1 + 3*N_LEVELS, 1 + 4*N_LEVELS)


# ───────────────────────────────────────────────────────────
# 3. PRICE TOKENISATION  (relative ticks)
# ───────────────────────────────────────────────────────────
def tokenise_prices(raw, tick_size=TICK_SIZE, max_ticks=MAX_TICKS):
    """Turn bid/ask prices into tick-distance tokens.

    Each bid level is measured downwards from the best bid and each ask level
    upwards from the best ask, in ticks.  Distances are clipped to
    ±`max_ticks` and shifted by `max_ticks`, giving tokens in
    0 … 2*max_ticks.

    Parameters
    ----------
    raw : 2-D float array laid out as ts | bid_px | bid_sz | ask_px | ask_sz
    tick_size : price value of one tick
    max_ticks : clipping distance in ticks

    Returns
    -------
    int16 array of shape (rows, 2 * N_LEVELS): bid tokens then ask tokens.
    """
    best_bid = raw[:, bid_px_idx[0]][:, None]          # (N,1)
    best_ask = raw[:, ask_px_idx[0]][:, None]

    rel_bid = np.round((best_bid - raw[:, bid_px_idx]) / tick_size)
    rel_ask = np.round((raw[:, ask_px_idx] - best_ask) / tick_size)

    # Clip while the values are still floating point.  Casting to int16
    # first overflows for any level more than 32767 ticks from the touch
    # (presumably an empty level reported with price 0 — confirm against the
    # feed), and the overflowed value can land on the wrong side of the clip.
    rel_ticks = np.clip(np.hstack([rel_bid, rel_ask]), -max_ticks, max_ticks)
    return (rel_ticks + max_ticks).astype(np.int16)        # (N, 20)


# ───────────────────────────────────────────────────────────
# 4. SIZE TOKENISATION  (log1p + quantile buckets)
# ───────────────────────────────────────────────────────────
def tokenise_sizes(raw, n_buckets=SZ_BUCKETS):
    """Turn bid/ask sizes into quantile-bucket tokens.

    Sizes are compressed with log1p, then assigned to `n_buckets` buckets
    whose boundaries are the empirical quantiles of all sizes in `raw`.

    Returns
    -------
    size_tok : int16 array of shape (rows, 2 * N_LEVELS), bid then ask
    edges    : float64 array of `n_buckets + 1` boundaries; the outermost two
               are replaced by -inf / +inf so every finite value gets a bucket
    """
    log_sz = np.log1p(raw[:, np.hstack([bid_sz_idx, ask_sz_idx])])
    # build equiprobable edges
    edges = np.quantile(log_sz.ravel(), np.linspace(0, 1, n_buckets + 1))
    edges[0]  = -np.inf                                    # left-open
    edges[-1] =  np.inf                                    # right-open

    size_tok = (np.searchsorted(edges, log_sz, "right") - 1).astype(np.int16)
    return size_tok, edges


# ───────────────────────────────────────────────────────────
# 5. ASSEMBLY
# ───────────────────────────────────────────────────────────
def assemble_bucketed(ts, price_tok, size_tok):
    """Build the int16 matrix  legacy_ts | price_tok | size_tok.

    Epoch timestamps cannot be represented in int16.  Column 0 is kept only
    so the column positions of the token file stay where existing readers
    expect them: it holds the truncated timestamp wrapped to 16 bits, exactly
    as earlier versions of this script wrote it, and must not be used as a
    time.  The matrix is filled in place, so no 64-bit copy of the whole
    token matrix is created.

    Parameters
    ----------
    ts : 1-D array of timestamps, one per row
    price_tok, size_tok : 2-D int16 token arrays with the same row count
    """
    n_price = price_tok.shape[1]
    bucketed = np.empty((len(ts), 1 + n_price + size_tok.shape[1]), np.int16)
    bucketed[:, 0] = np.asarray(ts).astype(np.int64).astype(np.int16)
    bucketed[:, 1:1 + n_price] = price_tok
    bucketed[:, 1 + n_price:] = size_tok
    return bucketed


if __name__ == "__main__":
    # load float snapshots
    raw = np.load(RAW_FILE, mmap_mode="r")        # (N, 41)
    ts_col = raw[:, :1]                           # timestamps as stored in the input

    price_tok = tokenise_prices(raw)
    size_tok, edges = tokenise_sizes(raw)

    # sanity ranges
    assert price_tok.min() >= 0 and price_tok.max() <= 2*MAX_TICKS
    assert size_tok.min()  >= 0 and size_tok.max()  < SZ_BUCKETS

    bucketed = assemble_bucketed(ts_col[:, 0], price_tok, size_tok)

    np.save(BUCKET_FILE, bucketed)
    np.save(EDGES_FILE, edges.astype(np.float32))
    # Full-precision timestamps go to their own file, one entry per token row.
    np.save(TS_FILE, np.asarray(ts_col[:, 0], dtype=np.float64))

    print(f"✅ bucketed_lob.npy : {bucketed.shape}  (tokens)")
    print(f"✅ size_edges.npy   : {len(edges)} edges (0-{SZ_BUCKETS-1} tokens)")
    print(f"✅ bucketed_ts.npy  : {bucketed.shape[0]:,} timestamps (float64)")


✅ bucketed_lob.npy : (9551796, 41)  (tokens)
✅ size_edges.npy   : 129 edges (0-127 tokens)
