In [2]:
#!/usr/bin/env python3
"""
import_raw_lob_ms.py
────────────────────
Loads a DOM CSV whose first column is *Unix-epoch milliseconds*
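and saves it as a single 2-D float64 NumPy array (rows × 41):
    • ts  → float64  seconds (millisecond resolution preserved)
    • all price/size columns → rounded to float32 precision, then stored
      as float64 (np.column_stack up-casts to the widest input dtype)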
"""

import numpy as np
import pandas as pd
from pathlib import Path

# ───────── configuration ─────────
# Input CSV and output directory, both relative to the notebook's working dir.
CSV_PATH = Path("../data/raw-sierra/dom-datatick.csv")
OUT_DIR  = Path("../data/processed/my_nq")
OUT_DIR.mkdir(parents=True, exist_ok=True)   # no error if it already exists

# Column layout: one timestamp column followed by four groups of N_LEVELS
# columns each, in the order bid prices, bid sizes, ask prices, ask sizes.
N_LEVELS = 10
COLS = ["ts_ms"] + [
    f"{group}{level}"
    for group in ("bid_px", "bid_sz", "ask_px", "ask_sz")
    for level in range(N_LEVELS)
]

# ───────── load ─────────
# `names=COLS` supplies the header, so every line of the file is read as data.
print("📥  reading CSV …")
df = pd.read_csv(CSV_PATH, names=COLS)

# ───────── convert ─────────
# Timestamp: milliseconds → seconds, kept in float64.
ts_sec = df["ts_ms"].astype(np.float64).div(1_000.0).to_numpy()

# Everything after the first column is cast to float32 (N_LEVELS × 4 columns).
feature_frame = df.iloc[:, 1:]
features = feature_frame.astype(np.float32).to_numpy()

# Glue the timestamp column in front of the features → shape (rows, 41).
# NOTE: concatenating float64 with float32 yields a float64 array, so the
# saved features carry float32 precision but occupy float64 storage.
raw = np.concatenate([ts_sec.reshape(-1, 1), features], axis=1)

# ───────── save ─────────
out_path = OUT_DIR / "raw_lob.npy"
np.save(out_path, raw)
print(f"✅ saved {out_path}   {raw.shape[0]:,} rows × {raw.shape[1]} cols")


📥  reading CSV …
✅ saved ..\data\processed\my_nq\raw_lob.npy   4,236,994 rows × 41 cols


In [3]:
#!/usr/bin/env python3
"""
resample_raw_lob_1s.py
──────────────────────
Load **raw_lob.npy** (millisecond-resolved seconds in col-0)  
and down-sample to an even 1-second grid.

Output → raw_lob_1s.npy  
layout unchanged: ts (float64 sec) + 40 LOB features (float32 precision, stored as float64 after stacking).
"""

from pathlib import Path

import numpy as np
import pandas as pd

# ───────── paths ─────────
# The resampled file is written next to the input file.
IN_PATH  = Path("../data/processed/my_nq/raw_lob.npy")
OUT_DIR  = IN_PATH.parent
OUT_PATH = OUT_DIR / "raw_lob_1s.npy"

# ───────── column names ─────────
# One timestamp column followed by bid prices, bid sizes, ask prices and
# ask sizes, N_LEVELS columns each.
N_LEVELS = 10
COLS = (["ts_sec"] +
        [f"bid_px{i}" for i in range(N_LEVELS)] +
        [f"bid_sz{i}" for i in range(N_LEVELS)] +
        [f"ask_px{i}" for i in range(N_LEVELS)] +
        [f"ask_sz{i}" for i in range(N_LEVELS)])  # total = 41

# ───────── load ─────────
print("📥  loading raw_lob.npy …")
raw = np.load(IN_PATH)                                   # (rows, 41)
df  = pd.DataFrame(raw, columns=COLS)

# ─── ts_sec → DateTime index ───
df.index = pd.to_datetime(df["ts_sec"], unit="s", utc=True)
# A stable sort keeps rows that share a timestamp in their original file
# order, so `.last()` below picks the same row on every run (the default
# quicksort gives no such guarantee for ties).
df = df.drop(columns="ts_sec").sort_index(kind="stable")

# ─── 1-second resample ───
# Last observed value per 1-second bin; bins without any row inherit the
# previous bin's values via forward-fill.
# Lower-case "s" is the current offset alias; upper-case "S" is deprecated
# and raises a FutureWarning on recent pandas.
print("⏱️   resampling to 1-second grid …")
df_1s = df.resample("1s").last().ffill()

# ─── rebuild NumPy array ───
# Whole seconds since the Unix epoch, computed with timedelta arithmetic so
# the result does not depend on the index's internal resolution (ns/us/ms/s).
epoch    = pd.Timestamp("1970-01-01", tz="UTC")
ts_sec   = ((df_1s.index - epoch) // pd.Timedelta(seconds=1)).to_numpy(dtype=np.float64)
features = df_1s.astype(np.float32).to_numpy()
# Stacking float64 timestamps with float32 features yields a float64 array.
raw_1s   = np.column_stack([ts_sec, features])           # (rows, 41)

# ─── sanity checks (run BEFORE saving so a bad array never reaches disk) ───
print("🔎  running sanity checks …")
# 1) monotonic 1-second steps
diff = np.diff(raw_1s[:, 0])
assert np.all(diff == 1.0), "timestamps are not strict 1-second increments"
# 2) no NaN / inf
assert np.isfinite(raw_1s).all(), "array contains NaN or ±Inf"
print("   • timestamp step OK")
print("   • no NaN / Inf")

# ─── save ───
np.save(OUT_PATH, raw_1s)
print(f"✅ saved {OUT_PATH}   {raw_1s.shape[0]:,} rows × {raw_1s.shape[1]} cols")

# 3) peek at head / tail
# NOTE: set_printoptions changes NumPy's global print settings for the rest
# of the kernel session, not just for the two prints below.
np.set_printoptions(precision=3, suppress=True)
print("   first 3 rows:\n", raw_1s[:3])
print("   last  3 rows:\n",  raw_1s[-3:])
print("🎉  resampling script finished successfully")


📥  loading raw_lob.npy …
⏱️   resampling to 1-second grid …


  df_1s = df.resample("1S").last().ffill()


✅ saved ..\data\processed\my_nq\raw_lob_1s.npy   955,180 rows × 41 cols
🔎  running sanity checks …
   • timestamp step OK
   • no NaN / Inf
   first 3 rows:
 [[1.729e+09 2.051e+04 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  2.051e+04 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [1.729e+09 2.051e+04 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  2.051e+04 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+

📥  Loading raw_lob_100ms.npy …
   → 9,551,796 snapshots
⚙️  Compiling & running fast labeller …
✅ Saved ..\data\processed\my_nq\labels_30s.npy
   positives: 135,410  (1.42%)
