In [None]:
%load_ext jupyter_black

In [None]:
import datetime
import os

import pandas as pd
import plotly.graph_objects as go
import pytz
from plotly.subplots import make_subplots

from utils import plot_line, plot_quantiles

In [None]:
# Directory holding the raw capture CSVs. Keep the trailing slash: the load
# cell builds paths by placing the file name directly after this string.
DATA_DIR = "../data/"
# Log directory. NOTE(review): no use of this constant is visible in this part
# of the notebook.
LOG_DIR = "../logs/"
# Directory that receives the derived "<DATA_FILE>-analysis.csv" output
# (trailing slash required for the same reason as DATA_DIR).
PROCESSED_DIR = "./data/"

# Base name, without extension, of the capture file to analyse.
DATA_FILE = "wif-usdc"

### Preprocessing

In [None]:
# Input CSV for the selected capture, and the location the enriched copy of it
# is written to later in the notebook.
data_path = DATA_DIR + DATA_FILE + ".csv"
processed_data_path = PROCESSED_DIR + DATA_FILE + "-analysis.csv"

# Load the raw capture as-is; all derived columns are added in the next cell.
df = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Oracle-derived metrics are only produced when every row carries an oracle
# price. A file without the column at all is treated as "oracle disabled"
# instead of raising KeyError.
oracle_enabled = "oracle_price" in df.columns and bool(
    df["oracle_price"].notna().all()
)

# Epoch milliseconds -> timezone-aware timestamps shown in US/Eastern.
df["timestamp_ms"] = pd.to_datetime(df["timestamp_ms"], unit="ms")
df = df.rename(columns={"timestamp_ms": "timestamp"})
df["timestamp"] = df["timestamp"].dt.tz_localize("UTC").dt.tz_convert("US/Eastern")
# Mergesort is stable: rows that share a timestamp keep their recorded order,
# which the row-to-row diff() change detection later in the notebook relies on.
df = df.sort_values(by="timestamp", kind="mergesort")

# Top-of-book midpoint and quoted spread (absolute and in basis points).
df["midpoint"] = (df["BID1"] + df["ASK1"]) / 2
df["spread"] = df["ASK1"] - df["BID1"]
df["spread_bps"] = df["spread"] / df["midpoint"] * 10000

# Gap between the first and second level on each side, in price and in bps of
# the midpoint.
df["l1_l2_bid_spread"] = df["BID1"] - df["BID2"]
df["l1_l2_bid_spread_bps"] = df["l1_l2_bid_spread"] / df["midpoint"] * 10000

df["l1_l2_ask_spread"] = df["ASK2"] - df["ASK1"]
df["l1_l2_ask_spread_bps"] = df["l1_l2_ask_spread"] / df["midpoint"] * 10000

if oracle_enabled:
    # Signed deviation of the book midpoint from the oracle; the bps figure is
    # relative to the oracle price, not the midpoint.
    df["midpoint_oracle_spread"] = df["midpoint"] - df["oracle_price"]
    df["midpoint_oracle_spread_bps"] = (
        df["midpoint_oracle_spread"] / df["oracle_price"] * 10000
    )

df

In [None]:
# Write the enriched frame to processed_data_path. The parent directory is
# created first because to_csv does not create missing directories, which
# would otherwise break a fresh "Restart & Run All".
os.makedirs(os.path.dirname(processed_data_path) or ".", exist_ok=True)
df.to_csv(processed_data_path, index=False)

### Overview

In [None]:
# Capture length as whole hours plus leftover minutes. Computed from
# total_seconds() so that captures of a day or longer keep their full hour
# count (Timedelta.components.hours only holds the remainder after whole days).
duration = df["timestamp"].max() - df["timestamp"].min()
hours, minutes = divmod(int(duration.total_seconds() // 60), 60)

# Milliseconds between consecutive rows; the first row has no predecessor, so
# its NaN gap is dropped.
time_diffs = df["timestamp"].diff().dt.total_seconds().mul(1000).dropna()

# Deepest price level present in the frame. The whole numeric suffix is parsed
# so that two-digit levels (BID10, ...) are not truncated to their last digit;
# BID_SIZE<i>/ASK_SIZE<i> are skipped here because their suffix is not purely
# numeric, and are assumed to exist for every price level.
N = max(
    int(col[3:])
    for col in df.columns
    if col.startswith(("BID", "ASK")) and col[3:].isdigit()
)

bid_columns = [f"BID{i}" for i in range(1, N + 1)] + [
    f"BID_SIZE{i}" for i in range(1, N + 1)
]
ask_columns = [f"ASK{i}" for i in range(1, N + 1)] + [
    f"ASK_SIZE{i}" for i in range(1, N + 1)
]

if oracle_enabled:
    # A row is an oracle update when its price differs from the previous row.
    # The first row's diff is NaN, which is != 0, so it counts as an update.
    df["oracle_change"] = df["oracle_price"].diff().ne(0)

    oracle_updates = df.loc[df["oracle_change"], "timestamp"]
    oracle_diffs = oracle_updates.diff().dt.total_seconds().mul(1000).dropna()

# Book statistics do not depend on the oracle, so they are computed for every
# capture; the summary cell reads book_diffs in both of its branches.
# NOTE: a NaN diff (first row, or a missing level) is != 0 and therefore
# counts as a change.
df["book_change"] = df[bid_columns + ask_columns].diff().ne(0).any(axis=1)

book_updates = df.loc[df["book_change"], "timestamp"]
book_diffs = book_updates.diff().dt.total_seconds().mul(1000).dropna()

In [None]:
# Headline counts. Each *_diffs series holds the gaps between consecutive
# updates, so n gaps correspond to n + 1 updates.
if oracle_enabled:
    print(
        f"Tracked {len(oracle_diffs) + 1} oracle updates and {len(book_diffs) + 1} book updates for {hours}h {minutes}m"
    )
else:
    print(f"Tracked {len(book_diffs) + 1} book updates for {hours}h {minutes}m")

# (column, label) pairs for every bps metric. The same label is used as the
# trace name and the y-axis title so the two cannot drift apart.
spread_series = [
    ("spread_bps", "Spread (bps)"),
    ("l1_l2_bid_spread_bps", "L1-L2 Bid Spread (bps)"),
    ("l1_l2_ask_spread_bps", "L1-L2 Ask Spread (bps)"),
]
if oracle_enabled:
    spread_series.append(
        ("midpoint_oracle_spread_bps", "Midpoint-Oracle Spread (bps)")
    )

# Update-interval distributions.
plot_quantiles(time_diffs, bins=50, name="Time between Updates (ms)").show()
if oracle_enabled:
    plot_quantiles(oracle_diffs, bins=50, name="Time between Oracle (ms)").show()
    plot_quantiles(book_diffs, bins=50, name="Time between Books (ms)").show()

# Distribution of each bps metric.
for column, label in spread_series:
    plot_quantiles(df[column], bins=25, name=label).show()

# Each bps metric over time.
for column, label in spread_series:
    plot_line(
        df["timestamp"],
        [df[column]],
        [label],
        x_title="Time",
        y_title=label,
        show_legend=False,
    ).show()

# Best bid/ask over time; the oracle trace is added only when oracle data is
# enabled, matching every other oracle-dependent output in this notebook.
level_series = [df["BID1"], df["ASK1"]]
level_names = ["Bid", "Ask"]
if oracle_enabled:
    level_series.append(df["oracle_price"])
    level_names.append("Oracle")

plot_line(
    df["timestamp"],
    level_series,
    level_names,
    x_title="Time",
    y_title="Best Levels",
).show()