In [None]:
# Standard library imports
import warnings
from pathlib import Path

# Third-party imports
import pandas as pd
import polars as pl
from polars import col, when

# Suppress warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore")

from settings import config
from pull_taq import get_taq_nbbo, get_taq_wct
from transform_taq import extract_features_taq

In [None]:
# Pandas display settings: show up to 30 columns, allow 200-character cells,
# print floats with four decimals, and keep wide frames on a single line.
pd.set_option("display.max_columns", 30)
pd.set_option("display.max_colwidth", 200)
pd.set_option("display.float_format", lambda value: "%.4f" % value)
pd.set_option("display.expand_frame_repr", False)

# Global variables
# Paths and the WRDS user name are read through the project's `config` helper.
# Both directories are created if they do not exist yet (parents included).
# The TAQ pull helpers are already imported in the imports cell at the top,
# so they are not imported again here.
RAW_DATA_DIR = Path(config("RAW_DATA_DIR"))
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR = Path(config("OUTPUT_DIR"))
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
WRDS_USERNAME = config("WRDS_USERNAME")

## 1. Load TAQ data

### 1.1. Quotes Data

Pull quotes data from TAQ database (NBBO)
- Display first 5 rows to confirm data is loaded correctly

In [None]:
# Pull NBBO quotes for a single ticker and day; the ticker is passed as a plain string.
# NOTE(review): use_bars=False presumably requests raw quotes rather than
# aggregated bars — confirm against pull_taq.get_taq_nbbo.
quotes = get_taq_nbbo(
    'SPY',
    date='2024-03-07',
    use_bars=False,
)
display(quotes.head())

### 1.2. Trades Data

Pull trades data from the TAQ database (WCT)
- Display first 5 rows to confirm data is loaded correctly

In [None]:
# Pull trades for the same ticker and day as the quotes; the ticker is passed as a plain string.
trades = get_taq_wct(
    'SPY',
    date='2024-03-07',
)
display(trades.head())

## 2. Data Preparation & Feature Engineering

### 2.1. Merge Quotes and Trades

Asof-join of quotes and trades dataframes
- Match each trade with the most recent quote at or before it. The trades are the left side of the join, so only rows with actual trades are kept; quote timestamps that have no trade are naturally discarded.
- Display first 5 rows to confirm the data is merged correctly

In [None]:
# Asof-join: trades (left, keyed on time_trade) against quotes (right, keyed on time_quote).
# Both sides are sorted on their key first, as merge_asof requires.
quote_cols = ["time_quote", "best_bid", "best_bidsizeshares", "best_ask", "best_asksizeshares"]
df = pd.merge_asof(
    left=trades.sort_values("time_trade"),
    right=quotes.sort_values("time_quote")[quote_cols],
    left_on="time_trade",
    right_on="time_quote",
    # "backward" picks the latest quote stamped at or before the trade
    # (exact timestamp matches are allowed by default).
    direction="backward",
).drop(columns="time_quote")  # the quote timestamp is only needed as the join key
display(df.head())


### 2.2. Feature Engineering

Extracted features:
- **Duration since last trade**: Time since the last trade.
- **Mid price**: Average of the best bid and ask prices.
- **Spread**: Difference between the best ask and bid prices.
- **Size imbalance**: Difference between the best ask and bid sizes divided by their sum.
- **Trade direction (sign)**: +1 for buy, -1 for sell.
- **Next mid price**: Mid price of the next quote.
- **Next mid-price change**: Change in mid price from the current to the next quote.

In [None]:
# Run the merged trades/quotes frame through the project's feature extractor.
# NOTE: `df` is reassigned here, so re-running this cell feeds the already
# transformed frame back into extract_features_taq — re-run the merge cell first.
df = df.pipe(extract_features_taq)
df.head()

## 3. Analysis

