In [1]:
import polars as pl

In [2]:
raw_bbo_data = pl.read_parquet("data/raw_bbo_data_1.parquet")
raw_bbo_data.head()

index,xltime,bid-price,bid-volume,ask-price,ask-volume,Stock
"datetime[μs, America/New_York]",f64,f64,i64,f64,i64,str
2008-01-02 09:30:04.132 EST,39449.604214,29.68,1,29.69,373,"""C.N"""
2008-01-02 09:30:04.334 EST,39449.604217,29.68,1,29.69,372,"""C.N"""
2008-01-02 09:30:04.423 EST,39449.604218,29.67,1,29.69,361,"""C.N"""
2008-01-02 09:30:04.576 EST,39449.60422,29.67,1,29.69,356,"""C.N"""
2008-01-02 09:30:04.825 EST,39449.604223,29.67,1,29.69,353,"""C.N"""


In [3]:
# To get data types of columns
dtypes_info = raw_bbo_data.dtypes
print(dtypes_info)

info_raw_bbo_data = pl.DataFrame({
    "column": raw_bbo_data.columns,
    "data_type": [str(dtype) for dtype in dtypes_info],
    "Total of Observation for each column": [raw_bbo_data.select(pl.col(column).count()).to_numpy()[0][0] for column in raw_bbo_data.columns]
})

print(info_raw_bbo_data)

[Datetime(time_unit='us', time_zone='America/New_York'), Float64, Float64, Int64, Float64, Int64, String]
shape: (7, 3)
┌────────────┬─────────────────────────────────┬─────────────────────────────────┐
│ column     ┆ data_type                       ┆ Total of Observation for each … │
│ ---        ┆ ---                             ┆ ---                             │
│ str        ┆ str                             ┆ u32                             │
╞════════════╪═════════════════════════════════╪═════════════════════════════════╡
│ index      ┆ Datetime(time_unit='us', time_… ┆ 127927147                       │
│ xltime     ┆ Float64                         ┆ 127927147                       │
│ bid-price  ┆ Float64                         ┆ 127927147                       │
│ bid-volume ┆ Int64                           ┆ 127927147                       │
│ ask-price  ┆ Float64                         ┆ 127927147                       │
│ ask-volume ┆ Int64                           ┆ 1

In [4]:
# Checking for missing values in each column
for column in raw_bbo_data.columns:
    null_count = raw_bbo_data[column].is_null().sum()
    print(f"Column '{column}' has {null_count} missing values")

Column 'index' has 0 missing values
Column 'xltime' has 0 missing values
Column 'bid-price' has 0 missing values
Column 'bid-volume' has 0 missing values
Column 'ask-price' has 0 missing values
Column 'ask-volume' has 0 missing values
Column 'Stock' has 0 missing values


In [5]:
raw_bbo_data = raw_bbo_data.rename({
    "index": "timestamp",
    "bid-price": "bid",
    "ask-price": "ask",
    "bid-volume": "bid_volume",
    "ask-volume": "ask_volume"
})

In [6]:
dataraw_bbo_data_bbo = raw_bbo_data.with_columns(
    pl.col('timestamp').cast(pl.Datetime).alias('timestamp')  # Ensure the 'Time' column is cast to Datetime
)
raw_bbo_data = raw_bbo_data.with_columns([
    # pl.col("timestamp").dt.year().alias("Year"),
    # pl.col("timestamp").dt.month().alias("Month"),
    pl.col("timestamp").dt.day().alias("Day"),
    pl.col("timestamp").dt.hour().alias("Hour"),
    pl.col("timestamp").dt.minute().alias("Minute"),
    # pl.col("timestamp").dt.second().alias("Second"),
])



: 

: 

In [6]:
dataraw_bbo_data_bbo = raw_bbo_data.with_columns(
    pl.col('timestamp').cast(pl.Datetime).alias('timestamp')  # Ensure the 'Time' column is cast to Datetime
)

In [7]:
raw_bbo_data = raw_bbo_data.with_columns(
    raw_bbo_data['timestamp'].dt.strftime('%Y-%m-%d %H:%M').alias('time')
)

raw_bbo_data.head(5)

timestamp,xltime,bid,bid_volume,ask,ask_volume,Stock,time
"datetime[μs, America/New_York]",f64,f64,i64,f64,i64,str,str
2008-01-02 09:30:04.132 EST,39449.604214,29.68,1,29.69,373,"""C.N""","""2008-01-02 09:30"""
2008-01-02 09:30:04.334 EST,39449.604217,29.68,1,29.69,372,"""C.N""","""2008-01-02 09:30"""
2008-01-02 09:30:04.423 EST,39449.604218,29.67,1,29.69,361,"""C.N""","""2008-01-02 09:30"""
2008-01-02 09:30:04.576 EST,39449.60422,29.67,1,29.69,356,"""C.N""","""2008-01-02 09:30"""
2008-01-02 09:30:04.825 EST,39449.604223,29.67,1,29.69,353,"""C.N""","""2008-01-02 09:30"""


In [8]:
raw_bbo_data = raw_bbo_data.with_columns(
        (pl.col('bid') * pl.col('bid_volume')).alias("bid_price_volume"),
        (pl.col('ask') * pl.col('ask_volume')).alias("ask_price_volume")
    )

raw_bbo_data.head(3)

timestamp,xltime,bid,bid_volume,ask,ask_volume,Stock,time,bid_price_volume,ask_price_volume
"datetime[μs, America/New_York]",f64,f64,i64,f64,i64,str,str,f64,f64
2008-01-02 09:30:04.132 EST,39449.604214,29.68,1,29.69,373,"""C.N""","""2008-01-02 09:30""",29.68,11074.37
2008-01-02 09:30:04.334 EST,39449.604217,29.68,1,29.69,372,"""C.N""","""2008-01-02 09:30""",29.68,11044.68
2008-01-02 09:30:04.423 EST,39449.604218,29.67,1,29.69,361,"""C.N""","""2008-01-02 09:30""",29.67,10718.09


In [9]:
aggregate_bbo_data = raw_bbo_data.group_by(["time", "Stock"]).agg([
    (pl.sum("bid_price_volume") / pl.sum("bid_volume")).alias("bid_vwa"),
    (pl.sum("ask_price_volume") / pl.sum("ask_volume")).alias("ask_vwa")
])

aggregate_bbo_data.head(5)


time,Stock,bid_vwa,ask_vwa
str,str,f64,f64
"""2008-04-23 14:26""","""PG.N""",67.096526,67.110915
"""2008-08-07 13:28""","""C.N""",18.899127,18.911981
"""2008-08-25 15:49""","""RTN.N""",59.832404,59.834434
"""2008-11-25 10:38""","""RTN.N""",46.339335,46.4047
"""2008-01-15 12:58""","""TWX.N""",15.95,15.96


In [17]:
aggregate_bbo_data = aggregate_bbo_data.with_columns(
        ((pl.col('bid_vwa') + pl.col('ask_vwa')) / 2).alias("vwap_mid_price")
    )

aggregate_bbo_data.head(5)

time,Stock,bid_vwa,ask_vwa,vwap_mid_price
str,str,f64,f64,f64
"""2008-04-23 14:26""","""PG.N""",67.096526,67.110915,67.10372
"""2008-08-07 13:28""","""C.N""",18.899127,18.911981,18.905554
"""2008-08-25 15:49""","""RTN.N""",59.832404,59.834434,59.833419
"""2008-11-25 10:38""","""RTN.N""",46.339335,46.4047,46.372018
"""2008-01-15 12:58""","""TWX.N""",15.95,15.96,15.955


In [18]:
aggregate_bbo_data.write_parquet("data/vwap_bbo_data_with_mid_prices.parquet")

In [11]:
result = (
    aggregate_bbo_data
    .with_columns(
        (pl.col("time").str.slice(0, 10)).alias("month")  # Extract year and month as 'YYYY-MM'
    )
    .group_by(["Stock", "month"])  # Group by Stock and extracted month
    .agg(pl.count())  # Count rows in each group
    .sort(["Stock", "month"])  # Sort by Stock and month
)


  .agg(pl.count())  # Count rows in each group


### Compare sizes

In [12]:
result

Stock,month,count
str,str,u32
"""C.N""","""2008-01-02""",391
"""C.N""","""2008-01-03""",391
"""C.N""","""2008-01-04""",391
"""C.N""","""2008-01-07""",391
"""C.N""","""2008-01-08""",390
…,…,…
"""WFC.N""","""2008-12-24""",214
"""WFC.N""","""2008-12-26""",390
"""WFC.N""","""2008-12-29""",389
"""WFC.N""","""2008-12-30""",391


In [20]:
result.write_parquet("data/results_vwap.parquet")

In [19]:
aggregate_bbo_data

time,Stock,bid_vwa,ask_vwa,vwap_mid_price
str,str,f64,f64,f64
"""2008-04-23 14:26""","""PG.N""",67.096526,67.110915,67.10372
"""2008-08-07 13:28""","""C.N""",18.899127,18.911981,18.905554
"""2008-08-25 15:49""","""RTN.N""",59.832404,59.834434,59.833419
"""2008-11-25 10:38""","""RTN.N""",46.339335,46.4047,46.372018
"""2008-01-15 12:58""","""TWX.N""",15.95,15.96,15.955
…,…,…,…,…
"""2008-08-01 11:18""","""TWX.N""",14.172429,14.18974,14.181085
"""2008-04-10 12:03""","""TWX.N""",14.549424,14.561791,14.555608
"""2008-01-17 15:52""","""C.N""",24.919062,24.919497,24.919279
"""2008-02-01 10:47""","""MDT.N""",47.143655,47.154791,47.149223
