In [1]:
import polars as pl

In [2]:
raw_bbo_data = pl.read_parquet("data/raw_bbo_data.parquet")
raw_bbo_data.head()

index,xltime,bid-price,bid-volume,ask-price,ask-volume,Stock
"datetime[μs, America/New_York]",f64,f64,i64,f64,i64,str
2008-07-01 09:30:49.866 EDT,39630.563077,52.67,11,52.79,2,"""ABT.N"""
2008-07-01 09:30:49.951 EDT,39630.563078,52.67,11,52.76,2,"""ABT.N"""
2008-07-01 09:30:50.069 EDT,39630.56308,52.67,4,52.76,2,"""ABT.N"""
2008-07-01 09:30:50.079 EDT,39630.56308,52.63,5,52.76,2,"""ABT.N"""
2008-07-01 09:30:50.349 EDT,39630.563083,37.45,9,37.47,8,"""MRK.N"""


In [3]:
# To get data types of columns
dtypes_info = raw_bbo_data.dtypes
print(dtypes_info)

# If you want a more visual summary similar to `.info()` in pandas
info_raw_bbo_data = pl.DataFrame({
    "column": raw_bbo_data.columns,
    "data_type": [str(dtype) for dtype in dtypes_info],
    "Total of Observation for each column": [raw_bbo_data.select(pl.col(column).count()).to_numpy()[0][0] for column in raw_bbo_data.columns]
})

print(info_raw_bbo_data)

[Datetime(time_unit='us', time_zone='America/New_York'), Float64, Float64, Int64, Float64, Int64, String]
shape: (7, 3)
┌────────────┬─────────────────────────────────┬─────────────────────────────────┐
│ column     ┆ data_type                       ┆ Total of Observation for each … │
│ ---        ┆ ---                             ┆ ---                             │
│ str        ┆ str                             ┆ u32                             │
╞════════════╪═════════════════════════════════╪═════════════════════════════════╡
│ index      ┆ Datetime(time_unit='us', time_… ┆ 30966392                        │
│ xltime     ┆ Float64                         ┆ 30966392                        │
│ bid-price  ┆ Float64                         ┆ 30966392                        │
│ bid-volume ┆ Int64                           ┆ 30966392                        │
│ ask-price  ┆ Float64                         ┆ 30966392                        │
│ ask-volume ┆ Int64                           ┆ 3

In [4]:
# Checking for missing values in each column
for column in raw_bbo_data.columns:
    null_count = raw_bbo_data[column].is_null().sum()
    print(f"Column '{column}' has {null_count} missing values")

Column 'index' has 0 missing values
Column 'xltime' has 0 missing values
Column 'bid-price' has 0 missing values
Column 'bid-volume' has 0 missing values
Column 'ask-price' has 0 missing values
Column 'ask-volume' has 0 missing values
Column 'Stock' has 0 missing values


In [5]:
raw_bbo_data = raw_bbo_data.rename({
    "index": "timestamp",
    "bid-price": "bid",
    "ask-price": "ask",
    "bid-volume": "bid_volume",
    "ask-volume": "ask_volume"
})

In [6]:
dataraw_bbo_data_bbo = raw_bbo_data.with_columns(
    pl.col('timestamp').cast(pl.Datetime).alias('timestamp')  # Ensure the 'Time' column is cast to Datetime
)
raw_bbo_data = raw_bbo_data.with_columns([
    # pl.col("timestamp").dt.year().alias("Year"),
    # pl.col("timestamp").dt.month().alias("Month"),
    pl.col("timestamp").dt.day().alias("Day"),
    pl.col("timestamp").dt.hour().alias("Hour"),
    pl.col("timestamp").dt.minute().alias("Minute"),
    # pl.col("timestamp").dt.second().alias("Second"),
])



In [6]:
dataraw_bbo_data_bbo = raw_bbo_data.with_columns(
    pl.col('timestamp').cast(pl.Datetime).alias('timestamp')  # Ensure the 'Time' column is cast to Datetime
)

In [7]:
raw_bbo_data = raw_bbo_data.with_columns(
    raw_bbo_data['timestamp'].dt.strftime('%Y-%m-%d %H:%M').alias('time')
)

raw_bbo_data.head(5)

timestamp,xltime,bid,bid_volume,ask,ask_volume,Stock,time
"datetime[μs, America/New_York]",f64,f64,i64,f64,i64,str,str
2008-07-01 09:30:49.866 EDT,39630.563077,52.67,11,52.79,2,"""ABT.N""","""2008-07-01 09:30"""
2008-07-01 09:30:49.951 EDT,39630.563078,52.67,11,52.76,2,"""ABT.N""","""2008-07-01 09:30"""
2008-07-01 09:30:50.069 EDT,39630.56308,52.67,4,52.76,2,"""ABT.N""","""2008-07-01 09:30"""
2008-07-01 09:30:50.079 EDT,39630.56308,52.63,5,52.76,2,"""ABT.N""","""2008-07-01 09:30"""
2008-07-01 09:30:50.349 EDT,39630.563083,37.45,9,37.47,8,"""MRK.N""","""2008-07-01 09:30"""


In [8]:
raw_bbo_data = raw_bbo_data.with_columns(
        (pl.col('bid') * pl.col('bid_volume')).alias("bid_price_volume"),
        (pl.col('ask') * pl.col('ask_volume')).alias("ask_price_volume")
    )

raw_bbo_data.head(3)

timestamp,xltime,bid,bid_volume,ask,ask_volume,Stock,time,bid_price_volume,ask_price_volume
"datetime[μs, America/New_York]",f64,f64,i64,f64,i64,str,str,f64,f64
2008-07-01 09:30:49.866 EDT,39630.563077,52.67,11,52.79,2,"""ABT.N""","""2008-07-01 09:30""",579.37,105.58
2008-07-01 09:30:49.951 EDT,39630.563078,52.67,11,52.76,2,"""ABT.N""","""2008-07-01 09:30""",579.37,105.52
2008-07-01 09:30:50.069 EDT,39630.56308,52.67,4,52.76,2,"""ABT.N""","""2008-07-01 09:30""",210.68,105.52


In [9]:
aggregate_bbo_data = raw_bbo_data.group_by(["time", "Stock"]).agg([
    (pl.sum("bid_price_volume") / pl.sum("bid_volume")).alias("bid_vwa"),
    (pl.sum("ask_price_volume") / pl.sum("ask_volume")).alias("ask_vwa")
])

aggregate_bbo_data.head(5)


time,Stock,bid_vwa,ask_vwa
str,str,f64,f64
"""2008-08-06 15:39""","""EMR.N""",48.9904,49.007246
"""2008-07-01 13:11""","""HON.N""",49.968058,49.985636
"""2008-07-30 10:05""","""VZ.N""",34.119728,34.14013
"""2008-08-13 12:27""","""ABT.N""",58.477794,58.4896
"""2008-07-11 15:14""","""MRK.N""",36.679088,36.701447


In [10]:
result = (
    aggregate_bbo_data
    .with_columns(
        (pl.col("time").str.slice(0, 10)).alias("month")  # Extract year and month as 'YYYY-MM'
    )
    .group_by(["Stock", "month"])  # Group by Stock and extracted month
    .agg(pl.count())  # Count rows in each group
    .sort(["Stock", "month"])  # Sort by Stock and month
)


  .agg(pl.count())  # Count rows in each group


In [11]:
result

Stock,month,count
str,str,u32
"""ABT.N""","""2008-07-01""",390
"""ABT.N""","""2008-07-02""",390
"""ABT.N""","""2008-07-03""",211
"""ABT.N""","""2008-07-07""",391
"""ABT.N""","""2008-07-08""",390
…,…,…
"""VZ.N""","""2008-12-24""",214
"""VZ.N""","""2008-12-26""",391
"""VZ.N""","""2008-12-29""",391
"""VZ.N""","""2008-12-30""",391
