In [28]:
import polars as pl
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

### BBO

In [2]:
data_bbo = pl.read_parquet("data/raw_bbo_data.parquet")

In [3]:
data_bbo = data_bbo.drop("xltime")

data_bbo = data_bbo.rename({
    "index": "Time",
    "bid-price" : "Bid_Price",
    "bid-volume" : "Bid_Volume",
    "ask-price" : "Ask_Price",
    "ask-volume" : "Ask_Volume"	
    })


Time,Bid_Price,Bid_Volume,Ask_Price,Ask_Volume,Stock
"datetime[μs, America/New_York]",f64,i64,f64,i64,str
2008-07-01 09:30:07.238999 EDT,32.25,18,32.34,1,"""WAG.N"""
2008-07-01 09:30:07.607999 EDT,32.27,1,32.3,1,"""WAG.N"""
2008-07-01 09:30:11.085 EDT,32.25,18,32.3,1,"""WAG.N"""
2008-07-01 09:30:11.244 EDT,32.25,18,32.36,46,"""WAG.N"""
2008-07-01 09:30:11.299 EDT,32.25,17,32.36,46,"""WAG.N"""


In [4]:
dtypes_info = data_bbo.dtypes
print(dtypes_info)

# If you want a more visual summary similar to `.info()` in pandas
info_data_bbo = pl.DataFrame({
    "column": data_bbo.columns,
    "data_type": [str(dtype) for dtype in dtypes_info],
    "Total of Observation for each column": [data_bbo.select(pl.col(column).count()).to_numpy()[0][0] for column in data_bbo.columns]
})

print(info_data_bbo)

[Datetime(time_unit='us', time_zone='America/New_York'), Float64, Int64, Float64, Int64, String]
shape: (6, 3)
┌────────────┬─────────────────────────────────┬─────────────────────────────────┐
│ column     ┆ data_type                       ┆ Total of Observation for each … │
│ ---        ┆ ---                             ┆ ---                             │
│ str        ┆ str                             ┆ u32                             │
╞════════════╪═════════════════════════════════╪═════════════════════════════════╡
│ Time       ┆ Datetime(time_unit='us', time_… ┆ 32665584                        │
│ Bid_Price  ┆ Float64                         ┆ 32665584                        │
│ Bid_Volume ┆ Int64                           ┆ 32665584                        │
│ Ask_Price  ┆ Float64                         ┆ 32665584                        │
│ Ask_Volume ┆ Int64                           ┆ 32665584                        │
│ Stock      ┆ String                          ┆ 32665584  

In [5]:
# Checking for missing values in each column
for column in data_bbo.columns:
    null_count = data_bbo[column].is_null().sum()
    print(f"Column '{column}' has {null_count} missing values")

Column 'Time' has 0 missing values
Column 'Bid_Price' has 0 missing values
Column 'Bid_Volume' has 0 missing values
Column 'Ask_Price' has 0 missing values
Column 'Ask_Volume' has 0 missing values
Column 'Stock' has 0 missing values


In [8]:
data_bbo = data_bbo.with_columns(
    ((data_bbo['Bid_Price'] + data_bbo['Ask_Price']) / 2 ).alias("Mid_Price"),
    (data_bbo['Bid_Price'] * data_bbo['Bid_Volume']).alias("Bid_Volume_Price"),
    (data_bbo['Ask_Price'] * data_bbo['Ask_Volume']).alias("Ask_Volume_Price")
    )

data_bbo = data_bbo.with_columns(
    (data_bbo['Mid_Price'] / data_bbo['Mid_Price'].shift(1) - 1).alias("Simple_Return"),
    (data_bbo['Mid_Price'] / data_bbo['Mid_Price'].shift(1)).log().alias("Log_Return")
)

In [18]:
data_bbo.describe()

statistic,Time,Bid_Price,Bid_Volume,Ask_Price,Ask_Volume,Stock,Mid_Price,Bid_Volume_Price,Ask_Volume_Price,Simple_Return,Log_Return
str,str,f64,f64,f64,f64,str,f64,f64,f64,f64,f64
"""count""","""32665584""",32665584.0,32665584.0,32665584.0,32665584.0,"""32665584""",32665584.0,32665584.0,32665584.0,32665583.0,32665583.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,"""0""",0.0,0.0,0.0,1.0,1.0
"""mean""","""2008-10-03 21:07:40.878676-04:…",24.007892,31.68592,24.035969,29.999436,,24.021931,383.020028,395.944653,0.047483,-3.5755e-09
"""std""",,10.558782,130.167709,10.566081,137.795939,,10.562424,1343.614515,2194.327816,0.674264,0.259607
"""min""","""2008-07-01 09:30:07.238999-04:…",1.35,0.0,1.36,0.0,"""BK.N""",1.355,0.0,0.0,-0.952546,-3.047993
"""25%""","""2008-08-19 12:28:29.193000-04:…",16.52,2.0,16.53,2.0,,16.525,43.56,41.72,0.0,0.0
"""50%""","""2008-10-09 14:24:31.788000-04:…",27.1,5.0,27.14,5.0,,27.125,120.44,118.44,0.0,0.0
"""75%""","""2008-11-18 12:49:33.793000-05:…",32.7,20.0,32.73,20.0,,32.715,350.57,343.6,0.0,0.0
"""max""","""2008-12-31 16:00:00.439000-05:…",40.63,17913.0,42.01,13216.0,"""WAG.N""",41.01,331211.37,266068.0,20.073016,3.047993


In [None]:
data_bbo = data_bbo.with_columns(
    pl.col('Time').cast(pl.Datetime).alias('Time')  # Ensure the 'Time' column is cast to Datetime
)
data_bbo = data_bbo.with_columns([
    pl.col("Time").dt.year().alias("Year"),
    pl.col("Time").dt.month().alias("Month"),
    pl.col("Time").dt.day().alias("Day"),
    pl.col("Time").dt.hour().alias("Hour"),
    pl.col("Time").dt.minute().alias("Minute"),
    pl.col("Time").dt.second().alias("Second"),
])

In [1]:
data_bbo.head()

NameError: name 'data_bbo' is not defined

# TRADE

In [13]:
data_trade = pl.read_parquet("data/raw_trade_data.parquet")

In [14]:
data_trade = data_trade.rename({
    "index" : "Time",
    "trade-price" : "Trade_Price",
    "trade-volume" : "Trade_Volume"
    })

Time,Trade_Price,Trade_Volume
"datetime[μs, America/New_York]",f64,f64
2008-07-01 09:30:07.163 EDT,32.3,69600.0


In [15]:
dtypes_info = data_trade.dtypes
print(dtypes_info)

# If you want a more visual summary similar to `.info()` in pandas
info_data_trade = pl.DataFrame({
    "column": data_trade.columns,
    "data_type": [str(dtype) for dtype in dtypes_info],
    "Total of Observation for each column": [data_trade.select(pl.col(column).count()).to_numpy()[0][0] for column in data_trade.columns]
})

print(info_data_trade)

[Datetime(time_unit='us', time_zone='America/New_York'), Float64, Float64]
shape: (3, 3)
┌──────────────┬─────────────────────────────────┬─────────────────────────────────┐
│ column       ┆ data_type                       ┆ Total of Observation for each … │
│ ---          ┆ ---                             ┆ ---                             │
│ str          ┆ str                             ┆ u32                             │
╞══════════════╪═════════════════════════════════╪═════════════════════════════════╡
│ Time         ┆ Datetime(time_unit='us', time_… ┆ 8115100                         │
│ Trade_Price  ┆ Float64                         ┆ 8115100                         │
│ Trade_Volume ┆ Float64                         ┆ 8115100                         │
└──────────────┴─────────────────────────────────┴─────────────────────────────────┘


In [16]:
# Checking for missing values in each column
for column in data_trade.columns:
    null_count = data_trade[column].is_null().sum()
    print(f"Column '{column}' has {null_count} missing values")

Column 'Time' has 0 missing values
Column 'Trade_Price' has 0 missing values
Column 'Trade_Volume' has 0 missing values


In [19]:
data_trade.describe()

statistic,Time,Trade_Price,Trade_Volume
str,str,f64,f64
"""count""","""8115100""",8115100.0,8115100.0
"""null_count""","""0""",0.0,0.0
"""mean""","""2008-10-02 11:36:31.048446-04:…",23.324136,735.181859
"""std""",,10.456872,25744.637667
"""min""","""2008-07-01 09:30:07.163000-04:…",1.35,100.0
"""25%""","""2008-08-21 10:00:14.173000-04:…",15.42,100.0
"""50%""","""2008-10-06 15:58:47.467000-04:…",26.16,200.0
"""75%""","""2008-11-14 11:08:25.292000-05:…",32.37,500.0
"""max""","""2008-12-31 16:03:23.503000-05:…",42.0,17277200.0
