In [53]:
import pandas as pd
import databento as db
import numpy as np
from IPython.display import display
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import os

In [58]:
file_path = "../data/XNAS-20250103-FR8JNPNEEV/xnas-itch-20241203.mbp-10.dbn.zst"
data = db.DBNStore.from_file(file_path)
df = data.to_df()

filtered_df = df[(df['depth'] >= 0) & (df['depth'] <= 4)]

# Step 2: Retain only specific columns
columns_to_keep = [
    'bid_px_00', 'ask_px_00', 'bid_sz_00', 'ask_sz_00', 
    'bid_ct_00', 'ask_ct_00', 'ts_event', 'bid_px_01', 'ask_px_01', 'bid_sz_01', 'ask_sz_01', 
    'bid_ct_01', 'ask_ct_01', 'bid_px_02', 'ask_px_02', 'bid_sz_02', 'ask_sz_02', 
    'bid_ct_02', 'ask_ct_02', 'bid_px_03', 'ask_px_03', 'bid_sz_03', 'ask_sz_03', 
    'bid_ct_03', 'ask_ct_03', 'bid_px_04', 'ask_px_04', 'bid_sz_04', 'ask_sz_04', 
    'bid_ct_04', 'ask_ct_04'
]
filtered_df = filtered_df[columns_to_keep]

# Step 2: Calculate OFI
def calculate_correct_ofi(df):
    ofi_list = []
    depth_levels = 5  # Levels 0 to 4
    old_bid_sz = np.zeros(depth_levels)
    old_ask_sz = np.zeros(depth_levels)

    for index, row in df.iterrows():
        # Store OFI for each depth level
        ofi_event = {'ts_event': row['ts_event']}
        total_ofi = 0
        
        for i in range(depth_levels):
            # Extract current bid and ask sizes
            new_bid_sz = row[f'bid_sz_0{i}']
            new_ask_sz = row[f'ask_sz_0{i}']
            
            # Calculate delta sizes
            delta_bid = new_bid_sz - old_bid_sz[i]
            delta_ask = new_ask_sz - old_ask_sz[i]
            
            # Compute OFI for this level
            level_ofi = delta_bid - delta_ask
            ofi_event[f'ofi_0{i}'] = level_ofi
            total_ofi += level_ofi
            
            # Update old sizes
            old_bid_sz[i] = new_bid_sz
            old_ask_sz[i] = new_ask_sz
        
        # Add total OFI for this event
        ofi_event['total_ofi'] = total_ofi
        ofi_list.append(ofi_event)

    return pd.DataFrame(ofi_list)

# Step 3: Apply Corrected OFI Calculation
ofi_df = calculate_correct_ofi(filtered_df)

# Step 4: Display Results
#print(ofi_df.head())
# Step 1: Extract OFI columns for PCA
ofi_columns = [f'ofi_0{i}' for i in range(5)]  # Multi-level OFI columns
ofi_data = ofi_df[ofi_columns]

# Step 2: Standardize the OFI data
scaler = StandardScaler()
ofi_standardized = scaler.fit_transform(ofi_data)

# Step 3: Apply PCA
pca = PCA(n_components=1)  # Keep only the first principal component
ofi_integrated = pca.fit_transform(ofi_standardized)

# Step 4: Add the PCA result to the DataFrame
ofi_df['ofi_integrated'] = ofi_integrated

# Step 5: Display the updated DataFrame
print(ofi_df[['ts_event', 'ofi_integrated']].head())

# unique_levels = filtered_df['depth'].unique()

# unique_levels_as_integers = [int(level) for level in unique_levels]

# # Sort the levels
# sorted_levels = sorted(unique_levels_as_integers)

# print("Unique levels:", sorted_levels)
#display(filtered_df)


                             ts_event  ofi_integrated
0 2024-12-03 09:00:00.035672616+00:00       -1.777603
1 2024-12-03 09:00:00.121915651+00:00        3.400511
2 2024-12-03 09:00:00.121932046+00:00        0.233129
3 2024-12-03 09:00:00.122328825+00:00       -3.400510
4 2024-12-03 09:00:00.122343653+00:00        3.400511


In [37]:
def process_directory_for_ofi(directory_path, levels=5):
    """
    Process all .zst files in a directory to calculate multi-level OFI metrics and integrate them using PCA.
    Args:
        directory_path (str): Path to the directory containing .zst files.
        levels (int): Number of levels to calculate OFI for.
    Returns:
        DataFrame: Combined data with OFI metrics for all processed files.
    """
    combined_data = []

    # Loop through all files in the directory
    for file_name in os.listdir(directory_path):
        # Check if the file ends with .zst
        if file_name.endswith(".zst"):
            file_path = os.path.join(directory_path, file_name)
            print(f"Processing file: {file_path}")

            try:
                # Load the data from the .zst file
                data = db.DBNStore.from_file(file_path)
                df = data.to_df()

                ofi_columns = []
                # Step 1: Compute delta (changes) in bid and ask sizes for each level
                for level in range(levels):  # Top levels: 0 to levels-1
                    bid_px_col = f"bid_px_0{level}"
                    ask_px_col = f"ask_px_0{level}"
                    bid_sz_col = f"bid_sz_0{level}"
                    ask_sz_col = f"ask_sz_0{level}"

                    if not all(col in df.columns for col in [bid_px_col, ask_px_col, bid_sz_col, ask_sz_col]):
                        print(f"Missing columns for level {level} in file {file_name}, skipping level.")
                        continue

                    ofi_bid_col = f"ofi_bid_0{level}"
                    ofi_ask_col = f"ofi_ask_0{level}"

                    # Calculate OFI using explicit numpy logic
                    bid_diff = df[bid_sz_col].diff().fillna(0)
                    ask_diff = df[ask_sz_col].diff().fillna(0)

                    df[ofi_bid_col] = np.where(
                        df[bid_px_col] >= df[bid_px_col].shift(), bid_diff, -df[bid_sz_col]
                    )
                    df[ofi_ask_col] = np.where(
                        df[ask_px_col] <= df[ask_px_col].shift(), -ask_diff, df[ask_sz_col]
                    )

                    ofi_columns.extend([ofi_bid_col, ofi_ask_col])

                # Step 2: Integrate Multi-Level OFI using PCA
                print("Integrating multi-level OFI using PCA...")
                pca = PCA(n_components=1)
                df["integrated_ofi"] = pca.fit_transform(df[ofi_columns].fillna(0))

                # Append processed data to combined list
                combined_data.append(df)

                print("OFI calculation completed for file.")

            except Exception as e:
                print(f"Error processing file {file_path}: {e}")

    # Combine all processed data into a single DataFrame
    if combined_data:
        return pd.concat(combined_data, ignore_index=True)
    else:
        print("No valid data processed.")
        return pd.DataFrame()

In [None]:
# Example usage
directory_path = "../data/XNAS-20250103-FR8JNPNEEV"  # Replace with your directory path
levels = 5
combined_ofi_data = process_directory_for_ofi(directory_path, levels)

if not combined_ofi_data.empty:
    print("Combined OFI Metrics (First 5 Rows):")
    print(combined_ofi_data.head())