In [16]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [17]:
df = pd.read_csv("first_25000_rows.csv")

### Best-Level OFI

In [18]:
def compute_best_level_ofi(df):
    df = df.copy()
    df['prev_bid_px'] = df['bid_px_00'].shift(1)
    df['prev_ask_px'] = df['ask_px_00'].shift(1)
    df['prev_bid_sz'] = df['bid_sz_00'].shift(1)
    df['prev_ask_sz'] = df['ask_sz_00'].shift(1)

    def bid_flow(row):
        if pd.isna(row['prev_bid_px']):
            return 0
        if row['bid_px_00'] > row['prev_bid_px']:
            return row['bid_sz_00']
        elif row['bid_px_00'] == row['prev_bid_px']:
            return row['bid_sz_00'] - row['prev_bid_sz']
        else:
            return -row['prev_bid_sz']

    def ask_flow(row):
        if pd.isna(row['prev_ask_px']):
            return 0
        if row['ask_px_00'] > row['prev_ask_px']:
            return -row['ask_sz_00']
        elif row['ask_px_00'] == row['prev_ask_px']:
            return row['ask_sz_00'] - row['prev_ask_sz']
        else:
            return row['prev_ask_sz']

    df['best_level_ofi'] = df.apply(lambda r: bid_flow(r) - ask_flow(r), axis=1)
    return df[['ts_event', 'symbol', 'best_level_ofi']]

In [19]:
    best_ofi_df = compute_best_level_ofi(df)
    print(best_ofi_df.head())

                         ts_event symbol  best_level_ofi
0  2024-10-21T11:54:29.221064336Z   AAPL             0.0
1  2024-10-21T11:54:29.223769812Z   AAPL             2.0
2  2024-10-21T11:54:29.225030400Z   AAPL             3.0
3  2024-10-21T11:54:29.712434212Z   AAPL             0.0
4  2024-10-21T11:54:29.764673165Z   AAPL             0.0


### Multi-Level OFI

In [20]:
def compute_multi_level_ofi(df, levels=10):
    df = df.copy()
    ofi_cols = []
    for level in range(levels):
        bid_px_col = f'bid_px_{level:02d}'
        ask_px_col = f'ask_px_{level:02d}'
        bid_sz_col = f'bid_sz_{level:02d}'
        ask_sz_col = f'ask_sz_{level:02d}'
        prev_bid_px = df[bid_px_col].shift(1)
        prev_ask_px = df[ask_px_col].shift(1)
        prev_bid_sz = df[bid_sz_col].shift(1)
        prev_ask_sz = df[ask_sz_col].shift(1)

        bid_flow = np.where(df[bid_px_col] > prev_bid_px,
                            df[bid_sz_col],
                            np.where(df[bid_px_col] == prev_bid_px,
                                     df[bid_sz_col] - prev_bid_sz,
                                     -prev_bid_sz))

        ask_flow = np.where(df[ask_px_col] > prev_ask_px,
                            -df[ask_sz_col],
                            np.where(df[ask_px_col] == prev_ask_px,
                                     df[ask_sz_col] - prev_ask_sz,
                                     prev_ask_sz))

        ofi = bid_flow - ask_flow
        col_name = f'ofi_{level}'
        df[col_name] = ofi
        ofi_cols.append(col_name)

    return df[['ts_event', 'symbol'] + ofi_cols]

In [21]:
    multi_ofi_df = compute_multi_level_ofi(df)
    print(multi_ofi_df.head())

                         ts_event symbol  ofi_0  ofi_1  ofi_2  ofi_3  ofi_4  \
0  2024-10-21T11:54:29.221064336Z   AAPL    NaN    NaN    NaN    NaN    NaN   
1  2024-10-21T11:54:29.223769812Z   AAPL    2.0    0.0    0.0    0.0    0.0   
2  2024-10-21T11:54:29.225030400Z   AAPL    3.0    0.0    0.0    0.0    0.0   
3  2024-10-21T11:54:29.712434212Z   AAPL    0.0    0.0  200.0    0.0    0.0   
4  2024-10-21T11:54:29.764673165Z   AAPL    0.0    0.0 -200.0    0.0    0.0   

   ofi_5  ofi_6  ofi_7  ofi_8  ofi_9  
0    NaN    NaN    NaN    NaN    NaN  
1    0.0    0.0    0.0    0.0    0.0  
2    0.0    0.0    0.0    0.0    0.0  
3    0.0    0.0    0.0    0.0    0.0  
4    0.0    0.0    0.0    0.0    0.0  


### Integrated OFI

In [22]:
def compute_integrated_ofi(df, levels=10):
    multi_df = compute_multi_level_ofi(df, levels)
    ofi_cols = [f'ofi_{i}' for i in range(levels)]
    X = multi_df[ofi_cols].fillna(0)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    pca = PCA(n_components=1)
    integrated_ofi = pca.fit_transform(X_scaled).flatten()

    multi_df['integrated_ofi'] = integrated_ofi
    return multi_df[['ts_event', 'symbol', 'integrated_ofi']]

In [23]:
    integrated_ofi_df = compute_integrated_ofi(df)
    print(integrated_ofi_df.head())

                         ts_event symbol  integrated_ofi
0  2024-10-21T11:54:29.221064336Z   AAPL        0.004431
1  2024-10-21T11:54:29.223769812Z   AAPL        0.009953
2  2024-10-21T11:54:29.225030400Z   AAPL        0.012713
3  2024-10-21T11:54:29.712434212Z   AAPL        0.215095
4  2024-10-21T11:54:29.764673165Z   AAPL       -0.206233


### Cross-Asset OFI

In [28]:
def compute_cross_asset_ofi(df, ofi_col='best_level_ofi'):
    df = df.copy()
    df_grouped = df.groupby(['ts_event', 'symbol'])[ofi_col].mean().reset_index()

    
    pivot = df_grouped.pivot(index='ts_event', columns='symbol', values=ofi_col)
    pivot = pivot.fillna(0)
    return pivot.reset_index()

In [30]:
cross_asset_ofi_df = compute_cross_asset_ofi(best_ofi_df, ofi_col='best_level_ofi')
print(cross_asset_ofi_df.head())

symbol                        ts_event  AAPL
0       2024-10-21T11:54:29.221064336Z   0.0
1       2024-10-21T11:54:29.223769812Z   2.0
2       2024-10-21T11:54:29.225030400Z   3.0
3       2024-10-21T11:54:29.712434212Z   0.0
4       2024-10-21T11:54:29.764673165Z   0.0
