In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

# Load the data
train = pd.read_csv('train.csv')
book = pd.read_parquet('order_book_feature.parquet')

# Calculate the weighted average price
book['wap'] = (book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1']) / (book['bid_size1'] + book['ask_size1'])

# Calculate log returns
def log_return(list_stock_prices):
    return np.log(list_stock_prices).diff()

# Calculate realized volatility
def realized_volatility(series_log_return):
    return np.sqrt(np.sum(series_log_return**2))

# Calculate realized volatility per time_id
def realized_volatility_per_time_id(df_book_data):
    df_book_data['log_return'] = df_book_data.groupby(['time_id', 'stock_id'])['wap'].apply(log_return)
    df_book_data = df_book_data[~df_book_data['log_return'].isnull()]
    df_realized_vol_per_stock = pd.DataFrame(df_book_data.groupby(['stock_id', 'time_id'])['log_return'].agg(realized_volatility)).reset_index()
    df_realized_vol_per_stock = df_realized_vol_per_stock.rename(columns={'log_return': 'realized_vol'})
    return df_realized_vol_per_stock

df_past_realized_train = realized_volatility_per_time_id(book)

# Feature Engineering: Add new features
book['bid_ask_spread'] = (book['ask_price1'] - book['bid_price1']) / ((book['ask_price1'] + book['bid_price1']) / 2)
book['total_size'] = book['bid_size1'] + book['ask_size1']

# Calculate the average bid-ask spread and total size per stock_id and time_id
avg_spread_size = book.groupby(['stock_id', 'time_id']).agg({'bid_ask_spread': 'mean', 'total_size': 'mean'}).reset_index()

# Merge the average bid-ask spread and total size with the realized volatility
df_past_realized_train = df_past_realized_train.merge(avg_spread_size, on=['stock_id', 'time_id'], how='left')

# Find the optimal EWMA alpha
alphas = np.arange(0.01, 1.01, 0.01)
best_rmspe = np.inf
best_r2 = -np.inf
best_alpha = None

def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

for alpha in alphas:
    df_past_realized_train['ewma_vol'] = df_past_realized_train.groupby('stock_id')['realized_vol'].transform(lambda x: x.ewm(alpha=alpha).mean())
    df_joined = train.merge(df_past_realized_train, on=['stock_id', 'time_id'], how='left')
    df_joined['prediction'] = df_joined['ewma_vol'].fillna(df_joined['realized_vol'])
    RMSPE = rmspe(y_true=df_joined['target'], y_pred=df_joined['prediction'])
    R2 = r2_score(y_true=df_joined['target'], y_pred=df_joined['prediction'])

    if RMSPE < best_rmspe:
        best_rmspe = RMSPE
        best_r2 = R2
        best_alpha = alpha

print(f'Best RMSPE score: {best_rmspe:.4f}, with ewma_alpha: {best_alpha:.4f}')


print(f'Best R2 score: {best_r2:.4f}, with ewma_alpha: {best_alpha:.4f}')


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_book_data['log_return'] = df_book_data.groupby(['time_id', 'stock_id'])['wap'].apply(log_return)


Best RMSPE score: 0.2077, with ewma_alpha: 0.9100
Best R2 score: 0.8587, with ewma_alpha: 0.9100
