In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    The input must be a pandas object (the result of ``np.log`` needs a
    ``.diff()`` method). The first element of the output is NaN because it
    has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)

def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``.

    Each error is divided by the corresponding true value, so a true value of
    zero produces a non-finite result.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_error)

# Load the training targets plus the order-book and trade records.
train = pd.read_csv('train.csv')
book = pd.read_parquet('order_book_feature.parquet')
trades = pd.read_parquet('trades.parquet')

# Level-1 order-book features: weighted average price (each side's price is
# weighted by the opposite side's size), quoted spread, and size imbalance.
book['wap'] = (book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1']) / (book['bid_size1'] + book['ask_size1'])
book['bid_ask_spread'] = book['ask_price1'] - book['bid_price1']
book['order_imbalance'] = (book['bid_size1'] - book['ask_size1']) / (book['bid_size1'] + book['ask_size1'])

# Log returns of trade prices within each (time_id, stock_id) bucket.
# transform() always returns a result indexed like `trades`, so the column
# assignment does not depend on the pandas-version-specific `group_keys`
# default that groupby(...).apply(...) is subject to.
trades['price_log_return'] = trades.groupby(['time_id', 'stock_id'])['price'].transform(log_return)
# Drops every row containing a NaN; this includes the first trade of each
# bucket, which has no previous price and therefore no return.
trades = trades.dropna()

# Mean order-book features per (time_id, stock_id) bucket.
features = (
    book.groupby(['time_id', 'stock_id'])[['wap', 'bid_ask_spread', 'order_imbalance']]
    .mean()
    .reset_index()
)

# Trade features per bucket: realized volatility of the price log returns plus
# total traded size and total order count.
trades_features = (
    trades.groupby(['time_id', 'stock_id'])
    .agg({
        'price_log_return': [realized_volatility],
        'size': 'sum',
        'order_count': 'sum',
    })
    .reset_index()
)

# Flatten the two-level column labels produced by agg(); the key columns keep
# their plain names, every other column becomes '<column>_<aggregation>'.
trades_features.columns = [
    col[0] if col[0] in ('time_id', 'stock_id') else '_'.join(col)
    for col in trades_features.columns.values
]

# Left joins: buckets without trades keep their book features, and every
# training row is kept even when no features exist for it.
merged_features = features.merge(trades_features, how='left', on=['time_id', 'stock_id'])
train_merged = pd.merge(train, merged_features, how='left', on=['stock_id', 'time_id'])

# Model inputs are everything except the target and the two identifier columns.
X = train_merged.drop(columns=['target', 'time_id', 'stock_id'])
y = train_merged['target']

# Hold out 20% of the rows at random (fixed seed) for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Gradient-boosted regressor with a squared-error objective, using all cores.
model = xgb.XGBRegressor(objective='reg:squarederror', n_jobs=-1, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Both metrics are rounded to three decimals for reporting.
R2 = round(r2_score(y_test, y_pred), 3)
RMSPE = round(rmspe(y_test, y_pred), 3)

print(f'Performance of the XGBoost model with feature engineering: R2 score: {R2}, RMSPE: {RMSPE}')


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  trades['price_log_return'] = trades.groupby(['time_id', 'stock_id'])['price'].apply(log_return)


Performance of the XGBoost model with feature engineering: R2 score: 0.879, RMSPE: 0.197


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Existing functions...

# NOTE(review): this cell still expects log_return, realized_volatility and
# rmspe to be defined by an earlier cell.
train = pd.read_csv('train.csv')
book = pd.read_parquet('order_book_feature.parquet')
trades = pd.read_parquet('trades.parquet')
time_id_reference = pd.read_csv('time_id_reference.csv')

# Parse date and time
time_id_reference['datetime'] = pd.to_datetime(time_id_reference['date'] + ' ' + time_id_reference['time'])
time_id_reference = time_id_reference.drop(['date', 'time'], axis=1)

# Extract time features
time_id_reference['hour'] = time_id_reference['datetime'].dt.hour
time_id_reference['weekday'] = time_id_reference['datetime'].dt.weekday
time_id_reference['week'] = time_id_reference['datetime'].dt.isocalendar().week.astype(int)
time_id_reference['month'] = time_id_reference['datetime'].dt.month

# Merge time_id_reference with train
train = train.merge(time_id_reference, on='time_id', how='left')

# Calculate features from book and trades. These are computed here from the
# freshly loaded frames so the cell does not depend on `features` /
# `trades_features` left over from an earlier cell execution.
book['wap'] = (book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1']) / (book['bid_size1'] + book['ask_size1'])
book['bid_ask_spread'] = book['ask_price1'] - book['bid_price1']
book['order_imbalance'] = (book['bid_size1'] - book['ask_size1']) / (book['bid_size1'] + book['ask_size1'])

# transform() returns a result indexed like `trades`, independent of the
# `group_keys` default that groupby(...).apply(...) is subject to.
trades['price_log_return'] = trades.groupby(['time_id', 'stock_id'])['price'].transform(log_return)
trades = trades.dropna()

features = book.groupby(['time_id', 'stock_id']).agg({
    'wap': 'mean',
    'bid_ask_spread': 'mean',
    'order_imbalance': 'mean'
}).reset_index()

trades_features = trades.groupby(['time_id', 'stock_id']).agg({
    'price_log_return': [realized_volatility],
    'size': 'sum',
    'order_count': 'sum'
}).reset_index()

# Flatten the two-level column labels; key columns keep their plain names.
trades_features.columns = ['_'.join(col) if col[0] != 'time_id' and col[0] != 'stock_id' else col[0] for col in trades_features.columns.values]

# Merge the features
merged_features = pd.merge(features, trades_features, on=['time_id', 'stock_id'], how='left')

train_merged = train.merge(merged_features, on=['stock_id', 'time_id'], how='left')

# The raw datetime is dropped; only the derived hour/weekday/week/month
# columns are passed to the model.
X = train_merged.drop(['target', 'time_id', 'stock_id', 'datetime'], axis=1)
y = train_merged['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBRegressor(objective='reg:squarederror', n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

R2 = round(r2_score(y_true=y_test, y_pred=y_pred), 3)
RMSPE = round(rmspe(y_true=y_test, y_pred=y_pred), 3)

print(f'Performance of the XGBoost model with time-based features: R2 score: {R2}, RMSPE: {RMSPE}')


In [None]:
import pandas as pd
import numpy as np

# Inputs: the training table and the order-book data.
train = pd.read_csv('train.csv')
book = pd.read_parquet('order_book_feature.parquet')

# Weighted average price: each side's best price is weighted by the size
# quoted on the opposite side.
book['wap'] = (
    book['bid_price1'].mul(book['ask_size1']) + book['ask_price1'].mul(book['bid_size1'])
).div(book['bid_size1'] + book['ask_size1'])

def log_return(list_stock_prices):
    """Return consecutive differences of the natural log of the given prices.

    Expects a pandas object; the first entry of the result is NaN.
    """
    logged = np.log(list_stock_prices)
    return logged.diff()

def realized_volatility(series_log_return):
    """Return sqrt(sum(r**2)) over the supplied log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

def realized_volatility_per_time_id(df_book_data):
    """Compute realized volatility of WAP log returns per (stock_id, time_id).

    Parameters
    ----------
    df_book_data : pandas.DataFrame
        Must contain the columns 'stock_id', 'time_id' and 'wap'. The frame
        is not modified (no 'log_return' column is added to it).

    Returns
    -------
    pandas.DataFrame
        Columns 'stock_id', 'time_id', 'realized_vol', sorted by the two keys.
        Buckets with no valid log return (e.g. a single row) are omitted.
    """
    # Log returns within each bucket. GroupBy.diff is vectorised and returns a
    # result aligned with the input rows, so there is no dependence on the
    # `group_keys` behaviour of groupby(...).apply(...).
    bucket_returns = df_book_data[['stock_id', 'time_id']].assign(
        log_return=np.log(df_book_data['wap'])
        .groupby([df_book_data['time_id'], df_book_data['stock_id']])
        .diff()
    )
    # The first row of every bucket has no predecessor and hence no return.
    bucket_returns = bucket_returns.dropna(subset=['log_return'])

    # Realized volatility = sqrt(sum of squared log returns) per bucket.
    squared = bucket_returns.assign(realized_vol=bucket_returns['log_return'] ** 2)
    sum_of_squares = squared.groupby(['stock_id', 'time_id'])['realized_vol'].sum()
    return np.sqrt(sum_of_squares).reset_index()

df_past_realized_train = realized_volatility_per_time_id(book)

# Rolling mean of realized volatility within each stock, over `window_size`
# consecutive rows; rows without a full window get NaN.
window_size = 5
df_past_realized_train['moving_average_vol'] = (
    df_past_realized_train.groupby('stock_id')['realized_vol']
    .transform(lambda vol: vol.rolling(window=window_size).mean())
)

# Left join so every training row is kept.
df_joined = pd.merge(train, df_past_realized_train, how='left', on=['stock_id', 'time_id'])

# Use the moving average where it exists; otherwise fall back to the
# bucket's own realized volatility.
df_joined['prediction'] = df_joined['moving_average_vol'].where(
    df_joined['moving_average_vol'].notna(), df_joined['realized_vol']
)

from sklearn.metrics import r2_score

def rmspe(y_true, y_pred):
    """Root mean squared percentage error (errors are relative to ``y_true``)."""
    pct_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(pct_error)))

# Score the prediction column against the target, rounded to three decimals.
R2 = round(r2_score(df_joined['target'], df_joined['prediction']), 3)
RMSPE = round(rmspe(df_joined['target'], df_joined['prediction']), 3)

print(f'Performance of the moving average prediction: R2 score: {R2}, RMSPE: {RMSPE}')


In [None]:
import pandas as pd
import numpy as np

# Inputs: the training table and the order-book data.
train = pd.read_csv('train.csv')
book = pd.read_parquet('order_book_feature.parquet')

# Weighted average price: each side's best price is weighted by the size
# quoted on the opposite side.
book['wap'] = (
    book['bid_price1'].mul(book['ask_size1']) + book['ask_price1'].mul(book['bid_size1'])
).div(book['bid_size1'] + book['ask_size1'])

def log_return(list_stock_prices):
    """Return consecutive differences of the natural log of the given prices.

    Expects a pandas object; the first entry of the result is NaN.
    """
    logged = np.log(list_stock_prices)
    return logged.diff()

def realized_volatility(series_log_return):
    """Return sqrt(sum(r**2)) over the supplied log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

def realized_volatility_per_time_id(df_book_data):
    """Compute realized volatility of WAP log returns per (stock_id, time_id).

    Parameters
    ----------
    df_book_data : pandas.DataFrame
        Must contain the columns 'stock_id', 'time_id' and 'wap'. The frame
        is not modified (no 'log_return' column is added to it).

    Returns
    -------
    pandas.DataFrame
        Columns 'stock_id', 'time_id', 'realized_vol', sorted by the two keys.
        Buckets with no valid log return (e.g. a single row) are omitted.
    """
    # Log returns within each bucket. GroupBy.diff is vectorised and returns a
    # result aligned with the input rows, so there is no dependence on the
    # `group_keys` behaviour of groupby(...).apply(...).
    bucket_returns = df_book_data[['stock_id', 'time_id']].assign(
        log_return=np.log(df_book_data['wap'])
        .groupby([df_book_data['time_id'], df_book_data['stock_id']])
        .diff()
    )
    # The first row of every bucket has no predecessor and hence no return.
    bucket_returns = bucket_returns.dropna(subset=['log_return'])

    # Realized volatility = sqrt(sum of squared log returns) per bucket.
    squared = bucket_returns.assign(realized_vol=bucket_returns['log_return'] ** 2)
    sum_of_squares = squared.groupby(['stock_id', 'time_id'])['realized_vol'].sum()
    return np.sqrt(sum_of_squares).reset_index()

# Signed level-1 size imbalance, stored under the column name 'skewness'.
book['skewness'] = book['bid_size1'].sub(book['ask_size1']).div(book['bid_size1'].add(book['ask_size1']))

# Mean of that imbalance per (stock_id, time_id).
book_skewness = book.groupby(['stock_id', 'time_id'])['skewness'].mean().reset_index()

df_past_realized_train = realized_volatility_per_time_id(book)

# Rolling mean of realized volatility within each stock, over `window_size`
# consecutive rows; rows without a full window get NaN.
window_size = 5
df_past_realized_train['moving_average_vol'] = (
    df_past_realized_train.groupby('stock_id')['realized_vol']
    .transform(lambda vol: vol.rolling(window=window_size).mean())
)

# Left joins so every training row is kept.
df_joined = (
    train.merge(df_past_realized_train, how='left', on=['stock_id', 'time_id'])
    .merge(book_skewness, how='left', on=['stock_id', 'time_id'])
)

# Use the moving average where it exists; otherwise fall back to the
# bucket's own realized volatility.
df_joined['prediction'] = df_joined['moving_average_vol'].where(
    df_joined['moving_average_vol'].notna(), df_joined['realized_vol']
)

# Scale the prediction by (1 + weight * skewness).
skewness_weight = 0.1
df_joined['prediction'] = df_joined['prediction'].mul(1 + skewness_weight * df_joined['skewness'])

from sklearn.metrics import r2_score

def rmspe(y_true, y_pred):
    """Root mean squared percentage error (errors are relative to ``y_true``)."""
    pct_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(pct_error)))

# Score the prediction column against the target, rounded to three decimals.
R2 = round(r2_score(df_joined['target'], df_joined['prediction']), 3)
RMSPE = round(rmspe(df_joined['target'], df_joined['prediction']), 3)

print(f'Performance of the moving average prediction: R2 score: {R2}, RMSPE: {RMSPE}')


In [None]:
import pandas as pd
import numpy as np

# Inputs: the training table and the order-book data.
train = pd.read_csv('train.csv')
book = pd.read_parquet('order_book_feature.parquet')

# Weighted average price: each side's best price is weighted by the size
# quoted on the opposite side.
book['wap'] = (
    book['bid_price1'].mul(book['ask_size1']) + book['ask_price1'].mul(book['bid_size1'])
).div(book['bid_size1'] + book['ask_size1'])

def log_return(list_stock_prices):
    """Return consecutive differences of the natural log of the given prices.

    Expects a pandas object; the first entry of the result is NaN.
    """
    logged = np.log(list_stock_prices)
    return logged.diff()

def realized_volatility(series_log_return):
    """Return sqrt(sum(r**2)) over the supplied log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

def realized_volatility_per_time_id(df_book_data):
    """Compute realized volatility of WAP log returns per (stock_id, time_id).

    Parameters
    ----------
    df_book_data : pandas.DataFrame
        Must contain the columns 'stock_id', 'time_id' and 'wap'. The frame
        is not modified (no 'log_return' column is added to it).

    Returns
    -------
    pandas.DataFrame
        Columns 'stock_id', 'time_id', 'realized_vol', sorted by the two keys.
        Buckets with no valid log return (e.g. a single row) are omitted.
    """
    # Log returns within each bucket. GroupBy.diff is vectorised and returns a
    # result aligned with the input rows, so there is no dependence on the
    # `group_keys` behaviour of groupby(...).apply(...).
    bucket_returns = df_book_data[['stock_id', 'time_id']].assign(
        log_return=np.log(df_book_data['wap'])
        .groupby([df_book_data['time_id'], df_book_data['stock_id']])
        .diff()
    )
    # The first row of every bucket has no predecessor and hence no return.
    bucket_returns = bucket_returns.dropna(subset=['log_return'])

    # Realized volatility = sqrt(sum of squared log returns) per bucket.
    squared = bucket_returns.assign(realized_vol=bucket_returns['log_return'] ** 2)
    sum_of_squares = squared.groupby(['stock_id', 'time_id'])['realized_vol'].sum()
    return np.sqrt(sum_of_squares).reset_index()

df_past_realized_train = realized_volatility_per_time_id(book)

# Exponentially weighted mean of realized volatility, computed separately for
# each stock with smoothing factor `ewma_alpha`.
ewma_alpha = 0.1
df_past_realized_train['ewma_vol'] = (
    df_past_realized_train.groupby('stock_id')['realized_vol']
    .transform(lambda vol: vol.ewm(alpha=ewma_alpha).mean())
)

# Left join so every training row is kept.
df_joined = pd.merge(train, df_past_realized_train, how='left', on=['stock_id', 'time_id'])

# Use the EWMA where it exists; otherwise fall back to the bucket's own
# realized volatility.
df_joined['prediction'] = df_joined['ewma_vol'].where(
    df_joined['ewma_vol'].notna(), df_joined['realized_vol']
)

from sklearn.metrics import r2_score

def rmspe(y_true, y_pred):
    """Root mean squared percentage error (errors are relative to ``y_true``)."""
    pct_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(pct_error)))

# Score the prediction column against the target, rounded to three decimals.
R2 = round(r2_score(df_joined['target'], df_joined['prediction']), 3)
RMSPE = round(rmspe(df_joined['target'], df_joined['prediction']), 3)

print(f'Performance of the exponential weighted moving average prediction: R2 score: {R2}, RMSPE: {RMSPE}')


In [5]:
import pandas as pd
import numpy as np

# Inputs: the training table, the order-book data and the time_id lookup table.
train = pd.read_csv('train.csv')
book = pd.read_parquet('order_book_feature.parquet')
time_id_reference = pd.read_csv('time_id_reference.csv')

# Weighted average price: each side's best price is weighted by the size
# quoted on the opposite side.
book['wap'] = (
    book['bid_price1'].mul(book['ask_size1']) + book['ask_price1'].mul(book['bid_size1'])
).div(book['bid_size1'] + book['ask_size1'])

def log_return(list_stock_prices):
    """Return consecutive differences of the natural log of the given prices.

    Expects a pandas object; the first entry of the result is NaN.
    """
    logged = np.log(list_stock_prices)
    return logged.diff()

def realized_volatility(series_log_return):
    """Return sqrt(sum(r**2)) over the supplied log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

def realized_volatility_per_time_id(df_book_data):
    """Compute realized volatility of WAP log returns per (stock_id, time_id).

    Parameters
    ----------
    df_book_data : pandas.DataFrame
        Must contain the columns 'stock_id', 'time_id' and 'wap'. The frame
        is not modified (no 'log_return' column is added to it).

    Returns
    -------
    pandas.DataFrame
        Columns 'stock_id', 'time_id', 'realized_vol', sorted by the two keys.
        Buckets with no valid log return (e.g. a single row) are omitted.
    """
    # Log returns within each bucket. GroupBy.diff is vectorised and returns a
    # result aligned with the input rows, so there is no dependence on the
    # `group_keys` behaviour of groupby(...).apply(...).
    bucket_returns = df_book_data[['stock_id', 'time_id']].assign(
        log_return=np.log(df_book_data['wap'])
        .groupby([df_book_data['time_id'], df_book_data['stock_id']])
        .diff()
    )
    # The first row of every bucket has no predecessor and hence no return.
    bucket_returns = bucket_returns.dropna(subset=['log_return'])

    # Realized volatility = sqrt(sum of squared log returns) per bucket.
    squared = bucket_returns.assign(realized_vol=bucket_returns['log_return'] ** 2)
    sum_of_squares = squared.groupby(['stock_id', 'time_id'])['realized_vol'].sum()
    return np.sqrt(sum_of_squares).reset_index()

df_past_realized_train = realized_volatility_per_time_id(book)

# Calculate the exponential weighted moving average of past realized volatilities
ewma_alpha = 0.1
df_past_realized_train['ewma_vol'] = df_past_realized_train.groupby('stock_id')['realized_vol'].transform(lambda x: x.ewm(alpha=ewma_alpha).mean())

# Merge time_id_reference to get the hour of the day (default inner join:
# buckets whose time_id is missing from the reference table are dropped).
df_past_realized_train = df_past_realized_train.merge(time_id_reference, on='time_id')

# Calculate the average realized volatility per hour of the day
df_past_realized_train['hour'] = pd.to_datetime(df_past_realized_train['time']).dt.hour
hourly_volatility = df_past_realized_train.groupby('hour')['realized_vol'].mean().reset_index()

df_joined = train.merge(df_past_realized_train, on=['stock_id', 'time_id'], how='left')
# Both frames carry 'realized_vol'; the hourly mean arrives as 'realized_vol_hourly'.
df_joined = df_joined.merge(hourly_volatility, on='hour', how='left', suffixes=('', '_hourly'))

# If the EWMA is not available (e.g., due to lack of past data), use the realized volatility as the prediction
df_joined['prediction'] = df_joined['ewma_vol'].fillna(df_joined['realized_vol'])

# Incorporate seasonality (hourly volatility) into the prediction by adding a weight.
# prediction * (1 + w * hourly / prediction) is algebraically
# prediction + w * hourly; the expanded form is used because the original
# expression divides by the prediction and yields NaN whenever it is 0.
seasonality_weight = 0.1
df_joined['prediction'] = df_joined['prediction'] + seasonality_weight * df_joined['realized_vol_hourly']

from sklearn.metrics import r2_score

def rmspe(y_true, y_pred):
    """Root mean squared percentage error (errors are relative to ``y_true``)."""
    pct_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(pct_error)))

# Score the prediction column against the target, rounded to three decimals.
R2 = round(r2_score(df_joined['target'], df_joined['prediction']), 3)
RMSPE = round(rmspe(df_joined['target'], df_joined['prediction']), 3)
print(f'Performance of the exponential weighted moving average and seasonality prediction: R2 score: {R2}, RMSPE: {RMSPE}')


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_book_data['log_return'] = df_book_data.groupby(['time_id', 'stock_id'])['wap'].apply(log_return)


Performance of the exponential weighted moving average and seasonality prediction: R2 score: 0.663, RMSPE: 0.488


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import r2_score

def log_return(list_stock_prices):
    """Return consecutive differences of the natural log of the given prices.

    Expects a pandas object; the first entry of the result is NaN.
    """
    logged = np.log(list_stock_prices)
    return logged.diff()

def realized_volatility(series_log_return):
    """Return sqrt(sum(r**2)) over the supplied log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

def generate_features(df):
    """Add 'bid_ask_spread' and 'order_imbalance' columns to ``df`` and return it.

    The frame is modified in place; the returned object is the same frame.
    Requires the columns 'ask_price1', 'bid_price1', 'bid_size1', 'ask_size1'.
    """
    bid_size = df['bid_size1']
    ask_size = df['ask_size1']
    # Best ask minus best bid.
    df['bid_ask_spread'] = df['ask_price1'].sub(df['bid_price1'])
    # Size difference divided by total size at the best level.
    df['order_imbalance'] = bid_size.sub(ask_size).div(bid_size.add(ask_size))
    return df

def rmspe(y_true, y_pred):
    """Root mean squared percentage error (errors are relative to ``y_true``)."""
    pct_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(pct_error)))

class EMATransformer(BaseEstimator, TransformerMixin):
    """Transformer that replaces its input with an exponentially weighted mean.

    ``fit`` learns nothing; ``transform`` calls ``X.ewm(alpha=...).mean()``,
    so ``X`` must be a pandas object.
    """

    def __init__(self, alpha):
        # Smoothing factor passed through to ``ewm``.
        self.alpha = alpha

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the exponentially weighted mean of ``X``."""
        smoothed = X.ewm(alpha=self.alpha)
        return smoothed.mean()

# Load data and preprocess
book = pd.read_parquet('order_book_feature.parquet')
book = generate_features(book)

book['wap'] = (book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1']) / (book['bid_size1'] + book['ask_size1'])


book['log_return'] = book.groupby(['time_id', 'stock_id'])['wap'].apply(log_return).fillna(0)


# Define EMA parameters
ewma_alpha = 0.1
df_past_realized_train['ewma_vol'] = df_past_realized_train.groupby('stock_id')['realized_vol'].transform(lambda x: x.ewm(alpha=ewma_alpha).mean())

# Aggregate features and target
agg_features = ['ewma_vol', 'bid_ask_spread', 'order_imbalance']
agg_df = book.groupby(['stock_id', 'time_id'])[agg_features].mean().reset_index()
agg_df['target'] = book.groupby(['stock_id', 'time_id'])['log_return'].agg(realized_volatility).reset_index(drop=True)

# Train-test split
train_df = agg_df.sample(frac=0.8, random_state=42)
test_df = agg_df.drop(train_df.index)

# Train linear regression model
model = LinearRegression()
model.fit(train_df[agg_features], train_df['target'])

# Predict and evaluate
preds = model.predict(test_df[agg_features])
RMSPE_score = rmspe(test_df['target'], preds)

# Calculate R^2 score
R2 = r2_score(test_df['target'], preds)

print(f'R^2: {R2:.4f}')
print(f'RMSPE: {RMSPE_score:.4f}')