In [24]:
import pandas as pd
import numpy as np

train = pd.read_csv('train.csv')
book = pd.read_parquet('order_book_feature.parquet')
time_id_reference = pd.read_csv('time_id_reference.csv')

book['wap'] = (book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1']) / (book['bid_size1'] + book['ask_size1'])

def log_return(list_stock_prices):
    return np.log(list_stock_prices).diff()

def realized_volatility(series_log_return):
    return np.sqrt(np.sum(series_log_return**2))

def realized_volatility_per_time_id(df_book_data):
    df_book_data['log_return'] = df_book_data.groupby(['time_id', 'stock_id'])['wap'].apply(log_return)
    df_book_data = df_book_data[~df_book_data['log_return'].isnull()]
    df_realized_vol_per_stock = pd.DataFrame(df_book_data.groupby(['stock_id', 'time_id'])['log_return'].agg(realized_volatility)).reset_index()
    df_realized_vol_per_stock = df_realized_vol_per_stock.rename(columns={'log_return': 'realized_vol'})
    return df_realized_vol_per_stock

df_past_realized_train = realized_volatility_per_time_id(book)

# Calculate the exponential weighted moving average of past realized volatilities
ewma_alpha = 0.1
df_past_realized_train['ewma_vol'] = df_past_realized_train.groupby('stock_id')['realized_vol'].transform(lambda x: x.ewm(alpha=ewma_alpha).mean())

# Merge time_id_reference to get the hour of the day
df_past_realized_train = df_past_realized_train.merge(time_id_reference, on='time_id')

# Calculate the average realized volatility per hour of the day
df_past_realized_train['hour'] = pd.to_datetime(df_past_realized_train['time']).dt.hour
hourly_volatility = df_past_realized_train.groupby('hour')['realized_vol'].mean().reset_index()

df_joined = train.merge(df_past_realized_train, on=['stock_id', 'time_id'], how='left')
df_joined = df_joined.merge(hourly_volatility, on='hour', how='left', suffixes=('', '_hourly'))

# If the EWMA is not available (e.g., due to lack of past data), use the realized volatility as the prediction
df_joined['prediction'] = df_joined['ewma_vol'].fillna(df_joined['realized_vol'])

# Incorporate seasonality (hourly volatility) into the prediction by adding a weight
seasonality_weight = 0.1
df_joined['prediction'] = df_joined['prediction'] * (1 + seasonality_weight * (df_joined['realized_vol_hourly'] / df_joined['prediction']))

from sklearn.metrics import r2_score

def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

R2 = round(r2_score(y_true=df_joined['target'], y_pred=df_joined['prediction']), 3)
RMSPE = round(rmspe(y_true=df_joined['target'], y_pred=df_joined['prediction']), 3)
print(f'Performance of the exponential weighted moving average and seasonality prediction: R2 score: {R2}, RMSPE: {RMSPE}')

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_book_data['log_return'] = df_book_data.groupby(['time_id', 'stock_id'])['wap'].apply(log_return)


Performance of the exponential weighted moving average and seasonality prediction: R2 score: 0.663, RMSPE: 0.488


In [12]:
from sklearn.model_selection import KFold

# Create a list of possible alpha and weight values
alphas = np.linspace(0.01, 0.99, 10)
weights = np.linspace(0.01, 0.99, 10)

# Initialize the best R^2 score and corresponding parameters
best_r2 = -np.inf
best_alpha = None
best_weight = None

# Perform a grid search using 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for ewma_alpha in alphas:
    for seasonality_weight in weights:
        r2_scores = []
        for train_idx, test_idx in kf.split(df_joined):
            train_df = df_joined.iloc[train_idx].copy()
            test_df = df_joined.iloc[test_idx].copy()

            # Calculate the EWMA for the training set
            train_df.loc[:, 'ewma_vol'] = train_df.groupby('stock_id')['realized_vol'].transform(lambda x: x.ewm(alpha=ewma_alpha).mean())

            # Calculate the predictions using the EWMA and seasonality weight
            train_df.loc[:, 'prediction'] = train_df['ewma_vol'].fillna(train_df['realized_vol'])
            train_df.loc[:, 'prediction'] = train_df['prediction'] * (1 + seasonality_weight * (train_df['realized_vol_hourly'] / train_df['prediction']))

            # Calculate the R^2 score for this fold
            r2 = r2_score(y_true=train_df['target'], y_pred=train_df['prediction'])
            r2_scores.append(r2)

        # Calculate the average R^2 score across all folds
        avg_r2 = np.mean(r2_scores)

        # Update the best R^2 score and corresponding parameters if necessary
        if avg_r2 > best_r2:
            best_r2 = avg_r2
            best_alpha = ewma_alpha
            best_weight = seasonality_weight

print(f"Best R^2 score: {best_r2:.4f}, with ewma_alpha: {best_alpha:.4f}, and seasonality_weight: {best_weight:.4f}")


Best R^2 score: 0.8643, with ewma_alpha: 0.7722, and seasonality_weight: 0.0100


In [13]:
best_rmspe = float('inf')
best_alpha_rmspe = None
best_weight_rmspe = None

for ewma_alpha in alphas:
    for seasonality_weight in weights:
        rmspe_scores = []
        for train_idx, test_idx in kf.split(df_joined):
            train_df = df_joined.iloc[train_idx].copy()
            test_df = df_joined.iloc[test_idx].copy()

            train_df.loc[:, 'ewma_vol'] = train_df.groupby('stock_id')['realized_vol'].transform(lambda x: x.ewm(alpha=ewma_alpha).mean())
            train_df.loc[:, 'prediction'] = train_df['ewma_vol'].fillna(train_df['realized_vol'])
            train_df.loc[:, 'prediction'] = train_df['prediction'] * (1 + seasonality_weight * (train_df['realized_vol_hourly'] / train_df['prediction']))

            rmspe_val = rmspe(y_true=train_df['target'], y_pred=train_df['prediction'])
            rmspe_scores.append(rmspe_val)

        avg_rmspe = np.mean(rmspe_scores)

        if avg_rmspe < best_rmspe:
            best_rmspe = avg_rmspe
            best_alpha_rmspe = ewma_alpha
            best_weight_rmspe = seasonality_weight

print(f"Best RMSPE score: {best_rmspe:.4f}, with ewma_alpha: {best_alpha_rmspe:.4f}, and seasonality_weight: {best_weight_rmspe:.4f}")

Best RMSPE score: 0.2159, with ewma_alpha: 0.8811, and seasonality_weight: 0.0100


In [14]:
best_r2 = -float('inf')
best_alpha_r2 = None
best_weight_r2 = None

best_rmspe = float('inf')
best_alpha_rmspe = None
best_weight_rmspe = None

for ewma_alpha in alphas:
    for seasonality_weight in weights:
        r2_scores = []
        rmspe_scores = []
        
        for train_idx, test_idx in kf.split(df_joined):
            train_df = df_joined.iloc[train_idx].copy()
            test_df = df_joined.iloc[test_idx].copy()

            train_df.loc[:, 'ewma_vol'] = train_df.groupby('stock_id')['realized_vol'].transform(lambda x: x.ewm(alpha=ewma_alpha).mean())
            train_df.loc[:, 'prediction'] = train_df['ewma_vol'].fillna(train_df['realized_vol'])
            train_df.loc[:, 'prediction'] = train_df['prediction'] * (1 + seasonality_weight * (train_df['realized_vol_hourly'] / train_df['prediction']))

            r2_val = r2_score(y_true=train_df['target'], y_pred=train_df['prediction'])
            rmspe_val = rmspe(y_true=train_df['target'], y_pred=train_df['prediction'])
            
            r2_scores.append(r2_val)
            rmspe_scores.append(rmspe_val)

        avg_r2 = np.mean(r2_scores)
        avg_rmspe = np.mean(rmspe_scores)

        if avg_r2 > best_r2:
            best_r2 = avg_r2
            best_alpha_r2 = ewma_alpha
            best_weight_r2 = seasonality_weight

        if avg_rmspe < best_rmspe:
            best_rmspe = avg_rmspe
            best_alpha_rmspe = ewma_alpha
            best_weight_rmspe = seasonality_weight

print(f"Best R^2 score: {best_r2:.4f}, with ewma_alpha: {best_alpha_r2:.4f}, and seasonality_weight: {best_weight_r2:.4f}")

print(f"Best RMSPE score: {best_rmspe:.4f}, with ewma_alpha: {best_alpha_rmspe:.4f}, and seasonality_weight: {best_weight_rmspe:.4f}")

Best R^2 score: 0.8643, with ewma_alpha: 0.7722, and seasonality_weight: 0.0100
Best RMSPE score: 0.2159, with ewma_alpha: 0.8811, and seasonality_weight: 0.0100


In [15]:
best_r2 = -float('inf')
best_alpha_r2 = None
best_weight_r2 = None
best_rmspe_r2 = None

for ewma_alpha in alphas:
    for seasonality_weight in weights:
        r2_scores = []
        rmspe_scores = []
        
        for train_idx, test_idx in kf.split(df_joined):
            train_df = df_joined.iloc[train_idx].copy()
            test_df = df_joined.iloc[test_idx].copy()

            train_df.loc[:, 'ewma_vol'] = train_df.groupby('stock_id')['realized_vol'].transform(lambda x: x.ewm(alpha=ewma_alpha).mean())
            train_df.loc[:, 'prediction'] = train_df['ewma_vol'].fillna(train_df['realized_vol'])
            train_df.loc[:, 'prediction'] = train_df['prediction'] * (1 + seasonality_weight * (train_df['realized_vol_hourly'] / train_df['prediction']))

            r2_val = r2_score(y_true=train_df['target'], y_pred=train_df['prediction'])
            rmspe_val = rmspe(y_true=train_df['target'], y_pred=train_df['prediction'])
            
            r2_scores.append(r2_val)
            rmspe_scores.append(rmspe_val)

        avg_r2 = np.mean(r2_scores)
        avg_rmspe = np.mean(rmspe_scores)

        if avg_r2 > best_r2:
            best_r2 = avg_r2
            best_alpha_r2 = ewma_alpha
            best_weight_r2 = seasonality_weight
            best_rmspe_r2 = avg_rmspe

print(f"Best R^2 score: {best_r2:.4f}, with ewma_alpha: {best_alpha_r2:.4f}, and seasonality_weight: {best_weight_r2:.4f}")
print(f"Corresponding RMSPE score: {best_rmspe_r2:.4f}")


Best R^2 score: 0.8643, with ewma_alpha: 0.7722, and seasonality_weight: 0.0100
Corresponding RMSPE score: 0.2217


In [25]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

# Load the data
train = pd.read_csv('train.csv')
book = pd.read_parquet('order_book_feature.parquet')

# Calculate the weighted average price
book['wap'] = (book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1']) / (book['bid_size1'] + book['ask_size1'])

# Calculate log returns
def log_return(list_stock_prices):
    return np.log(list_stock_prices).diff()

# Calculate realized volatility
def realized_volatility(series_log_return):
    return np.sqrt(np.sum(series_log_return**2))

# Calculate realized volatility per time_id
def realized_volatility_per_time_id(df_book_data):
    df_book_data['log_return'] = df_book_data.groupby(['time_id', 'stock_id'])['wap'].apply(log_return)
    df_book_data = df_book_data[~df_book_data['log_return'].isnull()]
    df_realized_vol_per_stock = pd.DataFrame(df_book_data.groupby(['stock_id', 'time_id'])['log_return'].agg(realized_volatility)).reset_index()
    df_realized_vol_per_stock = df_realized_vol_per_stock.rename(columns={'log_return': 'realized_vol'})
    return df_realized_vol_per_stock

df_past_realized_train = realized_volatility_per_time_id(book)

# Feature Engineering: Add new features
book['bid_ask_spread'] = (book['ask_price1'] - book['bid_price1']) / ((book['ask_price1'] + book['bid_price1']) / 2)
book['total_size'] = book['bid_size1'] + book['ask_size1']

# Calculate the average bid-ask spread and total size per stock_id and time_id
avg_spread_size = book.groupby(['stock_id', 'time_id']).agg({'bid_ask_spread': 'mean', 'total_size': 'mean'}).reset_index()

# Merge the average bid-ask spread and total size with the realized volatility
df_past_realized_train = df_past_realized_train.merge(avg_spread_size, on=['stock_id', 'time_id'], how='left')

# Find the optimal EWMA alpha
alphas = np.arange(0.01, 1.01, 0.01)
best_rmspe = np.inf
best_alpha = None

def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

for alpha in alphas:
    df_past_realized_train['ewma_vol'] = df_past_realized_train.groupby('stock_id')['realized_vol'].transform(lambda x: x.ewm(alpha=alpha).mean())
    df_joined = train.merge(df_past_realized_train, on=['stock_id', 'time_id'], how='left')
    df_joined['prediction'] = df_joined['ewma_vol'].fillna(df_joined['realized_vol'])
    RMSPE = rmspe(y_true=df_joined['target'], y_pred=df_joined['prediction'])

    if RMSPE < best_rmspe:
        best_rmspe = RMSPE
        best_alpha = alpha

print(f'Best RMSPE score: {best_rmspe:.4f}, with ewma_alpha: {best_alpha:.4f}')


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_book_data['log_return'] = df_book_data.groupby(['time_id', 'stock_id'])['wap'].apply(log_return)


Best RMSPE score: 0.2077, with ewma_alpha: 0.9100


In [26]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

# Load the data
train = pd.read_csv('train.csv')
book = pd.read_parquet('order_book_feature.parquet')

# Calculate the weighted average price
book['wap'] = (book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1']) / (book['bid_size1'] + book['ask_size1'])

# Calculate log returns
def log_return(list_stock_prices):
    return np.log(list_stock_prices).diff()

# Calculate realized volatility
def realized_volatility(series_log_return):
    return np.sqrt(np.sum(series_log_return**2))

# Calculate realized volatility per time_id
def realized_volatility_per_time_id(df_book_data):
    df_book_data['log_return'] = df_book_data.groupby(['time_id', 'stock_id'])['wap'].apply(log_return)
    df_book_data = df_book_data[~df_book_data['log_return'].isnull()]
    df_realized_vol_per_stock = pd.DataFrame(df_book_data.groupby(['stock_id', 'time_id'])['log_return'].agg(realized_volatility)).reset_index()
    df_realized_vol_per_stock = df_realized_vol_per_stock.rename(columns={'log_return': 'realized_vol'})
    return df_realized_vol_per_stock

df_past_realized_train = realized_volatility_per_time_id(book)

# Feature Engineering: Add new features
book['bid_ask_spread'] = (book['ask_price1'] - book['bid_price1']) / ((book['ask_price1'] + book['bid_price1']) / 2)
book['total_size'] = book['bid_size1'] + book['ask_size1']

# Calculate the average bid-ask spread and total size per stock_id and time_id
avg_spread_size = book.groupby(['stock_id', 'time_id']).agg({'bid_ask_spread': 'mean', 'total_size': 'mean'}).reset_index()

# Merge the average bid-ask spread and total size with the realized volatility
df_past_realized_train = df_past_realized_train.merge(avg_spread_size, on=['stock_id', 'time_id'], how='left')

# Find the optimal EWMA alpha
alphas = np.arange(0.01, 1.01, 0.01)
best_rmspe = np.inf
best_r2 = -np.inf
best_alpha = None

def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

for alpha in alphas:
    df_past_realized_train['ewma_vol'] = df_past_realized_train.groupby('stock_id')['realized_vol'].transform(lambda x: x.ewm(alpha=alpha).mean())
    df_joined = train.merge(df_past_realized_train, on=['stock_id', 'time_id'], how='left')
    df_joined['prediction'] = df_joined['ewma_vol'].fillna(df_joined['realized_vol'])
    RMSPE = rmspe(y_true=df_joined['target'], y_pred=df_joined['prediction'])
    R2 = r2_score(y_true=df_joined['target'], y_pred=df_joined['prediction'])

    if RMSPE < best_rmspe:
        best_rmspe = RMSPE
        best_r2 = R2
        best_alpha = alpha

print(f'Best RMSPE score: {best_rmspe:.4f}, with ewma_alpha: {best_alpha:.4f}')


print(f'Best R2 score: {best_r2:.4f}, with ewma_alpha: {best_alpha:.4f}')


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_book_data['log_return'] = df_book_data.groupby(['time_id', 'stock_id'])['wap'].apply(log_return)


Best RMSPE score: 0.2077, with ewma_alpha: 0.9100
Best R2 score: 0.8587, with ewma_alpha: 0.9100
