In [1]:
# Naive prediction
# =========================
#
# Naive prediction is predicting that the future value is the same as the present value.

import os
from sklearn.metrics import r2_score
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

list_order_book_file_train = glob.glob('../../data/book_train.parquet/*')

In [13]:
# Log Return
def log_return(list_stock_price):
    return np.log(list_stock_price).diff()

# Realized Volatility
def realized_volatility(series_log_return):
    return np.sqrt(np.sum(series_log_return**2))

# Realized Volatility of a certain stock per time id
def realized_volatility_per_time_id(file_path, prediction_column_name):
    df_book_data = pd.read_parquet(file_path)
    df_book_data['wap'] = (df_book_data['bid_price1']*df_book_data['ask_size1'] +
                               df_book_data['ask_price1']*df_book_data['bid_size1']) / (
                                    df_book_data['bid_size1'] + df_book_data['ask_size1'])
    df_book_data['log_return'] = df_book_data.groupby(['time_id'])['wap'].apply(log_return)
    df_book_data = df_book_data[~df_book_data['log_return'].isnull()]
    df_realized_vol_per_stock = pd.DataFrame(df_book_data.groupby(['time_id'])['log_return'].agg(realized_volatility)).reset_index()
    df_realized_vol_per_stock = df_realized_vol_per_stock.rename(columns={'log_return': prediction_column_name})
    stock_id = file_path.split('=')[1]
    df_realized_vol_per_stock['row_id'] = df_realized_vol_per_stock['time_id'].apply(lambda x: f'{stock_id}-{x}')
    return df_realized_vol_per_stock[['row_id', prediction_column_name]]

# Past Realized Volatility for each individual stocks.
def past_realized_volatility_per_stock(list_file, prediction_column_name):
    df_past_realized = pd.DataFrame()
    for file in list_file:
        df_past_realized = pd.concat([
            df_past_realized,
            realized_volatility_per_time_id(file, prediction_column_name)
        ])
        
    return df_past_realized

df_past_realized_train = past_realized_volatility_per_stock(list_file=list_order_book_file_train, prediction_column_name='pred')

In [14]:
df_past_realized_train.head()

Unnamed: 0,row_id,pred
0,75-5,0.009007
1,75-11,0.003141
2,75-16,0.002871
3,75-31,0.00497
4,75-62,0.003171


In [19]:
train = pd.read_csv('../../data/train.csv')
train.head()

Unnamed: 0,stock_id,time_id,target
0,0,5,0.004136
1,0,11,0.001445
2,0,16,0.002168
3,0,31,0.002195
4,0,62,0.001747


In [20]:
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id', 'target']]
df_joined = train.merge(df_past_realized_train[['row_id', 'pred']], on=['row_id'], how='left')
df_joined.head()

Unnamed: 0,row_id,target,pred
0,0-5,0.004136,
1,0-11,0.001445,
2,0-16,0.002168,
3,0-31,0.002195,
4,0-62,0.001747,


In [None]:
def rmspe(y_true, y_pred):
    return (np.sqrt(np.mean(np.square((y_true - y_pred)/ y_true))))

R2 = round(r2_score(y_true=df_joined['target'], y_pred = df_joined['pred']), 3)
RMSPE = round(rmspe(y_true=df_joined['target'], y_pred=df_joined['pred']), 3)
print(f'performance of the naive prediction: R2 score {R2}, RMSPE: {RMSPE}')