In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

import gc
import os
import time
import warnings
from itertools import combinations
from warnings import simplefilter

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, TimeSeriesSplit

warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Memory Reduction Function

In [3]:
def reduce_mem_usage(df, verbose=0):
    """
    Iterate through all numeric columns of a dataframe and modify the data type
    to reduce memory usage.
    """

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)

    if verbose:
        logger.info(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        logger.info(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        logger.info(f"Decreased by {decrease:.2f}%")

    return df


# Data Preparation

In [4]:
all = pd.read_csv("/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Input/train.csv")
all = all.dropna(subset=['target','wap'])
all.isnull().sum()

stock_id                         0
date_id                          0
seconds_in_bucket                0
imbalance_size                   0
imbalance_buy_sell_flag          0
reference_price                  0
matched_size                     0
far_price                  2894122
near_price                 2856960
bid_price                        0
bid_size                         0
ask_price                        0
ask_size                         0
wap                              0
target                           0
time_id                          0
row_id                           0
dtype: int64

In [5]:
all_sample = all
train = all_sample[all_sample["date_id"]<475]
test = all_sample[all_sample["date_id"]>=475]
del all, all_sample

print(train.shape)
print(test.shape)
train.head()

(5171760, 17)
(66000, 17)


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


In [6]:
weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]

weights = {int(k):v for k,v in enumerate(weights)}

# Feature Engineering

In [7]:
global_stock_id_feats = {
        "median_size": train.groupby("stock_id")["bid_size"].median() + train.groupby("stock_id")["ask_size"].median(),
        "std_size": train.groupby("stock_id")["bid_size"].std() + train.groupby("stock_id")["ask_size"].std(),
        "ptp_size": train.groupby("stock_id")["bid_size"].max() - train.groupby("stock_id")["bid_size"].min(),
        "median_price": train.groupby("stock_id")["bid_price"].median() + train.groupby("stock_id")["ask_price"].median(),
        "std_price": train.groupby("stock_id")["bid_price"].std() + train.groupby("stock_id")["ask_price"].std(),
        "ptp_price": train.groupby("stock_id")["bid_price"].max() - train.groupby("stock_id")["ask_price"].min(),
    }

In [8]:
from numba import njit, prange

@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))

    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            if mid_val == min_val:  # Prevent division by zero
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)

    return imbalance_features


def calculate_triplet_imbalance_numba(price, df):
    # Convert DataFrame to numpy array for Numba compatibility
    df_values = df[price].values
    comb_indices = [(price.index(a), price.index(b), price.index(c)) for a, b, c in combinations(price, 3)]

    # Calculate the triplet imbalance
    features_array = compute_triplet_imbalance(df_values, comb_indices)

    # Create a DataFrame from the results
    columns = [f"{a}_{b}_{c}_imb2" for a, b, c in combinations(price, 3)]
    features = pd.DataFrame(features_array, columns=columns)

    return features

In [9]:
# generate imbalance features
def imbalance_features(df):
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    # V1
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values

    # V2
    df["stock_weights"] = df["stock_id"].map(weights)
    df["weighted_wap"] = df["stock_weights"] * df["wap"]
    df['wap_momentum'] = df.groupby('stock_id')['weighted_wap'].pct_change(periods=6)
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    df['spread_depth_ratio'] = (df['ask_price'] - df['bid_price']) / (df['bid_size'] + df['ask_size'])
    df['mid_price_movement'] = df['mid_price'].diff(periods=5).apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
    df['micro_price'] = ((df['bid_price'] * df['ask_size']) + (df['ask_price'] * df['bid_size'])) / (df['bid_size'] + df['ask_size'])
    df['relative_spread'] = (df['ask_price'] - df['bid_price']) / df['wap']

    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)

    # V3
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1, 2, 3, 5, 10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)

    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size',
                'wap', 'near_price', 'far_price']:
        for window in [1, 2, 3, 5, 10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)

    return df.replace([np.inf, -np.inf], 0)

# generate time & stock features
def other_features(df):
    df["dow"] = df["date_id"] % 5
    df["dom"] = df["date_id"] % 20
    df["seconds"] = df["seconds_in_bucket"] % 60
    df["minute"] = df["seconds_in_bucket"] // 60

    for key, value in global_stock_id_feats.items():
        df[f"global_{key}"] = df["stock_id"].map(value.to_dict())

    return df

# generate all features
def generate_all_features(df):
    cols = [c for c in df.columns if c not in ["row_id", "time_id"]]
    df = df[cols]
    df = imbalance_features(df)
    df = other_features(df)
    gc.collect()

    feature_name = [i for i in df.columns if i not in ["row_id", "time_id", "date_id"]]

    return df[feature_name]

In [10]:
# Generate data set
train = generate_all_features(train)
print("Build Train Finished.")
test = generate_all_features(test)
print("Build Test Finished.")

# Preparing the data
X_train = train.drop(columns=['target'])
X_test = test.drop(columns=['target'])
y_train = train['target']
y_test = test['target']

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

Build Train Finished.
Build Test Finished.


In [11]:
feature_no_missing = train.isnull().sum() == 0
X_train_scaled = X_train_scaled.loc[:, feature_no_missing]
X_test_scaled = X_test_scaled.loc[:, feature_no_missing]

# Traditional Feature Engineering

In [12]:
# # Traditional Feature Setting
# features = ['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_buy_sell_flag', 'reference_price', 'matched_size','wap']
# target = 'target'

# # Preparing the data
# X_train = train[features]
# y_train = train[target]
# X_test = test[features]
# y_test = test[target]

# # Standardize the data
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
# X_test_scaled = scaler.transform(X_test)
# X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [25]:
# Get Prediction result
def predict_result(X_train, X_test, y_train, y_test, model):

    # Make predictions on the training set
    train_predictions = model.predict(X_train)
    # Flatten predictions if necessary
    train_predictions = np.ravel(train_predictions)

    # Make predictions on the test set
    test_predictions = model.predict(X_test)
    # Flatten predictions if necessary
    test_predictions = np.ravel(test_predictions)

    # Calculate Mean Absolute Error on training data
    train_mae = mean_absolute_error(y_train, train_predictions)
    # Calculate Mean Absolute Error
    test_mae = mean_absolute_error(y_test, test_predictions)

    return train_predictions, test_predictions, train_mae, test_mae

In [26]:
# Calculate test MAE
def calculate_mae(y_test, y_test_pred, model):

    # Calculate Mean Absolute Error
    test_mae = mean_absolute_error(y_test, y_test_pred)

    return test_mae

# Modeling

## Model 1: Neural Network

In [14]:
def fit_neural_network(X_train, X_test, y_train, y_test):

    # Create the Neural Network Model
    model = Sequential()
    model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='linear'))  # Output layer for regression

    # Compile the model
    model.compile(loss='mean_absolute_error', optimizer='adam')

    # Define the early stopping callback
    early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, verbose=1, mode='min', restore_best_weights=True)

    # Train the model with the early stopping callback
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping_callback])

    return model

In [15]:
nn_model = fit_neural_network(X_train_scaled, X_test_scaled, y_train, y_test)
nn_model.save('/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Output/nn_model.h5')  # saves the model as an H5 file

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
nn_train_pred, nn_test_pred, nn_train_mae, nn_test_mae = predict_result(X_train_scaled, X_test_scaled, y_train, y_test, nn_model)
print("Train MAE: ", nn_train_mae)
print("Test MAE: ", nn_test_mae)

Train MAE:  6.288131115394802
Test MAE:  5.314431949632113


## Model 2: Deep Neural Network

In [17]:
def fit_deep_neural_network(X_train, X_test, y_train, y_test):

    model = Sequential()
    model.add(Dense(256, input_dim=X_train.shape[1], activation='relu'))  # Added new layer
    model.add(Dropout(0.3))
    model.add(Dense(128, activation='relu'))  # Increased neurons
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))  # Existing dropout layer
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='relu'))   # Added new layer
    model.add(Dense(1, activation='linear'))  # Output layer

    # Compile the model with a different learning rate
    optimizer = Adam(learning_rate=0.001)  # Adjust learning rate as needed
    model.compile(loss='mean_absolute_error', optimizer=optimizer)

    # Train the model (include early stopping as before)
    early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, verbose=1, mode='min', restore_best_weights=True)

    # Train the model with the early stopping callback
    model.fit(X_train, y_train, epochs=10, batch_size=16, validation_split=0.2, callbacks=[early_stopping_callback])

    return model


In [18]:
dnn_model = fit_deep_neural_network(X_train_scaled, X_test_scaled, y_train, y_test)
dnn_model.save('/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Output/dnn_model.h5')  # saves the model as an H5 file

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
dnn_train_pred, dnn_test_pred, dnn_train_mae, dnn_test_mae = predict_result(X_train_scaled, X_test_scaled, y_train, y_test, dnn_model)
print("Train MAE: ", dnn_train_mae)
print("Test MAE: ", dnn_test_mae)

Train MAE:  6.29959904297082
Test MAE:  5.324659005751953


## Model 3: Fine-tuned Deep Neural Network

In [20]:
# Function to create the model (needed for KerasClassifier)
def create_model(neurons=32, dropout_rate=0.3, learning_rate=0.01):
    model = Sequential()
    model.add(Dense(neurons, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='linear'))
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='mean_absolute_error', optimizer=optimizer)
    return model

def fit_neural_network_grid_search(X_train, X_test, y_train, y_test):

    # Define the grid search parameters
    param_grid = {
        'neurons': [64, 128],
        'dropout_rate': [0.3, 0.4],
        'learning_rate': [0.001, 0.01],
    }

    # Wrap the model using KerasClassifier
    model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=32, verbose=0)

    # Create GridSearchCV
    grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
    grid_result = grid.fit(X_train, y_train)

    # Get the best model
    best_model = grid_result.best_estimator_.model

    return best_model

In [21]:
# nn_fine_tuned_model = fit_neural_network_grid_search(X_train_scaled, X_test_scaled, y_train, y_test)
# nn_fine_tuned_model.save('/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Output/nn_fine_tuned_model.h5')

In [22]:
# nn_fine_tuned_train_mae, nn_fine_tuned_test_mae = calculate_mae(X_train_scaled, X_test_scaled, y_train, y_test, nn_fine_tuned_model)
# print("Train MAE: ", nn_fine_tuned_train_mae)
# print("Test MAE: ", nn_fine_tuned_test_mae)

# Test Models

In [28]:
# loading neural network model
nn_model = tf.keras.models.load_model('/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Output/nn_model.h5')
# calculate MAE for neural network model
nn_train_pred, nn_test_pred, nn_train_mae, nn_test_mae = predict_result(X_train_scaled, X_test_scaled, y_train, y_test, nn_model)
print("Train MAE: ", nn_train_mae)
print("Test MAE: ", nn_test_mae)

Train MAE:  6.288131115394802
Test MAE:  5.314431949632113


In [29]:
# loading deep neural network model
dnn_model = tf.keras.models.load_model('/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Output/dnn_model.h5')
# calculate MAE for deep neural network model
dnn_train_pred, dnn_test_pred, dnn_train_mae, dnn_test_mae = predict_result(X_train_scaled, X_test_scaled, y_train, y_test, dnn_model)
print("Train MAE: ", dnn_train_mae)
print("Test MAE: ", dnn_test_mae)

Train MAE:  6.29959904297082
Test MAE:  5.324659005751953


# Output Prediction

In [30]:
nn_train_pred_df = pd.DataFrame({'perd_target': nn_train_pred})
dnn_train_pred_df = pd.DataFrame({'perd_target': dnn_train_pred})
nn_test_pred_df = pd.DataFrame({'perd_target': nn_test_pred})
dnn_test_pred_df = pd.DataFrame({'perd_target': dnn_test_pred})

In [31]:
# output results
nn_train_pred_df.to_csv('/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Prediction/nn_train_pred.csv', index=False, header=False)
dnn_train_pred_df.to_csv('/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Prediction/dnn_train_pred.csv', index=False, header=False)
nn_test_pred_df.to_csv('/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Prediction/nn_test_pred.csv', index=False, header=False)
dnn_test_pred_df.to_csv('/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Prediction/dnn_test_pred.csv', index=False, header=False)