In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
all = pd.read_csv("/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Input/new_train_data.csv")
all = all.dropna(subset=['target','wap'])
all.isnull().sum()

stock_id                        0
date_id                         0
seconds_in_bucket               0
wap                             0
target                          0
time_id                         0
row_id                          0
weight                          0
weighted_wap                    0
index                           0
next_wap                        0
next_index                      0
ask_price                       0
ask_size                        0
bid_price                       0
bid_size                        0
far_price                  182727
imbalance_buy_sell_flag         0
imbalance_size                  0
matched_size                    0
near_price                 180000
reference_price                 0
dtype: int64

In [26]:
all_sample = all[all["date_id"]> 450]
train = all_sample[all_sample["date_id"]<475]
test = all_sample[all_sample["date_id"]>=475]

print(train.shape)
print(test.shape)
train.columns

(264000, 22)
(66000, 22)


Index(['stock_id', 'date_id', 'seconds_in_bucket', 'wap', 'target', 'time_id',
       'row_id', 'weight', 'weighted_wap', 'index', 'next_wap', 'next_index',
       'ask_price', 'ask_size', 'bid_price', 'bid_size', 'far_price',
       'imbalance_buy_sell_flag', 'imbalance_size', 'matched_size',
       'near_price', 'reference_price'],
      dtype='object')

In [27]:
def feature_engineering(X):

    _X = X.copy()
    # features taken from https://www.kaggle.com/code/nhsmith/optiver-catboost-with-feature-selection
    _X["liquidity_imbalance"] = _X.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    _X["matched_imbalance"] = _X.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    _X["price_spread"] = _X["ask_price"] - _X["bid_price"]
    _X['market_urgency'] = _X['price_spread'] * _X['liquidity_imbalance']

    return _X

In [28]:
# apply feature engineering
train = feature_engineering(train)
test = feature_engineering(test)
# show result
print(train.shape)
print(test.shape)
train.columns

(264000, 26)
(66000, 26)


Index(['stock_id', 'date_id', 'seconds_in_bucket', 'wap', 'target', 'time_id',
       'row_id', 'weight', 'weighted_wap', 'index', 'next_wap', 'next_index',
       'ask_price', 'ask_size', 'bid_price', 'bid_size', 'far_price',
       'imbalance_buy_sell_flag', 'imbalance_size', 'matched_size',
       'near_price', 'reference_price', 'liquidity_imbalance',
       'matched_imbalance', 'price_spread', 'market_urgency'],
      dtype='object')

In [29]:
# Convert all numerics to float32 to reduce memory footprint
train = train.astype(np.float32)
test = test.astype(np.float32)

In [30]:
# Selecting features
# features = [col for col in train.columns if col not in ['stock_id','row_id', 'time_id', 'target', 'next_index', 'next_wap', 'weight', 'weighted_wap', 'far_price', 'near_price']]
features = ['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_buy_sell_flag', 'reference_price', 'matched_size','wap']
target = 'next_wap'

# Preparing the data
X_train = train[features]
y_train = train[target]
X_test = test[features]
y_test = test[target]

y_train_target = train['target']
y_test_target = test['target']

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [11]:
def get_pred_index_and_target(data): # input has to have value 'pred_next wap'
    data['pred_next_weighted_wap'] = data['pred_next_wap'] * data['weight']
    grouped_sums = data.groupby('time_id')['pred_next_weighted_wap'].sum()
    data['pred_next_index'] = data['time_id'].map(grouped_sums)
    data['pred_target'] = ((data['pred_next_wap'] / data['wap']) - (data['pred_next_index'] / data['index'])) * 10000
    return data.loc[:,['pred_next_weighted_wap', 'pred_next_index', 'pred_target']]

In [12]:
# Define a neural network model
def fit_neural_network(X_train, y_train):

    # Create the Neural Network Model
    model = Sequential()
    model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='linear'))  # Output layer for regression

    # Define optimizer
    optimizer = Adam(learning_rate=0.0001)
    # Compile the model
    model.compile(loss='mean_absolute_error', optimizer=optimizer)

    # Define the early stopping callback
    early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, verbose=1, mode='min', restore_best_weights=True)

    # Train the model with the early stopping callback
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping_callback])

    return model

In [22]:
# Get Prediction result
def calculate_mae(train, test, X_train, X_test, y_train_target, y_test_target, model):

    # Make predictions on the training set
    train_pred_next_wap = model.predict(X_train)
    # Flatten predictions if necessary
    train_pred_next_wap = np.ravel(train_pred_next_wap)
    # Add to the train data
    train['pred_next_wap'] = train_pred_next_wap

    # Make predictions on the test set
    test_pred_next_wap = model.predict(X_test)
    # Flatten predictions if necessary
    test_pred_next_wap = np.ravel(test_pred_next_wap)
    # Add to the test data
    test['pred_next_wap'] = test_pred_next_wap

    # Get predicted index and target
    train[['pred_next_weighted_wap', 'pred_next_index', 'pred_target']] = get_pred_index_and_target(train)
    test[['pred_next_weighted_wap', 'pred_next_index', 'pred_target']] = get_pred_index_and_target(test)

    # Calculate Mean Absolute Error on train data
    train_mae = mean_absolute_error(y_train_target, train['pred_target'])
    # Calculate Mean Absolute Error on test data
    test_mae = mean_absolute_error(y_test_target, test['pred_target'])

    return train, test, train_mae, test_mae

In [14]:
nn_model = fit_neural_network(X_train_scaled, y_train)
nn_model.save('/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Output/nn_model.h5')  # saves the model as an H5 file

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
# loading saved model
nn_model = tf.keras.models.load_model('/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Output/nn_model.h5')

In [31]:
nn_train, nn_test, nn_train_mae, nn_test_mae = calculate_mae(train, test, X_train_scaled, X_test_scaled, y_train_target, y_test_target, nn_model)
print("Train MAE: ", nn_train_mae)
print("Test MAE: ", nn_test_mae)

Train MAE:  6.606262
Test MAE:  6.192182


In [16]:
nn_train

Unnamed: 0,stock_id,date_id,seconds_in_bucket,wap,target,time_id,row_id,weight,weighted_wap,index,...,near_price,reference_price,liquidity_imbalance,matched_imbalance,price_spread,market_urgency,pred_next_wap,pred_next_weighted_wap,pred_next_index,pred_target
0,0.0,451.0,0.0,1.000000,12.749434,24805.0,45100.0,0.004000,0.004000,1.000000,...,,1.001570,0.288933,0.569310,0.000168,4.854067e-05,1.000189,0.004001,0.999826,3.634691
1,1.0,451.0,0.0,1.000000,-0.090003,24805.0,45101.0,0.001000,0.001000,1.000000,...,,0.999733,-0.000218,-0.327344,0.000436,-9.521268e-08,0.999702,0.001000,0.999826,-1.233220
2,2.0,451.0,0.0,1.000000,5.480051,24805.0,45102.0,0.002000,0.002000,1.000000,...,,0.999531,-0.024923,-1.000000,0.001066,-2.656796e-05,0.999690,0.002000,0.999826,-1.355410
3,3.0,451.0,0.0,1.000000,-3.870130,24805.0,45103.0,0.005999,0.005999,1.000000,...,,1.000054,-0.028673,-0.813436,0.000202,-5.791891e-06,0.999824,0.005998,0.999826,-0.016689
4,4.0,451.0,0.0,1.000000,2.980232,24805.0,45104.0,0.004001,0.004001,1.000000,...,,1.000013,0.817904,-0.257562,0.001681,1.374896e-03,0.999843,0.004000,0.999826,0.174046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263995,195.0,474.0,540.0,1.002569,-2.319813,26124.0,474540192.0,0.004001,0.004011,0.999981,...,1.000703,1.002504,0.079424,-0.809315,0.000120,9.530911e-06,1.001999,0.004009,1.000093,-6.809831
263996,196.0,474.0,540.0,1.002677,-3.029704,26124.0,474540192.0,0.001000,0.001003,0.999981,...,1.003725,1.002682,0.963277,-0.914636,0.000261,2.514152e-04,1.002096,0.001002,1.000093,-6.910563
263997,197.0,474.0,540.0,0.997456,-12.980103,26124.0,474540192.0,0.004000,0.003990,0.999981,...,0.997294,0.997393,0.282559,-0.973735,0.000099,2.797336e-05,0.998348,0.003994,1.000093,7.827282
263998,198.0,474.0,540.0,1.000766,-1.450181,26124.0,474540192.0,0.005999,0.006004,0.999981,...,1.000918,1.000918,-0.219758,-0.977775,0.000250,-5.493946e-05,1.000692,0.006004,1.000093,-1.860261


In [17]:
def fit_deep_neural_network(X_train, y_train):
    # Adjusted Neural Network Model
    model = Sequential()
    model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))  # Increased neurons
    model.add(Dropout(0.3))  # Added dropout
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))  # Added dropout
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='linear'))  # Output layer

    # Compile the model with a different learning rate
    optimizer = Adam(learning_rate=0.0001)  # Adjust learning rate as needed
    model.compile(loss='mean_absolute_error', optimizer=optimizer)

    # Train the model (include early stopping as before)
    early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, verbose=1, mode='min', restore_best_weights=True)

    # Train the model with the early stopping callback
    model.fit(X_train, y_train, epochs=10, batch_size=16, validation_split=0.2, callbacks=[early_stopping_callback])

    return model


In [18]:
dnn_model = fit_deep_neural_network(X_train_scaled, y_train)
dnn_model.save('/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Output/dnn_model_sample_data.h5')  # saves the model as an H5 file

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
# loading saved model
dnn_model = tf.keras.models.load_model('/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Output/dnn_model_sample_data.h5')

In [36]:
dnn_train, dnn_test, dnn_train_mae, dnn_test_mae = calculate_mae(train, test, X_train_scaled, X_test_scaled, y_train_target, y_test_target, dnn_model)
print("Train MAE: ", dnn_train_mae)
print("Test MAE: ", dnn_test_mae)

Train MAE:  7.2215676
Test MAE:  9.652169


In [20]:
# Function to create the model (needed for KerasClassifier)
def create_model(neurons=32, dropout_rate=0.3, learning_rate=0.01):
    model = Sequential()
    model.add(Dense(neurons, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='linear'))
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='mean_absolute_error', optimizer=optimizer)
    return model

def fit_neural_network_grid_search(X_train, y_train):

    # Define the grid search parameters
    param_grid = {
        'neurons': [64, 128],
        'dropout_rate': [0.3, 0.4],
        'learning_rate': [0.001, 0.01],
    }

    # Wrap the model using KerasClassifier
    model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=32, verbose=0)

    # Create GridSearchCV
    grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
    grid_result = grid.fit(X_train, y_train)

    # Get the best model
    best_model = grid_result.best_estimator_.model

    return best_model

In [21]:
# nn_fine_tuned_model = fit_neural_network_grid_search(X_train_scaled, y_train)
# nn_fine_tuned_model.save('/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Output/nn_fine_tuned_model.h5')

  model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=32, verbose=0)


KeyboardInterrupt: ignored

In [None]:
# nn_fine_tuned_train, nn_fine_tuned_test, nn_fine_tuned_train_mae, nn_fine_tuned_test_mae = calculate_mae(train, test, X_train_scaled, X_test_scaled, y_train, y_test, nn_fine_tuned_model)
# print("Train MAE: ", nn_fine_tuned_train_mae)
# print("Test MAE: ", nn_fine_tuned_test_mae)