In [57]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
import polars as pl
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Bidirectional, GRU, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, GRU, Dense, Dropout

In [59]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/cfmens/X_test_m4HAPAP.csv
/kaggle/input/cfmens/y_train_or6m3Ta.csv
/kaggle/input/cfmens/X_train_N1UvY30.csv


In [2]:
X_train = pl.scan_csv('/kaggle/input/cfmens/X_train_N1UvY30.csv')
def add_feature_engineering(df):
    # Imbalance Metrics
    df = df.with_columns([
        (pl.col('bid_size') - pl.col('ask_size')).alias('imbalance'),
        (pl.col('bid_size') / (pl.col('ask_size') + 0.01)).alias('imbalance_ratio')  # Avoid division by zero
    ])
    
    # Recent Price Change
    # Calculate price change within each observation id assuming each group 'obs_id' is sorted by time
    df = df.with_columns(
    (pl.col('price').diff().fill_null(0).cumsum().over('obs_id')).alias('cumulative_price_change')
)
    

    # VWAP (Volume-Weighted Average Price)
    # VWAP is calculated as the sum of (price * volume) divided by the total volume
    price_times_volume = pl.col('price') * pl.col('bid_size')  # Use bid_size as a proxy for volume
    total_volume = pl.col('bid_size')
    
    df = df.with_columns(
        ((price_times_volume) / (total_volume)).alias('vwap')
    )

    return df

# Apply feature engineering
df = add_feature_engineering(X_train)
order_counts = df.groupby(['obs_id', 'order_id']).agg(pl.count().alias('count'))
df = df.join(order_counts, on=['obs_id', 'order_id'])

# Step 2: Create shifted columns conditionally
# Using mask to apply shift only to rows where 'count' > 1
columns_to_shift = ['action', 'venue', 'side', 'bid', 'ask', 'bid_size', 'ask_size', 'trade']
for col in columns_to_shift:
    df = df.with_columns(
        pl.when(pl.col('count') > 1)
        .then(pl.col(col).shift(1).over(['obs_id', 'order_id']))
        .otherwise(pl.lit(None))
        .alias(f'prev_{col}')
    )

# Step 3: Optionally, remove the 'count' column if no longer needed
df = df.drop('count')
df = df.fill_null(0)

  (pl.col('price').diff().fill_null(0).cumsum().over('obs_id')).alias('cumulative_price_change')
  order_counts = df.groupby(['obs_id', 'order_id']).agg(pl.count().alias('count'))
  order_counts = df.groupby(['obs_id', 'order_id']).agg(pl.count().alias('count'))


In [7]:

categorical_columns = ['venue', 'action', 'side','trade', 'prev_action', 'prev_venue', 'prev_trade']

for col in categorical_columns:
    df = df.with_columns(pl.col(col).cast(pl.Utf8).cast(pl.Categorical).alias(col))

# To one-hot encode, we can use `to_dummies` (similar to pandas get_dummies)

X_train_p = df.collect().to_pandas()
X_train_encoded = pd.get_dummies(X_train_p, columns=[
    'venue', 'action', 'side', 'trade', 
    'prev_action', 'prev_venue', 'prev_trade', 'prev_side'
], drop_first=True).drop(['obs_id'], axis=1)

# List of quantitative variables
# You should replace the placeholders with the actual names of your quantitative variables
quant_vars = [
 
 'order_id',
 'price',
 'bid',
 'ask',
 'bid_size',
 'ask_size',
 'flux',
 'imbalance',
 'imbalance_ratio',
 'cumulative_price_change',
 'vwap',
 'prev_bid',
 'prev_ask',
 'prev_bid_size',
 'prev_ask_size',
 ]

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the quantitative variables
X_train_encoded[quant_vars] = scaler.fit_transform(X_train_encoded[quant_vars])

# Now X_train_encoded has normalized quantitative variables and encoded categorical variables


In [8]:
X_train_encoded

Unnamed: 0,order_id,price,bid,ask,bid_size,ask_size,flux,imbalance,imbalance_ratio,cumulative_price_change,...,trade_true,prev_action_U,prev_action_D,prev_venue_4,prev_venue_1,prev_venue_2,prev_venue_5,prev_venue_3,prev_trade_true,prev_side_B
0,-1.634780,-0.002671,-0.004684,-0.052716,-0.445946,-0.588867,0.686083,0.107172,0.872301,-0.001111,...,False,False,False,False,False,False,False,False,False,False
1,-1.585247,-0.004831,-0.004684,-0.052716,-0.445946,-0.588867,0.686083,0.107172,0.872301,-0.003273,...,False,False,False,False,False,False,False,False,False,False
2,-1.535713,-0.002763,-0.004684,-0.052716,-0.445946,-0.588867,-0.686455,0.107172,0.872301,-0.001203,...,False,False,False,False,False,False,False,False,False,False
3,-1.486179,-0.002671,-0.004684,-0.052716,-0.445946,-0.588867,0.686083,0.107172,0.872301,-0.001111,...,False,False,False,False,False,False,False,False,False,False
4,-1.436645,-0.002350,-0.004684,-0.052716,-0.445946,-0.588867,-0.686455,0.107172,0.872301,-0.000789,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16079995,1.386790,0.002016,0.268808,-0.048209,0.412520,-0.231378,-0.686455,0.515532,-0.150774,0.004962,...,False,False,False,True,False,False,False,False,False,False
16079996,1.832596,-0.003774,0.268808,-0.048209,0.412520,-0.093882,0.686083,0.406636,-0.159070,-0.000835,...,False,False,False,False,False,False,False,False,False,False
16079997,1.882130,0.001741,0.268808,-0.048209,0.412520,-0.093882,0.686083,0.406636,-0.159070,0.004686,...,False,False,False,False,False,False,False,False,False,False
16079998,1.931664,0.001741,0.268808,-0.048209,0.412520,-0.093882,0.686083,0.406636,-0.159070,0.004686,...,False,False,False,False,False,False,False,False,False,False


In [10]:
  
categorical_vars = [col for col in X_train_encoded.columns if col not in quant_vars]

# Convert quantitative variables to float (if not already)
X_train_encoded[quant_vars] = X_train_encoded[quant_vars].astype(float)

# Convert categorical variables to int
X_train_encoded[categorical_vars] = X_train_encoded[categorical_vars].astype(int)

# Convert the DataFrame to a NumPy array
X_train_np = X_train_encoded.to_numpy()

X_train_np = X_train_encoded.to_numpy()

# Assuming each sequence is 100 observations long
num_sequences = int(X_train_np.shape[0] / 100)
num_features = X_train_np.shape[1]  # Number of features after encoding

X_train_reshaped = X_train_np.reshape((num_sequences, 100, num_features))

 

In [14]:
# Path to your y_train CSV file
y_train_path = '/kaggle/input/cfmens/y_train_or6m3Ta.csv'

# Read y_train using Pandas
y_train_df = pd.read_csv(y_train_path)

In [13]:
y_train_df.iloc[:, -1]

0         10
1         15
2          0
3         13
4          0
          ..
160795    13
160796     1
160797     3
160798    11
160799     5
Name: eqt_code_cat, Length: 160800, dtype: int64

In [15]:
 
y_train = y_train_df.iloc[:, -1].values  # This extracts the labels as a NumPy array

 
num_sequences = int(X_train_reshaped.shape[0])

assert len(y_train) == num_sequences, "The length of y_train does not match the number of sequences in X_train."

 

num_classes = np.unique(y_train).size

# Convert labels to one-hot encoding
y_train_categorical = to_categorical(y_train, num_classes=num_classes)

# y_train_categorical is now ready to be used with your model.


2024-05-11 15:03:01.895443: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-11 15:03:01.895650: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-11 15:03:02.162362: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [42]:


# Define the model
model = Sequential([
    # Add a Bidirectional GRU layer
    Bidirectional(GRU(64, return_sequences=True), input_shape=(100, X_train_reshaped.shape[2])),
    Dropout(0.1),  # Dropout for regularization
    Bidirectional(GRU(32)),
    Dropout(0.1),  # Another Dropout layer for regularization
    # Output layer, assuming `num_classes` is defined from y_train preparation
    Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()
 


  super().__init__(**kwargs)


In [17]:

early_stopping = EarlyStopping(
    monitor='val_loss',     # Monitor validation loss
    patience=3,             # Number of epochs with no improvement after which training will be stopped
    verbose=1,              # Output a message when stopping
    restore_best_weights=True  # Restore model weights from the epoch with the best value of the monitored quantity
)
history = model.fit(
    X_train_reshaped, 
    y_train_categorical, 
    epochs=50,  # Number of epochs: adjust based on your data and training progress
    batch_size=64,  # Batch size: adjust based on your data and available memory
    validation_split=0.25,
    callbacks=[early_stopping]# Use 20% of the data for validation
    
)


Epoch 1/50
[1m1885/1885[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 19ms/step - accuracy: 0.1497 - loss: 2.6497 - val_accuracy: 0.2813 - val_loss: 2.2053
Epoch 2/50
[1m1885/1885[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 18ms/step - accuracy: 0.2909 - loss: 2.1643 - val_accuracy: 0.3459 - val_loss: 1.9930
Epoch 3/50
[1m1885/1885[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 18ms/step - accuracy: 0.3504 - loss: 1.9716 - val_accuracy: 0.3783 - val_loss: 1.8737
Epoch 4/50
[1m1885/1885[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 18ms/step - accuracy: 0.3893 - loss: 1.8431 - val_accuracy: 0.4158 - val_loss: 1.7596
Epoch 5/50
[1m1885/1885[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 18ms/step - accuracy: 0.4222 - loss: 1.7350 - val_accuracy: 0.4399 - val_loss: 1.6852
Epoch 6/50
[1m1885/1885[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 18ms/step - accuracy: 0.4488 - loss: 1.6451 - val_accuracy: 0.4719 - val_loss: 1.5842
Epoc

In [24]:
X_test = pl.scan_csv('/kaggle/input/cfmens/X_test_m4HAPAP.csv')
def add_feature_engineering(df):
    # Imbalance Metrics
    df = df.with_columns([
        (pl.col('bid_size') - pl.col('ask_size')).alias('imbalance'),
        (pl.col('bid_size') / (pl.col('ask_size') + 0.01)).alias('imbalance_ratio')  # Avoid division by zero
    ])
    
    # Recent Price Change
    # Calculate price change within each observation id assuming each group 'obs_id' is sorted by time
    df = df.with_columns(
    (pl.col('price').diff().fill_null(0).cumsum().over('obs_id')).alias('cumulative_price_change')
)
    

    # VWAP (Volume-Weighted Average Price)
    # VWAP is calculated as the sum of (price * volume) divided by the total volume
    price_times_volume = pl.col('price') * pl.col('bid_size')  # Use bid_size as a proxy for volume
    total_volume = pl.col('bid_size')
    
    df = df.with_columns(
        ((price_times_volume) / (total_volume)).alias('vwap')
    )

    return df

# Apply feature engineering
df_test = add_feature_engineering(X_test)
order_counts = df_test.groupby(['obs_id', 'order_id']).agg(pl.count().alias('count'))
df_test = df_test.join(order_counts, on=['obs_id', 'order_id'])

columns_to_shift = ['action', 'venue', 'side', 'bid', 'ask', 'bid_size', 'ask_size', 'trade']
for col in columns_to_shift:
    df_test = df_test.with_columns(
        pl.when(pl.col('count') > 1)
        .then(pl.col(col).shift(1).over(['obs_id', 'order_id']))
        .otherwise(pl.lit(None))
        .alias(f'prev_{col}')
    )

# Step 3: Optionally, remove the 'count' column if no longer needed
df_test = df_test.drop('count')
categorical_columns = ['venue', 'action', 'side','trade', 'prev_action', 'prev_venue', 'prev_trade']
for col in categorical_columns:
    df = df_test.with_columns(pl.col(col).cast(pl.Utf8).cast(pl.Categorical).alias(col))
X_test_p = df.collect().to_pandas()
X_test_encoded = pd.get_dummies(X_test_p, columns=[
    'venue', 'action', 'side', 'trade', 
    'prev_action', 'prev_venue', 'prev_trade', 'prev_side'
], drop_first=True).drop(['obs_id'], axis=1)

 
quant_vars = [
 
 'order_id',
 'price',
 'bid',
 'ask',
 'bid_size',
 'ask_size',
 'flux',
 'imbalance',
 'imbalance_ratio',
 'cumulative_price_change',
 'vwap',
 'prev_bid',
 'prev_ask',
 'prev_bid_size',
 'prev_ask_size',
 ]

# transform the quantitative variables
X_test_encoded[quant_vars] = scaler.transform(X_test_encoded[quant_vars])
categorical_vars = [col for col in X_test_encoded.columns if col not in quant_vars]
X_test_encoded[quant_vars] = X_test_encoded[quant_vars].astype(float)
X_test_encoded[categorical_vars] = X_test_encoded[categorical_vars].astype(int)
X_test_np = X_test_encoded.to_numpy()
num_sequences = int(X_test_np.shape[0] / 100)
num_features = X_test_np.shape[1]  # Number of features after encoding
X_test_reshaped = X_test_np.reshape((num_sequences, 100, num_features))


  (pl.col('price').diff().fill_null(0).cumsum().over('obs_id')).alias('cumulative_price_change')
  order_counts = df_test.groupby(['obs_id', 'order_id']).agg(pl.count().alias('count'))
  order_counts = df_test.groupby(['obs_id', 'order_id']).agg(pl.count().alias('count'))


In [28]:
X_test_reshaped.shape

(81600, 100, 33)

In [36]:
# Make predictions
predictions = model.predict(X_test_reshaped)

# If your model outputs one-hot encoded predictions, convert these to label predictions
# Assuming your model does classification
predicted_labels = predictions.argmax(axis=1)


[1m2550/2550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step


In [41]:
# Example: Save predictions to CSV for submission
submission_df = pd.DataFrame(predicted_labels, columns=['eqt_code_cat'])
submission_df.to_csv('submission_laset.csv')


In [40]:
submission_df.eqt_code_cat.value_counts()

eqt_code_cat
9     64276
14    11285
11     2313
0      1181
1       925
19      884
12      674
10       33
13       10
8         8
3         6
17        3
21        1
16        1
Name: count, dtype: int64

In [50]:
X_train_encoded

Unnamed: 0,order_id,price,bid,ask,bid_size,ask_size,flux,venue_4,venue_1,venue_5,venue_2,venue_0,venue_3,action_A,action_D,action_U,side_A,side_B,trade_false,trade_true
0,0,0.30,0.00,0.01,100,1,100,True,False,False,False,False,False,True,False,False,True,False,True,False
1,1,-0.17,0.00,0.01,100,1,100,True,False,False,False,False,False,True,False,False,False,True,True,False
2,2,0.28,0.00,0.01,100,1,-100,True,False,False,False,False,False,False,True,False,True,False,True,False
3,3,0.30,0.00,0.01,100,1,100,True,False,False,False,False,False,True,False,False,True,False,True,False
4,4,0.37,0.00,0.01,100,1,-100,True,False,False,False,False,False,False,True,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16079995,61,1.32,0.01,0.06,735,261,-100,True,False,False,False,False,False,False,True,False,True,False,True,False
16079996,70,0.06,0.01,0.06,735,361,100,False,False,False,False,True,False,True,False,False,True,False,True,False
16079997,71,1.26,0.01,0.06,735,361,100,True,False,False,False,False,False,True,False,False,True,False,True,False
16079998,72,1.26,0.01,0.06,735,361,100,True,False,False,False,False,False,True,False,False,True,False,True,False


# Use a CNN-BIGRU


In [32]:


# Define the model
model = Sequential([
    # Add a 1D Convolutional layer to extract features
    Conv1D(filters=64, kernel_size=3, activation='selu', input_shape=(100, X_train_reshaped.shape[2])),
    MaxPooling1D(pool_size=2),
    Dropout(0.2),  # Dropout for regularization

    # Add a Bidirectional GRU layer
    Bidirectional(GRU(64, return_sequences=True)),
    Dropout(0.2),  # Dropout for regularization
    Bidirectional(GRU(32)),
    Dropout(0.2),  # Another Dropout layer for regularization

    # Output layer, assuming `num_classes` is defined from y_train preparation
    Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()


  super().__init__(


In [33]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor='val_loss',     # Monitor validation loss
    patience=3,             # Number of epochs with no improvement after which training will be stopped
    verbose=1,              # Output a message when stopping
    restore_best_weights=True  # Restore model weights from the epoch with the best value of the monitored quantity
)

In [34]:
history = model.fit(
    X_train_reshaped, 
    y_train_categorical, 
    epochs=50,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping]  # Include the early stopping callback
)

Epoch 1/50
[1m2010/2010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 14ms/step - accuracy: 0.1644 - loss: 2.6030 - val_accuracy: 0.3068 - val_loss: 2.1198
Epoch 2/50
[1m2010/2010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 13ms/step - accuracy: 0.3147 - loss: 2.0949 - val_accuracy: 0.3778 - val_loss: 1.8845
Epoch 3/50
[1m2010/2010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 13ms/step - accuracy: 0.3685 - loss: 1.9071 - val_accuracy: 0.4192 - val_loss: 1.7472
Epoch 4/50
[1m2010/2010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 13ms/step - accuracy: 0.4114 - loss: 1.7720 - val_accuracy: 0.4413 - val_loss: 1.6726
Epoch 5/50
[1m2010/2010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 13ms/step - accuracy: 0.4444 - loss: 1.6611 - val_accuracy: 0.4815 - val_loss: 1.5484
Epoch 6/50
[1m2010/2010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 13ms/step - accuracy: 0.4632 - loss: 1.5915 - val_accuracy: 0.4919 - val_loss: 1.5071
Epoc