In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cfmens/X_test_m4HAPAP.csv
/kaggle/input/cfmens/y_train_or6m3Ta.csv
/kaggle/input/cfmens/X_train_N1UvY30.csv


In [2]:
import polars as pl

# Load datasets
x_train_path = '/kaggle/input/cfmens/X_train_N1UvY30.csv'
x_test_path = '/kaggle/input/cfmens/X_test_m4HAPAP.csv'
X_train = pl.scan_csv(x_train_path)

# Function to add feature engineering to a Polars DataFrame
def add_feature_engineering(df):
    # Statistical Features
    for col in ['price', 'bid', 'ask', 'bid_size', 'ask_size', 'flux']:
        df = df.with_columns([
            pl.col(col).mean().over('obs_id').alias(f'{col}_mean'),
            pl.col(col).std().over('obs_id').alias(f'{col}_std'),
            (pl.col(col).max().over('obs_id') - pl.col(col).min().over('obs_id')).alias(f'{col}_range')
        ])
    
    # Imbalance Metrics
    df = df.with_columns([
        (pl.col('bid_size') - pl.col('ask_size')).alias('imbalance'),
        (pl.col('bid_size') / (pl.col('ask_size') + 0.01)).alias('imbalance_ratio')  # Avoid division by zero
    ])
    
    # Recent Price Change
    # Assuming each group 'obs_id' is already sorted by time
    df = df.with_columns(
        (pl.col('price').last().over('obs_id') - pl.col('price').first().over('obs_id')).alias('price_change')
    )
    
    # VWAP
    vwap = (pl.col('price') * pl.col('flux')).sum().over('obs_id') / pl.col('flux').sum().over('obs_id')
    df = df.with_columns(vwap.alias('vwap'))
    
    return df

# Apply feature engineering
X_train_fe = add_feature_engineering(X_train)
 
# Note: The function assumes 'obs_id' or an equivalent exists to group by sequences.
# You might need to adjust it according to your actual dataset structure.


In [3]:
categorical_columns = ['venue', 'action', 'side','trade']

for col in categorical_columns:
    X_train = X_train.with_columns(pl.col(col).cast(pl.Utf8).cast(pl.Categorical).alias(col))

# To one-hot encode, we can use `to_dummies` (similar to pandas get_dummies)


In [4]:
X_train_p = X_train.collect().to_pandas()

In [21]:
X_train_encoded = pd.get_dummies(X_train_p, columns=['venue', 'action', 'side','trade']).drop(['obs_id'], axis=1)

In [22]:
# Assuming y_train is loaded and you have the encoded X_train
# Ensure X_train_encoded is a NumPy array for reshaping
X_train_np = X_train_encoded.to_numpy()

# Assuming each sequence is 100 observations long
num_sequences = int(X_train_np.shape[0] / 100)
num_features = X_train_np.shape[1]  # Number of features after encoding

# Reshape X_train to have dimensions: (num_sequences, 100, num_features)
X_train_reshaped = X_train_np.reshape((num_sequences, 100, num_features))

# Proceed with model training using the reshaped X_train_reshaped


In [45]:
# Path to your y_train CSV file
y_train_path = '/kaggle/input/cfmens/y_train_or6m3Ta.csv'

# Read y_train using Pandas
y_train_df = pd.read_csv(y_train_path)

In [26]:
y_train_df.iloc[:, -1]

0         10
1         15
2          0
3         13
4          0
          ..
160795    13
160796     1
160797     3
160798    11
160799     5
Name: eqt_code_cat, Length: 160800, dtype: int64

In [27]:
import pandas as pd
import numpy as np



# Assuming y_train_df contains a single column with labels
# If y_train_df contains multiple columns or the label column is not the first one,
# adjust the column selection accordingly.
y_train = y_train_df.iloc[:, -1].values  # This extracts the labels as a NumPy array

# Since every 100 observations in X_train correspond to a single label in y_train,
# and assuming the observations are evenly distributed among the labels,
# we don't need to reshape y_train, but we should ensure its length matches the reshaped X_train's
num_sequences = int(X_train_reshaped.shape[0])

# Check if y_train length matches the number of sequences
assert len(y_train) == num_sequences, "The length of y_train does not match the number of sequences in X_train."

# At this point, y_train is ready and properly aligned with X_train_reshaped for model training.

# Optionally, convert y_train to categorical if it represents classes
from tensorflow.keras.utils import to_categorical

# Determine the number of classes for y_train
num_classes = np.unique(y_train).size

# Convert labels to one-hot encoding
y_train_categorical = to_categorical(y_train, num_classes=num_classes)

# y_train_categorical is now ready to be used with your model.


2024-04-03 19:04:39.381975: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-03 19:04:39.382094: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-03 19:04:39.508099: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [28]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, GRU, Dense, Dropout

# Define the model
model = Sequential([
    # Add a Bidirectional GRU layer
    Bidirectional(GRU(64, return_sequences=True), input_shape=(100, X_train_reshaped.shape[2])),
    Dropout(0.2),  # Dropout for regularization
    Bidirectional(GRU(32)),
    Dropout(0.2),  # Another Dropout layer for regularization
    # Output layer, assuming `num_classes` is defined from y_train preparation
    Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()


  super().__init__(**kwargs)


In [32]:
X_train_reshaped = X_train_reshaped.astype(int)
history = model.fit(
    X_train_reshaped, 
    y_train_categorical, 
    epochs=10,  # Number of epochs: adjust based on your data and training progress
    batch_size=64,  # Batch size: adjust based on your data and available memory
    validation_split=0.2  # Use 20% of the data for validation
)


Epoch 1/10
[1m2010/2010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 19ms/step - accuracy: 0.4003 - loss: 1.8253 - val_accuracy: 0.4365 - val_loss: 1.7399
Epoch 2/10
[1m2010/2010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 18ms/step - accuracy: 0.4121 - loss: 1.7980 - val_accuracy: 0.4491 - val_loss: 1.7046
Epoch 3/10
[1m2010/2010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 18ms/step - accuracy: 0.4235 - loss: 1.7629 - val_accuracy: 0.4566 - val_loss: 1.6804
Epoch 4/10
[1m2010/2010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 18ms/step - accuracy: 0.4262 - loss: 1.7486 - val_accuracy: 0.4538 - val_loss: 1.6681
Epoch 5/10
[1m2010/2010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 18ms/step - accuracy: 0.4299 - loss: 1.7264 - val_accuracy: 0.4531 - val_loss: 1.6940
Epoch 6/10
[1m2010/2010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 18ms/step - accuracy: 0.4369 - loss: 1.7191 - val_accuracy: 0.4682 - val_loss: 1.6392
Epoc

In [37]:
import pandas as pd

# Path to your X_test CSV file
x_test_path = '/kaggle/input/cfmens/X_test_m4HAPAP.csv'

# Load X_test using Pandas
X_test = pd.read_csv(x_test_path)

# One-hot encode categorical variables in X_test just like X_train
X_test_encoded = pd.get_dummies(X_test, columns=['venue', 'action', 'side', 'trade']).drop(['obs_id'], axis=1)

# Ensure X_test_encoded has the same columns in the same order as X_train_encoded
# This step is crucial because the model expects the same feature set in training and prediction
# There might be missing columns if some categories are not present in the test data
missing_cols = set(X_train_encoded.columns) - set(X_test_encoded.columns)
for c in missing_cols:
    X_test_encoded[c] = 0  # Add missing columns as zeros

# Reorder X_test_encoded columns to match X_train_encoded
X_test_encoded = X_test_encoded[X_train_encoded.columns]

# Convert X_test_encoded to a numpy array and reshape for the model
X_test_np = X_test_encoded.to_numpy()

# Assuming each sequence is 100 observations long, reshape X_test
num_sequences_test = int(X_test_np.shape[0] / 100)
X_test_reshaped = X_test_np.reshape((num_sequences_test, 100, X_test_encoded.shape[1]))


In [39]:
X_test_reshaped = X_test_reshaped.astype(int)


In [40]:
X_test_reshaped

array([[[ 0,  0,  0, ...,  0,  0,  0],
        [ 1,  0,  0, ...,  0,  0,  0],
        [ 2,  1,  0, ...,  0,  0,  0],
        ...,
        [48,  0,  0, ...,  1,  0,  0],
        [51,  0,  0, ...,  0,  0,  0],
        [50,  0,  0, ...,  0,  0,  0]],

       [[ 0,  0,  0, ...,  0,  0,  0],
        [ 1,  0,  0, ...,  0,  0,  0],
        [ 2,  0,  0, ...,  1,  0,  0],
        ...,
        [24,  0,  0, ...,  0,  0,  0],
        [26,  0,  0, ...,  0,  0,  0],
        [68,  0,  0, ...,  0,  0,  0]],

       [[ 0,  0,  0, ...,  1,  0,  0],
        [ 1,  0,  0, ...,  1,  0,  0],
        [ 2,  0,  0, ...,  1,  0,  0],
        ...,
        [60,  0,  0, ...,  1,  0,  0],
        [61,  0,  0, ...,  1,  0,  0],
        [58,  0,  0, ...,  1,  0,  0]],

       ...,

       [[ 0,  0,  0, ...,  1,  0,  0],
        [ 1, -1,  0, ...,  1,  0,  0],
        [ 2, -1,  0, ...,  1,  0,  0],
        ...,
        [63,  0,  0, ...,  1,  0,  0],
        [63,  0,  0, ...,  1,  0,  0],
        [64,  0,  0, ...,  1,  0

In [41]:
# Make predictions
predictions = model.predict(X_test_reshaped)

# If your model outputs one-hot encoded predictions, convert these to label predictions
# Assuming your model does classification
predicted_labels = predictions.argmax(axis=1)


[1m2550/2550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 6ms/step


In [49]:
# Example: Save predictions to CSV for submission
submission_df = pd.DataFrame(predicted_labels, columns=['eqt_code_cat'])
submission_df.to_csv('submission.csv')


In [47]:
y_train_df

Unnamed: 0,obs_id,eqt_code_cat
0,0,10
1,1,15
2,2,0
3,3,13
4,4,0
...,...,...
160795,160795,13
160796,160796,1
160797,160797,3
160798,160798,11


In [50]:
X_train_encoded

Unnamed: 0,order_id,price,bid,ask,bid_size,ask_size,flux,venue_4,venue_1,venue_5,venue_2,venue_0,venue_3,action_A,action_D,action_U,side_A,side_B,trade_false,trade_true
0,0,0.30,0.00,0.01,100,1,100,True,False,False,False,False,False,True,False,False,True,False,True,False
1,1,-0.17,0.00,0.01,100,1,100,True,False,False,False,False,False,True,False,False,False,True,True,False
2,2,0.28,0.00,0.01,100,1,-100,True,False,False,False,False,False,False,True,False,True,False,True,False
3,3,0.30,0.00,0.01,100,1,100,True,False,False,False,False,False,True,False,False,True,False,True,False
4,4,0.37,0.00,0.01,100,1,-100,True,False,False,False,False,False,False,True,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16079995,61,1.32,0.01,0.06,735,261,-100,True,False,False,False,False,False,False,True,False,True,False,True,False
16079996,70,0.06,0.01,0.06,735,361,100,False,False,False,False,True,False,True,False,False,True,False,True,False
16079997,71,1.26,0.01,0.06,735,361,100,True,False,False,False,False,False,True,False,False,True,False,True,False
16079998,72,1.26,0.01,0.06,735,361,100,True,False,False,False,False,False,True,False,False,True,False,True,False


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical

 

# Encode Categorical Variables using get_dummies for X_train
 
# Standardize Features
scaler = StandardScaler()
X_flat = X_train_encoded.values.reshape(-1, X_train_encoded.shape[1])
X_standardized = scaler.fit_transform(X_flat)

# Reshape Data to match the neural network's expected input
num_sequences = int(X_flat.shape[0] / 100)
num_features = X_standardized.shape[1]
X_train_reshaped = X_standardized.reshape(num_sequences, 100, num_features)

# Convert y_train to categorical if it represents classes
num_classes = len(np.unique(y_train))
y_train_categorical = to_categorical(y_train, num_classes=num_classes)

# Advanced Feature Engineering (Statistical Features Example)
# Assuming a simple function to calculate a rolling mean as an example
def add_rolling_mean_feature(data, window_size=5):
    rolling_means = np.mean(data.reshape(-1, window_size, data.shape[-1]), axis=1)
    # Reshape to concatenate with the original data
    rolling_means_repeated = np.repeat(rolling_means, window_size, axis=0)
    return np.concatenate([data, rolling_means_repeated.reshape(data.shape)], axis=-1)

# Apply feature engineering
X_train_engineered = add_rolling_mean_feature(X_train_reshaped)

# Data Augmentation (Simple Noise Addition Example)
def augment_data(data, noise_level=0.01):
    noise = np.random.normal(loc=0.0, scale=noise_level, size=data.shape)
    return data + noise

X_train_augmented = augment_data(X_train_engineered)

# Assuming the model is already defined and compiled as `model`
history = model.fit(
    X_train_augmented, 
    y_train_categorical, 
    epochs=10, 
    batch_size=32, 
    validation_split=0.2
)
