In [38]:
# Report how many GPU devices TensorFlow can see on this machine.
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [39]:
resume_training = True  # True: reload the saved model and continue training; False: build a fresh model
nb_epoch = 960  # number of epochs passed to model.fit
model_1 = True  # True selects the first architecture; False selects the second one



model_2 = not model_1  # always the opposite of model_1, so exactly one architecture is active

## Exploring the Dataset 

In [40]:
PLOTS = False  # NOTE(review): not read by any visible cell — confirm it is still needed
trade_plots = False  # set to True to run the trade-percentage exploration cell

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [41]:
# Raw strings keep the Windows-style separators literal. The previous plain
# literals only worked because "\D", "\X" and "\y" are unrecognised escapes,
# which recent Python versions warn about.
x_train_path = r"..\Data\X_train.csv"
y_train_path = r"..\Data\y_train.csv"

In [42]:
# x_train = pd.read_csv(x_train_path)
# y_train = pd.read_csv(y_train_path)
# y_train.head()

Data = 504 days × 24 stocks × 20 observations/day × 100 events/observation  

Here is a description of each column in the dataset. <br>

| Column | Description |
| ------ | ------------ |
| **Obs_id** | which observation are we taking into account <br>-> for that observation we will keep track of the next 100 operations in the book orders |
| **Venue_id** | for a given stock, exchanges can happen across many venues :  this id tracks which venue we consider <br> ==> it could be of importance (some stocks are typically traded across many venues ?) |
| **order_id** | for a given observation sequence, each operation is related to an order. An order can be added, updated, deleted. <br>The order_id allows to track the lifecycle of individual orders within a sequence.   |
| **action** |  A (adding an order to the book) , D (Deleting an order from the book), U = updating an action from the book |
| **side** | B (bid: orders to buy the stock), A (ask: orders to sell the stock) |
| **Price** | Price of the order that was affected. The best bid price at the time of the first event is subtracted from all price-related columns (price, bid, ask). |
| **bid , ask** |- bid , ask == best bid (highest bid) /best ask (lowest ask)   |
| **bid_size, ask_size** |  volume of orders at the best bid, respectively ask, price, on the *aggregated book* <br> => this too could be valuable information: perhaps some stocks see more volume than others.  |
|**flux** | the change in volume at a specific price level in the order book due to a particular event |
|**Trade**|A boolean (true or false) indicating whether a deletion or update event was due to a trade or to a cancellation. <br> Most deletions and updates do not actually come from trades. |

### Example: For a given Observation

| `order_id` | `action` | `price` | `side` | **Description**                                          |
|------------|----------|---------|-------|----------------------------------------------------------|
| 0          | A        | 100.5   | B     | A new order (ID 0) is added at 100.5 on the bid side.    |
| 1          | A        | 101.0   | A     | A new order (ID 1) is added at 101.0 on the ask side.    |
| 0          | U        | 100.5   | B     | The order with ID 0 is updated (e.g., quantity changed). |
| 1          | D        | 101.0   | A     | The order with ID 1 is deleted (removed from the book).  |


# Explore trade info intuition 

In [43]:
# Exploratory plots: per observation, the share of delete/update ('D'/'U') events flagged as trades,
# then its distribution per stock (one box plot, then one histogram per stock).
# NOTE(review): `df` and `y_train` are not assigned in any visible active cell (the read_csv cells are
# commented out) — confirm they are loaded before switching `trade_plots` on.
if trade_plots:
        
    # Filter actions that are either 'D' or 'U'
    du_actions = df[(df['action'].isin(['D', 'U']))]

    # Count actions where 'trade' is True among 'D' or 'U'
    du_trades = du_actions[du_actions['trade'] == True]

    # Calculate the percentage
    percentage = (len(du_trades) / len(du_actions)) * 100

    # Display the result
    print(f"Percentage of 'D' or 'U' actions coming from trades: {percentage:.2f}%")

    # Merge the main DataFrame (df) with y_train using obs_id
    # (this rebinds `df`, so re-running the cell merges y_train a second time)
    df = df.merge(y_train, on='obs_id', how='left')  # Assuming y_train has columns ['obs_id', 'stock']

    # Define a function to calculate the percentage for each observation
    def calculate_percentage(sub_df):
        is_du = sub_df['action'].isin(['D', 'U'])
        is_du_trade = is_du & (sub_df['trade'] == True)
        # Observations without any D/U event get 0 instead of dividing by zero
        return (is_du_trade.sum() / is_du.sum()) * 100 if is_du.sum() > 0 else 0

    # Group by obs_id and calculate percentage
    df_obs = df.groupby('obs_id').apply(calculate_percentage).reset_index(name='percentage')

    # Attach the stock label ('eqt_code_cat') to each per-observation percentage
    df_obs = df_obs.merge(y_train, on='obs_id', how='left')

    # Group by stock and calculate statistics
    # NOTE(review): stock_stats is computed but not displayed or used afterwards
    stock_stats = df_obs.groupby('eqt_code_cat')['percentage'].agg(['mean', 'std', 'min', 'max'])

    plt.figure(figsize=(12, 8))
    sns.boxplot(x='eqt_code_cat', y='percentage', data=df_obs)
    plt.title('Distribution of Percentages by Stock')
    plt.xlabel('Stock')
    plt.ylabel('Percentage of D/U Actions from Trades')
    plt.xticks(rotation=90)
    plt.show()


        
    # Cap percentages at 6%
    # (rows above 6% are filtered out, not clipped)
    df_obs_capped = df_obs[df_obs['percentage'] <= 6]

    # Create subplots: One histogram per stock
    stocks = df_obs_capped['eqt_code_cat'].unique()  # Get unique stocks
    num_stocks = len(stocks)

    # Define the number of rows and columns for subplots
    ncols = 4  # Number of columns
    nrows = (num_stocks + ncols - 1) // ncols  # Calculate rows based on number of stocks

    # Create the figure
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, nrows * 3))
    axes = axes.flatten()  # Flatten axes for easier indexing

    # Plot each stock
    for i, stock in enumerate(stocks):
        # Filter the data for the current stock
        stock_data = df_obs_capped[df_obs_capped['eqt_code_cat'] == stock]
        
        # Plot the histogram for the stock
        sns.histplot(
            stock_data,
            x='percentage',
            bins=30,
            ax=axes[i],
            element='step',
            stat='percent'  # Show percentages instead of counts
        )
        axes[i].set_title(f'Stock {stock}', fontsize=12)
        axes[i].set_xlim(0, 6)  # Cap the percentage at 6
        axes[i].set_ylim(0, 100)  # Cap the y-axis at 100%
        axes[i].set_xlabel('Percentage of D/U Actions', fontsize=10)
        axes[i].set_ylabel('Percentage (%)', fontsize=10)

    # Remove unused subplots
    # (relies on `i` from the loop above, so `stocks` must not be empty)
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.show()



        



There seem to be three modes: 0% of trades, 2% of trades, 4% of trades.

# Some ideas after the initial exploration

For a given observation, what can help determine the stock ?    
We could use visualisation (for a given stock: average observed volatility, average number of order increases, average number of order decreases, and other simple metrics of that kind).  
  
To go more in depth : we must use embeddings of our data, think of interesting traits, use correlations, try and reduce the dimensionality.  
--> ideas seem endless we could train an embedding matrix to predict the venue idk 

Combien d'actions d'affilée ? volatilité du prix sur les 100 actions ? prix max et min enregistrés ? % de trade ? 

# Visualisations supplémentaires

volatilité : affichons la distribution de prix des variations de prix pour chacune des 24 actions

In [44]:
# x_train = pd.read_csv(x_train_path)
# y_train = pd.read_csv(y_train_path)
# y_train.head()

In [45]:
# def distrib_variations(data, stock ):
#     # for a given sequence, we keep : lowest , highest (price)

#     #une sequence est définie 

# Reproducing the Benchmark

The benchmark for the challenge is the following architecture :  

Preprocess:  
converting each event into a 30-dimensional vector.  
group each 100-event-observations into a single "observation" vector, size 100x30
  
Architecture:  
Bidirectional GRU network with 64 hidden units per direction, producing a single 128-dimensional vector per "observation vector".  
Many-to-one architecture: converts the "observation vector" (of 100 individual events) into a single embedding of size 128 (2 × 64).  
Then two dense layers: 128 -> 64 with SeLU activation, 64 -> 24 with softmax activation  
  
Training :  
Cross entropy Loss  
batch size : 1000 "observation vectors"  (dim : 1000x100x30)  
optimizer : Base ADAM with lr = 10e-3

In [46]:
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras import backend as K
from tensorflow.keras import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Dense, Activation, Embedding, Dropout, Input, LSTM, Reshape, Lambda, RepeatVector

## Pre processing of the data 


The exact preprocessing used by the benchmark isn't described, so I will do what seems relevant. 

Here is a description of each column in the dataset. <br>

| Column | Description |
| ------ | ------------ |
| **Obs_id** | which observation are we taking into account <br>-> for that observation we will keep track of the next 100 operations in the book orders |
| **Venue_id** | for a given stock, exchanges can happen across many venues :  this id tracks which venue we consider <br> ==> it could be of importance (some stocks are typically traded across many venues ?) |
| **order_id** | for a given observation sequence, each operation is related to an order. An order can be added, updated, deleted. <br>The order_id allows to track the lifecycle of individual orders within a sequence.   |
| **action** |  A (adding an order to the book) , D (Deleting an order from the book), U = updating an action from the book |
| **side** | B (bids, values to buy the action) , A (Ask, values to sell the action) 
| **Price** | Price of the order that was affected. The best bid price at the time of the first event is subtracted from all price-related columns (price, bid, ask). |
| **bid , ask** |- bid , ask == best bid (highest bid) /best ask (lowest ask)   |
| **bid_size, ask_size** |  volume of orders at the best bid, respectively ask, price, on the *aggregated book* <br> => this too could be valuable information: perhaps some stocks see more volume than others.  |
|**flux** | the change in volume at a specific price level in the order book due to a particular event |
|**Trade**|A boolean (true or false) indicating whether a deletion or update event was due to a trade or to a cancellation. <br> Most deletions and updates do not actually come from trades. |

In [47]:
#venue => one hot encode it 
#action => one hot encode it 
#side : => one hot encode it 
#price,bid,ask,bid_size,ask_size,flux : no transfo
#trade : one hot encode it 

#Justifications ? => none, just exploring 


## Preprocess data

In [48]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import h5py  # For saving large arrays in memory-efficient HDF5 format
from tqdm import tqdm 

In [49]:
def encode_df(df_to_encode):
    """One-hot encode the categorical event columns with a fixed baseline per column.

    The columns 'venue', 'action', 'side' and 'trade' are expanded into 0/1
    indicator columns. The baseline indicator of each column ('venue_0',
    'action_A', 'side_A', 'trade_False') is then removed by name.

    The previous implementation used ``drop_first=True``, which drops whichever
    category happens to be first *in the frame being encoded*. On a chunk that
    does not contain the baseline category (e.g. no venue 0), a real category
    was dropped instead, and its rows became indistinguishable from the baseline.

    Parameters:
        df_to_encode (pd.DataFrame): Frame containing the four categorical columns.

    Returns:
        pd.DataFrame: A new frame with the categorical columns replaced by
        integer indicator columns; other columns are left untouched.
    """
    categorical_columns = ['venue', 'action', 'side', 'trade']
    # Reference categories, matching the indicator columns expected downstream
    # (venue_1..venue_5, action_D, action_U, side_B, trade_True).
    baseline_columns = ['venue_0', 'action_A', 'side_A', 'trade_False']

    df_pandas_encoded = pd.get_dummies(df_to_encode, columns=categorical_columns, dtype=int)

    # errors='ignore': a baseline category may legitimately be absent from a chunk.
    return df_pandas_encoded.drop(columns=baseline_columns, errors='ignore')

In [91]:
def transform_df(df):
    """Return ``df`` without the identifier columns 'order_id' and 'obs_id'.

    Identifier columns that are not present are skipped silently; the input
    frame itself is not modified.
    """
    # obs_id goes too, because the benchmark drops it as well
    identifier_columns = ['order_id', 'obs_id']
    return df.drop(identifier_columns, axis=1, errors='ignore')

In [51]:
def correct_df(
    df, 
    column_names=[
        'price', 'bid', 'ask', 'bid_size', 'ask_size', 'flux', 
        'venue_1', 'venue_2', 'venue_3', 'venue_4', 'venue_5', 'action_D', 'action_U', 'side_B',
        'trade_True'
    ]
):
    """
    Return a frame with exactly the requested columns, in the requested order.

    Columns listed in ``column_names`` but absent from ``df`` are created and
    filled with zeros; columns of ``df`` that are not listed are discarded.
    Unlike the previous version, the caller's DataFrame is left untouched
    (no columns are added to it in place).

    Parameters:
        df (pd.DataFrame): The DataFrame to correct.
        column_names (list): Column names in the desired order (default provided;
            the default list is only read, never mutated).

    Returns:
        pd.DataFrame: A new DataFrame with columns ``column_names``.
    """
    # reindex both adds the missing columns (filled with 0) and reorders/selects
    # in a single step, without writing into `df`.
    return df.reindex(columns=list(column_names), fill_value=0)


In [52]:
# df_test = x_train.head(int(10e3))
# df_test = encode_df(df_test)
# df_test = transform_df(df_test)
# df_test


In [53]:
def create_lstm_data(data, k):
    '''
    Cut a flat table of events into consecutive, non-overlapping sequences.

    input:
        data - 2-D table of shape (n, p): a pandas DataFrame or a numpy array,
               where n is the number of rows and p the number of predictors.
               Rows i*k .. (i+1)*k - 1 form sequence i.
        k    - the length of each sequence (number of rows per sequence).
    output:
        X_data - float numpy array of shape (n // k, k, p). When n is not a
                 multiple of k, the trailing n % k rows are left out.
    '''
    # Fresh C-ordered float copy, so the result never aliases the input.
    values = np.array(data, dtype=float, order='C')

    n_sequences = values.shape[0] // k

    # A single reshape replaces the per-sequence Python loop: consecutive
    # blocks of k rows become the sequences.
    X_data = values[:n_sequences * k].reshape(n_sequences, k, values.shape[1])

    return X_data

In [54]:
#installing a library to handle out-of-memeory packages https://stackoverflow.com/questions/30376581/save-numpy-array-in-append-mode/64403144#64403144

In [55]:
# pip install npy-append-array

In [56]:
from npy_append_array import NpyAppendArray

In [57]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def process_data_chunked(x_train_path, y_train_path, output_prefix, chunk_size=10_000, seq_len=100):
    """
    Process the training events chunk by chunk and save the results incrementally.

    Each CSV chunk is encoded (encode_df), stripped of identifier columns
    (transform_df), aligned to the fixed column layout (correct_df), cut into
    sequences of ``seq_len`` rows (create_lstm_data) and appended to
    ``..\\Data\\<output_prefix>_X_train.npy``. The targets (y_train without
    'obs_id') are saved to ``..\\Data\\<output_prefix>_y_train.npy``.

    Args:
    - x_train_path: Path to the X_train CSV file.
    - y_train_path: Path to the y_train CSV file.
    - output_prefix: Prefix for the output files.
    - chunk_size: Number of rows to process in each chunk.
    - seq_len: Length of each sequence for LSTM.

    Raises:
    - ValueError: if ``seq_len`` is not positive or ``chunk_size`` is not a
      multiple of ``seq_len``. In that case every chunk would lose its trailing
      rows and the following chunk would start in the middle of a sequence,
      silently misaligning the sequences with the targets.
    """
    if seq_len <= 0 or chunk_size % seq_len != 0:
        raise ValueError(
            f"chunk_size ({chunk_size}) must be a positive multiple of seq_len ({seq_len})"
        )

    # Read y_train (the target file) entirely as it's small and doesn't need chunking
    y_train_full = pd.read_csv(y_train_path).drop('obs_id', axis=1)

    # Count the rows once so tqdm can show total progress; the context manager
    # closes the handle (the previous bare open() left it to the garbage collector).
    with open(x_train_path) as x_file:
        total_rows = sum(1 for _ in x_file) - 1  # Get total rows excluding header
    num_chunks = (total_rows + chunk_size - 1) // chunk_size  # Calculate total chunks

    # Raw f-strings: same resulting paths as before, without relying on the
    # invalid escape sequences "\D" and "\{".
    X_train_npy_name = fr"..\Data\{output_prefix}_X_train.npy"
    y_train_npy_name = fr"..\Data\{output_prefix}_y_train.npy"

    # Process the X_train file in chunks
    with NpyAppendArray(X_train_npy_name, delete_if_exists=True) as npaa:
        chunk_reader = pd.read_csv(x_train_path, chunksize=chunk_size)
        for chunk in tqdm(chunk_reader, desc="Processing Chunks", total=num_chunks):
            # Apply the transformation functions
            chunk = correct_df(transform_df(encode_df(chunk)))

            # Create LSTM-compatible data for this chunk
            X_data = create_lstm_data(chunk, seq_len)

            npaa.append(X_data)

            # Clear memory for the current chunk
            del X_data, chunk

    np.save(y_train_npy_name,y_train_full)
    print(f"Processing completed. X_train and y_train saved as {output_prefix}_X_train.npy and {output_prefix}_y_train.npy.")


In [58]:
# date = '04-12'
# process_data_chunked(x_train_path, y_train_path, date, chunk_size=10_000, seq_len=100)


# Code

In [59]:
# date = '05-12'
# x_path_npy = fr'..\Data\{date}_X_train_LSTM.npy'
# y_path_npy = fr'..\Data\{date}_y_train.npy'
# # Load the .npy file
# data = np.load(x_path_npy)

# # Display the data (e.g., shape, a sample of the contents)
# print("Data Shape:", data.shape)

In [60]:
# distribs_vol_2 = [[] for k in range(24)]

# for n in range(y_train.shape[0]):
#     df_n = df.iloc[n*100:(n+1)*100,:]
#     stock = df_n.iloc[0,:]['eqt_code_cat']
#     max = df_n['price'].max()
#     min = df_n['price'].min()

#     distribs_vol_2[stock].append(max+min)
    

    
# for i,stock in enumerate(distribs_vol_2):
#     print(f'stock {i}, volatilité moyenne: {np.mean(stock)}')

In [61]:
# import matplotlib.pyplot as plt
# import numpy as np
# # Define the range and bin width

# bin_width = 0.5
# start = -300
# end = 300

# # Create bins
# bins = np.arange(start, end + bin_width, bin_width)

# def plot_volatilites(distribs):
#     """
#     Affiche les distributions de volatilité sous forme d'histogrammes.

#     Parameters:
#     - distribs : list of arrays
#         Chaque élément de la liste représente les volatilités pour un stock.
#     """
#     num_stocks = len(distribs)
#     plt.figure(figsize=(5, 5 * num_stocks))  # Ajustez la taille pour des visualisations claires
    
#     for i, stock_vol in enumerate(distribs):
#         plt.subplot(num_stocks, 1, i + 1)  # Crée un subplot pour chaque stock
#         plt.hist(stock_vol, bins=bins, edgecolor='black')  # Histogramme
#         plt.title(f'Distribution des volatilités pour le stock {i}')
#         plt.xlabel('Volatilité')
#         plt.ylabel('Fréquence')
#         plt.ylim(top=1000)
#         plt.xlim([-300,300])
    
#     plt.tight_layout()  # Ajuste les espacements entre les subplots
#     plt.show()

# # Exemple d'utilisation
# # distribs_vol est supposé être une liste d'arrays de volatilités
# plot_volatilites(distribs_vol_2)


In [62]:
# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import GaussianNB

# date = '05-12'

# x_path_npy = fr'..\Data\{date}_X_train_add.npy'
# y_path_npy = fr'..\Data\{date}_y_train.npy'

# X_full = np.load(x_path_npy)#, mmap_mode="r")
# y_full = np.load(y_path_npy).ravel()#,mmap_mode="r")

# X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.2, random_state=0)
# gnb = GaussianNB()
# y_pred = gnb.fit(X_train, y_train).predict(X_test)
# print("Number of mislabeled points out of a total %d points : %d"
#       % (X_test.shape[0], (y_test != y_pred).sum()))

In [63]:
# y_path = r"D:\Desktop\Coding-Projects\Prediction-Challenge\Notebooks\27-11_y_train.npy"
# # Load the .npy file
# y = np.load(y_path)

# # Display the data (e.g., shape, a sample of the contents)
# print("Data Shape:", y.shape)


## Data generator

In [64]:
## for each sequence, We want to generate some value that seem interesting regarding the full sequence, such features could be generated by the LSTM but the search space is so big that we implement them by hannd

# def genrate_additional_features(sequence: np.array)->np.array :
#     '''
#     given a sequence of 100x19 features "sequence" 
#     returns a np array "features" with relevant features '''

In [65]:
import numpy as np
import keras
import gc 

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs,x_path_npy,y_path_npy, batch_size=10050, dim=(100,15),
                 n_classes=24, shuffle=True):
        """Initialization

        list_IDs    - indices (first axis of the .npy arrays) this generator may serve
        x_path_npy  - path of the .npy file holding the input sequences
        y_path_npy  - path of the .npy file holding the targets
        batch_size  - number of samples per batch
        dim         - shape of a single sample
        n_classes   - number of classes for the one-hot targets
        shuffle     - reshuffle the sample order at the end of every epoch
        """
        # Let the Keras base class set up its own state (recent Keras versions
        # warn when a Sequence subclass skips this call).
        super().__init__()
        self.dim = dim
        self.batch_size = batch_size
        self.list_IDs = list_IDs
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.x_path = x_path_npy
        self.y_path = y_path_npy
        # Build the first epoch's ordering once every attribute is in place.
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        # Floor division: a trailing partial batch is never served.
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data for the given IDs' # X : (n_samples, *dim)
        # Size the buffers by the IDs actually requested, so a short batch can
        # never expose uninitialised rows of np.empty.
        n_samples = len(list_IDs_temp)
        X = np.empty((n_samples, *self.dim))
        y = np.empty(n_samples, dtype=int)

        # Memory-map both files so only the requested samples are read from disk
        X_full = np.load(self.x_path, mmap_mode="r")
        y_full = np.load(self.y_path,mmap_mode="r")

        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            X[i,] = X_full[ID]

            # Store class (first entry of the target row)
            y[i] = y_full[ID].astype(int)[0]

        # Drop the memory maps right away to release the file handles
        del X_full
        del y_full
        gc.collect()

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

In [66]:
# # Datasets
# full_ids = np.arange(160800)

# # Shuffle the IDs to ensure randomness
# np.random.shuffle(full_ids)

# # Compute the split index for 80/20
# split_index = int(len(full_ids) * 13/16) #13/16 test, 3/16 val

# # Split the IDs
# train_ids = full_ids[:split_index]
# val_ids = full_ids[split_index:]



# # Define file paths
# train_ids_path = "train_ids.txt"
# val_ids_path = "val_ids.txt"

# # Save the IDs to text files
# with open(train_ids_path, "w") as train_file:
#     for id in train_ids:
#         train_file.write(f"{id}\n")

# with open(val_ids_path, "w") as val_file:
#     for id in val_ids:
#         val_file.write(f"{id}\n")

# print("Train and validation IDs saved.")


In [67]:
# Parameters
# Settings shared by the training and validation generators
params = dict(dim=(100, 15), batch_size=10050, n_classes=24, shuffle=True)

# Text files holding one observation index per line
train_ids_path = "train_ids.txt"
val_ids_path = "val_ids.txt"

# Parse every line of each file as an integer ID
with open(train_ids_path, "r") as train_file:
    train_ids = list(map(int, map(str.strip, train_file)))

with open(val_ids_path, "r") as val_file:
    val_ids = list(map(int, map(str.strip, val_file)))

print("Train and validation IDs loaded.")

# Preprocessed arrays produced for this date prefix
date = '05-12'
x_path_npy = fr'..\Data\{date}_X_train_LSTM.npy'
y_path_npy = fr'..\Data\{date}_y_train.npy'

# One generator per split, both reading from the same pair of .npy files
training_generator = DataGenerator(train_ids, x_path_npy, y_path_npy, **params)
val_generator = DataGenerator(val_ids, x_path_npy, y_path_npy, **params)


Train and validation IDs loaded.


## Model

Now we have the data in a satisfactory format.  
Each row of our X_train is made of 100 events, and each of these events is represented in a 15-dimensional space.  
and for each row of our train set, we have a single target value : in y  
Let's now create a similar architecture as the benchmark  

In [68]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Bidirectional,LSTM
from tensorflow.keras.models import load_model
from keras.layers import Input, Bidirectional, LSTM, Dense, Dropout, BatchNormalization
from keras.regularizers import l2
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

In [69]:

# Build and compile a fresh network; skipped when training resumes from a saved model.
if not resume_training:

    if model_1:


        # Input: fixed-length sequences of 100 events, 15 features per event
        inputs = keras.Input(shape=(100,15))

        # One bidirectional LSTM, followed by a dense layer with no activation
        x = Bidirectional(LSTM(64))(inputs)
        x = Dense(64)(x)

        # Softmax classifier over the 24 classes
        outputs = Dense(24, activation="softmax")(x)
        model = keras.Model(inputs, outputs)
        model.summary()

        
    
    if model_2:
        

        # Regularised variant: two stacked bidirectional LSTMs with dropout,
        # then an L2-regularised ReLU dense layer, batch normalisation and dropout.
        inputs = Input(shape=(100, 15))
        x = Bidirectional(LSTM(64, return_sequences=True, dropout=0.3, recurrent_dropout=0.3))(inputs)
        x = Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3))(x)
        x = Dense(64, activation="relu", kernel_regularizer=l2(0.01))(x)
        x = BatchNormalization()(x)
        x = Dropout(0.4)(x)
        outputs = Dense(24, activation="softmax")(x)

        model = keras.Model(inputs, outputs)
        model.summary()




    # Both variants share the same optimizer, loss and metric
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=3e-3), loss="categorical_crossentropy", metrics=["accuracy"])


### Callbacks

In [70]:
# Import necessary libraries for callbacks
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard, CSVLogger
from keras.callbacks import ModelCheckpoint, Callback


# File names used for the active architecture.
# model_1 takes precedence when both flags are set, as in the original pair of ifs.
if model_1:
    model_path, csv_file, mdl = 'best_model.h5', 'training_log.csv', 'model1'
elif model_2:
    model_path, csv_file, mdl = 'best_model2.h5', 'training_log_model2.csv', 'model2'

class SaveEveryNEpoch(Callback):
    """Keras callback that saves the full model every ``n_epochs`` epochs.

    Files are written as ``<save_path>_epoch_<epoch number>.h5``, where the
    epoch number is 1-based.
    """

    def __init__(self, save_path, n_epochs):
        """
        save_path - file name prefix for the saved models
        n_epochs  - save interval in epochs; must be a positive integer

        Raises ValueError when n_epochs is not positive (the modulo test in
        on_epoch_end would otherwise fail in the middle of training).
        """
        # Initialise the Keras base class before adding our own attributes
        super().__init__()
        if n_epochs <= 0:
            raise ValueError(f"n_epochs must be a positive integer, got {n_epochs}")
        self.save_path = save_path
        self.n_epochs = n_epochs

    def on_epoch_end(self, epoch, logs=None):
        """Save the model when the 1-based epoch number is a multiple of n_epochs."""
        # `epoch` is 0-based, hence the + 1
        if (epoch + 1) % self.n_epochs == 0:
            self.model.save(f"{self.save_path}_epoch_{epoch + 1}.h5")
            print(f"Model saved at epoch {epoch + 1}")


# Periodic snapshot: files are named "<mdl>_checkpoint_epoch_<N>.h5", one every 100 epochs
save_every_100_epochs = SaveEveryNEpoch(save_path=mdl+"_checkpoint", n_epochs=100)

# Define callbacks to enhance and monitor the training process
callbacks = [
    # 1. ModelCheckpoint:
    # Saves the model to `model_path` whenever the validation loss ('val_loss') improves.
    # Ensures that only the best version of the model (with the lowest validation loss) is saved.
    # NOTE(review): when training is resumed, the callback presumably starts without a previous
    # best value, so the first epoch may overwrite the stored best model — confirm.
    ModelCheckpoint(
        filepath=model_path,   # Filepath to save the model
        monitor='val_loss',        # Metric to monitor
        save_best_only=True,       # Save only the best model
        mode='min'                 # Minimize the 'val_loss'
    ),
    
    # 2. EarlyStopping:
    # Stops training if the validation loss does not improve for 'patience' epochs (500 in this case).
    # Prevents overfitting and saves time by stopping early when progress stalls.
    EarlyStopping(
        monitor='val_loss',        # Metric to monitor
        patience=500,                # Number of epochs to wait without improvement
        mode='min',                # Minimize the 'val_loss'
        restore_best_weights=True  # Restore the model weights from the best epoch
    ),
    
    # 3. ReduceLROnPlateau:
    # Reduces the learning rate when the validation loss plateaus (does not improve for 5 epochs here).
    # Helps the model converge better by lowering the learning rate when progress slows down.
    ReduceLROnPlateau(
        monitor='val_loss',        # Metric to monitor
        factor=0.05,                # Factor by which to reduce the learning rate (new_lr = lr * 0.05)
        patience=5,                # Number of epochs to wait before reducing the learning rate
        min_lr=1e-6                # Minimum learning rate to avoid reducing it too much
    ),
    
    # 4. TensorBoard:
    # Logs training metrics, such as loss and accuracy, for visualization using TensorBoard.
    # Also logs histograms and the computational graph of the model.
    TensorBoard(
        log_dir='./logs',          # Directory to save TensorBoard logs
        histogram_freq=1,          # Log histograms of weights after every epoch
        write_graph=True,          # Save the computation graph
        write_images=True          # Save visualizations of weights and biases
    ),
    
    # 5. CSVLogger:
    # Logs training and validation metrics to the CSV file named by `csv_file`.
    # Useful for tracking metrics over time and for external analysis.
    CSVLogger(
        filename=csv_file,  # Path to save the log file
        append=True                  # Append to existing file if it exists
    ),

    # Save every 100 epochs
    save_every_100_epochs
]

# TRAINING

In [71]:
# When resuming, reload the saved model and recompile it with a new Adam optimizer (lr = 1e-3)
if resume_training:
    model = load_model(model_path)
    model.compile(
        loss="categorical_crossentropy",
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        metrics=["accuracy"],
    )

# Fit on the generators; the History object returned by fit is the cell's output
model.fit(
    training_generator,
    epochs=nb_epoch,  # number of epochs set in the configuration cell
    validation_data=val_generator,
    callbacks=callbacks,
)

Epoch 1/960
Epoch 2/960
Epoch 3/960
Epoch 4/960
Epoch 5/960
Epoch 6/960
Epoch 7/960
Epoch 8/960
Epoch 9/960
Epoch 10/960
Epoch 11/960
Epoch 12/960
Epoch 13/960
Epoch 14/960
Epoch 15/960
Epoch 16/960
Epoch 17/960
Epoch 18/960
Epoch 19/960
Epoch 20/960
Epoch 21/960
Epoch 22/960
Epoch 23/960
Epoch 24/960
Epoch 25/960
Epoch 26/960
Epoch 27/960
Epoch 28/960
Epoch 29/960
Epoch 30/960
Epoch 31/960
Epoch 32/960
Epoch 33/960
Epoch 34/960
Epoch 35/960
Epoch 36/960
Epoch 37/960
Epoch 38/960
Epoch 39/960
Epoch 40/960
Epoch 41/960
Epoch 42/960
Epoch 43/960
Epoch 44/960
Epoch 45/960
Epoch 46/960
Epoch 47/960
Epoch 48/960
Epoch 49/960
Epoch 50/960
Epoch 51/960
Epoch 52/960
Epoch 53/960
Epoch 54/960
Epoch 55/960
Epoch 56/960
Epoch 57/960
Epoch 58/960
Epoch 59/960
Epoch 60/960
Epoch 61/960
Epoch 62/960
Epoch 63/960
Epoch 64/960
Epoch 65/960
Epoch 66/960
Epoch 67/960
Epoch 68/960
Epoch 69/960
Epoch 70/960
Epoch 71/960
Epoch 72/960
Epoch 73/960
Epoch 74/960
Epoch 75/960
Epoch 76/960
Epoch 77/960
Epoch 78

<keras.callbacks.History at 0x253c9ee5940>

## Prediction with our trained model 

In [110]:
# Read the labels through the project-relative path defined with the other data
# paths, instead of a user-specific absolute path that exists on one machine only.
res = pd.read_csv(y_train_path)

In [116]:
# Summary statistics of the label table loaded into `res`
res.describe()

Unnamed: 0,obs_id,eqt_code_cat
count,160800.0,160800.0
mean,80399.5,11.5
std,46419.10598,6.922208
min,0.0,0.0
25%,40199.75,5.75
50%,80399.5,11.5
75%,120599.25,17.25
max,160799.0,23.0


In [120]:
from random import randint

# Spot-check the model on 10 randomly drawn observations (indices 0..20000).
# `sample_ids` replaces the old name `sequence`, which overwrote the `sequence`
# module imported from tensorflow.keras.preprocessing.
sample_ids = [randint(0, 20000) for _ in range(10)]

# Open both arrays once, memory-mapped, instead of re-opening them for every sample
X_all = np.load(x_path_npy, mmap_mode='r')
y_all = np.load(y_path_npy, mmap_mode='r')

for i in sample_ids:
    # One stored sequence with a leading batch axis, as model.predict expects a batch
    sample = np.expand_dims(X_all[i], axis=0)
    label = y_all[i]

    predicted = model.predict(sample)

    # `predicted_label` replaces the old name `res`, which overwrote the label
    # DataFrame loaded in an earlier cell.
    predicted_label = predicted[0].argmax()
    print(predicted_label)

    # Indices of the three highest probabilities, most likely first
    top_3_indices = predicted[0].argsort()[-3:][::-1]

    # Get the top 3 values
    top_3_values = predicted[0][top_3_indices]

    print("Top 3 predicted labels :", top_3_indices)
    print("with following Top 3 values:", top_3_values)

    print("True Label", label[0])

20
Top 3 predicted labels : [20 21  3]
with following Top 3 values: [0.48542622 0.19780622 0.12540379]
True Label 21
0
Top 3 predicted labels : [ 0 16  9]
with following Top 3 values: [0.59157586 0.17859574 0.1512707 ]
True Label 0
20
Top 3 predicted labels : [20 21  3]
with following Top 3 values: [0.51727337 0.20434642 0.11373307]
True Label 20
6
Top 3 predicted labels : [ 6  1 14]
with following Top 3 values: [0.48318508 0.25440148 0.24987142]
True Label 14
20
Top 3 predicted labels : [20 21  3]
with following Top 3 values: [0.34941864 0.23039033 0.16028647]
True Label 21
5
Top 3 predicted labels : [ 5 15  7]
with following Top 3 values: [0.41474846 0.27974904 0.10711259]
True Label 15
21
Top 3 predicted labels : [21 22 20]
with following Top 3 values: [0.30986473 0.25158417 0.14179005]
True Label 20
8
Top 3 predicted labels : [ 8  2 23]
with following Top 3 values: [0.44535515 0.19047794 0.11856452]
True Label 14
17
Top 3 predicted labels : [17  9  0]
with following Top 3 values: [

# Create a prediction submission

In [79]:
# Path of the raw test events CSV, relative to the notebook's working directory
X_test_path = "../Data/X_test.csv"

In [None]:
# Load the whole test CSV into memory (used here for inspection: head, shape)
X_test = pd.read_csv(X_test_path)

In [76]:
# Preview the first rows of the test events
X_test.head()

Unnamed: 0,obs_id,venue,order_id,action,side,price,bid,ask,bid_size,ask_size,trade,flux
0,0,4,0,A,A,0.15,0.0,0.15,511,100,False,100
1,0,2,1,D,A,0.16,0.0,0.15,511,100,False,-100
2,0,4,2,D,A,1.63,0.0,0.15,511,100,False,-100
3,0,4,3,A,A,1.62,0.0,0.15,511,100,False,100
4,0,2,4,A,A,0.15,0.0,0.15,511,200,False,100


In [93]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from npy_append_array import NpyAppendArray  # Ensure this is correctly installed
import logging

def process_test_data_chunked(x_test_path, output_prefix, chunk_size=10_000, seq_len=100):
    """
    Process the test events chunk by chunk and save the results incrementally.

    Each CSV chunk is encoded (encode_df), stripped of identifier columns
    (transform_df), aligned to the fixed column layout (correct_df), cut into
    sequences of ``seq_len`` rows (create_lstm_data) and appended to
    ``../Data/<output_prefix>_X_test.npy``. Progress and errors are logged to
    ``process_log.log``.

    Args:
    - x_test_path: Path to the X_test CSV file.
    - output_prefix: Prefix for the output files.
    - chunk_size: Number of rows to process in each chunk.
    - seq_len: Length of each sequence for LSTM.

    Raises:
    - ValueError: if ``seq_len`` is not positive or ``chunk_size`` is not a
      multiple of ``seq_len`` (sequences would straddle chunk boundaries).
    - Exception: any error raised while processing a chunk is logged and
      re-raised. Skipping a failed chunk, as was done before, shifts the
      position of every later sequence in the output file, so a row index
      no longer identifies the observation it came from.
    """
    if seq_len <= 0 or chunk_size % seq_len != 0:
        raise ValueError(
            f"chunk_size ({chunk_size}) must be a positive multiple of seq_len ({seq_len})"
        )

    # Configure logging
    logging.basicConfig(filename="process_log.log", level=logging.INFO, format="%(asctime)s - %(message)s")
    logging.info("Starting the chunked processing.")

    # Determine total rows and chunks; the context manager closes the handle
    with open(x_test_path) as x_file:
        total_rows = sum(1 for _ in x_file) - 1  # Get total rows excluding header
    num_chunks = (total_rows + chunk_size - 1) // chunk_size  # Calculate total chunks

    # Output file
    X_test_npy_name = f"../Data/{output_prefix}_X_test.npy"

    try:
        # Process in chunks
        with NpyAppendArray(X_test_npy_name, delete_if_exists=True) as npaa:
            chunk_reader = pd.read_csv(x_test_path, chunksize=chunk_size)
            for i, chunk in enumerate(tqdm(chunk_reader, desc="Processing Chunks", total=num_chunks)):
                try:
                    logging.info(f"Processing chunk {i + 1} of {num_chunks}.")

                    # Apply the transformation functions
                    chunk = correct_df(transform_df(encode_df(chunk)))

                    # Create LSTM-compatible data for this chunk
                    X_data = create_lstm_data(chunk, seq_len)

                    # Append to the .npy file
                    npaa.append(X_data)

                    # Clear memory
                    del X_data, chunk

                except Exception as e:
                    # Record which chunk failed, then abort: a gap in the output
                    # would misalign every following observation.
                    logging.error(f"Error in chunk {i + 1}: {e}")
                    raise

        logging.info(f"Processing completed. Saved as {output_prefix}_X_test.npy")

    except Exception as e:
        logging.error(f"Critical error during processing: {e}")
        raise


In [80]:
# Count the data rows of the test CSV (header excluded); the context manager
# closes the file handle that the previous bare open() left dangling.
with open(X_test_path) as _test_file:
    total_rows = sum(1 for _ in _test_file) - 1
total_rows

8160000

In [82]:
# (rows, columns) of the raw test table
X_test.shape

(8160000, 12)

In [94]:
# Preprocess the test CSV into ../Data/09-12_X_test.npy
process_test_data_chunked(X_test_path,"09-12")

Processing Chunks: 100%|██████████| 816/816 [00:40<00:00, 20.37it/s]


In [98]:
npy_path = fr"..\Data\09-12_X_test.npy"
A = np.load(npy_path,mmap_mode='r') # memory-mapped array holding all preprocessed test sequences (not a single 100x15 sequence)
A = np.expand_dims(A, axis=0)  # Adds a leading axis of size 1 in front of the observation axis
# NOTE(review): because of the expand_dims above, len(A) is 1 here, not the number of observations — confirm this is intended.
N = len(A)


In [109]:
# Length of the first (and only) entry along the leading axis added by expand_dims
len(A[0])

81600

1

In [1]:
def create_submission(npy_path, output_path='../Data/submission-09-12.csv', batch_size=1024):
    """Predict a class for every sequence stored in ``npy_path`` and write a submission CSV.

    The array is memory-mapped and fed to the global ``model`` in batches. The
    predicted class of a sequence is the argmax of the model output, and its
    ``obs_id`` is its position along the first axis of the array.

    Fixes over the previous version:
    - the CSV is written with ``DataFrame.to_csv`` (``pd.save_csv`` does not exist);
    - sequences are predicted in batches instead of one ``model.predict`` call
      per sequence.

    Args:
        npy_path: path of the .npy file holding the preprocessed sequences.
        output_path: destination CSV (defaults to the original hard-coded path).
        batch_size: number of sequences sent to ``model.predict`` per call.

    Returns:
        pd.DataFrame: the submission table with columns 'obs_id' and 'eqt_code_cat'.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    A = np.load(npy_path, mmap_mode='r')  # only the slices requested below are read from disk
    N = len(A)
    predicted_labels = np.empty(N, dtype=int)

    for start in range(0, N, batch_size):
        stop = min(start + batch_size, N)
        # np.asarray materialises just this slice of the memory map
        probabilities = model.predict(np.asarray(A[start:stop]), verbose=0)
        predicted_labels[start:stop] = np.argmax(probabilities, axis=1)

    df_res = pd.DataFrame({'obs_id': np.arange(N), 'eqt_code_cat': predicted_labels})

    df_res.to_csv(output_path, index=False)

    return df_res

    


In [2]:
# NOTE(review): needs `npy_path` (and `model`) from earlier cells; the recorded NameError shows they were not defined when this cell ran.
create_submission(npy_path)

NameError: name 'npy_path' is not defined