## Load Packages

In [6]:
import pandas as pd
import os
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## Data Processing

In [2]:
def read_partitioned_parquet(base_dir, num_partitions, file_name='part-0.parquet'):
    """
    Load one Parquet file per ``partition_id=<i>`` folder and stack them row-wise.

    Parameters:
    base_dir (str): Directory that holds the ``partition_id=<i>`` sub-folders
    num_partitions (int): How many partitions to try, numbered 0 .. num_partitions-1
    file_name (str): Parquet file expected inside every partition folder

    Returns:
    pandas.DataFrame or None: Every readable partition concatenated under a
    fresh 0..n-1 index, or None when not a single partition could be read
    """
    frames = []

    for part_idx in range(num_partitions):
        # <base_dir>/partition_id=<i>/<file_name>
        parquet_path = os.path.join(base_dir, f"partition_id={part_idx}", file_name)

        try:
            frames.append(pd.read_parquet(parquet_path))
            print(f"Successfully read partition id={part_idx}")
        except FileNotFoundError:
            # A missing partition is reported and skipped, not fatal
            print(f"Warning: File not found in partition id={part_idx} at {parquet_path}")
        except Exception as err:
            # Any other read failure is likewise reported and skipped
            print(f"Error reading partition id={part_idx}: {str(err)}")

    if len(frames) == 0:
        print("No files were successfully read")
        return None

    # Stack the partitions and renumber the rows
    combined_df = pd.concat(frames, ignore_index=True)

    # Short report of what was loaded
    n_rows = len(combined_df)
    n_cols = len(combined_df.columns)
    print("\nSummary:")
    print(f"Total number of partitions read: {len(frames)}")
    print(f"Total number of rows: {n_rows}")
    print(f"Number of columns: {n_cols}")

    return combined_df

# Example usage

# Directory that contains the partition_id=<i> folders. Set the
# JANE_STREET_TRAIN_DIR environment variable to point at your own copy of the
# data; the fallback is the original machine-specific location.
base_directory = os.environ.get(
    "JANE_STREET_TRAIN_DIR",
    "C:/Users/oybw/Desktop/Jane Street/jane-street-real-time-market-data-forecasting/train.parquet",
)
num_partitions = 1

# Read all partitions
df = read_partitioned_parquet(
    base_dir=base_directory,
    num_partitions=num_partitions
)

# Every later cell dereferences df, so stop here with a clear message instead
# of failing further down with an AttributeError on None.
if df is None:
    raise FileNotFoundError(
        f"No partitions could be read from {base_directory!r}; "
        "set JANE_STREET_TRAIN_DIR to the folder holding partition_id=<i>/"
    )

# Show first few rows
print("\nFirst few rows of combined data:")
print(df.head())
        


Successfully read partition id=0

Summary:
Total number of partitions read: 1
Total number of rows: 1944210
Number of columns: 92

First few rows of combined data:
   date_id  time_id  symbol_id    weight  feature_00  feature_01  feature_02  \
0        0        0          1  3.889038         NaN         NaN         NaN   
1        0        0          7  1.370613         NaN         NaN         NaN   
2        0        0          9  2.285698         NaN         NaN         NaN   
3        0        0         10  0.690606         NaN         NaN         NaN   
4        0        0         14  0.440570         NaN         NaN         NaN   

   feature_03  feature_04  feature_05  ...  feature_78  responder_0  \
0         NaN         NaN    0.851033  ...   -0.281498     0.738489   
1         NaN         NaN    0.676961  ...   -0.302441     2.965889   
2         NaN         NaN    1.056285  ...   -0.096792    -0.864488   
3         NaN         NaN    1.139366  ...   -0.296244     0.408499   


## Check NaN Rows

In [7]:
# Build the NaN mask once and derive both statistics from it
nan_flags = df.isna()

# How many rows have at least one missing value
rows_with_nan = nan_flags.any(axis=1).sum()
print(f"Number of rows containing NaN: {rows_with_nan}")

# Missing values broken down by column
print("\nNaN count per column:")
print(nan_flags.sum())

# The mask is as large as the frame itself; release it once reported
del nan_flags

Number of rows containing NaN: 1944210

NaN count per column:
date_id              0
time_id              0
symbol_id            0
weight               0
feature_00     1944210
                ...   
responder_4          0
responder_5          0
responder_6          0
responder_7          0
responder_8          0
Length: 92, dtype: int64


## Delete NaN Columns

In [4]:
def drop_nan_columns(df, threshold=1.0):
    """
    Keep only the columns whose fraction of NaN values is strictly below ``threshold``.

    Parameters:
    df (pandas.DataFrame): Input DataFrame
    threshold (float): NaN fraction (between 0 and 1). A column is dropped when
                      its NaN fraction is greater than or equal to this value.
                      With the default of 1.0 only columns that are entirely
                      NaN are dropped; columns that merely contain some NaN
                      values are kept.

    Returns:
    pandas.DataFrame: DataFrame restricted to the retained columns, in their
    original order. Rows are untouched, so retained columns may still hold NaN.
    """
    # Fraction (0..1) of missing values in each column
    nan_fraction = df.isna().mean()

    # Strict comparison: a column sitting exactly on the threshold is dropped
    cols_to_keep = nan_fraction.index[nan_fraction < threshold]

    return df[cols_to_keep]

# Discard every column whose NaN fraction reaches 0.5 (the comparison is strict,
# so a column that is exactly half NaN is dropped as well), then show the result
clean_df = drop_nan_columns(df=df, threshold=0.5)
print(clean_df)

         date_id  time_id  symbol_id    weight  feature_05  feature_06  \
0              0        0          1  3.889038    0.851033    0.242971   
1              0        0          7  1.370613    0.676961    0.151984   
2              0        0          9  2.285698    1.056285    0.187227   
3              0        0         10  0.690606    1.139366    0.273328   
4              0        0         14  0.440570    0.955200    0.262404   
...          ...      ...        ...       ...         ...         ...   
1944205      169      848         19  3.438631   -0.028087    0.287438   
1944206      169      848         30  0.768528   -0.022584    0.442352   
1944207      169      848         33  1.354696   -0.024804    0.420692   
1944208      169      848         34  1.021797   -0.016138    0.303561   
1944209      169      848         38  1.570022   -0.017634    0.271368   

         feature_07  feature_08  feature_09  feature_10  ...  feature_78  \
0          0.263400   -0.891687    

## Training Data Preparation

In [9]:
def prepare_data(df, sequence_length=10):
    """Build sliding-window LSTM inputs and next-row targets from ``df``.

    The inputs are the three id columns plus every column whose name contains
    ``feature_``; the target is ``responder_6``. Inputs are standardised with a
    ``StandardScaler`` fitted on all rows of ``df``. Values that are still NaN
    after scaling are replaced by 0.0 (the column mean in standardised space)
    so that missing data cannot turn the loss into NaN.

    Window ``i`` covers rows ``i .. i + sequence_length - 1`` and is paired
    with the target of row ``i + sequence_length``.

    NOTE(review): windows are taken over consecutive rows of ``df`` as-is,
    without grouping by ``symbol_id`` - confirm the row order makes each
    window a meaningful sequence.
    NOTE(review): the scaler sees every row passed in; if a train/test split
    happens afterwards, test statistics leak into the scaling.

    Parameters:
    df (pandas.DataFrame): Frame with ``date_id``, ``time_id``, ``symbol_id``,
        ``responder_6`` and the ``feature_*`` columns
    sequence_length (int): Number of consecutive rows per input window (>= 1)

    Returns:
    tuple: ``(X_sequences, y_sequences, scaler_X)`` where ``X_sequences`` has
    shape (n_windows, sequence_length, n_inputs), ``y_sequences`` has shape
    (n_windows,) and ``scaler_X`` is the fitted scaler. When ``df`` has no more
    than ``sequence_length`` rows, ``n_windows`` is 0 but the array ranks are kept.

    Raises:
    ValueError: If ``sequence_length`` is smaller than 1
    """
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

    # Select input features
    id_columns = ['date_id', 'time_id', 'symbol_id']
    feature_columns = [col for col in df.columns if 'feature_' in col]
    input_columns = id_columns + feature_columns

    # Separate input and output
    X = df[input_columns].values
    y = df['responder_6'].values

    # Scale the features
    scaler_X = StandardScaler()
    X_scaled = scaler_X.fit_transform(X)

    # Impute what is still missing; only NaN is touched, infinities are left alone
    X_scaled[np.isnan(X_scaled)] = 0.0

    n_sequences = len(X_scaled) - sequence_length
    if n_sequences <= 0:
        # Keep the 3-D layout so callers can still read .shape[2]
        empty_X = np.empty((0, sequence_length, X_scaled.shape[1]), dtype=X_scaled.dtype)
        return empty_X, y[:0].copy(), scaler_X

    # sliding_window_view yields (n_rows - L + 1, n_inputs, L). The last window
    # has no following row to predict, so it is dropped, and the axes are
    # swapped to (window, step, input).
    windows = np.lib.stride_tricks.sliding_window_view(X_scaled, sequence_length, axis=0)
    X_sequences = np.ascontiguousarray(windows[:n_sequences].transpose(0, 2, 1))
    y_sequences = np.array(y[sequence_length:])

    return X_sequences, y_sequences, scaler_X

In [7]:
class TimeSeriesDataset(Dataset):
    """Dataset pairing each input sequence with its target, both stored as float32 tensors.

    Parameters:
    X: Array-like (or tensor) of input sequences, indexed along its first axis
    y: Array-like (or tensor) of targets, one per entry of ``X``

    Raises:
    ValueError: If ``X`` and ``y`` do not have the same number of entries
    """
    def __init__(self, X, y):
        # as_tensor takes lists, ndarrays and tensors, converts to float32 and
        # avoids a copy when the input is already float32
        self.X = torch.as_tensor(X, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.float32)

        # Catch a mismatch now instead of as an IndexError in the middle of an epoch
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same length, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Number of (sequence, target) pairs."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(sequence, target)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

## LSTM Model

In [8]:
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a small MLP head that emits one value per sequence.

    Parameters:
    input_size (int): Number of features per time step
    hidden_size (int): Width of the LSTM hidden state
    num_layers (int): Number of stacked LSTM layers
    dropout (float): Dropout probability used between LSTM layers (only when
        ``num_layers > 1``) and inside the MLP head
    """
    def __init__(self, input_size, hidden_size=256, num_layers=2, dropout=0.2):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM applies dropout only between stacked layers and warns when
            # a non-zero value is given for a single layer
            dropout=dropout if num_layers > 1 else 0.0
        )

        self.fc = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_length, input_size) to predictions of shape (batch, 1)."""
        # With no initial state given, nn.LSTM starts from zeros that already
        # match the device and dtype of x
        lstm_out, _ = self.lstm(x)  # (batch_size, seq_length, hidden_size)

        # Only the representation of the last time step feeds the head
        return self.fc(lstm_out[:, -1, :])

## Training

In [10]:
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

def train_model(model, train_loader, val_loader, criterion, optimizer, schedular, num_epochs, device):
    """Train ``model`` with progress bars and restore the best-validation weights.

    Parameters:
    model (nn.Module): Network to optimise; called as ``model(batch_X)`` and
        expected to return one value per sample in its last dimension
    train_loader: Iterable of ``(batch_X, batch_y)`` training batches supporting ``len()``
    val_loader: Iterable of ``(batch_X, batch_y)`` validation batches supporting ``len()``
    criterion: Loss callable invoked as ``criterion(predictions, targets)``
    optimizer (torch.optim.Optimizer): Optimiser bound to the model parameters
    schedular: Learning-rate scheduler stepped once per epoch after validation,
        or None to keep the learning rate fixed. A ``ReduceLROnPlateau``
        instance is stepped with the epoch's validation loss.
    num_epochs (int): Number of passes over ``train_loader``
    device (torch.device): Device the batches are moved to

    Returns:
    tuple: ``(model, train_losses, val_losses)``; the model carries the weights
    of the epoch with the lowest validation loss, and the two lists hold the
    mean batch loss per epoch.
    """
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    best_model = None

    # Create epoch progress bar
    epoch_pbar = tqdm(range(num_epochs), desc='Training Progress', position=0)

    for epoch in epoch_pbar:
        # Training
        model.train()
        train_loss = 0

        # Create batch progress bar
        batch_pbar = tqdm(
            train_loader,
            desc=f'Epoch {epoch+1}/{num_epochs}',
            leave=False,
            position=1
        )

        for batch_X, batch_y in batch_pbar:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            optimizer.zero_grad()
            outputs = model(batch_X)
            # squeeze(-1) drops only the trailing singleton, so a batch of one
            # sample keeps shape (1,) and still lines up with batch_y
            loss = criterion(outputs.squeeze(-1), batch_y)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            # Update batch progress bar
            current_lr = optimizer.param_groups[0]['lr']
            batch_pbar.set_postfix({
                'batch_loss': f'{loss.item():.4f}',
                'lr': f'{current_lr:.6f}'
            })

        # Average training loss
        train_loss /= len(train_loader)

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            val_pbar = tqdm(
                val_loader,
                desc='Validation',
                leave=False,
                position=1
            )

            for batch_X, batch_y in val_pbar:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                outputs = model(batch_X)
                loss = criterion(outputs.squeeze(-1), batch_y)
                val_loss += loss.item()

                # Update validation progress bar
                val_pbar.set_postfix({'val_batch_loss': f'{loss.item():.4f}'})

        # Average losses
        val_loss /= len(val_loader)

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        # Advance the learning-rate schedule, if one was supplied
        if schedular is not None:
            if isinstance(schedular, torch.optim.lr_scheduler.ReduceLROnPlateau):
                schedular.step(val_loss)
            else:
                schedular.step()

        # Save best model. state_dict() hands back references to the live
        # parameters, so the tensors are cloned; otherwise later epochs would
        # silently overwrite the snapshot.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

        # Update epoch progress bar
        epoch_pbar.set_postfix({
            'train_loss': f'{train_loss:.4f}',
            'val_loss': f'{val_loss:.4f}',
            'best_val_loss': f'{best_val_loss:.4f}'
        })

    # Close progress bars
    epoch_pbar.close()

    # Load best model. No snapshot exists when no epoch ran or the validation
    # loss was never below infinity (e.g. NaN); the model is then returned as trained.
    if best_model is not None:
        model.load_state_dict(best_model)
    return model, train_losses, val_losses

## Main part

In [None]:
# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Parameters
SEQUENCE_LENGTH = 10
BATCH_SIZE = 32
EPOCHS = 50
LEARNING_RATE = 0.001
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Prepare data from the NaN-filtered frame built in the "Delete NaN Columns"
# section; the raw df still carries columns that are entirely NaN.
X_sequences, y_sequences, scaler_X = prepare_data(clean_df, SEQUENCE_LENGTH)

# Split data chronologically: the last 20% of the windows form the test set.
# No random_state is passed because it has no effect when shuffle=False.
X_train, X_test, y_train, y_test = train_test_split(
    X_sequences,
    y_sequences,
    test_size=0.2,
    shuffle=False
)

# Create datasets
train_dataset = TimeSeriesDataset(X_train, y_train)
test_dataset = TimeSeriesDataset(X_test, y_test)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Initialize model
input_size = X_sequences.shape[2]  # Number of features
model = LSTMModel(input_size=input_size).to(DEVICE)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Train model.
# NOTE(review): the test loader doubles as the validation set, so the weights
# are selected on the same data the metrics below are computed on.
model, train_losses, val_losses = train_model(
    model,
    train_loader,
    test_loader,
    criterion,
    optimizer,
    None,  # train_model's scheduler slot; no learning-rate scheduler is configured
    EPOCHS,
    DEVICE
)

# Evaluate model
model.eval()
test_predictions = []
test_actual = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X = batch_X.to(DEVICE)
        outputs = model(batch_X)
        # Drop the trailing singleton so predictions are (batch,) like the
        # targets; subtracting (N, 1) from (N,) would broadcast to an N x N matrix
        test_predictions.append(outputs.squeeze(-1).cpu().numpy())
        test_actual.append(batch_y.numpy())

test_predictions = np.concatenate(test_predictions)
test_actual = np.concatenate(test_actual)

# Calculate MAE
mae = np.mean(np.abs(test_predictions - test_actual))
print(f'\nTest MAE: {mae:.4f}')

## Model Evaluation

In [None]:
# Evaluate model
model.eval()
test_predictions = []
test_actual = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X = batch_X.to(DEVICE)
        outputs = model(batch_X)
        # Drop the trailing singleton so predictions are (batch,) like the
        # targets; subtracting (N, 1) from (N,) would broadcast to an N x N matrix
        test_predictions.append(outputs.squeeze(-1).cpu().numpy())
        test_actual.append(batch_y.numpy())

test_predictions = np.concatenate(test_predictions)
test_actual = np.concatenate(test_actual)

# Calculate metrics from one shared element-wise error vector
errors = test_predictions - test_actual
mae = np.mean(np.abs(errors))
mse = np.mean(errors ** 2)
rmse = np.sqrt(mse)

print('\nTest Metrics:')
print(f'MAE: {mae:.4f}')
print(f'MSE: {mse:.4f}')
print(f'RMSE: {rmse:.4f}')
    