In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.backend import clear_session
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.constraints import Constraint
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [2]:
class VolumePredictor:
    """Autoregressive (NAR) dense network that predicts the next volume bar from lagged bars.

    Typical use: ``load_data`` -> ``prepare_data`` -> ``build_model`` -> ``train``
    -> ``predict`` / ``calculate_metrics``.

    Attributes:
        stock: Ticker label; only used in plot file names.
        neurons_layer: List of hidden-layer widths (one or two entries are used).
        lags: Number of past bars fed to the network as inputs.
        frequency: ``'15min'`` triggers 15-minute resampling; any other value
            leaves the bars at the frequency stored in the file.
        direction: ``'total'``, ``'buy'`` or anything else (treated as sell).
        volume: Name of the source column selected by ``direction``.
        scaler: ``MinMaxScaler`` fitted in ``prepare_data``.
        model: Keras model created by ``build_model`` / ``load_model``.
        history: Keras ``History`` of the last ``train`` call.
    """

    # Root folder for the diagnostic plots; one sub-folder per direction.
    IMAGE_DIR = '/data/workspace_files/image'

    # direction -> column name; every unlisted direction falls back to 'sellVol'.
    _VOLUME_COLUMNS = {'total': 'volume', 'buy': 'buyVol'}

    def __init__(self, stock, neurons_layer, lags, frequency, direction, validation_split=0.25):
        """Store the hyper-parameters and pick the volume column for ``direction``.

        Args:
            stock: Ticker label used in output file names.
            neurons_layer: Hidden-layer sizes, e.g. ``[10]`` or ``[16, 8]``.
            lags: Number of lagged observations per sample.
            frequency: Bar frequency label (``'15min'`` enables resampling).
            direction: ``'total'`` -> ``'volume'``, ``'buy'`` -> ``'buyVol'``,
                any other value -> ``'sellVol'``.
            validation_split: Fraction of the pre-test samples (taken from the
                end, i.e. the most recent ones) held out for validation.
        """
        self.stock = stock
        self.neurons_layer = neurons_layer
        self.lags = lags
        self.frequency = frequency
        self.direction = direction
        self.validation_split = validation_split
        self.scaler = MinMaxScaler()
        self.model = None
        self.history = None
        self.volume = self._VOLUME_COLUMNS.get(direction, 'sellVol')

    def load_data(self, file_path, start='2024/09', end='2025/02'):
        """Read a parquet bar file and return the cleaned volume column.

        Steps: index by ``ts_event``, keep ``start <= ts < end``, optionally
        resample to 15 minutes (sum), keep bars with 09:30 <= time < 16:00,
        drop zero-volume bars and forward-fill missing values.

        Args:
            file_path: Path of the parquet file; must contain ``ts_event`` and
                the volume column selected by ``direction``.
            start: Inclusive lower bound of the sample window.
            end: Exclusive upper bound of the sample window.

        Returns:
            Single-column ``DataFrame`` (column ``self.volume``) indexed by time.

        Raises:
            KeyError: If the expected volume column is not available.
        """
        df = pq.read_table(file_path).to_pandas()
        df['ts_event'] = pd.to_datetime(df['ts_event'])
        df = df.set_index('ts_event')

        df = df[(df.index >= start) & (df.index < end)]
        if self.frequency == '15min':
            # '15min' is the non-deprecated spelling of the old '15T' alias.
            df = df.resample('15min').sum(numeric_only=True)

        session_open = pd.to_datetime('09:30').time()
        session_close = pd.to_datetime('16:00').time()
        bar_times = df.index.time
        df = df[(bar_times >= session_open) & (bar_times < session_close)]

        # Fail with a clear message instead of silently substituting another column.
        if self.volume not in df.columns:
            raise KeyError(
                f"Column '{self.volume}' not found in {file_path}; "
                f"available columns: {list(df.columns)}"
            )

        df = df[[self.volume]]
        # Resampling creates empty (zero-sum) bars for gaps; drop them.
        df = df[df[self.volume] != 0]

        return df.ffill()

    def prepare_data(self, data, test_start='2025/01'):
        """Scale the series and build lagged train / validation / test samples.

        Each sample uses ``self.lags`` consecutive scaled values as inputs and
        the following value as target; the sample is dated by its target bar.
        Samples dated ``>= test_start`` form the test set. Of the remaining
        samples the last ``validation_split`` fraction is the validation set.

        Args:
            data: Frame returned by ``load_data``.
            test_start: First date belonging to the test set.

        Returns:
            ``(X_train, y_train, train_dates), (X_val, y_val, val_dates),
            (X_test, y_test, test_dates)``; every ``X`` has shape ``(n, lags)``.
        """
        # NOTE(review): the scaler is fit on the full series, test period
        # included, so min/max of the test window leak into training.
        scaled_data = self.scaler.fit_transform(data[self.volume].values.reshape(-1, 1))
        series = scaled_data[:, 0]

        n_samples = max(len(series) - self.lags, 0)
        windows = [series[i:i + self.lags] for i in range(n_samples)]
        # Explicit reshape keeps X two-dimensional even when there are no samples.
        X = np.array(windows).reshape(n_samples, self.lags)
        y = np.array([series[i + self.lags] for i in range(n_samples)])
        dates = pd.DatetimeIndex([data.index[i + self.lags] for i in range(n_samples)])

        test_mask = dates >= test_start
        train_val_mask = ~test_mask

        X_train_val = X[train_val_mask]
        y_train_val = y[train_val_mask]
        train_val_dates = dates[train_val_mask]

        # Use an explicit split index: slicing with ``-val_size`` breaks when
        # val_size is 0 (``[:-0]`` is empty and ``[-0:]`` is everything).
        val_size = int(len(X_train_val) * self.validation_split)
        split_at = max(len(X_train_val) - val_size, 0)

        train_set = (X_train_val[:split_at], y_train_val[:split_at], train_val_dates[:split_at])
        val_set = (X_train_val[split_at:], y_train_val[split_at:], train_val_dates[split_at:])
        test_set = (X[test_mask], y[test_mask], dates[test_mask])

        return train_set, val_set, test_set

    def build_model(self):
        """Create and compile the network (Adam, lr=0.001, MSE loss).

        One entry in ``neurons_layer`` gives a single sigmoid hidden layer;
        otherwise the first two entries give two ReLU hidden layers with
        He-normal initialisation. The output is one linear unit.
        """
        if len(self.neurons_layer) == 1:
            # Build an autoregressive neural network with one hidden layer
            self.model = Sequential(
                [
                Dense(self.neurons_layer[0], activation='sigmoid', input_dim=self.lags),
                Dense(1, activation='linear')
            ]
            )
        else:
            # Build an autoregressive neural network with two hidden layers
            self.model = Sequential([
                Dense(self.neurons_layer[0],
                    activation='relu', 
                    input_dim=self.lags,
                    kernel_initializer='he_normal'), 
                Dense(self.neurons_layer[1], 
                    activation='relu',
                    kernel_initializer='he_normal'),
                Dense(1, activation='linear')
            ])
        
        self.model.compile(
            optimizer=Adam(learning_rate=0.001),
            loss='mse'
        )
    
    def train(self, X_train, y_train, X_val, y_val, epochs=200, batch_size=32):
        """Fit the model with early stopping, then draw both loss plots.

        Early stopping watches ``val_loss`` (patience 10) and restores the best
        weights. The best model is also written to ``best_model.h5`` in the
        working directory; that file is overwritten by every call.

        Args:
            X_train, y_train: Training inputs and targets.
            X_val, y_val: Validation inputs and targets.
            epochs: Maximum number of epochs.
            batch_size: Mini-batch size.

        Returns:
            The Keras ``History`` object (also stored in ``self.history``).
        """
        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True
        )
        
        checkpoint = ModelCheckpoint(
            'best_model.h5',
            monitor='val_loss',
            save_best_only=True,
            mode='min',
            verbose=0
        )
        
        self.history = self.model.fit(
            X_train, y_train,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_val, y_val),
            callbacks=[early_stopping, checkpoint],
            verbose=1
        )
        
        self.plot_loss_curves()
        self.plot_loss_relationship()
        
        return self.history

    def _figure_path(self, kind):
        """Return the JPEG path for plot ``kind`` and make sure its folder exists."""
        target = Path(
            f'{self.IMAGE_DIR}/{self.direction}/{self.stock}_hn_{self.neurons_layer}'
            f'_l_{self.lags}_{self.frequency}_{kind}_{self.direction}.jpg'
        )
        # savefig does not create missing folders; do it here so a long
        # training run is not lost to a FileNotFoundError.
        target.parent.mkdir(parents=True, exist_ok=True)
        return target
    
    def plot_loss_curves(self):
        """Plot training and validation loss per epoch, save it as JPEG and show it.

        Prints a message and returns if ``train`` has not been run yet.
        """
        if self.history is None:
            print("No training history available")
            return
        fig = plt.figure(figsize=(5, 3))
        plt.plot(self.history.history['loss'], label='Training Loss')
        plt.plot(self.history.history['val_loss'], label='Validation Loss')
        plt.title(f'Model Loss (Neurons: {self.neurons_layer if len(self.neurons_layer) > 1 else str(self.neurons_layer[0])}, Lags: {self.lags})')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.grid(True)
        plt.savefig(self._figure_path('model_loss'))
        plt.show()
        # Release the figure; many models are usually trained in one session.
        plt.close(fig)

    def plot_loss_relationship(self):
        """Plot validation loss against training loss and report the best epoch.

        The epoch with the lowest validation loss is marked with a dot and the
        epoch right after it (if any) with a triangle. The figure is saved as
        JPEG, shown, and the best epoch's losses are printed.
        """
        if self.history is None:
            print("No training history available")
            return
            
        train_losses = self.history.history['loss']
        val_losses = self.history.history['val_loss']
        
        # Epoch with the lowest validation loss and the training loss at that epoch
        min_val_loss_idx = np.argmin(val_losses)
        min_val_loss = val_losses[min_val_loss_idx]
        corresponding_train_loss = train_losses[min_val_loss_idx]
        
        fig = plt.figure(figsize=(5, 3))
        
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.xlabel('L_train')
        plt.ylabel('L_val')
        plt.plot(train_losses, val_losses, 'b-', alpha=0.5)
        
        # Dot marker at the epoch with minimal validation loss
        plt.plot([corresponding_train_loss], [min_val_loss], 'ko', 
                label='min loss function, lack of generalization')
        
        # Triangle marker at the first epoch after the minimum
        if len(train_losses) > min_val_loss_idx + 1:
            overfitting_idx = min_val_loss_idx + 1
            plt.plot([train_losses[overfitting_idx]], [val_losses[overfitting_idx]], 
                    '^k', label='overfitting')
        
        # Axis limits with 10% padding around the observed losses
        plt.xlim(min(train_losses) * 0.9, max(train_losses) * 1.1)
        plt.ylim(min(val_losses) * 0.9, max(val_losses) * 1.1)
        
        plt.legend()
        plt.title('plot_loss_relationship')
        plt.savefig(self._figure_path('loss_relationship'))
        plt.show()
        plt.close(fig)
        
        print("Optimal point:")
        print(f"Train Loss: {corresponding_train_loss:.4f}")
        print(f"Val Loss: {min_val_loss:.4f}")
        print(f"Reach at the {min_val_loss_idx + 1} th round")


    def predict(self, X):
        """Predict with the model and map the output back to original volume units.

        Args:
            X: Scaled lag matrix as produced by ``prepare_data``.

        Returns:
            Array of shape ``(n, 1)`` in the original scale.
        """
        predictions = self.model.predict(X)
        return self.scaler.inverse_transform(predictions.reshape(-1, 1))
    
    def calculate_metrics(self, y_true, y_pred):
        """Compute RRMSE and MAE after undoing the scaling.

        Args:
            y_true: Scaled targets.
            y_pred: Scaled predictions.

        Returns:
            Dict with ``'rrmse'`` (RMSE divided by the mean of the true values)
            and ``'mae'``, both in original units.
        """
        y_true = self.scaler.inverse_transform(np.asarray(y_true).reshape(-1, 1))
        y_pred = self.scaler.inverse_transform(np.asarray(y_pred).reshape(-1, 1))
        
        errors = y_true - y_pred
        rrmse = np.sqrt(np.mean(np.square(errors))) / np.mean(y_true)
        mae = np.mean(np.abs(errors))
        
        return {'rrmse': rrmse, 'mae': mae}

    def save_model(self, filepath):
        """Save the trained Keras model to ``filepath``."""
        self.model.save(filepath)

    def load_model(self, filepath):
        """Load a previously saved Keras model from ``filepath`` into ``self.model``.

        The fitted scaler is not part of the saved model and is not restored.
        """
        from tensorflow.keras.models import load_model
        self.model = load_model(filepath)

In [None]:
# Grid search over trade direction, bar frequency, ticker, architecture and lag count.
# For every (direction, frequency, ticker) one summary CSV is written; for every
# model one prediction CSV and one actual-vs-predicted plot are written.
HIDDEN_LAYER_GRID = [[5], [10], [20], [40], [2, 2], [5, 2], [5, 5], [16, 8], [32, 16]]
LAG_GRID = [2, 3, 5, 10, 20, 30, 50]
SUMMARY_COLUMNS = ['Stock', 'Hidden Neurons', 'Lags',
                   'Training RRMSE', 'Validation RRMSE', 'Testing RRMSE']

for d in ['total', 'buy', 'sell']:
    for f in ['1min', '15min']:
        for s in ['SPY', 'QQQ']:
            # The loaded series depends only on ticker, frequency and direction,
            # so read the parquet file once instead of once per model.
            loader = VolumePredictor(stock=s, neurons_layer=HIDDEN_LAYER_GRID[0],
                                     lags=LAG_GRID[0], frequency=f, direction=d)
            volume_data = loader.load_data(f'/data/workspace_files/pq/{s.lower()}_bar_1m_tradeDir.pq')

            summary_rows = []
            for hn in HIDDEN_LAYER_GRID:
                for l in LAG_GRID:
                    # Drop graph/state left over from the previous model so memory
                    # does not grow across the hundreds of models built here.
                    clear_session()

                    predictor = VolumePredictor(stock=s, neurons_layer=hn, lags=l, frequency=f, direction=d)

                    (X_train, y_train, train_dates), (X_val, y_val, val_dates), (X_test, y_test, test_dates) = predictor.prepare_data(volume_data)

                    # Build and train model
                    predictor.build_model()
                    history = predictor.train(X_train, y_train, X_val, y_val, epochs=200, batch_size=32)

                    # Predictions stay in scaled units; calculate_metrics undoes the scaling.
                    train_pred = predictor.model.predict(X_train)
                    val_pred = predictor.model.predict(X_val)
                    test_pred = predictor.model.predict(X_test)
                    # NOTE(review): the clip is applied in scaled space and to the
                    # test split only; scaled 0 is the series minimum, not volume 0.
                    test_pred = np.maximum(test_pred, 0)

                    train_loss = predictor.calculate_metrics(y_train, train_pred)
                    val_loss = predictor.calculate_metrics(y_val, val_pred)
                    test_loss = predictor.calculate_metrics(y_test, test_pred)

                    y_test_orig = predictor.scaler.inverse_transform(y_test.reshape(-1, 1))
                    test_pred_orig = predictor.scaler.inverse_transform(test_pred.reshape(-1, 1))

                    # Per-model test predictions in original units
                    model_predictions = pd.DataFrame({
                        'Date': test_dates,
                        'Actual': y_test_orig.flatten(),
                        'Predicted': test_pred_orig.flatten(),
                        'Hidden_Neurons': f"{hn[0]},{hn[1]}" if len(hn) > 1 else hn[0],
                        'Lags': l
                    })

                    # Actual vs. predicted volume on the test period
                    fig = plt.figure(figsize=(5, 3))
                    plt.plot(test_dates, y_test_orig, label='Actual')
                    plt.plot(test_dates, test_pred_orig, label='Predicted')
                    plt.title(f'Test Set: Actual vs Predicted for {hn if len(hn) > 1 else str(hn[0])} neurons with lag {l}')
                    plt.xlabel('Time')
                    plt.ylabel('Trading Volume')
                    plt.legend()
                    plt.xticks(rotation=45)
                    plt.tight_layout()
                    figure_path = Path(f'/data/workspace_files/image/{d}/{s}_hn_{hn}_l_{l}_{f}_volume_prediction_{d}.jpg')
                    figure_path.parent.mkdir(parents=True, exist_ok=True)
                    plt.savefig(figure_path)
                    plt.show()
                    plt.close(fig)

                    summary_rows.append([
                        s, hn, l,
                        f'{train_loss["rrmse"]:.3f}', f'{val_loss["rrmse"]:.3f}', f'{test_loss["rrmse"]:.3f}'
                    ])

                    predictions_path = Path(f'/data/workspace_files/20250225/result/{d}/{s}_hn_{hn if len(hn) > 1 else str(hn[0])}_l_{l}_{f}_{d}.csv')
                    predictions_path.parent.mkdir(parents=True, exist_ok=True)
                    model_predictions.to_csv(predictions_path)

            # Build the summary once from the collected rows instead of growing
            # a DataFrame row by row.
            results = pd.DataFrame(summary_rows, columns=SUMMARY_COLUMNS)
            summary_path = Path(f'/data/workspace_files/result/{s}_model_performance_summary_{f}_{d}.csv')
            summary_path.parent.mkdir(parents=True, exist_ok=True)
            results.to_csv(summary_path)