In [None]:
import numpy as np
import pandas as pd
from google.colab import drive

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_auc_score, roc_curve, precision_recall_curve
)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks, regularizers
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import AUC, Precision, Recall
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout, BatchNormalization, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.models import Model
from sklearn.utils import class_weight

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV read later in the
# notebook lives under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the full panel from Drive and keep the rows whose 'date' falls in
# 2009-01-01 .. 2024-12-31 (both ends inclusive).
# NOTE(review): `between` is applied to the 'date' column exactly as read_csv
# returns it; if that column is stored as strings the comparison is
# lexicographic -- confirm the values are ISO formatted (YYYY-MM-DD).
stock_data = pd.read_csv("/content/drive/MyDrive/final_data.csv")

# .copy() makes the filtered frame an independent object. Later cells assign
# scaled columns back into `stock_data`; without the copy pandas treats those
# writes as assignments on a slice and emits its chained-assignment warning.
stock_data = stock_data[stock_data['date'].between('2009-01-01', '2024-12-31')].copy()

In [None]:
# Column groups referenced by the preprocessing and modelling cells.

# Identifier columns.
id_col = ['PERMNO', 'CUSIP', 'Ticker', 'SICCD', 'NAICS']

# Time column.
time_col = ['MthCalDt']

# Factor features (fed to the model's "returns" input).
factor_col = [
    'mom_12', 'mom_6',
    'vol_12', 'vol_6',
    'rev_1', 'rvol_1',
    'rsi_6', 'trend_strength',
]

# Market features.
market_col = [
    'qmj_safety',
    'seas_11_15na',
    'ret_3_1',
    'iskew_ff3_21d',
    'rskew_21d',
    'sti_gr1a',
    'earnings_variability',
    'nfna_gr1a',
    'seas_16_20an',
    'corr_1260d',
]

# Financial-statement features (fed to the model's "compustat" input).
fin_col = [
    'capxy', 'chechy', 'cshfdy',
    'cshpry', 'dltry', 'dpcy',
    'epspxy', 'oibdpy', 'txty',
]

In [None]:
# Standardise each feature group using statistics estimated on the training
# period only, then apply that transform to every row.
#
# Fitting on the whole panel would let validation and test rows influence the
# mean / standard deviation used to scale the training rows (look-ahead
# leakage), which inflates out-of-sample results.
#
# SCALER_FIT_END_YEAR must stay equal to the training cutoff used where the
# data is split on the "year" column further down (train: year <= 2019).
SCALER_FIT_END_YEAR = 2019
scaler_fit_rows = stock_data["year"] <= SCALER_FIT_END_YEAR

scaler_factor = StandardScaler().fit(stock_data.loc[scaler_fit_rows, factor_col])
stock_data[factor_col] = scaler_factor.transform(stock_data[factor_col])

scaler_fin = StandardScaler().fit(stock_data.loc[scaler_fit_rows, fin_col])
stock_data[fin_col] = scaler_fin.transform(stock_data[fin_col])

scaler_market = StandardScaler().fit(stock_data.loc[scaler_fit_rows, market_col])
stock_data[market_col] = scaler_market.transform(stock_data[market_col])

In [None]:
# Chronological split on the "year" column:
#   train      : year <= 2019
#   validation : 2019 < year <= 2022
#   test       : year > 2022
train_set = stock_data.loc[stock_data["year"].le(2019)]
val_set = stock_data.loc[stock_data["year"].gt(2019) & stock_data["year"].le(2022)]
test_set = stock_data.loc[stock_data["year"].gt(2022)]

In [None]:
# Complete model feature list: the three scaled groups followed by three extra
# columns that are not part of any scaler group.
ml_col = [*factor_col, *fin_col, *market_col, "Quarter", "naics", "ind_1"]

In [None]:
# Feature frame and target series ("pred_cat") for each split.
X_train, y_train = train_set[ml_col], train_set["pred_cat"]
X_val, y_val = val_set[ml_col], val_set["pred_cat"]
X_test, y_test = test_set[ml_col], test_set["pred_cat"]

In [None]:
# Input widths for the model branches. N_CLASSES = 1 gives the model a single
# sigmoid output unit.
N_COMPUSTAT_FEATURES = len(fin_col)
N_RETURN_FEATURES = len(factor_col)
N_CLASSES = 1

# Columns routed to the model's third ("embedding") input: the market features
# plus the three extra columns.
embedding_col = market_col + ["Quarter", "naics", "ind_1"]
N_EMBEDDING_FEATURES = len(embedding_col)

# Per-branch views of each split, in the order the model expects its inputs:
# compustat (fin_col), returns (factor_col), embeddings (embedding_col).
X_compustat_train = X_train[fin_col]
X_returns_train = X_train[factor_col]
X_embeddings_train = X_train[embedding_col]

X_compustat_val = X_val[fin_col]
X_returns_val = X_val[factor_col]
X_embeddings_val = X_val[embedding_col]

X_compustat_test = X_test[fin_col]
X_returns_test = X_test[factor_col]
X_embeddings_test = X_test[embedding_col]

In [None]:
def build_multi_input_model(n_embedding_features, use_compustat=True, use_returns=True, use_reports=True,):
    """Build and compile the three-branch binary classifier.

    The returned model always declares the same three inputs, in this order:
    ``compustat_input`` (shape ``(N_COMPUSTAT_FEATURES,)``), ``returns_input``
    (shape ``(N_RETURN_FEATURES, 1)``) and ``embedding_input`` (shape
    ``(n_embedding_features,)``). The ``use_*`` flags only decide which branch
    outputs are merged into the prediction head, so callers can pass the same
    list of three arrays whatever the flags are.

    Args:
        n_embedding_features: Width of the ``embedding_input`` vector.
        use_compustat: Merge the compustat branch into the head.
        use_returns: Merge the returns (Conv1D) branch into the head.
        use_reports: Merge the ``embedding_input`` branch into the head.

    Returns:
        A compiled ``Model`` (optimizer ``'adam'``, loss
        ``'binary_crossentropy'``, metric ``'accuracy'``) with one sigmoid
        output layer of ``N_CLASSES`` units.

    Raises:
        ValueError: If every ``use_*`` flag is false. Previously this case
            fell through the if/elif chain and failed with an
            UnboundLocalError on ``combined``.
    """
    # Validate before building any layers so the failure is immediate and clear.
    if not (use_compustat or use_returns or use_reports):
        raise ValueError(
            "At least one of use_compustat, use_returns, use_reports must be True."
        )

    # Input A: compustat branch -- Dense(64) -> BatchNorm -> Dropout -> Dense(32).
    input_compustat = Input(shape=(N_COMPUSTAT_FEATURES,), name='compustat_input')
    x1 = Dense(64, activation='relu')(input_compustat)
    x1 = BatchNormalization()(x1)
    x1 = Dropout(0.3)(x1)
    x1 = Dense(32, activation='relu')(x1)

    # Input B: returns branch. The input is declared with a trailing channel
    # axis of size 1, so the width-2 convolution slides along the feature axis
    # (length N_RETURN_FEATURES) and global max pooling collapses that axis.
    input_returns = Input(shape=(N_RETURN_FEATURES, 1), name='returns_input')
    x2 = Conv1D(filters=32, kernel_size=2, activation='relu')(input_returns)
    x2 = GlobalMaxPooling1D()(x2)
    x2 = Dropout(0.3)(x2)

    # Input C: embedding branch -- Dense(128) -> BatchNorm -> Dropout -> Dense(64).
    input_embeddings = Input(shape=(n_embedding_features,), name='embedding_input')
    x3 = Dense(128, activation='relu')(input_embeddings)
    x3 = BatchNormalization()(x3)
    x3 = Dropout(0.5)(x3)
    x3 = Dense(64, activation='relu')(x3)

    # Keep the enabled branch outputs in the fixed order x1, x2, x3.
    enabled_branches = [
        branch
        for branch, wanted in ((x1, use_compustat), (x2, use_returns), (x3, use_reports))
        if wanted
    ]
    # A single enabled branch feeds the head directly; two or more are concatenated.
    if len(enabled_branches) == 1:
        combined = enabled_branches[0]
    else:
        combined = Concatenate()(enabled_branches)

    # Final prediction head.
    z = Dense(64, activation='relu')(combined)
    z = Dropout(0.5)(z)
    z = Dense(32, activation='relu')(z)

    # Output layer for binary classification.
    output_layer = Dense(N_CLASSES, activation='sigmoid', name='output')(z)

    # All three inputs stay in the model signature even when a branch is disabled.
    model = Model(inputs=[input_compustat, input_returns, input_embeddings], outputs=output_layer)

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

In [None]:
# Seed Python, NumPy and the Keras backend before any weights are created so
# that weight initialisation, dropout and shuffling are repeatable across runs.
keras.utils.set_random_seed(42)

model = build_multi_input_model(N_EMBEDDING_FEATURES, True, True, True)
model.summary()

# Training callbacks (each monitors the Keras default quantity):
#   - EarlyStopping: stop after 10 epochs without improvement and restore the
#     best weights seen.
#   - ReduceLROnPlateau: halve the learning rate after 5 epochs without
#     improvement.
#   - ModelCheckpoint: write the best model so far to 'best_dnn_model.h5'.
dnn_callbacks = [
    callbacks.EarlyStopping(patience=10, restore_best_weights=True),
    callbacks.ReduceLROnPlateau(patience=5, factor=0.5),
    callbacks.ModelCheckpoint('best_dnn_model.h5', save_best_only=True)
]
# Alias kept so any other cell that refers to `early_stopping` still works.
early_stopping = dnn_callbacks

all_possible_classes = np.array([0, 1])

# "balanced" class weights computed from the training labels, so the two
# classes contribute equally to the loss.
class_weights = class_weight.compute_class_weight(
    class_weight = "balanced",
    classes = all_possible_classes,
    y = np.ravel(y_train)
)

# Map class index -> weight, the form expected by `fit(class_weight=...)`.
class_weight_dict = dict(enumerate(class_weights))

print(class_weight_dict)

# NOTE(review): `returns_input` is declared with shape (N_RETURN_FEATURES, 1)
# while X_returns_* are 2-D frames; the recorded run trained without error --
# confirm Keras is adding the trailing axis as intended.
history = model.fit(
    [X_compustat_train, X_returns_train, X_embeddings_train],
    y_train,
    validation_data=([X_compustat_val, X_returns_val, X_embeddings_val], y_val),
    epochs=50,
    batch_size=256,
    # Pass the flat list; the original wrapped it in another list.
    callbacks=dnn_callbacks,
    class_weight=class_weight_dict
)

{0: np.float64(1.116786460861054), 1: np.float64(0.9053266663003462)}
Epoch 1/50
[1m3218/3218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5257 - loss: 0.6974



[1m3218/3218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 5ms/step - accuracy: 0.5257 - loss: 0.6974 - val_accuracy: 0.5173 - val_loss: 0.6925 - learning_rate: 0.0010
Epoch 2/50
[1m3218/3218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.5436 - loss: 0.6905 - val_accuracy: 0.5191 - val_loss: 0.6929 - learning_rate: 0.0010
Epoch 3/50
[1m3218/3218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.5420 - loss: 0.6900 - val_accuracy: 0.5098 - val_loss: 0.6955 - learning_rate: 0.0010
Epoch 4/50
[1m3218/3218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.5433 - loss: 0.6897 - val_accuracy: 0.5165 - val_loss: 0.6931 - learning_rate: 0.0010
Epoch 5/50
[1m3218/3218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.5388 - loss: 0.6898 - val_accuracy: 0.5003 - val_loss: 0.6957 - learning_ra

In [None]:
# Score the model on the test split. The model is compiled with a single
# 'accuracy' metric, so evaluate() yields the loss followed by the accuracy.
test_inputs = [X_compustat_test, X_returns_test, X_embeddings_test]
test_loss, test_accuracy = model.evaluate(test_inputs, y_test)

print(f"\nTest Accuracy: {test_accuracy:.4f}")
print(f"Test Loss: {test_loss:.4f}")

[1m5987/5987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.5598 - loss: 0.6875

Test Accuracy: 0.5598
Test Loss: 0.6876


In [None]:
import matplotlib.pyplot as plt

def plot_training_history(history):
    """Plot training curves on a 2x2 grid, save them to ``DNN.jpg`` and show them.

    Panels:
        [0, 0] train / validation AUC (only if ``'auc'`` was recorded)
        [0, 1] train / validation loss
        [1, 0] train / validation accuracy
        [1, 1] train / validation precision if ``'precision'`` was recorded,
               otherwise the learning-rate schedule (log scale) if one was
               logged
    Panels with nothing to draw are hidden.

    Args:
        history: Object returned by ``model.fit``; its ``.history`` dict maps
            metric names to per-epoch value lists. ``'loss'``, ``'val_loss'``,
            ``'accuracy'`` and ``'val_accuracy'`` must be present.
    """
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    metrics = history.history

    if "auc" in metrics:
        axes[0, 0].plot(metrics['auc'], label='Train AUC')
        axes[0, 0].plot(metrics['val_auc'], label='Val AUC')
        axes[0, 0].set_title('Model AUC')
        axes[0, 0].legend()
    else:
        # Nothing to draw: hide the panel rather than show an empty frame.
        axes[0, 0].axis('off')

    axes[0, 1].plot(metrics['loss'], label='Train Loss')
    axes[0, 1].plot(metrics['val_loss'], label='Val Loss')
    axes[0, 1].set_title('Model Loss')
    axes[0, 1].legend()

    axes[1, 0].plot(metrics['accuracy'], label='Train Acc')
    axes[1, 0].plot(metrics['val_accuracy'], label='Val Acc')
    axes[1, 0].set_title('Model Accuracy')
    axes[1, 0].legend()

    # The training log in this notebook reports the rate as 'learning_rate';
    # 'lr' is still accepted so histories using that key plot as before.
    lr_key = next((key for key in ('learning_rate', 'lr') if key in metrics), None)

    # Precision and learning rate share the last panel, so draw only one of
    # them; mixing both on one (log-scaled) axis made the panel unreadable.
    if "precision" in metrics:
        axes[1, 1].plot(metrics['precision'], label='Train Precision')
        axes[1, 1].plot(metrics['val_precision'], label='Val Precision')
        axes[1, 1].set_title('Model Precision')
        axes[1, 1].legend()
    elif lr_key is not None:
        axes[1, 1].plot(metrics[lr_key])
        axes[1, 1].set_title('Learning Rate')
        axes[1, 1].set_yscale('log')
    else:
        axes[1, 1].axis('off')

    fig.tight_layout()
    # Save before showing: once plt.show() returns the figure is no longer
    # current, so saving afterwards wrote an empty image.
    fig.savefig("DNN.jpg")
    plt.show()

# Draw the training curves for the run stored in `history` (the return value
# of model.fit above); the function also writes the figure to DNN.jpg.
plot_training_history(history)