In [None]:
import numpy as np
import pandas as pd
from google.colab import drive

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_auc_score, roc_curve, precision_recall_curve
)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import AUC, Precision, Recall
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
import warnings

# Silence every warning category for the rest of the kernel session.
# This is global: deprecation and data-conversion warnings raised by later
# cells will not be shown either.
warnings.simplefilter('ignore')

In [None]:
# Mount Google Drive inside the Colab VM; the CSV files read below live under
# /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the stock panel from Drive and keep only the rows whose 'date' value
# falls inside 2004-01-01 .. 2024-12-31 (Series.between is inclusive on both
# ends by default).
stock_data = (
    pd.read_csv("/content/drive/MyDrive/final_data.csv")
    .loc[lambda frame: frame['date'].between('2004-01-01', '2024-12-31')]
)

In [None]:
# Additional factor columns; the first CSV column is used as the row index.
# NOTE(review): the later column-wise concat aligns on this index, so it
# presumably matches stock_data's row labels — confirm.
extra_data = pd.read_csv("/content/drive/MyDrive/extra_factor.csv",index_col=0)

In [None]:
# Attach the extra factor columns to the stock panel, aligning rows on the
# index (pd.concat with axis=1 performs an outer join on row labels).
#
# Any column that already carries one of extra_data's labels is dropped first.
# On a first run this is a no-op; on a re-run it stops the cell from appending
# the same columns a second time, which would create duplicate column labels
# and silently double the features selected later on.
stock_data = pd.concat(
    [stock_data.drop(columns=extra_data.columns, errors="ignore"), extra_data],
    axis=1,
)

In [None]:
# Column groups. factor_col, fin_col and market_col are concatenated into the
# model's feature list further down; id_col and time_col are only defined here
# in the visible part of the notebook.

# Identifier columns.
# NOTE(review): the displayed frame shows a lowercase 'cusip' column, while
# this list says 'CUSIP' — confirm before indexing with id_col.
id_col = ["PERMNO", "CUSIP", "Ticker", "SICCD", "NAICS"]

# Calendar column.
time_col = ["MthCalDt"]

# First feature group.
factor_col = [
    "mom_12", "mom_6",
    "vol_12", "vol_6",
    "rev_1", "rvol_1",
    "beta", "rsi_6", "trend_strength",
]

# Second feature group.
market_col = [
    "qmj_safety", "seas_11_15na",
    "ret_3_1", "iskew_ff3_21d", "rskew_21d",
    "sti_gr1a", "earnings_variability", "nfna_gr1a",
    "seas_16_20an", "corr_1260d",
]

# Third feature group.
fin_col = [
    "capxy", "chechy", "cshfdy",
    "cshpry", "dltry", "dpcy",
    "epspxy", "oibdpy", "txty",
]
# Labels of every column supplied by extra_data (a pandas Index, not a list —
# it is wrapped in list() where the feature list is built).
extra_col = extra_data.columns

In [None]:
# Chronological split on the 'year' column:
#   train_set : year <= 2018
#   vol_set   : 2018 < year <= 2021   (passed as validation data to fit())
#   test_set  : year > 2021
train_set = stock_data.loc[lambda d: d["year"] <= 2018]
vol_set = stock_data.loc[lambda d: (d["year"] > 2018) & (d["year"] <= 2021)]
test_set = stock_data.loc[lambda d: d["year"] > 2021]

In [None]:
# Full ordered feature list fed to the model: the three hand-picked groups,
# every extra_data column, then three further columns.
ml_col = [
    *factor_col,
    *fin_col,
    *market_col,
    *extra_col,
    "quarter", "naics", "ind_1",
]

In [None]:
def _features_and_target(frame):
    """Return (feature ndarray, 'pred_cat' Series) for one split of the panel."""
    return frame[ml_col].values, frame["pred_cat"]


# Same extraction for each chronological split.
X_train, y_train = _features_and_target(train_set)
X_vol, y_vol = _features_and_target(vol_set)
X_test, y_test = _features_and_target(test_set)

In [None]:
# Show the merged frame as the cell output (pandas elides the middle rows and
# columns of a frame this large).
stock_data

Unnamed: 0,PERMNO,cusip,Ticker,TradingSymbol,PERMCO,SICCD,NAICS,MthCalDt,MthRet,sprtrn,...,rf_bootstrap_3,rf_bootstrap_4,rf_bootstrap_5,rf_bootstrap_6,rf_depth_1,rf_depth_2,rf_depth_3,rf_depth_4,genetic_ordinal_1,genetic_ordinal_10
0,88488,57059Y20,MKH,MKH,38391,6726,0,2004-01-30,0.017517,0.017276,...,0.700014,0.707231,0.721664,0.721664,0.013986,0.000000,0.675439,0.008475,7.779292,0.004968
1,88488,57059Y20,MKH,MKH,38391,6726,0,2004-01-30,0.017517,0.017276,...,0.721498,0.707068,0.714283,0.721498,0.013986,0.000000,0.675439,0.008475,7.779292,0.004968
2,88488,57059Y20,MKH,MKH,38391,6726,0,2004-01-30,0.017517,0.017276,...,0.709011,0.709011,0.709011,0.709011,0.013986,0.000000,0.070175,0.008475,7.779292,0.004968
3,88488,57059Y20,MKH,MKH,38391,6726,0,2004-01-30,0.017517,0.017276,...,0.685363,0.692429,0.699494,0.706560,0.013986,0.000000,0.070175,0.008475,7.779292,0.004968
4,88311,75902E10,RKH,RKH,37676,6726,0,2004-01-30,0.010649,0.017276,...,0.671143,0.678062,0.684981,0.691900,0.010490,0.000000,0.070175,0.327684,-14.603992,-0.000348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522303,19421,25460G77,WFH,WFH,53101,6726,525990,2024-12-31,-0.005809,-0.024990,...,0.579013,0.579013,0.573223,0.579013,0.269231,0.563776,0.085526,0.378531,-0.720313,0.002805
1522304,19425,31609234,FBCV,FBCV,54581,6726,525990,2024-12-31,-0.068526,-0.024990,...,0.568373,0.574232,0.585951,0.585951,0.269231,0.563776,0.085526,0.378531,-0.674258,0.000484
1522305,19426,31609235,FBCG,FBCG,54581,6726,525990,2024-12-31,0.006564,-0.024990,...,0.584827,0.602914,0.602914,0.602914,0.269231,0.563776,0.085526,0.378531,-0.644826,0.002802
1522306,19427,31609236,FFLC,FFLC,54581,6726,525990,2024-12-31,-0.033064,-0.024990,...,0.572296,0.572296,0.572296,0.572296,0.269231,0.563776,0.085526,0.378531,-0.674483,0.002278


In [None]:
def create_dnn_model(input_dim, hidden_layers=(256, 128, 64), dropout_rate=0.3):
    """Build an uncompiled feed-forward network with a single sigmoid output.

    Architecture, in order:
      Input(input_dim) -> BatchNormalization
      -> for each entry in ``hidden_layers``:
             Dense(units, relu) -> BatchNormalization -> Dropout(dropout_rate)
      -> Dense(1, sigmoid)

    Args:
        input_dim: Number of features per sample (width of the input vector).
        hidden_layers: Iterable of unit counts, one per hidden Dense layer.
            The default is an immutable tuple so it cannot be modified
            between calls; lists are still accepted.
        dropout_rate: Rate passed to every Dropout layer.

    Returns:
        A ``Sequential`` model. The caller is responsible for ``compile()``.
    """
    # Normalising directly after the input lets the network cope with
    # features that have not been scaled beforehand.
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.BatchNormalization()
    ])

    for units in hidden_layers:
        model.add(layers.Dense(units, activation='relu'))
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(dropout_rate))

    # One sigmoid unit: the output is a single value in (0, 1) per sample.
    model.add(layers.Dense(1, activation='sigmoid'))

    return model


# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation, dropout masks and batch shuffling repeat on a fresh
# Restart & Run All. (GPU kernels may still introduce small run-to-run
# differences.)
DNN_SEED = 42
tf.keras.utils.set_random_seed(DNN_SEED)

# The input width is the number of feature columns in the training matrix.
dnn_model = create_dnn_model(X_train.shape[1])

dnn_model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss=BinaryCrossentropy(),
    metrics=['accuracy', AUC(name='auc'), Precision(name='precision'), Recall(name='recall')]
)

print("Model Structure:")
dnn_model.summary()

print("Training")
# All three callbacks watch the validation loss. 'val_loss' is the Keras
# default for each of them; it is spelled out here so the stopping and
# checkpoint criterion is visible.
# NOTE(review): in the recorded run val_loss rises over the first epochs while
# val_auc stays flat — consider a lower learning rate; not changed here.
dnn_callbacks = [
    callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True),
    callbacks.ReduceLROnPlateau(monitor='val_loss', patience=10, factor=0.5),
    callbacks.ModelCheckpoint('best_dnn_model.h5', monitor='val_loss', save_best_only=True)
]

dnn_history = dnn_model.fit(
    X_train, y_train,
    validation_data=(X_vol, y_vol),
    epochs=50,
    batch_size=32,
    callbacks=dnn_callbacks,
    verbose=1
)


# predict() yields one sigmoid output per row, i.e. an (n, 1) array. The
# stored arrays keep that shape; only the copies handed to scikit-learn are
# flattened, because its metrics expect 1-D label/score vectors.
dnn_pred_proba = dnn_model.predict(X_test)
dnn_pred = (dnn_pred_proba > 0.5).astype(int)
dnn_accuracy = accuracy_score(y_test, dnn_pred.ravel())
dnn_auc = roc_auc_score(y_test, dnn_pred_proba.ravel())

print(f"DNN - Accuracy: {dnn_accuracy:.4f}, AUC: {dnn_auc:.4f}")
print()

Model Structure:


Training
Epoch 1/50
[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6449 - auc: 0.6889 - loss: 0.6358 - precision: 0.6402 - recall: 0.7541



[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 3ms/step - accuracy: 0.6449 - auc: 0.6889 - loss: 0.6358 - precision: 0.6402 - recall: 0.7541 - val_accuracy: 0.6534 - val_auc: 0.6612 - val_loss: 0.6683 - val_precision: 0.6625 - val_recall: 0.8176 - learning_rate: 0.0100
Epoch 2/50
[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 3ms/step - accuracy: 0.6483 - auc: 0.6928 - loss: 0.6324 - precision: 0.6425 - recall: 0.7606 - val_accuracy: 0.6556 - val_auc: 0.6602 - val_loss: 0.7733 - val_precision: 0.6538 - val_recall: 0.8602 - learning_rate: 0.0100
Epoch 3/50
[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 3ms/step - accuracy: 0.6493 - auc: 0.6946 - loss: 0.6313 - precision: 0.6442 - recall: 0.7571 - val_accuracy: 0.6379 - val_auc: 0.6609 - val_loss: 0.9370 - val_precision: 0.6797 - val_reca