In [None]:
import numpy as np
import pandas as pd
from google.colab import drive

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_auc_score, roc_curve, precision_recall_curve
)
from sklearn.decomposition import KernelPCA

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import AUC, Precision, Recall
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Attach Google Drive to the Colab filesystem; the CSV loaded below lives there.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Load the full panel, then keep only rows whose 'date' falls inside the
# 2004-2024 study window (both endpoints inclusive).
# NOTE(review): 'date' is compared against ISO strings; this presumably relies
# on the column being stored as 'YYYY-MM-DD' text -- confirm.
stock_data = pd.read_csv("/content/drive/MyDrive/final_data.csv")
in_study_window = stock_data['date'].between('2004-01-01', '2024-12-31')
stock_data = stock_data[in_study_window]

In [None]:
# Column groups used to slice the panel. Only factor_col, fin_col and
# market_col feed the model in this section; id_col / time_col are listed
# for reference.

# Security / industry identifiers.
id_col = [
    "PERMNO",
    "CUSIP",
    "Ticker",
    "SICCD",
    "NAICS",
]

# Calendar column.
time_col = ["MthCalDt"]

# Price-based factor columns.
factor_col = [
    "mom_12",
    "mom_6",
    "vol_12",
    "vol_6",
    "rev_1",
    "rvol_1",
    "beta",
    "rsi_6",
    "trend_strength",
]

# Market / anomaly characteristic columns.
market_col = [
    "qmj_safety",
    "seas_11_15na",
    "ret_3_1",
    "iskew_ff3_21d",
    "rskew_21d",
    "sti_gr1a",
    "earnings_variability",
    "nfna_gr1a",
    "seas_16_20an",
    "corr_1260d",
]

# Financial-statement columns.
fin_col = [
    "capxy",
    "chechy",
    "cshfdy",
    "cshpry",
    "dltry",
    "dpcy",
    "epspxy",
    "oibdpy",
    "txty",
]

In [None]:
# Chronological split on the 'year' column (no shuffling):
#   train: year <= 2018 | validation: 2019-2021 | test: year > 2021
sample_year = stock_data["year"]
train_set = stock_data[sample_year <= 2018]
val_set = stock_data[(sample_year > 2018) & (sample_year <= 2021)]
test_set = stock_data[sample_year > 2021]

In [None]:
# Full model feature list: the three column groups followed by three extra
# columns, in this exact order.
ml_col = [*factor_col, *fin_col, *market_col, "quarter", "naics", "ind_1"]

In [None]:
def _features_and_target(frame):
    """Return (feature ndarray over ml_col, 'pred_cat' Series) for one split."""
    return frame[ml_col].values, frame["pred_cat"]


X_train, y_train = _features_and_target(train_set)
X_val, y_val = _features_and_target(val_set)
X_test, y_test = _features_and_target(test_set)

In [None]:
from sklearn.kernel_approximation import RBFSampler

# Random Fourier features approximating an RBF kernel, appended to the
# original features.
#
# The RBF projection depends on the scale of its inputs: with
# gamma = 1 / n_features and unstandardised columns, the largest-magnitude
# features dominate the random projections. So the inputs to the mapper are
# standardised first. The scaler is fitted on the training split only, so no
# validation/test statistics leak into the transformation.
rbf_scaler = StandardScaler()
X_train_scaled = rbf_scaler.fit_transform(X_train)
X_val_scaled = rbf_scaler.transform(X_val)
X_test_scaled = rbf_scaler.transform(X_test)

rbf_mapper = RBFSampler(gamma=1.0/X_train.shape[1], n_components=100, random_state=42)
rbf_mapper.fit(X_train_scaled)

# The pass-through block keeps the original (unscaled) features, exactly as
# before; only the 100 appended RBF columns are computed from scaled inputs.
X_train_rbf = np.concatenate([X_train, rbf_mapper.transform(X_train_scaled)], axis=1)
X_val_rbf = np.concatenate([X_val, rbf_mapper.transform(X_val_scaled)], axis=1)
X_test_rbf = np.concatenate([X_test, rbf_mapper.transform(X_test_scaled)], axis=1)

In [None]:
def create_dnn_model(input_dim, hidden_layers=(256, 128, 64), dropout_rate=0.3):
    """Build an uncompiled feed-forward network with a single sigmoid output.

    Architecture: Input -> BatchNormalization -> for each entry in
    ``hidden_layers``: Dense(relu) -> BatchNormalization -> Dropout ->
    Dense(1, sigmoid).

    Args:
        input_dim: Number of input features per sample.
        hidden_layers: Iterable of unit counts, one per hidden Dense layer.
            The default is an immutable tuple so it cannot be mutated
            between calls; lists are still accepted.
        dropout_rate: Dropout rate applied after every hidden block.

    Returns:
        A ``Sequential`` model; the caller is responsible for compiling it.
    """
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        # Normalises the raw inputs before the first Dense layer.
        layers.BatchNormalization(),
    ])

    for units in hidden_layers:
        model.add(layers.Dense(units, activation='relu'))
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(dropout_rate))

    model.add(layers.Dense(1, activation='sigmoid'))

    return model


# Seed the Python, NumPy and TensorFlow RNGs before the model is built so that
# weight initialisation, dropout masks and batch shuffling are repeatable on a
# fresh kernel. NOTE(review): bit-exact reproducibility on GPU may additionally
# require deterministic ops -- confirm if that level is needed.
keras.utils.set_random_seed(42)

dnn_model = create_dnn_model(X_train_rbf.shape[1])

dnn_model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss=BinaryCrossentropy(),
    metrics=['accuracy', AUC(name='auc'), Precision(name='precision'), Recall(name='recall')]
)

print("Model Structure:")
dnn_model.summary()

print("Training")
# All three callbacks track validation loss; the monitor is spelled out so the
# stopping / LR-decay / checkpoint criteria are visible at a glance.
dnn_callbacks = [
    # Stop after 20 epochs without improvement and roll back to the best weights.
    callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True),
    # Halve the learning rate after 10 epochs without improvement.
    callbacks.ReduceLROnPlateau(monitor='val_loss', patience=10, factor=0.5),
    # Keep only the best-scoring model on disk.
    callbacks.ModelCheckpoint('best_dnn_model.h5', monitor='val_loss', save_best_only=True)
]

dnn_history = dnn_model.fit(
    X_train_rbf, y_train,
    validation_data=(X_val_rbf, y_val),
    epochs=50,
    batch_size=32,
    callbacks=dnn_callbacks,
    verbose=1
)


# predict() returns a column vector of shape (n_samples, 1). The stored arrays
# keep that shape, but the sklearn metrics receive flattened 1-D views so they
# line up with the 1-D y_test without relying on implicit reshaping.
dnn_pred_proba = dnn_model.predict(X_test_rbf)
dnn_pred = (dnn_pred_proba > 0.5).astype(int)
dnn_accuracy = accuracy_score(y_test, dnn_pred.ravel())
dnn_auc = roc_auc_score(y_test, dnn_pred_proba.ravel())

print(f"DNN - Accuracy: {dnn_accuracy:.4f}, AUC: {dnn_auc:.4f}")
print()


Model Structure:



训练DNN模型...
Epoch 1/50
[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6155 - auc: 0.6517 - loss: 0.6566 - precision: 0.6152 - recall: 0.7341



[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 4ms/step - accuracy: 0.6155 - auc: 0.6517 - loss: 0.6566 - precision: 0.6152 - recall: 0.7341 - val_accuracy: 0.5544 - val_auc: 0.5558 - val_loss: 1.2232 - val_precision: 0.6239 - val_recall: 0.5789 - learning_rate: 0.0100
Epoch 2/50
[1m31102/31103[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.6337 - auc: 0.6734 - loss: 0.6434 - precision: 0.6287 - recall: 0.7556



[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 4ms/step - accuracy: 0.6337 - auc: 0.6734 - loss: 0.6434 - precision: 0.6287 - recall: 0.7556 - val_accuracy: 0.5992 - val_auc: 0.5929 - val_loss: 0.7776 - val_precision: 0.6318 - val_recall: 0.7364 - learning_rate: 0.0100
Epoch 3/50
[1m31095/31103[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.6363 - auc: 0.6777 - loss: 0.6412 - precision: 0.6316 - recall: 0.7518



[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 3ms/step - accuracy: 0.6363 - auc: 0.6777 - loss: 0.6412 - precision: 0.6316 - recall: 0.7518 - val_accuracy: 0.5733 - val_auc: 0.6072 - val_loss: 0.7275 - val_precision: 0.6426 - val_recall: 0.5913 - learning_rate: 0.0100
Epoch 4/50
[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 3ms/step - accuracy: 0.6382 - auc: 0.6802 - loss: 0.6399 - precision: 0.6339 - recall: 0.7515 - val_accuracy: 0.5847 - val_auc: 0.5386 - val_loss: 2.0796 - val_precision: 0.6087 - val_recall: 0.7901 - learning_rate: 0.0100
Epoch 5/50
[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 3ms/step - accuracy: 0.6408 - auc: 0.6829 - loss: 0.6381 - precision: 0.6357 - recall: 0.7581 - val_accuracy: 0.5936 - val_auc: 0.5810 - val_loss: 3.9223 - val_precision: 0.6341 - val_rec



[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 3ms/step - accuracy: 0.6406 - auc: 0.6838 - loss: 0.6379 - precision: 0.6366 - recall: 0.7543 - val_accuracy: 0.5756 - val_auc: 0.6145 - val_loss: 0.7165 - val_precision: 0.6599 - val_recall: 0.5499 - learning_rate: 0.0100
Epoch 7/50
[1m31086/31103[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.6415 - auc: 0.6843 - loss: 0.6376 - precision: 0.6370 - recall: 0.7549



[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 3ms/step - accuracy: 0.6415 - auc: 0.6843 - loss: 0.6376 - precision: 0.6370 - recall: 0.7549 - val_accuracy: 0.6063 - val_auc: 0.6094 - val_loss: 0.6820 - val_precision: 0.6155 - val_recall: 0.8513 - learning_rate: 0.0100
Epoch 8/50
[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 3ms/step - accuracy: 0.6420 - auc: 0.6855 - loss: 0.6367 - precision: 0.6369 - recall: 0.7560 - val_accuracy: 0.5887 - val_auc: 0.5494 - val_loss: 1.4159 - val_precision: 0.6065 - val_recall: 0.8236 - learning_rate: 0.0100
Epoch 9/50
[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 3ms/step - accuracy: 0.6425 - auc: 0.6867 - loss: 0.6359 - precision: 0.6378 - recall: 0.7561 - val_accuracy: 0.5904 - val_auc: 0.5901 - val_loss: 0.7617 - val_precision: 0.6367 - val_re



[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 4ms/step - accuracy: 0.6475 - auc: 0.6938 - loss: 0.6312 - precision: 0.6421 - recall: 0.7572 - val_accuracy: 0.6191 - val_auc: 0.6231 - val_loss: 0.6788 - val_precision: 0.6466 - val_recall: 0.7537 - learning_rate: 0.0050
Epoch 23/50
[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 4ms/step - accuracy: 0.6463 - auc: 0.6930 - loss: 0.6316 - precision: 0.6408 - recall: 0.7572 - val_accuracy: 0.6081 - val_auc: 0.6253 - val_loss: 0.7440 - val_precision: 0.6518 - val_recall: 0.6927 - learning_rate: 0.0050
Epoch 24/50
[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 4ms/step - accuracy: 0.6467 - auc: 0.6930 - loss: 0.6315 - precision: 0.6414 - recall: 0.7585 - val_accuracy: 0.5639 - val_auc: 0.5588 - val_loss: 0.8152 - val_precision: 0.6102 - val_



[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 4ms/step - accuracy: 0.6463 - auc: 0.6928 - loss: 0.6316 - precision: 0.6406 - recall: 0.7560 - val_accuracy: 0.6124 - val_auc: 0.5913 - val_loss: 0.6726 - val_precision: 0.6349 - val_recall: 0.7770 - learning_rate: 0.0050
Epoch 28/50
[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 4ms/step - accuracy: 0.6457 - auc: 0.6927 - loss: 0.6318 - precision: 0.6407 - recall: 0.7556 - val_accuracy: 0.6150 - val_auc: 0.6098 - val_loss: 0.7257 - val_precision: 0.6390 - val_recall: 0.7691 - learning_rate: 0.0050
Epoch 29/50
[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 4ms/step - accuracy: 0.6467 - auc: 0.6934 - loss: 0.6313 - precision: 0.6419 - recall: 0.7564 - val_accuracy: 0.6140 - val_auc: 0.6006 - val_loss: 0.6879 - val_precision: 0.6387 - val_



[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 3ms/step - accuracy: 0.6481 - auc: 0.6962 - loss: 0.6292 - precision: 0.6424 - recall: 0.7597 - val_accuracy: 0.6125 - val_auc: 0.6045 - val_loss: 0.6681 - val_precision: 0.6447 - val_recall: 0.7357 - learning_rate: 0.0025
Epoch 46/50
[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 3ms/step - accuracy: 0.6481 - auc: 0.6958 - loss: 0.6294 - precision: 0.6428 - recall: 0.7579 - val_accuracy: 0.6126 - val_auc: 0.6022 - val_loss: 0.6719 - val_precision: 0.6446 - val_recall: 0.7364 - learning_rate: 0.0025
Epoch 47/50
[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 3ms/step - accuracy: 0.6483 - auc: 0.6960 - loss: 0.6293 - precision: 0.6430 - recall: 0.7574 - val_accuracy: 0.6095 - val_auc: 0.5992 - val_loss: 0.6796 - val_precision: 0.6471 - val_



[1m31103/31103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 3ms/step - accuracy: 0.6489 - auc: 0.6969 - loss: 0.6287 - precision: 0.6440 - recall: 0.7566 - val_accuracy: 0.6142 - val_auc: 0.6023 - val_loss: 0.6650 - val_precision: 0.6392 - val_recall: 0.7651 - learning_rate: 0.0025
[1m8863/8863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1ms/step
DNN - Accuracy: 0.5434, AUC: 0.6358

