In [None]:
# My imports
import numpy as np
import pandas as pd
import warnings
import tensorflow as tf
from sklearn.linear_model import HuberRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers
from sklearn.preprocessing import MinMaxScaler

# Silence every Python warning for the rest of the session. Note that this also
# hides deprecation and performance warnings raised by any later cell.
warnings.filterwarnings('ignore')

### Data Preparation

In [None]:
## Load 94 firm characteristics dataset
data_ch = pd.read_csv('GKX_20201231.csv')

# Parse the YYYYMMDD stamp and roll every observation to its calendar month-end.
month_end = pd.to_datetime(data_ch['DATE'], format='%Y%m%d') + pd.offsets.MonthEnd(0)
data_ch['DATE'] = month_end

# Keep the sample window only (both bounds inclusive).
in_sample = data_ch['DATE'].between('1957-01-31', '2016-12-31')
data_ch = data_ch[in_sample]

# Remove the columns listed in `excluded`; `cols_new` is reused by a later cell
# to derive the list of characteristic columns.
cols = data_ch.columns.tolist()
excluded = ['permno', 'prc', 'SHROUT', 'mve0']
cols_new = [col for col in cols if col not in excluded]
data_ch = data_ch[cols_new]

In [None]:
# Construct dummy variables and remove the samples with missing 'sic2'
data_ch = data_ch.dropna(subset=['sic2']).reset_index(drop=True)

# Take the 'sic2' column out of the frame and one-hot encode it as float64
# indicator columns (the index was just reset, so the concat aligns row by row).
sic2_codes = data_ch.pop('sic2')
dummies = pd.get_dummies(sic2_codes, prefix='dum_', dtype=np.float64)
data_ch = pd.concat([data_ch, dummies], axis=1)

In [None]:
# Replace all missings of firm characteristics with 0
# Every retained column other than the date, the return and 'sic2' is treated
# as a firm characteristic.
non_characteristics = ['DATE', 'RET', 'sic2']
chas = [col for col in cols_new if col not in non_characteristics]
data_ch[chas] = data_ch[chas].fillna(0)

In [None]:
## Load 8 macroeconomic predictors
data_ma = pd.read_csv('PredictorData2023.csv')

# 'yyyymm' is parsed as year+month and moved to the last day of that month so
# that it can be matched against the month-end dates of the stock panel.
data_ma['yyyymm'] = pd.to_datetime(data_ma['yyyymm'], format='%Y%m') + pd.offsets.MonthEnd(0)

# Restrict to the sample window (both bounds inclusive) and renumber the rows.
sample_window = data_ma['yyyymm'].between('1957-01-31', '2016-12-31')
data_ma = data_ma.loc[sample_window].reset_index(drop=True)

In [None]:
# Construct 8 macroeconomic predictors
ma_predictors = ['dp', 'ep', 'bm', 'ntis', 'tbl', 'tms', 'dfy', 'svar']

# 'Index' is read as text with thousands separators; strip the commas before
# converting it to a float.
index_level = data_ma['Index'].str.replace(',', '').astype('float64')

data_ma = (
    data_ma
    .assign(
        dp=np.log(data_ma['D12'] / index_level),   # log(D12 / Index)
        ep=np.log(data_ma['E12'] / index_level),   # log(E12 / Index)
        tms=data_ma['lty'] - data_ma['tbl'],       # lty minus tbl
        dfy=data_ma['BAA'] - data_ma['AAA'],       # BAA minus AAA
    )
    .rename(columns={'b/m': 'bm'})
    [['yyyymm'] + ma_predictors]                   # keep the date key and the predictors only
)

In [None]:
# Construct the dataset including all covariates
# Left-join the macro predictors onto every stock-month row by date; the result
# has one row per row of `data_ch`, in the same order.
data_ma_long = pd.merge(
    data_ch['DATE'], data_ma, left_on='DATE', right_on='yyyymm', how='left'
).drop(columns='yyyymm')

# Add one interaction column per (characteristic, macro predictor) pair,
# named '<characteristic>_<predictor>'.
for cha in chas:
    cha_values = data_ch[cha]
    for predictor in ma_predictors:
        data_ch[f'{cha}_{predictor}'] = cha_values * data_ma_long[predictor]

# `data` is another name for the same frame, not a copy.
data = data_ch

In [None]:
# Split the dataset
def get_data_split(data, str, end, model=None):
    """Return the covariate matrix and return vector for a date window.

    Parameters
    ----------
    data : pandas.DataFrame
        Panel holding a 'DATE' column, a 'RET' column and the covariates.
    str : date-like
        Inclusive lower bound on 'DATE' (the name shadows the builtin inside
        this function only).
    end : date-like
        Inclusive upper bound on 'DATE'.
    model : optional
        When equal to 'ols3', only 'mvel1', 'bm' and 'mom1m' are used as
        covariates; otherwise every column except 'RET' and 'DATE' is used.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``: the selected covariates and the 'RET' values of the rows
        whose 'DATE' lies within ``[str, end]``.
    """
    if model == 'ols3':
        covariates = ['mvel1', 'bm', 'mom1m']
    else:
        covariates = [col for col in data.columns if col not in ('RET', 'DATE')]

    # Filter the rows once and read both outputs from the same window.
    window = data[(data['DATE'] >= str) & (data['DATE'] <= end)]
    return window[covariates].to_numpy(), window['RET'].to_numpy()

def r2_score(y, yhat):
    """Return the out-of-sample R^2, in percent, against a zero forecast.

    Computes ``(1 - sum((y - yhat)^2) / sum(y^2)) * 100``. The denominator is
    the raw sum of squared ``y`` (not demeaned).

    Parameters
    ----------
    y : array-like
        Realised values.
    yhat : array-like
        Predicted values, with the same number of elements as ``y``.

    Returns
    -------
    numpy.float64
        The R^2 in percent. It is ``nan`` or ``-inf`` when ``sum(y^2)`` is zero.
    """
    # Flatten both inputs so that column-vector predictions of shape (n, 1) are
    # compared element-wise with targets of shape (n,) instead of being
    # broadcast into an n x n matrix.
    y = np.asarray(y, dtype=np.float64).ravel()
    yhat = np.asarray(yhat, dtype=np.float64).ravel()

    # Vectorised reductions; the builtin sum() would iterate element by element.
    residual_ss = np.sum((y - yhat) ** 2)
    total_ss = np.sum(y ** 2)
    return (1 - residual_ss / total_ss) * 100

In [None]:
# Start and end dates of the initial training, validation and test periods.
train_str, train_end = '1957-01-31', '1974-12-31'
val_str, val_end = '1975-01-31', '1986-12-31'
test_str, test_end = '1987-01-31', '2016-12-31'

### OLS (top 1,000)

#### OLS

In [None]:
# Get the top 1,000 stocks by 'mvel1' within each month.
# The ranking is done on a two-column view instead of groupby('DATE').apply(...)
# over the whole frame: applying over the grouping column is deprecated in
# pandas 2.2, and once grouping columns are excluded from the applied groups
# the 'DATE' column would be lost after reset_index(drop=True).
# Rows come out ordered by DATE, then by descending 'mvel1', as with nlargest.
size_order = (
    data[['DATE', 'mvel1']]
    .reset_index(drop=True)   # positional labels, so the selection works for any index
    .sort_values(['DATE', 'mvel1'], ascending=[True, False])
)
top_positions = size_order.groupby('DATE').head(1000).index.to_numpy()
dt = data.iloc[top_positions].reset_index(drop=True)

In [None]:
# OLS with Huber loss using top 1,000 stocks
ols_oos = []
# The training window always starts at the same date and grows by one year per
# iteration. (Named `train_start` rather than `str` so the builtin is not shadowed.)
train_start = pd.to_datetime(train_str)
for i in range(30):
    # Get training dataset, test dataset split: train up to `end`, then predict
    # the following twelve months.
    end = pd.to_datetime(val_end) + pd.DateOffset(years=i)
    oos_str = end + pd.DateOffset(months=1)
    oos_end = end + pd.DateOffset(years=1)

    X_train, y_train = get_data_split(dt, train_start, end)
    X_test, y_test = get_data_split(dt, oos_str, oos_end)

    # Fit the scaler on the training window only and reuse it for the test
    # year; re-fitting on the test data would scale the two sets differently
    # and leak test-set information into the features.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Fit the OLS model using all covariates returned by get_data_split.
    # epsilon is set to 0.1% of the number of training rows.
    # NOTE(review): HuberRegressor's epsilon is its outlier threshold, so a
    # value this large presumably makes the loss close to plain least squares
    # - confirm this is intended.
    epsilon = X_train.shape[0] * 0.001
    ols = HuberRegressor(fit_intercept=False, epsilon=epsilon)
    ols.fit(X_train, y_train)
    ols_oos.append(ols.predict(X_test))

In [None]:
# Save oos predictions to a csv file
y_pred = np.concatenate(ols_oos)
# Write mode replaces the file on every run. Append mode would stack a second
# copy of the predictions when the cell is re-run, and the values read back
# below would no longer match y_test in length.
with open('test_ols.csv', 'w') as f:
    np.savetxt(f, y_pred, delimiter=',')

# Compute 30 years' out of sample R^2 for OLS
a = pd.read_csv('test_ols.csv', header=None)
y_test = get_data_split(dt, test_str, test_end)[1]
r2_score(y_test, np.squeeze(a.values))

#### OLS-3

In [None]:
# OLS-3 with Huber loss using top 1,000 stocks
ols3_oos = []
# The training window always starts at the same date and grows by one year per
# iteration. (Named `train_start` rather than `str` so the builtin is not shadowed.)
train_start = pd.to_datetime(train_str)
for i in range(30):
    # Get training dataset, test dataset split: train up to `end`, then predict
    # the following twelve months.
    end = pd.to_datetime(val_end) + pd.DateOffset(years=i)
    oos_str = end + pd.DateOffset(months=1)
    oos_end = end + pd.DateOffset(years=1)

    X_train, y_train = get_data_split(dt, train_start, end, model='ols3')
    X_test, y_test = get_data_split(dt, oos_str, oos_end, model='ols3')

    # Fit the scaler on the training window only and reuse it for the test
    # year; re-fitting on the test data would scale the two sets differently
    # and leak test-set information into the features.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Fit OLS-3 (covariates: 'mvel1', 'bm', 'mom1m').
    # epsilon is set to 0.1% of the number of training rows.
    # NOTE(review): HuberRegressor's epsilon is its outlier threshold, so a
    # value this large presumably makes the loss close to plain least squares
    # - confirm this is intended.
    epsilon = X_train.shape[0] * 0.001
    ols3 = HuberRegressor(fit_intercept=False, epsilon=epsilon)
    ols3.fit(X_train, y_train)
    ols3_oos.append(ols3.predict(X_test))

In [None]:
# Save oos predictions to a csv file
y_pred = np.concatenate(ols3_oos)
# Write mode replaces the file on every run. Append mode would stack a second
# copy of the predictions when the cell is re-run, and the values read back
# below would no longer match y_test in length.
with open('test_ols3.csv', 'w') as f:
    np.savetxt(f, y_pred, delimiter=',')

# Compute 30 years' out of sample R^2 for OLS-3
a = pd.read_csv('test_ols3.csv', header=None)
# Only the returns are needed here; model='ols3' selects the same rows while
# avoiding materialising the full covariate matrix.
y_test = get_data_split(dt, test_str, test_end, model='ols3')[1]
r2_score(y_test, np.squeeze(a.values))

### Neural Network (all data)

In [None]:
# Get the model
def get_model(hidden_units, l1_coeff, learning_rate):
    """Build and compile a feed-forward regression network.

    Parameters
    ----------
    hidden_units : sequence of int
        Width of each hidden ``Dense`` layer, in order. An empty sequence
        yields a model made of the single output layer only.
    l1_coeff : float
        L1 penalty applied to the kernel of every hidden layer.
    learning_rate : float
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    A ``Sequential`` model compiled with the Adam optimizer and 'mse' loss.
    No input layer is declared here.
    """
    model = Sequential()
    for units in hidden_units:
        model.add(Dense(units, activation='relu',
                        kernel_regularizer=regularizers.l1(l1_coeff)))
    # The output layer is one unit with no activation and no regulariser,
    # whatever the number of hidden layers.
    model.add(Dense(1))
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='mse')
    return model

#### NN1~NN5

In [None]:
nn_oos = []
# The training window always starts at the same date and grows by one year per
# iteration. (Named `train_start` rather than `str` so the builtin is not shadowed.)
train_start = pd.to_datetime(train_str)
for i in range(0, 30):
    # A new model is built in every iteration; drop the Keras state left over
    # from the previous one so it does not accumulate across the 30 windows.
    tf.keras.backend.clear_session()

    # Get training, validation and test dataset split: the training window
    # ends at `end`, the next 12 years are used for validation and the year
    # after that is predicted out of sample.
    end = pd.to_datetime(train_end) + pd.DateOffset(years=i)
    mid_str = end + pd.DateOffset(months=1)
    mid_end = end + pd.DateOffset(years=12)
    oos_str = mid_end + pd.DateOffset(months=1)
    oos_end = mid_end + pd.DateOffset(years=1)

    X_train, y_train = get_data_split(data, train_start, end)
    X_val, y_val = get_data_split(data, mid_str, mid_end)
    X_test, y_test = get_data_split(data, oos_str, oos_end)

    # Scale the input features with the min/max of the training window. The
    # validation and test sets reuse the fitted scaler; re-fitting on them
    # would scale each set differently and leak their information.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    # Create the training, validation and test Datasets
    train_ds = tf.data.Dataset.from_tensor_slices((X_train.astype(np.float32), y_train.astype(np.float32)))
    val_ds = tf.data.Dataset.from_tensor_slices((X_val.astype(np.float32), y_val.astype(np.float32)))
    test_ds = tf.data.Dataset.from_tensor_slices((X_test.astype(np.float32), y_test.astype(np.float32)))

    # Batch and prefetch the Datasets
    train_ds = train_ds.batch(10000).prefetch(tf.data.AUTOTUNE)
    val_ds = val_ds.batch(10000).prefetch(tf.data.AUTOTUNE)
    test_ds = test_ds.batch(10000).prefetch(tf.data.AUTOTUNE)

    # Fit NN: five hidden layers, training stops once the validation loss has
    # not improved for 5 consecutive epochs (at most 50 epochs).
    model = get_model(hidden_units=[32, 16, 8, 4, 2], l1_coeff=0.0001, learning_rate=0.005)
    earlystopping = EarlyStopping(monitor='val_loss', patience=5)
    model.fit(train_ds, epochs=50, validation_data=val_ds, verbose=False,
              callbacks=[earlystopping])

    # Predict oos
    nn_oos.append(model.predict(test_ds))

In [None]:
# Save oos predictions to a csv file
y_pred = np.squeeze(np.concatenate(nn_oos))
# Write mode replaces the file on every run. Append mode would stack a second
# copy of the predictions when the cell is re-run, and the values read back
# below would no longer match y_test in length.
with open('test_nn5.csv', 'w') as f:
    np.savetxt(f, y_pred, delimiter=',')

# Compute 30 years' out of sample R^2 for NN
a = pd.read_csv('test_nn5.csv', header=None)
# NOTE(review): the predictions are stacked one test year after another, while
# y_test is read in the row order of `data` over the whole test period. The two
# line up only if `data` is sorted by DATE - confirm.
y_test = get_data_split(data, test_str, test_end)[1]
r2_score(y_test, np.squeeze(a.values))