This notebook contains the neural network implementation. It was written to run on Google Colab, so the file paths need to be changed if it is run in another environment.

In [None]:
import pandas as pd
import numpy as np
import math
import tensorflow as tf
# Ask TensorFlow once for the GPUs it can see and report how many there are.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# Mount Google Drive so the files under /content/drive can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Data preprocessing

In [None]:
# Load the stock-level panel.
data = pd.read_csv('/content/drive/My Drive/GKX_20201231.csv')

# Convert DATE (yyyymmdd) to a monthly key in the format "yyyymm".
data['DATE'] = data['DATE'] // 100
data = data.rename(columns={'DATE': 'yyyymm'})

# Select data from 1960 to 2019.
# The filter is applied on the monthly key with a single combined mask:
#  - chaining two boolean selections (data[m1][m2]) forces pandas to reindex
#    the second mask against the already filtered frame;
#  - a day-level upper bound of 20191230 drops any row dated 20191231, whereas
#    the monthly bound keeps all of December 2019, matching the macro filter.
in_sample_window = (data['yyyymm'] >= 196001) & (data['yyyymm'] <= 201912)
data = data[in_sample_window].reset_index(drop=True)

# Build the macroeconomic predictor table that is merged onto the stock data.
dm = pd.read_excel('/content/drive/My Drive/macro.xlsx')
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
dm = dm.loc[(dm['yyyymm'] >= 196001) & (dm['yyyymm'] <= 201912)].copy()

# Derived predictors, computed with vectorised logs instead of element-wise apply.
log_index = np.log(dm['Index'])
dm['dp'] = np.log(dm['D12']) - log_index   # log(D12) - log(Index)
dm['ep'] = np.log(dm['E12']) - log_index   # log(E12) - log(Index)
dm['tms'] = dm['lty'] - dm['tbl']
dm['dfy'] = dm['BAA'] - dm['AAA']

# Rename b/m to bm and keep only the month key plus the 8 needed macro predictors.
ned = ['yyyymm', 'dp', 'ep', 'bm', 'ntis', 'tbl', 'tms', 'dfy', 'svar']
dm = dm.rename(columns={'b/m': 'bm'})[ned]
dm.to_csv('macro.csv', index=False)

macro = pd.read_csv('macro.csv')
macro.head()

# Attach the macro predictors to every stock-month row. Macro columns whose
# names collide with stock columns receive the '_macro' suffix.
data1 = data.merge(macro, how='left', on='yyyymm', suffixes=('', '_macro'))

# Return in excess of the risk-free rate.
# NOTE(review): RET and tbl are subtracted as-is; confirm both are expressed
# in the same units and at the same frequency.
data1['excess_ret'] = data1['RET'] - data1['tbl']

# Column groups
nonpd = ['yyyymm', 'RET', 'SHROUT', 'mve0', 'prc', 'permno', 'excess_ret']  # Nonpredictors
macropd = ['dp', 'ep_macro', 'bm_macro', 'ntis', 'tbl', 'tms', 'dfy', 'svar']  # Macroeconomic predictors
sic2 = ['sic2']  # Industrial dummies
lst = nonpd + macropd + sic2

# Every remaining column is a stock-level predictor (column order preserved).
not_stock_level = set(lst)
stockpd = [col for col in data1.columns if col not in not_stock_level]
len(stockpd)

# Sorted array of the distinct months present in the panel.
period_list = np.unique(data1['yyyymm'])
len(period_list)

# Fill missing stock-level predictors with the cross-sectional median of the
# same month.
#
# Rows are grouped by their yyyymm value rather than by running row offsets,
# so the result does not depend on the frame being sorted by month. The loop
# variable is also no longer called `pd`, which used to replace the pandas
# module with a column name for the rest of the session.
# Months in which a predictor is missing for every stock keep NaN here.
period_key = data1['yyyymm']
for predictor in stockpd:
    period_median = data1[predictor].groupby(period_key).transform('median')
    data1[predictor] = data1[predictor].fillna(period_median)

# Replace every remaining missing value with 0, then drop the columns that
# are not used as predictors.
unused_columns = ['prc', 'SHROUT', 'mve0']
newdt = data1.fillna(0).drop(columns=unused_columns)
newdt.shape

# NOTE(review): the file is written to the working directory and includes the
# DataFrame index as an unnamed first column; confirm this matches what the
# later cells expect when they load merge_data.csv.
newdt.to_csv('merge_data.csv')

In [None]:
import pandas as pd
import numpy as np
import math
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler 

# Load the merged stock + macro panel used for training, validation and testing.
# NOTE(review): the preprocessing cell writes 'merge_data.csv' to the working
# directory together with its index, while this reads from Google Drive.
# Confirm the file was copied there, and that the unnamed index column it
# carries is meant to stay among the columns loaded here.
df = pd.read_csv("/content/drive/My Drive/merge_data.csv")


#clear nan
def handling_nan(df):
  """Remove missing values from ``df`` in place.

  Columns without missing values are left untouched. A column in which more
  than half of the values are missing is dropped. Any other column has its
  missing values replaced by the column mean.

  Args:
    df: pandas DataFrame to clean; it is modified in place.

  Returns:
    The same DataFrame object, returned for convenience. Existing callers
    that ignore the return value keep working.
  """
  # Iterate over a snapshot of the column labels because columns may be dropped.
  for column in list(df.columns):
    null_count = df[column].isnull().sum()
    if null_count == 0:
      continue
    # if null > half, drop this column
    if (null_count/len(df[column])) > 0.5:
      df.drop(column, axis = 1, inplace = True)
    # if null <= half, fill with the mean value
    else:
      # Assign the filled column back explicitly: calling
      # fillna(inplace=True) on df[column] acts on a temporary object and is
      # not guaranteed to update df itself.
      df[column] = df[column].fillna(df[column].mean())
  return df


# Chronological split: months before 1978 form the training set and the
# months of 1978-1989 form the validation set.
is_train = df["yyyymm"] < 197800
is_valid = (df["yyyymm"] > 197800) & (df["yyyymm"] < 199000)
train = df[is_train]
valid = df[is_valid]

# Identifier and return columns are removed from the model inputs.
non_feature_cols = ["permno", "yyyymm", 'excess_ret', 'RET']
x_train = train.drop(columns=non_feature_cols)
x_valid = valid.drop(columns=non_feature_cols)

# Targets as column vectors of shape (n_samples, 1).
y_train = train['excess_ret'].to_numpy().reshape(-1, 1)
y_valid = valid['excess_ret'].to_numpy().reshape(-1, 1)

# Standardise the features; the statistics are estimated on the training set
# and then applied unchanged to the validation set.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_valid = scaler.transform(x_valid)

# test data
def testing_data(year_of_test, feature_scaler=None):
  """Build the scaled features and targets for one calendar test year.

  Args:
    year_of_test: four-digit year; rows with
      year_of_test*100 < yyyymm < (year_of_test+1)*100 are selected.
    feature_scaler: fitted scaler applied to the features. When omitted, the
      module-level ``scaler`` (fitted on the training features) is used.

  Returns:
    Tuple ``(x_test, y_test)``: the scaled feature matrix and the excess
    returns as an (n_samples, 1) array.
  """
  end_of_test = (year_of_test+1)*100

  #handling nan (mutates the module-level df in place)
  handling_nan(df)

  test = df[(df["yyyymm"] > year_of_test*100)&(df["yyyymm"] < end_of_test)]

  #x,y
  x_test = test.drop(["permno","yyyymm",'excess_ret','RET'],axis = 1)
  y_test = np.array(test['excess_ret']).reshape(-1,1)

  #StandardScale
  # Reuse the scaler fitted on the training data, exactly as is done for the
  # validation set. Fitting a new scaler on each test year would feed the
  # model inputs on a different scale from the one it was trained on.
  if feature_scaler is None:
    feature_scaler = scaler
  x_test = feature_scaler.transform(x_test)

  return x_test, y_test

#define R^2 function
def R_square(ypred,ytrue):
  """Return 1 - sum((ytrue - ypred)^2) / sum(ytrue^2).

  The denominator is the raw sum of squared targets (the targets are not
  demeaned).

  Args:
    ypred: predicted values (array-like of any shape).
    ytrue: realised values (array-like with the same number of elements).

  Returns:
    The R^2 value as a numpy scalar.
  """
  # Flatten both inputs first: subtracting an (n, 1) array from an (n,) array
  # would otherwise broadcast to an (n, n) matrix and silently give a wrong
  # score.
  ypred = np.asarray(ypred).ravel()
  ytrue = np.asarray(ytrue).ravel()
  dif2 = np.sum(np.power(ytrue-ypred,2))
  return 1-(dif2/np.sum(np.power(ytrue,2)))

#define loss function
def R_loss(y_true, y_pred):
  """Loss: mean squared prediction error divided by the mean squared target.

  Args:
    y_true: tensor of realised values.
    y_pred: tensor of predicted values.

  Returns:
    Scalar tensor holding the ratio of the two means.
  """
  mean_sq_error = tf.reduce_mean(tf.square(y_true-y_pred))
  mean_sq_target = tf.reduce_mean(tf.square(y_true))
  return mean_sq_error/mean_sq_target


Training process (choose one of NN1–NN5)

NN1

In [None]:
# Fresh result containers for this architecture.
rsquare_oos_valid = []
rsquare_oos_test = []

# NN1: one hidden ReLU layer (32 units) followed by a single linear output.
model = keras.models.Sequential([
    keras.layers.Dense(32, activation='relu', input_shape=x_train.shape[1:]),
    keras.layers.Dense(1),
])

sgd = keras.optimizers.SGD(learning_rate=0.0002, clipnorm=0.5)

model.summary()
model.compile(loss=R_loss, optimizer=sgd)

# Stop training when val_loss has not improved by at least 0.001 for 5 epochs.
EarlyStop = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, verbose=1, min_delta=0.001, mode='min')

# Scale the learning rate by 0.2 when val_loss has stalled for 3 epochs.
Reduce = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=3, verbose=1, mode='min',
    min_delta=0.001, cooldown=0, min_lr=0)

model.fit(
    x_train,
    y_train,
    validation_data=(x_valid, y_valid),
    epochs=50,
    callbacks=[Reduce, EarlyStop])

model.save('/content/drive/My Drive/NN_models/NN1.h5')

# R^2 of the fitted model on the validation set.
valid_pred = model.predict(x_valid)
rsquare_oos_valid.append(R_square(valid_pred, y_valid))
print(rsquare_oos_valid)

NN2

In [None]:
# Fresh result containers for this architecture.
rsquare_oos_valid = []
rsquare_oos_test = []

# NN2: two hidden ReLU layers (32, 16 units) followed by a single linear output.
model = keras.models.Sequential([
    keras.layers.Dense(32, activation='relu', input_shape=x_train.shape[1:]),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1),
])

sgd = keras.optimizers.SGD(learning_rate=0.0002, clipnorm=0.5)

model.summary()
model.compile(loss=R_loss, optimizer=sgd)

# Stop training when val_loss has not improved by at least 0.001 for 5 epochs.
EarlyStop = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, verbose=1, min_delta=0.001, mode='min')

# Scale the learning rate by 0.2 when val_loss has stalled for 3 epochs.
Reduce = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=3, verbose=1, mode='min',
    min_delta=0.001, cooldown=0, min_lr=0)

model.fit(
    x_train,
    y_train,
    validation_data=(x_valid, y_valid),
    epochs=50,
    callbacks=[Reduce, EarlyStop])

model.save('/content/drive/My Drive/NN_models/NN2.h5')

# R^2 of the fitted model on the validation set.
valid_pred = model.predict(x_valid)
rsquare_oos_valid.append(R_square(valid_pred, y_valid))
print(rsquare_oos_valid)


NN3

In [None]:
# Fresh result containers for this architecture.
rsquare_oos_valid = []
rsquare_oos_test = []

# NN3: three hidden ReLU layers (32, 16, 8 units) followed by a single linear output.
model = keras.models.Sequential([
    keras.layers.Dense(32, activation='relu', input_shape=x_train.shape[1:]),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(1),
])

sgd = keras.optimizers.SGD(learning_rate=0.0002, clipnorm=0.5)

model.summary()
model.compile(loss=R_loss, optimizer=sgd)

# Stop training when val_loss has not improved by at least 0.001 for 5 epochs.
EarlyStop = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, verbose=1, min_delta=0.001, mode='min')

# Scale the learning rate by 0.2 when val_loss has stalled for 3 epochs.
Reduce = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=3, verbose=1, mode='min',
    min_delta=0.001, cooldown=0, min_lr=0)

model.fit(
    x_train,
    y_train,
    validation_data=(x_valid, y_valid),
    epochs=50,
    callbacks=[Reduce, EarlyStop])

model.save('/content/drive/My Drive/NN_models/NN3.h5')

# R^2 of the fitted model on the validation set.
valid_pred = model.predict(x_valid)
rsquare_oos_valid.append(R_square(valid_pred, y_valid))
print(rsquare_oos_valid)


NN4

In [None]:
# Fresh result containers for this architecture.
rsquare_oos_valid = []
rsquare_oos_test = []

# NN4: four hidden ReLU layers (32, 16, 8, 4 units) followed by a single linear output.
model = keras.models.Sequential([
    keras.layers.Dense(32, activation='relu', input_shape=x_train.shape[1:]),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(4, activation='relu'),
    keras.layers.Dense(1),
])

sgd = keras.optimizers.SGD(learning_rate=0.0002, clipnorm=0.5)

model.summary()
model.compile(loss=R_loss, optimizer=sgd)

# Stop training when val_loss has not improved by at least 0.001 for 5 epochs.
EarlyStop = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, verbose=1, min_delta=0.001, mode='min')

# Scale the learning rate by 0.2 when val_loss has stalled for 3 epochs.
Reduce = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=3, verbose=1, mode='min',
    min_delta=0.001, cooldown=0, min_lr=0)

model.fit(
    x_train,
    y_train,
    validation_data=(x_valid, y_valid),
    epochs=50,
    callbacks=[Reduce, EarlyStop])

model.save('/content/drive/My Drive/NN_models/NN4.h5')

# R^2 of the fitted model on the validation set.
valid_pred = model.predict(x_valid)
rsquare_oos_valid.append(R_square(valid_pred, y_valid))
print(rsquare_oos_valid)


NN5

In [None]:
# Fresh result containers for this architecture.
rsquare_oos_valid = []
rsquare_oos_test = []

# NN5: five hidden ReLU layers (32, 16, 8, 4, 2 units) followed by a single linear output.
model = keras.models.Sequential([
    keras.layers.Dense(32, activation='relu', input_shape=x_train.shape[1:]),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(4, activation='relu'),
    keras.layers.Dense(2, activation='relu'),
    keras.layers.Dense(1),
])

sgd = keras.optimizers.SGD(learning_rate=0.0002, clipnorm=0.5)

model.summary()
model.compile(loss=R_loss, optimizer=sgd)

# Stop training when val_loss has not improved by at least 0.001 for 5 epochs.
EarlyStop = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, verbose=1, min_delta=0.001, mode='min')

# Scale the learning rate by 0.2 when val_loss has stalled for 3 epochs.
Reduce = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=3, verbose=1, mode='min',
    min_delta=0.001, cooldown=0, min_lr=0)

model.fit(
    x_train,
    y_train,
    validation_data=(x_valid, y_valid),
    epochs=50,
    callbacks=[Reduce, EarlyStop])

model.save('/content/drive/My Drive/NN_models/NN5.h5')

# R^2 of the fitted model on the validation set.
valid_pred = model.predict(x_valid)
rsquare_oos_valid.append(R_square(valid_pred, y_valid))
print(rsquare_oos_valid)


Testing process

In [None]:
# Out-of-sample evaluation: score the current `model` on each year 1990-2019.
for year in range(1990, 2020):
  print("start testing the data in "+str(year))
  x_test, y_test = testing_data(year)

  year_pred = model.predict(x_test)
  rsquare_oos_test.append(R_square(year_pred, y_test))

  # Each yearly file holds every score computed so far, one value per line.
  out_path = 'R2_' + str(year) + '.csv'
  np.savetxt(out_path, rsquare_oos_test, delimiter=", ", fmt='% s')

  print(rsquare_oos_test)

In [None]:
# plot R2
import matplotlib.pyplot as plt
import csv

# The file was written by np.savetxt without a header row, so header=None is
# required. Otherwise the first score (year 1990) is consumed as the column
# name, every bar is shifted by one year and the mean omits 1990.
R2 = pd.read_csv('R2_2019.csv', header=None)
R2 = R2.to_numpy()

# One bar per test year, starting at 1990.
first_test_year = 1990
x = [first_test_year + offset for offset in range(len(R2))]
y = [row.item() for row in R2]

plt.bar(x, y, color = 'b', width = 0.72, label = "R_squre")
plt.xlabel('Years')
plt.ylabel('R_squre')
plt.title('R_squre from 1990-2019')
plt.legend()
plt.show()

# Average out-of-sample R^2 over all test years.
print(R2.mean())

In [None]:
# Variable importance: for the test year with the highest R^2, set one feature
# at a time to zero (in standardised space) and record how much R^2 changes.
feature_score_nn1 = []
handling_nan(df)

# rsquare_oos_test is a plain Python list, which has no .idxmax(); np.argmax
# gives the position of the best year (0 corresponds to 1990).
max_year = int(np.argmax(rsquare_oos_test))
# R^2 for year max_year+1990 is the max
test = df[(df["yyyymm"] > (max_year+1990)*100)&(df["yyyymm"] < (max_year+1991)*100)]

x_test = test.drop(["permno","yyyymm",'RET','excess_ret'],axis = 1)
y_test = np.array(test['excess_ret']).reshape(-1,1)

# Scale once with the scaler fitted on the training data. The baseline is
# computed here with exactly the same inputs as the perturbed runs, so the
# differences below measure only the effect of removing a feature. The global
# `scaler` is no longer overwritten inside the loop.
x_test_scaled = scaler.transform(x_test)
baseline_r2 = R_square(model.predict(x_test_scaled), y_test)

# Column 0 is skipped; the plotting cell labels the scores with
# x_test.columns[1:], so both must cover the same columns.
for col in range(1,len(x_test.columns)):
    tt = x_test_scaled.copy()
    tt[:, col] = 0
    feature_score_nn1.append(np.abs(R_square(model.predict(tt),y_test)-baseline_r2))

print(feature_score_nn1)

In [None]:
#plot variable importance
# Feature labels for the importance scores. The first column is skipped, in
# line with the loop that produced feature_score_nn1. The name `id` is
# avoided because it shadows the Python builtin of the same name.
feature_names = x_test.columns[1:]

importance_nn1 = pd.DataFrame(feature_score_nn1, columns=['Importance'], index=feature_names)
# Sort ascending and keep the 20 largest scores, so the most important
# feature ends up at the top of the horizontal bar chart.
importance_nn1 = importance_nn1.sort_values(by='Importance', ascending=True)
importance_nn1 = importance_nn1.tail(20)
importance_nn1.plot(kind='barh', figsize=(9,7))