In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [None]:
# Load GKX_20201231.csv and keep only rows dated 1960-01-01 or later.
# DATE is compared as a YYYYMMDD integer; the index is renumbered from 0.
df = pd.read_csv("/content/drive/My Drive/datashare/GKX_20201231.csv")
df = df.loc[df["DATE"].ge(19600101)].reset_index(drop=True)
df.head()

Unnamed: 0,permno,DATE,mvel1,RET,prc,SHROUT,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,mom12m,mom36m,mve0,pricedelay,turn,absacc,acc,age,agr,cashdebt,cashpr,cfp,cfp_ia,chatoia,chcsho,chempia,chinv,chpmia,convind,currat,depr,divi,divo,dy,egr,ep,...,ps,quick,rd,rd_mve,rd_sale,realestate,roic,salecash,saleinv,salerec,secured,securedind,sgr,sin,sp,tang,tb,aeavol,cash,chtx,cinvest,ear,nincr,roaq,roavol,roeq,rsup,stdacc,stdcf,ms,baspread,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,bm,bm_ia
0,10006,19600129,68773.0,0.005155,48.75,1418,1.158041,1.341058,-0.078184,9.474396,0.027283,0.136142,0.013055,-0.021572,0.046317,-0.051308,69127.5,-0.015614,0.168077,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.015272,2.059137e-07,0.033943,0.015149,0.591078,0.533983,9.859742e-08,37.0,,
1,10014,19600129,9823.5,0.138889,5.125,2183,1.838109,3.378645,-0.506541,7.946573,0.0674,0.136142,-0.027027,-0.139535,0.15625,0.52381,11187.875,0.271738,0.210261,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.035918,1.90616e-06,0.028571,0.021261,0.49165,0.481666,8.535634e-08,,,
2,10022,19600129,21133.75,-0.045455,13.125,1537,1.157077,1.338827,-0.374534,8.507143,0.043777,0.136142,0.009091,-0.007626,0.365436,0.876728,20173.125,-0.029163,0.276296,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.022607,6.754297e-07,0.04,0.026199,0.87711,1.639491,5.271194e-08,,,
3,10030,19600129,81200.0,-0.015,49.25,1624,1.327625,1.762587,-0.186923,9.550378,0.024872,0.136142,0.012,-0.024721,0.144885,0.142554,79982.0,0.029423,0.18986,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.01496,4.355397e-07,0.038462,0.020206,0.936132,0.771756,1.102852e-07,,,
4,10057,19600129,27062.5,-0.020785,53.0,500,1.194604,1.42708,-0.293635,8.138565,0.030824,0.136142,0.021028,-0.111214,0.06753,-0.270734,26500.0,-0.065354,0.103333,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.002941,3.479863e-07,0.018692,0.006685,0.451979,0.19001,0.9545457,,,


In [None]:
#Handle NA
def handling_na(df):
  """Drop or impute columns that contain missing values, in place.

  For every column of ``df``:
    * no missing values      -> left untouched;
    * more than 50% missing  -> the column is dropped;
    * otherwise              -> missing entries are replaced by the column mean.

  Args:
    df: pandas DataFrame to clean. The frame is mutated; nothing is returned.
  """
  n_rows = len(df)
  # Count missing values for all columns once instead of once per branch.
  null_counts = df.isnull().sum()

  to_drop = []
  for column, n_null in null_counts.items():
    if n_null == 0:
      continue
    if n_null / n_rows > 0.5:
      # Too sparse to impute; collect it and drop everything in one pass below.
      to_drop.append(column)
    else:
      # Assign the filled column back explicitly. Calling
      # ``df[column].fillna(..., inplace=True)`` acts on a temporary Series and
      # does not update ``df`` when pandas Copy-on-Write is enabled.
      df[column] = df[column].fillna(df[column].mean())

  if to_drop:
    df.drop(columns=to_drop, inplace=True)

In [None]:
#Given the test-set year, prepare the train/validation/test arrays automatically
def cleaning_data(year_of_test, validation_years=12, data=None):
  """Build standardized train/validation/test arrays for one test year.

  The split is by the integer ``DATE`` column (YYYYMMDD):
    * test        : dates within ``year_of_test``;
    * validation  : the ``validation_years`` years immediately before it;
    * train       : everything earlier.

  Args:
    year_of_test: calendar year used as the test set.
    validation_years: length of the validation window in years (default 12).
    data: DataFrame to split; defaults to the notebook-level ``df``.

  Returns:
    ``(x_train, x_valid, x_test, y_train, y_valid, y_test)``. The ``x_*``
    arrays are scaled with a StandardScaler fitted on the training rows only;
    the ``y_*`` arrays hold ``RET`` as column vectors of shape (n, 1).

  Raises:
    ValueError: if no rows fall into the training window.
  """
  source = df if data is None else data

  # year * 10000 is smaller than every YYYYMMDD date inside that year, so these
  # integers act as year boundaries.
  end_of_test = (year_of_test + 1) * 10000
  end_of_validation = year_of_test * 10000
  start_of_validation = (year_of_test - validation_years) * 10000

  # Work on a copy: handling_na mutates its argument, and mutating a
  # boolean-mask slice of the shared frame is a SettingWithCopy hazard.
  alldata = source[source["DATE"] < end_of_test].copy()

  # NOTE(review): NA handling runs on train+validation+test together, so the
  # drop/impute decisions see validation and test rows. Confirm this
  # look-ahead is intended before relying on the out-of-sample numbers.
  handling_na(alldata)

  # Split into train/validation/test. Lower bounds are inclusive so the three
  # windows partition ``alldata`` without gaps.
  dates = alldata["DATE"]
  train = alldata[dates < start_of_validation]
  valid = alldata[(dates >= start_of_validation) & (dates < end_of_validation)]
  test = alldata[(dates >= end_of_validation) & (dates < end_of_test)]

  if train.empty:
    raise ValueError(
        "no training rows before " + str(start_of_validation)
        + " for test year " + str(year_of_test))

  # Identifiers, the date and the target are not features. "Unnamed: 0" looks
  # like a row index saved with the CSV (presumably from an earlier to_csv), so
  # it is excluded too; errors="ignore" keeps this working when it is absent.
  non_features = ["permno", "DATE", "RET", "Unnamed: 0"]
  x_train = train.drop(columns=non_features, errors="ignore")
  x_valid = valid.drop(columns=non_features, errors="ignore")
  x_test = test.drop(columns=non_features, errors="ignore")

  y_train = train["RET"].to_numpy().reshape(-1, 1)
  y_valid = valid["RET"].to_numpy().reshape(-1, 1)
  y_test = test["RET"].to_numpy().reshape(-1, 1)

  # Standardize with statistics from the training rows only.
  scaler = StandardScaler()
  x_train = scaler.fit_transform(x_train)
  x_valid = scaler.transform(x_valid)
  x_test = scaler.transform(x_test)

  return x_train, x_valid, x_test, y_train, y_valid, y_test

In [None]:
#Out-of-sample R-squared
def R_square(ypred,ytrue):
  """Return 1 - sum((ytrue - ypred)^2) / sum(ytrue^2).

  The denominator is the raw sum of squared targets; the targets are not
  demeaned. Both inputs are flattened first, so an (n, 1) prediction and an
  (n,) target are compared element by element instead of being broadcast to
  an (n, n) matrix.

  Args:
    ypred: predicted values (array-like, or a scalar).
    ytrue: realized values (array-like, or a scalar).

  Returns:
    The R-squared as a NumPy float. The result is not finite when every
    element of ``ytrue`` is zero, because the denominator is then zero.

  Raises:
    ValueError: if the inputs hold different numbers of elements and neither
      is a single value.
  """
  ypred = np.asarray(ypred, dtype=float).ravel()
  ytrue = np.asarray(ytrue, dtype=float).ravel()
  if ypred.size != ytrue.size and 1 not in (ypred.size, ytrue.size):
    raise ValueError(
        "ypred and ytrue must have the same number of elements, got "
        + str(ypred.size) + " and " + str(ytrue.size))
  dif2 = np.sum(np.square(ytrue - ypred))
  return 1 - (dif2 / np.sum(np.square(ytrue)))

In [None]:
#Custom loss function
def R_loss(y_true, y_pred):
  """Mean squared error divided by the mean of the squared targets.

  Both means are taken over every element of the tensors passed in.
  """
  residual = y_true - y_pred
  mean_sq_error = tf.reduce_mean(tf.square(residual))
  mean_sq_target = tf.reduce_mean(tf.square(y_true))
  return mean_sq_error / mean_sq_target

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
# Out-of-sample R-squared on the validation and test windows, one entry per
# test year in loop order.
rsquare_oos_valid = []
rsquare_oos_test = []

#NN2: two hidden ReLU layers (32 and 16 units), rebuilt and refit for every test year
for year in range(1990,2021):
  print("start training the data from 1960 to "+str(year))
  x_train, x_valid, x_test, y_train, y_valid, y_test = cleaning_data(year)

  model = keras.models.Sequential()
  model.add(keras.layers.Dense(32, activation='relu', input_shape=x_train.shape[1:]))
  model.add(keras.layers.Dense(16, activation='relu'))
  model.add(keras.layers.Dense(1))

  # A fresh optimizer per model, with gradient-norm clipping.
  sgd = keras.optimizers.SGD(learning_rate = 0.0002, clipnorm = 0.5)

  model.summary()
  model.compile(loss = R_loss, optimizer= sgd)

  # Stop once val_loss has not improved by at least 0.01 for 5 epochs.
  EarlyStop = keras.callbacks.EarlyStopping(
      monitor='val_loss',
      patience=5,
      verbose=1,
      min_delta=0.01,
      mode='min')

  # Multiply the learning rate by 0.2 after 3 epochs without a 0.01 improvement.
  Reduce = keras.callbacks.ReduceLROnPlateau(
      monitor='val_loss',
      factor=0.2,
      patience=3,
      verbose=1,
      mode='min',
      min_delta=0.01,
      cooldown=0,
      min_lr=0)

  model.fit(
      x_train,
      y_train,
      validation_data = (x_valid, y_valid),
      epochs = 100,
      callbacks = [Reduce,EarlyStop])

  model.save('/content/drive/My Drive/NN_models/NN2_'+str(year)+'.h5')

  rsquare_oos_valid.append(R_square(model.predict(x_valid),y_valid))
  print(rsquare_oos_valid)
  rsquare_oos_test.append(R_square(model.predict(x_test),y_test))
  print(rsquare_oos_test)

  del model
  # Deleting the Python reference is not enough: Keras keeps global state for
  # every model built in this process, which grows across the 31 iterations.
  # Reset it so memory use does not keep increasing from year to year.
  keras.backend.clear_session()