In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow import keras

In [2]:
df = pd.read_csv("/content/drive/My Drive/datashare/GKX_20201231.csv")
df = df[df["DATE"] >= 19600101].reset_index(drop = True) #选取1960年之后的数据
df.head()

Unnamed: 0,permno,DATE,mvel1,RET,prc,SHROUT,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,mom12m,mom36m,mve0,pricedelay,turn,absacc,acc,age,agr,cashdebt,cashpr,cfp,cfp_ia,chatoia,chcsho,chempia,chinv,chpmia,convind,currat,depr,divi,divo,dy,egr,ep,...,ps,quick,rd,rd_mve,rd_sale,realestate,roic,salecash,saleinv,salerec,secured,securedind,sgr,sin,sp,tang,tb,aeavol,cash,chtx,cinvest,ear,nincr,roaq,roavol,roeq,rsup,stdacc,stdcf,ms,baspread,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,bm,bm_ia
0,10006,19600129,68773.0,0.005155,48.75,1418,1.158041,1.341058,-0.078184,9.474396,0.027283,0.136142,0.013055,-0.021572,0.046317,-0.051308,69127.5,-0.015614,0.168077,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.015272,2.059137e-07,0.033943,0.015149,0.591078,0.533983,9.859742e-08,37.0,,
1,10014,19600129,9823.5,0.138889,5.125,2183,1.838109,3.378645,-0.506541,7.946573,0.0674,0.136142,-0.027027,-0.139535,0.15625,0.52381,11187.875,0.271738,0.210261,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.035918,1.90616e-06,0.028571,0.021261,0.49165,0.481666,8.535634e-08,,,
2,10022,19600129,21133.75,-0.045455,13.125,1537,1.157077,1.338827,-0.374534,8.507143,0.043777,0.136142,0.009091,-0.007626,0.365436,0.876728,20173.125,-0.029163,0.276296,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.022607,6.754297e-07,0.04,0.026199,0.87711,1.639491,5.271194e-08,,,
3,10030,19600129,81200.0,-0.015,49.25,1624,1.327625,1.762587,-0.186923,9.550378,0.024872,0.136142,0.012,-0.024721,0.144885,0.142554,79982.0,0.029423,0.18986,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.01496,4.355397e-07,0.038462,0.020206,0.936132,0.771756,1.102852e-07,,,
4,10057,19600129,27062.5,-0.020785,53.0,500,1.194604,1.42708,-0.293635,8.138565,0.030824,0.136142,0.021028,-0.111214,0.06753,-0.270734,26500.0,-0.065354,0.103333,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.002941,3.479863e-07,0.018692,0.006685,0.451979,0.19001,0.9545457,,,


In [3]:
#储存变量重要性
variables_importance = {}
for depth in range(1,6):
  variables_importance["NN"+str(depth)]={}
  for variable in df.columns:
    variables_importance["NN"+str(depth)][variable] = []

#储存模型表现
model_performance = {}
for depth in range(1,6):
  model_performance["NN"+str(depth)]=[]

In [4]:
#Handle NA
def handling_na(df):
  for column in df.columns:
    if df[column].isnull().sum() == 0:
      continue
    #缺失值多于50%，直接drop该列
    elif (df[column].isnull().sum()/len(df[column])) > 0.5:
      df.drop(column, axis = 1, inplace = True)
    #缺失值少于50%，填充平均值 
    else:
      df[column].fillna(df[column].mean(), inplace = True)

#输入测试集年份，自动准备数据
def cleaning_data(year_of_test):
  end_of_test = (year_of_test+1)*10000
  end_of_validation = year_of_test*10000
  start_of_validation = (year_of_test-12)*10000

  alldata = df[df["DATE"] < end_of_test]

  #handling na
  handling_na(alldata)

  #划分train/validation/test
  train = alldata[alldata["DATE"] < start_of_validation]
  valid = alldata[(alldata["DATE"] > start_of_validation)&(alldata["DATE"] < end_of_validation)]
  test = alldata[(alldata["DATE"] > end_of_validation)&(alldata["DATE"] < end_of_test)]

  #x,y
  x_train = train.drop(["permno","DATE",'RET'],axis = 1)
  x_valid = valid.drop(["permno","DATE",'RET'],axis = 1)
  x_test = test.drop(["permno","DATE",'RET'],axis = 1)
  
  y_train = np.array(train['RET']).reshape(-1,1)
  y_valid = np.array(valid['RET']).reshape(-1,1)
  y_test = np.array(test['RET']).reshape(-1,1)

  variables = x_test.columns


  #StandardScale
  scaler = StandardScaler()
  x_train = scaler.fit_transform(x_train)
  x_valid = scaler.transform(x_valid)
  x_test = scaler.transform(x_test)

  return x_train, x_valid, x_test, y_train, y_valid, y_test, variables


#定义计算out of sample R square函数
def R_square(ypred,ytrue): 
  dif2=np.sum(np.power(ytrue-ypred,2))
  return 1-(dif2/np.sum(np.power(ytrue,2)))

#自定义定义损失函数
def R_loss(y_true, y_pred):
  return tf.reduce_mean(tf.square(y_true-y_pred))/tf.reduce_mean(tf.square(y_true))

In [6]:
for year in range(2000,2021):
  x_train, x_valid, x_test, y_train, y_valid, y_test, variables = cleaning_data(year)

  models = {}
  R2 = []

  for depth in range(1,6):
    print("loading the NN"+str(depth)+" model in "+str(year))
    model = keras.models.load_model("/content/drive/My Drive/NN_models/"+"NN"+str(depth)+"_"+str(year)+".h5",compile = False)
    print("calculating the performance of NN"+str(depth)+" model in "+str(year))
    R2.append(R_square(model.predict(x_test),y_test))
    model_performance["NN"+str(depth)].append(R2[depth - 1])
    models["NN"+str(depth)] = model

  '''
  for i in range(len(variables)):
    variable = variables[i]

    x_test_ar = x_test
    x_test_ar[:,i] = 0

    for depth in range(1,6):
      print("calculating the importance of "+variable+" in NN"+str(depth)+" model in "+str(year))
      variables_importance["NN"+str(depth)][variable].append(R2[depth - 1] - R_square(models["NN"+str(depth)].predict(x_test),y_test))
  '''

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2000
calculating the performance of NN1 model in 2000
loading the NN2 model in 2000
calculating the performance of NN2 model in 2000
loading the NN3 model in 2000
calculating the performance of NN3 model in 2000
loading the NN4 model in 2000
calculating the performance of NN4 model in 2000
loading the NN5 model in 2000
calculating the performance of NN5 model in 2000


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2001
calculating the performance of NN1 model in 2001
loading the NN2 model in 2001
calculating the performance of NN2 model in 2001
loading the NN3 model in 2001
calculating the performance of NN3 model in 2001
loading the NN4 model in 2001
calculating the performance of NN4 model in 2001
loading the NN5 model in 2001
calculating the performance of NN5 model in 2001


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2002
calculating the performance of NN1 model in 2002
loading the NN2 model in 2002
calculating the performance of NN2 model in 2002
loading the NN3 model in 2002
calculating the performance of NN3 model in 2002
loading the NN4 model in 2002
calculating the performance of NN4 model in 2002
loading the NN5 model in 2002
calculating the performance of NN5 model in 2002


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2003
calculating the performance of NN1 model in 2003
loading the NN2 model in 2003
calculating the performance of NN2 model in 2003
loading the NN3 model in 2003
calculating the performance of NN3 model in 2003
loading the NN4 model in 2003
calculating the performance of NN4 model in 2003
loading the NN5 model in 2003
calculating the performance of NN5 model in 2003


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2004
calculating the performance of NN1 model in 2004
loading the NN2 model in 2004
calculating the performance of NN2 model in 2004
loading the NN3 model in 2004
calculating the performance of NN3 model in 2004
loading the NN4 model in 2004
calculating the performance of NN4 model in 2004
loading the NN5 model in 2004
calculating the performance of NN5 model in 2004


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2005
calculating the performance of NN1 model in 2005
loading the NN2 model in 2005
calculating the performance of NN2 model in 2005
loading the NN3 model in 2005
calculating the performance of NN3 model in 2005
loading the NN4 model in 2005
calculating the performance of NN4 model in 2005
loading the NN5 model in 2005
calculating the performance of NN5 model in 2005


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2006
calculating the performance of NN1 model in 2006
loading the NN2 model in 2006
calculating the performance of NN2 model in 2006
loading the NN3 model in 2006
calculating the performance of NN3 model in 2006
loading the NN4 model in 2006
calculating the performance of NN4 model in 2006
loading the NN5 model in 2006
calculating the performance of NN5 model in 2006


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2007
calculating the performance of NN1 model in 2007
loading the NN2 model in 2007
calculating the performance of NN2 model in 2007
loading the NN3 model in 2007
calculating the performance of NN3 model in 2007
loading the NN4 model in 2007
calculating the performance of NN4 model in 2007
loading the NN5 model in 2007
calculating the performance of NN5 model in 2007


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2008
calculating the performance of NN1 model in 2008
loading the NN2 model in 2008
calculating the performance of NN2 model in 2008
loading the NN3 model in 2008
calculating the performance of NN3 model in 2008
loading the NN4 model in 2008
calculating the performance of NN4 model in 2008
loading the NN5 model in 2008
calculating the performance of NN5 model in 2008


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2009
calculating the performance of NN1 model in 2009
loading the NN2 model in 2009
calculating the performance of NN2 model in 2009
loading the NN3 model in 2009
calculating the performance of NN3 model in 2009
loading the NN4 model in 2009
calculating the performance of NN4 model in 2009
loading the NN5 model in 2009
calculating the performance of NN5 model in 2009


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2010
calculating the performance of NN1 model in 2010
loading the NN2 model in 2010
calculating the performance of NN2 model in 2010
loading the NN3 model in 2010
calculating the performance of NN3 model in 2010
loading the NN4 model in 2010
calculating the performance of NN4 model in 2010
loading the NN5 model in 2010
calculating the performance of NN5 model in 2010


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2011
calculating the performance of NN1 model in 2011
loading the NN2 model in 2011
calculating the performance of NN2 model in 2011
loading the NN3 model in 2011
calculating the performance of NN3 model in 2011
loading the NN4 model in 2011
calculating the performance of NN4 model in 2011
loading the NN5 model in 2011
calculating the performance of NN5 model in 2011


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2012
calculating the performance of NN1 model in 2012
loading the NN2 model in 2012
calculating the performance of NN2 model in 2012
loading the NN3 model in 2012
calculating the performance of NN3 model in 2012
loading the NN4 model in 2012
calculating the performance of NN4 model in 2012
loading the NN5 model in 2012
calculating the performance of NN5 model in 2012


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2013
calculating the performance of NN1 model in 2013
loading the NN2 model in 2013
calculating the performance of NN2 model in 2013
loading the NN3 model in 2013
calculating the performance of NN3 model in 2013
loading the NN4 model in 2013
calculating the performance of NN4 model in 2013
loading the NN5 model in 2013
calculating the performance of NN5 model in 2013


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2014
calculating the performance of NN1 model in 2014
loading the NN2 model in 2014
calculating the performance of NN2 model in 2014
loading the NN3 model in 2014
calculating the performance of NN3 model in 2014
loading the NN4 model in 2014
calculating the performance of NN4 model in 2014
loading the NN5 model in 2014
calculating the performance of NN5 model in 2014


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2015
calculating the performance of NN1 model in 2015
loading the NN2 model in 2015
calculating the performance of NN2 model in 2015
loading the NN3 model in 2015
calculating the performance of NN3 model in 2015
loading the NN4 model in 2015
calculating the performance of NN4 model in 2015
loading the NN5 model in 2015
calculating the performance of NN5 model in 2015


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2016
calculating the performance of NN1 model in 2016
loading the NN2 model in 2016
calculating the performance of NN2 model in 2016
loading the NN3 model in 2016
calculating the performance of NN3 model in 2016
loading the NN4 model in 2016
calculating the performance of NN4 model in 2016
loading the NN5 model in 2016
calculating the performance of NN5 model in 2016


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2017
calculating the performance of NN1 model in 2017
loading the NN2 model in 2017
calculating the performance of NN2 model in 2017
loading the NN3 model in 2017
calculating the performance of NN3 model in 2017
loading the NN4 model in 2017
calculating the performance of NN4 model in 2017
loading the NN5 model in 2017
calculating the performance of NN5 model in 2017


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2018
calculating the performance of NN1 model in 2018
loading the NN2 model in 2018
calculating the performance of NN2 model in 2018
loading the NN3 model in 2018
calculating the performance of NN3 model in 2018
loading the NN4 model in 2018
calculating the performance of NN4 model in 2018
loading the NN5 model in 2018
calculating the performance of NN5 model in 2018


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


loading the NN1 model in 2019
calculating the performance of NN1 model in 2019
loading the NN2 model in 2019
calculating the performance of NN2 model in 2019
loading the NN3 model in 2019
calculating the performance of NN3 model in 2019
loading the NN4 model in 2019
calculating the performance of NN4 model in 2019
loading the NN5 model in 2019
calculating the performance of NN5 model in 2019
loading the NN1 model in 2020
calculating the performance of NN1 model in 2020
loading the NN2 model in 2020
calculating the performance of NN2 model in 2020
loading the NN3 model in 2020
calculating the performance of NN3 model in 2020
loading the NN4 model in 2020
calculating the performance of NN4 model in 2020
loading the NN5 model in 2020
calculating the performance of NN5 model in 2020


In [15]:
for depth in range(1,6):
  ar = np.array(model_performance['NN'+str(depth)])
  print(ar[~np.isnan(ar)].max(),ar[~np.isnan(ar)].min())

0.0070290253136827685 -0.5503108530362153
0.011596167510019573 -0.3741507934668249
0.009543322614161354 -0.5858633453943152
0.021076643456462096 -0.7802475603404495
0.021227531341382555 -0.21983425573173565
