In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [None]:
# Move the kernel's working directory up one level (IPython automagic form of `%cd ../`).
# NOTE(review): relative paths used in later cells resolve against the new
# directory, and re-running this cell moves up one more level each time,
# so it is only safe to execute once per kernel session.
cd ../

In [None]:
# Load the preprocessed panel and keep only rows whose DATE is 19900101 or
# later (DATE is compared as an integer, presumably YYYYMMDD — TODO confirm),
# then renumber the rows from zero.
df = (
    pd.read_csv("../input/empirical-asset-pricing/preprocessed_data.csv")
    .loc[lambda frame: frame["DATE"] >= 19900101]
    .reset_index(drop=True)
)
df.head()

In [None]:
# Handle missing values
def handling_na(df):
  """Clean missing values in ``df`` in place.

  Columns without missing values are left untouched. A column whose share of
  missing values is strictly greater than 50% is dropped. Any other column
  with missing values has them replaced by that column's mean.

  Args:
    df: pandas DataFrame to clean. It is modified in place.

  Returns:
    None.
  """
  # Snapshot the labels first: columns may be dropped from df inside the loop.
  for column in list(df.columns):
    n_missing = df[column].isnull().sum()
    if n_missing == 0:
      continue
    if n_missing / len(df[column]) > 0.5:
      # More than half of the values are missing: drop the whole column.
      df.drop(column, axis = 1, inplace = True)
    else:
      # Half or fewer are missing: fill with the column mean.
      # Assign the result back instead of calling fillna(inplace=True) on
      # df[column]; the chained in-place form operates on a temporary object
      # and does not update df when pandas copy-on-write is active.
      df[column] = df[column].fillna(df[column].mean())

In [None]:
# Given the test-set year, prepare all model inputs automatically
def cleaning_data(year_of_test):
  """Build standardized train/validation/test arrays for one test year.

  Reads the module-level DataFrame ``df`` and compares its "DATE" column
  against ``year * 10000`` cut-offs (so DATE is presumably a YYYYMMDD
  integer — TODO confirm). The splits are half-open date intervals:

    * train:      DATE <  (year_of_test - 12) * 10000
    * validation: (year_of_test - 12) * 10000 <= DATE < year_of_test * 10000
    * test:       year_of_test * 10000 <= DATE < (year_of_test + 1) * 10000

  Args:
    year_of_test: calendar year used as the test period.

  Returns:
    Tuple ``(x_train, x_valid, x_test, y_train, y_valid, y_test)``. The x
    values are the outputs of a StandardScaler fitted on the training
    features only; the y values are the "RET" column reshaped to (n, 1).
  """
  end_of_test = (year_of_test+1)*10000
  end_of_validation = year_of_test*10000
  start_of_validation = (year_of_test-12)*10000

  # Boolean filtering returns a new frame, so the in-place cleaning below
  # does not modify the global df.
  alldata = df[df["DATE"] < end_of_test]

  # Handle missing values.
  # NOTE(review): this runs before the split, so column drops and fill
  # values are derived from train, validation and test rows together.
  handling_na(alldata)

  # Split into train/validation/test. Lower bounds are inclusive so that a
  # row whose DATE equals a cut-off is not silently left out of every split.
  dates = alldata["DATE"]
  train = alldata[dates < start_of_validation]
  valid = alldata[(dates >= start_of_validation)&(dates < end_of_validation)]
  test = alldata[(dates >= end_of_validation)&(dates < end_of_test)]

  # Features are every column except the identifier, the date and the target.
  non_feature_columns = ["permno","DATE",'RET']
  x_train = train.drop(non_feature_columns,axis = 1)
  x_valid = valid.drop(non_feature_columns,axis = 1)
  x_test = test.drop(non_feature_columns,axis = 1)

  # Targets as column vectors.
  y_train = np.array(train['RET']).reshape(-1,1)
  y_valid = np.array(valid['RET']).reshape(-1,1)
  y_test = np.array(test['RET']).reshape(-1,1)

  # Standardize: fit on the training features only, then apply the same
  # transform to the validation and test features.
  scaler = StandardScaler()
  x_train = scaler.fit_transform(x_train)
  x_valid = scaler.transform(x_valid)
  x_test = scaler.transform(x_test)

  return x_train, x_valid, x_test, y_train, y_valid, y_test

In [None]:
# Out-of-sample R-squared
def R_square(ypred,ytrue):
  """Return 1 - sum((ytrue - ypred)^2) / sum(ytrue^2).

  The denominator is the raw sum of squared true values (no demeaning), so
  the score compares the predictions against an all-zero forecast.

  Both inputs are flattened before the subtraction. Without this, a column
  vector of predictions of shape (n, 1) paired with a 1-D target of shape
  (n,) would broadcast to an (n, n) matrix and give a wrong score.

  Args:
    ypred: array-like of predictions.
    ytrue: array-like of realized values with the same number of elements.

  Returns:
    The R-squared value as a numpy scalar. The result is not finite when
    every element of ``ytrue`` is zero.
  """
  predicted = np.asarray(ypred).ravel()
  actual = np.asarray(ytrue).ravel()
  dif2 = np.sum(np.power(actual-predicted,2))
  return 1-(dif2/np.sum(np.power(actual,2)))

In [None]:
# Custom loss function
def R_loss(y_true, y_pred):
  """Mean squared error divided by the mean of the squared true values.

  Args:
    y_true: tensor of target values.
    y_pred: tensor of model predictions.

  Returns:
    Scalar tensor: mean((y_true - y_pred)^2) / mean(y_true^2).
  """
  residual = y_true-y_pred
  mean_squared_error = tf.reduce_mean(tf.square(residual))
  mean_squared_target = tf.reduce_mean(tf.square(y_true))
  return mean_squared_error/mean_squared_target

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
# Out-of-sample R-squared per test year, appended in loop order.
rsquare_oos_valid = []
rsquare_oos_test = []


def _build_nn5(input_shape):
  """Return an uncompiled network with ReLU hidden layers 32-16-8-4-2 and one linear output."""
  network = keras.models.Sequential()
  network.add(keras.layers.Dense(32, activation='relu', input_shape=input_shape))
  for units in (16, 8, 4, 2):
    network.add(keras.layers.Dense(units, activation='relu'))
  network.add(keras.layers.Dense(1))
  return network


#NN5: refit a fresh model for every test year from 2004 through 2020
for year in range(2004,2021):
  # NOTE(review): the message says 1960, but the load cell keeps only rows
  # with DATE >= 19900101 — confirm which start date is intended.
  print("start training the data from 1960 to "+str(year))
  x_train, x_valid, x_test, y_train, y_valid, y_test = cleaning_data(year)

  model = _build_nn5(x_train.shape[1:])

  # Plain SGD with a small step size and gradient-norm clipping.
  sgd = keras.optimizers.SGD(learning_rate = 0.0002, clipnorm = 0.5)

  model.summary()
  model.compile(loss = R_loss, optimizer= sgd)

  # Stop once val_loss has failed to improve by at least 0.01 for 2 epochs.
  EarlyStop = keras.callbacks.EarlyStopping(
      monitor='val_loss',
      patience=2,
      verbose=1,
      min_delta=0.01,
      mode='min')

  # Multiply the learning rate by 0.2 under the same plateau condition.
  Reduce = keras.callbacks.ReduceLROnPlateau(
      monitor='val_loss',
      factor=0.2,
      patience=2,
      verbose=1,
      mode='min',
      min_delta=0.01,
      cooldown=0,
      min_lr=0)

  model.fit(
      x_train,
      y_train,
      validation_data = (x_valid, y_valid),
      epochs = 100,
      callbacks = [Reduce,EarlyStop])

  # NOTE(review): hard-coded absolute path; it looks like a Colab Drive
  # mount, so confirm it is valid in the environment this notebook runs in.
  model.save('/content/drive/My Drive/NN_models/NN5_'+str(year)+'.h5')

  rsquare_oos_valid.append(R_square(model.predict(x_valid),y_valid))
  print(rsquare_oos_valid)
  rsquare_oos_test.append(R_square(model.predict(x_test),y_test))
  print(rsquare_oos_test)

  # Dropping the Python reference alone does not release Keras' global
  # state; clear it so memory does not grow with every model in the loop.
  del model
  keras.backend.clear_session()