In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv("GKX_20201231.csv")
df = df[df["DATE"] >= 19600101].reset_index(drop = True) #选取1960年之后的数据
df.head()

Unnamed: 0,permno,DATE,mvel1,RET,prc,SHROUT,beta,betasq,chmom,dolvol,...,baspread,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,bm,bm_ia
0,10006,19600129,68773.0,0.005155,48.75,1418,1.158041,1.341058,-0.078184,9.474396,...,0.015272,2.059137e-07,0.033943,0.015149,0.591078,0.533983,9.859742e-08,37.0,,
1,10014,19600129,9823.5,0.138889,5.125,2183,1.838109,3.378645,-0.506541,7.946573,...,0.035918,1.90616e-06,0.028571,0.021261,0.49165,0.481666,8.535634e-08,,,
2,10022,19600129,21133.75,-0.045455,13.125,1537,1.157077,1.338827,-0.374534,8.507143,...,0.022607,6.754297e-07,0.04,0.026199,0.87711,1.639491,5.271194e-08,,,
3,10030,19600129,81200.0,-0.015,49.25,1624,1.327625,1.762587,-0.186923,9.550378,...,0.01496,4.355397e-07,0.038462,0.020206,0.936132,0.771756,1.102852e-07,,,
4,10057,19600129,27062.5,-0.020785,53.0,500,1.194604,1.42708,-0.293635,8.138565,...,0.002941,3.479863e-07,0.018692,0.006685,0.451979,0.19001,0.9545457,,,


In [3]:
#划分train/validation/test
#输入测试集的年份，返回三个集合
def split_function(year_of_test):
  end_of_test = (year_of_test+1)*10000
  end_of_validation = year_of_test*10000
  start_of_validation = (year_of_test-12)*10000
  Train = df[df["DATE"] < start_of_validation]
  Validation = df[(df["DATE"] > start_of_validation)&(df["DATE"] < end_of_validation)]
  Test = df[(df["DATE"] > end_of_validation)&(df["DATE"] < end_of_test)]
  return Train,Validation,Test

In [4]:
#缺失值大于50%的指标，缺失值用0填充
def fillna_by_zero_of_columns(df):
    for column in list(df.columns[df.isnull().sum() > len(df)*0.5]):
        df[column].fillna(0.00001, inplace=True)
    return df

In [5]:
#缺失值小于50%的指标，缺失值用均值填充
def fillna_by_mean_of_columns(df):
    for column in list(df.columns[df.isnull().sum() < len(df)*0.5]):
        mean_val = df[column].mean()
        df[column].fillna(mean_val, inplace=True)
    return df

In [6]:
#数据缺失值处理
fillna_by_zero_of_columns(df)
fillna_by_mean_of_columns(df)

# Normalized the data
scaler = StandardScaler()


In [7]:
 #定义计算out of sample R square函数
errs=[]
def error(ypred,ytrue): 
  dif2=np.sum(np.power(ytrue-ypred,2))
  return 1-dif2/np.sum(np.power(ytrue,2))

In [None]:
#PCA 初始化
pca=PCA(copy=True,n_components=0.80)
components=[]
year_of_validation_start=[]
year_of_test_start=[]
rsquare_oos_validation=[]
rsquare_oos_test=[]
#计算PCA函数
for i in range(30):
      year_of_validation_start.append(1990-12+i)
      year_of_test_start.append(1990+i)
      x_train=split_function(1990+i)[0].drop(['RET'],axis=1)
      scaler.fit(x_train)
      x_train=scaler.transform(x_train)  ##标准化x_train
      y_train=split_function(1990+i)[0]['RET']
      y_mean= np.mean(y_train)
      y_std = np.std(y_train)
      y_train=np.array((y_train-y_mean)/y_std )  ##标准化y_train
      x_validation=split_function(1990+i)[1].drop(['RET'],axis=1)
      scaler.fit(x_validation)
      x_validation=scaler.transform(x_validation)  ##标准化x_validation
      y_validation=split_function(1990+i)[1]['RET']
      y_mean2= np.mean(y_validation)
      y_std2 = np.std(y_validation)
      y_validation=np.array((y_validation-y_mean2)/y_std2 )  ##标准化y_validation
      x_test=split_function(1990+i)[2].drop(['RET'],axis=1)
      scaler.fit(x_test)
      x_test=scaler.transform(x_test)  ##标准化x_test
      y_test=split_function(1990+i)[2]['RET']
      y_mean3= np.mean(y_test)
      y_std3 = np.std(y_test)
      y_test=np.array((y_test-y_mean3)/y_std3 )  ##标准化y_test
      pca.fit(x_train, y_train)
      components.append(pca.n_components_)
      x_train = pca.transform(x_train)
      ols = sm.OLS(y_train,x_train).fit()
      x_validation=pca.transform(x_validation)  ##x_validation拟合pca
      y_validation_pre = ols.predict(x_validation).flatten()
      x_test=pca.transform(x_test)        ##x_test拟合pca
      y_test_pre = ols.predict(x_test).flatten()
      rsquare_oos_validation.append(error(y_validation_pre,y_validation))
      rsquare_oos_test.append(error(y_test_pre,y_test))
      del x_train
      del y_train
      del x_validation
      del y_validation
      del x_test
      del y_test




In [None]:
year_of_validation_start

In [None]:
rsquare_oos_validation

[-0.005306077643253415,
 -0.0045180155706994185,
 -0.005520640462510329,
 -0.0034778778856929815,
 -0.0013943087001941912,
 -0.0016036554684946136,
 0.00017934879683467742,
 0.0003665905605629849,
 0.0010244068417090135,
 0.0007793387202500446,
 0.0006278043483689544,
 0.0006157727565794247,
 0.000502689336594031,
 0.0018320238981218706,
 0.00177126238781955,
 0.0014519612660763848,
 0.0011684337444336501,
 0.001030549879609044,
 0.0011544592935154308,
 0.0005423291815711551,
 0.00037127016785287736,
 0.00047903851795161323,
 -0.0001100779866900492,
 0.0016891888600922833,
 -0.00015651923320780803,
 -0.001373738725838125,
 -0.0008453009496429686,
 -0.0007070360229795014,
 -0.000745732210707839,
 -0.0004971382055958262]

In [None]:
rsquare_oos_test

[-0.0036840059716529705,
 -0.0008200416617936135,
 0.0074004728626017036,
 0.00500792271489825,
 0.001828432692646098,
 -0.0017018193870870224,
 0.0004030455846242287,
 0.0005351019223243103,
 -0.0012922093172365035,
 -0.0004294797859396571,
 0.0010054504834607902,
 0.0024731418616976697,
 0.008834269750201407,
 -0.006622085618061435,
 -0.0007596434468579716,
 -7.771021226932184e-05,
 -0.003371928356469933,
 -0.006513651384279617,
 -0.005641062299593935,
 -0.0008308793137632442,
 0.0013551068732193805,
 -0.0044890395105665615,
 0.0015681869106789481,
 0.004301420309453019,
 -0.0024537928066701298,
 -9.31101338330631e-05,
 -0.0011222994569604072,
 -0.0013448643132905058,
 -0.002832925998695446,
 0.0012105736951680823]

In [None]:
components_= np.array(components)
components_

array([47, 48, 47, 48, 49, 50, 50, 51, 52, 52, 53, 54, 54, 54, 54, 54, 55,
       55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55])

In [8]:
##计算PCR模型的feature importance (已经得到R2最大值为test=2002,n_components=54)

x_train=split_function(2002)[0].drop(['RET'],axis=1)
scaler.fit(x_train)
x_train=scaler.transform(x_train)  ##标准化x_train
y_train=split_function(2002)[0]['RET']
y_mean= np.mean(y_train)
y_std = np.std(y_train)
y_train=np.array((y_train-y_mean)/y_std )  ##标准化y_train

In [12]:
pca=PCA(copy=True,n_components=54)
pca.fit(x_train, y_train)
x_train = pca.transform(x_train)
ols = sm.OLS(y_train,x_train).fit()

In [43]:
for i in range(54):
    coef.append(ols.params[i]*pca.components_[i])
    

In [61]:
#计算featur importance
variable_importance=[]
for i in range(100):
    coef_new=1
    for j in range(54):
        coef_new=coef_new*coef[j][i]
        
    variable_importance.append(coef_new)

In [62]:
variable_importance

[-1.1128984286065192e-214,
 1.6107702290852958e-219,
 1.697314549912734e-230,
 8.061931982054987e-213,
 1.4884222069248758e-221,
 -4.334359953151774e-216,
 -1.7914649428135292e-212,
 5.76756196579542e-216,
 3.106281236849746e-222,
 -3.5053548960303627e-231,
 -8.539253386468064e-226,
 -4.508384078692092e-214,
 6.8743212666234125e-220,
 -1.3192190368949153e-221,
 8.728989181859076e-212,
 -3.105102674573523e-230,
 -9.79923819825038e-224,
 -5.659154372057212e-222,
 9.59112722046608e-224,
 8.679115681566446e-224,
 2.774117276053924e-206,
 1.161389231020379e-224,
 -8.357092728393894e-219,
 1.0794415851956207e-211,
 8.894515611740475e-227,
 3.3733110462759e-221,
 3.0795896573492624e-211,
 1.8045296251999524e-211,
 -1.2066742347907193e-225,
 9.185752915647287e-225,
 -6.574280524101718e-218,
 6.939947675074598e-213,
 5.149949463393979e-234,
 -1.1560770006465934e-215,
 3.9754259782328413e-219,
 1.3602128505626797e-219,
 -7.376878670799419e-213,
 -5.465786811149462e-221,
 -1.9326571485380018e-213

In [None]:
      
#PLS 初始化
components_group=[20,40,60,80]
rsquare_oos_validation_group=[]
rsquare_oos_validation=[]
components=[]
year_of_validation_start=[]
year_of_test_start=[]
rsquare_oos_validation=[]
rsquare_oos_test=[]

#计算PLS函数
for i in range(30):
  year_of_validation_start.append(1990-12+i)
  year_of_test_start.append(1990+i)

  x_train=split_function(1990+i)[0].drop(['RET'],axis=1)
  scaler.fit(x_train)
  x_train=scaler.transform(x_train)  #标准化x_train
  y_train=split_function(1990+i)[0]['RET']
  y_mean= np.mean(y_train)
  y_std = np.std(y_train)
  y_train=np.array((y_train-y_mean)/y_std )  #标准化y_train
  x_validation=split_function(1990+i)[1].drop(['RET'],axis=1)
  scaler.fit(x_validation)
  x_validation=scaler.transform(x_validation)  #标准化x_validation
  y_validation=split_function(1990+i)[1]['RET']
  y_mean2= np.mean(y_validation)
  y_std2 = np.std(y_validation)
  y_validation=np.array((y_validation-y_mean2)/y_std2 )  #标准化y_validation
  x_test=split_function(1990+i)[2].drop(['RET'],axis=1)
  scaler.fit(x_test)
  x_test=scaler.transform(x_test)  #标准化x_test
  y_test=split_function(1990+i)[2]['RET']
  y_mean3= np.mean(y_test)
  y_std3 = np.std(y_test)
  y_test=np.array((y_test-y_mean3)/y_std3 )  #标准化y_test

  #选择参数n_components
  for j in components_group:
    pls = PLSRegression(n_components=j)
    pls.fit(x_train, y_train)
    y_validation_pre = pls.predict(x_validation).flatten() #x_validation拟合pls
    rsquare_oos_validation_group.append(error(y_validation_pre,y_validation)) 


  rsquare_oos_validation.append(np.max(rsquare_oos_validation_group))    #根据R2筛选参数n_components
  x=np.where(np.max(rsquare_oos_validation_group))
  components.append(components_group[list(x[0])[0].tolist()])

  components_=list(map(int, components))[i]
  pls = PLSRegression(n_components=components_)
  pls.fit(x_train, y_train)
  y_test_pre = pls.predict(x_test).flatten() ##x_test拟合pls
  rsquare_oos_test.append(error(y_test_pre,y_test))
  print(i)

  del x_train
  del y_train
  del x_validation
  del y_validation
  del x_test
  del y_test



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


In [None]:
components

[20,
 20,
 20,
 20,
 40,
 40,
 20,
 20,
 20,
 20,
 20,
 20,
 60,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 40,
 80,
 20,
 20,
 20,
 20]

In [None]:
rsquare_oos_validation

[-0.016697834214127738,
 -0.016697834214127738,
 -0.016697834214127738,
 -0.016697834214127738,
 -0.016697834214127738,
 -0.016697834214127738,
 -0.01263680815460999,
 -0.006112063538282753,
 -0.0020659892878958086,
 0.0007822208124590446,
 0.0038812226241614622,
 0.00800204262377402,
 0.011987717976301271,
 0.015456033104542244,
 0.016277843049055107,
 0.016277843049055107,
 0.016830303742142783,
 0.01752788956022977,
 0.018197815396008843,
 0.018716073835814195,
 0.02150935655018682,
 0.02200618900591056,
 0.022195850382910165,
 0.022195850382910165,
 0.022195850382910165,
 0.022195850382910165,
 0.022195850382910165,
 0.022195850382910165,
 0.022195850382910165,
 0.022195850382910165]

In [None]:
rsquare_oos_test

[-0.02768497952008353,
 -0.03241188212242063,
 -0.007274262520273966,
 -0.01711354785359065,
 -0.027631253986653226,
 -0.03057088851096834,
 -0.02734381914054218,
 -0.016481313685857257,
 -0.004980335449019879,
 0.008816795481041972,
 0.0009374367803018924,
 0.025807828471266725,
 0.017351614542065352,
 0.006507379565147686,
 0.007743266621790479,
 0.008085379946142446,
 0.006909925573069686,
 0.0019696990743021114,
 0.015524602273084498,
 0.022208681605709435,
 0.022303882194080193,
 0.015365612234721215,
 0.014811327061237889,
 0.02043323089025406,
 0.011693653727885223,
 0.012009225014730784,
 0.017561364070879604,
 0.010788143169972786,
 0.008402950945182175,
 0.01689507209974961]