In [1]:
from pathlib import Path

import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from pyearth.earth import Earth

In [2]:
import warnings


warnings.simplefilter("ignore")

plt.style.use("seaborn-whitegrid")
plt.rc(
    "figure",
    autolayout=True,
    figsize=(11, 4),
    titlesize=18,
    titleweight='bold',
)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
%config InlineBackend.figure_format = 'retina'

In [3]:
# Read the DNS factor table, parsing the Date column and using it as the index.
raw_dns_factor = pd.read_csv(
    "./data/dns_factor.csv",
    parse_dates=["Date"],
    index_col="Date",
)

In [4]:
# Inclusive sample window, written as YYYY-MM-DD strings.
start_date_str, end_date_str = "2012-01-01", "2022-12-31"

In [5]:
# Parse both window boundaries from their YYYY-MM-DD strings into datetime objects.
start_date, end_date = (
    datetime.datetime.strptime(boundary, "%Y-%m-%d")
    for boundary in (start_date_str, end_date_str)
)
print(start_date, end_date)

2012-01-01 00:00:00 2022-12-31 00:00:00


In [6]:
# Targets: the DNS factor table, indexed by its parsed Date column.
raw_y = pd.read_csv(
    "./data/dns_factor.csv",
    parse_dates=["Date"],
    index_col="Date",
)
# Features: the combined feature table, indexed the same way.
raw_X = pd.read_csv(
    "./data/clean_features/all_features.csv",
    parse_dates=["Date"],
    index_col="Date",
)

# Keep only feature rows dated inside [start_date, end_date], both ends inclusive.
raw_X = raw_X[(start_date <= raw_X.index) & (raw_X.index <= end_date)]
raw_X

Unnamed: 0_level_0,CN_GDP_Current_Price_Cum_YTY,CN_GDP_Constant_Price_Cum_YTY,CN_GDP_Deflator_GDP_Cum_YTY,CN_GDP_Constant_Current_Q,CN_IFA_Cum_MTM,CN_IE_FGI_MTM,CN_PMI_New_Order,CN_PMI_RMP,CN_TRSCG_Cum_MOM,CN_PFE_MOM,...,LIBORO/N,LIBOR1W,LIBOR1M,LIBOR2M,LIBOR3M,LIBOR6M,LIBOR12M,CN_New_Jobs_Cum,CN_Difficulty_Cum,CN_Reemployed_Cum
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-04,18.49,9.550832,8.0272,127039.6,23.8,20.78,49.8,47.1,18.5,11.0800,...,0.15000,0.20720,0.29530,0.4282,0.58250,0.81100,1.13035,1221.0,0.0,0.0
2012-01-05,18.49,9.550832,8.0272,127039.6,23.8,20.78,49.8,47.1,18.5,11.0800,...,0.14950,0.20620,0.29530,0.4292,0.58250,0.81200,1.13035,1221.0,0.0,0.0
2012-01-06,18.49,9.550832,8.0272,127039.6,23.8,20.78,49.8,47.1,18.5,11.0800,...,0.14900,0.20620,0.29630,0.4282,0.58150,0.81200,1.13035,1221.0,0.0,0.0
2012-01-09,18.49,9.550832,8.0272,127039.6,23.8,20.78,49.8,47.1,18.5,11.0800,...,0.14900,0.20620,0.29630,0.4282,0.58050,0.81000,1.12825,1221.0,0.0,0.0
2012-01-10,18.49,9.550832,8.0272,127039.6,23.8,20.78,49.8,47.1,18.5,11.0800,...,0.15000,0.20420,0.29580,0.4282,0.57950,0.80850,1.12605,1221.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-26,5.73,3.000000,3.1439,289504.9,5.3,11.40,46.4,50.7,-0.1,4.7989,...,4.31671,0.07638,4.38686,0.1525,4.72643,5.15314,5.44386,1145.0,163.0,476.0
2022-12-27,5.73,3.000000,3.1439,289504.9,5.3,11.40,46.4,50.7,-0.1,4.7989,...,4.31671,0.07638,4.38686,0.1525,4.72643,5.15314,5.44386,1145.0,163.0,476.0
2022-12-28,5.73,3.000000,3.1439,289504.9,5.3,11.40,46.4,50.7,-0.1,4.7989,...,4.31643,0.07638,4.38357,0.1525,4.72986,5.15114,5.47029,1145.0,163.0,476.0
2022-12-29,5.73,3.000000,3.1439,289504.9,5.3,11.40,46.4,50.7,-0.1,4.7989,...,4.31186,0.07638,4.36871,0.1525,4.75386,5.13757,5.44257,1145.0,163.0,476.0


In [7]:
# Inner-join the two frames on their row index so that y and X share the same rows.
y, X = raw_y.align(
    raw_X,
    axis=0,
    join="inner",
)

In [8]:
# Show the (rows, columns) shape of the features, then of the targets.
print(*(frame.shape for frame in (X, y)))

(2744, 182) (2744, 4)


In [15]:
# Compute correlation coefficients: rank every feature by the absolute value of
# its correlation with y["Beta0"] and list the 30 highest-ranked feature names.
X_column_list = X.columns.tolist()
corr_list = [
    np.corrcoef(X[feature_name], y["Beta0"])[0, 1]
    for feature_name in X_column_list
]

kk = pd.DataFrame({"feature": X_column_list, "corr": corr_list})
kk["corr"] = kk["corr"].abs()
(
    kk.sort_values("corr", ascending=False)
    .reset_index(drop=True)["feature"]
    .head(30)
)

0          Fixed_Issue_Rate_30Y
1          Fixed_Issue_Rate_10Y
2          Fixed_Issue_Rate_50Y
3           Fixed_Issue_Rate_7Y
4           Fixed_Issue_Rate_5Y
5           Fixed_Issue_Rate_3M
6                      SHIBOR1Y
7                      SHIBOR9M
8                      SHIBOR3M
9                      SHIBOR6M
10          Fixed_Issue_Rate_3Y
11          Fixed_Issue_Rate_1Y
12                Loan_Fin_Rate
13                        IBO1Y
14          Fixed_Issue_Rate_6M
15                  CN_SSFS_MOM
16                     SHIBOR1M
17                        IBO6M
18                     SHIBOR2W
19                        CN_M2
20                          R3M
21          Shibor3M_Libor3M_2Y
22                        IBO3M
23          Shibor3M_Libor3M_1Y
24          Shibor3M_Libor3M_3Y
25                       IBO021
26          Shibor3M_Libor3M_4Y
27                        IBO9M
28                          R1M
29    CN_GDP_Constant_Current_Q
Name: feature, dtype: object

In [16]:
# Compute correlation coefficients: rank every feature by the absolute value of
# its correlation with y["Beta1"] and list the 30 highest-ranked feature names.
X_column_list = X.columns.tolist()
corr_list = [
    np.corrcoef(X[feature_name], y["Beta1"])[0, 1]
    for feature_name in X_column_list
]

kk = pd.DataFrame({"feature": X_column_list, "corr": corr_list})
kk["corr"] = kk["corr"].abs()
(
    kk.sort_values("corr", ascending=False)
    .reset_index(drop=True)["feature"]
    .head(30)
)

0                     R1M
1                     R2M
2     Fixed_Issue_Rate_1Y
3                    R014
4                    R021
5                   FR014
6                   IBO1M
7     Fixed_Issue_Rate_3Y
8                   OR001
9                     R3M
10               SHIBOR1M
11                 IBO021
12                   R001
13                 IBO014
14                 IBO001
15              SHIBORO/N
16                  IBO2M
17                  FR001
18                   R007
19               SHIBOR2W
20    Fixed_Issue_Rate_6M
21                  OR014
22                  IBO3M
23                  FR007
24               SHIBOR3M
25                  IBO4M
26    Fixed_Issue_Rate_2Y
27                  IBO6M
28                  OR007
29                 IBO007
Name: feature, dtype: object

In [17]:
# Compute correlation coefficients: rank every feature by the absolute value of
# its correlation with y["Beta1"] and list the 30 highest-ranked feature names.
# NOTE(review): this cell repeats the preceding Beta1 ranking exactly; a
# different column of y may have been intended here - confirm and adjust.
X_column_list = X.columns.tolist()
corr_list = [
    np.corrcoef(X[feature_name], y["Beta1"])[0, 1]
    for feature_name in X_column_list
]

kk = pd.DataFrame({"feature": X_column_list, "corr": corr_list})
kk["corr"] = kk["corr"].abs()
(
    kk.sort_values("corr", ascending=False)
    .reset_index(drop=True)["feature"]
    .head(30)
)

0                     R1M
1                     R2M
2     Fixed_Issue_Rate_1Y
3                    R014
4                    R021
5                   FR014
6                   IBO1M
7     Fixed_Issue_Rate_3Y
8                   OR001
9                     R3M
10               SHIBOR1M
11                 IBO021
12                   R001
13                 IBO014
14                 IBO001
15              SHIBORO/N
16                  IBO2M
17                  FR001
18                   R007
19               SHIBOR2W
20    Fixed_Issue_Rate_6M
21                  OR014
22                  IBO3M
23                  FR007
24               SHIBOR3M
25                  IBO4M
26    Fixed_Issue_Rate_2Y
27                  IBO6M
28                  OR007
29                 IBO007
Name: feature, dtype: object