# 因子构建

|宏观风险    |  真实因子                 |  隐含因子                                                                            |
|--     |--             |--            |
|经济增长风险|   GDP 同比 - 预期 GDP 同比 |   沪深300指数(10%)、恒生中国企业指数(15%)、<br>住宅价格指数(25%)、CRB 工业原料指数(50%)   |
|利率风险    |  10年期国债收益率         |  中债-国债总财富指数(7-10 年)                                                          |
|通胀风险    |  0.3*PPI +0.7*CPI         |  原油(33%)、螺纹钢(34%)、猪肉(33%)                                                   |
|信用风险    |  AA 中票(3年)-国债(3年)   |  多：企业债AA(3-5年)总财富指数、<br>空：国债(3-5年)总财富指数                              |


## 1. 读入因子成分资产原始数据

In [2]:
import os, sys, argparse, logging

# Configure the root logger: message layout and verbosity threshold.
logging.basicConfig(
    format='%(filename)s-line%(lineno)d %(levelname)s: %(message)s',
    level=logging.DEBUG,  # use logging.INFO here for less verbose output
)

# Mute every record at INFO severity and below (this also covers DEBUG).
# The override is lifted again by logging.disable(logging.NOTSET) after the
# third-party imports; presumably it exists to silence import-time log
# chatter -- TODO confirm.
logging.disable(logging.INFO)

import matplotlib
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
from datetime import datetime
from copy import deepcopy

# Lift the global logging override so records of all levels flow again.
logging.disable(logging.NOTSET)

# Directories that must be importable: the external framework checkout
# (located relative to the current working directory) and the working
# directory itself.
framework_path = os.path.join(os.path.abspath(''), '../../../../国君研究所/工作/FOF_portfolio_toolbox/framework')
this_path = os.path.abspath('.')

# Append each directory once only, so re-running the cell does not pile up
# duplicate sys.path entries.
for extra_path in (framework_path, this_path):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

In [3]:
# Location of the raw factor workbook, relative to the working directory.
raw_data_path = os.path.join(this_path, '../data/raw_data_from_guozhi')
file_1 = os.path.join(raw_data_path, 'factor_data1.xlsx')

# Read the sheet, discard the first data row and the last two rows (header /
# footer residue in the export), index by the '指标名称' column and keep the
# 2005-2022 span.
# NOTE(review): the string slice assumes the resulting index is datetime-like
# -- confirm against the workbook.
data_1 = (
    pd.read_excel(file_1, header=0)
    .iloc[1:-2]
    .set_index('指标名称')
    .loc['2005': '2022']
)

# Replace the original column labels with short names; the assignment is
# positional, so the order must match the workbook's column order.
data_1.columns = [
    '恒生中国企业指数',
    '原油',
    '沪深300',
    '猪肉_当月同比',
    '工业原料指数',
    '国债到期收益率',
    '螺纹钢',
    '猪肉_周环比',
    '住宅指数',
    '猪肉_平均批发价',
]

data_1

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0_level_0,恒生中国企业指数,原油,沪深300,猪肉_当月同比,工业原料指数,国债到期收益率,螺纹钢,猪肉_周环比,住宅指数,猪肉_平均批发价
指标名称,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2005-01-03,4763.47,42.12,,,322.38,,,,,
2005-01-04,4697.75,43.91,982.794,,319.22,5.2221,,,,
2005-01-05,4564.98,43.39,992.564,,320.43,5.0084,,,,
2005-01-06,4538.77,45.56,983.174,,318.08,5.1707,,,,
2005-01-07,4513.03,45.43,983.958,,320.5,5.1533,,,,
...,...,...,...,...,...,...,...,...,...,...
2022-10-18,5756.31,82.82,3838.2667,,558.8,2.7002,3666,,,34.97
2022-10-19,5597.79,85.55,3776.5335,,555.96,2.7052,3662,,,35.12
2022-10-20,5512.3,84.51,3754.9269,,557.42,2.7175,3622,,,35.47
2022-10-21,5517.44,85.05,3742.8929,,557.7,2.7278,3634,,,35.67


In [4]:
# Second raw input: a CSV whose first column holds the row labels.
file_2 = os.path.join(raw_data_path, 'factor_data2.csv')
data_2 = pd.read_csv(file_2, index_col=0)

# Convert the row labels to datetimes so the frame can be aligned by date.
data_2 = data_2.set_axis(pd.to_datetime(data_2.index), axis=0)

data_2

Unnamed: 0,CBA00651.CS,CBA04031.CS,CBA00631.CS
2005-01-04,97.7711,,104.6127
2005-01-05,101.1029,,105.3801
2005-01-06,96.8454,,103.9313
2005-01-07,98.2687,,103.6937
2005-01-10,98.5395,,104.0194
...,...,...,...
2022-09-26,216.5491,261.8073,202.9701
2022-09-27,216.4853,261.7283,202.9239
2022-09-28,216.1451,261.6437,202.8520
2022-09-29,215.9363,261.6474,202.7287


In [5]:
# Place both raw tables side by side, aligned on their row index; dates that
# appear in only one table are kept and show NaN in the other table's columns.
data = pd.concat((data_1, data_2), axis='columns', join='outer')
data

Unnamed: 0,恒生中国企业指数,原油,沪深300,猪肉_当月同比,工业原料指数,国债到期收益率,螺纹钢,猪肉_周环比,住宅指数,猪肉_平均批发价,CBA00651.CS,CBA04031.CS,CBA00631.CS
2005-01-03,4763.47,42.12,,,322.38,,,,,,,,
2005-01-04,4697.75,43.91,982.794,,319.22,5.2221,,,,,97.7711,,104.6127
2005-01-05,4564.98,43.39,992.564,,320.43,5.0084,,,,,101.1029,,105.3801
2005-01-06,4538.77,45.56,983.174,,318.08,5.1707,,,,,96.8454,,103.9313
2005-01-07,4513.03,45.43,983.958,,320.5,5.1533,,,,,98.2687,,103.6937
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-18,5756.31,82.82,3838.2667,,558.8,2.7002,3666,,,34.97,,,
2022-10-19,5597.79,85.55,3776.5335,,555.96,2.7052,3662,,,35.12,,,
2022-10-20,5512.3,84.51,3754.9269,,557.42,2.7175,3622,,,35.47,,,
2022-10-21,5517.44,85.05,3742.8929,,557.7,2.7278,3634,,,35.67,,,


## 2. 构建因子

In [10]:
# Sample window for factor construction as [start, end]; used further down to
# slice the merged data by date.
date_range = [
    datetime(year=2014, month=1, day=1),
    datetime(year=2022, month=9, day=30),
]

In [11]:
# Economic growth risk: pick the four component assets of the implied factor
# (CSI 300, Hang Seng China Enterprises Index, residential price index,
# industrial raw materials index) within the sample window.
#
# Gaps are filled by carrying the last observation forward. `.ffill()` is
# used instead of `fillna(method='ffill')`, which is deprecated in recent
# pandas releases; the result is the same.
# Forward filling happens after the date cut, so a column with no observation
# at the start of the window stays NaN there (see '住宅指数' in the output).
df = data.loc[
    date_range[0]: date_range[1],
    ['沪深300', '恒生中国企业指数', '住宅指数', '工业原料指数'],
].ffill()
df

Unnamed: 0,沪深300,恒生中国企业指数,住宅指数,工业原料指数
2014-01-02,2321.9780,10709.34,,531.30
2014-01-03,2290.7790,10436.76,,528.80
2014-01-06,2238.6370,10290.55,,528.33
2014-01-07,2238.0010,10236.12,,528.97
2014-01-08,2241.9110,10329.82,,529.05
...,...,...,...,...
2022-09-26,3836.6773,6137.78,0.32,568.08
2022-09-27,3892.2950,6150.47,0.32,567.09
2022-09-28,3828.7098,5958.62,0.32,566.92
2022-09-29,3827.1434,5912.25,0.32,567.71
