In [28]:
import numpy as np
import pandas as pd
from scipy import stats

from zipline import get_calendar
from zipline.pipeline.fundamentals import Fundamentals
from zipline.research import returns, select_output_by
from zipline.data.benchmarks_cn import get_cn_benchmark_returns

from zipline.pipeline import Pipeline
from zipline.research import run_pipeline

# 部门因子

## 期间

In [2]:
start = '2010-01-01' # 整理风险模型数据最早日期

c = get_calendar('SZSH')

end = c.actual_last_session

dates = c.sessions_in_range(start, end)

## 基础数据

In [3]:
Fundamentals.sector_maps

{101: '基本材料',
 102: '主要消费',
 103: '金融服务',
 104: '房地产',
 205: '可选消费',
 206: '医疗保健',
 207: '公用事业',
 308: '通讯服务',
 309: '能源',
 310: '工业领域',
 311: '工程技术'}

In [4]:
# 顺序再编码
SECTOR_MAPS = {
    101: 1,
    102: 2,
    103: 3,
    104: 4,
    205: 5,
    206: 6,
    207: 7,
    308: 8,
    309: 9,
    310: 10,
    311: 11
}

In [5]:
# 部门影射指数代码
# 把工程技术影射为信息技术
SECTOR_INDEX_MAPS = {101:'399614',102:'399617',103:'399619',
                     104:'399241',205:'399616',206:'399618',
                     207:'399622',308:'399621',309:'399613',
                     310:'399615',311:'399620'}

# 指数收益

In [6]:
index_rets = get_cn_benchmark_returns('399241')

In [7]:
index_rets.tail()

2018-05-30 00:00:00+00:00   -0.026863
2018-05-31 00:00:00+00:00    0.016891
2018-06-01 00:00:00+00:00    0.001059
2018-06-04 00:00:00+00:00    0.020132
2018-06-05 00:00:00+00:00    0.006109
dtype: float64

# 计算部门因子

## 常量

In [8]:
PPY = 244 # 每年约244个交易日

## 参数

In [9]:
stock_code = '000001'

one_day = pd.Timestamp('2018-6-5')

start_date = one_day - c.day * PPY * 2 # 前偏2年的日期， 以此为起点

## 准备数据

### sector

部门为静态，使用最新值

In [10]:
def make_pipeline():
    return Pipeline(columns={
        'sector': Fundamentals.info.sector_code.latest,
    })

In [11]:
result = run_pipeline(make_pipeline(), '2018-5-31', '2018-5-31')

In [12]:
result.tail()

Unnamed: 0,Unnamed: 1,sector
2018-05-31 00:00:00+00:00,洛阳钼业(603993),101.0
2018-05-31 00:00:00+00:00,中新科技(603996),205.0
2018-05-31 00:00:00+00:00,继峰股份(603997),205.0
2018-05-31 00:00:00+00:00,方盛制药(603998),206.0
2018-05-31 00:00:00+00:00,读者传媒(603999),205.0


### pct

In [None]:
# 获取股票收益率数据

pct = returns('000001', start_date, one_day)

In [15]:
pct.head()

2016-06-02 00:00:00+00:00   -0.001887
2016-06-03 00:00:00+00:00    0.003898
2016-06-06 00:00:00+00:00    0.000941
2016-06-07 00:00:00+00:00    0.000941
2016-06-08 00:00:00+00:00   -0.001879
Freq: C, Name: 平安银行(000001), dtype: float64

In [16]:
sector_code = int(select_output_by(result, assets=[stock_code])['sector'])

In [17]:
sector_code

103

In [18]:
# 对应指数代码
index_code = SECTOR_INDEX_MAPS[sector_code]

In [19]:
# 指数收益率
index_rets = get_cn_benchmark_returns(index_code).loc[start_date:one_day]

In [20]:
index_rets.shape

(489,)

In [21]:
pct.shape

(488,)

In [None]:
# 数据形态保持一致

In [22]:
common_index = pct.index.intersection(index_rets.index)

In [23]:
r = index_rets[common_index].values # 股票收益率，非累计

In [24]:
f = pct[common_index].values # 指数收益率，非累计

## 回归

In [25]:
# 此处应为通过指数收益来预测股票收益，所以自变量为指数收益率，因变量为股票收益率
slope, intercept, r_value, p_value, std_err = stats.linregress(f, r)

In [26]:
err = r - slope * f # 此处存疑？严格来说，误差要么以标准差代表，要么绝对值差异。怎么会少截距？

In [None]:
# 取最后一项
err = err[-1]

## 保存相对于部门sector的beta值

In [29]:
beta = np.zeros(len(SECTOR_MAPS))

In [30]:
beta[SECTOR_MAPS[sector_code]] = slope # 除股票所属部门外，其余全为0

In [31]:
beta

array([0.        , 0.        , 0.        , 0.46411184, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [33]:
r = beta * f[-1] + err[-1] # 如果按材料说明，此处应为标量！即用部门因子预测结果+未解释部分残余。但结果为向量

In [None]:
def get_sector_beta_t(t):
    pass

In [None]:
def get_epsilon_t(t):
    pass