In [1]:
from factor_analysis_adv import *

### 1 基础设置

In [2]:
# 时间
start_date = '2020-01-01'
end_date = '2024-04-01'

# 标的
index_item = '000300.XSHG'
index_fix = INDEX_FIX(start_date,end_date,index_item)
stock_list = index_fix.columns.tolist()
date_list = index_fix.index.tolist()

### 2 特征构建

In [3]:
from rqfactor import Factor, REF, MA, SUM, CS_ZSCORE,STD,PCT_CHANGE,CORR,DELTA,MAX,MIN
from rqfactor.extension import UserDefinedLeafFactor

HIGH = Factor('high')
LOW = Factor('low')
OPEN = Factor('open')
CLOSE = Factor('close')
VOLUME = Factor('volume')
TURNOVER = Factor('total_turnover')

def turnover_rate(order_book_ids,start_date,end_date):
    return get_turnover_rate(order_book_ids,
                             start_date,
                             end_date,
                             fields='today').today.unstack('order_book_id').reindex(columns = order_book_ids,
                                                                                    index = pd.to_datetime(get_trading_dates(start_date,end_date)))

DAILY_TURNOVER_RATE = UserDefinedLeafFactor('DAILY_TURNOVER_RATE',turnover_rate)

candle_up_shadow = (HIGH - MAX(OPEN,CLOSE))/HIGH
candle_down_shadow = (MIN(OPEN,CLOSE)-LOW)/LOW


factor_dict = {
                # 自建因子
                # 流动性因子
                'liq_turn_avg':MA(DAILY_TURNOVER_RATE, 20),
                'liq_turn_std':STD(DAILY_TURNOVER_RATE, 20),
                'liq_vstd':SUM(Factor('total_turnover'), 20)/STD(PCT_CHANGE(CLOSE, 1), 20),
                'liq_zamount':MA(Factor('total_turnover'), 20)/STD(Factor('total_turnover'), 20),

                # 量价相关性
                'corr_price_turn':CORR(CLOSE, DAILY_TURNOVER_RATE, 20),
                'corr_ret_turn':CORR(PCT_CHANGE(CLOSE, 1), DELTA(DAILY_TURNOVER_RATE, 1), 20),
                
                # 波动率
                'vol_highlow_std':STD(HIGH/Factor('low'), 20),
                'vol_up_shadow_std':STD((HIGH-MAX(Factor('open'),VOLUME))/HIGH, 20),
                'candle_shadow':CS_ZSCORE(candle_up_shadow) + CS_ZSCORE(candle_down_shadow),
                
                # 动量
                'mmt_normal_M':CLOSE / REF(CLOSE, 20) - 1,
                'mmt_normal_A':REF(CLOSE, 20) / REF(CLOSE, 240) - 1,

                # 量价数据库因子
                # 均线类型
                'MACD_HIST':Factor('MACD_HIST'),         # 指数平滑移动平均线
                'MATRIX':Factor('MATRIX'),               # 三重指数平均移动平均
                'ASIT':Factor('ASIT'),                   # 震动升降指标 
                'DAVOL5':Factor('DAVOL5'),               # 平均换手率与 120 日平均换手率比值
                'MADPO':Factor('MADPO'),                 # 区间震荡线
                'MCST':Factor('MCST'),                   # 市场成本

                # 超买超卖
                'OBOS':Factor('OBOS'),                   # 超买超卖指标
                'RSI10':Factor('RSI10'),                 # 相对强弱指标
                'WR':Factor('WR'),                       # 威廉指标
                'BIAS20':Factor('BIAS20'),               # 乖离率
                'ACCER':Factor('ACCER'),                 # 幅度涨速
                'CYF':Factor('CYF'),                     # 市场能量

                'ADTM':Factor('ADTM'),                   # 动态买卖气指标
                'ATR':Factor('ATR'),                     # 真实波幅
                'MADKX':Factor('MADKX'),                 # 多空线
                'CCI':Factor('CCI'),                     # 商品路径指标
                'MFI':Factor('MFI'),                     # 资金流量指标
                'MAUDL':Factor('MAUDL'),                 # 引力线

                # 能量型
                'BR':Factor('BR'),                       # 人气意愿指标
                'MAVR':Factor('MAVR'),                   # 容量比例
                'MACYR':Factor('MACYR'),                 # 市场强弱
                'MAMASS':Factor('MAMASS'),               # 梅斯线
                ' MFI':Factor('MFI'),                     # OBV
                'AMP20':Factor('AMP20'),                 # AMP20
               }

In [4]:
for k,v in tqdm(factor_dict.items()):
    # 获取因子
    df = execute_factor(v,stock_list,start_date,end_date)
    # 文件夹新建
    create_dir_not_exist('./data/raw')
    # 数据导出
    df.to_pickle(f'./data/raw/{k}.pkl')

100%|██████████| 35/35 [01:45<00:00,  3.02s/it]


### 3 特征清洗

In [5]:
for k,v in tqdm(factor_dict.items()):
    # 数据读入
    df = pd.read_pickle(f'./data/raw/{k}.pkl')
    # 文件夹新建
    create_dir_not_exist('./data/neu')
    # 因子清洗
    df = data_clean(df,index_fix,index_item)
    # 数据存储
    df.to_pickle(f'./data/neu/{k}.pkl')

100%|██████████| 35/35 [01:17<00:00,  2.22s/it]


In [6]:
value_dict = {}

for k,v in factor_dict.items():
    # 数据导入
    value_dict[k] = pd.read_pickle(f'./data/neu/{k}.pkl')

### 4 特征检验

In [7]:
T = 5
ic_df = pd.DataFrame(index = date_list)
ic_summary = pd.DataFrame()
for k,v in value_dict.items():    
    # 计算icir指标，T检验
    ic,ic_summary_temp = Quick_Factor_Return_N_IC(v,T,index_item,k)
    ic_df = pd.concat([ic_df,ic.to_frame(k)],axis = 1)
    ic_summary = pd.concat([ic_summary,ic_summary_temp.set_index('name')],axis = 0)

# 文件夹新建
create_dir_not_exist('./data/report')
# 存储
ic_df.to_pickle(f'./data/report/ic_df_{T}.pkl')
ic_summary.to_csv(f'./data/report/ic_summary_{T}.csv')

{'name': 'liq_turn_avg', 'IC mean': -0.0312, 'IC std': 0.1096, 'IR': -0.2849, 'IC>0': 0.3851, 'ABS_IC>2%': 0.8719, 't_stat': -9.1121}
{'name': 'liq_turn_std', 'IC mean': -0.0295, 'IC std': 0.0963, 'IR': -0.3059, 'IC>0': 0.3783, 'ABS_IC>2%': 0.869, 't_stat': -9.785}
{'name': 'liq_vstd', 'IC mean': -0.0052, 'IC std': 0.0794, 'IR': -0.0656, 'IC>0': 0.4633, 'ABS_IC>2%': 0.8221, 't_stat': -2.0976}
{'name': 'liq_zamount', 'IC mean': 0.0108, 'IC std': 0.0664, 'IR': 0.163, 'IC>0': 0.5914, 'ABS_IC>2%': 0.7566, 't_stat': 5.2131}
{'name': 'corr_price_turn', 'IC mean': -0.0093, 'IC std': 0.0829, 'IR': -0.1125, 'IC>0': 0.4516, 'ABS_IC>2%': 0.8221, 't_stat': -3.597}
{'name': 'corr_ret_turn', 'IC mean': -0.0101, 'IC std': 0.0854, 'IR': -0.1187, 'IC>0': 0.4604, 'ABS_IC>2%': 0.8143, 't_stat': -3.7972}
{'name': 'vol_highlow_std', 'IC mean': -0.0231, 'IC std': 0.1108, 'IR': -0.2083, 'IC>0': 0.4252, 'ABS_IC>2%': 0.8622, 't_stat': -6.6638}
{'name': 'vol_up_shadow_std', 'IC mean': 0.0057, 'IC std': 0.1035, 

### 5 特征过滤

In [12]:
factor_names = ic_summary[ic_summary.IR.abs() > 0.1].index.tolist()
ic_summary[ic_summary.IR.abs() > 0.1]

Unnamed: 0_level_0,IC mean,IC std,IR,IC>0,ABS_IC>2%,t_stat
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
liq_turn_avg,-0.0312,0.1096,-0.2849,0.3851,0.8719,-9.1121
liq_turn_std,-0.0295,0.0963,-0.3059,0.3783,0.869,-9.785
liq_zamount,0.0108,0.0664,0.163,0.5914,0.7566,5.2131
corr_price_turn,-0.0093,0.0829,-0.1125,0.4516,0.8221,-3.597
corr_ret_turn,-0.0101,0.0854,-0.1187,0.4604,0.8143,-3.7972
vol_highlow_std,-0.0231,0.1108,-0.2083,0.4252,0.8622,-6.6638
candle_shadow,-0.0131,0.0874,-0.15,0.4272,0.8299,-4.7961
mmt_normal_A,0.0259,0.1431,0.1812,0.5699,0.9013,5.7969
MCST,-0.0185,0.1137,-0.1631,0.4506,0.87,-5.2152
CYF,-0.032,0.1105,-0.2899,0.393,0.8671,-9.2738


### 6 特征组合

In [26]:
ml_data = pd.DataFrame()
for i in tqdm(factor_names):
    ml_data[i] = value_dict[i].stack() 

100%|██████████| 14/14 [00:00<00:00, 20.27it/s]


### 7 目标集构建

In [27]:
def ml_data_gen(df,predict_day = 5):
    # 获取股票队列
    stock_list = sorted(set(df.index.get_level_values(1)))
    # 获取开始时间&结束时间
    start_date = df.index.get_level_values(0).min().strftime('%F')
    end_date = get_next_trading_date(df.index.get_level_values(0).max(),2).strftime('%F')
    # 获取开盘价
    price_open = get_price(stock_list,start_date,end_date,fields=['open']).open.unstack('order_book_id')
    # 计算未来N天收益
    ret_n = price_open.pct_change(predict_day).shift(-1 * (predict_day + 1))
    # 计算未来1天收益
    current_ret = price_open.pct_change(1).shift(-1)
    # 计算未来N天的超额收益
    excess_ret_n = ret_n.sub(ret_n.mean(axis = 1),axis = 0)
    # 插入表格
    df[f'ret_{predict_day}d'] = ret_n.stack()
    df[f'excess_ret_{predict_day}d'] = excess_ret_n.stack()
    df['current_ret'] = current_ret.stack()
    # 超额收益设置目标集 （涨为大于0；跌为小于0）
    df['target'] = df[f'excess_ret_{predict_day}d'] > 0 
    df.index.names = ['date','order_book_id']

    return df

In [28]:
ml_data = ml_data_gen(ml_data)

In [32]:
ml_data.iloc[:,5:]

Unnamed: 0_level_0,Unnamed: 1_level_0,vol_highlow_std,candle_shadow,mmt_normal_A,MCST,CYF,ATR,MADKX,MAUDL,AMP20,ret_5d,excess_ret_5d,current_ret,target
date,order_book_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-01-02,000001.XSHE,0.767360,0.189099,1.630257,0.379035,0.220262,0.452849,0.464853,0.474638,0.940617,-0.008856,-0.023913,0.017416,False
2020-01-02,000002.XSHE,2.130335,0.710111,-0.899662,0.443806,0.976252,0.614641,0.850202,0.906591,1.514249,-0.021706,-0.036763,-0.002744,False
2020-01-02,000063.XSHE,0.307602,-0.366186,1.249383,1.057896,1.272605,1.197023,1.019584,1.042019,0.791573,0.015265,0.000207,0.028884,True
2020-01-02,000069.XSHE,-0.241554,-0.613905,-0.195490,-0.740743,-0.134856,-0.868731,-0.848948,-0.846255,-0.209022,-0.001278,-0.016335,-0.008907,False
2020-01-02,000100.XSHE,0.083669,0.042600,-0.472926,1.579189,0.593559,-2.393650,-2.213715,-2.187245,0.668535,0.041394,0.026336,0.017745,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-04-01,688363.XSHG,-0.541943,-0.730393,-0.965612,1.304050,-0.259034,1.585730,1.431937,1.383895,0.010347,,,0.023517,False
2024-04-01,688396.XSHG,-1.810791,0.066354,-0.567657,0.279431,-2.218073,-0.397263,0.066566,0.065475,-1.315384,,,0.012535,False
2024-04-01,688561.XSHG,-0.536166,-0.797238,-0.916671,0.317323,-1.921370,-0.186559,-0.148906,-0.159137,-0.362989,,,0.019799,False
2024-04-01,688599.XSHG,1.358823,-1.048776,-1.012424,-0.032858,-1.156408,-0.037175,-0.192738,-0.216944,-0.121995,,,0.025566,False


### 10 数据存储

In [29]:
ml_data.to_pickle('./data/ml_data.pkl')