In [9]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt

In [10]:
date = '2023.09.22'
base_dirpath = '/home/wangzirui/workspace/data'
df = pd.read_pickle(f'{base_dirpath}/fac_ret_{date}.pkl')
# df.dropna(inplace=True)

In [12]:
df.set_index(['tradetime', 'securityid'], inplace=True)

In [63]:
import warnings
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import ConstantInputWarning
import os, sys
sys.path.insert(0, "../../")
from alphalens.utils import get_forward_returns_columns

from factor_cal.config_loader import basic_config as cfg
from factor_cal.table.ddb_table import PriceTable, SecLevelFacTable
from factor_cal.utils import ddb_utils as du

# igmore the warning of ConstantInputWarning
warnings.filterwarnings("ignore", category=ConstantInputWarning)
# obtain the ddb session
s = du.DDBSessionSingleton().session

class MaxLossExceededError(Exception):
    pass

def get_trade_clode_bydate(stat_date, config):
    price_info = config['price_info']
    pc_tb = PriceTable(price_info['price_dbPath'], price_info['price_tbName'], price_info['time_col'], price_info['sec_col'], price_info['price_cols'])
    price = pc_tb.load_price(stat_date, config['start_time'], config['end_time'], sec_list=None)
    return s.loadTable(tableName=price).toDF()

def quantize_factor(factor_data, config, no_raise=False):
    def quantile_calc(x, _quantiles, _bins, _equal_quantile, _zero_aware, _no_raise):
        try:
            if _quantiles is not None and _bins is None and not _equal_quantile and not _zero_aware:
                return pd.qcut(x, _quantiles, labels=False) + 1
            elif _quantiles is not None and _bins is None and not _equal_quantile and _zero_aware:
                pos_quantiles = pd.qcut(x[x>=0], _quantiles // 2,
                                        labels=False) + _quantiles // 2 + 1
                neg_quantiles = pd.qcut(x[x<0], _quantiles // 2,
                                        labels=False) + 1
                return pd.concat([pos_quantiles, neg_quantiles]).sort_index()
            elif _quantiles is not None and _bins is None and _equal_quantile and not _zero_aware:
                nrow = x.shape[0]
                
                quantiles_list = []
                edges = [int(i) for i in np.linspace(0, nrow, _quantiles+1)]
                for i in range(_quantiles):
                    start, end = edges[i], edges[i+1]
                    quantiles_list += ([i+1] * (end - start))
                return pd.Series(quantiles_list, x.sort_values().index, name=x.name).sort_index()        
            elif _bins is not None and _quantiles is None and not _zero_aware:
                return pd.cut(x, _bins, labels=False) + 1
            elif _bins is not None and _quantiles is None and _zero_aware:
                pos_bins = pd.cut(x[x >= 0], _bins // 2,
                                  labels=False) + _bins // 2 + 1
                neg_bins = pd.cut(x[x < 0], _bins // 2,
                                  labels=False) + 1
                return pd.concat([pos_bins, neg_bins]).sort_index()
                
        except Exception as e:
            if _no_raise:
                return pd.Series(index=x.index)
            raise e
    
    grouper = [factor_data.index.get_level_values('datetime')]
    factor_quantile = factor_data.groupby(grouper)['factor'] \
        .apply(quantile_calc, config['quantiles'], config['bins'], config['equal_quantile'], config['zero_aware'], no_raise)
    factor_quantile.name = 'factor_quantile'
    return factor_quantile.dropna()

def get_clean_data(factor_and_ret, config):
    initial_amount = float(len(factor_and_ret.index))
    factor_and_ret.index = factor_and_ret.index.rename(['datetime', 'asset'])
    
    factor_and_ret = factor_and_ret.dropna()
    fwdret_amount = float(len(factor_and_ret.index))
    
    no_raise = False if config['max_loss'] == 0 else True
    quantile_data = quantize_factor(factor_and_ret, config, no_raise)
    
    factor_and_ret['factor_quantile'] = quantile_data
    
    factor_and_ret = factor_and_ret.dropna()
    
    binning_amount = float(len(factor_and_ret.index))
    
    tot_loss = (initial_amount - binning_amount) / initial_amount
    fwdret_loss = (initial_amount - fwdret_amount) / initial_amount
    bin_loss = tot_loss - fwdret_loss
    
    print("Dropped %.1f%% entries from factor data: %.1f%% in forward "
          "returns computation and %.1f%% in binning phase "
          "(set max_loss=0 to see potentially suppressed Exceptions)." %
          (tot_loss * 100, fwdret_loss * 100,  bin_loss * 100))
    
    if tot_loss > config['max_loss']:
        message = ("max_loss (%.1f%%) exceeded %.1f%%, consider increasing it."
                   % (config['max_loss'] * 100, tot_loss * 100))
        raise MaxLossExceededError(message)
    else:
        print("max_loss is %.1f%%, not exceeded: OK!" % (config['max_loss'] * 100))

    return factor_and_ret

In [32]:
def load_factor_and_return(stat_date, factor_name, config):
    base_dirpath = '/home/wangzirui/workspace/data'
    factors_filepath = f'{base_dirpath}/fac_ret_{stat_date}.pkl'
    if not os.path.exists(factors_filepath):
        print(f'[warning]: There is no file:{factors_filepath}')
    factors_df = pd.read_pickle(factors_filepath)
    factors_df.set_index(['tradetime', 'securityid'], inplace=True)
    factors_df = factors_df.sort_index(level=0)
    
    # get the dataframe with factor and return information
    sel_cols = factors_df.columns[:3]
    sel_cols = np.append(sel_cols, [factor_name])
    factor_and_ret = factors_df[sel_cols].copy()
    factor_and_ret.rename(columns={factor_name: "factor"}, inplace=True)
    
    # tidy the dataframe and quantile it
    factor_and_ret = get_clean_data(factor_and_ret, config['evaluation'])
    return factor_and_ret

In [82]:
np.sort(df['factor_quantile'].unique())
max(np.sort(df['factor_quantile'].unique()))

5

In [34]:
# read config file
config = cfg.BasicConfig('../config/config.yml')
cur_date = '2023.09.22'
factor_name = 'ret_v_prod_5min'
df = load_factor_and_return(cur_date, factor_name, config)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  factor_and_ret['factor_quantile'] = quantile_data


Dropped 1.6% entries from factor data: 1.6% in forward returns computation and 0.0% in binning phase (set max_loss=0 to see potentially suppressed Exceptions).
max_loss is 50.0%, not exceeded: OK!


In [69]:
close = get_trade_clode_bydate(cur_date, config)
close.set_index(['tradetime', 'securityid'], inplace=True)
close.sort_index(inplace=True)

In [112]:
df_q = df[df['factor_quantile']==1]
grouper = [df_q.index.get_level_values('datetime')]
def cal_wt(group):
        group['wt'] = 1/group.shape[0]
        return group
df_q = df_q.groupby(grouper).apply(cal_wt)
df_q[['factor', 'wt']].sort_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_q[['factor', 'wt']].sort_index(inplace=True)


In [120]:
port_weight = df_q[['factor', 'wt']]
port_weight.index = port_weight.index.rename(['tradetime', 'securityid'])
port_weight

Unnamed: 0_level_0,Unnamed: 1_level_0,factor,wt
tradetime,securityid,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-09-22 09:31:39,000026.SZ,0.181818,0.019231
2023-09-22 09:31:39,000158.SZ,0.242424,0.019231
2023-09-22 09:31:39,000415.SZ,0.030303,0.019231
2023-09-22 09:31:39,000536.SZ,0.030303,0.019231
2023-09-22 09:31:39,000969.SZ,0.272727,0.019231
...,...,...,...
2023-09-22 14:55:00,688778.SH,0.490000,0.001567
2023-09-22 14:55:00,688798.SH,0.010000,0.001567
2023-09-22 14:55:00,688799.SH,0.020000,0.001567
2023-09-22 14:55:00,688819.SH,0.150000,0.001567


In [103]:
last_ts_table = td_return.groupby('securityid').apply(lambda x: max(x.index.get_level_values('tradetime')))
last_ts_table.name='tradetime'
# last_ts_table.reset_index(inplace=True)
# last_ts_table['securityid']

In [126]:
port_weight = port_weight.reset_index()
port_weight

Unnamed: 0,tradetime,securityid,factor,wt
0,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231
1,2023-09-22 09:31:39,000158.SZ,0.242424,0.019231
2,2023-09-22 09:31:39,000415.SZ,0.030303,0.019231
3,2023-09-22 09:31:39,000536.SZ,0.030303,0.019231
4,2023-09-22 09:31:39,000969.SZ,0.272727,0.019231
...,...,...,...,...
4714886,2023-09-22 14:55:00,688778.SH,0.490000,0.001567
4714887,2023-09-22 14:55:00,688798.SH,0.010000,0.001567
4714888,2023-09-22 14:55:00,688799.SH,0.020000,0.001567
4714889,2023-09-22 14:55:00,688819.SH,0.150000,0.001567


In [168]:
# last_ts_table = last_ts_table.reset_index()
# last_ts_table
last_ts = dict(zip(last_ts_table['securityid'], last_ts_table['tradetime']))

ages = np.arange(20)
time_stamps = port_weight['tradetime'].unique()
dict_ts_index = {pd.Timestamp(ts): i for i, ts in enumerate(time_stamps)}
dict_index_ts = {i: pd.Timestamp(ts) for i, ts in enumerate(time_stamps)}
# dict_ts_index = {ts: i for i, ts in enumerate(np.datetime_as_string(time_stamps, unit='s'))}
# dict_index_ts = {i: ts for i, ts in enumerate(np.datetime_as_string(time_stamps, unit='s'))}

In [170]:
x = pos.iloc[0]
dict_index_ts[dict_ts_index[x['tradetime']] + x['age']]
dict_index_ts[dict_ts_index[x['tradetime']] + x['age']] < last_ts[x['securityid']]

True

In [189]:
pos = pd.merge(port_weight[:2], pd.DataFrame({'age': ages}), how='cross')
pos.rename(columns={'tradetime': 'tranche'}, inplace=True)
pos['tradetime'] = pos.apply(lambda x: dict_index_ts[dict_ts_index[x['tranche']] + x['age']] if dict_index_ts[dict_ts_index[x['tranche']] + x['age']] < last_ts[x['securityid']] else np.nan, axis=1)
pos = pos[(pos['tradetime'].notna())]

pos = pos.merge(td_return, on=['tradetime', 'securityid'], how='left')

In [190]:
pos.head(5)

Unnamed: 0,tranche,securityid,factor,wt,age,tradetime,ret
0,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,0,2023-09-22 09:31:39,-0.000873
1,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,1,2023-09-22 09:31:42,0.000873
2,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,2,2023-09-22 09:31:45,0.0
3,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,3,2023-09-22 09:31:48,0.000873
4,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,4,2023-09-22 09:31:51,-0.000872


In [215]:
pos['cumret'] = pos.groupby(['securityid', 'tranche'])['ret'].transform(lambda x: np.cumprod(1 + x))
pos['expr'] = pos['cumret'] * pos['wt']
pos['pnl'] = pos['expr'] * pos['ret'] / (1 + pos['ret'])
# pos['pnl'] = pos['pnl'].shift(1)
pos.loc[pos['age']==0, 'pnl'] = 0
pos.head(5)

# pos['expr'] = pos.groupby(['securityid', 'tranche']).apply(lambda x: np.cumprod(1 + x['ret']))

Unnamed: 0,tranche,securityid,factor,wt,age,tradetime,ret,expr,pnl,cumret
0,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,0,2023-09-22 09:31:39,-0.000873,0.019214,0.0,0.999127
1,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,1,2023-09-22 09:31:42,0.000873,0.019231,1.7e-05,1.0
2,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,2,2023-09-22 09:31:45,0.0,0.019231,0.0,1.0
3,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,3,2023-09-22 09:31:48,0.000873,0.019248,1.7e-05,1.000873
4,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,4,2023-09-22 09:31:51,-0.000872,0.019231,-1.7e-05,1.0


In [237]:
tranche_info = pos.groupby('tranche')['pnl'].sum().reset_index().rename(columns={'pnl': 'tranche_pnl'})
tranche_info['net_value'] = (1+tranche_info['tranche_pnl']).shift(20)
tranche_info['net_value'].fillna(1, inplace=True)
pos_tmp = pos.merge(tranche_info, on=['tranche'], how='left')
pos_tmp['correct_pnl'] = pos_tmp['pnl'] * pos_tmp['net_value']
pos_tmp.head(5)

Unnamed: 0,tranche,securityid,factor,wt,age,tradetime,ret,expr,pnl,cumret,tranche_pnl,net_value,correct_pnl
0,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,0,2023-09-22 09:31:39,-0.000873,0.019214,0.0,0.999127,-5.7e-05,1.0,0.0
1,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,1,2023-09-22 09:31:42,0.000873,0.019231,1.7e-05,1.0,-5.7e-05,1.0,1.7e-05
2,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,2,2023-09-22 09:31:45,0.0,0.019231,0.0,1.0,-5.7e-05,1.0,0.0
3,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,3,2023-09-22 09:31:48,0.000873,0.019248,1.7e-05,1.000873,-5.7e-05,1.0,1.7e-05
4,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,4,2023-09-22 09:31:51,-0.000872,0.019231,-1.7e-05,1.0,-5.7e-05,1.0,-1.7e-05


In [252]:
col_name = 'group_1'
res = pos.groupby('tradetime')['pnl'].sum().reset_index().rename(columns={'pnl': col_name})
res['ret'] = (res[col_name] + 1).pct_change(1)

res['net_value'] = 1+res['group_1'].cumsum()
res['new_ret'] = res['net_value'].pct_change(1)
res.head(5)

Unnamed: 0,tradetime,group_1,ret,net_value,new_ret
0,2023-09-22 09:31:39,0.0,,1.0,
1,2023-09-22 09:31:42,1.7e-05,1.7e-05,1.000017,1.7e-05
2,2023-09-22 09:31:45,4.6e-05,3e-05,1.000063,4.6e-05
3,2023-09-22 09:31:48,1.7e-05,-3e-05,1.00008,1.7e-05
4,2023-09-22 09:31:51,6e-06,-1e-05,1.000086,6e-06


In [205]:
pos.head(5)

Unnamed: 0,tranche,securityid,factor,wt,age,tradetime,ret,expr
0,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,0,2023-09-22 09:31:39,-0.000873,0.999127
1,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,1,2023-09-22 09:31:42,0.000873,1.0
2,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,2,2023-09-22 09:31:45,0.0,1.0
3,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,3,2023-09-22 09:31:48,0.000873,1.000873
4,2023-09-22 09:31:39,000026.SZ,0.181818,0.019231,4,2023-09-22 09:31:51,-0.000872,1.0


In [178]:
td_return = close.groupby(close.index.get_level_values('securityid')).apply(lambda x: x.pct_change(1))
td_return.rename(columns={'close': 'ret'}, inplace=True)

In [184]:
td_return

Unnamed: 0_level_0,Unnamed: 1_level_0,ret
tradetime,securityid,Unnamed: 2_level_1
2023-09-22 09:30:00,000002.SZ,
2023-09-22 09:30:00,000004.SZ,
2023-09-22 09:30:00,000006.SZ,
2023-09-22 09:30:00,000010.SZ,
2023-09-22 09:30:00,000026.SZ,
...,...,...
2023-09-22 15:00:00,603990.SH,0.000000
2023-09-22 15:00:00,605366.SH,-0.002430
2023-09-22 15:00:00,688403.SH,0.001006
2023-09-22 15:00:00,688469.SH,0.001887


In [76]:
td_return.xs('000002.SZ', level='securityid')

Unnamed: 0_level_0,close
tradetime,Unnamed: 1_level_1
2023-09-22 09:30:00,
2023-09-22 09:30:03,-0.001504
2023-09-22 09:30:06,0.000753
2023-09-22 09:30:09,-0.002257
2023-09-22 09:30:12,-0.000754
...,...
2023-09-22 14:59:48,0.000000
2023-09-22 14:59:51,0.000000
2023-09-22 14:59:54,0.000000
2023-09-22 14:59:57,0.000000


In [77]:
close.xs('000002.SZ', level='securityid')

Unnamed: 0_level_0,close
tradetime,Unnamed: 1_level_1
2023-09-22 09:30:00,13.30
2023-09-22 09:30:03,13.28
2023-09-22 09:30:06,13.29
2023-09-22 09:30:09,13.26
2023-09-22 09:30:12,13.25
...,...
2023-09-22 14:59:48,13.39
2023-09-22 14:59:51,13.39
2023-09-22 14:59:54,13.39
2023-09-22 14:59:57,13.39


In [61]:
df.replace(np.inf, np.nan, inplace=True)

In [62]:

def cal_portforlio_return(group):
    _port_return = group[get_forward_returns_columns(df.columns)]\
        .apply(lambda x: x.mean())
    return _port_return

grouper = [df.index.get_level_values('datetime')]
grouper.append('factor_quantile')
df.groupby(grouper).apply(cal_portforlio_return)

Unnamed: 0_level_0,Unnamed: 1_level_0,1m,3m,5m
datetime,factor_quantile,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-09-22 09:31:39,1,-0.002683,-0.003028,-0.006572
2023-09-22 09:31:39,2,-0.000323,-0.001943,-0.002660
2023-09-22 09:31:39,3,-0.001970,-0.004099,-0.003616
2023-09-22 09:31:39,4,-0.001633,-0.003079,-0.003103
2023-09-22 09:31:39,5,-0.001432,-0.003857,-0.005593
...,...,...,...,...
2023-09-22 14:55:00,1,-0.001000,-0.011356,-0.000119
2023-09-22 14:55:00,2,-0.001217,-0.017768,-0.000152
2023-09-22 14:55:00,3,-0.012767,-0.021691,-0.010381
2023-09-22 14:55:00,4,-0.006191,-0.024290,-0.002065


In [22]:
df.loc[pd.to_datetime('2023-09-22 13:00:00')]

Unnamed: 0_level_0,1m,3m,5m,close_ret,pv_corr,td_pv_corr_5min,td_pv_corr_1min,td_pv_corr,ret_v_prod_5min,ret_v_prod_1min,...,can_en_v_ratio_fillna,can_en_pv_ratio,can_en_pv_ratio_fillna,best_v_imbalance_tsrank,bs_press,OB_price_spread_tsrank,td_price_std,en_b_price_std,en_s_price_std,en_v_order_inbalance
securityid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000001.SZ,-0.001794,-0.001794,-0.002691,0.000000,0.267198,0.028589,0.027009,0.333921,0.505,0.525,...,0.600,0.580000,0.580,0.766667,-1.531254,0.250,0.616667,0.233333,0.083333,-0.010638
000002.SZ,-0.001504,-0.002256,-0.001504,0.000752,-0.183128,0.206403,0.328613,0.103428,0.930,0.900,...,0.710,0.707071,0.710,1.000000,0.350569,0.525,0.983333,0.783333,0.266667,0.853211
000004.SZ,-0.000614,0.000614,0.001228,0.000000,0.549407,-0.087823,0.195711,0.512378,0.495,0.475,...,0.445,,0.440,0.966667,-1.006108,0.325,0.683333,0.450000,0.041667,0.000000
000005.SZ,-0.006803,-0.013605,-0.013605,0.000000,,0.012422,,,0.505,0.525,...,0.495,,0.495,0.033333,0.006874,0.525,0.508333,0.666667,0.208333,0.000000
000006.SZ,-0.002160,-0.002160,-0.002160,0.000000,-0.666667,0.057208,0.082011,-0.666667,0.505,0.550,...,0.855,0.792857,0.855,0.950000,-0.703649,0.525,0.800000,0.166667,0.675000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688799.SH,-0.002749,-0.003749,-0.003749,0.000000,,0.124517,-0.057945,,0.485,0.450,...,0.540,,0.545,0.483333,0.060089,0.625,0.316667,0.875000,0.491667,0.000000
688800.SH,-0.006914,-0.008195,-0.009987,0.000000,,0.041761,-0.485812,,0.500,0.475,...,0.500,,0.495,0.466667,0.493385,0.650,0.733333,0.883333,0.491667,0.000000
688819.SH,-0.001741,-0.003192,-0.003192,0.000000,,0.157544,-0.052632,,0.515,0.550,...,0.515,,0.485,0.550000,-0.435791,0.950,0.016667,0.341667,0.083333,0.000000
688981.SH,-0.000594,0.000000,-0.001782,0.000396,0.179952,0.205484,0.508150,0.287244,0.730,0.800,...,0.650,0.650000,0.650,0.575000,-0.821475,0.525,0.233333,0.416667,0.866667,0.015748


In [14]:
df.loc[pd.to_datetime('2023-09-22 11:30:00')]

Unnamed: 0_level_0,1m,3m,5m,close_ret,pv_corr,td_pv_corr_5min,td_pv_corr_1min,td_pv_corr,ret_v_prod_5min,ret_v_prod_1min,...,can_en_v_ratio_fillna,can_en_pv_ratio,can_en_pv_ratio_fillna,best_v_imbalance_tsrank,bs_press,OB_price_spread_tsrank,td_price_std,en_b_price_std,en_s_price_std,en_v_order_inbalance
securityid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000001.SZ,-0.000898,-0.000898,-0.001795,0.000898,0.277870,0.028593,0.026910,0.298460,0.900,1.000,...,0.740,0.740000,0.740,0.775000,-0.332348,0.725,0.716667,0.250000,0.083333,0.414365
000002.SZ,-0.000752,-0.001505,-0.000752,-0.000752,-0.306984,0.208322,0.305839,-0.146302,0.030,0.100,...,0.230,0.232323,0.230,0.250000,0.999264,0.525,0.633333,0.766667,0.266667,0.034483
000004.SZ,0.000615,0.001229,0.002458,-0.000615,0.398746,-0.077372,0.190410,0.349086,0.070,0.050,...,0.650,0.611111,0.650,0.250000,-0.178317,0.325,0.716667,0.300000,0.033333,0.565217
000005.SZ,-0.006803,-0.013605,-0.013605,0.000000,,0.012422,,,0.505,0.525,...,0.495,,0.495,0.458333,-0.019542,0.525,0.508333,0.750000,0.200000,0.000000
000006.SZ,-0.002160,-0.002160,-0.002160,-0.002160,-0.975309,0.055806,0.045330,-0.975309,0.020,0.050,...,0.660,0.514286,0.660,0.050000,-0.744273,0.525,0.766667,0.433333,0.666667,-1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688799.SH,-0.002499,-0.003749,-0.003749,0.000000,,0.121209,-0.016420,,0.490,0.450,...,0.075,0.220588,0.075,0.483333,0.060089,0.650,0.400000,0.883333,0.500000,0.000000
688800.SH,-0.006914,-0.008195,-0.009987,0.000000,0.190119,0.054091,-0.358401,0.190119,0.500,0.500,...,0.085,0.119718,0.085,0.100000,0.493385,0.675,0.866667,0.975000,0.483333,-1.000000
688819.SH,-0.001741,-0.003192,-0.003482,0.000000,,0.156167,-0.076472,,0.520,0.550,...,0.515,,0.485,0.550000,-0.435791,0.975,0.050000,0.333333,0.075000,0.000000
688981.SH,-0.000396,0.000198,-0.001585,0.000000,0.315342,0.204390,0.577751,0.411713,0.470,0.475,...,0.150,0.150000,0.150,0.683333,-1.179004,0.075,0.033333,0.416667,0.850000,-0.466667


In [8]:
df_test = df[:10].set_index(['tradetime', 'securityid'])
df_test[df_test.columns[:3]]

Unnamed: 0_level_0,Unnamed: 1_level_0,1m,3m,5m
tradetime,securityid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-09-22 09:30:00,000001.SZ,0.001812,0.0,-0.000906
2023-09-22 09:30:00,000002.SZ,-0.002259,-0.003765,-0.004518
2023-09-22 09:30:00,000004.SZ,-0.002448,-0.00612,-0.009792
2023-09-22 09:30:00,000005.SZ,-0.013333,-0.02,-0.013333
2023-09-22 09:30:00,000006.SZ,0.0,-0.002183,-0.002183
2023-09-22 09:30:00,000007.SZ,-0.023758,-0.019438,-0.025918
2023-09-22 09:30:00,000008.SZ,-0.004098,-0.008197,-0.008197
2023-09-22 09:30:00,000009.SZ,-0.000984,-0.002953,-0.004921
2023-09-22 09:30:00,000010.SZ,-0.012012,-0.012012,-0.003003
2023-09-22 09:30:00,000011.SZ,-0.004094,-0.007165,-0.008188


In [5]:
df= df.replace([np.inf, -np.inf], np.nan).dropna(subset=df.columns)
df.shape

(21252313, 31)

In [6]:
X, y = df.drop(columns=df.columns[:5]), df[['1m']]

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1)

In [8]:
print(X_train.shape)
print(y_train.shape)

print(np.isnan(X_train).any().any())
print(np.isnan(y_train).any().any())
print(np.isinf(X_train).any().any())
print(np.isinf(y_train).any().any())
print(np.isfinite(X_train).all().any())
print(np.isfinite(y_train).all().any())

(17001850, 26)
(17001850, 1)
False
False
False
False
True
True


In [28]:


# Create regression matrices
dtrain_reg = xgb.DMatrix(X_train, y_train)
dvalid_reg = xgb.DMatrix(X_valid, y_valid)

In [None]:
y_valid

In [27]:
# Define hyperparameters
params = {"objective": "reg:squarederror", "tree_method": "hist"}

evals = [(dtrain_reg, "train"), (dvalid_reg, "validation")]
n = 5000
# n = 100
model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
   verbose_eval=100,
   early_stopping_rounds=50,
)

ValueError: feature_names mismatch: ['td_ret_v_prod_5min', 'td_p_v_ratio_3s_1min', 'td_p_v_ratio_3s_5min', 'close_ret_15min', 'close_ret_5min', 'close_ret_1min', 'ohlc_rat', 'clh', 'clh_delta', 'td_buy_rank', 'td_sell_rank', 'close_adjusted', 'retail_dir_tsrank_10min', 'retail_dir_rowrank', 'wb1_tsrank', 'wb5_tsrank', 'wb10_tsrank', 'td_v_ratio', 'bs_v1_tsrank', 'bs_v5_tsrank', 'bs_pv1_tsrank', 'bs_pv5_tsrank', 'en_b_price_tsrank', 'en_s_price_tsrank', 'bs_press', 'en_v_order_inbalance'] ['1m']
expected td_v_ratio, close_ret_15min, bs_pv1_tsrank, en_s_price_tsrank, retail_dir_tsrank_10min, en_v_order_inbalance, wb10_tsrank, close_adjusted, wb5_tsrank, td_buy_rank, bs_v1_tsrank, bs_v5_tsrank, td_p_v_ratio_3s_5min, bs_pv5_tsrank, ohlc_rat, en_b_price_tsrank, bs_press, retail_dir_rowrank, close_ret_1min, td_ret_v_prod_5min, wb1_tsrank, clh_delta, td_sell_rank, close_ret_5min, clh, td_p_v_ratio_3s_1min in input data
training data did not have the following fields: 1m

In [26]:
model.feature_names

['td_ret_v_prod_5min',
 'td_p_v_ratio_3s_1min',
 'td_p_v_ratio_3s_5min',
 'close_ret_15min',
 'close_ret_5min',
 'close_ret_1min',
 'ohlc_rat',
 'clh',
 'clh_delta',
 'td_buy_rank',
 'td_sell_rank',
 'close_adjusted',
 'retail_dir_tsrank_10min',
 'retail_dir_rowrank',
 'wb1_tsrank',
 'wb5_tsrank',
 'wb10_tsrank',
 'td_v_ratio',
 'bs_v1_tsrank',
 'bs_v5_tsrank',
 'bs_pv1_tsrank',
 'bs_pv5_tsrank',
 'en_b_price_tsrank',
 'en_s_price_tsrank',
 'bs_press',
 'en_v_order_inbalance']

In [48]:
from sklearn.metrics import mean_squared_error

preds = model.predict(dvalid_reg)

In [31]:
date = '2024.02.19'
df_tt = pd.read_pickle(f'/home/wangzirui/workspace/data/fac_ret_{date}.pkl')


FileNotFoundError: [Errno 2] No such file or directory: '/home/wangzirui/workspace/data/fac_ret_2024.02.19.pkl'

In [3]:
def preprocess(df):
    df= df.replace([np.inf, -np.inf], np.nan).dropna(subset=df.columns)
    
    return df

def check(x:pd.DataFrame, y:pd.DataFrame):
    x = x.loc[:,x.columns[5:]].to_numpy()
    y = y.to_numpy()
    print(x.shape)
    print(np.isnan(x).any().any())
    print(np.isinf(x).any().any())
    print(np.isfinite(x).all().any())
    
    print(y.shape)
    print(np.isnan(y).any().any())
    print(np.isinf(y).any().any())
    print(np.isfinite(y).all().any())
    

In [6]:
df_tt = preprocess(df_tt)

(11695611, 60)
False
False
True
(11695611, 1)
False
False
True


In [7]:
X_test, y_test = df_tt.drop(columns=df_tt.columns[:5]), df_tt[['1m']]

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


In [30]:
# model.save_model('/home/wangzirui/workspace/models/preliminary_model.json')

model = xgb.Booster()
model.load_model('/home/wangzirui/workspace/models/preliminary_model.json')

In [117]:
check(X_test, y_test)

(11695611, 55)
False
False
True
(11695611, 1)
False
False
True


In [14]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_testScaled = scaler.fit_transform(X_test)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  new_unnormalized_variance -= correction**2 / new_sample_count


In [15]:
X_testScaled = pd.DataFrame(X_testScaled, columns=X_test.columns, index=X_test.index)

In [16]:
dtest_reg = xgb.DMatrix(X_testScaled, y_test)

In [17]:
preds = model.predict(dtest_reg)

In [18]:
y_test.head(5)

Unnamed: 0,1m
0,0.003106
8,0.010101
12,0.006557
13,-0.005417
14,0.0059


In [19]:
df_test = df_tt.loc[y_test.index.to_list(), df_tt.columns[:5]]

In [29]:
df_test['factor'] = preds

NameError: name 'preds' is not defined

In [21]:
df_test.set_index(['tradetime', 'securityid'], inplace=True)
df_test.index.set_names(['date', 'asset'], inplace=True)

In [22]:
df_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,1m,3m,5m,factor
date,securityid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-02-19 09:45:00,000001.SZ,0.003106,0.004141,0.00207,0.020396
2024-02-19 09:45:00,000010.SZ,0.010101,0.010101,0.005051,0.014754
2024-02-19 09:45:00,000016.SZ,0.006557,0.013115,0.013115,0.013283
2024-02-19 09:45:00,000017.SZ,-0.005417,-0.004334,0.001083,-0.006651
2024-02-19 09:45:00,000019.SZ,0.0059,0.0059,0.00885,0.033655


In [164]:
from alphalens import performance as perf
def get_factor_ic_summary_info(data):
    group_neutral = False
    ic_data = perf.factor_information_coefficient(data, group_neutral)


    ic_summary_table = pd.DataFrame()
    ic_summary_table["IC Mean"] = ic_data.mean()
    ic_summary_table["IC Std."] = ic_data.std()
    ic_summary_table["Risk-Adjusted IC"] = \
        ic_data.mean() / ic_data.std()
    t_stat, p_value = stats.ttest_1samp(ic_data, 0, nan_policy='omit')
    ic_summary_table["t-stat(IC)"] = t_stat
    ic_summary_table["p-value(IC)"] = p_value
    ic_summary_table["IC Skew"] = stats.skew(ic_data, nan_policy='omit')
    ic_summary_table["IC Kurtosis"] = stats.kurtosis(ic_data, nan_policy='omit')
    ic_summary_table['IC win rate'] = (ic_data > 0).sum() / ic_data.count()
    return ic_summary_table

In [165]:
get_factor_ic_summary_info(df_test)

KeyError: 'Level date not found'