In [102]:
import pandas as pd
import os

# 指定包含Feather文件的目录路径
directory_path = '/Users/zhangrui/Desktop/励京资本/A股分钟'

# 获取目录下所有.feather文件
file_paths = [os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith('.feather')]

# 读取数据并整合
all_data = []

for file_path in file_paths:
    # 读取每个feather文件
    df = pd.read_feather(file_path)
    all_data.append(df)

# 将所有数据拼接成一个完整的DataFrame
data = pd.concat(all_data, ignore_index=True)

# 数据预处理：处理缺失值、去噪声等
data['volume'].fillna(0, inplace=True)
data['close'].fillna(method='ffill', inplace=True)

In [103]:
data

Unnamed: 0,date,stkcd,open,high,low,close,volume,money
0,2022-12-01 09:31:00,000004.XSHE,75.87,76.32,75.8,76.25,25947.0,1972620.0
1,2022-12-01 09:32:00,000004.XSHE,76.25,76.25,75.95,75.95,5721.0,435038.0
2,2022-12-01 09:33:00,000004.XSHE,76.1,76.1,75.87,76.02,7763.0,590132.0
3,2022-12-01 09:34:00,000004.XSHE,76.02,76.17,75.95,76.1,11107.0,844782.0
4,2022-12-01 09:35:00,000004.XSHE,76.32,76.54,76.17,76.54,15324.0,1170280.0
...,...,...,...,...,...,...,...,...
1008235,2024-08-22 14:56:00,000006.XSHE,199.33,199.84,199.33,199.33,2492.0,497302.0
1008236,2024-08-22 14:57:00,000006.XSHE,199.33,199.84,199.33,199.33,2266.0,452122.0
1008237,2024-08-22 14:58:00,000006.XSHE,199.84,199.84,199.84,199.84,112.0,22407.0
1008238,2024-08-22 14:59:00,000006.XSHE,199.84,199.84,199.84,199.84,0.0,0.0


我们需要构建三个基础型因子，分别是开盘和尾盘半小时之间的差异：

	1.	lh_rtnDiff: 涨跌幅比值
	2.	lh_volDiff: 成交量之和比值
	3.	lh_stdDiff: 波动率比值

下面是如何根据这些因子构建方法进行代码实现的步骤：

1. 构建因子的代码

首先，我们需要将每个交易日划分为开盘的前半小时和尾盘的后半小时。

步骤：

	1.	开盘前半小时： 9:30 至 10:00
	2.	尾盘后半小时： 14:30 至 15:00

In [104]:
import numpy as np
import pandas as pd

# 确保 'date' 列是时间格式
data['date'] = pd.to_datetime(data['date'])

# 提取日期的天部分，创建新的 'day' 列
data['day'] = data['date'].dt.date

# 定义开盘和尾盘半小时的时间范围
def label_time_period(row):
    hour, minute = row['date'].hour, row['date'].minute
    if (hour == 9 and minute >= 30) or (hour == 10 and minute == 0):
        return 'morning_half_hour'
    elif (hour == 14 and minute >= 30) or (hour == 15 and minute == 0):
        return 'afternoon_half_hour'
    else:
        return 'other'

# 为每行数据打上时间段标签（开盘前半小时、尾盘后半小时等）
data['time_period'] = data.apply(label_time_period, axis=1)

# 分别计算开盘和尾盘半小时的聚合数据，按天和股票代码聚合
morning_data = data[data['time_period'] == 'morning_half_hour'].groupby(['stkcd', 'day']).agg({
    'open': 'first',  # 开盘价
    'close': 'last',  # 收盘价
    'volume': 'sum',  # 成交量总和
    'high': 'max',    # 最高价
    'low': 'min'      # 最低价
}).reset_index()

afternoon_data = data[data['time_period'] == 'afternoon_half_hour'].groupby(['stkcd', 'day']).agg({
    'open': 'first',  # 开盘价
    'close': 'last',  # 收盘价
    'volume': 'sum',  # 成交量总和
    'high': 'max',    # 最高价
    'low': 'min'      # 最低价
}).reset_index()

# 合并早盘和尾盘数据，确保它们能够正确匹配
merged_data = pd.merge(morning_data, afternoon_data, on=['stkcd', 'day'], suffixes=('_morning', '_afternoon'))

# 显示合并后的数据
merged_data

Unnamed: 0,stkcd,day,open_morning,close_morning,volume_morning,high_morning,low_morning,open_afternoon,close_afternoon,volume_afternoon,high_afternoon,low_afternoon
0,000001.XSHE,2022-12-01,1657.9100000000,1635.6100000000,1312148.0000000000,1692.6100000000,1626.9300000000,1628.1700000000,1623.2200000000,305936.0000000000,1629.4100000000,1619.5000000000
1,000001.XSHE,2022-12-02,1628.1700000000,1579.8500000000,910072.0000000000,1629.4100000000,1572.4100000000,1602.1500000000,1598.4400000000,136144.0000000000,1603.3900000000,1598.4400000000
2,000001.XSHE,2022-12-05,1621.9800000000,1635.6100000000,705372.0000000000,1640.5600000000,1612.0700000000,1676.5000000000,1676.5000000000,379124.0000000000,1678.9800000000,1672.7800000000
3,000001.XSHE,2022-12-06,1652.9600000000,1670.3000000000,653014.0000000000,1680.2200000000,1641.8000000000,1656.6700000000,1664.1100000000,195782.0000000000,1665.3500000000,1655.4300000000
4,000001.XSHE,2022-12-07,1651.7200000000,1631.8900000000,394468.0000000000,1656.6700000000,1631.8900000000,1623.2200000000,1629.4100000000,277266.0000000000,1631.8900000000,1623.2200000000
...,...,...,...,...,...,...,...,...,...,...,...,...
3776,000010.XSHE,2024-08-16,23.0,22.47,316485.0,23.26,22.47,22.6,22.6,157264.0,22.73,22.6
3777,000010.XSHE,2024-08-19,22.73,22.6,345615.0,22.73,22.2,22.6,22.6,110321.0,22.73,22.47
3778,000010.XSHE,2024-08-20,22.6,22.6,365495.0,22.86,22.47,22.47,22.6,106704.0,22.6,22.33
3779,000010.XSHE,2024-08-21,22.47,22.6,193916.0,22.6,22.2,22.6,22.47,164132.0,22.6,22.33


In [105]:
# 计算涨跌幅（尾盘收盘价 - 尾盘开盘价）和（开盘收盘价 - 开盘开盘价）
merged_data['rtn_morning'] = (merged_data['close_morning'] - merged_data['open_morning']) / merged_data['open_morning']
merged_data['rtn_afternoon'] = (merged_data['close_afternoon'] - merged_data['open_afternoon']) / merged_data['open_afternoon']

# 计算 lh_rtnDiff 因子
merged_data['lh_rtnDiff'] = merged_data['rtn_afternoon'] - merged_data['rtn_morning']

# 防止 volume_morning 为零，将 volume_morning 中的零值替换为一个很小的值
merged_data['volume_morning'].replace(0, 1e-5, inplace=True)

# 计算 lh_volDiff 因子
merged_data['lh_volDiff'] = merged_data['volume_afternoon'] / merged_data['volume_morning']

# 计算波动率：使用最高价和最低价的差值
merged_data['std_morning'] = (merged_data['high_morning'] - merged_data['low_morning']) / merged_data['open_morning']
merged_data['std_afternoon'] = (merged_data['high_afternoon'] - merged_data['low_afternoon']) / merged_data['open_afternoon']

# 转换 std_morning 和 std_afternoon 列为 float 类型，防止 Decimal 类型的计算错误
merged_data['std_morning'] = merged_data['std_morning'].astype(float)
merged_data['std_afternoon'] = merged_data['std_afternoon'].astype(float)

# 防止除以零，将 std_morning 和 std_afternoon 中的零值替换为一个很小的值
merged_data['std_morning'].replace(0, 1e-5, inplace=True)
merged_data['std_afternoon'].replace(0, 1e-5, inplace=True)

# 计算 lh_stdDiff 因子
merged_data['lh_stdDiff'] = merged_data['std_afternoon'] / merged_data['std_morning']

# 显示结果
result_001 = merged_data[['stkcd', 'day', 'lh_rtnDiff', 'lh_volDiff', 'lh_stdDiff']]

In [106]:
# 直接使用 pandas 的方法显示结果
print(result_001)

            stkcd         day                         lh_rtnDiff  \
0     000001.XSHE  2022-12-01    0.01041044674670095431774733669   
1     000001.XSHE  2022-12-02    0.02736185231876770652175288317   
2     000001.XSHE  2022-12-05  -0.008403309535259374344936435714   
3     000001.XSHE  2022-12-06  -0.005999335358948075581050171356   
4     000001.XSHE  2022-12-07    0.01581907473763802754405459567   
...           ...         ...                                ...   
3776  000010.XSHE  2024-08-16                           0.023043   
3777  000010.XSHE  2024-08-19                           0.005719   
3778  000010.XSHE  2024-08-20                           0.005785   
3779  000010.XSHE  2024-08-21                          -0.011538   
3780  000010.XSHE  2024-08-22                          -0.017913   

                          lh_volDiff  lh_stdDiff  
0     0.2331566256245484503272496700    0.153639  
1     0.1495969549662004764458196714    0.088252  
2     0.5374809320472034614359

In [107]:
merged_data

Unnamed: 0,stkcd,day,open_morning,close_morning,volume_morning,high_morning,low_morning,open_afternoon,close_afternoon,volume_afternoon,high_afternoon,low_afternoon,rtn_morning,rtn_afternoon,lh_rtnDiff,lh_volDiff,std_morning,std_afternoon,lh_stdDiff
0,000001.XSHE,2022-12-01,1657.9100000000,1635.6100000000,1312148.0000000000,1692.6100000000,1626.9300000000,1628.1700000000,1623.2200000000,305936.0000000000,1629.4100000000,1619.5000000000,-0.01345066981923023565814790911,-0.003040223072529281340400572422,0.01041044674670095431774733669,0.2331566256245484503272496700,0.039616,0.006087,0.153639
1,000001.XSHE,2022-12-02,1628.1700000000,1579.8500000000,910072.0000000000,1629.4100000000,1572.4100000000,1602.1500000000,1598.4400000000,136144.0000000000,1603.3900000000,1598.4400000000,-0.02967749067972017664003144635,-0.002315638360952470118278563181,0.02736185231876770652175288317,0.1495969549662004764458196714,0.035009,0.003090,0.088252
2,000001.XSHE,2022-12-05,1621.9800000000,1635.6100000000,705372.0000000000,1640.5600000000,1612.0700000000,1676.5000000000,1676.5000000000,379124.0000000000,1678.9800000000,1672.7800000000,0.008403309535259374344936435714,0,-0.008403309535259374344936435714,0.5374809320472034614359515263,0.017565,0.003698,0.210543
3,000001.XSHE,2022-12-06,1652.9600000000,1670.3000000000,653014.0000000000,1680.2200000000,1641.8000000000,1656.6700000000,1664.1100000000,195782.0000000000,1665.3500000000,1655.4300000000,0.01049027199690252637692382151,0.004490936637954450795873650154,-0.005999335358948075581050171356,0.2998128677179968576477686543,0.023243,0.005988,0.257621
4,000001.XSHE,2022-12-07,1651.7200000000,1631.8900000000,394468.0000000000,1656.6700000000,1631.8900000000,1623.2200000000,1629.4100000000,277266.0000000000,1631.8900000000,1623.2200000000,-0.01200566682004213789262102536,0.003813407917595889651433570311,0.01581907473763802754405459567,0.7028859121652453430950039040,0.015003,0.005341,0.356022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3776,000010.XSHE,2024-08-16,23.0,22.47,316485.0,23.26,22.47,22.6,22.6,157264.0,22.73,22.6,-0.023043,0.0,0.023043,0.496908,0.034348,0.005752,0.167469
3777,000010.XSHE,2024-08-19,22.73,22.6,345615.0,22.73,22.2,22.6,22.6,110321.0,22.73,22.47,-0.005719,0.0,0.005719,0.319202,0.023317,0.011504,0.493388
3778,000010.XSHE,2024-08-20,22.6,22.6,365495.0,22.86,22.47,22.47,22.6,106704.0,22.6,22.33,0.0,0.005785,0.005785,0.291944,0.017257,0.012016,0.696313
3779,000010.XSHE,2024-08-21,22.47,22.6,193916.0,22.6,22.2,22.6,22.47,164132.0,22.6,22.33,0.005785,-0.005752,-0.011538,0.846408,0.017802,0.011947,0.671117


1. RankIC测试：

RankIC 是通过将股票按因子值排序后计算其与未来收益的相关性。我们将使用每日的RankIC进行测试，计算每个因子的均值、标准差、ICIR以及T统计量。

In [108]:
import numpy as np
import pandas as pd
from scipy import stats

# 计算每日收益率
merged_data['daily_return'] = merged_data.groupby('stkcd')['close_morning'].shift(-1) / merged_data['close_morning'] - 1

# 计算每个因子的 RankIC
def calc_rank_ic(factor_values, returns):
    return factor_values.corr(returns, method='spearman')

# 创建因子列表
factor_columns = ['lh_rtnDiff', 'lh_volDiff', 'lh_stdDiff']
ic_values = {factor: [] for factor in factor_columns}

# 计算每个交易日的 RankIC
for day, group in merged_data.groupby('day'):
    for factor in factor_columns:
        ic = calc_rank_ic(group[factor], group['daily_return'])
        ic_values[factor].append(ic)

# 将IC值转为DataFrame
ic_df = pd.DataFrame(ic_values, index=merged_data['day'].unique())

# 计算IC均值、IC标准差、ICIR（IC均值/IC标准差）和T统计量
ic_stats = pd.DataFrame(index=factor_columns)
ic_stats['IC均值'] = ic_df.mean()
ic_stats['IC标准差'] = ic_df.std()
ic_stats['ICIR'] = ic_stats['IC均值'] / ic_stats['IC标准差']
ic_stats['T统计量'] = ic_stats['IC均值'] / (ic_stats['IC标准差'] / np.sqrt(len(ic_df)))

# 输出IC统计结果
print(ic_stats)

                IC均值     IC标准差      ICIR      T统计量
lh_rtnDiff  0.040312  0.402268  0.100211  2.056155
lh_volDiff  0.045191  0.388009  0.116469  2.389752
lh_stdDiff  0.059888  0.393682  0.152122  3.121288


In [109]:
data

Unnamed: 0,date,stkcd,open,high,low,close,volume,money,day,time_period
0,2022-12-01 09:31:00,000004.XSHE,75.87,76.32,75.8,76.25,25947.0,1972620.0,2022-12-01,morning_half_hour
1,2022-12-01 09:32:00,000004.XSHE,76.25,76.25,75.95,75.95,5721.0,435038.0,2022-12-01,morning_half_hour
2,2022-12-01 09:33:00,000004.XSHE,76.1,76.1,75.87,76.02,7763.0,590132.0,2022-12-01,morning_half_hour
3,2022-12-01 09:34:00,000004.XSHE,76.02,76.17,75.95,76.1,11107.0,844782.0,2022-12-01,morning_half_hour
4,2022-12-01 09:35:00,000004.XSHE,76.32,76.54,76.17,76.54,15324.0,1170280.0,2022-12-01,morning_half_hour
...,...,...,...,...,...,...,...,...,...,...
1008235,2024-08-22 14:56:00,000006.XSHE,199.33,199.84,199.33,199.33,2492.0,497302.0,2024-08-22,afternoon_half_hour
1008236,2024-08-22 14:57:00,000006.XSHE,199.33,199.84,199.33,199.33,2266.0,452122.0,2024-08-22,afternoon_half_hour
1008237,2024-08-22 14:58:00,000006.XSHE,199.84,199.84,199.84,199.84,112.0,22407.0,2024-08-22,afternoon_half_hour
1008238,2024-08-22 14:59:00,000006.XSHE,199.84,199.84,199.84,199.84,0.0,0.0,2024-08-22,afternoon_half_hour


In [110]:
merged_data

Unnamed: 0,stkcd,day,open_morning,close_morning,volume_morning,high_morning,low_morning,open_afternoon,close_afternoon,volume_afternoon,high_afternoon,low_afternoon,rtn_morning,rtn_afternoon,lh_rtnDiff,lh_volDiff,std_morning,std_afternoon,lh_stdDiff,daily_return
0,000001.XSHE,2022-12-01,1657.9100000000,1635.6100000000,1312148.0000000000,1692.6100000000,1626.9300000000,1628.1700000000,1623.2200000000,305936.0000000000,1629.4100000000,1619.5000000000,-0.01345066981923023565814790911,-0.003040223072529281340400572422,0.01041044674670095431774733669,0.2331566256245484503272496700,0.039616,0.006087,0.153639,-0.0340912564731201203220816698
1,000001.XSHE,2022-12-02,1628.1700000000,1579.8500000000,910072.0000000000,1629.4100000000,1572.4100000000,1602.1500000000,1598.4400000000,136144.0000000000,1603.3900000000,1598.4400000000,-0.02967749067972017664003144635,-0.002315638360952470118278563181,0.02736185231876770652175288317,0.1495969549662004764458196714,0.035009,0.003090,0.088252,0.035294489983226255657182644
2,000001.XSHE,2022-12-05,1621.9800000000,1635.6100000000,705372.0000000000,1640.5600000000,1612.0700000000,1676.5000000000,1676.5000000000,379124.0000000000,1678.9800000000,1672.7800000000,0.008403309535259374344936435714,0,-0.008403309535259374344936435714,0.5374809320472034614359515263,0.017565,0.003698,0.210543,0.021209212465074192503102818
3,000001.XSHE,2022-12-06,1652.9600000000,1670.3000000000,653014.0000000000,1680.2200000000,1641.8000000000,1656.6700000000,1664.1100000000,195782.0000000000,1665.3500000000,1655.4300000000,0.01049027199690252637692382151,0.004490936637954450795873650154,-0.005999335358948075581050171356,0.2998128677179968576477686543,0.023243,0.005988,0.257621,-0.0229958690055678620607076573
4,000001.XSHE,2022-12-07,1651.7200000000,1631.8900000000,394468.0000000000,1656.6700000000,1631.8900000000,1623.2200000000,1629.4100000000,277266.0000000000,1631.8900000000,1623.2200000000,-0.01200566682004213789262102536,0.003813407917595889651433570311,0.01581907473763802754405459567,0.7028859121652453430950039040,0.015003,0.005341,0.356022,0.009871988920821869121080465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3776,000010.XSHE,2024-08-16,23.0,22.47,316485.0,23.26,22.47,22.6,22.6,157264.0,22.73,22.6,-0.023043,0.0,0.023043,0.496908,0.034348,0.005752,0.167469,0.005785
3777,000010.XSHE,2024-08-19,22.73,22.6,345615.0,22.73,22.2,22.6,22.6,110321.0,22.73,22.47,-0.005719,0.0,0.005719,0.319202,0.023317,0.011504,0.493388,0.0
3778,000010.XSHE,2024-08-20,22.6,22.6,365495.0,22.86,22.47,22.47,22.6,106704.0,22.6,22.33,0.0,0.005785,0.005785,0.291944,0.017257,0.012016,0.696313,0.0
3779,000010.XSHE,2024-08-21,22.47,22.6,193916.0,22.6,22.2,22.6,22.47,164132.0,22.6,22.33,0.005785,-0.005752,-0.011538,0.846408,0.017802,0.011947,0.671117,0.005752


In [111]:
# 过滤出数值类型的列（排除日期、字符串等非数值列）
numeric_columns = merged_data.select_dtypes(include=['number']).columns

# 用每列的平均值替换 NaN 值
merged_data[numeric_columns].fillna(merged_data[numeric_columns].mean(), inplace=True)

# 再次检查 NaN 值是否处理完毕
print(merged_data.isnull().sum())

stkcd                0
day                  0
open_morning        81
close_morning        0
volume_morning       0
high_morning        81
low_morning         81
open_afternoon      81
close_afternoon      0
volume_afternoon     0
high_afternoon      81
low_afternoon       81
rtn_morning         81
rtn_afternoon       81
lh_rtnDiff          81
lh_volDiff           0
std_morning         81
std_afternoon       81
lh_stdDiff          81
daily_return         9
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data[numeric_columns].fillna(merged_data[numeric_columns].mean(), inplace=True)


In [112]:
merged_data

Unnamed: 0,stkcd,day,open_morning,close_morning,volume_morning,high_morning,low_morning,open_afternoon,close_afternoon,volume_afternoon,high_afternoon,low_afternoon,rtn_morning,rtn_afternoon,lh_rtnDiff,lh_volDiff,std_morning,std_afternoon,lh_stdDiff,daily_return
0,000001.XSHE,2022-12-01,1657.9100000000,1635.6100000000,1312148.0000000000,1692.6100000000,1626.9300000000,1628.1700000000,1623.2200000000,305936.0000000000,1629.4100000000,1619.5000000000,-0.01345066981923023565814790911,-0.003040223072529281340400572422,0.01041044674670095431774733669,0.2331566256245484503272496700,0.039616,0.006087,0.153639,-0.0340912564731201203220816698
1,000001.XSHE,2022-12-02,1628.1700000000,1579.8500000000,910072.0000000000,1629.4100000000,1572.4100000000,1602.1500000000,1598.4400000000,136144.0000000000,1603.3900000000,1598.4400000000,-0.02967749067972017664003144635,-0.002315638360952470118278563181,0.02736185231876770652175288317,0.1495969549662004764458196714,0.035009,0.003090,0.088252,0.035294489983226255657182644
2,000001.XSHE,2022-12-05,1621.9800000000,1635.6100000000,705372.0000000000,1640.5600000000,1612.0700000000,1676.5000000000,1676.5000000000,379124.0000000000,1678.9800000000,1672.7800000000,0.008403309535259374344936435714,0,-0.008403309535259374344936435714,0.5374809320472034614359515263,0.017565,0.003698,0.210543,0.021209212465074192503102818
3,000001.XSHE,2022-12-06,1652.9600000000,1670.3000000000,653014.0000000000,1680.2200000000,1641.8000000000,1656.6700000000,1664.1100000000,195782.0000000000,1665.3500000000,1655.4300000000,0.01049027199690252637692382151,0.004490936637954450795873650154,-0.005999335358948075581050171356,0.2998128677179968576477686543,0.023243,0.005988,0.257621,-0.0229958690055678620607076573
4,000001.XSHE,2022-12-07,1651.7200000000,1631.8900000000,394468.0000000000,1656.6700000000,1631.8900000000,1623.2200000000,1629.4100000000,277266.0000000000,1631.8900000000,1623.2200000000,-0.01200566682004213789262102536,0.003813407917595889651433570311,0.01581907473763802754405459567,0.7028859121652453430950039040,0.015003,0.005341,0.356022,0.009871988920821869121080465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3776,000010.XSHE,2024-08-16,23.0,22.47,316485.0,23.26,22.47,22.6,22.6,157264.0,22.73,22.6,-0.023043,0.0,0.023043,0.496908,0.034348,0.005752,0.167469,0.005785
3777,000010.XSHE,2024-08-19,22.73,22.6,345615.0,22.73,22.2,22.6,22.6,110321.0,22.73,22.47,-0.005719,0.0,0.005719,0.319202,0.023317,0.011504,0.493388,0.0
3778,000010.XSHE,2024-08-20,22.6,22.6,365495.0,22.86,22.47,22.47,22.6,106704.0,22.6,22.33,0.0,0.005785,0.005785,0.291944,0.017257,0.012016,0.696313,0.0
3779,000010.XSHE,2024-08-21,22.47,22.6,193916.0,22.6,22.2,22.6,22.47,164132.0,22.6,22.33,0.005785,-0.005752,-0.011538,0.846408,0.017802,0.011947,0.671117,0.005752


### 2.4

In [77]:
!pip install ace_tools



In [113]:
import numpy as np
import pandas as pd

# 示例收益率数据，替换为你的实际数据
head_rtn = np.random.randn(100) / 100  # 多头收益率
tail_rtn = np.random.randn(100) / 100  # 空头收益率

# 计算多空收益率
long_short_rtn = head_rtn - tail_rtn

# 检查并处理NaN，避免计算过程中出现NaN
head_rtn = np.nan_to_num(head_rtn)
tail_rtn = np.nan_to_num(tail_rtn)
long_short_rtn = np.nan_to_num(long_short_rtn)

# 定义计算年化波动率、夏普比率和最大回撤的函数
def calculate_metrics(returns, risk_free_rate=0):
    """计算年化波动率、夏普比率和最大回撤"""
    if len(returns) == 0 or np.std(returns) == 0:
        return 0, 0, 0  # 如果没有有效数据，返回0
    # 年化波动率
    annual_volatility = np.std(returns) * np.sqrt(252)
    # 夏普比率
    sharpe_ratio = (np.mean(returns) - risk_free_rate) / np.std(returns) * np.sqrt(252)
    # 计算累计收益率
    cumulative_returns = np.cumsum(returns)
    peak = np.maximum.accumulate(cumulative_returns)
    drawdown = peak - cumulative_returns
    max_drawdown = np.max(drawdown)
    return annual_volatility, sharpe_ratio, max_drawdown

# 计算各个收益率的指标
head_metrics = calculate_metrics(head_rtn)
tail_metrics = calculate_metrics(tail_rtn)
long_short_metrics = calculate_metrics(long_short_rtn)

# 确保所有的收益率计算结果都没有NaN，用0代替
head_avg_rtn = np.nan_to_num(np.mean(head_rtn), nan=0)
tail_avg_rtn = np.nan_to_num(np.mean(tail_rtn), nan=0)
long_short_avg_rtn = np.nan_to_num(np.mean(long_short_rtn), nan=0)

# 创建DataFrame来展示结果
results_002 = pd.DataFrame({
    '年化波动率': [head_metrics[0], tail_metrics[0], long_short_metrics[0]],
    '夏普比率': [head_metrics[1], tail_metrics[1], long_short_metrics[1]],
    '最大回撤': [head_metrics[2], tail_metrics[2], long_short_metrics[2]],
    '多头收益率': [head_avg_rtn, np.nan, np.nan],
    '空头收益率': [np.nan, tail_avg_rtn, np.nan],
    '多空收益率': [np.nan, np.nan, long_short_avg_rtn]
}, index=['多头', '空头', '多空'])

# 使用 fillna 方法替换剩余的 NaN
results_002.fillna(0, inplace=True)

In [115]:
# 直接使用 pandas 的方法显示结果
print(results_002)

       年化波动率      夏普比率      最大回撤     多头收益率     空头收益率     多空收益率
多头  0.169335 -0.734557  0.150622 -0.000494  0.000000  0.000000
空头  0.144137  3.737937  0.037849  0.000000  0.002138  0.000000
多空  0.230685 -2.874747  0.316423  0.000000  0.000000 -0.002632


In [116]:
# %% RankIC 测试
# 假设 diff_idx, diff_std, diff_vol 是我们构建的因子
factor_columns = ['lh_rtnDiff', 'lh_volDiff', 'lh_stdDiff']

# 初始化保存 IC 值的字典
ic_values = {factor: [] for factor in factor_columns}

# 计算每个交易日的 RankIC
for day, group in merged_data.groupby('day'):
    for factor in factor_columns:
        ic = calc_rank_ic(group[factor], group['daily_return'])
        ic_values[factor].append(ic)

# 将 IC 值转为 DataFrame
ic_df = pd.DataFrame(ic_values, index=merged_data['day'].unique())

# 计算 IC 的均值、标准差、ICIR 和 T 统计量
ic_stats = pd.DataFrame(index=factor_columns)
ic_stats['IC均值'] = ic_df.mean()
ic_stats['IC标准差'] = ic_df.std()
ic_stats['ICIR'] = ic_stats['IC均值'] / ic_stats['IC标准差']
ic_stats['T统计量'] = ic_stats['IC均值'] / (ic_stats['IC标准差'] / np.sqrt(len(ic_df)))

# 输出 RankIC 统计结果
print(ic_stats)

                IC均值     IC标准差      ICIR      T统计量
lh_rtnDiff  0.040312  0.402268  0.100211  2.056155
lh_volDiff  0.045191  0.388009  0.116469  2.389752
lh_stdDiff  0.059888  0.393682  0.152122  3.121288


### 3.2

In [117]:
import numpy as np
import pandas as pd

# 假设已有数据 'data'，包括时间、开盘价、收盘价、成交量等列
data['date'] = pd.to_datetime(data['date'])
data['day'] = data['date'].dt.date

# 分域：基于时间的分域（半小时区间）
def label_time_period(row):
    hour, minute = row['date'].hour, row['date'].minute
    if (hour == 9 and minute >= 30) or (hour == 10 and minute == 0):
        return 'morning_half_hour'
    elif (hour == 14 and minute >= 30) or (hour == 15 and minute == 0):
        return 'afternoon_half_hour'
    else:
        return 'other'

data['time_period'] = data.apply(label_time_period, axis=1)

# 对每个半小时段计算波动率和成交量等特征
agg_data = data.groupby(['stkcd', 'time_period']).agg({
    'close': 'last',  # 收盘价
    'open': 'first',  # 开盘价
    'volume': 'sum',  # 成交量总和
    'high': 'max',    # 最高价
    'low': 'min'      # 最低价
}).reset_index()

# 计算收益率和波动率
agg_data['return'] = (agg_data['close'] - agg_data['open']) / agg_data['open']
agg_data['volatility'] = (agg_data['high'] - agg_data['low']) / agg_data['open']

# 按照公式计算分域内的收益率波动率（gcutCHR)
agg_data['gcutCHR'] = agg_data['volatility'] * 100  # 简化公式示意

# 显示分域特征数据
print(agg_data)

# 输出结果包含每个半小时区间的收益率、波动率和成交量特征

          stkcd          time_period            close             open  \
0   000001.XSHE  afternoon_half_hour  1423.8700000000  1628.1700000000   
1   000001.XSHE    morning_half_hour  1412.9900000000  1657.9100000000   
2   000001.XSHE                other  1422.5100000000  1635.6100000000   
3   000002.XSHE  afternoon_half_hour          1092.63          3003.65   
4   000002.XSHE    morning_half_hour          1104.45          2950.65   
5   000002.XSHE                other          1094.32          3027.75   
6   000004.XSHE  afternoon_half_hour            78.03            76.54   
7   000004.XSHE    morning_half_hour            79.37            75.87   
8   000004.XSHE                other            77.36            76.92   
9   000005.XSHE  afternoon_half_hour             8.49            19.12   
10  000005.XSHE    morning_half_hour             8.49            19.42   
11  000005.XSHE                other             8.49            19.32   
12  000006.XSHE  afternoon_half_hour  

	1.	显著性算法计算：
	•	根据公式  intraDS = \max \left( \frac{|qcutCHR_i - \text{avg}(qcutCHR)|}{|qcutCHR_i| + |\text{avg}(qcutCHR)| + \epsilon} \right) ，我们可以计算显著性因子。
	•	将该算法应用于每个分域的量价特征，例如波动率、收益率等。
	2.	构建因子：
	•	根据表9中的因子汇总，基于不同的特征（时间、价格、成交量），我们将构建对应的因子：
	•	时间：收益率因子 intraDSRtn_byT，成交量因子 intraDSVol_byT。
	•	价格：波动率因子 intraDSStd_byP，成交量因子 intraDSVol_byP。
	•	成交量：波动率因子 intraDSStd_byV，收益率因子 intraDSRtn_byV。
	3.	移动平均：
	•	对每个显著性因子进行20日移动均值计算，用于后续的测试。

### 3.3

In [118]:
merged_data

Unnamed: 0,stkcd,day,open_morning,close_morning,volume_morning,high_morning,low_morning,open_afternoon,close_afternoon,volume_afternoon,high_afternoon,low_afternoon,rtn_morning,rtn_afternoon,lh_rtnDiff,lh_volDiff,std_morning,std_afternoon,lh_stdDiff,daily_return
0,000001.XSHE,2022-12-01,1657.9100000000,1635.6100000000,1312148.0000000000,1692.6100000000,1626.9300000000,1628.1700000000,1623.2200000000,305936.0000000000,1629.4100000000,1619.5000000000,-0.01345066981923023565814790911,-0.003040223072529281340400572422,0.01041044674670095431774733669,0.2331566256245484503272496700,0.039616,0.006087,0.153639,-0.0340912564731201203220816698
1,000001.XSHE,2022-12-02,1628.1700000000,1579.8500000000,910072.0000000000,1629.4100000000,1572.4100000000,1602.1500000000,1598.4400000000,136144.0000000000,1603.3900000000,1598.4400000000,-0.02967749067972017664003144635,-0.002315638360952470118278563181,0.02736185231876770652175288317,0.1495969549662004764458196714,0.035009,0.003090,0.088252,0.035294489983226255657182644
2,000001.XSHE,2022-12-05,1621.9800000000,1635.6100000000,705372.0000000000,1640.5600000000,1612.0700000000,1676.5000000000,1676.5000000000,379124.0000000000,1678.9800000000,1672.7800000000,0.008403309535259374344936435714,0,-0.008403309535259374344936435714,0.5374809320472034614359515263,0.017565,0.003698,0.210543,0.021209212465074192503102818
3,000001.XSHE,2022-12-06,1652.9600000000,1670.3000000000,653014.0000000000,1680.2200000000,1641.8000000000,1656.6700000000,1664.1100000000,195782.0000000000,1665.3500000000,1655.4300000000,0.01049027199690252637692382151,0.004490936637954450795873650154,-0.005999335358948075581050171356,0.2998128677179968576477686543,0.023243,0.005988,0.257621,-0.0229958690055678620607076573
4,000001.XSHE,2022-12-07,1651.7200000000,1631.8900000000,394468.0000000000,1656.6700000000,1631.8900000000,1623.2200000000,1629.4100000000,277266.0000000000,1631.8900000000,1623.2200000000,-0.01200566682004213789262102536,0.003813407917595889651433570311,0.01581907473763802754405459567,0.7028859121652453430950039040,0.015003,0.005341,0.356022,0.009871988920821869121080465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3776,000010.XSHE,2024-08-16,23.0,22.47,316485.0,23.26,22.47,22.6,22.6,157264.0,22.73,22.6,-0.023043,0.0,0.023043,0.496908,0.034348,0.005752,0.167469,0.005785
3777,000010.XSHE,2024-08-19,22.73,22.6,345615.0,22.73,22.2,22.6,22.6,110321.0,22.73,22.47,-0.005719,0.0,0.005719,0.319202,0.023317,0.011504,0.493388,0.0
3778,000010.XSHE,2024-08-20,22.6,22.6,365495.0,22.86,22.47,22.47,22.6,106704.0,22.6,22.33,0.0,0.005785,0.005785,0.291944,0.017257,0.012016,0.696313,0.0
3779,000010.XSHE,2024-08-21,22.47,22.6,193916.0,22.6,22.2,22.6,22.47,164132.0,22.6,22.33,0.005785,-0.005752,-0.011538,0.846408,0.017802,0.011947,0.671117,0.005752


In [119]:
import numpy as np
import pandas as pd

# 假设 merged_data 已经加载，以下是基于最高价和最低价计算波动率的步骤

# 计算早盘和尾盘的波动率
merged_data['volatility_morning'] = (merged_data['high_morning'] - merged_data['low_morning']) / merged_data['open_morning']
merged_data['volatility_afternoon'] = (merged_data['high_afternoon'] - merged_data['low_afternoon']) / merged_data['open_afternoon']

# 计算平均波动率（全日的平均值）
merged_data['avg_volatility_morning'] = merged_data.groupby('stkcd')['volatility_morning'].transform('mean')
merged_data['avg_volatility_afternoon'] = merged_data.groupby('stkcd')['volatility_afternoon'].transform('mean')

In [120]:
import numpy as np
import pandas as pd

# 定义显著性计算公式，并确保所有输入都是 float 类型
def calculate_intraDS(qcutCHR, avg_qcutCHR):
    epsilon = 1e-5  # 防止除以零的微小数值
    
    # 将 Decimal 转换为 float，以避免计算错误
    qcutCHR = qcutCHR.astype(float)
    avg_qcutCHR = avg_qcutCHR.astype(float)
    
    return np.abs(qcutCHR - avg_qcutCHR) / (np.abs(qcutCHR) + np.abs(avg_qcutCHR) + epsilon)

# 计算不同维度下的显著性因子
merged_data['intraDSRtn_byT'] = calculate_intraDS(merged_data['rtn_morning'], merged_data['rtn_afternoon'])
merged_data['intraDSVol_byT'] = calculate_intraDS(merged_data['volume_afternoon'], merged_data['volume_morning'])

merged_data['intraDSStd_byP'] = calculate_intraDS(merged_data['std_morning'], merged_data['std_afternoon'])
merged_data['intraDSVol_byP'] = calculate_intraDS(merged_data['volume_afternoon'], merged_data['volume_morning'])

merged_data['intraDSStd_byV'] = calculate_intraDS(merged_data['std_afternoon'], merged_data['std_morning'])
merged_data['intraDSRtn_byV'] = calculate_intraDS(merged_data['rtn_afternoon'], merged_data['rtn_morning'])

# 计算20日移动均值
for col in ['intraDSRtn_byT', 'intraDSVol_byT', 'intraDSStd_byP', 'intraDSVol_byP', 'intraDSStd_byV', 'intraDSRtn_byV']:
    merged_data[f'{col}_20d_mean'] = merged_data.groupby('stkcd')[col].transform(lambda x: x.rolling(window=20).mean())

# 显示结果
print(merged_data[['stkcd', 'day', 'intraDSRtn_byT_20d_mean', 'intraDSVol_byT_20d_mean', 'intraDSStd_byP_20d_mean']].head())

         stkcd         day  intraDSRtn_byT_20d_mean  intraDSVol_byT_20d_mean  \
0  000001.XSHE  2022-12-01                      NaN                      NaN   
1  000001.XSHE  2022-12-02                      NaN                      NaN   
2  000001.XSHE  2022-12-05                      NaN                      NaN   
3  000001.XSHE  2022-12-06                      NaN                      NaN   
4  000001.XSHE  2022-12-07                      NaN                      NaN   

   intraDSStd_byP_20d_mean  
0                      NaN  
1                      NaN  
2                      NaN  
3                      NaN  
4                      NaN  


In [121]:
# 检查缺失值
print(merged_data.isnull().sum())

# 使用前向填充处理缺失值
merged_data.fillna(method='ffill', inplace=True)

# 再次计算20日移动均值
for col in ['intraDSRtn_byT', 'intraDSVol_byT', 'intraDSStd_byP', 'intraDSVol_byP', 'intraDSStd_byV', 'intraDSRtn_byV']:
    merged_data[f'{col}_20d_mean'] = merged_data.groupby('stkcd')[col].transform(lambda x: x.rolling(window=20, min_periods=1).mean())

# 显示结果
print(merged_data[['stkcd', 'day', 'intraDSRtn_byT_20d_mean', 'intraDSVol_byT_20d_mean', 'intraDSStd_byP_20d_mean']].head())

stkcd                         0
day                           0
open_morning                 81
close_morning                 0
volume_morning                0
high_morning                 81
low_morning                  81
open_afternoon               81
close_afternoon               0
volume_afternoon              0
high_afternoon               81
low_afternoon                81
rtn_morning                  81
rtn_afternoon                81
lh_rtnDiff                   81
lh_volDiff                    0
std_morning                  81
std_afternoon                81
lh_stdDiff                   81
daily_return                  9
volatility_morning           81
volatility_afternoon         81
avg_volatility_morning        0
avg_volatility_afternoon      0
intraDSRtn_byT               81
intraDSVol_byT                0
intraDSStd_byP               81
intraDSVol_byP                0
intraDSStd_byV               81
intraDSRtn_byV               81
intraDSRtn_byT_20d_mean     252
intraDSV

	1.	Rank IC 测试:
	•	根据不同因子的日度收益率相关性，计算 Rank IC 的均值、标准差、ICIR 和 T 统计量。
	2.	多空组合测试:
	•	计算每个因子的多空收益率、多头收益率、空头收益率、夏普比率、年化波动率和最大回撤。

### 3.4

In [122]:
merged_data

Unnamed: 0,stkcd,day,open_morning,close_morning,volume_morning,high_morning,low_morning,open_afternoon,close_afternoon,volume_afternoon,...,intraDSStd_byP,intraDSVol_byP,intraDSStd_byV,intraDSRtn_byV,intraDSRtn_byT_20d_mean,intraDSVol_byT_20d_mean,intraDSStd_byP_20d_mean,intraDSVol_byP_20d_mean,intraDSStd_byV_20d_mean,intraDSRtn_byV_20d_mean
0,000001.XSHE,2022-12-01,1657.9100000000,1635.6100000000,1312148.0000000000,1692.6100000000,1626.9300000000,1628.1700000000,1623.2200000000,305936.0000000000,...,0.733484,0.621854,0.733484,0.630902,0.630902,0.621854,0.733484,0.621854,0.733484,0.630902
1,000001.XSHE,2022-12-02,1628.1700000000,1579.8500000000,910072.0000000000,1629.4100000000,1572.4100000000,1602.1500000000,1598.4400000000,136144.0000000000,...,0.837589,0.739740,0.837589,0.854974,0.742938,0.680797,0.785536,0.680797,0.785536,0.742938
2,000001.XSHE,2022-12-05,1621.9800000000,1635.6100000000,705372.0000000000,1640.5600000000,1612.0700000000,1676.5000000000,1676.5000000000,379124.0000000000,...,0.651844,0.300829,0.651844,0.998811,0.828229,0.554141,0.740972,0.554141,0.740972,0.828229
3,000001.XSHE,2022-12-06,1652.9600000000,1670.3000000000,653014.0000000000,1680.2200000000,1641.8000000000,1656.6700000000,1664.1100000000,195782.0000000000,...,0.590103,0.538683,0.590103,0.400190,0.721219,0.550277,0.703255,0.550277,0.703255,0.721219
4,000001.XSHE,2022-12-07,1651.7200000000,1631.8900000000,394468.0000000000,1656.6700000000,1631.8900000000,1623.2200000000,1629.4100000000,277266.0000000000,...,0.474669,0.174477,0.474669,0.999368,0.776849,0.475117,0.657538,0.475117,0.657538,0.776849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3776,000010.XSHE,2024-08-16,23.0,22.47,316485.0,23.26,22.47,22.6,22.6,157264.0,...,0.712929,0.336087,0.712929,0.999566,0.779722,0.381373,0.429422,0.381373,0.429422,0.779722
3777,000010.XSHE,2024-08-19,22.73,22.6,345615.0,22.73,22.2,22.6,22.6,110321.0,...,0.339139,0.516068,0.339139,0.998255,0.779656,0.391774,0.410366,0.391774,0.410366,0.779656
3778,000010.XSHE,2024-08-20,22.6,22.6,365495.0,22.86,22.47,22.47,22.6,106704.0,...,0.178967,0.548055,0.178967,0.998275,0.779582,0.392318,0.379046,0.392318,0.379046,0.779582
3779,000010.XSHE,2024-08-21,22.47,22.6,193916.0,22.6,22.2,22.6,22.47,164132.0,...,0.196738,0.083184,0.196738,0.999134,0.779559,0.379128,0.367315,0.379128,0.367315,0.779559


In [130]:
# Assuming 'data' is your DataFrame, save it to your desktop
save_path = '/Users/zhangrui/Desktop/data_modified001.csv'

# Save the DataFrame as a CSV file
merged_data.to_csv(save_path, index=False)

In [127]:
# 查看每一列的数据类型，确保因子列和 'daily_return' 是数值型
print(merged_data.dtypes)

stkcd                        object
day                          object
open_morning                 object
close_morning                object
volume_morning               object
high_morning                 object
low_morning                  object
open_afternoon               object
close_afternoon              object
volume_afternoon             object
high_afternoon               object
low_afternoon                object
rtn_morning                  object
rtn_afternoon                object
lh_rtnDiff                   object
lh_volDiff                   object
std_morning                 float64
std_afternoon               float64
lh_stdDiff                  float64
daily_return                 object
volatility_morning           object
volatility_afternoon         object
avg_volatility_morning      float64
avg_volatility_afternoon    float64
intraDSRtn_byT              float64
intraDSVol_byT              float64
intraDSStd_byP              float64
intraDSVol_byP              

In [128]:
# 查看 merged_data 中的缺失值情况
print(merged_data.isnull().sum())

stkcd                       0
day                         0
open_morning                0
close_morning               0
volume_morning              0
high_morning                0
low_morning                 0
open_afternoon              0
close_afternoon             0
volume_afternoon            0
high_afternoon              0
low_afternoon               0
rtn_morning                 0
rtn_afternoon               0
lh_rtnDiff                  0
lh_volDiff                  0
std_morning                 0
std_afternoon               0
lh_stdDiff                  0
daily_return                0
volatility_morning          0
volatility_afternoon        0
avg_volatility_morning      0
avg_volatility_afternoon    0
intraDSRtn_byT              0
intraDSVol_byT              0
intraDSStd_byP              0
intraDSVol_byP              0
intraDSStd_byV              0
intraDSRtn_byV              0
intraDSRtn_byT_20d_mean     0
intraDSVol_byT_20d_mean     0
intraDSStd_byP_20d_mean     0
intraDSVol

In [129]:
merged_data

Unnamed: 0,stkcd,day,open_morning,close_morning,volume_morning,high_morning,low_morning,open_afternoon,close_afternoon,volume_afternoon,...,intraDSStd_byP,intraDSVol_byP,intraDSStd_byV,intraDSRtn_byV,intraDSRtn_byT_20d_mean,intraDSVol_byT_20d_mean,intraDSStd_byP_20d_mean,intraDSVol_byP_20d_mean,intraDSStd_byV_20d_mean,intraDSRtn_byV_20d_mean
0,000001.XSHE,2022-12-01,1657.9100000000,1635.6100000000,1312148.0000000000,1692.6100000000,1626.9300000000,1628.1700000000,1623.2200000000,305936.0000000000,...,0.733484,0.621854,0.733484,0.630902,0.630902,0.621854,0.733484,0.621854,0.733484,0.630902
1,000001.XSHE,2022-12-02,1628.1700000000,1579.8500000000,910072.0000000000,1629.4100000000,1572.4100000000,1602.1500000000,1598.4400000000,136144.0000000000,...,0.837589,0.739740,0.837589,0.854974,0.742938,0.680797,0.785536,0.680797,0.785536,0.742938
2,000001.XSHE,2022-12-05,1621.9800000000,1635.6100000000,705372.0000000000,1640.5600000000,1612.0700000000,1676.5000000000,1676.5000000000,379124.0000000000,...,0.651844,0.300829,0.651844,0.998811,0.828229,0.554141,0.740972,0.554141,0.740972,0.828229
3,000001.XSHE,2022-12-06,1652.9600000000,1670.3000000000,653014.0000000000,1680.2200000000,1641.8000000000,1656.6700000000,1664.1100000000,195782.0000000000,...,0.590103,0.538683,0.590103,0.400190,0.721219,0.550277,0.703255,0.550277,0.703255,0.721219
4,000001.XSHE,2022-12-07,1651.7200000000,1631.8900000000,394468.0000000000,1656.6700000000,1631.8900000000,1623.2200000000,1629.4100000000,277266.0000000000,...,0.474669,0.174477,0.474669,0.999368,0.776849,0.475117,0.657538,0.475117,0.657538,0.776849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3776,000010.XSHE,2024-08-16,23.0,22.47,316485.0,23.26,22.47,22.6,22.6,157264.0,...,0.712929,0.336087,0.712929,0.999566,0.779722,0.381373,0.429422,0.381373,0.429422,0.779722
3777,000010.XSHE,2024-08-19,22.73,22.6,345615.0,22.73,22.2,22.6,22.6,110321.0,...,0.339139,0.516068,0.339139,0.998255,0.779656,0.391774,0.410366,0.391774,0.410366,0.779656
3778,000010.XSHE,2024-08-20,22.6,22.6,365495.0,22.86,22.47,22.47,22.6,106704.0,...,0.178967,0.548055,0.178967,0.998275,0.779582,0.392318,0.379046,0.392318,0.379046,0.779582
3779,000010.XSHE,2024-08-21,22.47,22.6,193916.0,22.6,22.2,22.6,22.47,164132.0,...,0.196738,0.083184,0.196738,0.999134,0.779559,0.379128,0.367315,0.379128,0.367315,0.779559


In [96]:
# 将因子列和 'daily_return' 转换为数值类型并填充缺失值
for col in factor_columns + ['daily_return']:
    merged_data[col] = pd.to_numeric(merged_data[col], errors='coerce')  # 将无法转换的值设置为 NaN
    merged_data[col].fillna(0, inplace=True)  # 用 0 填充 NaN 值

In [97]:
# 定义因子列
factor_columns = ['intraDSVol_byT', 'intraDSRtn_byT', 'intraDSStd_byP', 'intraDSVol_byP', 'intraDSRtn_byV', 'intraDSStd_byV']

# 初始化 Rank IC 结果数据框
ic_results = pd.DataFrame(index=factor_columns, columns=['IC均值', 'IC标准差', 'ICIR', 'T统计量'])

# 计算每个因子的 Rank IC
for factor in factor_columns:
    daily_ic = []
    for day, group in merged_data.groupby('day'):
        if group[factor].dtype == 'float64':  # 确保因子列是 float 类型
            # 仅在存在非 NaN 值时计算 Rank IC
            if group[factor].notna().sum() > 0 and group['daily_return'].notna().sum() > 0:
                ic = group[factor].corr(group['daily_return'], method='spearman')
                daily_ic.append(ic)
    
    if len(daily_ic) > 0:  # 确保存在有效数据进行计算
        ic_results.loc[factor, 'IC均值'] = np.mean(daily_ic)
        ic_results.loc[factor, 'IC标准差'] = np.std(daily_ic)
        ic_results.loc[factor, 'ICIR'] = ic_results.loc[factor, 'IC均值'] / ic_results.loc[factor, 'IC标准差']
        ic_results.loc[factor, 'T统计量'] = ic_results.loc[factor, 'IC均值'] / (ic_results.loc[factor, 'IC标准差'] / np.sqrt(len(daily_ic)))

# 输出 Rank IC 结果
print("Rank IC 测试结果")
print(ic_results)

Rank IC 测试结果
               IC均值 IC标准差 ICIR T统计量
intraDSVol_byT  NaN   NaN  NaN  NaN
intraDSRtn_byT  NaN   NaN  NaN  NaN
intraDSStd_byP  NaN   NaN  NaN  NaN
intraDSVol_byP  NaN   NaN  NaN  NaN
intraDSRtn_byV  NaN   NaN  NaN  NaN
intraDSStd_byV  NaN   NaN  NaN  NaN


In [98]:
# 将所有应该为数值的列转换为 float64 类型，忽略无法转换的值
for col in merged_data.columns:
    if merged_data[col].dtype == 'object':
        # 尝试将列转换为数值类型
        merged_data[col] = pd.to_numeric(merged_data[col], errors='coerce')

In [99]:
# 用每列的平均值填充 NaN 值
merged_data.fillna(merged_data.mean(), inplace=True)

In [100]:
# 定义因子列
factor_columns = ['intraDSVol_byT', 'intraDSRtn_byT', 'intraDSStd_byP', 'intraDSVol_byP', 'intraDSRtn_byV', 'intraDSStd_byV']

# 初始化 Rank IC 结果数据框
ic_results = pd.DataFrame(index=factor_columns, columns=['IC均值', 'IC标准差', 'ICIR', 'T统计量'])

# 计算每个因子的 Rank IC
for factor in factor_columns:
    daily_ic = []
    for day, group in merged_data.groupby('day'):
        if group[factor].dtype == 'float64':  # 确保因子列是 float 类型
            # 仅在存在非 NaN 值时计算 Rank IC
            if group[factor].notna().sum() > 0 and group['daily_return'].notna().sum() > 0:
                ic = group[factor].corr(group['daily_return'], method='spearman')
                daily_ic.append(ic)
    
    if len(daily_ic) > 0:  # 确保存在有效数据进行计算
        ic_results.loc[factor, 'IC均值'] = np.mean(daily_ic)
        ic_results.loc[factor, 'IC标准差'] = np.std(daily_ic)
        ic_results.loc[factor, 'ICIR'] = ic_results.loc[factor, 'IC均值'] / ic_results.loc[factor, 'IC标准差']
        ic_results.loc[factor, 'T统计量'] = ic_results.loc[factor, 'IC均值'] / (ic_results.loc[factor, 'IC标准差'] / np.sqrt(len(daily_ic)))

# 输出 Rank IC 结果
print("Rank IC 测试结果")
print(ic_results)

Rank IC 测试结果
               IC均值 IC标准差 ICIR T统计量
intraDSVol_byT  NaN   NaN  NaN  NaN
intraDSRtn_byT  NaN   NaN  NaN  NaN
intraDSStd_byP  NaN   NaN  NaN  NaN
intraDSVol_byP  NaN   NaN  NaN  NaN
intraDSRtn_byV  NaN   NaN  NaN  NaN
intraDSStd_byV  NaN   NaN  NaN  NaN


In [91]:
# 多空组合测试函数
def calculate_long_short_performance(data, factor):
    # 排序股票，按因子值分为多头（Top 30%）和空头（Bottom 30%）
    long_stocks = data[data[factor] >= data[factor].quantile(0.7)]
    short_stocks = data[data[factor] <= data[factor].quantile(0.3)]
    
    # 计算多头和空头组合的收益率
    long_rtn = long_stocks['daily_return'].mean()
    short_rtn = short_stocks['daily_return'].mean()
    
    # 多空组合收益率
    long_short_rtn = long_rtn - short_rtn
    
    # 计算年化波动率和夏普比率
    annual_volatility = np.std(data['daily_return']) * np.sqrt(252)
    sharpe_ratio = (long_short_rtn - 0) / np.std(data['daily_return']) * np.sqrt(252)
    
    # 最大回撤（简单计算）
    cumulative_returns = (1 + data['daily_return']).cumprod()
    peak = cumulative_returns.cummax()
    drawdown = peak - cumulative_returns
    max_drawdown = drawdown.max()
    
    return long_rtn, short_rtn, long_short_rtn, annual_volatility, sharpe_ratio, max_drawdown

# 初始化组合测试结果数据框
performance_results = pd.DataFrame(columns=['多头收益率', '空头收益率', '多空收益率', '年化波动率', '夏普比率', '最大回撤'])

# 计算每个因子的多空组合表现
for factor in factor_columns:
    long_rtn, short_rtn, long_short_rtn, annual_vol, sharpe_ratio, max_dd = calculate_long_short_performance(merged_data, factor)
    performance_results.loc[factor] = [long_rtn, short_rtn, long_short_rtn, annual_vol, sharpe_ratio, max_dd]

# 输出组合测试结果
print("组合测试结果")
print(performance_results)

TypeError: float() argument must be a string or a number, not 'datetime.date'