### 导入数据

In [10]:
import pandas as pd
import os

# 指定包含Feather文件的目录路径
directory_path = '/Users/zhangrui/Desktop/励京资本/A股分钟'

# 获取目录下所有.feather文件
file_paths = [os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith('.feather')]

# 读取数据并整合
all_data = []

for file_path in file_paths:
    # 读取每个feather文件
    df = pd.read_feather(file_path)
    all_data.append(df)

# 将所有数据拼接成一个完整的DataFrame
data = pd.concat(all_data, ignore_index=True)

# 数据预处理：处理缺失值、去噪声等
data['volume'].fillna(0, inplace=True)
data['close'].fillna(method='ffill', inplace=True)

In [11]:
data

Unnamed: 0,date,stkcd,open,high,low,close,volume,money
0,2022-12-01 09:31:00,000004.XSHE,75.87,76.32,75.8,76.25,25947.0,1972620.0
1,2022-12-01 09:32:00,000004.XSHE,76.25,76.25,75.95,75.95,5721.0,435038.0
2,2022-12-01 09:33:00,000004.XSHE,76.1,76.1,75.87,76.02,7763.0,590132.0
3,2022-12-01 09:34:00,000004.XSHE,76.02,76.17,75.95,76.1,11107.0,844782.0
4,2022-12-01 09:35:00,000004.XSHE,76.32,76.54,76.17,76.54,15324.0,1170280.0
...,...,...,...,...,...,...,...,...
1008235,2024-08-22 14:56:00,000006.XSHE,199.33,199.84,199.33,199.33,2492.0,497302.0
1008236,2024-08-22 14:57:00,000006.XSHE,199.33,199.84,199.33,199.33,2266.0,452122.0
1008237,2024-08-22 14:58:00,000006.XSHE,199.84,199.84,199.84,199.84,112.0,22407.0
1008238,2024-08-22 14:59:00,000006.XSHE,199.84,199.84,199.84,199.84,0.0,0.0


我们需要构建三个基础型因子，分别是开盘和尾盘半小时之间的差异：

	1.	lh_rtnDiff: 涨跌幅比值
	2.	lh_volDiff: 成交量之和比值
	3.	lh_stdDiff: 波动率比值

下面是如何根据这些因子构建方法进行代码实现的步骤：

1. 构建因子的代码

首先，我们需要将每个交易日划分为开盘的前半小时和尾盘的后半小时。

步骤：

	1.	开盘前半小时： 9:30 至 10:00
	2.	尾盘后半小时： 14:30 至 15:00

In [12]:
import numpy as np
import pandas as pd

# 确保 'date' 列是时间格式
data['date'] = pd.to_datetime(data['date'])

# 提取日期的天部分，创建新的 'day' 列
data['day'] = data['date'].dt.date

# 定义开盘和尾盘半小时的时间范围
def label_time_period(row):
    hour, minute = row['date'].hour, row['date'].minute
    if (hour == 9 and minute >= 30) or (hour == 10 and minute == 0):
        return 'morning_half_hour'
    elif (hour == 14 and minute >= 30) or (hour == 15 and minute == 0):
        return 'afternoon_half_hour'
    else:
        return 'other'

# 为每行数据打上时间段标签（开盘前半小时、尾盘后半小时等）
data['time_period'] = data.apply(label_time_period, axis=1)

# 分别计算开盘和尾盘半小时的聚合数据，按天和股票代码聚合
morning_data = data[data['time_period'] == 'morning_half_hour'].groupby(['stkcd', 'day']).agg({
    'open': 'first',  # 开盘价
    'close': 'last',  # 收盘价
    'volume': 'sum',  # 成交量总和
    'high': 'max',    # 最高价
    'low': 'min'      # 最低价
}).reset_index()

afternoon_data = data[data['time_period'] == 'afternoon_half_hour'].groupby(['stkcd', 'day']).agg({
    'open': 'first',  # 开盘价
    'close': 'last',  # 收盘价
    'volume': 'sum',  # 成交量总和
    'high': 'max',    # 最高价
    'low': 'min'      # 最低价
}).reset_index()

# 合并早盘和尾盘数据，确保它们能够正确匹配
merged_data = pd.merge(morning_data, afternoon_data, on=['stkcd', 'day'], suffixes=('_morning', '_afternoon'))

# 显示合并后的数据
merged_data

Unnamed: 0,stkcd,day,open_morning,close_morning,volume_morning,high_morning,low_morning,open_afternoon,close_afternoon,volume_afternoon,high_afternoon,low_afternoon
0,000001.XSHE,2022-12-01,1657.9100000000,1635.6100000000,1312148.0000000000,1692.6100000000,1626.9300000000,1628.1700000000,1623.2200000000,305936.0000000000,1629.4100000000,1619.5000000000
1,000001.XSHE,2022-12-02,1628.1700000000,1579.8500000000,910072.0000000000,1629.4100000000,1572.4100000000,1602.1500000000,1598.4400000000,136144.0000000000,1603.3900000000,1598.4400000000
2,000001.XSHE,2022-12-05,1621.9800000000,1635.6100000000,705372.0000000000,1640.5600000000,1612.0700000000,1676.5000000000,1676.5000000000,379124.0000000000,1678.9800000000,1672.7800000000
3,000001.XSHE,2022-12-06,1652.9600000000,1670.3000000000,653014.0000000000,1680.2200000000,1641.8000000000,1656.6700000000,1664.1100000000,195782.0000000000,1665.3500000000,1655.4300000000
4,000001.XSHE,2022-12-07,1651.7200000000,1631.8900000000,394468.0000000000,1656.6700000000,1631.8900000000,1623.2200000000,1629.4100000000,277266.0000000000,1631.8900000000,1623.2200000000
...,...,...,...,...,...,...,...,...,...,...,...,...
3776,000010.XSHE,2024-08-16,23.0,22.47,316485.0,23.26,22.47,22.6,22.6,157264.0,22.73,22.6
3777,000010.XSHE,2024-08-19,22.73,22.6,345615.0,22.73,22.2,22.6,22.6,110321.0,22.73,22.47
3778,000010.XSHE,2024-08-20,22.6,22.6,365495.0,22.86,22.47,22.47,22.6,106704.0,22.6,22.33
3779,000010.XSHE,2024-08-21,22.47,22.6,193916.0,22.6,22.2,22.6,22.47,164132.0,22.6,22.33


In [17]:
# 计算涨跌幅（尾盘收盘价 - 尾盘开盘价）和（开盘收盘价 - 开盘开盘价）
merged_data['rtn_morning'] = (merged_data['close_morning'] - merged_data['open_morning']) / merged_data['open_morning']
merged_data['rtn_afternoon'] = (merged_data['close_afternoon'] - merged_data['open_afternoon']) / merged_data['open_afternoon']

# 计算 lh_rtnDiff 因子
merged_data['lh_rtnDiff'] = merged_data['rtn_afternoon'] - merged_data['rtn_morning']

# 防止 volume_morning 为零，将 volume_morning 中的零值替换为一个很小的值
merged_data['volume_morning'].replace(0, 1e-5, inplace=True)

# 计算 lh_volDiff 因子
merged_data['lh_volDiff'] = merged_data['volume_afternoon'] / merged_data['volume_morning']

# 计算波动率：使用最高价和最低价的差值
merged_data['std_morning'] = (merged_data['high_morning'] - merged_data['low_morning']) / merged_data['open_morning']
merged_data['std_afternoon'] = (merged_data['high_afternoon'] - merged_data['low_afternoon']) / merged_data['open_afternoon']

# 转换 std_morning 和 std_afternoon 列为 float 类型，防止 Decimal 类型的计算错误
merged_data['std_morning'] = merged_data['std_morning'].astype(float)
merged_data['std_afternoon'] = merged_data['std_afternoon'].astype(float)

# 防止除以零，将 std_morning 和 std_afternoon 中的零值替换为一个很小的值
merged_data['std_morning'].replace(0, 1e-5, inplace=True)
merged_data['std_afternoon'].replace(0, 1e-5, inplace=True)

# 计算 lh_stdDiff 因子
merged_data['lh_stdDiff'] = merged_data['std_afternoon'] / merged_data['std_morning']

# 显示结果
result = merged_data[['stkcd', 'day', 'lh_rtnDiff', 'lh_volDiff', 'lh_stdDiff']]

In [18]:
# 直接使用 pandas 的方法显示结果
print(result)

# 如果想把结果保存为 CSV 文件，可以使用以下代码：
result.to_csv('factor_calculation_results.csv', index=False)

            stkcd         day                         lh_rtnDiff  \
0     000001.XSHE  2022-12-01    0.01041044674670095431774733669   
1     000001.XSHE  2022-12-02    0.02736185231876770652175288317   
2     000001.XSHE  2022-12-05  -0.008403309535259374344936435714   
3     000001.XSHE  2022-12-06  -0.005999335358948075581050171356   
4     000001.XSHE  2022-12-07    0.01581907473763802754405459567   
...           ...         ...                                ...   
3776  000010.XSHE  2024-08-16                           0.023043   
3777  000010.XSHE  2024-08-19                           0.005719   
3778  000010.XSHE  2024-08-20                           0.005785   
3779  000010.XSHE  2024-08-21                          -0.011538   
3780  000010.XSHE  2024-08-22                          -0.017913   

                          lh_volDiff  lh_stdDiff  
0     0.2331566256245484503272496700    0.153639  
1     0.1495969549662004764458196714    0.088252  
2     0.5374809320472034614359

In [19]:
merged_data

Unnamed: 0,stkcd,day,open_morning,close_morning,volume_morning,high_morning,low_morning,open_afternoon,close_afternoon,volume_afternoon,high_afternoon,low_afternoon,rtn_morning,rtn_afternoon,lh_rtnDiff,lh_volDiff,std_morning,std_afternoon,lh_stdDiff
0,000001.XSHE,2022-12-01,1657.9100000000,1635.6100000000,1312148.0000000000,1692.6100000000,1626.9300000000,1628.1700000000,1623.2200000000,305936.0000000000,1629.4100000000,1619.5000000000,-0.01345066981923023565814790911,-0.003040223072529281340400572422,0.01041044674670095431774733669,0.2331566256245484503272496700,0.039616,0.006087,0.153639
1,000001.XSHE,2022-12-02,1628.1700000000,1579.8500000000,910072.0000000000,1629.4100000000,1572.4100000000,1602.1500000000,1598.4400000000,136144.0000000000,1603.3900000000,1598.4400000000,-0.02967749067972017664003144635,-0.002315638360952470118278563181,0.02736185231876770652175288317,0.1495969549662004764458196714,0.035009,0.003090,0.088252
2,000001.XSHE,2022-12-05,1621.9800000000,1635.6100000000,705372.0000000000,1640.5600000000,1612.0700000000,1676.5000000000,1676.5000000000,379124.0000000000,1678.9800000000,1672.7800000000,0.008403309535259374344936435714,0,-0.008403309535259374344936435714,0.5374809320472034614359515263,0.017565,0.003698,0.210543
3,000001.XSHE,2022-12-06,1652.9600000000,1670.3000000000,653014.0000000000,1680.2200000000,1641.8000000000,1656.6700000000,1664.1100000000,195782.0000000000,1665.3500000000,1655.4300000000,0.01049027199690252637692382151,0.004490936637954450795873650154,-0.005999335358948075581050171356,0.2998128677179968576477686543,0.023243,0.005988,0.257621
4,000001.XSHE,2022-12-07,1651.7200000000,1631.8900000000,394468.0000000000,1656.6700000000,1631.8900000000,1623.2200000000,1629.4100000000,277266.0000000000,1631.8900000000,1623.2200000000,-0.01200566682004213789262102536,0.003813407917595889651433570311,0.01581907473763802754405459567,0.7028859121652453430950039040,0.015003,0.005341,0.356022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3776,000010.XSHE,2024-08-16,23.0,22.47,316485.0,23.26,22.47,22.6,22.6,157264.0,22.73,22.6,-0.023043,0.0,0.023043,0.496908,0.034348,0.005752,0.167469
3777,000010.XSHE,2024-08-19,22.73,22.6,345615.0,22.73,22.2,22.6,22.6,110321.0,22.73,22.47,-0.005719,0.0,0.005719,0.319202,0.023317,0.011504,0.493388
3778,000010.XSHE,2024-08-20,22.6,22.6,365495.0,22.86,22.47,22.47,22.6,106704.0,22.6,22.33,0.0,0.005785,0.005785,0.291944,0.017257,0.012016,0.696313
3779,000010.XSHE,2024-08-21,22.47,22.6,193916.0,22.6,22.2,22.6,22.47,164132.0,22.6,22.33,0.005785,-0.005752,-0.011538,0.846408,0.017802,0.011947,0.671117


In [20]:
# 保存结果到Mac桌面
merged_data.to_csv('/Users/zhangrui/Desktop/factor_data.csv', index=False)

1. RankIC测试：

RankIC 是通过将股票按因子值排序后计算其与未来收益的相关性。我们将使用每日的RankIC进行测试，计算每个因子的均值、标准差、ICIR以及T统计量。

In [21]:
import numpy as np
import pandas as pd
from scipy import stats

# 计算每日收益率
merged_data['daily_return'] = merged_data.groupby('stkcd')['close_morning'].shift(-1) / merged_data['close_morning'] - 1

# 计算每个因子的 RankIC
def calc_rank_ic(factor_values, returns):
    return factor_values.corr(returns, method='spearman')

# 创建因子列表
factor_columns = ['lh_rtnDiff', 'lh_volDiff', 'lh_stdDiff']
ic_values = {factor: [] for factor in factor_columns}

# 计算每个交易日的 RankIC
for day, group in merged_data.groupby('day'):
    for factor in factor_columns:
        ic = calc_rank_ic(group[factor], group['daily_return'])
        ic_values[factor].append(ic)

# 将IC值转为DataFrame
ic_df = pd.DataFrame(ic_values, index=merged_data['day'].unique())

# 计算IC均值、IC标准差、ICIR（IC均值/IC标准差）和T统计量
ic_stats = pd.DataFrame(index=factor_columns)
ic_stats['IC均值'] = ic_df.mean()
ic_stats['IC标准差'] = ic_df.std()
ic_stats['ICIR'] = ic_stats['IC均值'] / ic_stats['IC标准差']
ic_stats['T统计量'] = ic_stats['IC均值'] / (ic_stats['IC标准差'] / np.sqrt(len(ic_df)))

# 输出IC统计结果
print(ic_stats)

# 保存结果到CSV文件
ic_stats.to_csv('factor_ic_stats.csv')

                IC均值     IC标准差      ICIR      T统计量
lh_rtnDiff  0.040312  0.402268  0.100211  2.056155
lh_volDiff  0.045191  0.388009  0.116469  2.389752
lh_stdDiff  0.059888  0.393682  0.152122  3.121288


2. 多空组合测试：

根据因子值构建多空组合，分析多头和空头组合的年化波动率、夏普比率、最大回撤、多头收益率和空头收益率等指标。

In [24]:
# 检查数据中是否存在 NaN 值
print(merged_data.isnull().sum())

# 如果存在 NaN，填充或去除缺失值
merged_data.fillna(0, inplace=True)

stkcd                0
day                  0
open_morning        81
close_morning        0
volume_morning       0
high_morning        81
low_morning         81
open_afternoon      81
close_afternoon      0
volume_afternoon     0
high_afternoon      81
low_afternoon       81
rtn_morning         81
rtn_afternoon       81
lh_rtnDiff          81
lh_volDiff           0
std_morning         81
std_afternoon       81
lh_stdDiff          81
daily_return         9
dtype: int64


In [26]:
# 检查因子的分布情况，确保数据合理
print(merged_data[['lh_rtnDiff', 'lh_volDiff', 'lh_stdDiff']].describe())

# 检查每个分组中的数据量
for factor in factor_columns:
    for day, group in merged_data.groupby('day'):
        long_group = group[group[factor].rank() >= len(group) * 0.8]
        short_group = group[group[factor].rank() <= len(group) * 0.2]
        print(f"Factor: {factor}, Date: {day}, Long Group Size: {len(long_group)}, Short Group Size: {len(short_group)}")

        lh_rtnDiff   lh_volDiff   lh_stdDiff
count  3781.000000  3781.000000  3781.000000
mean     -0.000355     0.609996     1.609904
std       0.018119     0.685835    53.738205
min      -0.142510     0.000000     0.000000
25%      -0.006912     0.317418     0.228339
50%       0.000000     0.488929     0.339559
75%       0.008081     0.725871     0.516706
max       0.097740    18.996319  2926.481085
Factor: lh_rtnDiff, Date: 2022-12-01, Long Group Size: 2, Short Group Size: 1
Factor: lh_rtnDiff, Date: 2022-12-02, Long Group Size: 2, Short Group Size: 1
Factor: lh_rtnDiff, Date: 2022-12-05, Long Group Size: 2, Short Group Size: 1
Factor: lh_rtnDiff, Date: 2022-12-06, Long Group Size: 2, Short Group Size: 1
Factor: lh_rtnDiff, Date: 2022-12-07, Long Group Size: 2, Short Group Size: 1
Factor: lh_rtnDiff, Date: 2022-12-08, Long Group Size: 2, Short Group Size: 1
Factor: lh_rtnDiff, Date: 2022-12-09, Long Group Size: 2, Short Group Size: 1
Factor: lh_rtnDiff, Date: 2022-12-12, Long Group 

In [27]:
# 修改分组比例，确保有足够的数据进行计算
def calc_long_short_returns(df, factor, quantile=0.3):  # 调整 quantile 比例为 0.3
    df['factor_rank'] = df[factor].rank()
    n = len(df)

    # 定义多头和空头组合
    long_group = df[df['factor_rank'] >= n * (1 - quantile)]
    short_group = df[df['factor_rank'] <= n * quantile]

    if len(long_group) == 0 or len(short_group) == 0:
        return np.nan, np.nan, np.nan

    # 计算多空组合收益率
    long_return = long_group['daily_return'].mean()
    short_return = short_group['daily_return'].mean()
    long_short_return = long_return - short_return

    return long_return, short_return, long_short_return

In [34]:
!pip install ace_tools



In [30]:
import numpy as np
import pandas as pd

# 示例收益率数据，替换为你的实际数据
head_rtn = np.random.randn(100) / 100  # 多头收益率
tail_rtn = np.random.randn(100) / 100  # 空头收益率

# 计算多空收益率
long_short_rtn = head_rtn - tail_rtn

# 检查并处理NaN，避免计算过程中出现NaN
head_rtn = np.nan_to_num(head_rtn)
tail_rtn = np.nan_to_num(tail_rtn)
long_short_rtn = np.nan_to_num(long_short_rtn)

# 定义计算年化波动率、夏普比率和最大回撤的函数
def calculate_metrics(returns, risk_free_rate=0):
    """计算年化波动率、夏普比率和最大回撤"""
    if len(returns) == 0 or np.std(returns) == 0:
        return 0, 0, 0  # 如果没有有效数据，返回0
    # 年化波动率
    annual_volatility = np.std(returns) * np.sqrt(252)
    # 夏普比率
    sharpe_ratio = (np.mean(returns) - risk_free_rate) / np.std(returns) * np.sqrt(252)
    # 计算累计收益率
    cumulative_returns = np.cumsum(returns)
    peak = np.maximum.accumulate(cumulative_returns)
    drawdown = peak - cumulative_returns
    max_drawdown = np.max(drawdown)
    return annual_volatility, sharpe_ratio, max_drawdown

# 计算各个收益率的指标
head_metrics = calculate_metrics(head_rtn)
tail_metrics = calculate_metrics(tail_rtn)
long_short_metrics = calculate_metrics(long_short_rtn)

# 确保所有的收益率计算结果都没有NaN，用0代替
head_avg_rtn = np.nan_to_num(np.mean(head_rtn), nan=0)
tail_avg_rtn = np.nan_to_num(np.mean(tail_rtn), nan=0)
long_short_avg_rtn = np.nan_to_num(np.mean(long_short_rtn), nan=0)

# 创建DataFrame来展示结果
results = pd.DataFrame({
    '年化波动率': [head_metrics[0], tail_metrics[0], long_short_metrics[0]],
    '夏普比率': [head_metrics[1], tail_metrics[1], long_short_metrics[1]],
    '最大回撤': [head_metrics[2], tail_metrics[2], long_short_metrics[2]],
    '多头收益率': [head_avg_rtn, np.nan, np.nan],
    '空头收益率': [np.nan, tail_avg_rtn, np.nan],
    '多空收益率': [np.nan, np.nan, long_short_avg_rtn]
}, index=['多头', '空头', '多空'])

# 使用 fillna 方法替换剩余的 NaN
results.fillna(0, inplace=True)

# 显示结果
tools.display_dataframe_to_user(name="收益率分析结果", dataframe=results)

In [31]:
# 直接使用 pandas 的方法显示结果
print(results)

# 如果想把结果保存为 CSV 文件，可以使用以下代码：
result.to_csv('factor_calculation_results1.csv', index=False)

       年化波动率      夏普比率      最大回撤     多头收益率     空头收益率     多空收益率
多头  0.157017  0.219631  0.086533  0.000137       NaN       NaN
空头  0.165478 -0.749125  0.158905       NaN -0.000492       NaN
多空  0.210812  0.751616  0.118449       NaN       NaN  0.000629


In [36]:
import numpy as np
import pandas as pd

# 示例收益率数据，替换为你的实际数据
head_rtn = np.random.randn(100) / 100  # 多头收益率
tail_rtn = np.random.randn(100) / 100  # 空头收益率

# 计算多空收益率
long_short_rtn = head_rtn - tail_rtn

# 检查并处理NaN，避免计算过程中出现NaN
head_rtn = np.nan_to_num(head_rtn)
tail_rtn = np.nan_to_num(tail_rtn)
long_short_rtn = np.nan_to_num(long_short_rtn)

# 定义计算年化波动率、夏普比率和最大回撤的函数
def calculate_metrics(returns, risk_free_rate=0):
    """计算年化波动率、夏普比率和最大回撤"""
    if len(returns) == 0 or np.std(returns) == 0:
        return 0, 0, 0  # 如果没有有效数据，返回0
    # 年化波动率
    annual_volatility = np.std(returns) * np.sqrt(252)
    # 夏普比率
    sharpe_ratio = (np.mean(returns) - risk_free_rate) / np.std(returns) * np.sqrt(252)
    # 计算累计收益率
    cumulative_returns = np.cumsum(returns)
    peak = np.maximum.accumulate(cumulative_returns)
    drawdown = peak - cumulative_returns
    max_drawdown = np.max(drawdown)
    return annual_volatility, sharpe_ratio, max_drawdown

# 计算各个收益率的指标
head_metrics = calculate_metrics(head_rtn)
tail_metrics = calculate_metrics(tail_rtn)
long_short_metrics = calculate_metrics(long_short_rtn)

# 确保所有的收益率计算结果都没有NaN，用0代替
head_avg_rtn = np.nan_to_num(np.mean(head_rtn), nan=0)
tail_avg_rtn = np.nan_to_num(np.mean(tail_rtn), nan=0)
long_short_avg_rtn = np.nan_to_num(np.mean(long_short_rtn), nan=0)

# 创建DataFrame来展示结果
results = pd.DataFrame({
    '年化波动率': [head_metrics[0], tail_metrics[0], long_short_metrics[0]],
    '夏普比率': [head_metrics[1], tail_metrics[1], long_short_metrics[1]],
    '最大回撤': [head_metrics[2], tail_metrics[2], long_short_metrics[2]],
    '多头收益率': [head_avg_rtn, np.nan, np.nan],
    '空头收益率': [np.nan, tail_avg_rtn, np.nan],
    '多空收益率': [np.nan, np.nan, long_short_avg_rtn]
}, index=['多头', '空头', '多空'])

# 使用 fillna 方法替换剩余的 NaN
results.fillna(0, inplace=True)

# 显示结果
print(results)

       年化波动率      夏普比率      最大回撤    多头收益率     空头收益率     多空收益率
多头  0.144699 -1.427920  0.140777 -0.00082  0.000000  0.000000
空头  0.143872  0.526899  0.058909  0.00000  0.000301  0.000000
多空  0.208884 -1.352063  0.169364  0.00000  0.000000 -0.001121


1. 数据预处理

首先，你需要分钟级别的股票数据，包括最高价、最低价、成交量和其他基础指标。

	•	date (交易时间)
	•	high (分钟最高价)
	•	low (分钟最低价)
	•	close (分钟收盘价)
	•	volume (分钟成交量)


2. 计算排序百分位 (pct rank)

首先，计算分钟最高价和最低价的排序百分位，并计算过去15分钟的均值。这些将成为短时间内股价高低位特征。

In [37]:
import pandas as pd
import numpy as np

# 假设已经加载了数据，并包含['date_time', 'high', 'low', 'volume', 'close']列
data['high_rank'] = data['high'].rank(pct=True)  # 计算最高价的排序百分位
data['low_rank'] = data['low'].rank(pct=True)    # 计算最低价的排序百分位

# 计算过去15分钟的均值
data['high_rank_15min'] = data['high_rank'].rolling(window=15, min_periods=1).mean()
data['low_rank_15min'] = data['low_rank'].rolling(window=15, min_periods=1).mean()

3. 确定高低价时点

然后，我们选择高位和低位特征最强的时间点作为高价或低价时点，即分别选择 high_rank_15min 最大和 low_rank_15min 最小的时点。

In [39]:
# 确定高价和低价时点
high_point_time = data.loc[data['high_rank_15min'].idxmax(), 'date']  # 高价时点
low_point_time = data.loc[data['low_rank_15min'].idxmin(), 'date']    # 低价时点

4. 计算高价/低价时点的特征

在确定高价和低价时点后，我们计算以下三个差异因子：

	•	diff idx: 高价时点和低价时点的分钟差
	•	diff std: 高价时点和低价时点波动率差（过去15分钟的已实现波动率）
	•	diff vol: 高价时点和低价时点成交量占比差

(1) diff idx: 高低价时点的时间差

In [40]:
# 计算高低价时点的时间差
diff_idx = (pd.to_datetime(high_point_time) - pd.to_datetime(low_point_time)).total_seconds() / 60

(2) diff std: 波动率差

计算波动率差时，我们可以使用收盘价在过去15分钟内的已实现波动率。

In [47]:
import numpy as np
import pandas as pd

# 定义实现波动率计算函数
def realized_volatility(price_series):
    # 确保 price_series 是 pandas.Series 类型
    price_series = pd.Series(price_series)
    
    # 确保所有值都是数值类型
    price_series = pd.to_numeric(price_series, errors='coerce')

    # 移除 NaN 值，并过滤掉小于等于0的值，避免对数计算出错
    price_series = price_series.dropna()
    price_series = price_series[price_series > 0]

    # 检查是否有足够的数据点进行计算
    if len(price_series) > 1:
        # 计算对数收益率并移除 NaN
        log_returns = np.log(price_series / price_series.shift(1)).dropna()
        
        # 返回年化波动率
        return np.std(log_returns) * np.sqrt(252 * 60)
    else:
        return np.nan  # 如果没有足够的数据点，则返回 NaN

# 假设有已处理的数据集，继续计算高价时点和低价时点的过去15分钟波动率
high_point_time = "2022-12-01 09:34:00"  # 这是一个示例
low_point_time = "2022-12-01 09:32:00"   # 这是一个示例

# 确保 'date' 列是 datetime 类型
data['date'] = pd.to_datetime(data['date'])

# 计算高价时点和低价时点的过去15分钟波动率
high_volatility = realized_volatility(data[data['date'] <= high_point_time]['close'].tail(15))
low_volatility = realized_volatility(data[data['date'] <= low_point_time]['close'].tail(15))

# 输出结果
print("高价时点波动率:", high_volatility)
print("低价时点波动率:", low_volatility)

高价时点波动率: 109.45844234796357
低价时点波动率: 114.52410659333181


In [50]:
import pandas as pd
import numpy as np

# 假设数据已经加载到 'data' 中
# data 包含以下列: ['date', 'stkcd', 'open', 'high', 'low', 'close', 'volume', 'money', 'high_rank', 'low_rank']

# 计算分钟高价和低价的排序百分位
data['high_rank'] = data['high'].rank(pct=True)
data['low_rank'] = data['low'].rank(pct=True)

# 定义一个辅助函数来计算年化波动率
def realized_volatility(price_series):
    price_series = pd.to_numeric(price_series, errors='coerce')  # 转换为数值类型，并强制转换无效值为 NaN
    price_series = price_series.dropna()  # 移除 NaN 值
    price_series = price_series[price_series > 0]  # 只计算大于0的价格，避免对数计算中的错误
    if len(price_series) < 2:  # 确保数据足够计算对数收益率
        return np.nan
    log_returns = np.log(price_series / price_series.shift(1)).dropna()  # 计算对数收益率
    return np.std(log_returns) * np.sqrt(252 * 60)  # 返回年化波动率

# 获取高价和低价时点
high_point_time = data.loc[data['high_rank'].idxmax(), 'date']
low_point_time = data.loc[data['low_rank'].idxmin(), 'date']

# 计算高价时点和低价时点的过去15分钟波动率
high_volatility = realized_volatility(data[data['date'] <= high_point_time]['close'].tail(15))
low_volatility = realized_volatility(data[data['date'] <= low_point_time]['close'].tail(15))

# 计算波动率差
volatility_diff = high_volatility - low_volatility

# 计算高价时点和低价时点的过去15分钟成交量占比
high_volume_sum = data[data['date'] <= high_point_time]['volume'].tail(15).sum()
low_volume_sum = data[data['date'] <= low_point_time]['volume'].tail(15).sum()

# 计算成交量占比差
volume_diff = high_volume_sum - low_volume_sum

# 计算高价时点和低价时点的分钟差
time_diff = (pd.to_datetime(high_point_time) - pd.to_datetime(low_point_time)).total_seconds() / 60

# 构建因子 DataFrame
factors = pd.DataFrame({
    'diff_idx': [time_diff],
    'diff_std': [volatility_diff],
    'diff_vol': [volume_diff]
})

# 输出结果
print(f"高价时点波动率: {high_volatility}")
print(f"低价时点波动率: {low_volatility}")
print(f"波动率差: {volatility_diff}")
print(f"成交量占比差: {volume_diff}")
print(f"分钟差: {time_diff}")

# 输出构建的因子
print(factors)

高价时点波动率: 0.5936846296512587
低价时点波动率: 0.446646850029527
波动率差: 0.1470377796217317
成交量占比差: 462592.0
分钟差: -630453.0
   diff_idx  diff_std  diff_vol
0 -630453.0  0.147038  462592.0


1. 确保 diff_idx、diff_std 和 diff_vol 因子已经在 merged_data 中

In [54]:
# 确保已计算的因子 diff_idx, diff_std, diff_vol 存在于 merged_data 中
merged_data['diff_idx'] = factors['diff_idx']
merged_data['diff_std'] = factors['diff_std']
merged_data['diff_vol'] = factors['diff_vol']

In [58]:
window_data

Unnamed: 0,date,stkcd,open,high,low,close,volume,money,day,time_period,high_rank,low_rank,high_rank_15min,low_rank_15min
0,2022-12-01 09:31:00,000004.XSHE,75.87,76.32,75.8,76.25,25947.0,1972620.0,2022-12-01,morning_half_hour,0.391549,0.389326,0.391549,0.389326
100800,2022-12-01 09:31:00,000002.XSHE,2950.65,3000.44,2944.22,2973.13,56103.0,166284000.0,2022-12-01,morning_half_hour,0.994602,0.992254,0.436628,0.43659
201600,2022-12-01 09:31:00,000005.XSHE,19.42,19.53,19.32,19.42,46249.0,898453.0,2022-12-01,morning_half_hour,0.08093,0.079925,0.653094,0.653033
302400,2022-12-01 09:31:00,000010.XSHE,44.4,44.53,44.27,44.4,22922.0,1018720.0,2022-12-01,morning_half_hour,0.161676,0.160365,0.161676,0.160365
403200,2022-12-01 09:31:00,000008.XSHE,65.59,65.86,65.32,65.86,48567.0,3190680.0,2022-12-01,morning_half_hour,0.342541,0.337275,0.105779,0.105377
504000,2022-12-01 09:31:00,000009.XSHE,133.36,133.46,132.46,132.56,143854.0,19154900.0,2022-12-01,morning_half_hour,0.582192,0.58146,0.241675,0.240133
604800,2022-12-01 09:31:00,000007.XSHE,91.14,91.26,90.65,90.65,29631.0,2700470.0,2022-12-01,morning_half_hour,0.432344,0.431143,0.391915,0.391832
705600,2022-12-01 09:31:00,000001.XSHE,1657.91,1691.37,1656.67,1687.65,160780.0,268431460.0,2022-12-01,morning_half_hour,0.914653,0.908909,0.248974,0.248745
806400,2022-12-01 09:31:00,000001.XSHE,1657.91,1691.37,1656.67,1687.65,160780.0,268431460.0,2022-12-01,morning_half_hour,0.914653,0.908909,0.820399,0.820083
907440,2022-12-01 09:31:00,000006.XSHE,324.85,324.85,324.85,324.85,332094.0,107881000.0,2022-12-01,morning_half_hour,0.692831,0.692861,0.813198,0.81364


In [60]:
# 提前计算波动率差、成交量占比差和分钟差
def calculate_factors(data):
    # 计算15分钟滚动窗口的波动率
    data['high_volatility'] = data['high'].rolling(window=15, min_periods=1).std() * np.sqrt(252 * 60)
    data['low_volatility'] = data['low'].rolling(window=15, min_periods=1).std() * np.sqrt(252 * 60)
    
    # 计算成交量差异
    data['diff_vol'] = data['volume'].rolling(window=15, min_periods=1).sum().astype(float)
    
    # 计算分钟差
    data['diff_idx'] = data['high_rank'] - data['low_rank']
    
    # 计算波动率差
    data['diff_std'] = data['high_volatility'] - data['low_volatility']
    
    return data

# 应用上述函数加速计算
data = calculate_factors(data)

# 现在 data 中已经包含 'diff_idx', 'diff_std', 'diff_vol' 三个因子

In [69]:
merged_data

Unnamed: 0,stkcd,day,open_morning,close_morning,volume_morning,high_morning,low_morning,open_afternoon,close_afternoon,volume_afternoon,...,lh_rtnDiff,lh_volDiff,std_morning,std_afternoon,lh_stdDiff,daily_return,future_return,diff_idx,diff_std,diff_vol
0,000001.XSHE,2022-12-01,1657.9100000000,1635.6100000000,1312148.0000000000,1692.6100000000,1626.9300000000,1628.1700000000,1623.2200000000,305936.0000000000,...,0.010410,0.233157,0.039616,0.006087,0.153639,-0.034091,0.035294,-630453.0,0.147038,462592.0
1,000001.XSHE,2022-12-02,1628.1700000000,1579.8500000000,910072.0000000000,1629.4100000000,1572.4100000000,1602.1500000000,1598.4400000000,136144.0000000000,...,0.027362,0.149597,0.035009,0.003090,0.088252,0.035294,0.021209,,,
2,000001.XSHE,2022-12-05,1621.9800000000,1635.6100000000,705372.0000000000,1640.5600000000,1612.0700000000,1676.5000000000,1676.5000000000,379124.0000000000,...,-0.008403,0.537481,0.017565,0.003698,0.210543,0.021209,-0.022996,,,
3,000001.XSHE,2022-12-06,1652.9600000000,1670.3000000000,653014.0000000000,1680.2200000000,1641.8000000000,1656.6700000000,1664.1100000000,195782.0000000000,...,-0.005999,0.299813,0.023243,0.005988,0.257621,-0.022996,0.009872,,,
4,000001.XSHE,2022-12-07,1651.7200000000,1631.8900000000,394468.0000000000,1656.6700000000,1631.8900000000,1623.2200000000,1629.4100000000,277266.0000000000,...,0.015819,0.702886,0.015003,0.005341,0.356022,0.009872,0.011280,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3776,000010.XSHE,2024-08-16,23.0,22.47,316485.0,23.26,22.47,22.6,22.6,157264.0,...,0.023043,0.496908,0.034348,0.005752,0.167469,0.005785,0.000000,,,
3777,000010.XSHE,2024-08-19,22.73,22.6,345615.0,22.73,22.2,22.6,22.6,110321.0,...,0.005719,0.319202,0.023317,0.011504,0.493388,0.000000,0.000000,,,
3778,000010.XSHE,2024-08-20,22.6,22.6,365495.0,22.86,22.47,22.47,22.6,106704.0,...,0.005785,0.291944,0.017257,0.012016,0.696313,0.000000,0.005752,,,
3779,000010.XSHE,2024-08-21,22.47,22.6,193916.0,22.6,22.2,22.6,22.47,164132.0,...,-0.011538,0.846408,0.017802,0.011947,0.671117,0.005752,0.000000,,,


In [67]:
# 确保滚动窗口计算时的NaN被处理
data['high_volatility'] = data['high'].rolling(window=15, min_periods=1).std().fillna(0) * np.sqrt(252 * 60)
data['low_volatility'] = data['low'].rolling(window=15, min_periods=1).std().fillna(0) * np.sqrt(252 * 60)
data['diff_vol'] = data['volume'].rolling(window=15, min_periods=1).sum().fillna(0).astype(float)
data['diff_idx'] = data['high_rank'] - data['low_rank']
data['diff_std'] = data['high_volatility'] - data['low_volatility']

2. 计算日度 Rank IC

In [68]:
data

Unnamed: 0,date,stkcd,open,high,low,close,volume,money,day,time_period,high_rank,low_rank,high_rank_15min,low_rank_15min,diff_idx,diff_std,diff_vol,high_volatility,low_volatility
0,2022-12-01 09:31:00,000004.XSHE,75.87,76.32,75.8,76.25,25947.0,1972620.0,2022-12-01,morning_half_hour,0.391549,0.389326,0.391549,0.389326,0.002222,0.000000,25947.0,0.000000,0.000000
1,2022-12-01 09:32:00,000004.XSHE,76.25,76.25,75.95,75.95,5721.0,435038.0,2022-12-01,morning_half_hour,0.391234,0.389880,0.391392,0.389603,0.001354,-6.955861,31668.0,6.086378,13.042239
2,2022-12-01 09:33:00,000004.XSHE,76.1,76.1,75.87,76.02,7763.0,590132.0,2022-12-01,morning_half_hour,0.390469,0.389667,0.391084,0.389625,0.000802,4.591769,39431.0,13.820854,9.229084
3,2022-12-01 09:34:00,000004.XSHE,76.02,76.17,75.95,76.1,11107.0,844782.0,2022-12-01,morning_half_hour,0.390877,0.389880,0.391032,0.389689,0.000997,2.863119,50538.0,11.751425,8.888307
4,2022-12-01 09:35:00,000004.XSHE,76.32,76.54,76.17,76.54,15324.0,1170280.0,2022-12-01,morning_half_hour,0.392418,0.391381,0.391309,0.390027,0.001036,3.714431,65862.0,20.805903,17.091472
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1008235,2024-08-22 14:56:00,000006.XSHE,199.33,199.84,199.33,199.33,2492.0,497302.0,2024-08-22,afternoon_half_hour,0.610720,0.610713,0.610767,0.610808,0.000006,-5.556552,26761.0,16.509390,22.065942
1008236,2024-08-22 14:57:00,000006.XSHE,199.33,199.84,199.33,199.33,2266.0,452122.0,2024-08-22,afternoon_half_hour,0.610720,0.610713,0.610767,0.610808,0.000006,-5.556552,27410.0,16.509390,22.065942
1008237,2024-08-22 14:58:00,000006.XSHE,199.84,199.84,199.84,199.84,112.0,22407.0,2024-08-22,afternoon_half_hour,0.610720,0.611420,0.610767,0.610855,-0.000701,-9.455544,25312.0,16.509390,25.964934
1008238,2024-08-22 14:59:00,000006.XSHE,199.84,199.84,199.84,199.84,0.0,0.0,2024-08-22,afternoon_half_hour,0.610720,0.611420,0.610767,0.610902,-0.000701,-12.195930,24504.0,16.509390,28.705320
