In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Load the merged transaction file; the first row supplies the column names.
file_path = "/nas197/uhome/zhangrui/merged_transaction_20241113.csv"

# One pipeline: parse timestamps, drop rows whose timestamp failed to parse,
# derive the trade side from 'flag', fill missing 'count', then index and
# sort by time so that time-based grouping can be applied afterwards.
# NOTE(review): an unparsable/missing flag is filled with 0 and therefore
# lands on the buy side (0 < 50) — confirm this is intended.
df = (
    pd.read_csv(file_path, header=0)
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime'], errors='coerce'))
    .dropna(subset=['datetime'])
    .assign(
        # Numeric flag; values that cannot be parsed become 0.
        flag=lambda frame: pd.to_numeric(frame['flag'], errors='coerce').fillna(0),
        # flag < 50 -> active buy (1); flag >= 50 -> active sell (-1).
        trade_side=lambda frame: np.where(frame['flag'] < 50, 1, -1),
        # Missing counts are treated as 0.
        count=lambda frame: frame['count'].fillna(0),
    )
    .set_index('datetime')
    .sort_index()
)

# ===========================
# 计算 OB 因子（交易不平衡因子）
# ===========================
def calc_ob_factor(group):
    """Order-imbalance factor of one group of trades.

    Args:
        group: DataFrame with a 'trade_side' column (1 = buy, -1 = sell)
            and a 'volume' column.

    Returns:
        (buy volume - sell volume) / (buy volume + sell volume), or 0 when
        the combined buy and sell volume is zero.
    """
    sides = group['trade_side']
    volumes = group['volume']

    bought = volumes[sides == 1].sum()
    sold = volumes[sides == -1].sum()
    traded = bought + sold

    # Guard the division: a window without buy/sell volume is neutral.
    return (bought - sold) / traded if traded != 0 else 0

# ===========================
# 计算订单斜率因子
# ===========================
def calc_order_slope(group):
    """Least-squares slope of price regressed on volume for one group.

    Args:
        group: DataFrame with 'volume' and 'price' columns.

    Returns:
        The slope of the fitted line price ~ volume, or 0 when no slope can
        be fitted: fewer than two usable rows, or every volume identical.
    """
    volume = np.asarray(group['volume'], dtype=float)
    price = np.asarray(group['price'], dtype=float)

    # NaN/inf in either column would poison the least-squares fit, so only
    # rows where both values are finite take part.
    usable = np.isfinite(volume) & np.isfinite(price)
    volume = volume[usable]
    price = price[usable]

    # With no spread in volume the fit is rank-deficient: polyfit would warn
    # and return a meaningless coefficient, so report a flat slope instead.
    if volume.size < 2 or volume.max() == volume.min():
        return 0

    return np.polyfit(volume, price, 1)[0]

# ===========================
# 计算订单绝对价格分歧程度
# ===========================
def calc_price_divergence(group):
    """Mean absolute deviation of 'price' from its group mean.

    Args:
        group: object whose 'price' entry is a pandas Series of prices.

    Returns:
        The average of |price - mean(price)| over the group.
    """
    prices = group['price']
    deviations = prices - prices.mean()
    return deviations.abs().mean()

# Compute the three factors per (1-minute window, symbol).
#
# The factor helpers need several columns of each group at once
# ('trade_side', 'volume', 'price'). Named aggregation hands a helper only
# the single column named in its tuple, so the helpers are run through
# groupby(...).apply on the full per-group frame instead.
def _compute_minute_factors(group):
    """Evaluate every factor on one (minute, symbol) group of trades."""
    return pd.Series({
        'ob_factor': calc_ob_factor(group),
        'order_slope': calc_order_slope(group),
        'price_divergence': calc_price_divergence(group),
    })


# Selecting the needed columns keeps the grouping key 'symbol' out of the
# frames passed to apply.
factor_df = (
    df.groupby([pd.Grouper(freq='1Min'), 'symbol'])[['trade_side', 'volume', 'price']]
    .apply(_compute_minute_factors)
    .reset_index()
)

# Save the factor table.
output_path = "/nas197/uhome/zhangrui/high_freq_factors_20241113_001.csv"
factor_df.to_csv(output_path, index=False)

print(f"因子计算完成，已保存到 {output_path}")

In [None]:
# Show the computed factor table as this cell's output.
factor_df

In [None]:
# List the column names of the factor table.
print(factor_df.columns.tolist())

In [1]:
import pandas as pd
# Reload the raw merged transaction CSV; row 0 supplies the column names.
file_path = "/nas197/uhome/zhangrui/merged_transaction_20241113.csv"
df = pd.read_csv(filepath_or_buffer=file_path, header=0)

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import linregress

# Work on a copy of the loaded frame. The conversions below overwrite the
# 'date'/'time' columns; on an alias of `df`, re-running this cell would feed
# already-converted timestamps back through the string parser and turn
# 'time' into all-NaT. Copying keeps `df` raw and the cell re-runnable.
data = df.copy()

# Preprocessing: parse the combined timestamp; unparsable values become NaT.
data['datetime'] = pd.to_datetime(data['datetime'], errors='coerce')

# Parse 'date' unless it is already datetime64[ns]. Anything after a '.'
# (e.g. a float-style "20241113.0") is stripped before matching YYYYMMDD.
if data['date'].dtype != 'datetime64[ns]':
    data['date'] = pd.to_datetime(data['date'].astype(str).str.split('.').str[0], format='%Y%m%d', errors='coerce')

# Parse 'time': left-pad to 9 characters, then read as HHMMSS + fraction.
data['time'] = pd.to_datetime(data['time'].astype(str).str.zfill(9), format='%H%M%S%f', errors='coerce')

# OB factor (order imbalance): split volume by side.
# Rows whose order_type is neither 'B' nor 'S' contribute to neither side.
data['buy_volume'] = np.where(data['order_type'] == 'B', data['volume'], 0)
data['sell_volume'] = np.where(data['order_type'] == 'S', data['volume'], 0)

# Total buy and sell volume per timestamp.
ob_factor = data.groupby('datetime').agg({'buy_volume': 'sum', 'sell_volume': 'sum'})
# Imbalance ratio; a timestamp with zero buy and zero sell volume is 0/0 = NaN.
ob_factor['OB'] = (ob_factor['buy_volume'] - ob_factor['sell_volume']) / (
        ob_factor['buy_volume'] + ob_factor['sell_volume'])

# 订单斜率因子 (Order Slope Factor)
def calc_order_slope(group):
    """Slope of price against row position within one group.

    The regressor is 0, 1, ..., len(group) - 1, so the result is the linear
    trend of 'price' per row in the order the rows appear in the group.

    Args:
        group: DataFrame with a 'price' column.

    Returns:
        The linear-regression slope, or NaN for groups with fewer than two rows.
    """
    prices = group['price'].values
    n_rows = len(group)
    if n_rows >= 2:
        return linregress(np.arange(n_rows), prices).slope
    return np.nan

# Per-timestamp price slope. Only 'price' is selected before apply, so the
# grouping column 'datetime' is not passed to the helper (passing it is
# deprecated in recent pandas); the helper reads nothing but 'price'.
order_slope = (
    data.groupby('datetime')[['price']]
    .apply(calc_order_slope)
    .rename('Order_Slope')
)

# 订单绝对价格分歧程度 (Order Absolute Price Divergence)
def calc_price_divergence(group):
    """Absolute gap between the mean 'ask_order' and the mean 'bid_order'.

    NOTE(review): the column values are used as-is; confirm that
    'ask_order'/'bid_order' hold prices rather than order identifiers,
    otherwise this gap is not a price measure.

    Args:
        group: DataFrame with 'ask_order' and 'bid_order' columns.

    Returns:
        |mean(ask_order) - mean(bid_order)| for the group.
    """
    mean_ask = group['ask_order'].mean()
    mean_bid = group['bid_order'].mean()
    return np.abs(mean_ask - mean_bid)

# Per-timestamp ask/bid gap. Only the two columns the helper reads are
# selected, so the grouping column 'datetime' is not passed to apply
# (passing it is deprecated in recent pandas).
price_divergence = (
    data.groupby('datetime')[['ask_order', 'bid_order']]
    .apply(calc_price_divergence)
    .rename('Price_Divergence')
)

# Put the three per-timestamp factors side by side (aligned on the shared
# 'datetime' index) and turn that index back into a regular column.
factor_data = pd.concat(
    objs=[ob_factor['OB'], order_slope, price_divergence],
    axis='columns',
).reset_index()

# Preview the first rows.
print(factor_data.head())

# Write the factor table to disk without the positional index.
factor_data.to_csv('high_frequency_factors_20241113.csv', index=False)