In [76]:
import numpy as np
import pandas as pd
from tqdm import *

In [77]:
# 143245730 means 14:32:45.730, i.e. the time is encoded as HHMMSSmmm
# Floor to the 60-second bar: an order time of 143245730 becomes 143200000, so every order in 143200000 - 143259999 belongs to the 143200000 bar
def get_second_60s(time_10ms):
    """Floor an HHMMSSmmm timestamp to the start of its 60-second bar.

    The last five digits (seconds and milliseconds) are zeroed, e.g.
    143245730 -> 143200000.
    """
    div = 100000
    minute_index = round(time_10ms // div)  # integer HHMM part
    return round(minute_index * div)  # scale back to the Level-2 time format

# Floor to the 10-second bar: an order time of 143245730 becomes 143240000, so every order in 143240000 - 143249999 belongs to the 143240000 bar
def get_second_10s(time_10ms):
    """Floor an HHMMSSmmm timestamp to the start of its 10-second bar.

    The last four digits (unit seconds and milliseconds) are zeroed, e.g.
    143245730 -> 143240000.
    """
    div = 10000
    ten_second_index = round(time_10ms // div)  # integer HHMMS part
    return round(ten_second_index * div)  # scale back to the Level-2 time format



In [78]:
# Standard time axes: the sorted bar-start times (HHMMSSmmm) that every 10s / 60s series is aligned to
def _build_standard_time_list(step_seconds):
    """Return the sorted, duplicate-free bar-start times for one bar width.

    Times are integers in HHMMSSmmm form. The list starts with 92500000
    (presumably the opening call-auction bar - confirm) and then holds every
    bar start spaced ``step_seconds`` apart in 09:30:00-11:29:59 and in
    13:00:00-15:00:00. The 15:00:00 bar is included; 11:30:00 is not.

    ``step_seconds`` must divide 60 so that bars line up with whole minutes.
    """
    # (first second, end second) of each range in seconds since midnight.
    # The end is exclusive, hence the +1 that keeps the 15:00:00 bar.
    sessions = (
        (9 * 3600 + 30 * 60, 11 * 3600 + 30 * 60),
        (13 * 3600, 15 * 3600 + 1),
    )
    bar_times = [92500000]
    for first_second, end_second in sessions:
        for second in range(first_second, end_second, step_seconds):
            hours, remainder = divmod(second, 3600)
            minutes, seconds = divmod(remainder, 60)
            # Re-encode as HHMMSS followed by a zero millisecond part
            bar_times.append((hours * 10000 + minutes * 100 + seconds) * 1000)
    return bar_times


# Generated directly from the time ranges: only bar starts are visited, so
# there is nothing to de-duplicate or sort afterwards.
standard_time_list_10s = _build_standard_time_list(10)
standard_time_list_60s = _build_standard_time_list(60)

In [79]:
def _process_minute_data0(minute_data0, preclose, step = "60s"):
    """Align per-bar fields to the standard time axis and fill the gaps.

    Parameters
    ----------
    minute_data0 : pandas.DataFrame
        One column per field. Each column is unstacked on index level 0, so a
        two-level index is required (presumably (code, time) - confirm
        against the caller).
    preclose : scalar, dict or Series
        Passed to ``fillna`` for price bars that are still empty after
        forward-filling.
    step : str
        Suffix of the module-level ``standard_time_list_<step>`` axis to use.

    Returns
    -------
    pandas.DataFrame
        One column per field, indexed by the stacked (time axis, code) pairs.

    Raises
    ------
    KeyError
        If no ``standard_time_list_<step>`` exists at module level.
    """
    # Plain namespace lookup instead of eval(): `step` only selects a name and
    # is never executed as code. Looked up once, not once per field.
    standard_time_list = globals()[f"standard_time_list_{step}"]
    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas 2.1+);
    # use whichever this pandas version provides.
    elementwise = pd.DataFrame.map if hasattr(pd.DataFrame, "map") else pd.DataFrame.applymap

    minute_data = {}
    for field in minute_data0.columns:
        df = minute_data0[field].unstack(0)
        if field != "volume":
            # Everything except volume is rounded cell by cell.
            # NOTE(review): my_round2 is not defined in this part of the
            # notebook; it must be provided elsewhere before this is called.
            df = elementwise(df, lambda x: my_round2(x))
        df = df.reindex(standard_time_list)  # rows outside the standard axis are dropped
        df.columns = df.columns.map(lambda x: x[:6])  # keep the first 6 characters of each code
        if field in ["open", "high", "low", "close"]:
            # Prices: 0 counts as missing; carry the last value forward, then
            # fall back to preclose for leading gaps.
            df = df.replace(0, np.nan).ffill().fillna(preclose)
        else:
            # Volume / amount: a bar without data is zero
            df = df.fillna(0.0)
        minute_data[field] = df.stack()
    minute_data = pd.concat(minute_data, axis=1)
    return minute_data

In [80]:
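# Build the bar-frequency basic data (open/high/low/close, volume, amount). `step` is the bar interval, e.g. 1s, 10s or 60s; 60s is used here, but it is kept as a parameter to cover other needs (each step needs matching get_second_<step> and standard_time_list_<step> definitions).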
def update_stock_basic_minute_data(date, step = "60s"):
    """Build one day's bar data from tick-by-tick trades and save it.

    Reads ``./sample_data/local_trans_sample_{date}.fea`` (trades) and
    ``./sample_data/local_tick_sample_{date}.fea`` (for preclose), aggregates
    the trades into open/high/low/close/volume/amount per code and bar, and
    writes ``./sample_data/local_basic{step}_sample_{date}.fea``.

    Parameters
    ----------
    date : str
        Date string used in the file names, e.g. "20240726".
    step : str
        Bar-width suffix. Module-level ``get_second_<step>`` and
        ``standard_time_list_<step>`` must both exist.

    Returns
    -------
    None
        Any exception is caught and passed to ``log_info`` instead of being
        raised. ``log_info`` is not defined in this part of the notebook and
        must be provided elsewhere.
    """
    def _process_minute_data0(minute_data0, preclose, step = "60s"):
        """Align every field to the standard time axis and fill empty bars."""
        # Plain namespace lookup instead of eval(): `step` only selects a name
        # and is never executed as code. Looked up once, not once per field.
        standard_time_list = globals()[f"standard_time_list_{step}"]
        # DataFrame.applymap is deprecated in favour of DataFrame.map
        # (pandas 2.1+); use whichever this pandas version provides.
        elementwise = pd.DataFrame.map if hasattr(pd.DataFrame, "map") else pd.DataFrame.applymap

        minute_data = {}
        for field in minute_data0.columns:
            df = minute_data0[field].unstack(0)  # rows: time, columns: code
            if field != "volume":
                # Everything except volume is rounded to two decimals; the
                # small epsilon nudges values sitting just below a half-cent up.
                df = elementwise(df, lambda x: round(x+1e-8, 2))
            # Align to the standard time axis and to the codes that have a
            # preclose; bars and codes outside either set are dropped.
            df = df.reindex(standard_time_list).reindex(preclose.keys(), axis = 1)
            if field in ["open", "high", "low", "close"]:
                # Prices: 0 counts as missing; carry the last value forward,
                # then fall back to preclose for leading gaps.
                # NOTE(review): each field is forward-filled independently, so
                # an empty bar repeats the previous bar's open/high/low/close
                # rather than becoming a flat bar at the previous close -
                # confirm this is intended.
                df = df.replace(0, np.nan).ffill().fillna(preclose)
            else:
                # Volume / amount: a bar without trades is zero
                df = df.fillna(0.0)
            minute_data[field] = df.stack()
        return pd.DataFrame(minute_data)

    # Data name used when logging a failure; set before the try block so the
    # handler can always refer to it.
    name = rf"StockBasic{step}Data"
    try:
        local_file = rf"./sample_data/local_basic{step}_sample_{date}.fea"

        # Tick-by-tick trade data, the source of each bar's OHLC, volume and amount
        trans_data =  pd.read_feather(rf"./sample_data/local_trans_sample_{date}.fea")
        get_second = globals()[f"get_second_{step}"]  # bar-flooring function for this step
        trans_data["time"] = trans_data["time"].map(get_second)

        # Aggregate trades per (code, bar); first/last follow the row order of the file
        data_group = trans_data.groupby(["code", "time"])
        minute_data0 = data_group.agg({"transPrice": ["first", "max", "min", "last"],
                                        "transVolume": "sum",
                                        "transAmount": "sum"})
        minute_data0.columns = ["open", "high", "low", "close", "volume", "amount"]

        # The day's tick data is read only to obtain preclose, as a {code: preclose} dict
        preclose = (
            pd.read_feather(rf"./sample_data/local_tick_sample_{date}.fea", columns = ["code", "preclose"])
            .groupby(["code"])
            .max()
            .dropna()
            .to_dict()["preclose"]
        )

        # Align to the standard axis, fill the gaps, then order by code and time
        minute_data = _process_minute_data0(minute_data0, preclose, step)
        minute_data = minute_data.reset_index().sort_values(["code", "time"])

        # Feather needs a default integer index
        minute_data = minute_data.reset_index(drop=True)
        minute_data.to_feather(local_file, compression='zstd')

    except Exception as e:
        # Best effort: one failing day is logged and does not stop the caller's loop
        log_info(name, date, str(e))

# Days to process, as the date strings used in the sample-data file names
date_list = [
    "20240726",
    "20240729",
    "20240730",
    "20240731",
]
# Build and save the 60-second bars for each day, showing a progress bar
for date in tqdm(date_list):
    update_stock_basic_minute_data(date=date, step="60s")

100%|██████████| 4/4 [00:47<00:00, 11.87s/it]
