In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from datetime import datetime
import csv
import functools
import warnings
warnings.filterwarnings('ignore') 
import os
from collections import defaultdict, deque
import paramiko
from functools import partial
import pickle
import redis
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

pd.set_option('expand_frame_repr', True) # pycharm设置为False可以产生滑条，但vscode不能产生滑条所以会堆叠。因此vscode应该设置为True
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
# pd.set_option('display.max_rows', 100) # 显示的最大行数（避免只显示部分行数据）
pd.set_option('display.max_columns', 1000) # 显示的最大列数（避免列显示不全）
# pd.set_option("display.max_colwidth",1000) # 每一列最大的宽度（避免属性值或列名显示不全）
pd.set_option('display.width', 180) # 每一行的宽度（避免换行）
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.figsize'] = (16, 10)
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [2]:
import rqdatac as rq
rq.init("15626436420", "vista2525")

path = '/home/yby/SGD-HFT-Intern/Projects/T0/Data'

In [3]:
start_date = 20210506
end_date = 20210630
rq.get_trading_dates(start_date=start_date, end_date = end_date)

[datetime.date(2021, 5, 6),
 datetime.date(2021, 5, 7),
 datetime.date(2021, 5, 10),
 datetime.date(2021, 5, 11),
 datetime.date(2021, 5, 12),
 datetime.date(2021, 5, 13),
 datetime.date(2021, 5, 14),
 datetime.date(2021, 5, 17),
 datetime.date(2021, 5, 18),
 datetime.date(2021, 5, 19),
 datetime.date(2021, 5, 20),
 datetime.date(2021, 5, 21),
 datetime.date(2021, 5, 24),
 datetime.date(2021, 5, 25),
 datetime.date(2021, 5, 26),
 datetime.date(2021, 5, 27),
 datetime.date(2021, 5, 28),
 datetime.date(2021, 5, 31),
 datetime.date(2021, 6, 1),
 datetime.date(2021, 6, 2),
 datetime.date(2021, 6, 3),
 datetime.date(2021, 6, 4),
 datetime.date(2021, 6, 7),
 datetime.date(2021, 6, 8),
 datetime.date(2021, 6, 9),
 datetime.date(2021, 6, 10),
 datetime.date(2021, 6, 11),
 datetime.date(2021, 6, 15),
 datetime.date(2021, 6, 16),
 datetime.date(2021, 6, 17),
 datetime.date(2021, 6, 18),
 datetime.date(2021, 6, 21),
 datetime.date(2021, 6, 22),
 datetime.date(2021, 6, 23),
 datetime.date(2021, 6,

In [4]:
def gen_date_ticker_dict(start_date = 20210506, end_date = 20210630):
    trading_dates = rq.get_trading_dates(start_date=start_date, end_date = end_date)
    trading_dates = list(map(lambda x: datetime.strftime(x, "%Y%m%d"), trading_dates))

    date_ticker_dict = defaultdict(list)
    tickers = os.listdir(path)
    for ticker in tickers:
        date_list = os.listdir(f'{path}/{ticker}/')
        for date in date_list:
            date = date[:8]
            if date in trading_dates:
                date_ticker_dict[date].append(ticker)

    return date_ticker_dict

date_ticker_dict = gen_date_ticker_dict()
date_ticker_dict

defaultdict(list, {})

In [5]:
def rotate_key_value_monthly(kv_dict):

    vk_dict = defaultdict(partial(defaultdict, list))
    for k, v in kv_dict.items():
        month = k[4:6]
        for value in v:
            vk_dict[month][value].append(k)

    return vk_dict

ticker_date_dict = rotate_key_value_monthly(date_ticker_dict)
ticker_date_dict.keys()

dict_keys([])

In [6]:
def gen_mapping():
    cnt = 0
    classifier_mappping = {}
    for i in range(3):
        for j in range(3):
            for k in range(3):
                classifier_mappping[(i, j, k)] = cnt
                cnt += 1
    return classifier_mappping

MAPPING = gen_mapping()
MAPPING

{(0, 0, 0): 0,
 (0, 0, 1): 1,
 (0, 0, 2): 2,
 (0, 1, 0): 3,
 (0, 1, 1): 4,
 (0, 1, 2): 5,
 (0, 2, 0): 6,
 (0, 2, 1): 7,
 (0, 2, 2): 8,
 (1, 0, 0): 9,
 (1, 0, 1): 10,
 (1, 0, 2): 11,
 (1, 1, 0): 12,
 (1, 1, 1): 13,
 (1, 1, 2): 14,
 (1, 2, 0): 15,
 (1, 2, 1): 16,
 (1, 2, 2): 17,
 (2, 0, 0): 18,
 (2, 0, 1): 19,
 (2, 0, 2): 20,
 (2, 1, 0): 21,
 (2, 1, 1): 22,
 (2, 1, 2): 23,
 (2, 2, 0): 24,
 (2, 2, 1): 25,
 (2, 2, 2): 26}

In [7]:
loading_path = '/home/yby/SGD-HFT-Intern/Projects/T0/Data_labels/'


def get_file_list():
    dict_stock_dates = defaultdict(list)
    stock_list = os.listdir(loading_path)
    for s in stock_list:
        dict_stock_dates[s] = os.listdir(f"{loading_path}{s}/")
    return stock_list, dict_stock_dates

stock_list, dick_stock_dates = get_file_list()
stock_list.__len__()

500

In [8]:
dick_stock_dates['000021']

['20211014.pkl',
 '20210628.pkl',
 '20211020.pkl',
 '20210517.pkl',
 '20211019.pkl',
 '20210524.pkl',
 '20210519.pkl',
 '20210622.pkl',
 '20210520.pkl',
 '20210617.pkl',
 '20210527.pkl',
 '20210518.pkl',
 '20211013.pkl',
 '20210630.pkl',
 '20210609.pkl',
 '20210513.pkl',
 '20211015.pkl',
 '20210507.pkl',
 '20211027.pkl',
 '20211021.pkl',
 '20210615.pkl',
 '20211026.pkl',
 '20210616.pkl',
 '20210526.pkl',
 '20210608.pkl',
 '20210611.pkl',
 '20211029.pkl',
 '20210511.pkl',
 '20210604.pkl',
 '20210601.pkl',
 '20210531.pkl',
 '20210510.pkl',
 '20211018.pkl',
 '20210625.pkl',
 '20211022.pkl',
 '20211028.pkl',
 '20210506.pkl',
 '20210603.pkl',
 '20210624.pkl',
 '20210618.pkl',
 '20210525.pkl',
 '20210623.pkl',
 '20211011.pkl',
 '20210629.pkl',
 '20210607.pkl',
 '20210621.pkl',
 '20210610.pkl',
 '20210528.pkl',
 '20211012.pkl',
 '20210514.pkl',
 '20211025.pkl',
 '20210602.pkl',
 '20210521.pkl',
 '20211008.pkl',
 '20210512.pkl']

In [10]:
data = pd.read_pickle("../../Data/000008/20210701.pkl")
data.head()

Unnamed: 0_level_0,date,code,timeidx,price,vwp,ask_price,bid_price,ask_price2,bid_price2,ask_price4,bid_price4,ask_price8,bid_price8,spread,tick_spread,ref_ind_0,ref_ind_1,ask_weight_14,ask_weight_13,ask_weight_12,ask_weight_11,ask_weight_10,ask_weight_9,ask_weight_8,ask_weight_7,ask_weight_6,ask_weight_5,ask_weight_4,ask_weight_3,ask_weight_2,ask_weight_1,ask_weight_0,bid_weight_0,bid_weight_1,bid_weight_2,bid_weight_3,bid_weight_4,bid_weight_5,bid_weight_6,bid_weight_7,bid_weight_8,bid_weight_9,bid_weight_10,bid_weight_11,bid_weight_12,bid_weight_13,bid_weight_14,ask_dec,bid_dec,ask_inc,bid_inc,ask_inc2,bid_inc2,preclose,limit,turnover,p_2,p_5,p_18,p_diff,circulation_mv
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
09:30:03,20210701,8,3,2.215,2.2197,2.22,2.21,2.22,2.21,2.22,2.21,2.22,2.21,4.514673,4.514673,-0.002536,-0.001686,0,0,0,0,0,235.158,54.763,27.1365,80.9856,39.498,75.6874,150.7725,310.7328,182.0349,40.2042,179.2973,129.822,45.0921,47.3714,1.4105,6.3936,15.996,0.0,0.1065,0.0212,5.0218,0,0,0,0,0.0,0.0,0.0,0.0,42.155215,27.664464,2.22,0.1,2.2197,0.0,0.0,0.0,0.0,5863477000.0
09:30:06,20210701,8,6,2.215,2.22,2.22,2.21,2.22,2.21,2.22,2.21,2.22,2.21,4.514673,4.514673,-0.003521,-0.002222,0,0,0,0,0,235.158,55.269,28.1441,81.2592,39.6796,75.6874,151.38,314.6976,182.4809,39.8046,179.2973,143.968,48.3771,49.5514,1.4322,8.1216,16.512,0.0,0.1065,0.0212,5.0218,0,0,0,0,0.3987,0.0,0.0,0.0,0.270673,3.133117,2.22,0.1,0.5106,0.0,0.0,0.0,0.0,5863477000.0
09:30:09,20210701,8,9,2.215,2.21,2.22,2.21,2.22,2.21,2.22,2.21,2.22,2.21,4.514673,4.514673,-0.003145,-0.002018,0,0,0,0,0,235.158,55.269,30.8921,81.2592,39.6796,75.7552,151.515,314.6976,183.3729,41.514,180.3802,146.63,48.3771,50.4234,1.4322,8.1216,16.512,0.0,0.1065,0.0212,5.0218,0,0,0,0,0.0,0.0,0.0,0.0,1.892496,1.651725,2.22,0.1,0.0221,0.0,0.0,0.0,0.0,5863477000.0
09:30:12,20210701,8,12,2.215,2.215,2.22,2.21,2.22,2.21,2.22,2.21,2.22,2.21,4.514673,4.514673,-0.003463,-0.002175,0,0,0,0,0,235.158,55.269,30.8921,81.2592,39.6796,75.7552,151.515,314.6976,183.3729,42.0912,180.3802,146.63,48.3771,50.4234,1.4322,8.1216,16.512,0.0,0.1065,0.0212,5.0218,0,0,0,0,0.0,0.0,0.0,0.0,0.5759,0.0,2.22,0.1,0.0,0.0,0.0,0.0,0.0,5863477000.0
09:30:15,20210701,8,15,2.215,2.215,2.22,2.21,2.22,2.21,2.22,2.21,2.22,2.21,4.514673,4.514673,-0.003457,-0.00197,0,0,0,0,0,235.158,55.269,30.8921,81.2592,39.6796,75.7552,151.515,314.6976,183.3729,42.0912,182.0377,146.96,48.3771,50.4234,1.4322,8.1216,16.512,0.0,0.1065,0.0212,5.0218,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.730358,2.22,0.1,0.0,0.0,0.0,0.0,0.0,5863477000.0


In [17]:
import gzip
import pickle

def save(data, path):
    serialized = pickle.dumps(data)
    with gzip.open(path, 'wb', compresslevel=1, encoding=None) as file_obj:
        file_obj.write(serialized)

def load(path):
    with gzip.open(path, 'rb', compresslevel=1, encoding=None) as file_obj:
        raw_data = file_obj.read()
    return pickle.loads(raw_data)

save_path = "/home/yby/SGD-HFT-Intern/Projects/T0/CNN/train_dir_0/test.pkl"
save(data, save_path)
res = load(save_path)
res

Unnamed: 0_level_0,date,code,timeidx,price,vwp,ask_price,bid_price,ask_price2,bid_price2,ask_price4,bid_price4,ask_price8,bid_price8,spread,tick_spread,ref_ind_0,ref_ind_1,ask_weight_14,ask_weight_13,ask_weight_12,ask_weight_11,ask_weight_10,ask_weight_9,ask_weight_8,ask_weight_7,ask_weight_6,ask_weight_5,ask_weight_4,ask_weight_3,ask_weight_2,ask_weight_1,ask_weight_0,bid_weight_0,bid_weight_1,bid_weight_2,bid_weight_3,bid_weight_4,bid_weight_5,bid_weight_6,bid_weight_7,bid_weight_8,bid_weight_9,bid_weight_10,bid_weight_11,bid_weight_12,bid_weight_13,bid_weight_14,ask_dec,bid_dec,ask_inc,bid_inc,ask_inc2,bid_inc2,preclose,limit,turnover,p_2,p_5,p_18,p_diff,circulation_mv
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
09:30:03,20210701,000008,3,2.215,2.219700,2.22,2.21,2.22,2.21,2.22,2.21,2.22,2.21,4.514673,4.514673,-0.002536,-0.001686,0,0,0,0,0,235.158,54.7630,27.1365,80.9856,39.4980,75.6874,150.7725,310.7328,182.034900,40.20420,179.2973,129.8220,45.0921,47.3714,1.4105,6.3936,15.9960,0.0000,0.1065,0.0212,5.0218,0,0,0,0,0.00000,0.0000,0.0,0.0,42.155215,27.664464,2.22,0.1,2.2197,0.0,0.0,0.0,0.0,5.863477e+09
09:30:06,20210701,000008,6,2.215,2.220000,2.22,2.21,2.22,2.21,2.22,2.21,2.22,2.21,4.514673,4.514673,-0.003521,-0.002222,0,0,0,0,0,235.158,55.2690,28.1441,81.2592,39.6796,75.6874,151.3800,314.6976,182.480900,39.80460,179.2973,143.9680,48.3771,49.5514,1.4322,8.1216,16.5120,0.0000,0.1065,0.0212,5.0218,0,0,0,0,0.39870,0.0000,0.0,0.0,0.270673,3.133117,2.22,0.1,0.5106,0.0,0.0,0.0,0.0,5.863477e+09
09:30:09,20210701,000008,9,2.215,2.210000,2.22,2.21,2.22,2.21,2.22,2.21,2.22,2.21,4.514673,4.514673,-0.003145,-0.002018,0,0,0,0,0,235.158,55.2690,30.8921,81.2592,39.6796,75.7552,151.5150,314.6976,183.372900,41.51400,180.3802,146.6300,48.3771,50.4234,1.4322,8.1216,16.5120,0.0000,0.1065,0.0212,5.0218,0,0,0,0,0.00000,0.0000,0.0,0.0,1.892496,1.651725,2.22,0.1,0.0221,0.0,0.0,0.0,0.0,5.863477e+09
09:30:12,20210701,000008,12,2.215,2.215000,2.22,2.21,2.22,2.21,2.22,2.21,2.22,2.21,4.514673,4.514673,-0.003463,-0.002175,0,0,0,0,0,235.158,55.2690,30.8921,81.2592,39.6796,75.7552,151.5150,314.6976,183.372900,42.09120,180.3802,146.6300,48.3771,50.4234,1.4322,8.1216,16.5120,0.0000,0.1065,0.0212,5.0218,0,0,0,0,0.00000,0.0000,0.0,0.0,0.575900,0.000000,2.22,0.1,0.0000,0.0,0.0,0.0,0.0,5.863477e+09
09:30:15,20210701,000008,15,2.215,2.215000,2.22,2.21,2.22,2.21,2.22,2.21,2.22,2.21,4.514673,4.514673,-0.003457,-0.001970,0,0,0,0,0,235.158,55.2690,30.8921,81.2592,39.6796,75.7552,151.5150,314.6976,183.372900,42.09120,182.0377,146.9600,48.3771,50.4234,1.4322,8.1216,16.5120,0.0000,0.1065,0.0212,5.0218,0,0,0,0,0.00000,0.0000,0.0,0.0,0.000000,1.730358,2.22,0.1,0.0000,0.0,0.0,0.0,0.0,5.863477e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14:56:39,20210701,000008,14199,2.205,2.205000,2.21,2.20,2.21,2.20,2.21,2.20,2.21,2.20,4.535147,4.535147,-0.019493,-0.013140,0,0,0,0,0,44.482,13.9461,117.6708,59.9280,71.9584,193.3650,187.8240,191.0887,250.954794,496.63783,102.5420,199.2681,214.3376,17.8374,29.3328,30.8955,0.8774,4.7499,0.6572,5.1906,0.0000,0,0,0,0,0.00000,0.0000,0.0,0.0,0.000000,0.000000,2.22,0.1,0.0000,0.0,0.0,0.0,0.0,5.863477e+09
14:56:42,20210701,000008,14202,2.205,2.205000,2.21,2.20,2.21,2.20,2.21,2.20,2.21,2.20,4.535147,4.535147,-0.019303,-0.012971,0,0,0,0,0,44.482,13.9461,117.6708,59.9280,71.9584,190.9125,185.8080,191.0887,250.954794,496.63783,99.8580,199.0929,214.3376,17.8374,29.3328,30.8955,0.8774,4.7499,0.6572,5.1906,0.0000,0,0,0,0,0.00000,2.6901,0.0,0.0,-0.022270,-0.036382,2.22,0.1,0.0000,0.0,0.0,0.0,0.0,5.863477e+09
14:56:45,20210701,000008,14205,2.205,2.205000,2.21,2.20,2.21,2.20,2.21,2.20,2.21,2.20,4.535147,4.535147,-0.019665,-0.013183,0,0,0,0,0,44.482,13.9461,117.6708,59.9280,71.9584,190.9125,185.8080,191.0887,250.954794,496.63783,99.8580,198.6768,214.3376,17.8374,29.3328,30.8955,0.8774,4.7499,0.6572,5.1906,0.0000,0,0,0,0,0.00000,0.0000,0.0,0.0,0.000000,-0.086656,2.22,0.1,0.0000,0.0,0.0,0.0,0.0,5.863477e+09
14:56:51,20210701,000008,14211,2.205,2.210000,2.21,2.20,2.21,2.20,2.21,2.20,2.21,2.20,4.535147,4.535147,-0.019603,-0.013148,0,0,0,0,0,44.482,13.9461,117.6708,59.9280,71.9584,190.9125,185.8080,190.9995,250.954794,496.41683,99.8580,198.6768,213.9452,17.8374,29.3328,30.8955,0.8774,4.7499,0.6572,5.1906,0.0000,0,0,0,0,0.11025,0.0000,0.0,0.0,-0.001874,-0.008489,2.22,0.1,0.1105,0.0,0.0,0.0,0.0,5.863477e+09


In [11]:
data_label = pd.read_pickle("../../Data_labels/000008/20210701.pkl")
data_label.head()

FileNotFoundError: [Errno 2] No such file or directory: '../../Data_labels/000008/20210701.pkl'

In [14]:
list(set(data_label.columns).difference(data.columns))

['time', 'cls_18', 'cls_2', 'cls_5', 'vwp_pct', 'price_pct']

In [15]:
data_label.shape


(3149, 67)

In [16]:
data.shape

(3149, 61)

In [17]:
data_label.columns

Index(['time', 'date', 'code', 'timeidx', 'price', 'vwp', 'ask_price', 'bid_price', 'ask_price2', 'bid_price2', 'ask_price4', 'bid_price4', 'ask_price8', 'bid_price8', 'spread',
       'tick_spread', 'ref_ind_0', 'ref_ind_1', 'ask_weight_14', 'ask_weight_13', 'ask_weight_12', 'ask_weight_11', 'ask_weight_10', 'ask_weight_9', 'ask_weight_8',
       'ask_weight_7', 'ask_weight_6', 'ask_weight_5', 'ask_weight_4', 'ask_weight_3', 'ask_weight_2', 'ask_weight_1', 'ask_weight_0', 'bid_weight_0', 'bid_weight_1',
       'bid_weight_2', 'bid_weight_3', 'bid_weight_4', 'bid_weight_5', 'bid_weight_6', 'bid_weight_7', 'bid_weight_8', 'bid_weight_9', 'bid_weight_10', 'bid_weight_11',
       'bid_weight_12', 'bid_weight_13', 'bid_weight_14', 'ask_dec', 'bid_dec', 'ask_inc', 'bid_inc', 'ask_inc2', 'bid_inc2', 'preclose', 'limit', 'turnover', 'p_2', 'p_5',
       'p_18', 'p_diff', 'circulation_mv', 'cls_2', 'cls_5', 'cls_18', 'price_pct', 'vwp_pct'],
      dtype='object')

In [18]:
factor_ret_cols = ['timeidx','price_pct','vwp_pct','spread','tick_spread','ref_ind_0','ref_ind_1','ask_weight_14',
                   'ask_weight_13','ask_weight_12','ask_weight_11','ask_weight_10','ask_weight_9','ask_weight_8','ask_weight_7',
                   'ask_weight_6','ask_weight_5','ask_weight_4','ask_weight_3','ask_weight_2','ask_weight_1','ask_weight_0',
                   'bid_weight_0','bid_weight_1','bid_weight_2','bid_weight_3','bid_weight_4','bid_weight_5','bid_weight_6',
                   'bid_weight_7','bid_weight_8','bid_weight_9','bid_weight_10','bid_weight_11','bid_weight_12','bid_weight_13',
                   'bid_weight_14','ask_dec','bid_dec','ask_inc','bid_inc','ask_inc2','bid_inc2','turnover',
                   'tag', 'cls_2', 'cls_5', 'cls_18']
                   
factor_ret_cols.__len__()

48

In [19]:
col_factors1 = ['date', 'time', 'timeidx', 'price', 'vwp', 'ask_price', 'bid_price', 'ask_price2', 'bid_price2',
               'ask_price4', 'bid_price4', 'ask_price8', 'bid_price8', 'spread', 'tick_spread',
               'ref_ind_0', 'ref_ind_1', 'ask_weight_14', 'ask_weight_13', 'ask_weight_12', 'ask_weight_11',
               'ask_weight_10', 'ask_weight_9',
               'ask_weight_8', 'ask_weight_7', 'ask_weight_6', 'ask_weight_5', 'ask_weight_4',
               'ask_weight_3', 'ask_weight_2', 'ask_weight_1', 'ask_weight_0', 'bid_weight_0',
               'bid_weight_1', 'bid_weight_2', 'bid_weight_3', 'bid_weight_4', 'bid_weight_5',
               'bid_weight_6', 'bid_weight_7', 'bid_weight_8', 'bid_weight_9', 'bid_weight_10',
               'bid_weight_11', 'bid_weight_12', 'bid_weight_13', 'bid_weight_14', 'ask_dec', 'bid_dec',
               'ask_inc', 'bid_inc', 'ask_inc2', 'bid_inc2', 'preclose', 'limit', 'turnover']
col_factors2 = ['date','code','timeidx','price','vwp','ask_price','bid_price','ask_price2','bid_price2','ask_price4',
               'bid_price4','ask_price8','bid_price8','spread','tick_spread','ref_ind_0','ref_ind_1','ask_weight_14',
               'ask_weight_13','ask_weight_12','ask_weight_11','ask_weight_10','ask_weight_9','ask_weight_8','ask_weight_7',
               'ask_weight_6','ask_weight_5','ask_weight_4','ask_weight_3','ask_weight_2','ask_weight_1','ask_weight_0',
               'bid_weight_0','bid_weight_1','bid_weight_2','bid_weight_3','bid_weight_4','bid_weight_5','bid_weight_6',
               'bid_weight_7','bid_weight_8','bid_weight_9','bid_weight_10','bid_weight_11','bid_weight_12','bid_weight_13',
               'bid_weight_14','ask_dec','bid_dec','ask_inc','bid_inc','ask_inc2','bid_inc2','preclose','limit','turnover',
               'circulation_mv', 'p_2','p_5','p_18','p_diff']
col_factors1.__len__()

56

In [20]:
list(set(col_factors1).difference(col_factors2))

['time']

In [21]:
col_factors = ['date', 'time', 'timeidx', 'price', 'vwp', 'ask_price', 'bid_price', 'ask_price2', 'bid_price2',
               'ask_price4', 'bid_price4', 'ask_price8', 'bid_price8', 'spread', 'tick_spread',
               'ref_ind_0', 'ref_ind_1', 'ask_weight_14', 'ask_weight_13', 'ask_weight_12', 'ask_weight_11',
               'ask_weight_10', 'ask_weight_9',
               'ask_weight_8', 'ask_weight_7', 'ask_weight_6', 'ask_weight_5', 'ask_weight_4',
               'ask_weight_3', 'ask_weight_2', 'ask_weight_1', 'ask_weight_0', 'bid_weight_0',
               'bid_weight_1', 'bid_weight_2', 'bid_weight_3', 'bid_weight_4', 'bid_weight_5',
               'bid_weight_6', 'bid_weight_7', 'bid_weight_8', 'bid_weight_9', 'bid_weight_10',
               'bid_weight_11', 'bid_weight_12', 'bid_weight_13', 'bid_weight_14', 'ask_dec', 'bid_dec',
               'ask_inc', 'bid_inc', 'ask_inc2', 'bid_inc2', 'preclose', 'limit', 'turnover']

factor_ret_cols = ['timeidx', 'price', 'vwp', 'spread', 'tick_spread', 'ref_ind_0', 'ref_ind_1',
                   'ask_weight_14', 'ask_weight_13', 'ask_weight_12', 'ask_weight_11',
                   'ask_weight_10', 'ask_weight_9', 'ask_weight_8', 'ask_weight_7',
                   'ask_weight_6', 'ask_weight_5', 'ask_weight_4', 'ask_weight_3',
                   'ask_weight_2', 'ask_weight_1', 'ask_weight_0', 'bid_weight_0',
                   'bid_weight_1', 'bid_weight_2', 'bid_weight_3', 'bid_weight_4',
                   'bid_weight_5', 'bid_weight_6', 'bid_weight_7', 'bid_weight_8',
                   'bid_weight_9', 'bid_weight_10', 'bid_weight_11', 'bid_weight_12',
                   'bid_weight_13', 'bid_weight_14', 'ask_dec', 'bid_dec', 'ask_inc',
                   'bid_inc', 'ask_inc2', 'bid_inc2', '10']

In [22]:
list(set(col_factors).difference(factor_ret_cols))

['limit',
 'turnover',
 'preclose',
 'ask_price2',
 'bid_price2',
 'bid_price4',
 'date',
 'time',
 'bid_price8',
 'ask_price8',
 'ask_price',
 'ask_price4',
 'bid_price']

In [23]:
list(set(factor_ret_cols).difference(col_factors))

['10']

In [12]:
path = '/sgd-data/t0_data/500factor/500factors'
# path = '/home/yby/YBY/HF_Proj'
allpath = []
allname = []
def getallfile(path):
    allfilelist = os.listdir(path)
    # 遍历该文件夹下的所有目录或者文件
    for file in allfilelist:
        filepath = os.path.join(path, file)
        # 如果是文件夹，递归调用函数
        if os.path.isdir(filepath):
            getallfile(filepath)
        # 如果不是文件夹，保存文件路径及文件名
        elif os.path.isfile(filepath):
            allpath.append(filepath)
            allname.append(file)
    return allpath, allname

allpath, allname = getallfile(path)
allpath

['/sgd-data/t0_data/500factor/500factors/002925/20210721.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210813.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20211008.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210706.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210903.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20211018.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210604.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210906.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210805.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210511.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210615.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210623.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210917.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210720.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210624.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/2021101

In [13]:
allpath[0].split('/')[-1][:-4]

'20210721'

In [14]:
file_path = [x for x in allpath if int(x.split('/')[-1][:-4])]
file_path

['/sgd-data/t0_data/500factor/500factors/002925/20210721.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210813.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20211008.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210706.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210903.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20211018.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210604.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210906.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210805.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210511.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210615.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210623.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210917.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210720.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/20210624.csv',
 '/sgd-data/t0_data/500factor/500factors/002925/2021101

In [15]:
def gen_df_full_time():
  full_time = list(pd.date_range(start='09:30:00', end='11:30:00', freq='S'))
  full_time.extend(list(pd.date_range(start='13:00:00', end='15:00:00', freq='S')))
  full_time = [str(x.time()) for x in full_time]
  df_full_time = pd.DataFrame(index=full_time, columns={'price'})
  return df_full_time

gen_df_full_time()

Unnamed: 0,price
09:30:00,
09:30:01,
09:30:02,
09:30:03,
09:30:04,
...,...
14:59:56,
14:59:57,
14:59:58,
14:59:59,


In [28]:
import redis
rs = redis.Redis(host=os.environ.get("REDIS_HOST", "127.0.0.1"), port=3056, password="", db = 0, health_check_interval=30)
rs.keys()[:50]

[b'zjr_1min_300853.SZSE_20220224',
 b'zjr_1min_300651.SZSE_20201102',
 b'zjr_1min_603290.SSE_20220224',
 b'zjr_1min_603758.SSE_20211111',
 b'zjr_1min_601666.SSE_20190423',
 b'zjr_1min_300432.SZSE_20190903',
 b'zjr_1min_300407.SZSE_20200923',
 b'zjr_1min_002512.SZSE_20200605',
 b'zjr_1min_000963.SZSE_20200909',
 b'zjr_1min_000400.SZSE_20210112',
 b'zjr_1min_600393.SSE_20191220',
 b'zjr_1min_002366.SZSE_20191010',
 b'zjr_1min_300147.SZSE_20190108',
 b'zjr_1min_300630.SZSE_20220121',
 b'zjr_1min_600215.SSE_20210526',
 b'zjr_1min_300633.SZSE_20190418',
 b'zjr_1min_603111.SSE_20200804',
 b'zjr_1min_603429.SSE_20211213',
 b'zjr_1min_000892.SZSE_20200911',
 b'zjr_1min_002318.SZSE_20210330',
 b'zjr_1min_002253.SZSE_20220124',
 b'zjr_1min_600986.SSE_20190305',
 b'zjr_1min_002161.SZSE_20190626',
 b'zjr_1min_601611.SSE_20190731',
 b'zjr_1min_600038.SSE_20210901',
 b'zjr_1min_002430.SZSE_20220629',
 b'zjr_1min_002612.SZSE_20220301',
 b'zjr_1min_300661.SZSE_20200728',
 b'zjr_1min_000632.SZSE_202106

In [29]:
df_bytes_from_redis = rs.get('zjr_1min_300853.SZSE_20220224')
df_from_redis = pickle.loads(df_bytes_from_redis)
df_from_redis

Unnamed: 0,index,close,high,low,open,volume,mean_bid_sd,sum_bid_sd,mean_ask_sd,sum_ask_sd,vb_sum,va_sum,vb_mean,va_mean,all_traded_volume,all_canceled_volume,all_traded_mean,all_canceled_mean,active_buy_count,active_sell_count,active_buy_rate,active_sell_rate,p_5
0,0,34.80,34.98,34.79,34.79,2200,0.323930,0.045455,-0.323930,-0.045455,6900,6300,530.769231,484.615385,2400.0,7600.0,34.866667,0.0,6.0,9.0,0.400000,0.600000,0.010920
1,1,34.85,34.85,34.80,34.80,4300,0.339950,0.467529,-0.339950,-0.467529,12700,4608,747.058824,271.058824,4300.0,5854.0,34.864483,0.0,1.0,28.0,0.034483,0.965517,0.011191
2,2,34.85,34.89,34.83,34.89,2777,0.264617,0.351686,-0.264617,-0.351686,13969,6700,821.705882,394.117647,2877.0,2200.0,34.876818,0.0,6.0,16.0,0.272727,0.727273,0.011765
3,3,34.87,34.87,34.87,34.87,2900,-0.095482,-0.310638,0.095482,0.310638,8100,15400,675.000000,1283.333333,3100.0,8100.0,34.879167,0.0,5.0,7.0,0.416667,0.583333,0.012332
4,4,34.96,34.98,34.87,34.87,1400,-0.146391,0.450581,0.146391,-0.450581,23218,8794,1547.866667,586.266667,1400.0,10423.0,34.940000,0.0,4.0,5.0,0.444444,0.555556,0.009153
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232,232,34.60,34.60,34.60,34.60,1900,0.304411,0.317241,-0.304411,-0.317241,19100,9900,1736.363636,900.000000,1900.0,4000.0,34.596667,0.0,2.0,1.0,0.666667,0.333333,0.000000
233,233,34.59,34.62,34.59,34.60,3900,-0.553891,-0.838791,0.553891,0.838791,6400,73000,914.285714,10428.571429,5100.0,900.0,34.607500,0.0,5.0,7.0,0.416667,0.583333,0.000000
234,234,34.59,34.59,34.59,34.59,1700,0.031645,-0.337058,-0.031645,0.337058,17800,35900,1369.230769,2761.538462,800.0,2200.0,34.590000,0.0,4.0,0.0,1.000000,0.000000,0.000000
235,235,34.60,34.60,34.59,34.59,9500,-0.073787,-0.078554,0.073787,0.078554,221700,259500,15835.714286,18535.714286,9500.0,5400.0,34.597000,0.0,16.0,4.0,0.800000,0.200000,0.000000


In [19]:
factor_ret_cols = ['timeidx','price','vwp','spread','tick_spread','ref_ind_0','ref_ind_1','ask_weight_14',
                   'ask_weight_13','ask_weight_12','ask_weight_11','ask_weight_10','ask_weight_9','ask_weight_8','ask_weight_7',
                   'ask_weight_6','ask_weight_5','ask_weight_4','ask_weight_3','ask_weight_2','ask_weight_1','ask_weight_0',
                   'bid_weight_0','bid_weight_1','bid_weight_2','bid_weight_3','bid_weight_4','bid_weight_5','bid_weight_6',
                   'bid_weight_7','bid_weight_8','bid_weight_9','bid_weight_10','bid_weight_11','bid_weight_12','bid_weight_13',
                   'bid_weight_14','ask_dec','bid_dec','ask_inc','bid_inc','ask_inc2','bid_inc2','turnover',
                   'cls_2', 'cls_5', 'cls_18']
len(factor_ret_cols)

47

In [21]:
import redis
rs = redis.Redis(host='localhost', port=6379, password="", db = 0, health_check_interval=30)

df_bytes_from_redis = rs.get("clslabels_600885_07")
df_from_redis = pickle.loads(df_bytes_from_redis)
a = pd.DataFrame(df_from_redis, columns=factor_ret_cols)
a

Unnamed: 0,timeidx,price,vwp,spread,tick_spread,ref_ind_0,ref_ind_1,ask_weight_14,ask_weight_13,ask_weight_12,ask_weight_11,ask_weight_10,ask_weight_9,ask_weight_8,ask_weight_7,ask_weight_6,ask_weight_5,ask_weight_4,ask_weight_3,ask_weight_2,ask_weight_1,ask_weight_0,bid_weight_0,bid_weight_1,bid_weight_2,bid_weight_3,bid_weight_4,bid_weight_5,bid_weight_6,bid_weight_7,bid_weight_8,bid_weight_9,bid_weight_10,bid_weight_11,bid_weight_12,bid_weight_13,bid_weight_14,ask_dec,bid_dec,ask_inc,bid_inc,ask_inc2,bid_inc2,turnover,cls_2,cls_5,cls_18
0,0.0,0.000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0000,0.0,0.0,0.0
1,0.0,0.000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0000,0.0,0.0,0.0
2,0.0,0.000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0000,0.0,0.0,0.0
3,0.0,0.000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0000,0.0,0.0,0.0
4,0.0,0.000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105076,14206.0,65.295,65.200000,2.909871,0.5,-0.021056,-0.014490,0.0,0.0000,0.0000,1.3158,0.0000,34.8209,1.3132,0.0000,0.0000,0.6554,0.0000,33.404,0.0000,0.0000,5.8859,40.4196,7.1679,6.5130,7.8128,0.0000,0.0,126.7341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.65295,0.0,0.0,0.000000,1.136133,3.9120,0.0,0.0,0.0
105077,14209.0,65.280,65.207500,2.144608,0.5,-0.021320,-0.014652,0.0,0.0000,1.3158,34.1640,0.6569,1.3132,0.0000,0.0000,0.6554,27.5100,0.6548,0.000,5.2320,0.6539,25.4865,18.9091,30.6387,11.7258,6.5110,1.3018,0.0,18.8507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,22.391040,7.180800,2.6083,0.0,0.0,0.0
105078,14212.0,65.280,65.210000,2.144608,0.5,-0.021355,-0.014720,0.0,0.0000,1.3158,34.1640,0.6569,1.3132,0.0000,0.0000,0.6554,27.5100,0.6548,0.000,5.2320,0.6539,25.4865,23.4730,35.2007,11.7258,6.5110,1.3018,0.0,6.5010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.65280,0.0,0.0,0.000000,9.341568,2.6084,0.0,0.0,0.0
105079,14215.0,65.250,65.206667,1.532567,0.5,-0.021079,-0.014511,0.0,1.3168,1.3158,36.1341,0.0000,0.0000,0.0000,0.6554,27.5100,0.0000,0.0000,5.232,25.4865,0.0000,4.9628,64.5436,11.7299,10.4208,7.8128,0.0000,0.0,20.8010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.52500,0.0,0.0,4.110750,21.617325,9.7810,0.0,0.0,0.0


In [38]:
rs.keys()[:50]

[b"b'numpy_20210819_002683'",
 b"b'numpy_20210721_601997'",
 b"b'numpy_20210818_600901'",
 b"b'numpy_20210823_002465'",
 b"b'numpy_20210616_300316'",
 b"b'numpy_20210513_002372'",
 b"b'numpy_20210702_000156'",
 b"b'numpy_20210630_600862'",
 b"b'numpy_20210629_601928'",
 b"b'numpy_20210924_600985'",
 b"b'numpy_20210826_002317'",
 b"b'numpy_20210727_300024'",
 b"b'numpy_20210907_300496'",
 b"b'numpy_20210621_600312'",
 b"b'numpy_20210608_603056'",
 b"b'numpy_20210625_601975'",
 b"b'numpy_20210914_603056'",
 b"b'numpy_20210519_002583'",
 b"b'numpy_20211015_000519'",
 b"b'numpy_20210507_300088'",
 b"b'numpy_20210722_600643'",
 b"b'numpy_20210603_002221'",
 b"b'numpy_20211025_000402'",
 b"b'numpy_20210701_002701'",
 b"b'numpy_20210527_600885'",
 b"b'numpy_20210831_002966'",
 b"b'numpy_20210823_600372'",
 b"b'numpy_20210723_002212'",
 b"b'numpy_20210519_002064'",
 b"b'numpy_20210526_300024'",
 b"b'numpy_20210913_600376'",
 b"b'numpy_20210629_300002'",
 b"b'numpy_20210819_000415'",
 b"b'numpy

In [53]:
import gym
env = gym.make("LunarLander-v2", render_mode="human")
env.action_space.seed(42)

observation, info = env.reset(seed=42, return_info=True)

for _ in range(1000):
    observation, reward, done, info = env.step(env.action_space.sample())

    if done:
        observation, info = env.reset(return_info=True)

env.close()

TypeError: __init__() got an unexpected keyword argument 'render_mode'

In [50]:
a = all_redis_keys[0].decode(encoding = 'utf-8')
a

"b'numpy_20210819_002683'"

In [47]:
train_start_date = '20210701'
train_end_date = '20210930'
all_redis_keys = rs.keys()
keys_to_shard = [x.decode(encoding = 'utf-8') for x in all_redis_keys
                if ((len(x.decode(encoding = 'utf-8').split('_')) == 3)
                and (x.decode(encoding = 'utf-8').split('_')[2] <= train_end_date[4:6])
                and (x.decode(encoding = 'utf-8').split('_')[2] >= train_start_date[4:6])
                and (x.decode(encoding = 'utf-8').split('_')[0] == 'numpy'))]
keys_to_shard

[]

In [29]:
def shard_keys(start_date, end_date, seq_len = 50, time_step = 1, db = 0):
    shard_dict_whole = dict()
    rs = ut.redis_connection(db=db)
    all_redis_keys = rs.keys()
    keys_to_shard = [x.decode(encoding = 'utf-8') for x in all_redis_keys
                    if ((len(x.decode(encoding = 'utf-8').split('_')) == 3)
                    and (x.decode(encoding = 'utf-8').split('_')[2] <= end_date[4:6])
                    and (x.decode(encoding = 'utf-8').split('_')[2] >= start_date[4:6])
                    and (x.decode(encoding = 'utf-8').split('_')[0] == 'numpy'))]
    cnt = 0 # 记录所有序列的长度
    keys_to_shard.sort()
    for key in keys_to_shard:
        shard_dict_whole[cnt] = key
        cnt += 1
    rs.close()

    return shard_dict_whole, cnt

In [30]:
train_start_date = '20210701'
train_end_date = '20210930'
shard_dict, key_num = shard_keys(train_start_date, train_end_date, seq_len = 128, time_step = 5, db = 0)
shard_dict

{}

In [31]:
key_num

0

In [24]:
import tst.utilities as ut
rs = ut.redis_connection()
redis_keys = list(rs.keys())
cnn_redis_keys = [x for x in redis_keys if 'CNN' in str(x)]
# train_redis_keys = [x for x in cnn_redis_keys if (int(str(x).split('_')[1]) <= train_end_date)
#                     and (int(str(x).split('_')[1]) >= train_start_date)]
cnn_redis_keys

[]

In [16]:
class RemoteSrc:

    # 147上的路径
    REMOTE_PATH = "/sgd-data/data/stock/"
    # 缓存路径，方便复用，减少网络通讯
    TEMP = "/home/yby/YBY/CNN/backtest_temp/"

    def __init__(self):
        self._client = paramiko.Transport(("192.168.1.147", 22))
        self._client.connect(username="sgd", password="sgd123")
        # 使用sftp文件服务
        self._SFTP = paramiko.SFTPClient.from_transport(self._client)
        if not os.path.exists(self.TEMP):
            os.mkdir(self.TEMP)

        self.dict_stocksPerDay = defaultdict(list)

    def get_raw_bars(self, ticker, date):

        # 本地文件名，用于判断此前是否查询调用过
        local_path = f"{self.TEMP}{ticker}_{date}.csv.gz"

        if not os.path.exists(local_path):
            files_currentDay = self._SFTP.listdir(f"{self.REMOTE_PATH}{date}/tick_csv/")
            if date in self.dict_stocksPerDay.keys():
                stocks_currentDay = self.dict_stocksPerDay[date]
            else:
                stocks_currentDay = [s[:6] for s in files_currentDay]

            file_idx = stocks_currentDay.index(ticker)

            self._SFTP.get(remotepath=f"{self.REMOTE_PATH}{date}/tick_csv/{files_currentDay[file_idx]}",
                           localpath=local_path)

        data = pd.read_csv(local_path)

        # 数据字段处理，按自己需要修改
        data['server_time'] = pd.to_datetime(data.server_time)
        data['local_time'] = data['server_time']
        data['time'] = data.apply(lambda x: str(x['server_time'].time()), axis = 1)

        return data

src = RemoteSrc()

In [17]:
data1 = src.get_raw_bars('000009', '20210701')
range_d = ((data1['time'] > '09:30:00') & (data1['time'] < '11:30:00')) | ((data1['time'] >= '13:00:00')&(data1['time'] < '14:57:00'))

data1 = data1[range_d].reset_index(drop=True)
data1.head()

Unnamed: 0,date,code,server_time,local_time,preclose,open,high,low,last,upper_limit,lower_limit,volume,turnover,iopv,ask_price1,ask_volume1,ask_price2,ask_volume2,ask_price3,ask_volume3,ask_price4,ask_volume4,ask_price5,ask_volume5,ask_price6,ask_volume6,ask_price7,ask_volume7,ask_price8,ask_volume8,ask_price9,ask_volume9,ask_price10,ask_volume10,bid_price1,bid_volume1,bid_price2,bid_volume2,bid_price3,bid_volume3,bid_price4,bid_volume4,bid_price5,bid_volume5,bid_price6,bid_volume6,bid_price7,bid_volume7,bid_price8,bid_volume8,bid_price9,bid_volume9,bid_price10,bid_volume10,time
0,20210701,000009.SZSE,2021-07-01 09:30:03,2021-07-01 09:30:03,18.27,18.53,18.64,18.41,18.55,20.1,16.44,1498288,27774114,0,18.64,2100,18.65,22000,18.66,7200,18.67,22300,18.68,57400,18.69,2600,18.7,9300,18.71,2000,18.72,800,18.73,4100,18.55,70712,18.54,22100,18.53,14000,18.52,300,18.51,7800,18.5,6900,18.49,5700,18.48,1000,18.46,100,18.45,5200,09:30:03
1,20210701,000009.SZSE,2021-07-01 09:30:06,2021-07-01 09:30:06,18.27,18.53,18.64,18.41,18.56,20.1,16.44,1661488,30805528,0,18.59,8000,18.6,2300,18.61,1000,18.63,8900,18.64,3200,18.65,22700,18.66,4700,18.67,22300,18.68,56800,18.69,2600,18.56,1300,18.55,31712,18.54,22100,18.53,20000,18.52,1200,18.51,7800,18.5,9000,18.49,5700,18.48,1100,18.46,100,09:30:06
2,20210701,000009.SZSE,2021-07-01 09:30:09,2021-07-01 09:30:09,18.27,18.53,18.64,18.41,18.57,20.1,16.44,1691588,31364189,0,18.58,6100,18.59,7900,18.6,3400,18.61,1000,18.63,100,18.64,2200,18.65,22700,18.66,4700,18.67,22300,18.68,56300,18.57,5600,18.56,17100,18.55,29112,18.54,40900,18.53,22900,18.52,20300,18.51,7800,18.5,9200,18.49,5700,18.48,1100,09:30:09
3,20210701,000009.SZSE,2021-07-01 09:30:12,2021-07-01 09:30:12,18.27,18.53,18.64,18.41,18.58,20.1,16.44,1776188,32935217,0,18.58,2900,18.59,7000,18.6,5500,18.61,1000,18.63,700,18.64,2200,18.65,22700,18.66,4700,18.67,22300,18.68,56300,18.57,6100,18.56,4200,18.55,32212,18.54,40900,18.53,21800,18.52,20400,18.51,7800,18.5,9200,18.49,5700,18.48,1100,09:30:12
4,20210701,000009.SZSE,2021-07-01 09:30:15,2021-07-01 09:30:15,18.27,18.53,18.64,18.41,18.54,20.1,16.44,1863988,34563841,0,18.57,17400,18.59,10300,18.6,5500,18.61,1000,18.63,700,18.64,2200,18.65,22700,18.66,4700,18.67,22300,18.68,57300,18.54,5312,18.53,21800,18.52,20200,18.51,7800,18.5,10300,18.49,5700,18.48,1100,18.45,5200,18.43,5100,18.42,100,09:30:15


In [18]:
data1.columns

Index(['date', 'code', 'server_time', 'local_time', 'preclose', 'open', 'high', 'low', 'last', 'upper_limit', 'lower_limit', 'volume', 'turnover', 'iopv', 'ask_price1',
       'ask_volume1', 'ask_price2', 'ask_volume2', 'ask_price3', 'ask_volume3', 'ask_price4', 'ask_volume4', 'ask_price5', 'ask_volume5', 'ask_price6', 'ask_volume6',
       'ask_price7', 'ask_volume7', 'ask_price8', 'ask_volume8', 'ask_price9', 'ask_volume9', 'ask_price10', 'ask_volume10', 'bid_price1', 'bid_volume1', 'bid_price2',
       'bid_volume2', 'bid_price3', 'bid_volume3', 'bid_price4', 'bid_volume4', 'bid_price5', 'bid_volume5', 'bid_price6', 'bid_volume6', 'bid_price7', 'bid_volume7',
       'bid_price8', 'bid_volume8', 'bid_price9', 'bid_volume9', 'bid_price10', 'bid_volume10', 'time'],
      dtype='object')

In [21]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4739 entries, 0 to 4738
Data columns (total 55 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          4739 non-null   int64         
 1   code          4739 non-null   object        
 2   server_time   4739 non-null   datetime64[ns]
 3   local_time    4739 non-null   datetime64[ns]
 4   preclose      4739 non-null   float64       
 5   open          4739 non-null   float64       
 6   high          4739 non-null   float64       
 7   low           4739 non-null   float64       
 8   last          4739 non-null   float64       
 9   upper_limit   4739 non-null   float64       
 10  lower_limit   4739 non-null   float64       
 11  volume        4739 non-null   int64         
 12  turnover      4739 non-null   int64         
 13  iopv          4739 non-null   int64         
 14  ask_price1    4739 non-null   float64       
 15  ask_volume1   4739 non-null   int64   

In [13]:
data.columns

Index(['date', 'code', 'timeidx', 'price', 'vwp', 'ask_price', 'bid_price', 'ask_price2', 'bid_price2', 'ask_price4', 'bid_price4', 'ask_price8', 'bid_price8', 'spread',
       'tick_spread', 'ref_ind_0', 'ref_ind_1', 'ask_weight_14', 'ask_weight_13', 'ask_weight_12', 'ask_weight_11', 'ask_weight_10', 'ask_weight_9', 'ask_weight_8',
       'ask_weight_7', 'ask_weight_6', 'ask_weight_5', 'ask_weight_4', 'ask_weight_3', 'ask_weight_2', 'ask_weight_1', 'ask_weight_0', 'bid_weight_0', 'bid_weight_1',
       'bid_weight_2', 'bid_weight_3', 'bid_weight_4', 'bid_weight_5', 'bid_weight_6', 'bid_weight_7', 'bid_weight_8', 'bid_weight_9', 'bid_weight_10', 'bid_weight_11',
       'bid_weight_12', 'bid_weight_13', 'bid_weight_14', 'ask_dec', 'bid_dec', 'ask_inc', 'bid_inc', 'ask_inc2', 'bid_inc2', 'preclose', 'limit', 'turnover', 'p_2', 'p_5',
       'p_18', 'p_diff', 'circulation_mv'],
      dtype='object')

Unnamed: 0_level_0,date,code,timeidx,price,vwp,ask_price,bid_price,ask_price2,bid_price2,ask_price4,bid_price4,ask_price8,bid_price8,spread,tick_spread,ref_ind_0,ref_ind_1,ask_weight_14,ask_weight_13,ask_weight_12,ask_weight_11,ask_weight_10,ask_weight_9,ask_weight_8,ask_weight_7,ask_weight_6,ask_weight_5,ask_weight_4,ask_weight_3,ask_weight_2,ask_weight_1,ask_weight_0,bid_weight_0,bid_weight_1,bid_weight_2,bid_weight_3,bid_weight_4,bid_weight_5,bid_weight_6,bid_weight_7,bid_weight_8,bid_weight_9,bid_weight_10,bid_weight_11,bid_weight_12,bid_weight_13,bid_weight_14,ask_dec,bid_dec,ask_inc,bid_inc,ask_inc2,bid_inc2,preclose,limit,turnover,p_2,p_5,p_18,p_diff,circulation_mv
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
09:30:03,20210506,8,3,2.245,2.244167,2.25,2.24,2.25,2.24,2.25,2.24,2.253323,2.24,4.454343,4.454343,-0.020203,-0.01852,0,0,0,0,0,5.7798,15.4246,11.8784,7.623,28.014,24.961,77.9076,83.8311,57.3588,5.355,25.20672,38.4898,35.0982,30.0118,20.746,12.6363,3.488,1.1718,1.2744,2.5155,0,0,0,0,0,1.4817,5.3431,0.0,0.0,1.640421,0.48043,2.24,0.1,2.4237,-0.002227,-0.002227,0.0,0.0,5970085000.0
09:30:06,20210506,8,6,2.245,2.243559,2.25,2.24,2.25,2.24,2.25,2.24,2.25,2.24,4.454343,4.454343,-0.016139,-0.015986,0,0,0,0,0,5.7798,15.8906,11.8784,8.0388,28.014,29.541,78.3636,88.3711,65.8112,17.3475,29.61952,44.154,37.3182,36.6418,20.768,12.6363,3.488,1.1718,1.2744,2.5155,0,0,0,0,0,0.0,0.0,0.0,0.0,13.985003,5.802876,2.24,0.1,1.3237,-0.002227,-0.002227,-0.002227,0.0,5970085000.0
09:30:09,20210506,8,9,2.245,2.24,2.25,2.24,2.25,2.24,2.25,2.24,2.25,2.24,4.454343,4.454343,-0.01588,-0.015789,0,0,0,0,0,5.7798,15.8906,11.8784,8.0388,28.014,44.5863,80.5752,88.3711,65.8112,17.3475,29.68672,44.154,37.3182,36.6418,20.79,12.6363,3.488,1.1718,1.2744,2.5155,0,0,0,0,0,0.0,0.0,0.0,0.0,0.052309,0.06735,2.24,0.1,0.0448,0.00199,0.002171,0.0,0.000181,5970085000.0
09:30:12,20210506,8,12,2.245,2.24,2.25,2.24,2.25,2.24,2.25,2.24,2.25,2.24,4.454343,4.454343,-0.01564,-0.015313,0,0,0,0,0,6.0606,15.8906,11.8784,8.0388,28.014,44.5863,80.5752,88.3711,66.0372,17.37,29.48512,44.154,37.3182,36.6418,20.79,12.6363,3.488,1.1718,1.2744,2.5155,0,0,0,0,0,0.0,0.20205,0.0,0.0,0.070493,0.0,2.24,0.1,0.2016,-0.002227,0.000247,0.0,0.002475,5970085000.0
09:30:15,20210506,8,15,2.245,2.24,2.25,2.24,2.25,2.24,2.25,2.24,2.25,2.24,4.454343,4.454343,-0.015268,-0.015035,0,0,0,0,0,6.0606,15.8906,11.8784,8.0388,28.014,44.5863,80.5752,88.3711,66.0372,17.37,29.39552,44.154,37.3182,36.6418,20.79,12.6363,3.488,1.1718,1.2744,2.5155,0,0,0,0,0,0.0,0.0898,0.0,0.0,0.0,0.0,2.24,0.1,0.0896,-0.002227,0.001986,0.002227,0.004213,5970085000.0


In [14]:
np.intersect1d(data.columns, data1.columns).tolist()

['ask_price2',
 'ask_price4',
 'ask_price8',
 'bid_price2',
 'bid_price4',
 'bid_price8',
 'code',
 'date',
 'preclose',
 'turnover']

In [15]:
data1.shape

(4739, 55)

In [16]:
data.shape

(4739, 61)

In [12]:
data1[['bid_volume1'] + ['bid_volume2']].sum(axis=1)

0       92812
1       33012
2       22700
3       10300
4       27112
        ...  
4734     6700
4735    10000
4736    15500
4737    14100
4738    13300
Length: 4739, dtype: int64

In [13]:
data1[['bid_volume1', 'bid_volume2']].sum()

bid_volume1    75046348
bid_volume2    76498432
dtype: int64

In [17]:
def calc_wap1(df):
    wap = (df['bid_price1'] * df['ask_volume1'] + df['ask_price1'] * df['bid_volume1']) / (df['bid_volume1'] + df['ask_volume1'])
    return wap

def calc_wap3(df):
    wap = (df['bid_price1'] * df['bid_volume1'] + df['ask_price1'] * df['ask_volume1']) / (df['bid_volume1'] + df['ask_volume1'])
    return wap

def calc_wap_all(df, level_range):
    bp_name = ['bid_price%s'%i for i in level_range]
    ap_name = ['ask_price%s'%i for i in level_range]
    bv_name = ['bid_volume%s'%i for i in level_range]
    av_name = ['ask_volume%s'%i for i in level_range]
    wap = (np.sum(np.multiply(df[bp_name], df[bv_name]).to_numpy(), axis=1) + np.sum(np.multiply(df[ap_name], df[av_name]).to_numpy(), axis=1))/df[bv_name + av_name].sum(axis=1).to_numpy()
    return pd.DataFrame(wap, columns=['vwp'])

def calc_vwap(df):
    df = df.copy()
    # df['price'] = (df['ask_price1'] + df['bid_price1'])/2
    df['ttl'] = (df['high']+df['low']+df['last'])/3
    df['vwap'] = (df['ttl']*df['volume']).expanding(min_periods=1).sum()/df['volume'].expanding(min_periods=1).sum()
    return df['vwap']

def calc_spread(df):
    return (df['ask_price1'] - df['bid_price1'])/(df['ask_price1'] + df['bid_price1']) * 2 * 10000

def calc_spread2(df):
    return (df['ask_price1'] - df['bid_price1'])

In [18]:
calc_spread2(data1)

0       0.09
1       0.03
2       0.01
3       0.01
4       0.03
        ... 
4734    0.03
4735    0.02
4736    0.01
4737    0.01
4738    0.01
Length: 4739, dtype: float64

In [19]:
level = 10
# level_range = list(range(1, level+1, 1))
level_range = [1]
vwap = calc_wap_all(data1, level_range)
vwap

Unnamed: 0,vwp
0,18.552596
1,18.585806
2,18.575214
3,18.573222
4,18.562983
...,...
4734,17.299036
4735,17.277077
4736,17.262527
4737,17.238967


In [20]:
trial = src.get_raw_bars('000009', '20210701')
range_d = ((trial['time'] >= '09:30:00') & (trial['time'] < '11:30:00')) | ((trial['time'] >= '13:00:00')&(trial['time'] < '14:57:00'))
trial = trial[range_d].reset_index(drop=True)

trial['price'] = (trial['ask_price1'] + trial['bid_price1'])/2
trial['ttl'] = (trial['high']+trial['low']+trial['last'])/3
trial['vwap'] = (trial['price']*trial['volume']).expanding(min_periods=1).sum()/trial['volume'].expanding(min_periods=1).sum()


trial['vwap']

0       18.465000
1       18.542499
2       18.555434
3       18.561076
4       18.564312
          ...    
4735    17.667102
4736    17.666955
4737    17.666802
4738    17.666638
4739    17.666474
Name: vwap, Length: 4740, dtype: float64

In [21]:
data.head()

Unnamed: 0_level_0,date,code,timeidx,price,vwp,ask_price,bid_price,ask_price2,bid_price2,ask_price4,bid_price4,ask_price8,bid_price8,spread,tick_spread,ref_ind_0,ref_ind_1,ask_weight_14,ask_weight_13,ask_weight_12,ask_weight_11,ask_weight_10,ask_weight_9,ask_weight_8,ask_weight_7,ask_weight_6,ask_weight_5,ask_weight_4,ask_weight_3,ask_weight_2,ask_weight_1,ask_weight_0,bid_weight_0,bid_weight_1,bid_weight_2,bid_weight_3,bid_weight_4,bid_weight_5,bid_weight_6,bid_weight_7,bid_weight_8,bid_weight_9,bid_weight_10,bid_weight_11,bid_weight_12,bid_weight_13,bid_weight_14,ask_dec,bid_dec,ask_inc,bid_inc,ask_inc2,bid_inc2,preclose,limit,turnover,p_2,p_5,p_18,p_diff,circulation_mv
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
09:30:03,20210701,9,3,18.595,18.552873,18.64,18.55,18.64,18.55,18.64026,18.55,18.64513,18.55,4.840011,0.537779,-0.001545,-0.000754,0.0,0.0,0.0,0.0,0.0,7.6793,1.4976,3.742,17.391,4.8594,107.2232,41.6341,13.4352,41.03,3.9144,131.17076,40.9734,25.942,0.5556,14.4378,12.765,10.5393,1.848,0.0,0.1846,9.594,0.0,0.0,0.0,0,131.31789,0.0,0.0,199.174764,30.261503,49.64865,18.27,0.1,896.6381,-0.001076,-0.002958,-0.025813,-0.001882,44267320000.0
09:30:06,20210701,9,6,18.575,18.574841,18.59,18.56,18.59,18.56,18.59,18.556038,18.59,18.553019,1.615074,0.538358,-0.002098,-0.001306,0.0,0.0,0.0,0.0,4.8594,106.1024,41.6341,8.7702,42.3355,5.9648,16.5807,0.0,1.861,4.278,14.872,2.4128,58.82576,40.9734,37.06,2.2224,14.4378,16.65,10.5393,2.0328,0.0,0.1846,0.0,0.0,0.0,0,0.0,72.4425,0.0,0.0,36.914097,12.619855,18.27,0.1,303.1414,0.0,-0.004038,-0.020458,-0.004038,44267320000.0
09:30:09,20210701,9,9,18.575,18.560166,18.58,18.57,18.58,18.57,18.58,18.57,18.58,18.57,0.538358,0.538358,-0.002004,-0.001059,0.0,0.0,0.0,0.0,105.1684,41.6341,8.7702,42.3355,4.1008,0.1863,0.0,1.861,6.324,14.6861,11.3338,10.3992,31.7376,54.00276,75.8286,42.4337,37.5956,14.4378,17.02,10.5393,2.0328,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,4.18309,79.785197,18.27,0.1,55.8661,-0.001077,-0.007268,-0.022611,-0.006191,44267320000.0
09:30:12,20210701,9,12,18.575,18.570071,18.58,18.57,18.58,18.57,18.58,18.57,18.583268,18.57,0.538358,0.538358,-0.00203,-0.001361,0.0,0.0,0.0,0.0,105.1684,41.6341,8.7702,42.3355,4.1008,1.3041,0.0,1.861,10.23,13.013,5.3882,11.3277,7.7952,59.75326,75.8286,40.3954,37.7808,14.4378,17.02,10.5393,2.0328,0.0,0.0,0.0,0.0,0,7.329695,19.880823,0.0,0.0,3.124315,3.99734,18.27,0.1,157.1028,-0.001884,-0.00996,-0.018304,-0.008075,44267320000.0
09:30:15,20210701,9,15,18.555,18.549248,18.57,18.54,18.57,18.54,18.57,18.54,18.57,18.54,1.616815,0.538938,-0.002606,-0.001779,0.0,0.0,0.0,107.0364,41.6341,8.7702,42.3355,4.1008,1.3041,0.0,1.861,10.23,19.1477,0.0,32.3118,9.848448,40.3954,37.4104,14.4378,19.055,10.5393,2.0328,0.0,0.0,9.594,0.0,9.5835,0.0,0.0,0,5.38095,144.91455,32.2857,0.0,5.366106,0.710656,18.27,0.1,162.8624,-0.002964,-0.009431,-0.019402,-0.006467,44267320000.0


In [22]:
level_range = range(15)
bp_name = ['bid_weight_%s'%i for i in level_range]
ap_name = ['ask_weight_%s'%i for i in level_range]

In [23]:
np.sum(data[ap_name+bp_name], axis=1)

time
09:30:03     490.416660
09:30:06     432.596960
09:30:09     532.427560
09:30:12     510.716160
09:30:15     421.628248
               ...     
14:56:45     937.283142
14:56:48     775.056042
14:56:51     957.221642
14:56:54    1061.170520
14:56:57     915.388820
Length: 4739, dtype: float64

In [24]:
data['circulation_mv'].to_numpy() * data['turnover'].to_numpy()/data1['volume']/1000

0       26491.410342
1        8076.649475
2        1461.964916
3        3915.418475
4        3867.772403
            ...     
4734        6.266265
4735        2.962924
4736        9.634546
4737       18.611297
4738        6.647974
Name: volume, Length: 4739, dtype: float64

In [25]:
data1['volume'] * data['price'].to_numpy()/data['circulation_mv'].to_numpy()

0       0.000629
1       0.000697
2       0.000710
3       0.000745
4       0.000781
          ...   
4734    0.044352
4735    0.044341
4736    0.044308
4737    0.044242
4738    0.044246
Name: volume, Length: 4739, dtype: float64

In [26]:
np.sum(data[ap_name], axis=1)

time
09:30:03    242.406200
09:30:06    247.258100
09:30:09    236.400200
09:30:12    233.805300
09:30:15    268.731600
               ...    
14:56:45    631.647942
14:56:48    464.238142
14:56:51    446.577942
14:56:54    543.897920
14:56:57    397.774620
Length: 4739, dtype: float64

In [27]:
np.sum(data[bp_name], axis=1)

time
09:30:03    248.010460
09:30:06    185.338860
09:30:09    296.027360
09:30:12    276.910860
09:30:15    152.896648
               ...    
14:56:45    305.635200
14:56:48    310.817900
14:56:51    510.643700
14:56:54    517.272600
14:56:57    517.614200
Length: 4739, dtype: float64

In [50]:
data1.head()

Unnamed: 0,date,code,server_time,local_time,preclose,open,high,low,last,upper_limit,lower_limit,volume,turnover,iopv,ask_price1,ask_volume1,ask_price2,ask_volume2,ask_price3,ask_volume3,ask_price4,ask_volume4,ask_price5,ask_volume5,ask_price6,ask_volume6,ask_price7,ask_volume7,ask_price8,ask_volume8,ask_price9,ask_volume9,ask_price10,ask_volume10,bid_price1,bid_volume1,bid_price2,bid_volume2,bid_price3,bid_volume3,bid_price4,bid_volume4,bid_price5,bid_volume5,bid_price6,bid_volume6,bid_price7,bid_volume7,bid_price8,bid_volume8,bid_price9,bid_volume9,bid_price10,bid_volume10,time
0,20210701,000009.SZSE,2021-07-01 09:30:03,2021-07-01 09:30:03,18.27,18.53,18.64,18.41,18.55,20.1,16.44,1498288,27774114,0,18.64,2100,18.65,22000,18.66,7200,18.67,22300,18.68,57400,18.69,2600,18.7,9300,18.71,2000,18.72,800,18.73,4100,18.55,70712,18.54,22100,18.53,14000,18.52,300,18.51,7800,18.5,6900,18.49,5700,18.48,1000,18.46,100,18.45,5200,09:30:03
1,20210701,000009.SZSE,2021-07-01 09:30:06,2021-07-01 09:30:06,18.27,18.53,18.64,18.41,18.56,20.1,16.44,1661488,30805528,0,18.59,8000,18.6,2300,18.61,1000,18.63,8900,18.64,3200,18.65,22700,18.66,4700,18.67,22300,18.68,56800,18.69,2600,18.56,1300,18.55,31712,18.54,22100,18.53,20000,18.52,1200,18.51,7800,18.5,9000,18.49,5700,18.48,1100,18.46,100,09:30:06
2,20210701,000009.SZSE,2021-07-01 09:30:09,2021-07-01 09:30:09,18.27,18.53,18.64,18.41,18.57,20.1,16.44,1691588,31364189,0,18.58,6100,18.59,7900,18.6,3400,18.61,1000,18.63,100,18.64,2200,18.65,22700,18.66,4700,18.67,22300,18.68,56300,18.57,5600,18.56,17100,18.55,29112,18.54,40900,18.53,22900,18.52,20300,18.51,7800,18.5,9200,18.49,5700,18.48,1100,09:30:09
3,20210701,000009.SZSE,2021-07-01 09:30:12,2021-07-01 09:30:12,18.27,18.53,18.64,18.41,18.58,20.1,16.44,1776188,32935217,0,18.58,2900,18.59,7000,18.6,5500,18.61,1000,18.63,700,18.64,2200,18.65,22700,18.66,4700,18.67,22300,18.68,56300,18.57,6100,18.56,4200,18.55,32212,18.54,40900,18.53,21800,18.52,20400,18.51,7800,18.5,9200,18.49,5700,18.48,1100,09:30:12
4,20210701,000009.SZSE,2021-07-01 09:30:15,2021-07-01 09:30:15,18.27,18.53,18.64,18.41,18.54,20.1,16.44,1863988,34563841,0,18.57,17400,18.59,10300,18.6,5500,18.61,1000,18.63,700,18.64,2200,18.65,22700,18.66,4700,18.67,22300,18.68,57300,18.54,5312,18.53,21800,18.52,20200,18.51,7800,18.5,10300,18.49,5700,18.48,1100,18.45,5200,18.43,5100,18.42,100,09:30:15


In [77]:
def judger(x, threshold):
    upper = x.iloc[-1] * (1 + threshold)
    lower = x.iloc[-1] * (1 - threshold)
    if (x >= upper).sum() > 1:
        return 2
    elif (x <= lower).sum() > 1:
        return 1
    else:
        return 0

data['price'][::-1].rolling(2, closed='both').apply(lambda x: judger(x, 0.001))[::-1]

time
09:30:03    1.0
09:30:06    0.0
09:30:09    0.0
09:30:12    1.0
09:30:15    0.0
           ... 
14:56:45    0.0
14:56:48    0.0
14:56:51    1.0
14:56:54    0.0
14:56:57    NaN
Name: price, Length: 4739, dtype: float64

In [75]:
data[['ref_ind_0', 'ref_ind_1']]

Unnamed: 0_level_0,ref_ind_0,ref_ind_1
time,Unnamed: 1_level_1,Unnamed: 2_level_1
09:30:03,-0.001545,-0.000754
09:30:06,-0.002098,-0.001306
09:30:09,-0.002004,-0.001059
09:30:12,-0.002030,-0.001361
09:30:15,-0.002606,-0.001779
...,...,...
14:56:45,-0.031068,-0.032531
14:56:48,-0.031116,-0.032638
14:56:51,-0.031182,-0.032651
14:56:54,-0.031385,-0.032825


In [54]:
test = data[['price', 'vwp', 'ask_price', 'ask_price2', 'ask_price4', 'ask_price8','ask_dec', 'ask_inc', 'p_2', 'p_5', 'p_18', 'p_diff']]

# test['diff'] = np.log((test['price']).diff(periods=2)
# test['ret_2'] = test['ask_price'].pct_change(2)
# test['ret_5'] = test['ask_price'].pct_change(5)
# test['ret_18'] = test['ask_price'].pct_change(18)

test['rec2'] = np.exp(test['p_2'] + test['p_2'].shift(2))


test.head(20)

Unnamed: 0_level_0,price,vwp,ask_price,ask_price2,ask_price4,ask_price8,ask_dec,ask_inc,p_2,p_5,p_18,p_diff,rec2
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
09:30:03,18.595,18.552873,18.64,18.64,18.64026,18.64513,131.31789,0.0,-0.001076,-0.002958,-0.025813,-0.001882,
09:30:06,18.575,18.574841,18.59,18.59,18.59,18.59,0.0,0.0,0.0,-0.004038,-0.020458,-0.004038,
09:30:09,18.575,18.560166,18.58,18.58,18.58,18.58,0.0,0.0,-0.001077,-0.007268,-0.022611,-0.006191,-0.002152
09:30:12,18.575,18.570071,18.58,18.58,18.58,18.583268,7.329695,0.0,-0.001884,-0.00996,-0.018304,-0.008075,-0.001884
09:30:15,18.555,18.549248,18.57,18.57,18.57,18.57,5.38095,32.2857,-0.002964,-0.009431,-0.019402,-0.006467,-0.004041
09:30:18,18.54,18.538569,18.56,18.56,18.56,18.56,13.927248,0.0,-0.005394,-0.010518,-0.016181,-0.005124,-0.007278
09:30:21,18.5,18.528802,18.51,18.514439,18.51722,18.527028,0.0,4.625,-0.005946,-0.010541,-0.013514,-0.004595,-0.00891
09:30:24,18.44,18.486915,18.45,18.45,18.45,18.45,0.0,58.8236,-0.003254,-0.008406,-0.012202,-0.005152,-0.008648
09:30:27,18.39,18.416368,18.43,18.43,18.43,18.438257,7.5399,4.4136,-0.002447,-0.006797,-0.008428,-0.00435,-0.008393
09:30:30,18.38,18.423214,18.41,18.41,18.41,18.41,0.0,0.0,-0.004081,-0.008433,-0.008161,-0.004353,-0.007334


In [86]:
x = pd.DataFrame({'a': [1,2,3,4,5,6,7], 'b': [3,4,5,6,7,8,9]})
x

Unnamed: 0,a,b
0,1,3
1,2,4
2,3,5
3,4,6
4,5,7
5,6,8
6,7,9


In [96]:
x['c'] = x['a'].rolling(3, closed='both').mean()
x['d'] = x['a'].rolling(3).mean()
x['e'] = x['a'].rolling(3, closed='left').mean()
x['f'] = x['a'].rolling(3, closed='left').apply(lambda x: x.iloc[0])
x['g'] = x['a'].rolling(3, closed='left').apply(lambda x: x.iloc[-1])
x

Unnamed: 0,a,b,c,d,e,f,g
0,1,3,,,,,
1,2,4,,,,,
2,3,5,2.0,2.0,,,
3,4,6,2.5,3.0,2.0,1.0,3.0
4,5,7,3.5,4.0,3.0,2.0,4.0
5,6,8,4.5,5.0,4.0,3.0,5.0
6,7,9,5.5,6.0,5.0,4.0,6.0


In [4]:
import os

test_dir = os.listdir("/home/yby/YBY/Data")
test_dir

['20210714_600566.pkl',
 '20210806_600875.pkl',
 '20211102_600597.pkl',
 '20210824_000062.pkl',
 '20211027_300316.pkl',
 '20211110_600219.pkl',
 '20210824_601016.pkl',
 '20210707_002941.pkl',
 '20211103_600266.pkl',
 '20210810_002212.pkl',
 '20210825_002221.pkl',
 '20211126_601179.pkl',
 '20210909_000690.pkl',
 '20210729_002273.pkl',
 '20211012_600787.pkl',
 '20211012_000750.pkl',
 '20211021_002302.pkl',
 '20211122_000543.pkl',
 '20210827_603077.pkl',
 '20211116_000600.pkl',
 '20210825_603888.pkl',
 '20211014_300324.pkl',
 '20211110_002138.pkl',
 '20210915_603377.pkl',
 '20210702_300207.pkl',
 '20210910_002387.pkl',
 '20210811_002081.pkl',
 '20210910_002572.pkl',
 '20210901_600126.pkl',
 '20211102_603228.pkl',
 '20211018_001914.pkl',
 '20210915_600776.pkl',
 '20211112_600808.pkl',
 '20211109_600260.pkl',
 '20210701_600258.pkl',
 '20211123_600460.pkl',
 '20211011_600859.pkl',
 '20211104_002507.pkl',
 '20211126_002818.pkl',
 '20211029_002138.pkl',
 '20210705_002064.pkl',
 '20211118_60033