### 1. SH snapshot lv2

In [5]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Build a MongoDB URI from the given credentials and return a ``DBObj``.

    Args:
        host: host (optionally ``host:port``) of the MongoDB server.
        db_name: database to open; also used as ``authSource`` unless
            ``user`` is ``'admin'`` or ``'root'``, in which case ``'admin'``
            is used.
        user: MongoDB user name.
        passwd: MongoDB password.

    Returns:
        A ``DBObj`` connected to ``db_name``.
    """
    # Local import so this function does not depend on edits elsewhere.
    from urllib.parse import quote_plus

    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    # Reserved characters ('@', ':', '/', '%') in the user name or password
    # would otherwise corrupt the URI, so both are percent-escaped.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(user), quote_plus(passwd), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store and load pandas DataFrames in MongoDB as compressed pickles.

    Each document holds one chunk (at most ``chunk_size`` rows) of the rows
    belonging to one ``(date, symbol)`` pair, under the keys ``ver``,
    ``data``, ``date``, ``symbol`` and ``start``.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Connect to ``uri`` and select database ``db_name``.

        Args:
            uri: MongoDB connection URI.
            symbol_column: DataFrame column that holds the symbol key.
            db_name: name of the database to use.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000  # maximum number of rows per stored document
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split a ``mongodb://user:password@example.com`` URI into its parts.

        The scheme and surrounding ``/`` are stripped, then the remainder is
        split on ``:`` and ``@``.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Replace the stored rows for every (date, symbol) present in ``df``.

        Existing documents for each (date, symbol) pair are deleted before
        the new chunks are inserted. An empty ``df`` is a no-op.

        Raises:
            Exception: if ``df`` has no date column.
        """
        if len(df) == 0:
            return

        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        first_date = str(df[self.date_column].iloc[0])
        multi_date = len(df[self.date_column].unique()) > 1

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            # Group by the bare column name: grouping by a one-element list
            # yields 1-tuples as keys on recent pandas. The key is cast to int
            # so it matches the multi-date branch and what build_query()
            # searches for.
            for symbol, sub_df in df.groupby(self.symbol_column):
                symbol = int(symbol)
                collection.delete_many({'date': first_date, 'symbol': symbol})
                self.write_single(collection, first_date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` in chunks of ``chunk_size`` rows.

        ``start`` in each document is the row offset of the chunk inside
        ``df``; ``read`` sorts on it to restore the row order.
        """
        version = 1
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df.iloc[start:end]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter for a date range and/or symbol selection.

        Args:
            start_date, end_date: inclusive bounds; each may be a ``YYYYMMDD``
                string, an integer (numpy integers included), or a
                ``datetime.date`` / ``datetime.datetime`` (subclasses
                included).
            symbol: a single symbol or a list/tuple of symbols; every value
                is converted with ``int``. A falsy value means no symbol
                filter.

        Returns:
            A dict usable as a pymongo filter; empty when nothing was given.

        Raises:
            Exception: on a string date that is not 8 characters long or on
                an unsupported date type.
        """
        import numbers

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            if isinstance(x, datetime.date):  # also covers datetime.datetime
                return x.strftime("%Y%m%d")
            if isinstance(x, numbers.Integral):  # int and numpy integers
                return parse_date(str(x))
            raise Exception("invalid `date` type: " + str(type(x)))

        query = {}

        if start_date is not None or end_date is not None:
            date_range = {}
            if start_date is not None:
                date_range['$gte'] = parse_date(start_date)
            if end_date is not None:
                date_range['$lte'] = parse_date(end_date)
            query['date'] = date_range

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the documents matching the date/symbol selection.

        Refuses to run (prints a message and returns ``None``) when no
        filter is given.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load the rows matching the date/symbol selection as one DataFrame.

        Chunks are ordered by (symbol, date, start) before concatenation.

        Returns:
            The concatenated DataFrame, or ``None`` when no filter is given
            (a message is printed) or when nothing matches.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        """Return the names of the collections in the database."""
        # collection_names() was removed in PyMongo 4. The check is made on
        # the class because Database.__getattr__ returns a Collection for any
        # unknown attribute name.
        if hasattr(type(self.db), 'list_collection_names'):
            return self.db.list_collection_names()
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values stored for the selection."""
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` and compress it (version 1: gzip, version 2: lzma).

        Raises:
            Exception: on an unknown ``version``.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of ``ser``: decompress ``s`` and unpickle it.

        Raises:
            Exception: on an unknown ``version``.
        """
        # NOTE(review): pickle.loads executes arbitrary code from the payload;
        # only use this against a database whose writers are trusted.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    """Let pandas < 0.24 unpickle frames that reference ``pandas.core.internals.managers``.

    On pandas older than 0.24, if that module is not already in
    ``sys.modules``, a stub module exposing ``BlockManager`` is registered
    under that name. Does nothing otherwise.
    """
    def leading_int(text):
        # Integer value of the leading digits of ``text`` (0 if there are none).
        digits = ''
        for ch in text:
            if not ch.isdigit():
                break
            digits += ch
        return int(digits) if digits else 0

    # Compare numerically: as strings, '0.9' < '0.24' is False.
    major_minor = tuple(leading_int(part) for part in pd.__version__.split('.')[:2])
    if major_minor < (0, 24):
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()


import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Full option key: the bare pattern "max_columns" can match more than one
# option on newer pandas releases.
pd.set_option("display.max_columns", 200)


# Order-book columns in the order used throughout this cell:
# bid1p..bid5p, bid1q..bid5q, ask1p..ask5p, ask1q..ask5q.
bookCols = [side + str(level) + kind
            for side in ('bid', 'ask')
            for kind in ('p', 'q')
            for level in range(1, 6)]
# Columns on which database rows and log rows are matched.
cols = ['skey', 'time', 'cum_volume', 'cum_amount', 'close'] + bookCols + ['open']

startDate = 20200203
endDate = 20200203
database_name = 'com_md_eq_cn'
# NOTE(review): credentials are hard-coded in the notebook; consider moving
# them to environment variables / a config file and rotating the password.
user = "zhenyuy"
password = "bnONBrzSMGoE"

db = DB("192.168.10.178", database_name, user, password)
snapshot = db.read('md_snapshot_l2', start_date=startDate, end_date=endDate)
# `num` is computed once on the full frame, before it is split, so no column
# is assigned on a filtered slice.
snapshot['num'] = snapshot['skey'] * 10000 + snapshot['ordering']
SZ = snapshot[snapshot['skey'] > 2000000]
SH = snapshot[snapshot['skey'] < 2000000]
SH = SH[['date'] + cols + ['num']]
SZ = SZ[['date'] + cols + ['num']]

startDate = '20200203'
endDate = '20200203'

readPath = '/mnt/e/result/logs_***_zs_92_01_day_data'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i).split('_')[1] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]


# NOTE: SH1 is overwritten on every iteration, so only the last matching
# directory is kept (the date range above selects a single day).
for dayPath in dataPathLs:
    path1 = np.array(glob.glob(dayPath + '/mdLog_SH_***'))
    SH1 = pd.read_csv(path1[0])
    # .copy() so the column assignments below act on an independent frame.
    SH1 = SH1[SH1['source'] == 4].copy()

    SH1['skey'] = SH1['StockID'] + 1000000
    SH1 = SH1.rename(columns={"openPrice":"open"})
    # Once volume has traded, use the per-skey maximum of `open`.
    SH1["open"] = np.where(SH1["cum_volume"] > 0, SH1.groupby("skey")["open"].transform("max"), SH1["open"])
    # Strip ':' and '.' from the time string, then scale by 1000.
    SH1["time"] = SH1["time"].apply(lambda x: int((x.replace(':', "")).replace(".", "")) * 1000)

SH1 = SH1[['clockAtArrival', 'sequenceNo'] + cols].copy()
for col in ['cum_amount', "close", 'open']:
    SH1[col] = SH1[col].round(2)
SH1 = SH1[SH1['skey'].isin(SH['skey'].unique())]
# NOTE: `re` shadows the name of the standard `re` module in this notebook.
re = pd.merge(SH, SH1, on=cols, how='outer')

inLog = ~re['sequenceNo'].isnull()   # row has a log-side match
inDb = ~re['date'].isnull()          # row has a database-side match

# p21: rows found only in the log; p22: rows found only in the database.
p21 = re[~inDb][['clockAtArrival', 'sequenceNo'] + cols]
p22 = re[~inLog][['skey', 'date', 'time'] + cols[2:] + ['num']]

# Rows present on both sides, split by whether their `num` is repeated.
matched = re[inLog & inDb]
p11 = matched[matched['num'].duplicated(keep=False)]
p12 = matched.drop_duplicates(['num'], keep=False)
p11 = p11.sort_values(by=['num', 'sequenceNo'])
p11["order1"] = p11.groupby(["num"]).cumcount()
p11["order2"] = p11.groupby(["sequenceNo"]).cumcount()
# p11 = p11[p11['order1'] == p11['order2']]
# p11.drop(['order1', 'order2'],axis=1,inplace=True)
# p1 = pd.concat([p11, p12])
# p2 = pd.merge(p22, p21[['skey', 'time', 'clockAtArrival', 'sequenceNo']], on=['skey', 'time'], how='left')
# re1 = pd.concat([p1, p2])
# re1 = re1.sort_values(by='num')
# re1['seq1'] = re1.groupby('skey')['sequenceNo'].bfill().ffill()
# sl = list(set(SH['skey'].unique()) - set(SH1['skey'].unique()))
# re1.loc[re1['skey'].isin(sl), 'seq1'] = np.nan
# re1['count1'] = re1.groupby(['seq1']).cumcount()
# re1['count2'] = re1.groupby(['seq1'])['count1'].transform('nunique')
# re1['max_seq'] = re1.groupby('skey')['sequenceNo'].transform('max')
# re1['count'] = np.where((re1['seq1'] != re1['max_seq']) | (~re1['sequenceNo'].isnull()), re1['count1'] + 1 - re1['count2'], re1['count1'] - re1['count2'])
# re1.drop(["max_seq"],axis=1,inplace=True)
# re1.drop(["count1"],axis=1,inplace=True)
# re1.drop(["count2"],axis=1,inplace=True)
# re1['dup'] = np.where(~re1["sequenceNo"].isnull(), re1.groupby(['sequenceNo']).cumcount(), 0)
# re1['dup1'] = np.where(~re1["sequenceNo"].isnull(), re1.groupby(['sequenceNo'])['num'].transform('nunique'), 0)
# re1['nan'] = np.where((re1['sequenceNo'].isnull()) | (re1['dup'] != 0), 1, 0)
# re1.loc[(re1['dup1'] > 1) & (re1['count'] < 0), 'sequenceNo'] = np.nan
# assert((len(set(sl) - set(re1[re1['seq1'].isnull()]['skey'].unique())) == 0) & 
#            (len(set(re1[re1['seq1'].isnull()]['skey'].unique()) - set(sl)) == 0))
# assert(re1.shape[0] == SH.shape[0])

# display('%.2f%%' % (re1[re1['sequenceNo'].isnull()].shape[0]/re1.shape[0] * 100))

In [19]:
# Number of distinct `num` values among the p11 rows whose rank within `num`
# equals their rank within `sequenceNo`.
# NOTE(review): the original expression subscripted `.shape[0]` (an int) with
# ['num'], which raises TypeError. The recorded output is an integer, so a
# count is assumed to be the intent - confirm.
len(set(p11[p11['order1'] == p11['order2']]['num']))

214

In [10]:
# Number of p11 rows per cum_volume value, smallest groups first.
(
    p11.groupby('cum_volume')['order1']
    .size()
    .reset_index()
    .sort_values(by='order1')
)

Unnamed: 0,cum_volume,order1
0,178800,4
76,6221400,4
74,5986700,4
73,5930827,4
72,5711500,4
...,...,...
94,12414163,6
83,7609000,6
55,3596700,6
47,2137918,6


In [4]:
# Full option key: the bare pattern "max_rows" can match more than one option
# on newer pandas releases.
pd.set_option("display.max_rows", 200)
# NOTE(review): `re1` is only assigned in commented-out code in this file, so
# this cell presumably relies on state from an earlier run - confirm.
re1[re1['skey'] == 1600000].tail(200)

Unnamed: 0,date,skey,time,cum_volume,cum_amount,close,bid1p,bid2p,bid3p,bid4p,bid5p,bid1q,bid2q,bid3q,bid4q,bid5q,ask1p,ask2p,ask3p,ask4p,ask5p,ask1q,ask2q,ask3q,ask4q,ask5q,open,num,clockAtArrival,sequenceNo,seq1,count,dup,dup1,nan
5000,20200221.0,1600000,145303000000,37651642,425034537.3,11.31,11.31,11.3,11.29,11.28,11.27,131150,241648,177000,103600,90400,11.32,11.33,11.34,11.35,11.36,77201,183450,171400,298860,270430,11.23,16000010000.0,1582268000000000.0,152735133.0,152735133.0,0,0,1.0,0
5001,20200221.0,1600000,145306000000,37652142,425040192.3,11.31,11.31,11.3,11.29,11.28,11.27,130650,241648,177000,103600,90400,11.32,11.33,11.34,11.35,11.36,77701,183450,171400,294860,270430,11.23,16000010000.0,1582268000000000.0,152768280.0,152768280.0,0,0,1.0,0
5002,20200221.0,1600000,145309000000,37655442,425077543.3,11.31,11.31,11.3,11.29,11.28,11.27,130350,241648,177000,103600,90400,11.32,11.33,11.34,11.35,11.36,84901,183450,171400,293860,270430,11.23,16000010000.0,1582268000000000.0,152804284.0,152804284.0,0,0,1.0,0
5003,20200221.0,1600000,145312000000,37656342,425087722.3,11.31,11.31,11.3,11.29,11.28,11.27,129750,241648,177000,103600,90700,11.32,11.33,11.34,11.35,11.36,99701,183450,172000,293860,270830,11.23,16000010000.0,1582268000000000.0,152841513.0,152841513.0,0,0,1.0,0
5004,20200221.0,1600000,145315000000,37658242,425109219.3,11.31,11.31,11.3,11.29,11.28,11.27,130950,241648,177000,103600,90700,11.32,11.33,11.34,11.35,11.36,98701,178450,172000,293860,269730,11.23,16000010000.0,1582268000000000.0,152892110.0,152892110.0,0,0,1.0,0
5005,20200221.0,1600000,145318000000,37663042,425163519.3,11.32,11.31,11.3,11.29,11.28,11.27,127450,247148,177000,100300,90700,11.32,11.33,11.34,11.35,11.36,104401,178450,172000,293860,269730,11.23,16000010000.0,1582268000000000.0,152924737.0,152924737.0,0,0,1.0,0
5006,20200221.0,1600000,145321000000,37664042,425174830.3,11.32,11.31,11.3,11.29,11.28,11.27,126550,247148,177000,100300,90700,11.32,11.33,11.34,11.35,11.36,109701,179750,172000,293860,279730,11.23,16000010000.0,1582268000000000.0,152958355.0,152958355.0,0,0,1.0,0
5007,20200221.0,1600000,145324000000,37664342,425178224.3,11.31,11.31,11.3,11.29,11.28,11.27,126950,247148,177000,100300,90700,11.32,11.33,11.34,11.35,11.36,109901,181350,177900,294860,279730,11.23,16000010000.0,1582268000000000.0,152993227.0,152993227.0,0,0,1.0,0
5008,20200221.0,1600000,145327000000,37665942,425196330.3,11.32,11.31,11.3,11.29,11.28,11.27,126550,248148,176900,100300,90700,11.32,11.33,11.34,11.35,11.36,109701,187750,177900,310260,279730,11.23,16000010000.0,1582268000000000.0,153032911.0,153032911.0,0,0,1.0,0
5009,20200221.0,1600000,145330000000,37668442,425224624.3,11.31,11.31,11.3,11.29,11.28,11.27,126250,248148,176900,100300,90700,11.32,11.33,11.34,11.35,11.36,112101,188850,177900,310260,279730,11.23,16000010000.0,1582268000000000.0,153074047.0,153074047.0,0,0,1.0,0


### 2. SZ snapshot lv2

In [2]:
startDate = '20200221'
endDate = '20200221'

readPath = '/mnt/e/result/logs_***_zs_92_01_day_data'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i).split('_')[1] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]


# NOTE: SZ1 is overwritten on every iteration, so only the last matching
# directory is kept (the date range above selects a single day).
for dayPath in dataPathLs:
    path1 = np.array(glob.glob(dayPath + '/mdLog_SZ_***'))
    SZ1 = pd.read_csv(path1[0])
    # .copy() so the column assignments below act on an independent frame.
    SZ1 = SZ1[SZ1['source'] == 4].copy()

    SZ1['skey'] = SZ1['StockID'] + 2000000
    SZ1 = SZ1.rename(columns={"openPrice":"open"})
    # Once volume has traded, use the per-skey maximum of `open`.
    SZ1["open"] = np.where(SZ1["cum_volume"] > 0, SZ1.groupby("skey")["open"].transform("max"), SZ1["open"])
    # Strip ':' and '.' from the time string, then scale by 1000.
    SZ1["time"] = SZ1["time"].apply(lambda x: int((x.replace(':', "")).replace(".", "")) * 1000)

# Columns on which database rows (SZ) and log rows (SZ1) are matched.
cols = ['skey', 'time', 'cum_volume', 'cum_amount', "close", "bid1p", "bid2p", "bid3p", "bid4p", "bid5p", "bid1q", "bid2q",
           "bid3q", "bid4q", "bid5q", "ask1p", "ask2p", "ask3p", "ask4p", "ask5p", "ask1q", "ask2q", "ask3q",
           "ask4q", "ask5q", "open"]
SZ1 = SZ1[['clockAtArrival', 'sequenceNo'] + cols].copy()
SZ1['cum_amount'] = SZ1['cum_amount'].round(2)
SZ1 = SZ1[SZ1['skey'].isin(SZ['skey'].unique())]
# NOTE: `re` shadows the name of the standard `re` module in this notebook.
re = pd.merge(SZ, SZ1, on=cols, how='outer')

dbRowCount = re[~re['date'].isnull()].shape[0]

display(re.shape[0])
display(re[~re['sequenceNo'].isnull()].shape[0])
display(dbRowCount)
display(SZ.shape[0])
display(SZ1.shape[0])

# Every merged row carrying a database `date` means the log added no rows.
# Explicit test instead of assert + bare except: asserts vanish under -O and
# a bare except hides unrelated errors.
if re.shape[0] == dbRowCount:
    print('SZ lv2 is complete')
else:
    display('%.2f%%' % (dbRowCount/re.shape[0] * 100))
    re = pd.merge(SZ, SZ1, on=cols, how='left')
    print('SZ lv2 is incomplete')

if re.duplicated('num', keep=False).any():
    # A database row (`num`) matched several log rows: keep the pairing where
    # the rank within `num` equals the rank within `sequenceNo`.
    p1 = re[re['num'].duplicated(keep=False)].copy()
    p2 = re.drop_duplicates(['num'], keep=False)
    p1["order1"] = p1.groupby(["num"]).cumcount()
    p1["order2"] = p1.groupby(["sequenceNo"]).cumcount()
    p1 = p1[p1['order1'] == p1['order2']].drop(['order1', 'order2'], axis=1)
    re = pd.concat([p1, p2])

re2 = re.sort_values(by='num')
# seq1: sequenceNo back-filled within each skey, then forward-filled.
re2['seq1'] = re2.groupby('skey')['sequenceNo'].bfill().ffill()
# skeys that exist in the database but not in the log get no seq1.
sl = list(set(SZ['skey'].unique()) - set(SZ1['skey'].unique()))
re2.loc[re2['skey'].isin(sl), 'seq1'] = np.nan
# count1: position inside the seq1 group; count2: number of distinct positions
# in that group.
re2['count1'] = re2.groupby(['seq1']).cumcount()
re2['count2'] = re2.groupby(['seq1'])['count1'].transform('nunique')
re2['max_seq'] = re2.groupby('skey')['sequenceNo'].transform('max')
# Unmatched rows attached to the skey's largest sequenceNo use
# count1 - count2; all other rows use count1 + 1 - count2.
re2['count'] = np.where((re2['seq1'] != re2['max_seq'])|(~re2["sequenceNo"].isnull()), re2['count1'] + 1 - re2['count2'], re2['count1'] - re2['count2'])
re2.drop(["max_seq"],axis=1,inplace=True)
re2.drop(["count1"],axis=1,inplace=True)
re2.drop(["count2"],axis=1,inplace=True)
# dup: rank of a matched row within its sequenceNo; dup1: number of distinct
# `num` values sharing that sequenceNo; nan: 1 for unmatched or repeated rows.
re2['dup'] = np.where(~re2["sequenceNo"].isnull(), re2.groupby(['sequenceNo']).cumcount(), 0)
re2['dup1'] = np.where(~re2["sequenceNo"].isnull(), re2.groupby(['sequenceNo'])['num'].transform('nunique'), 0)
re2['nan'] = np.where((re2['sequenceNo'].isnull()) | (re2['dup'] != 0), 1, 0)
re2.loc[(re2['dup1'] > 1) & (re2['count'] < 0), 'sequenceNo'] = np.nan
# Sanity checks: exactly the skeys in `sl` lack seq1, and no row was lost.
assert((len(set(sl) - set(re2[re2['seq1'].isnull()]['skey'].unique())) == 0) & 
       (len(set(re2[re2['seq1'].isnull()]['skey'].unique()) - set(sl)) == 0))
assert(re2.shape[0] == SZ.shape[0])

# Share of rows left without a sequenceNo.
display('%.2f%%' % (re2[re2['sequenceNo'].isnull()].shape[0]/re2.shape[0] * 100))
    

9229790

9046203

9229790

9229790

9046203

SZ lv2 is complete


'1.99%'

### 3. SH & SZ trade

In [3]:
startDate = 20200221
endDate = 20200221
database_name = 'com_md_eq_cn'
# NOTE(review): credentials are hard-coded in the notebook; consider moving
# them to environment variables / a config file and rotating the password.
user = "zhenyuy"
password = "bnONBrzSMGoE"

new_trade_data = []

db = DB("192.168.10.178", database_name, user, password)
trade = db.read('md_trade', start_date=startDate, end_date=endDate)[['skey', 'date', 'ApplSeqNum']]

startDate = '20200221'
endDate = '20200221'

readPath = '/mnt/e/result/logs_***_zs_92_01_day_data'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i).split('_')[1] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
# NOTE: trade1 is overwritten on every iteration, so only the last matching
# directory is kept (the date range above selects a single day).
for dayPath in dataPathLs:
    path1 = np.array(glob.glob(dayPath + '/mdTradeLog***'))
    trade1 = pd.read_csv(path1[0])
# exchId == 2 maps to the 2000000 skey range, anything else to 1000000.
trade1['skey'] = np.where(trade1['exchId'] == 2, trade1['SecurityID'] + 2000000, trade1['SecurityID'] + 1000000)
trade1 = trade1[trade1['skey'].isin(trade['skey'].unique())]
# NOTE: `re` shadows the name of the standard `re` module in this notebook.
re = pd.merge(trade, trade1[['skey', 'ApplSeqNum', 'sequenceNo', 'clockAtArrival']], on=['skey', 'ApplSeqNum'],
             how='outer')
# An outer merge no larger than `trade` means the log has no trade missing
# from the database. Explicit test instead of assert + bare except: asserts
# vanish under -O and a bare except hides unrelated errors.
if re.shape[0] == trade.shape[0]:
    display('trade data is complete')
    k = 0
else:
    display('%.2f%%' % (trade.shape[0]/re.shape[0] * 100))
    k = 1
    display('trade data incomplete')
    # k1: log rows whose (skey, ApplSeqNum) is absent from the database.
    k1 = pd.merge(trade1, re[re['date'].isnull()][['skey', 'ApplSeqNum']], on=['skey', 'ApplSeqNum'], how='right')
    display(k1.shape[0])
    display(k1['ExecType'].unique())
    display(k1['TransactTime'].unique())
    k1['date'] = trade['date'].iloc[0]
    new_trade_data += [k1[['clockAtArrival', 'sequenceNo', 'TransactTime', 'ApplSeqNum', 'date', 'skey', 'ExecType', 'TradeBSFlag', 
   'TradePrice', 'TradeQty', 'BidApplSeqNum', 'OfferApplSeqNum']]]
    # Continue with database rows only.
    re = pd.merge(trade, trade1[['skey', 'ApplSeqNum', 'sequenceNo', 'clockAtArrival']], on=['skey', 'ApplSeqNum'],
             how='left')
    assert(re.shape[0] == trade.shape[0])

re3 = re.sort_values(by=['skey', 'ApplSeqNum'])
# seq1: sequenceNo back-filled within each skey, then forward-filled.
re3['seq1'] = re3.groupby('skey')['sequenceNo'].bfill().ffill()
# skeys that exist in the database but not in the log get no seq1.
sl = list(set(trade['skey'].unique()) - set(trade1['skey'].unique()))
re3.loc[re3['skey'].isin(sl), 'seq1'] = np.nan
# count1: position inside the seq1 group; count2: number of distinct positions
# in that group.
re3['count1'] = re3.groupby(['seq1']).cumcount()
re3['count2'] = re3.groupby(['seq1'])['count1'].transform('nunique')
re3['max_seq'] = re3.groupby('skey')['sequenceNo'].transform('max')
# Unmatched rows attached to the skey's largest sequenceNo use
# count1 - count2; all other rows use count1 + 1 - count2.
re3['count'] = np.where((re3['seq1'] != re3['max_seq'])|(~re3['sequenceNo'].isnull()), re3['count1'] + 1 - re3['count2'], re3['count1'] - re3['count2'])
re3.drop(["max_seq"],axis=1,inplace=True)
re3.drop(["count1"],axis=1,inplace=True)
re3.drop(["count2"],axis=1,inplace=True)
# dup: rank of a matched row within its sequenceNo; dup1: number of distinct
# ApplSeqNum values sharing that sequenceNo; nan: 1 for unmatched or repeated
# rows.
re3['dup'] = np.where(~re3["sequenceNo"].isnull(), re3.groupby(['sequenceNo']).cumcount(), 0)
re3['dup1'] = np.where(~re3["sequenceNo"].isnull(), re3.groupby(['sequenceNo'])['ApplSeqNum'].transform('nunique'), 0)
re3['nan'] = np.where((re3['sequenceNo'].isnull()) | (re3['dup'] != 0), 1, 0)
re3.loc[(re3['dup1']>1) & (re3['count'] < 0), 'sequenceNo'] = np.nan
# Sanity checks: exactly the skeys in `sl` lack seq1, and no row was lost.
assert((len(set(sl) - set(re3[re3['seq1'].isnull()]['skey'].unique())) == 0) & 
       (len(set(re3[re3['seq1'].isnull()]['skey'].unique()) - set(sl)) == 0))
assert(re3.shape[0] == trade.shape[0])
if k == 1:
    # Append the log-only trades; they carry their own sequenceNo.
    k1['seq1'] = k1['sequenceNo']
    k1['count'] = 0
    k1['nan'] = 0
    k1['dup1'] = 1
    re3 = pd.concat([re3, k1[['clockAtArrival', 'date', 'sequenceNo', 'skey', 'ApplSeqNum', 'seq1', 'count', 'nan', 'dup1']]])

# Share of rows left without a sequenceNo.
display('%.2f%%' % (re3[re3['sequenceNo'].isnull()].shape[0]/re3.shape[0] * 100))

  interactivity=interactivity, compiler=compiler, result=result)


'100.00%'

'trade data incomplete'

8

array(['4'], dtype=object)

array([133603130, 133603140])

'0.41%'

### 4. SZ order

In [4]:
startDate = 20200221
endDate = 20200221
database_name = 'com_md_eq_cn'
# NOTE(review): credentials are hard-coded in the notebook; consider moving
# them to environment variables / a config file and rotating the password.
user = "zhenyuy"
password = "bnONBrzSMGoE"

db = DB("192.168.10.178", database_name, user, password)
order = db.read('md_order', start_date=startDate, end_date=endDate)[['skey', 'date', 'ApplSeqNum']]

startDate = '20200221'
endDate = '20200221'

new_order_data = []

readPath = '/mnt/e/result/logs_***_zs_92_01_day_data'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i).split('_')[1] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
# NOTE: order1 is overwritten on every iteration, so only the last matching
# directory is kept (the date range above selects a single day).
for dayPath in dataPathLs:
    path1 = np.array(glob.glob(dayPath + '/mdOrderLog***'))
    order1 = pd.read_csv(path1[0])
order1['skey'] = order1['SecurityID'] + 2000000
order1 = order1[order1['skey'].isin(order['skey'].unique())]
# NOTE: `re` shadows the name of the standard `re` module in this notebook.
re = pd.merge(order, order1[['skey', 'ApplSeqNum', 'sequenceNo', 'clockAtArrival']], on=['skey', 'ApplSeqNum'],
             how='outer')
# An outer merge no larger than `order` means the log has no order missing
# from the database. Explicit test instead of assert + bare except: asserts
# vanish under -O and a bare except hides unrelated errors.
if re.shape[0] == order.shape[0]:
    display('order data is complete')
    k = 0
else:
    display('%.2f%%' % (order.shape[0]/re.shape[0] * 100))
    k = 1
    display('order data incomplete')
    # k2: log rows whose (skey, ApplSeqNum) is absent from the database.
    k2 = pd.merge(order1, re[re['date'].isnull()][['skey', 'ApplSeqNum']], on=['skey', 'ApplSeqNum'], how='right')
    display(k2.shape[0])
    display(k2['SecurityID'].unique())
    display(k2['TransactTime'].unique())
    k2['date'] = order['date'].iloc[0]
    new_order_data += [k2[['clockAtArrival', 'sequenceNo', 'TransactTime', 'ApplSeqNum', 'date', 'skey', 'Side', 
   'OrderType', 'Price', 'OrderQty']]]
    # Continue with database rows only.
    re = pd.merge(order, order1[['skey', 'ApplSeqNum', 'sequenceNo', 'clockAtArrival']], on=['skey', 'ApplSeqNum'],
             how='left')
    assert(re.shape[0] == order.shape[0])

re4 = re.sort_values(by=['skey', 'ApplSeqNum'])
# seq1: sequenceNo back-filled within each skey, then forward-filled.
re4['seq1'] = re4.groupby('skey')['sequenceNo'].bfill().ffill()
# skeys that exist in the database but not in the log get no seq1.
sl = list(set(order['skey'].unique()) - set(order1['skey'].unique()))
re4.loc[re4['skey'].isin(sl), 'seq1'] = np.nan
# count1: position inside the seq1 group; count2: number of distinct positions
# in that group.
re4['count1'] = re4.groupby(['seq1']).cumcount()
re4['count2'] = re4.groupby(['seq1'])['count1'].transform('nunique')
re4['max_seq'] = re4.groupby('skey')['sequenceNo'].transform('max')
# Unmatched rows attached to the skey's largest sequenceNo use
# count1 - count2; all other rows use count1 + 1 - count2.
re4['count'] = np.where((re4['seq1'] != re4['max_seq'])|(~re4['sequenceNo'].isnull()), re4['count1'] + 1 - re4['count2'], re4['count1'] - re4['count2'])
re4.drop(["max_seq"],axis=1,inplace=True)
re4.drop(["count1"],axis=1,inplace=True)
re4.drop(["count2"],axis=1,inplace=True)
# dup: rank of a matched row within its sequenceNo; dup1: number of distinct
# ApplSeqNum values sharing that sequenceNo; nan: 1 for unmatched or repeated
# rows.
re4['dup'] = np.where(~re4["sequenceNo"].isnull(), re4.groupby(['sequenceNo']).cumcount(), 0)
re4['dup1'] = np.where(~re4["sequenceNo"].isnull(), re4.groupby(['sequenceNo'])['ApplSeqNum'].transform('nunique'), 0)
re4['nan'] = np.where((re4['sequenceNo'].isnull()) | (re4['dup'] != 0), 1, 0)
re4.loc[(re4['dup1'] > 1) & (re4['count'] < 0), 'sequenceNo'] = np.nan
# Sanity checks: exactly the skeys in `sl` lack seq1, and no row was lost.
assert((len(set(sl) - set(re4[re4['seq1'].isnull()]['skey'].unique())) == 0) & 
       (len(set(re4[re4['seq1'].isnull()]['skey'].unique()) - set(sl)) == 0))
assert(re4.shape[0] == order.shape[0])
if k == 1:
    # Append the log-only orders. seq1 holds sequenceNo values everywhere else
    # in re4, so it is taken from sequenceNo here (it was ApplSeqNum before).
    k2['seq1'] = k2['sequenceNo']
    k2['count'] = 0
    k2['nan'] = 0
    k2['dup1'] = 1
    re4 = pd.concat([re4, k2[['clockAtArrival', 'date', 'sequenceNo', 'skey', 'ApplSeqNum', 'seq1', 'count', 'nan', 'dup1']]])


# Share of rows left without a sequenceNo.
display('%.2f%%' % (re4[re4['sequenceNo'].isnull()].shape[0]/re4.shape[0] * 100))

  interactivity=interactivity, compiler=compiler, result=result)


'order data is complete'

'0.12%'

### 5. SH index

In [35]:
startDate = 20200221
endDate = 20200221
database_name = 'com_md_eq_cn'
# NOTE(review): credentials are hard-coded in the notebook; consider moving
# them to environment variables / a config file and rotating the password.
user = "zhenyuy"
password = "bnONBrzSMGoE"

db = DB("192.168.10.178", database_name, user, password)
index = db.read('md_index', start_date=startDate, end_date=endDate)

startDate = '20200221'
endDate = '20200221'

readPath = '/mnt/e/result/logs_***_zs_92_01_day_data'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i).split('_')[1] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]


# NOTE: index1 is overwritten on every iteration, so only the last matching
# directory is kept (the date range above selects a single day).
for dayPath in dataPathLs:
    path1 = np.array(glob.glob(dayPath + '/mdLog_SH_***'))
    index1 = pd.read_csv(path1[0])
    # .copy() so the column assignments below act on an independent frame.
    index1 = index1[index1['StockID'].isin([16, 300, 852, 905])].copy()

    index1['skey'] = index1['StockID'] + 1000000
    index1 = index1.rename(columns={"openPrice":"open"})
    # Once volume has traded, use the per-skey maximum of `open`.
    index1["open"] = np.where(index1["cum_volume"] > 0, index1.groupby("skey")["open"].transform("max"), index1["open"])
    # Zero out `close` while no volume has traded (same rule applied to
    # `index` below).
    index1['close'] = np.where(index1['cum_volume'] == 0, 0, index1['close'])
    # Strip ':' and '.' from the time string, then scale by 1000.
    index1["time"] = index1["time"].apply(lambda x: int((x.replace(':', "")).replace(".", "")) * 1000)
index['close'] = np.where(index['cum_volume'] == 0, 0, index['close'])
index['num'] = index['skey'] * 10000 + index['ordering']
index = index[['skey', 'date', 'cum_volume', 'cum_amount', "close", "open", 'num']]
index1 = index1[['clockAtArrival', 'sequenceNo', 'skey', 'cum_volume', 'cum_amount', "close", "open", "time"]].copy()
index1['cum_amount'] = index1['cum_amount'].round(1)
# Columns on which database rows (index) and log rows (index1) are matched.
cols = ['skey', 'cum_volume', 'cum_amount', "close", "open"]
index1 = index1[index1['skey'].isin(index['skey'].unique())]
# NOTE: `re` shadows the name of the standard `re` module in this notebook.
re = pd.merge(index, index1, on=cols, how='outer')

dbRowCount = re[~re['date'].isnull()].shape[0]

display(re.shape[0])
display(re[~re['sequenceNo'].isnull()].shape[0])
display(dbRowCount)
display(index.shape[0])
display(index1.shape[0])

# Every merged row carrying a database `date` means the log added no rows.
# Explicit test instead of assert + bare except: asserts vanish under -O and
# a bare except hides unrelated errors.
if re.shape[0] == dbRowCount:
    print('index data is complete')
else:
    display('%.2f%%' % (dbRowCount/re.shape[0] * 100))
    re = pd.merge(index, index1, on=cols, how='left')
    print('index data is not complete')

# Database rows (`num`) matched by several log rows: keep the pairing where
# the rank within `num` equals the rank within `sequenceNo`.
dupRows = re[re.duplicated('num', keep=False)]
p2 = re.drop_duplicates('num', keep=False)
# .copy() avoids the SettingWithCopyWarning the original cell emitted here.
p11 = dupRows.copy()
p11["order1"] = p11.groupby(["num"]).cumcount()
p11["order2"] = p11.groupby(["sequenceNo"]).cumcount()
p11 = p11[p11['order1'] == p11['order2']]

# p12: duplicated `num` values for which no pairing survived; one row each is
# kept with its sequenceNo cleared.
p12 = dupRows.drop_duplicates('num')
p12 = pd.merge(p12, p11[['num', 'order1']], on='num', how='left')
p12 = p12[p12['order1'].isnull()].copy()
p12['sequenceNo'] = np.nan

p11 = p11.drop(['order1', 'order2'], axis=1)
p12 = p12.drop(['order1'], axis=1)
p1 = pd.concat([p11, p12])

re = pd.concat([p1, p2])
assert(re[re.duplicated('num', keep=False)].shape[0] == 0)

re5 = re.sort_values(by='num')
if re['sequenceNo'].isnull().any():
    # seq1: sequenceNo back-filled within each skey, then forward-filled.
    re5['seq1'] = re5.groupby('skey')['sequenceNo'].bfill().ffill()
else:
    re5['seq1'] = re5['sequenceNo']
# skeys that exist in the database but not in the log get no seq1.
sl = list(set(index['skey'].unique()) - set(index1['skey'].unique()))
re5.loc[re5['skey'].isin(sl), 'seq1'] = np.nan
# count1: position inside the seq1 group; count2: number of distinct positions
# in that group.
re5['count1'] = re5.groupby(['seq1']).cumcount()
re5['count2'] = re5.groupby(['seq1'])['count1'].transform('nunique')
re5['max_seq'] = re5.groupby('skey')['sequenceNo'].transform('max')
# Unmatched rows attached to the skey's largest sequenceNo use
# count1 - count2; all other rows use count1 + 1 - count2.
re5['count'] = np.where((re5['seq1'] != re5['max_seq'])|(~re5['sequenceNo'].isnull()), re5['count1'] + 1 - re5['count2'], re5['count1'] - re5['count2'])
re5.drop(["max_seq"],axis=1,inplace=True)
re5.drop(["count1"],axis=1,inplace=True)
re5.drop(["count2"],axis=1,inplace=True)
# dup: rank of a matched row within its sequenceNo; dup1: number of distinct
# `num` values sharing that sequenceNo; nan: 1 for unmatched or repeated rows.
re5['dup'] = np.where(~re5["sequenceNo"].isnull(), re5.groupby(['sequenceNo']).cumcount(), 0)
re5['dup1'] = np.where(~re5["sequenceNo"].isnull(), re5.groupby(['sequenceNo'])['num'].transform('nunique'), 0)
re5['nan'] = np.where((re5['sequenceNo'].isnull()) | (re5['dup'] != 0), 1, 0)
re5.loc[(re5['dup1'] > 1) & (re5['count'] < 0), 'sequenceNo'] = np.nan
# Sanity checks: exactly the skeys in `sl` lack seq1, and no row was lost.
assert((len(set(sl) - set(re5[re5['seq1'].isnull()]['skey'].unique())) == 0) & 
       (len(set(re5[re5['seq1'].isnull()]['skey'].unique()) - set(sl)) == 0))
assert(re5.shape[0] == index.shape[0])

# Share of rows left without a sequenceNo.
display('%.2f%%' % (re5[re5['sequenceNo'].isnull()].shape[0]/re5.shape[0] * 100))


893190

893190

893190

14716

16888

index data is complete


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


'2.71%'

In [36]:
# Latest `time` value per skey in p11.
p11['time'].groupby(p11['skey']).max()

skey
1000016    151715000000
1000300    151709000000
1000852    151710000000
1000905    151713000000
Name: time, dtype: int64

In [38]:
# Show every p12 row for skey 1000016.
p12.loc[p12['skey'].eq(1000016)]

Unnamed: 0,skey,date,cum_volume,cum_amount,close,open,num,clockAtArrival,sequenceNo,time
1085,1000016,20200221,46509479,82434670000.0,2968.2975,2965.6019,10000163594,1582268245624560,,145707000000
1086,1000016,20200221,46509479,82434670000.0,2968.2975,2965.6019,10000163595,1582268245624560,,145707000000
1087,1000016,20200221,46509479,82434670000.0,2968.2975,2965.6019,10000163596,1582268245624560,,145707000000
1157,1000016,20200221,46920152,83241630000.0,2968.1381,2965.6019,10000163667,1582268445778429,,150027000000
1158,1000016,20200221,46920152,83241630000.0,2968.1381,2965.6019,10000163668,1582268445778429,,150027000000
1159,1000016,20200221,46920152,83241630000.0,2968.1381,2965.6019,10000163669,1582268445778429,,150027000000
1160,1000016,20200221,46920152,83241630000.0,2968.1381,2965.6019,10000163670,1582268445778429,,150027000000
1161,1000016,20200221,46920152,83241630000.0,2968.1381,2965.6019,10000163671,1582268445778429,,150027000000
1162,1000016,20200221,46920152,83241630000.0,2968.1381,2965.6019,10000163672,1582268445778429,,150027000000
1163,1000016,20200221,46920152,83241630000.0,2968.1381,2965.6019,10000163673,1582268445778429,,150027000000


In [40]:
# Show the `index` rows of skey 1000016 that share cum_volume 46920152.
index.loc[index['skey'].eq(1000016) & index['cum_volume'].eq(46920152)]

Unnamed: 0,skey,date,cum_volume,cum_amount,close,open,num
3600,1000016,20200221,46920152,83241630000.0,2968.1381,2965.6019,10000163601
3601,1000016,20200221,46920152,83241630000.0,2968.1381,2965.6019,10000163602
3602,1000016,20200221,46920152,83241630000.0,2968.1381,2965.6019,10000163603
3603,1000016,20200221,46920152,83241630000.0,2968.1381,2965.6019,10000163604
3604,1000016,20200221,46920152,83241630000.0,2968.1381,2965.6019,10000163605
3605,1000016,20200221,46920152,83241630000.0,2968.1381,2965.6019,10000163606
3606,1000016,20200221,46920152,83241630000.0,2968.1381,2965.6019,10000163607
3607,1000016,20200221,46920152,83241630000.0,2968.1381,2965.6019,10000163608
3608,1000016,20200221,46920152,83241630000.0,2968.1381,2965.6019,10000163609
3609,1000016,20200221,46920152,83241630000.0,2968.1381,2965.6019,10000163610


### 6. concat all together

In [6]:
# Sanity-check the raw sequence numbers, drop the raw frames to free memory,
# then label each reconciled frame with the source it came from.
# NOTE(review): a chained intersection is empty as soon as ONE source lacks a
# value, so this only rules out a sequenceNo present in all five sources, not
# one shared by just two of them -- confirm that is the intended check.
common_seq = set(SZ1['sequenceNo'])
common_seq &= set(SH1['sequenceNo'])
common_seq &= set(trade1['sequenceNo'])
common_seq &= set(order1['sequenceNo'])
common_seq &= set(index1['sequenceNo'])
assert len(common_seq) == 0
del common_seq

del SH, SH1
del SZ, SZ1
del trade, trade1
del order, order1
del index, index1

for labelled, source in ((re1, 'SH'), (re2, 'SZ'), (re3, 'trade'), (re4, 'order'), (re5, 'index')):
    labelled['tag'] = source
# Do not let the loop variables keep the last frame alive.
del labelled, source

In [None]:
# Use the fully qualified option key: the bare "max_rows" pattern is ambiguous
# on pandas versions that also define "styler.render.max_rows" and raises
# OptionError there.
pd.set_option("display.max_rows", 200)
# Preview the first 200 rows of the reconciled SH frame.
re1.iloc[:200]

In [None]:
# Rows of re3 anchored on sequence number 157255321.
re3.loc[re3['seq1'].eq(157255321)]

In [None]:
# Rows of re1 flagged `nan` that nevertheless have an anchor in `seq1`.
re1.loc[re1['nan'].eq(1) & re1['seq1'].notnull()]

In [None]:
# Rows of re1 anchored on sequence number 1088550.
re1.loc[re1['seq1'].eq(1088550.0)]

In [None]:
# Rows of re2 anchored on sequence number 1189.
re2.loc[re2['seq1'].eq(1189.0)]

In [None]:
# Rows of re4 anchored on sequence number 156717319.
re4.loc[re4['seq1'].eq(156717319.0)]

In [None]:
# Rows of re5 whose `dup1` value is exactly 2.
re5.loc[re5['dup1'].eq(2)]

In [None]:
# Rows of re3 anchored on sequence number 157255321.
re3.loc[re3['seq1'].eq(157255321.0)]

In [7]:
# Reduce every reconciled frame to the shared column layout, then order it by
# its own key and record that order in `seq2`.
_TAIL_COLS = ['sequenceNo', 'seq1', 'clockAtArrival', 'nan', 'count', 'tag', 'dup1']


def _layout(id_col):
    """Return the column list kept for a frame whose row id lives in `id_col`."""
    return ['skey', 'date', id_col] + _TAIL_COLS


def _rank(frame, order_by):
    """Sort `frame` by `order_by`, renumber it and store the position as `seq2`."""
    ranked = frame.sort_values(by=order_by).reset_index(drop=True)
    ranked['seq2'] = ranked.index
    return ranked


# Narrow all five frames first so the wide versions are released before any
# of the sorts run.
re1 = re1[_layout('num')]
re2 = re2[_layout('num')]
re3 = re3[_layout('ApplSeqNum')]
re4 = re4[_layout('ApplSeqNum')]
re5 = re5[_layout('num')]

re1 = _rank(re1, 'num')
re2 = _rank(re2, 'num')
re3 = _rank(re3, ['skey', 'ApplSeqNum'])
re4 = _rank(re4, ['skey', 'ApplSeqNum'])
re5 = _rank(re5, 'num')

In [8]:
import datetime


def _stash(frame, without_anchor, with_anchor):
    """Append the rows of `frame` lacking `seq1` to one list and the rest to the other."""
    missing = frame['seq1'].isnull()
    without_anchor.append(frame[missing])
    with_anchor.append(frame[~missing])


# fr1 collects rows that have no anchor sequence number, fr2 the rows that do.
# Each source frame is dropped right after it is split to keep memory down.
fr1, fr2 = [], []

_stash(re1, fr1, fr2)
del re1
display('1. here~')

_stash(re2, fr1, fr2)
del re2
display('2. here~')

_stash(re3, fr1, fr2)
del re3
display('3. here~')

_stash(re4, fr1, fr2)
del re4
display('4. here~')

_stash(re5, fr1, fr2)
del re5
display('5. here~')

fr1 = pd.concat(fr1).reset_index(drop=True)
fr2 = pd.concat(fr2).reset_index(drop=True)

# Order the anchored rows by anchor, then by per-source position; timed
# because this is the slow step of the cell.
startTm = datetime.datetime.now()
fr2 = fr2.sort_values(by=['seq1', 'seq2'])
print(datetime.datetime.now() - startTm)

'1. here~'

'2. here~'

'3. here~'

'4. here~'

'5. here~'

0:07:38.313590


In [9]:
# Use the fully qualified option key: the bare 'max_rows' pattern is ambiguous
# on pandas versions that also define 'styler.render.max_rows' and raises
# OptionError there.
pd.set_option('display.max_rows', 200)
# Preview the first 100 rows of the sorted, anchored frame.
fr2.iloc[:100]

Unnamed: 0,skey,date,num,sequenceNo,seq1,clockAtArrival,nan,count,tag,dup1,seq2,ApplSeqNum
141804349,1000016,20200221.0,10000160000.0,59.0,59.0,1582245000000000.0,0,0.0,index,1.0,0,
141808028,1000300,20200221.0,10003000000.0,60.0,60.0,1582245000000000.0,0,0.0,index,1.0,3679,
141811707,1000852,20200221.0,10008520000.0,61.0,61.0,1582245000000000.0,0,0.0,index,1.0,7358,
141815386,1000905,20200221.0,10009050000.0,62.0,62.0,1582245000000000.0,0,0.0,index,1.0,11037,
141804350,1000016,20200221.0,10000160000.0,329.0,329.0,1582245000000000.0,0,0.0,index,1.0,1,
141808029,1000300,20200221.0,10003000000.0,330.0,330.0,1582245000000000.0,0,0.0,index,1.0,3680,
141811708,1000852,20200221.0,10008520000.0,331.0,331.0,1582245000000000.0,0,0.0,index,1.0,7359,
141815387,1000905,20200221.0,10009050000.0,332.0,332.0,1582245000000000.0,0,0.0,index,1.0,11038,
141804351,1000016,20200221.0,10000160000.0,596.0,596.0,1582245000000000.0,0,0.0,index,1.0,2,
141808030,1000300,20200221.0,10003000000.0,597.0,597.0,1582245000000000.0,0,0.0,index,1.0,3681,


In [10]:
# Index-tagged rows of skey 1000016 inside the combined frame.
fr2.loc[fr2['skey'].eq(1000016) & fr2['tag'].eq('index')]

Unnamed: 0,skey,date,num,sequenceNo,seq1,clockAtArrival,nan,count,tag,dup1,seq2,ApplSeqNum
141804349,1000016,20200221.0,1.000016e+10,59.0,59.0,1.582245e+15,0,0.0,index,1.0,0,
141804350,1000016,20200221.0,1.000016e+10,329.0,329.0,1.582245e+15,0,0.0,index,1.0,1,
141804351,1000016,20200221.0,1.000016e+10,596.0,596.0,1.582245e+15,0,0.0,index,1.0,2,
141804352,1000016,20200221.0,1.000016e+10,852.0,852.0,1.582245e+15,0,0.0,index,1.0,3,
141804353,1000016,20200221.0,1.000016e+10,1133.0,1133.0,1.582245e+15,0,0.0,index,1.0,4,
...,...,...,...,...,...,...,...,...,...,...,...,...
141808023,1000016,20200221.0,1.000016e+10,,157410658.0,1.582268e+15,1,-5.0,index,0.0,3674,
141808024,1000016,20200221.0,1.000016e+10,,157410658.0,1.582268e+15,1,-4.0,index,0.0,3675,
141808025,1000016,20200221.0,1.000016e+10,,157410658.0,1.582268e+15,1,-3.0,index,0.0,3676,
141808026,1000016,20200221.0,1.000016e+10,,157410658.0,1.582268e+15,1,-2.0,index,0.0,3677,


In [11]:
# Rows that are not flagged `nan` and whose `dup1` is 1 get a zero offset.
fr2.loc[fr2['nan'].eq(0) & fr2['dup1'].eq(1), 'count'] = 0

In [12]:
# Use the fully qualified option key: the bare 'max_rows' pattern is ambiguous
# on pandas versions that also define 'styler.render.max_rows' and raises
# OptionError there.
pd.set_option('display.max_rows', 200)
# Rows of the combined frame whose `dup1` value is exactly 2.
fr2.loc[fr2['dup1'].eq(2)]

Unnamed: 0,skey,date,num,sequenceNo,seq1,clockAtArrival,nan,count,tag,dup1,seq2,ApplSeqNum
10309,1600006,20200221.0,1.600006e+10,,342915.0,1.582248e+15,0,-1.0,SH,2.0,10309,
10310,1600006,20200221.0,1.600006e+10,342915.0,342915.0,1.582248e+15,1,0.0,SH,2.0,10310,
15235,1600007,20200221.0,1.600007e+10,,342916.0,1.582248e+15,0,-1.0,SH,2.0,15235,
15236,1600007,20200221.0,1.600007e+10,342916.0,342916.0,1.582248e+15,1,0.0,SH,2.0,15236,
28539,1600010,20200221.0,1.600010e+10,,342917.0,1.582248e+15,0,-1.0,SH,2.0,28539,
...,...,...,...,...,...,...,...,...,...,...,...,...
2219540,1600621,20200221.0,1.600621e+10,157185555.0,157185555.0,1.582268e+15,1,0.0,SH,2.0,2219540,
2134431,1600598,20200221.0,1.600599e+10,,157185616.0,1.582268e+15,0,-1.0,SH,2.0,2134431,
2134432,1600598,20200221.0,1.600599e+10,157185616.0,157185616.0,1.582268e+15,1,0.0,SH,2.0,2134432,
2723098,1600756,20200221.0,1.600757e+10,,157185621.0,1.582268e+15,0,-1.0,SH,2.0,2723098,


In [13]:
# Rebuild a sequenceNo for every row of fr2 (already ordered by seq1, seq2).
# Known numbers are shifted up by the running count of flagged rows, the gaps
# are back-filled from the next known number, and `count` then offsets each
# row relative to that number.
fr2['sum_nan'] = fr2['nan'].cumsum()
fr2['sequenceNo'] = fr2['sequenceNo'] + fr2['sum_nan']
fr2['sequenceNo'] = fr2['sequenceNo'].bfill()
fr2['sequenceNo'] = fr2['sequenceNo'] + fr2['count']

# Rows still empty after the back-fill have no later known number to borrow.
missing_seq = fr2['sequenceNo'].isnull()
fr21 = fr2[~missing_seq]
# Explicit copy: this subset is assigned to below, and writing into a plain
# filtered slice triggers pandas' SettingWithCopyWarning.
fr22 = fr2[missing_seq].copy()
del missing_seq
display(fr22.shape[0])
display(fr21.shape[0])
display(fr2.shape[0])
if fr22.shape[0] != 0:
    # Number the leftovers consecutively after the largest assigned value
    # (computed once; fr21 is large).
    next_seq = int(fr21['sequenceNo'].max()) + 1
    fr22['sequenceNo'] = range(next_seq, next_seq + fr22.shape[0])
    fr2 = pd.concat([fr21, fr22])
del fr21
del fr22
display(fr2.shape[0])

13

141819052

141819065

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


141819065

In [14]:
# Row count of the combined frame.
len(fr2)

141819065

In [21]:
# Rows without any anchor are numbered consecutively after the largest
# sequenceNo already present in fr2, then placed in front of fr2.
first_free = int(fr2['sequenceNo'].max()) + 1
fr1['sequenceNo'] = range(first_free, first_free + fr1.shape[0])
fr2 = pd.concat([fr1, fr2])
del fr1, first_free

In [22]:
# Peek at the first row of the final frame.
fr2.iloc[:1]

Unnamed: 0,skey,date,num,sequenceNo,seq1,clockAtArrival,nan,count,tag,dup1,seq2,ApplSeqNum,sum_nan
0,1688080,20200221.0,16880800000.0,157668404.0,,,1,,SH,0.0,6539289,,


In [24]:
import os

# Write one CSV per source tag into /mnt/e/result/<startDate>/.
# str() keeps the path valid whether startDate is an int or a string, and
# makedirs(exist_ok=True) lets the cell be re-run without failing on an
# already existing directory.
out_dir = os.path.join('/mnt/e/result', str(startDate))
os.makedirs(out_dir, exist_ok=True)

# (tag, name of the per-row id column kept for that source)
for tag, id_col in (('SH', 'num'),
                    ('SZ', 'num'),
                    ('trade', 'ApplSeqNum'),
                    ('order', 'ApplSeqNum'),
                    ('index', 'num')):
    subset = fr2[fr2['tag'] == tag][["skey", "date", id_col, 'sequenceNo', "clockAtArrival"]]
    subset.to_csv(os.path.join(out_dir, tag + '.csv'))
    # Release each subset before building the next one.
    del subset
del tag, id_col, out_dir
del fr2

In [3]:
# SH
# Load one day of L2 snapshots, attach the reconstructed sequenceNo (and
# clockAtArrival) to the SH rows and save the result.
startDate = 20200102
endDate = 20200102
database_name = 'com_md_eq_cn'
# NOTE(review): credentials are hard-coded in the notebook; consider loading
# them from the environment or a secrets store instead.
user = "zhenyuy"
password = "bnONBrzSMGoE"

db = DB("192.168.10.178", database_name, user, password)
SH = db.read('md_snapshot_l2', start_date=startDate, end_date=endDate)
# Build the join key before splitting, so no column is ever assigned onto a
# filtered slice (which raises pandas' SettingWithCopyWarning).
SH['num'] = SH['skey'] * 10000 + SH['ordering']
SZ = SH[SH['skey'] > 2000000]
SH = SH[SH['skey'] < 2000000]

# Paths follow startDate instead of repeating the date literal.
result_dir = '/mnt/e/result/%s' % startDate
SH1 = pd.read_pickle(result_dir + '/SH.pkl')

assert(SH.shape[0] == SH1.shape[0])

# NOTE(review): when the snapshot table already carries clockAtArrival, this
# merge yields clockAtArrival_x / clockAtArrival_y (a later cell filters on
# clockAtArrival_y) -- confirm both columns are meant to be saved.
SH = pd.merge(SH, SH1[['num', 'clockAtArrival', 'sequenceNo']], on=['num'], how='outer')
# With an outer join, a null on either side means a row failed to match.
assert(SH[SH['sequenceNo'].isnull()].shape[0] == 0)
assert(SH[SH['time'].isnull()].shape[0] == 0)
SH.drop(['num'],axis=1,inplace=True)
SH = SH.sort_values(by=['skey', 'ordering'])

SH.to_pickle(result_dir + '/add_columns/SH.pkl')

In [10]:
# Rows of skey 1600000 whose merged clockAtArrival_y is missing.
SH.loc[SH['clockAtArrival_y'].isnull() & SH['skey'].eq(1600000)]

Unnamed: 0,skey,date,time,clockAtArrival_x,datetime,ordering,has_missing,cum_trades_cnt,cum_volume,cum_amount,prev_close,open,high,low,close,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,bid1Top1q,bid1Top2q,bid1Top3q,bid1Top4q,bid1Top5q,bid1Top6q,bid1Top7q,bid1Top8q,bid1Top9q,bid1Top10q,bid1Top11q,bid1Top12q,bid1Top13q,bid1Top14q,bid1Top15q,bid1Top16q,bid1Top17q,bid1Top18q,bid1Top19q,bid1Top20q,bid1Top21q,bid1Top22q,bid1Top23q,bid1Top24q,bid1Top25q,bid1Top26q,bid1Top27q,bid1Top28q,bid1Top29q,bid1Top30q,bid1Top31q,bid1Top32q,bid1Top33q,bid1Top34q,bid1Top35q,bid1Top36q,bid1Top37q,bid1Top38q,bid1Top39q,bid1Top40q,bid1Top41q,bid1Top42q,bid1Top43q,bid1Top44q,bid1Top45q,bid1Top46q,bid1Top47q,bid1Top48q,bid1Top49q,bid1Top50q,ask1Top1q,ask1Top2q,ask1Top3q,ask1Top4q,ask1Top5q,ask1Top6q,ask1Top7q,ask1Top8q,ask1Top9q,ask1Top10q,ask1Top11q,ask1Top12q,ask1Top13q,ask1Top14q,ask1Top15q,ask1Top16q,ask1Top17q,ask1Top18q,ask1Top19q,ask1Top20q,ask1Top21q,ask1Top22q,ask1Top23q,ask1Top24q,ask1Top25q,ask1Top26q,ask1Top27q,ask1Top28q,ask1Top29q,ask1Top30q,ask1Top31q,ask1Top32q,ask1Top33q,ask1Top34q,ask1Top35q,ask1Top36q,ask1Top37q,ask1Top38q,ask1Top39q,ask1Top40q,ask1Top41q,ask1Top42q,ask1Top43q,ask1Top44q,ask1Top45q,ask1Top46q,ask1Top47q,ask1Top48q,ask1Top49q,ask1Top50q,total_bid_quantity,total_ask_quantity,total_bid_vwap,total_ask_vwap,total_bid_orders,total_ask_orders,total_bid_levels,total_ask_levels,bid_trade_max_duration,ask_trade_max_duration,cum_canceled_buy_orders,cum_canceled_buy_volume,cum_canceled_buy_amount,cum_canceled_sell_orders,cum_canceled_sell_volume,cum_canceled_sell_amount,clockAtArrival_y,sequenceNo
554,1600000,20200103,95054000000,1578016254000000,2020-01-03 09:50:54,555,0,2680,5090831,63707610.0,12.47,12.57,12.57,12.48,12.5,12.41,12.42,12.43,12.44,12.45,12.46,12.47,12.48,12.49,12.5,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,12.6,110500,133700,97100,19100,472400,157500,267200,109000,29200,15200,32300,26800,83943,113600,191786,207200,309350,119100,230000,179300,44,31,21,15,61,67,51,29,9,4,10,10,18,36,60,62,29,34,44,82,11800,3000,100,300,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100,14100,5900,100,500,3000,4000,1800,100,2700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3329600,5862974,12.31,12.877,859,1932,80,114,616,466,559,3892269,48584530.0,377,1424250,17912120.0,,21402701.0
677,1600000,20200103,95657000000,1578016617000000,2020-01-03 09:56:57,678,0,3184,5944131,74368130.0,12.47,12.57,12.57,12.48,12.5,12.4,12.41,12.42,12.43,12.44,12.45,12.46,12.47,12.48,12.49,12.5,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,401500,114300,152900,99100,18500,481100,170700,285700,128500,74800,92500,33400,56777,110743,135000,206286,205900,310650,140900,231000,73,46,34,22,14,63,71,51,48,4,31,12,21,21,35,64,62,31,40,46,71000,3000,100,700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,400,1000,1200,4000,1200,1200,900,10000,500,2400,3500,800,8300,400,800,1000,1600,1000,1100,600,10000,2200,1100,7900,6700,8100,7000,4600,2400,100,500,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3492900,6169311,12.314,12.865,901,2041,79,115,616,466,605,4032169,50329370.0,429,1502850,18897960.0,,25333665.0
901,1600000,20200103,100803000000,1578017283000000,2020-01-03 10:08:03,902,0,4169,7469074,93404170.0,12.47,12.57,12.57,12.47,12.49,12.4,12.41,12.42,12.43,12.44,12.45,12.46,12.47,12.48,12.49,12.5,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,406900,143300,159500,137200,148600,429900,194300,199100,126400,93700,232700,37400,102900,113643,128400,222286,204700,283050,142600,239600,78,53,39,41,21,73,90,61,33,19,19,14,27,25,30,64,61,33,42,51,62400,100,4000,700,200,800,300,500,5000,2000,3000,900,100,1800,1400,10000,300,100,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,203800,300,5100,4000,5100,200,800,200,100,1700,3600,1000,100,200,100,4000,100,600,1700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3663300,6450499,12.318,12.856,991,2096,79,115,1725,659,728,4615126,57588240.0,530,1717227,21583980.0,,31682141.0
1915,1600000,20200103,105830000000,1578020310000000,2020-01-03 10:58:30,1916,0,6859,13341866,166851900.0,12.47,12.57,12.57,12.47,12.5,12.4,12.41,12.42,12.43,12.44,12.45,12.46,12.47,12.48,12.49,12.5,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,441000,131600,143800,116300,47100,390200,172800,170100,316568,154724,117267,91360,238200,600637,156800,286386,232900,355050,176100,273407,88,51,42,38,21,87,84,79,29,30,54,47,63,44,50,89,73,55,54,58,924,2000,600,900,1000,1000,800,1000,100,900,11700,50100,100,7600,35000,3000,100,6000,2300,1000,1000,6600,200,5900,4000,4800,1200,1200,300,3400,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,29800,1000,1000,2000,1000,500,13177,400,1400,5100,800,10000,100,200,100,100,200,400,300,200,9700,300,500,1000,3600,100,300,600,400,200,100,5000,300,3400,400,600,3000,700,500,990,300,5600,500,500,1000,800,200,300,100,7100,4730692,7684929,12.169,12.83,1050,2548,81,115,2595,2569,1181,6297842,78561710.0,1007,3828428,48064080.0,,54221099.0
1935,1600000,20200103,105930000000,1578020370000000,2020-01-03 10:59:30,1936,0,6895,13397174,167543100.0,12.47,12.57,12.57,12.47,12.49,12.4,12.41,12.42,12.43,12.44,12.45,12.46,12.47,12.48,12.49,12.5,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,541000,131600,143800,116300,47100,389900,167800,170100,320368,180816,74967,94360,248200,600637,166000,296788,232900,355050,176100,274407,89,51,42,38,21,86,83,79,30,29,53,49,64,44,52,91,73,55,54,59,216,11700,50100,7600,35000,3000,100,6000,2300,1000,1000,6600,5900,4000,4800,1200,1200,300,400,1200,16400,1000,2000,5000,1000,6200,1000,600,4000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2177,400,1400,5100,800,10000,100,200,100,100,200,400,300,200,9700,300,500,1000,3600,100,300,600,400,200,100,5000,300,3400,400,600,3000,700,500,990,300,5600,500,500,1000,800,200,300,100,7100,200,300,800,100,200,1400,4855284,7677231,12.175,12.831,1049,2556,81,115,2595,2569,1190,6313742,78760110.0,1011,3830128,48085330.0,,54549484.0
3467,1600000,20200103,134125000000,1578030085000000,2020-01-03 13:41:25,3468,0,12674,24714969,309492600.0,12.47,12.57,12.61,12.47,12.6,12.5,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,12.6,12.61,12.62,12.63,12.64,12.65,12.66,12.67,12.68,12.69,150055,106000,81900,222200,67000,37600,223100,300807,128400,143670,26941,218101,341190,199000,383700,527930,203415,89300,253150,213191,52,25,10,11,11,12,10,4,17,13,7,76,80,86,72,151,79,31,100,79,39570,80000,1700,300,100,3200,1700,200,200,6100,300,3000,7300,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16741,2100,100,100,2000,4900,1000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4933500,7131495,12.384,13.022,1169,2276,92,106,2595,9411,2005,10281431,127436100.0,1727,5587180,70270690.0,,80656068.0
3790,1600000,20200103,135731000000,1578031051000000,2020-01-03 13:57:31,3791,0,14101,27850433,349006300.0,12.47,12.57,12.61,12.47,12.6,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,12.6,12.61,12.62,12.63,12.64,12.65,12.66,12.67,12.68,12.69,12.7,96700,71700,119900,67400,56300,208700,360307,162500,192754,60420,230698,489267,275387,436300,608790,225005,151900,319450,249891,258445,29,13,10,12,19,7,8,25,25,12,96,114,106,105,187,93,39,107,89,112,41220,300,5900,300,700,2000,300,900,400,500,6900,1000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,34342,800,200,1000,10000,600,200,10000,17200,1000,500,5000,500,900,2000,4000,500,20000,1000,300,500,2000,1000,1000,2000,1000,300,2000,1000,200,1600,1000,3000,3000,1000,800,400,500,1000,2400,3000,3500,1300,3000,1500,1000,200,1000,200,1000,4767404,8080109,12.384,12.991,1179,2523,93,105,2595,9411,2193,11337763,140678300.0,1953,6198480,78001430.0,,86105037.0
4991,1600000,20200103,145711000000,1578034631000000,2020-01-03 14:57:11,4992,0,18511,37196510,466692400.0,12.47,12.57,12.63,12.47,12.59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.6,12.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,181500,181500,614291,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3482021,9119733,12.355,12.949,1016,2737,87,108,4122,11016,2964,14275869,177414000.0,2797,9028155,113829100.0,,110957639.0
5006,1600000,20200103,145811000000,1578034691000000,2020-01-03 14:58:11,5007,0,18511,37196510,466692400.0,12.47,12.57,12.63,12.47,12.59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.6,12.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,284900,284900,604091,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3482021,9119733,12.355,12.949,1016,2737,87,108,4122,11016,2964,14275869,177414000.0,2797,9028155,113829100.0,,111142810.0
5056,1600000,20200103,151435000000,1578035675000000,2020-01-03 15:14:35,5057,0,18682,38018810,477053400.0,12.47,12.57,12.63,12.47,12.6,12.5,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,12.6,12.61,12.62,12.63,12.64,12.65,12.66,12.67,12.68,12.69,242655,141700,88000,161000,108900,256000,171500,205064,113634,61800,505651,315180,438762,548587,501400,642290,172930,151981,321850,280291,65,32,20,25,29,46,27,28,19,20,79,123,141,167,111,227,100,37,124,97,200,1000,10000,17000,9400,1000,3600,1000,2300,1500,700,2000,100,4000,1500,100,3800,1000,500,1100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15491,1200,500,1200,3600,1000,5000,1000,500,500,1000,900,200000,9900,1000,400,2000,5000,1000,1000,500,15000,500,4200,200,30000,1500,100,300,400,100,200,500,11000,200,5700,300,300,1800,20000,2000,600,100,500,1700,300,300,1000,5000,30000,3872921,9247793,12.375,12.948,1086,2805,88,107,4122,11016,2964,14275869,177414000.0,2797,9028155,113829100.0,,112139978.0


In [12]:
# Use the fully qualified option key: the bare 'max_rows' pattern is ambiguous
# on pandas versions that also define 'styler.render.max_rows' and raises
# OptionError there.
pd.set_option('display.max_rows', 200)
# Up to 200 rows of skey 1600000 with `time` above 151400000000.
SH.loc[SH['skey'].eq(1600000) & SH['time'].gt(151400000000)].iloc[:200]

Unnamed: 0,skey,date,time,clockAtArrival_x,datetime,ordering,has_missing,cum_trades_cnt,cum_volume,cum_amount,prev_close,open,high,low,close,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,bid1Top1q,bid1Top2q,bid1Top3q,bid1Top4q,bid1Top5q,bid1Top6q,bid1Top7q,bid1Top8q,bid1Top9q,bid1Top10q,bid1Top11q,bid1Top12q,bid1Top13q,bid1Top14q,bid1Top15q,bid1Top16q,bid1Top17q,bid1Top18q,bid1Top19q,bid1Top20q,bid1Top21q,bid1Top22q,bid1Top23q,bid1Top24q,bid1Top25q,bid1Top26q,bid1Top27q,bid1Top28q,bid1Top29q,bid1Top30q,bid1Top31q,bid1Top32q,bid1Top33q,bid1Top34q,bid1Top35q,bid1Top36q,bid1Top37q,bid1Top38q,bid1Top39q,bid1Top40q,bid1Top41q,bid1Top42q,bid1Top43q,bid1Top44q,bid1Top45q,bid1Top46q,bid1Top47q,bid1Top48q,bid1Top49q,bid1Top50q,ask1Top1q,ask1Top2q,ask1Top3q,ask1Top4q,ask1Top5q,ask1Top6q,ask1Top7q,ask1Top8q,ask1Top9q,ask1Top10q,ask1Top11q,ask1Top12q,ask1Top13q,ask1Top14q,ask1Top15q,ask1Top16q,ask1Top17q,ask1Top18q,ask1Top19q,ask1Top20q,ask1Top21q,ask1Top22q,ask1Top23q,ask1Top24q,ask1Top25q,ask1Top26q,ask1Top27q,ask1Top28q,ask1Top29q,ask1Top30q,ask1Top31q,ask1Top32q,ask1Top33q,ask1Top34q,ask1Top35q,ask1Top36q,ask1Top37q,ask1Top38q,ask1Top39q,ask1Top40q,ask1Top41q,ask1Top42q,ask1Top43q,ask1Top44q,ask1Top45q,ask1Top46q,ask1Top47q,ask1Top48q,ask1Top49q,ask1Top50q,total_bid_quantity,total_ask_quantity,total_bid_vwap,total_ask_vwap,total_bid_orders,total_ask_orders,total_bid_levels,total_ask_levels,bid_trade_max_duration,ask_trade_max_duration,cum_canceled_buy_orders,cum_canceled_buy_volume,cum_canceled_buy_amount,cum_canceled_sell_orders,cum_canceled_sell_volume,cum_canceled_sell_amount,clockAtArrival_y,sequenceNo
5056,1600000,20200103,151435000000,1578035675000000,2020-01-03 15:14:35,5057,0,18682,38018810,477053357.4,12.47,12.57,12.63,12.47,12.6,12.5,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,12.6,12.61,12.62,12.63,12.64,12.65,12.66,12.67,12.68,12.69,242655,141700,88000,161000,108900,256000,171500,205064,113634,61800,505651,315180,438762,548587,501400,642290,172930,151981,321850,280291,65,32,20,25,29,46,27,28,19,20,79,123,141,167,111,227,100,37,124,97,200,1000,10000,17000,9400,1000,3600,1000,2300,1500,700,2000,100,4000,1500,100,3800,1000,500,1100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15491,1200,500,1200,3600,1000,5000,1000,500,500,1000,900,200000,9900,1000,400,2000,5000,1000,1000,500,15000,500,4200,200,30000,1500,100,300,400,100,200,500,11000,200,5700,300,300,1800,20000,2000,600,100,500,1700,300,300,1000,5000,30000,3872921,9247793,12.375,12.948,1086,2805,88,107,4122,11016,2964,14275869,177414000.0,2797,9028155,113829100.0,,112139978.0
5057,1600000,20200103,151535000000,1578035735000000,2020-01-03 15:15:35,5058,0,18682,38018810,477053357.4,12.47,12.57,12.63,12.47,12.6,12.5,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,12.6,12.61,12.62,12.63,12.64,12.65,12.66,12.67,12.68,12.69,242655,141700,88000,161000,108900,256000,171500,205064,113634,61800,505651,315180,438762,548587,501400,642290,172930,151981,321850,280291,65,32,20,25,29,46,27,28,19,20,79,123,141,167,111,227,100,37,124,97,200,1000,10000,17000,9400,1000,3600,1000,2300,1500,700,2000,100,4000,1500,100,3800,1000,500,1100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15491,1200,500,1200,3600,1000,5000,1000,500,500,1000,900,200000,9900,1000,400,2000,5000,1000,1000,500,15000,500,4200,200,30000,1500,100,300,400,100,200,500,11000,200,5700,300,300,1800,20000,2000,600,100,500,1700,300,300,1000,5000,30000,3872921,9247793,12.375,12.948,1086,2805,88,107,4122,11016,2964,14275869,177414000.0,2797,9028155,113829100.0,,112139979.0
5058,1600000,20200103,151635000000,1578035795000000,2020-01-03 15:16:35,5059,0,18682,38018810,477053357.4,12.47,12.57,12.63,12.47,12.6,12.5,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,12.6,12.61,12.62,12.63,12.64,12.65,12.66,12.67,12.68,12.69,242655,141700,88000,161000,108900,256000,171500,205064,113634,61800,505651,315180,438762,548587,501400,642290,172930,151981,321850,280291,65,32,20,25,29,46,27,28,19,20,79,123,141,167,111,227,100,37,124,97,200,1000,10000,17000,9400,1000,3600,1000,2300,1500,700,2000,100,4000,1500,100,3800,1000,500,1100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15491,1200,500,1200,3600,1000,5000,1000,500,500,1000,900,200000,9900,1000,400,2000,5000,1000,1000,500,15000,500,4200,200,30000,1500,100,300,400,100,200,500,11000,200,5700,300,300,1800,20000,2000,600,100,500,1700,300,300,1000,5000,30000,3872921,9247793,12.375,12.948,1086,2805,88,107,4122,11016,2964,14275869,177414000.0,2797,9028155,113829100.0,,112139980.0
5059,1600000,20200103,151735000000,1578035855000000,2020-01-03 15:17:35,5060,0,18682,38018810,477053357.4,12.47,12.57,12.63,12.47,12.6,12.5,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,12.6,12.61,12.62,12.63,12.64,12.65,12.66,12.67,12.68,12.69,242655,141700,88000,161000,108900,256000,171500,205064,113634,61800,505651,315180,438762,548587,501400,642290,172930,151981,321850,280291,65,32,20,25,29,46,27,28,19,20,79,123,141,167,111,227,100,37,124,97,200,1000,10000,17000,9400,1000,3600,1000,2300,1500,700,2000,100,4000,1500,100,3800,1000,500,1100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15491,1200,500,1200,3600,1000,5000,1000,500,500,1000,900,200000,9900,1000,400,2000,5000,1000,1000,500,15000,500,4200,200,30000,1500,100,300,400,100,200,500,11000,200,5700,300,300,1800,20000,2000,600,100,500,1700,300,300,1000,5000,30000,3872921,9247793,12.375,12.948,1086,2805,88,107,4122,11016,2964,14275869,177414000.0,2797,9028155,113829100.0,,112139981.0
5060,1600000,20200103,151835000000,1578035915000000,2020-01-03 15:18:35,5061,0,18682,38018810,477053357.4,12.47,12.57,12.63,12.47,12.6,12.5,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,12.6,12.61,12.62,12.63,12.64,12.65,12.66,12.67,12.68,12.69,242655,141700,88000,161000,108900,256000,171500,205064,113634,61800,505651,315180,438762,548587,501400,642290,172930,151981,321850,280291,65,32,20,25,29,46,27,28,19,20,79,123,141,167,111,227,100,37,124,97,200,1000,10000,17000,9400,1000,3600,1000,2300,1500,700,2000,100,4000,1500,100,3800,1000,500,1100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15491,1200,500,1200,3600,1000,5000,1000,500,500,1000,900,200000,9900,1000,400,2000,5000,1000,1000,500,15000,500,4200,200,30000,1500,100,300,400,100,200,500,11000,200,5700,300,300,1800,20000,2000,600,100,500,1700,300,300,1000,5000,30000,3872921,9247793,12.375,12.948,1086,2805,88,107,4122,11016,2964,14275869,177414000.0,2797,9028155,113829100.0,,112139982.0
5061,1600000,20200103,151935000000,1578035975000000,2020-01-03 15:19:35,5062,0,18682,38018810,477053357.4,12.47,12.57,12.63,12.47,12.6,12.5,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,12.6,12.61,12.62,12.63,12.64,12.65,12.66,12.67,12.68,12.69,242655,141700,88000,161000,108900,256000,171500,205064,113634,61800,505651,315180,438762,548587,501400,642290,172930,151981,321850,280291,65,32,20,25,29,46,27,28,19,20,79,123,141,167,111,227,100,37,124,97,200,1000,10000,17000,9400,1000,3600,1000,2300,1500,700,2000,100,4000,1500,100,3800,1000,500,1100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15491,1200,500,1200,3600,1000,5000,1000,500,500,1000,900,200000,9900,1000,400,2000,5000,1000,1000,500,15000,500,4200,200,30000,1500,100,300,400,100,200,500,11000,200,5700,300,300,1800,20000,2000,600,100,500,1700,300,300,1000,5000,30000,3872921,9247793,12.375,12.948,1086,2805,88,107,4122,11016,2964,14275869,177414000.0,2797,9028155,113829100.0,,112139983.0
5062,1600000,20200103,152035000000,1578036035000000,2020-01-03 15:20:35,5063,0,18682,38018810,477053357.4,12.47,12.57,12.63,12.47,12.6,12.5,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,12.6,12.61,12.62,12.63,12.64,12.65,12.66,12.67,12.68,12.69,242655,141700,88000,161000,108900,256000,171500,205064,113634,61800,505651,315180,438762,548587,501400,642290,172930,151981,321850,280291,65,32,20,25,29,46,27,28,19,20,79,123,141,167,111,227,100,37,124,97,200,1000,10000,17000,9400,1000,3600,1000,2300,1500,700,2000,100,4000,1500,100,3800,1000,500,1100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15491,1200,500,1200,3600,1000,5000,1000,500,500,1000,900,200000,9900,1000,400,2000,5000,1000,1000,500,15000,500,4200,200,30000,1500,100,300,400,100,200,500,11000,200,5700,300,300,1800,20000,2000,600,100,500,1700,300,300,1000,5000,30000,3872921,9247793,12.375,12.948,1086,2805,88,107,4122,11016,2964,14275869,177414000.0,2797,9028155,113829100.0,,112139984.0
5063,1600000,20200103,152136000000,1578036096000000,2020-01-03 15:21:36,5064,0,18682,38018810,477053357.4,12.47,12.57,12.63,12.47,12.6,12.5,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,12.6,12.61,12.62,12.63,12.64,12.65,12.66,12.67,12.68,12.69,242655,141700,88000,161000,108900,256000,171500,205064,113634,61800,505651,315180,438762,548587,501400,642290,172930,151981,321850,280291,65,32,20,25,29,46,27,28,19,20,79,123,141,167,111,227,100,37,124,97,200,1000,10000,17000,9400,1000,3600,1000,2300,1500,700,2000,100,4000,1500,100,3800,1000,500,1100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15491,1200,500,1200,3600,1000,5000,1000,500,500,1000,900,200000,9900,1000,400,2000,5000,1000,1000,500,15000,500,4200,200,30000,1500,100,300,400,100,200,500,11000,200,5700,300,300,1800,20000,2000,600,100,500,1700,300,300,1000,5000,30000,3872921,9247793,12.375,12.948,1086,2805,88,107,4122,11016,2964,14275869,177414000.0,2797,9028155,113829100.0,,112139985.0
5064,1600000,20200103,152235000000,1578036155000000,2020-01-03 15:22:35,5065,0,18682,38018810,477053357.4,12.47,12.57,12.63,12.47,12.6,12.5,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,12.6,12.61,12.62,12.63,12.64,12.65,12.66,12.67,12.68,12.69,242655,141700,88000,161000,108900,256000,171500,205064,113634,61800,505651,315180,438762,548587,501400,642290,172930,151981,321850,280291,65,32,20,25,29,46,27,28,19,20,79,123,141,167,111,227,100,37,124,97,200,1000,10000,17000,9400,1000,3600,1000,2300,1500,700,2000,100,4000,1500,100,3800,1000,500,1100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15491,1200,500,1200,3600,1000,5000,1000,500,500,1000,900,200000,9900,1000,400,2000,5000,1000,1000,500,15000,500,4200,200,30000,1500,100,300,400,100,200,500,11000,200,5700,300,300,1800,20000,2000,600,100,500,1700,300,300,1000,5000,30000,3872921,9247793,12.375,12.948,1086,2805,88,107,4122,11016,2964,14275869,177414000.0,2797,9028155,113829100.0,,112139986.0
5065,1600000,20200103,152335000000,1578036215000000,2020-01-03 15:23:35,5066,0,18682,38018810,477053357.4,12.47,12.57,12.63,12.47,12.6,12.5,12.51,12.52,12.53,12.54,12.55,12.56,12.57,12.58,12.59,12.6,12.61,12.62,12.63,12.64,12.65,12.66,12.67,12.68,12.69,242655,141700,88000,161000,108900,256000,171500,205064,113634,61800,505651,315180,438762,548587,501400,642290,172930,151981,321850,280291,65,32,20,25,29,46,27,28,19,20,79,123,141,167,111,227,100,37,124,97,200,1000,10000,17000,9400,1000,3600,1000,2300,1500,700,2000,100,4000,1500,100,3800,1000,500,1100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15491,1200,500,1200,3600,1000,5000,1000,500,500,1000,900,200000,9900,1000,400,2000,5000,1000,1000,500,15000,500,4200,200,30000,1500,100,300,400,100,200,500,11000,200,5700,300,300,1800,20000,2000,600,100,500,1700,300,300,1000,5000,30000,3872921,9247793,12.375,12.948,1086,2805,88,107,4122,11016,2964,14275869,177414000.0,2797,9028155,113829100.0,,112139987.0


In [112]:
# SZ: attach clockAtArrival / sequenceNo from the pickled result onto SZ.
# SZ itself is expected to have been built by an earlier cell, including its 'num' column.
SZ1 = pd.read_pickle('/mnt/e/result/20200102/SZ.pkl')

# Both frames must hold the same number of rows before they are joined.
assert len(SZ) == len(SZ1)

SZ = SZ.merge(SZ1[['num', 'clockAtArrival', 'sequenceNo']], on='num', how='outer')

# 'sequenceNo' is taken from SZ1 and 'time' is not among the SZ1 columns used,
# so a null in either one flags a row that was not matched by the outer join
# (or, for 'time', a value that was already missing).
assert SZ['sequenceNo'].notnull().all()
assert SZ['time'].notnull().all()

# 'num' only served as the join key; remove it and restore (skey, ordering) order.
SZ = SZ.drop('num', axis=1).sort_values(by=['skey', 'ordering'])

SZ.to_pickle('/mnt/e/result/20200102/add_columns/SZ.pkl')

In [122]:
# trade: add clockAtArrival / sequenceNo from the pickled result to the
# 'md_trade' rows read from the database.
# NOTE(review): the credentials below are hard-coded in the notebook; consider
# loading them from a config file or environment instead.
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"
db = DB("192.168.10.178", database_name, user, password)

trade = db.read('md_trade', start_date=startDate, end_date=endDate)
trade1 = pd.read_pickle('/mnt/e/result/20200102/trade.pkl')

# Both sources must hold the same number of rows before they are joined.
assert len(trade) == len(trade1)

trade = trade.merge(
    trade1[['skey', 'ApplSeqNum', 'clockAtArrival', 'sequenceNo']],
    on=['skey', 'ApplSeqNum'],
    how='outer',
)

# 'sequenceNo' is taken from trade1 and 'time' is not among the trade1 columns
# used, so a null in either one flags a row left unmatched by the outer join
# (or, for 'time', a value that was already missing).
assert trade['sequenceNo'].notnull().all()
assert trade['time'].notnull().all()

trade = trade.sort_values(by=['skey', 'ApplSeqNum'])

trade.to_pickle('/mnt/e/result/20200102/add_columns/trade.pkl')

In [129]:
# order: add clockAtArrival / sequenceNo from the pickled result to the
# 'md_order' rows read from the database.
# NOTE(review): the credentials below are hard-coded in the notebook; consider
# loading them from a config file or environment instead.
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"
db = DB("192.168.10.178", database_name, user, password)

order = db.read('md_order', start_date=startDate, end_date=endDate)
order1 = pd.read_pickle('/mnt/e/result/20200102/order.pkl')

# Both sources must hold the same number of rows before they are joined.
assert len(order) == len(order1)

order = order.merge(
    order1[['skey', 'ApplSeqNum', 'clockAtArrival', 'sequenceNo']],
    on=['skey', 'ApplSeqNum'],
    how='outer',
)

# 'sequenceNo' is taken from order1 and 'time' is not among the order1 columns
# used, so a null in either one flags a row left unmatched by the outer join
# (or, for 'time', a value that was already missing).
assert order['sequenceNo'].notnull().all()
assert order['time'].notnull().all()

order = order.sort_values(by=['skey', 'ApplSeqNum'])

order.to_pickle('/mnt/e/result/20200102/add_columns/order.pkl')

In [133]:
# index: add clockAtArrival / sequenceNo from the pickled result to the
# 'md_index' rows read from the database.
# NOTE(review): the credentials below are hard-coded in the notebook; consider
# loading them from a config file or environment instead.
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"
db = DB("192.168.10.178", database_name, user, password)

index = db.read('md_index', start_date=startDate, end_date=endDate)
# Composite join key built from skey and ordering.
# NOTE(review): presumably 'ordering' stays below 10000 so keys are unique - confirm.
index['num'] = index['skey'] * 10000 + index['ordering']

index1 = pd.read_pickle('/mnt/e/result/20200102/index.pkl')

# Both sources must hold the same number of rows before they are joined.
assert len(index) == len(index1)

index = index.merge(index1[['num', 'clockAtArrival', 'sequenceNo']], on='num', how='outer')

# 'sequenceNo' is taken from index1 and 'time' is not among the index1 columns
# used, so a null in either one flags a row left unmatched by the outer join
# (or, for 'time', a value that was already missing).
assert index['sequenceNo'].notnull().all()
assert index['time'].notnull().all()

# 'num' only served as the join key; remove it and restore (skey, ordering) order.
index = index.drop('num', axis=1).sort_values(by=['skey', 'ordering'])

index.to_pickle('/mnt/e/result/20200102/add_columns/index.pkl')