In [9]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Build a ``DBObj`` connected to ``db_name`` on ``host``.

    Args:
        host: MongoDB host (optionally ``host:port``), inserted into the URI as-is.
        db_name: database to open; also used as the auth source unless the
            user is ``admin`` or ``root``, in which case ``admin`` is used.
        user: user name (raw, not percent-escaped).
        passwd: password (raw, not percent-escaped).

    Returns:
        A ``DBObj`` bound to the resulting connection URI.
    """
    from urllib.parse import quote_plus

    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    # Credentials must be percent-escaped inside a MongoDB URI, otherwise
    # characters such as '@', ':' or '/' corrupt the URI structure.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store and retrieve pandas DataFrames in MongoDB as compressed pickle chunks.

    Every stored document holds one slice of at most ``chunk_size`` rows for a
    single (date, symbol) pair, in the fields ``ver``, ``data``, ``date``,
    ``symbol`` and ``start``.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Open a client for ``uri`` and select database ``db_name``.

        Args:
            uri: MongoDB connection URI.
            symbol_column: DataFrame column that holds the symbol key.
            db_name: name of the database to use.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000  # maximum rows per stored document
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split ``mongodb://user:password@example.com`` into its pieces.

        Returns a list of the space-separated parts obtained after replacing
        ``:`` and ``@`` with spaces.
        """
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def _iter_groups(self, df):
        """Yield ``(date, symbol, sub_df)`` for every (date, symbol) pair in ``df``.

        ``date`` is always a ``str`` and ``symbol`` always a plain ``int`` so the
        stored keys match what ``build_query`` produces, regardless of whether
        ``df`` covers one date or several.
        """
        if len(df[self.date_column].unique()) > 1:
            grouped = df.groupby([self.date_column, self.symbol_column])
            for (date, symbol), sub_df in grouped:
                yield str(date), int(symbol), sub_df
        else:
            date = str(df[self.date_column].iloc[0])
            # Group by the scalar column name: grouping by a one-element list
            # yields 1-tuples as keys on recent pandas versions.
            for symbol, sub_df in df.groupby(self.symbol_column):
                yield date, int(symbol), sub_df

    def write(self, table_name, df):
        """Replace the stored data for every (date, symbol) pair present in ``df``.

        Does nothing for an empty frame. Existing documents for each pair are
        deleted before the new chunks are inserted.

        Raises:
            Exception: if ``df`` has no date column.
        """
        if len(df) == 0:
            return
        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        for date, symbol, sub_df in self._iter_groups(df):
            collection.delete_many({'date': date, 'symbol': symbol})
            self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` as chunks of ``chunk_size`` rows.

        Each chunk records its starting row offset in ``start`` so that
        ``read`` can restore the original order.
        """
        version = 1
        for start in range(0, len(df), self.chunk_size):
            df_seg = df.iloc[start:start + self.chunk_size]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build a MongoDB filter from an optional date range and symbol(s).

        Args:
            start_date, end_date: inclusive bounds; ``str`` (8 characters),
                ``int``, ``datetime.date`` or ``datetime.datetime``.
            symbol: a single symbol or a list/tuple of symbols; each is
                converted with ``int``. Falsy values add no symbol filter.

        Returns:
            The filter dict; empty when no argument constrains the query.

        Raises:
            Exception: for a date string that is not 8 characters long or a
                date of an unsupported type.
        """
        query = {}

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            # datetime.datetime is a subclass of datetime.date.
            if isinstance(x, datetime.date):
                return x.strftime("%Y%m%d")
            if isinstance(x, int):
                return parse_date(str(x))
            raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the documents matching the given filters.

        Refuses (prints a message, returns None) when no filter is given, so
        that a whole collection is never wiped by accident.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load the matching chunks and concatenate them into one DataFrame.

        Chunks are ordered by (symbol, date, start) before concatenation.

        Returns:
            The combined DataFrame, or None when no filter was given or
            nothing matched.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for doc in collection.find(query):
            doc['data'] = self.deser(doc['data'], doc['ver'])
            segs.append(doc)
        if not segs:
            return None
        segs.sort(key=lambda doc: (doc['symbol'], doc['date'], doc['start']))
        return pd.concat([doc['data'] for doc in segs], ignore_index=True)

    def list_tables(self):
        """Return the names of the collections in the database."""
        # Look on the class, not the instance: a pymongo Database resolves any
        # unknown attribute to a Collection, so instance hasattr() is always True.
        if hasattr(type(self.db), 'list_collection_names'):
            return self.db.list_collection_names()
        # Fallback for old PyMongo releases that only offer collection_names().
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values stored for the given filters."""
        collection = self.db[table_name]
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        query = self.build_query(start_date, end_date, symbol)
        dates = {doc['date'] for doc in collection.find(query, {"date": 1, '_id': 0})}
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` and compress it (version 1: gzip, version 2: lzma).

        Raises:
            Exception: for any other ``version``.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of ``ser``: decompress and unpickle ``s``.

        SECURITY: ``pickle.loads`` executes arbitrary code embedded in the
        payload; only use this against a database whose contents are trusted.

        Raises:
            Exception: for an unknown ``version``.
        """
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    """Let pandas older than 0.24 unpickle frames written by newer versions.

    On such versions a stub module ``pandas.core.internals.managers`` exposing
    ``BlockManager`` is registered in ``sys.modules`` if it is not already
    there. On pandas 0.24 or later (or an unparsable version) nothing happens.
    """
    import re

    # Compare numerically: a plain string comparison ranks '0.9' above '0.24'.
    match = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if match is None or tuple(int(part) for part in match.groups()) >= (0, 24):
        return

    import sys
    from types import ModuleType
    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m
patch_pandas_pickle()











import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Use the fully qualified option name: the bare "max_columns" pattern matches
# more than one option on newer pandas versions and is rejected as ambiguous.
pd.set_option("display.max_columns", 200)


year = "2020"
startDate = '20200106'
endDate = '20200106'
database_name = 'com_md_eq_cn'
user = "zhenyuy"
# NOTE(review): credential is hard-coded in the notebook; move it to an
# environment variable or a secrets store and rotate it.
password = "bnONBrzSMGoE"

db = DB("192.168.10.178", database_name, user, password)

# Read a single symbol over the date range just to discover which dates exist.
mdOrderLog = db.read('md_order', start_date=startDate, end_date=endDate, symbol=[2000001])
# db.read returns None when nothing matches; treat that as "no dates".
datelist = np.sort(mdOrderLog['date'].unique()) if mdOrderLog is not None else np.array([])
display(datelist)

for d in datelist:
    readPath = '/mnt/e/mbd_data/' + str(d) + '/***'
    dataPathLs = np.array(glob.glob(readPath))
    mdOrderLog = db.read('md_order', start_date=str(d), end_date=str(d))
    display(d)
    # Symbol keys present in the database versus keys that have an output file
    # (file name up to the first '.').
    dbKeys = set(mdOrderLog['skey'].astype(str).unique()) if mdOrderLog is not None else set()
    fileKeys = {os.path.basename(i).split('.')[0] for i in dataPathLs}
    display(dbKeys - fileKeys)  # in the database but no file on disk
    display(fileKeys - dbKeys)  # file on disk but not in the database
#     for dd in dataPathLs:
#         data = pd.read_pickle(dd)
#         data = data.rename(columns={'StockID':"skey"})
#         data = data.rename(columns={'sequenceNo':"ApplSeqNum"})
#         data['date'] = d
#         data['datetime'] = data["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
#         for cols in ['bid30p', 'bid29p',
#                 'bid28p', 'bid27p', 'bid26p', 'bid25p', 'bid24p', 'bid23p','bid22p', 'bid21p', 'bid20p', 'bid19p',
#                 'bid18p', 'bid17p', 'bid16p', 'bid15p', 'bid14p', 'bid13p', 'bid12p', 'bid11p',
#                 'bid10p', 'bid9p', 'bid8p', 'bid7p', 'bid6p', 'bid5p', 'bid4p', 'bid3p',
#                 'bid2p', 'bid1p', 'ask1p', 'ask2p', 'ask3p', 'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p',
#                 'ask11p', 'ask12p', 'ask13p', 'ask14p', 'ask15p', 'ask16p', 'ask17p',
#                 'ask18p', 'ask19p', 'ask20p', 'ask21p', 'ask22p', 'ask23p', 'ask24p',
#                 'ask25p', 'ask26p', 'ask27p', 'ask28p', 'ask29p', 'ask30p']:
#             data[cols] = data[cols].astype(float)
#         for cols in ['ApplSeqNum', 'date']:
#             data[cols] = data[cols].astype('int32')
#         data = data[['skey', 'date', 'time', 'clockAtArrival', 'datetime', 'ApplSeqNum', 'cum_volume', 'cum_amount', 'close', 'bid30p', 'bid29p',
#                 'bid28p', 'bid27p', 'bid26p', 'bid25p', 'bid24p', 'bid23p','bid22p', 'bid21p', 'bid20p', 'bid19p',
#                 'bid18p', 'bid17p', 'bid16p', 'bid15p', 'bid14p', 'bid13p', 'bid12p', 'bid11p',
#                 'bid10p', 'bid9p', 'bid8p', 'bid7p', 'bid6p', 'bid5p', 'bid4p', 'bid3p',
#                 'bid2p', 'bid1p', 'ask1p', 'ask2p', 'ask3p', 'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p',
#                 'ask11p', 'ask12p', 'ask13p', 'ask14p', 'ask15p', 'ask16p', 'ask17p',
#                 'ask18p', 'ask19p', 'ask20p', 'ask21p', 'ask22p', 'ask23p', 'ask24p',
#                 'ask25p', 'ask26p', 'ask27p', 'ask28p', 'ask29p', 'ask30p', 'bid30q',
#                 'bid29q', 'bid28q', 'bid27q', 'bid26q', 'bid25q', 'bid24q', 'bid23q',
#                 'bid22q', 'bid21q', 'bid20q', 'bid19q', 'bid18q', 'bid17q', 'bid16q', 'bid15q', 'bid14q', 'bid13q', 'bid12q', 'bid11q',
#                 'bid10q', 'bid9q', 'bid8q', 'bid7q', 'bid6q', 'bid5q', 'bid4q', 'bid3q',
#                 'bid2q', 'bid1q', 'ask1q', 'ask2q', 'ask3q', 'ask4q', 'ask5q', 'ask6q',
#                 'ask7q', 'ask8q', 'ask9q', 'ask10q', 'ask11q', 'ask12q', 'ask13q',
#                 'ask14q', 'ask15q', 'ask16q', 'ask17q', 'ask18q', 'ask19q', 'ask20q',
#                 'ask21q', 'ask22q', 'ask23q', 'ask24q', 'ask25q', 'ask26q', 'ask27q', 'ask28q', 'ask29q', 'ask30q',
#                 'bid30n', 'bid29n', 'bid28n', 'bid27n', 'bid26n', 'bid25n', 'bid24n',
#                 'bid23n', 'bid22n', 'bid21n', 'bid20n', 'bid19n', 'bid18n', 'bid17n',
#                 'bid16n', 'bid15n', 'bid14n', 'bid13n', 'bid12n', 'bid11n', 'bid10n',
#                 'bid9n', 'bid8n', 'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n',
#                 'bid1n', 'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n',
#                 'ask11n', 'ask12n', 'ask13n', 'ask14n', 'ask15n', 'ask16n', 'ask17n',
#                 'ask18n', 'ask19n', 'ask20n', 'ask21n', 'ask22n', 'ask23n', 'ask24n',
#                 'ask25n', 'ask26n', 'ask27n', 'ask28n', 'ask29n', 'ask30n',
#                 'bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
#     'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
#     'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
#     'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
#     'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
#     'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
#     'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
#     'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
#     'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
#     'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q', 'total_bid_quantity', 'total_ask_quantity',
#                 'total_bid_vwap', 'total_ask_vwap', 'total_bid_orders', 'total_ask_orders', 'total_bid_levels',
#                 'total_ask_levels']]
#         db.write('md_snapshot_mbd', data)
#     print(str(d) + ' finished')


array([20200106], dtype=int32)

20200106

{'2000063',
 '2000338',
 '2000725',
 '2000858',
 '2002185',
 '2002230',
 '2002291',
 '2002351',
 '2002405',
 '2002456',
 '2002463',
 '2002466',
 '2002475',
 '2002547',
 '2002973',
 '2300014',
 '2300059',
 '2300088'}

set()

In [8]:
import pandas as pd
# Load the pickled order table from the 20200102 result directory; the bare
# `data` expression below makes the notebook render it as the cell output.
data = pd.read_pickle('/mnt/e/result/20200102/order.pkl')
data

Unnamed: 0,skey,date,ApplSeqNum,sequenceNo,clockAtArrival
42447,2002972,20200102.0,2.0,114487500.0,
42448,2002972,20200102.0,3.0,114487501.0,
42449,2002972,20200102.0,5.0,114487502.0,
42450,2002972,20200102.0,6.0,114487503.0,
42451,2002972,20200102.0,7.0,114487504.0,
...,...,...,...,...,...
68769821,2000760,20200102.0,17096142.0,113657245.0,1.577948e+15
92134148,2300236,20200102.0,16266528.0,113657246.0,1.577948e+15
72727669,2002045,20200102.0,17054834.0,113657247.0,1.577948e+15
87679491,2002962,20200102.0,17096143.0,113657248.0,1.577948e+15


In [21]:
import os 
os.environ['OMP_NUM_THREADS'] = '1'
import glob
import pymongo
import numpy as np
import pandas as pd
import pickle
import time
import gzip
import lzma
import pytz
import warnings
import glob
import datetime
from collections import defaultdict, OrderedDict
warnings.filterwarnings(action='ignore')


def DB(host, db_name, user, passwd):
    """Create a ``DBObj`` for database ``db_name`` on ``host``.

    The auth source is ``admin`` for the users ``admin`` and ``root`` and
    ``db_name`` for everyone else.

    Args:
        host: MongoDB host (optionally ``host:port``), used verbatim.
        db_name: database to open.
        user: raw user name.
        passwd: raw password.

    Returns:
        A ``DBObj`` built from the assembled connection URI.
    """
    from urllib.parse import quote_plus

    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    # Percent-escape the credentials so reserved URI characters in them
    # ('@', ':', '/', '%') cannot break the connection string.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)

class DBObj(object):
    """MongoDB-backed store for pandas DataFrames, kept as compressed pickles.

    A DataFrame is split per (date, symbol) pair and then into slices of at
    most ``chunk_size`` rows; each slice becomes one document with the fields
    ``ver``, ``data``, ``date``, ``symbol`` and ``start``.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Connect to ``uri`` and select ``db_name``.

        Args:
            uri: MongoDB connection URI.
            symbol_column: name of the DataFrame column holding the symbol key.
            db_name: database name.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000  # maximum rows per stored document
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Break ``mongodb://user:password@example.com`` into a list of parts.

        ``:`` and ``@`` are turned into spaces and the result is split on
        single spaces.
        """
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def _iter_groups(self, df):
        """Yield ``(date, symbol, sub_df)`` per (date, symbol) pair of ``df``.

        ``date`` is normalised to ``str`` and ``symbol`` to ``int`` in both the
        single-date and the multi-date case, matching ``build_query``.
        """
        if len(df[self.date_column].unique()) > 1:
            grouped = df.groupby([self.date_column, self.symbol_column])
            for (date, symbol), sub_df in grouped:
                yield str(date), int(symbol), sub_df
        else:
            date = str(df[self.date_column].iloc[0])
            # A scalar grouper keeps the keys scalar; a one-element list makes
            # recent pandas versions yield 1-tuples instead.
            for symbol, sub_df in df.groupby(self.symbol_column):
                yield date, int(symbol), sub_df

    def write(self, table_name, df):
        """Store ``df``, replacing existing data for each (date, symbol) pair in it.

        An empty frame is ignored.

        Raises:
            Exception: if the date column is missing from ``df``.
        """
        if len(df) == 0:
            return
        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        for date, symbol, sub_df in self._iter_groups(df):
            collection.delete_many({'date': date, 'symbol': symbol})
            self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` as consecutive chunks of ``chunk_size`` rows.

        ``start`` in each document is the row offset of the chunk within
        ``df``; ``read`` sorts on it to restore row order.
        """
        version = 1
        for start in range(0, len(df), self.chunk_size):
            df_seg = df.iloc[start:start + self.chunk_size]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Translate a date range and symbol selection into a MongoDB filter.

        Args:
            start_date, end_date: inclusive bounds given as an 8-character
                ``str``, an ``int``, a ``datetime.date`` or a
                ``datetime.datetime``.
            symbol: one symbol or a list/tuple of symbols, each passed through
                ``int``. Falsy values add no symbol condition.

        Returns:
            The filter dict (empty if nothing was specified).

        Raises:
            Exception: on a date string whose length is not 8 or a date of an
                unsupported type.
        """
        query = {}

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            # Covers datetime.datetime as well (it subclasses datetime.date).
            if isinstance(x, datetime.date):
                return x.strftime("%Y%m%d")
            if isinstance(x, int):
                return parse_date(str(x))
            raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete matching documents; refuse when no filter is supplied.

        With an empty filter a message is printed and None is returned, so the
        whole collection cannot be removed through this method.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Fetch matching chunks and return them as a single DataFrame.

        Chunks are sorted by (symbol, date, start) and concatenated with a
        fresh index.

        Returns:
            The DataFrame, or None if no filter was supplied or no document
            matched.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for doc in collection.find(query):
            doc['data'] = self.deser(doc['data'], doc['ver'])
            segs.append(doc)
        if not segs:
            return None
        segs.sort(key=lambda doc: (doc['symbol'], doc['date'], doc['start']))
        return pd.concat([doc['data'] for doc in segs], ignore_index=True)

    def list_tables(self):
        """Return the collection names of the database."""
        # Check the class rather than the instance: pymongo's Database turns
        # any unknown attribute into a Collection, so hasattr(self.db, ...)
        # would always succeed.
        if hasattr(type(self.db), 'list_collection_names'):
            return self.db.list_collection_names()
        # Old PyMongo releases only provide collection_names().
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the distinct stored ``date`` values for the filters, sorted."""
        collection = self.db[table_name]
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        query = self.build_query(start_date, end_date, symbol)
        dates = {doc['date'] for doc in collection.find(query, {"date": 1, '_id': 0})}
        return sorted(dates)

    def ser(self, s, version):
        """Pickle and compress ``s`` (version 1: gzip, version 2: lzma).

        The pickle protocol is pinned so the stored bytes do not depend on the
        default protocol of the interpreter that wrote them.

        Raises:
            Exception: for any other ``version``.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Decompress and unpickle a payload produced by ``ser``.

        SECURITY: unpickling runs arbitrary code contained in the payload;
        the database contents must be trusted.

        Raises:
            Exception: for an unknown ``version``.
        """
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')

def patch_pandas_pickle():
    """Register a ``pandas.core.internals.managers`` stub on pandas < 0.24.

    The stub module exposes ``BlockManager`` and is only added to
    ``sys.modules`` when no module of that name is present. Nothing is done on
    pandas 0.24 or later, or when the version string cannot be parsed.
    """
    import re

    # Numeric comparison; comparing the strings would rank '0.9' above '0.24'.
    match = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if match is None or tuple(int(part) for part in match.groups()) >= (0, 24):
        return

    import sys
    from types import ModuleType
    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m

class go():
    """Replay one symbol's order and trade logs through ``SimMktSnapshotAllNew``.

    ``orders_data`` and ``trades_data`` are indexed by symbol key in ``run``;
    ``thisDate_str`` names the output sub-directory under ``/mnt/e/mbd_data/``.
    """

    def __init__(self, thisDate_str, orders_data, trades_data):
        self.orders_data = orders_data
        self.trades_data = trades_data
        self.thisDate_str = thisDate_str

    def run(self, s):
        """Build the snapshot table for symbol ``s`` and pickle it to disk.

        The order and trade frames for ``s`` are merged into one message
        stream sorted by ``ApplSeqNum`` and fed message by message to the
        simulator. The result of ``getAllInfo()`` is kept in ``self.af`` and
        written to ``/mnt/e/mbd_data/<thisDate_str>/<s>.pkl``.

        NOTE: the input frames ``orders_data[s]`` and ``trades_data[s]`` are
        modified in place (columns are added and renamed).
        Errors raised during the replay or the write are printed, not raised.
        """
        mdTradeLog = self.trades_data[s]
        mdOrderLog = self.orders_data[s]

        mdOrderLog['ID'] = int(mdOrderLog['skey'].dropna().unique())
        mdOrderLog['order_type'] = mdOrderLog['order_type'].astype(str)
        mdOrderLog['status'] = 'order'
        # Positional rename: assumes the frame has exactly these 12 columns in
        # this order after the two additions above - TODO confirm with caller.
        mdOrderLog.columns = ['skey', 'date', 'TransactTime', 'clockAtArrival', 'datetime', 'ApplSeqNum',
                              'Side', 'OrderType', 'Price', 'OrderQty', 'SecurityID', 'status']
        mdTradeLog['ID'] = int(mdTradeLog['skey'].dropna().unique())
        mdTradeLog['trade_type'] = mdTradeLog['trade_type'].astype(str)
        if 'trade_money' not in mdTradeLog.columns:
            mdTradeLog.columns = ['skey', 'date', 'TransactTime', 'clockAtArrival', 'datetime', 'ApplSeqNum',
                                  'ExecType', 'trade_flag', 'TradePrice', 'TradeQty', 'BidApplSeqNum',
                                  'OfferApplSeqNum', 'SecurityID']
        else:
            mdTradeLog.columns = ['skey', 'date', 'TransactTime', 'clockAtArrival', 'datetime', 'ApplSeqNum',
                                  'ExecType', 'trade_flag', 'TradePrice', 'TradeQty', 'BidApplSeqNum',
                                  'OfferApplSeqNum', 'SecurityID', 'trade_money']

        # ExecType '1' rows are handled as trades, '4' rows as cancels.
        tradedLog = mdTradeLog[mdTradeLog['ExecType'] == '1'].reset_index(drop=True)
        tradedLog['status'] = 'trade'

        # Attach the originating bid order's price/type/side to each trade.
        bidOrderInfo = mdOrderLog[['ApplSeqNum', 'SecurityID', 'Price', 'OrderType', 'Side']].reset_index(drop=True)
        bidOrderInfo = bidOrderInfo.rename(
            columns={'ApplSeqNum': 'BidApplSeqNum', 'Price': 'BidOrderPrice',
                     'OrderType': 'BidOrderType', 'Side': 'BidSide'})
        tradedLog = pd.merge(tradedLog, bidOrderInfo, how='left', on=['SecurityID', 'BidApplSeqNum'],
                             validate='many_to_one')
        del bidOrderInfo

        # Same for the originating offer order.
        askOrderInfo = mdOrderLog[['ApplSeqNum', 'SecurityID', 'Price', 'OrderType', 'Side']].reset_index(drop=True)
        askOrderInfo = askOrderInfo.rename(
            columns={'ApplSeqNum': 'OfferApplSeqNum', 'Price': 'OfferOrderPrice',
                     'OrderType': 'OfferOrderType', 'Side': 'OfferSide'})
        tradedLog = pd.merge(tradedLog, askOrderInfo, how='left', on=['SecurityID', 'OfferApplSeqNum'],
                             validate='many_to_one')
        del askOrderInfo

        cancelLog = mdTradeLog[mdTradeLog['ExecType'] == '4'].reset_index(drop=True)
        cancelLog['status'] = 'cancel'
        # The cancelled order's sequence number is the bid one, or the offer
        # one where the bid number is 0.
        cancelLog['CancelApplSeqNum'] = cancelLog['BidApplSeqNum']
        mask = cancelLog['CancelApplSeqNum'] == 0
        cancelLog.loc[mask, 'CancelApplSeqNum'] = cancelLog.loc[mask, 'OfferApplSeqNum'].values
        del mask
        assert (cancelLog[cancelLog['CancelApplSeqNum'] == 0].shape[0] == 0)
        cancelLog = cancelLog.drop(columns=['TradePrice'])

        # A cancel's TradePrice is replaced by the cancelled order's price.
        cancelPrice = mdOrderLog[['ApplSeqNum', 'SecurityID', 'Price', 'OrderType', 'Side']].reset_index(drop=True)
        cancelPrice = cancelPrice.rename(columns={'ApplSeqNum': 'CancelApplSeqNum', 'Price': 'TradePrice',
                                                  'OrderType': 'CancelOrderType', 'Side': 'CancelSide'})
        cancelLog = pd.merge(cancelLog, cancelPrice, how='left', on=['SecurityID', 'CancelApplSeqNum'],
                             validate='one_to_one')
        del cancelPrice

        msgData = pd.concat([mdOrderLog[['clockAtArrival', 'TransactTime', 'ApplSeqNum', 'SecurityID',
                                         'status', 'Side', 'OrderType', 'Price', 'OrderQty']],
                             tradedLog[['clockAtArrival', 'TransactTime', 'ApplSeqNum', 'SecurityID',
                                        'status', 'ExecType', 'TradePrice', 'TradeQty', 'BidApplSeqNum',
                                        'OfferApplSeqNum', 'BidOrderType', 'BidSide', 'OfferOrderType', 'OfferSide',
                                        'BidOrderPrice', 'OfferOrderPrice']]], sort=False)
        msgData = pd.concat([msgData, cancelLog[['clockAtArrival', 'TransactTime', 'ApplSeqNum',
                                                 'SecurityID', 'status', 'ExecType', 'TradePrice', 'TradeQty',
                                                 'CancelApplSeqNum',
                                                 'CancelOrderType', 'CancelSide']]], sort=False)
        del tradedLog
        del cancelLog
        msgData = msgData.sort_values(by=['ApplSeqNum']).reset_index(drop=True)
        # Group by the scalar column name so stockID stays a scalar; a
        # one-element list yields 1-tuples as keys on recent pandas versions.
        # Only the last group is replayed below.
        for stockID, stockMsg in msgData.groupby('SecurityID'):
            stockMsg = stockMsg.reset_index(drop=True)
            stockMsg['TransactTime'] = stockMsg['TransactTime'] / 1000
            # NOTE(review): thresholds look like HHMMSSmmm clock values
            # (09:29:00.000 and 14:56:55.000) - confirm the time encoding.
            stockMsg['isAuction'] = np.where(stockMsg['TransactTime'] < 92900000, True, False)
            stockMsg = stockMsg[stockMsg['TransactTime'] < 145655000].reset_index(drop=True)
            stockMsgNP = stockMsg.to_records()
            simMarket = SimMktSnapshotAllNew(exchange='SZ', stockID=stockID, levels=30)
            self.simMarket = simMarket
            self.stockMsg = stockMsg
        total_cancel = 0
        total_trade = 0
        try:
            # NOTE(review): the print() calls trace every message and are very
            # verbose on a full day of data.
            for rowEntry in stockMsgNP:
                if rowEntry.isAuction:
                    if rowEntry.status == 'order':
                        print('o1')
                        simMarket.insertAuctionOrder(rowEntry.clockAtArrival, rowEntry.TransactTime,
                                                     rowEntry.ApplSeqNum, rowEntry.Side, rowEntry.Price,
                                                     rowEntry.OrderQty)
                    elif rowEntry.status == 'cancel':
                        print('c1')
                        simMarket.removeOrderByAuctionCancel(rowEntry.clockAtArrival, rowEntry.TransactTime,
                                                             rowEntry.ApplSeqNum, rowEntry.TradePrice,
                                                             rowEntry.TradeQty,
                                                             rowEntry.CancelApplSeqNum, rowEntry.CancelOrderType,
                                                             rowEntry.CancelSide)
                    elif rowEntry.status == 'trade':
                        print('t1')
                        simMarket.removeOrderByAuctionTrade(rowEntry.clockAtArrival, rowEntry.TransactTime,
                                                            rowEntry.ApplSeqNum, rowEntry.TradePrice, rowEntry.TradeQty,
                                                            rowEntry.BidOrderPrice, rowEntry.OfferOrderPrice)
                else:
                    if rowEntry.status == 'order':
                        print('o2')
                        simMarket.insertOrder(rowEntry.clockAtArrival, rowEntry.TransactTime, rowEntry.ApplSeqNum,
                                              rowEntry.Side, rowEntry.OrderType, rowEntry.Price, rowEntry.OrderQty,
                                              rowEntry.ApplSeqNum)
                    elif rowEntry.status == 'cancel':
                        print('c2')
                        simMarket.removeOrderByCancel(rowEntry.clockAtArrival, rowEntry.TransactTime,
                                                      rowEntry.ApplSeqNum, rowEntry.TradePrice, rowEntry.TradeQty,
                                                      rowEntry.CancelApplSeqNum, rowEntry.CancelOrderType,
                                                      rowEntry.CancelSide)
                        total_cancel += rowEntry.TradeQty
                    elif rowEntry.status == 'trade':
                        print('t2')
                        simMarket.removeOrderByTrade(rowEntry.clockAtArrival, rowEntry.TransactTime,
                                                     rowEntry.ApplSeqNum, rowEntry.TradePrice, rowEntry.TradeQty,
                                                     rowEntry.BidApplSeqNum,
                                                     rowEntry.OfferApplSeqNum)
                        total_trade += rowEntry.TradeQty
            self.af = simMarket.getAllInfo()
            # Use the date given to the constructor instead of relying on a
            # global variable of the same name being defined.
            self.af.to_pickle('/mnt/e/mbd_data/' + self.thisDate_str + '/' + str(s) + '.pkl')
        except Exception as e:
            print(s)
            print(e)
            
class SimMktSnapshotAllNew():

    def __init__(self, exchange, stockID, levels):
        
        self.errors = []
        self.exchange = exchange
        self.stockID = stockID
        self.levels = levels
        self.topK = 50

        self.bid = {}
        self.ask = {}
        self.allBidp = []
        self.allAskp = []
        self.bidp = []
        self.bidq = []
        self.askp = []
        self.askq = []
        self.bidn = []
        self.askn = []
        self.uOrder = {}
        self.takingOrder = {}
        self.tempOrder = {}
        self.hasTempOrder = False
        self.isAuction = True

        self.cur_cum_volume = 0
        self.cur_cum_amount = 0
        self.cur_close = 0
        self.bid1p = 0
        self.ask1p = 0
        self.cum_volume = []
        self.cum_amount = []
        self.close = []
        self.localTime = []
        self.exchangeTime = []
        self.sequenceNum = []
        self.bboImprove = []
        
        self.cum_aggressive_volume = []
        self.cum_aggressive_amount = []
        self.cum_market_volume = []
        self.cum_market_amount = []

        self.total_bid_qty = []
        self.total_bid_vwap = []
        self.total_bid_levels = []
        self.total_bid_orders_num = []
        self.total_ask_qty = []
        self.total_ask_vwap = []
        self.total_ask_levels = []
        self.total_ask_orders_num = []
            
        self.bidnq = defaultdict(OrderedDict)
        self.asknq = defaultdict(OrderedDict)
        self.bid1Topq = []
        self.ask1Topq = []
    
    def insertAuctionOrder(self, clockAtArrival, exchangeTime, seqNum, side, price, qty):

        if side == 1:
            if price in self.bid:
                self.bid[price] += qty
            else:
                self.bid[price] = qty
            ######
            self.bidnq[price][seqNum] = qty
            ######
        elif side == 2:
            if price in self.ask:
                self.ask[price] += qty
            else:
                self.ask[price] = qty
            ######
            self.asknq[price][seqNum] = qty
            ######
        self.localTime.append(clockAtArrival)
        self.exchangeTime.append(exchangeTime)
        self.sequenceNum.append(seqNum)

    def removeOrderByAuctionCancel(self, clockAtArrival, exchangeTime, seqNum,
                                   cancelPrice, cancelQty, cancelApplSeqNum, cancelOrderType, cancelSide):
        ######
        if cancelApplSeqNum in self.asknq[cancelPrice]:
            self.asknq[cancelPrice][cancelApplSeqNum] -= cancelQty
            if self.asknq[cancelPrice][cancelApplSeqNum] == 0:
                self.asknq[cancelPrice].pop(cancelApplSeqNum)
        else:
            self.bidnq[cancelPrice][cancelApplSeqNum] -= cancelQty
            if self.bidnq[cancelPrice][cancelApplSeqNum] == 0:
                self.bidnq[cancelPrice].pop(cancelApplSeqNum)
                ######
        if cancelApplSeqNum in self.uOrder:
            cancelPrice, cancelSide = self.uOrder[cancelApplSeqNum]
            assert (cancelPrice > 0)
            self.uOrder.pop(cancelApplSeqNum)

        if cancelSide == 1:
            remain = self.bid[cancelPrice] - cancelQty
            if remain == 0:
                self.bid.pop(cancelPrice)
            elif remain > 0:
                self.bid[cancelPrice] = remain

        elif cancelSide == 2:
            remain = self.ask[cancelPrice] - cancelQty
            if remain == 0:
                self.ask.pop(cancelPrice)
            elif remain > 0:
                self.ask[cancelPrice] = remain

        self.localTime.append(clockAtArrival)
        self.exchangeTime.append(exchangeTime)
        self.sequenceNum.append(seqNum)

    def removeOrderByAuctionTrade(self, clockAtArrival, exchangeTime, seqNum,
                                  price, qty, bidOrderPrice, offerOrderPrice):
        if bidOrderPrice in self.bid:
            bidRemain = self.bid[bidOrderPrice] - qty
            if bidRemain == 0:
                self.bid.pop(bidOrderPrice)
            elif bidRemain > 0:
                self.bid[bidOrderPrice] = bidRemain
            ######
            cum_vol = 0
            for seqNo in self.bidnq[bidOrderPrice]:
                cum_vol += self.bidnq[bidOrderPrice][seqNo]
                if cum_vol > qty:
                    self.bidnq[bidOrderPrice][seqNo] = cum_vol - qty
                    break
                elif cum_vol == qty:
                    self.bidnq[bidOrderPrice].pop(seqNo)
                    break
                else:
                    self.bidnq[bidOrderPrice].pop(seqNo)
            ######  
        else:
            print('bid price not in bid')

        if offerOrderPrice in self.ask:
            askRemain = self.ask[offerOrderPrice] - qty
            if askRemain == 0:
                self.ask.pop(offerOrderPrice)
            elif askRemain > 0:
                self.ask[offerOrderPrice] = askRemain
            ######
            cum_vol = 0
            for seqNo in self.asknq[offerOrderPrice]:
                cum_vol += self.asknq[offerOrderPrice][seqNo]
                if cum_vol > qty:
                    self.asknq[offerOrderPrice][seqNo] = cum_vol - qty
                    break
                elif cum_vol == qty:
                    self.asknq[offerOrderPrice].pop(seqNo)
                    break
                else:
                    self.asknq[offerOrderPrice].pop(seqNo)
            ######
        else:
            print('ask price not in ask')

        self.cur_cum_volume += qty
        self.cur_cum_amount += price * qty
        self.cur_close = price

        self.localTime.append(clockAtArrival)
        self.exchangeTime.append(exchangeTime)
        self.sequenceNum.append(seqNum)

    def insertOrder(self, clockAtArrival, exchangeTime, seqNum, side, orderType, price, qty, applySeqNum):

        if self.isAuction:
            auctionClockAtArrival = self.localTime[-1]
            auctionExchangeTime = self.exchangeTime[-1]
            auctionSeqNum = self.sequenceNum[-1]
            self.localTime = []
            self.exchangeTime = []
            self.sequenceNum = []
            self.bboImprove = []
            self.updateMktInfo(auctionClockAtArrival, auctionExchangeTime, auctionSeqNum, record=True)
            self.isAuction = False

        hasConvert = False
        if self.hasTempOrder:
            tempSeqNum = list(self.tempOrder.keys())[0]
            tempOrderType, tempSide, tempPrice, tempQty, tempStatus = self.tempOrder[tempSeqNum]
            if tempOrderType == '1':
                hasConvert = True
            self.tempToLimit(clockAtArrival, exchangeTime, tempSeqNum)
            self.hasTempOrder = False

        if orderType == '2':
            if side == 1 and price < self.ask1p:
                if price in self.bid:
                    self.bid[price] += qty
                else:
                    self.bid[price] = qty
                self.bidnq[price][applySeqNum] = qty
                if hasConvert:
                    self.updateMktInfo(clockAtArrival, exchangeTime, seqNum, record=True)
                else:
                    self.updateMktInfo(clockAtArrival, exchangeTime, seqNum, record=True)
            elif side == 2 and price > self.bid1p:
                if price in self.ask:
                    self.ask[price] += qty
                else:
                    self.ask[price] = qty
                self.asknq[price][applySeqNum] = qty
                if hasConvert:
                    self.updateMktInfo(clockAtArrival, exchangeTime, seqNum, record=True)
                else:
                    self.updateMktInfo(clockAtArrival, exchangeTime, seqNum, record=True)
            else:
                self.tempOrder[applySeqNum] = (orderType, side, price, qty, 0)
                self.hasTempOrder = True
                self.guessingTrade(clockAtArrival, exchangeTime, seqNum)

        elif orderType == '1':
            if side == 1:
                self.tempOrder[applySeqNum] = (orderType, side, self.ask1p, qty, 0)
                self.takingOrder[applySeqNum] = (self.ask1p, side)
            else:
                self.tempOrder[applySeqNum] = (orderType, side, self.bid1p, qty, 0)
                self.takingOrder[applySeqNum] = (self.bid1p, side)
            self.hasTempOrder = True

        elif orderType == '3':
            if side == 1:
                if len(self.bid) != 0:
                    self.bid[self.bid1p] += qty
                    self.uOrder[applySeqNum] = (self.bid1p, side)
                    self.bidnq[self.bid1p][applySeqNum] = qty
                else:
                    self.tempOrder[applySeqNum] = (orderType, side, self.bid1p, qty, 0)
                    self.hasTempOrder = True
            else:
                if len(self.ask) != 0:
                    self.ask[self.ask1p] += qty
                    self.uOrder[applySeqNum] = (self.ask1p, side)
                    self.asknq[self.ask1p][applySeqNum] = qty
                else:
                    self.tempOrder[applySeqNum] = (orderType, side, self.ask1p, qty, 0)
                    self.hasTempOrder = True
            if hasConvert:
                self.updateMktInfo(clockAtArrival, exchangeTime, seqNum, record=True)
            else:
                self.updateMktInfo(clockAtArrival, exchangeTime, seqNum, record=True)

    def removeOrderByTrade(self, clockAtArrival, exchangeTime, seqNum, price, qty, bidApplSeqNum, offerApplSeqNum):

        display(self.tempOrder)
        assert (len(self.tempOrder) == 1)
        
        if bidApplSeqNum in self.tempOrder:
            tempSeqNum = bidApplSeqNum
            passiveSeqNum = offerApplSeqNum
        elif offerApplSeqNum in self.tempOrder:
            tempSeqNum = offerApplSeqNum
            passiveSeqNum = bidApplSeqNum
        else:
            print('Trade not happend in taking order', bidApplSeqNum, offerApplSeqNum)

        tempOrderType, tempSide, tempPrice, tempQty, tempStatus = self.tempOrder[tempSeqNum]
        tempRemain = tempQty - qty
        if tempRemain == 0:
            self.tempOrder.pop(tempSeqNum)
            self.hasTempOrder = False
        else:
            self.tempOrder[tempSeqNum] = (tempOrderType, tempSide, tempPrice, tempRemain, 1)

        if tempSide == 1:
            assert (self.ask1p == price)
            askRemain = self.ask[price] - qty
            if askRemain == 0:
                self.ask.pop(price)
            elif askRemain > 0:
                self.ask[price] = askRemain
            else:
                assert (askRemain > 0)
            if tempOrderType == '1':
                self.asknq[price][passiveSeqNum] -= qty
                if self.asknq[price][passiveSeqNum] == 0:
                    self.asknq[price].pop(passiveSeqNum)
        elif tempSide == 2:
            if self.bid1p != price:
                print(seqNum)
            assert (self.bid1p == price)
            bidRemain = self.bid[price] - qty
            if bidRemain == 0:
                self.bid.pop(price)
            elif bidRemain > 0:
                self.bid[price] = bidRemain
            else:
                assert (bidRemain > 0)
            if tempOrderType == '1':
                self.bidnq[price][passiveSeqNum] -= qty
                if self.bidnq[price][passiveSeqNum] == 0:
                    self.bidnq[price].pop(passiveSeqNum)
                    
        self.cur_cum_volume += qty
        self.cur_cum_amount += price * qty
        self.cur_close = price

        if self.hasTempOrder == False and tempOrderType == '1':
            self.updateMktInfo(clockAtArrival, exchangeTime, seqNum, record=True)
        else:
            self.updateMktInfo(clockAtArrival, exchangeTime, seqNum, record=False)

    def removeOrderByCancel(self, clockAtArrival, exchangeTime, seqNum,
                            cancelPrice, cancelQty, cancelApplSeqNum, cancelOrderType, cancelSide):

        if self.isAuction:
            auctionClockAtArrival = self.localTime[-1]
            auctionExchangeTime = self.exchangeTime[-1]
            auctionSeqNum = self.sequenceNum[-1]
            self.localTime = []
            self.exchangeTime = []
            self.sequenceNum = []
            self.updateMktInfo(auctionClockAtArrival, auctionExchangeTime, auctionSeqNum, record=True)
            self.isAuction = False

        if cancelApplSeqNum in self.tempOrder:
            tempOrderType, tempSide, tempPrice, tempQty, tempStatus = self.tempOrder[cancelApplSeqNum]
            self.tempOrder.pop(cancelApplSeqNum)
            self.hasTempOrder = False
            
            if tempOrderType == '2':
                if cancelApplSeqNum in self.asknq[cancelPrice]:
                    self.asknq[cancelPrice][cancelApplSeqNum] -= cancelQty
                    if self.asknq[cancelPrice][cancelApplSeqNum] == 0:
                        self.asknq[cancelPrice].pop(cancelApplSeqNum)
                else:
                    self.bidnq[cancelPrice][cancelApplSeqNum] -= cancelQty
                    if self.bidnq[cancelPrice][cancelApplSeqNum] == 0:
                        self.bidnq[cancelPrice].pop(cancelApplSeqNum)
            if tempStatus == 1:
                self.updateMktInfo(clockAtArrival, exchangeTime, seqNum, record=True)
            else:
                self.updateMktInfo(clockAtArrival, exchangeTime, seqNum, record=False)

        else:
            hasConvert = False
            if self.hasTempOrder:
                tempSeqNum = list(self.tempOrder.keys())[0]
                tempOrderType, tempSide, tempPrice, tempQty, tempStatus = self.tempOrder[tempSeqNum]
                if tempOrderType == '1':
                    hasConvert = True
                self.tempToLimit(clockAtArrival, exchangeTime, seqNum)
                self.hasTempOrder = False

            if cancelOrderType == '3':
                cancelPrice, cancelSide = self.uOrder[cancelApplSeqNum]
                assert (cancelPrice > 0)
                self.uOrder.pop(cancelApplSeqNum)

            if cancelOrderType == '1':
                cancelPrice, cancelSide = self.takingOrder[cancelApplSeqNum]
                assert (cancelPrice > 0)

            if cancelSide == 1:
                remain = self.bid[cancelPrice] - cancelQty
                if remain == 0:
                    self.bid.pop(cancelPrice)
                elif remain > 0:
                    self.bid[cancelPrice] = remain

            elif cancelSide == 2:
                remain = self.ask[cancelPrice] - cancelQty
                if remain == 0:
                    self.ask.pop(cancelPrice)
                elif remain > 0:
                    self.ask[cancelPrice] = remain

            if cancelApplSeqNum in self.asknq[cancelPrice]:
                self.asknq[cancelPrice][cancelApplSeqNum] -= cancelQty
                if self.asknq[cancelPrice][cancelApplSeqNum] == 0:
                    self.asknq[cancelPrice].pop(cancelApplSeqNum)
            else:
                self.bidnq[cancelPrice][cancelApplSeqNum] -= cancelQty
                if self.bidnq[cancelPrice][cancelApplSeqNum] == 0:
                    self.bidnq[cancelPrice].pop(cancelApplSeqNum)

            if hasConvert:
                self.updateMktInfo(clockAtArrival, exchangeTime, seqNum, record=True)
            else:
                self.updateMktInfo(clockAtArrival, exchangeTime, seqNum, record=True)

    def guessingTrade(self, clockAtArrival, exchangeTime, seqNum):
        assert (len(self.tempOrder) == 1)
        key = list(self.tempOrder.keys())[0]
        orderType, orderSide, orderPrice, orderQty, tempStatus = self.tempOrder[key]
        fakeBid = self.bid.copy()
        fakeAsk = self.ask.copy()
        fakeVol = 0
        fakeAmount = 0
        fakeClose = 0
        if orderType == '1':
            if orderSide == 1:
                curAskP = sorted(fakeAsk.keys())
                remain = orderQty
                for askP in curAskP:
                    if remain > 0:
                        askSize = fakeAsk[askP]
                        if askSize > remain:
                            fakeAsk[askP] = askSize - remain
                            ######
                            cum_vol = 0
                            for seqNo in self.asknq[askP]:
                                cum_vol += self.asknq[askP][seqNo]
                                if cum_vol > remain:
                                    self.asknq[askP][seqNo] = cum_vol - remain
                                    break
                                elif cum_vol == remain:
                                    self.asknq[askP].pop(seqNo)
                                    break
                                else:
                                    self.asknq[askP].pop(seqNo)
                            ######
                            fakeVol += remain
                            fakeAmount += remain * askP
                            remain = 0
                        else:
                            fakeAsk.pop(askP)
                            ######
                            for seqNo in self.asknq[askP]:
                                self.asknq[askP].pop(seqNo)
                            ######
                            fakeVol += askSize
                            fakeAmount += askSize * askP
                            remain -= askSize
                        fakeClose = askP

            elif orderSide == 2:
                curBidP = sorted(fakeBid.keys(), reverse=True)
                remain = orderQty
                for bidP in curBidP:
                    if remain > 0:
                        bidSize = fakeBid[bidP]
                        if bidSize > remain:
                            fakeBid[bidP] = bidSize - remain
                            ######
                            cum_vol = 0
                            for seqNo in self.bidnq[bidP]:
                                cum_vol += self.bidnq[bidP][seqNo]
                                if cum_vol > remain:
                                    self.bidnq[bidP][seqNo] = cum_vol - remain
                                    break
                                elif cum_vol == remain:
                                    self.bidnq[bidP].pop(seqNo)
                                    break
                                else:
                                    self.bidnq[bidP].pop(seqNo)
                            ######
                            fakeVol += remain
                            fakeAmount += remain * bidP
                            remain = 0
                        else:
                            fakeBid.pop(bidP)
                            ######
                            for seqNo in self.bidnq[bidP]:
                                self.asknq[bidP].pop(seqNo)
                            ######
                            fakeVol += bidSize
                            fakeAmount += bidSize * bidP
                            remain -= bidSize
                        fakeClose = bidP

        elif orderType == '2':
            if orderSide == 1:
                curAskP = sorted(fakeAsk.keys())
                remain = orderQty
                for askP in curAskP:
                    if remain > 0 and askP <= orderPrice:
                        askSize = fakeAsk[askP]
                        if askSize > remain:
                            fakeAsk[askP] = askSize - remain
                            ######
                            cum_vol = 0
                            pop_list = []
                            for seqNo in self.asknq[askP]:
                                cum_vol += self.asknq[askP][seqNo]
                                if cum_vol > remain:
                                    self.asknq[askP][seqNo] = cum_vol - remain
                                    break
                                elif cum_vol == remain:
                                    pop_list.append(seqNo)
                                    break
                                else:
                                    pop_list.append(seqNo)
                            for seqNo in pop_list:
                                self.asknq[askP].pop(seqNo)
                            ######
                            fakeVol += remain
                            fakeAmount += remain * askP
                            remain = 0
                        else:
                            fakeAsk.pop(askP)
                            ######
                            pop_list = list(self.asknq[askP].keys())
                            for seqNo in pop_list:
                                self.asknq[askP].pop(seqNo)
                            ######
                            fakeVol += askSize
                            fakeAmount += askSize * askP
                            remain -= askSize
                        fakeClose = askP
                if remain > 0:
                    fakeBid[orderPrice] = remain
                    ######
                    self.bidnq[orderPrice][seqNum] = remain
                    ######
            elif orderSide == 2:
                curBidP = sorted(fakeBid.keys(), reverse=True)
                remain = orderQty
                for bidP in curBidP:
                    if remain > 0 and bidP >= orderPrice:
                        bidSize = fakeBid[bidP]
                        if bidSize > remain:
                            fakeBid[bidP] = bidSize - remain
                            ######
                            cum_vol = 0
                            pop_list = []
                            for seqNo in self.bidnq[bidP]:
                                cum_vol += self.bidnq[bidP][seqNo]
                                if cum_vol > remain:
                                    self.bidnq[bidP][seqNo] = cum_vol - remain
                                    break
                                elif cum_vol == remain:
                                    pop_list.append(seqNo)
                                    break
                                else:
                                    pop_list.append(seqNo)
                            for seqNo in pop_list:
                                self.bidnq[bidP].pop(seqNo)
                            ######
                            fakeVol += remain
                            fakeAmount += remain * bidP
                            remain = 0
                        else:
                            fakeBid.pop(bidP)
                            ######
                            pop_list = list(self.bidnq[bidP].keys())
                            for seqNo in pop_list:
                                self.bidnq[bidP].pop(seqNo)
                            ######
                            fakeVol += bidSize
                            fakeAmount += bidSize * bidP
                            remain -= bidSize
                        fakeClose = bidP
                if remain > 0:
                    fakeAsk[orderPrice] = remain
                    ######
                    self.asknq[orderPrice][seqNum] = remain
                    ######

        self.localTime.append(clockAtArrival)
        self.exchangeTime.append(exchangeTime)
        self.sequenceNum.append(seqNum)
        self.bboImprove.append(1)

        curBidP = sorted(fakeBid.keys(), reverse=True)[:self.levels]
        curAskP = sorted(fakeAsk.keys())[:self.levels]
        curBidQ = [fakeBid[i] for i in curBidP]
        curBidN = [len(list(self.bidnq[i].keys())) for i in curBidP]

        self.bidp += [curBidP + [0] * (self.levels - len(curBidP))]
        self.bidq += [curBidQ + [0] * (self.levels - len(curBidQ))]
        self.bidn += [curBidN + [0] * (self.levels - len(curBidN))]

        curAskQ = [fakeAsk[i] for i in curAskP]
        curAskN = [len(list(self.asknq[i].keys())) for i in curAskP]
        self.askp += [curAskP + [0] * (self.levels - len(curAskP))]
        self.askq += [curAskQ + [0] * (self.levels - len(curAskQ))]
        self.askn += [curAskN + [0] * (self.levels - len(curAskN))]

        self.cum_volume.append(self.cur_cum_volume + fakeVol)
        self.cum_amount.append(self.cur_cum_amount + fakeAmount)
        self.close.append(fakeClose)

        ######
        if len(fakeAsk) != 0:
            ask1p = curAskP[0]
        else:
            ask1p = curBidP[0] + 0.01

        if len(fakeBid) != 0:
            bid1p = curBidP[0]
        else:
            bid1p = curAskP[0] - 0.01
        self.currMid = (bid1p + ask1p) / 2
        bid_odrs = []
        count = 0
        for seqNo in self.bidnq[bid1p]:
            if count >= 50:
                break
            bid_odrs.append(self.bidnq[bid1p][seqNo])
            count += 1
        self.bid1Topq.append(bid_odrs + [0] * (50 - len(bid_odrs)))
        ask_odrs = []
        count = 0
        for seqNo in self.asknq[ask1p]:
            if count >= 50:
                break
            ask_odrs.append(self.asknq[ask1p][seqNo])
            count += 1
        self.ask1Topq.append(ask_odrs + [0] * (50 - len(ask_odrs)))
        ######
        #&#
        bid_price_levels = 0
        ask_price_levels = 0
        bid_order_nums = 0
        ask_order_nums = 0
        bid_qty = 0
        ask_qty = 0
        bid_amount = 0
        ask_amount = 0
        for p in self.bidnq:
            for seqNo in self.bidnq[p]:
                bid_qty += self.bidnq[p][seqNo]
                bid_amount += self.bidnq[p][seqNo] * p
                bid_order_nums += 1
            bid_price_levels += 1
        self.total_bid_qty.append(bid_qty)
        babq = 0 if bid_qty == 0 else bid_amount / bid_qty
        self.total_bid_vwap.append(babq)
        self.total_bid_levels.append(bid_price_levels)
        self.total_bid_orders_num.append(bid_order_nums)
        for p in self.asknq:
            for seqNo in self.asknq[p]:
                ask_qty += self.asknq[p][seqNo]
                ask_amount += self.asknq[p][seqNo] * p
                ask_order_nums += 1
            ask_price_levels += 1
        self.total_ask_qty.append(ask_qty)
        amaq = 0 if ask_qty == 0 else ask_amount / ask_qty
        self.total_ask_vwap.append(amaq)
        self.total_ask_levels.append(ask_price_levels)
        self.total_ask_orders_num.append(ask_order_nums)
        #&#

    def tempToLimit(self, clockAtArrival, exchangeTime, seqNum):
        assert (len(self.tempOrder) == 1)
        tempSeqNum = list(self.tempOrder.keys())[0]
        tempOrderType, tempSide, tempPrice, tempQty, tempStatus = self.tempOrder[tempSeqNum]
        if len(self.bid) != 0 and len(self.ask) != 0:
            assert (tempPrice < self.ask1p)
            assert (tempPrice > self.bid1p)
        if tempSide == 1:
            self.bid[tempPrice] = tempQty
            ######
            self.bidnq[tempPrice][tempSeqNum] = tempQty
            ######
        elif tempSide == 2:
            self.ask[tempPrice] = tempQty
            ######
            self.asknq[tempPrice][tempSeqNum] = tempQty
            ######
        self.tempOrder = {}
        self.hasTempOrder = False
        self.updateMktInfo(clockAtArrival, exchangeTime, seqNum, record=False)

    def updateMktInfo(self, clockAtArrival, exchangeTime, seqNum, record=True):
        curBidP = sorted(self.bid.keys(), reverse=True)[:self.levels]
        curAskP = sorted(self.ask.keys())[:self.levels]

        if len(self.ask) != 0:
            self.ask1p = curAskP[0]
        else:
            self.ask1p = curBidP[0] + 0.01

        if len(self.bid) != 0:
            self.bid1p = curBidP[0]
        else:
            self.bid1p = curAskP[0] - 0.01

        if record == True:
            self.localTime.append(clockAtArrival)
            self.exchangeTime.append(exchangeTime)
            self.sequenceNum.append(seqNum)

            curBidQ = [self.bid[i] for i in curBidP]
            curBidN = [len(list(self.bidnq[i].keys())) for i in curBidP]
            self.bidp += [curBidP + [0] * (self.levels - len(curBidP))]
            self.bidq += [curBidQ + [0] * (self.levels - len(curBidQ))]
            self.bidn += [curBidN + [0] * (self.levels - len(curBidN))]

            curAskQ = [self.ask[i] for i in curAskP]
            curAskN = [len(list(self.asknq[i].keys())) for i in curAskP]
            self.askp += [curAskP + [0] * (self.levels - len(curAskP))]
            self.askq += [curAskQ + [0] * (self.levels - len(curAskQ))]
            self.askn += [curAskN + [0] * (self.levels - len(curAskN))]

            self.cum_volume.append(self.cur_cum_volume)
            self.cum_amount.append(self.cur_cum_amount)
            self.close.append(self.cur_close)

            ######
            self.currMid = (self.bid1p + self.ask1p) / 2
            bid_odrs = []
            count = 0
            for seqNo in self.bidnq[self.bid1p]:
                if count >= self.topK:
                    break
                bid_odrs.append(self.bidnq[self.bid1p][seqNo])
                count += 1
            self.bid1Topq.append(bid_odrs + [0] * (self.topK - len(bid_odrs)))
            ask_odrs = []
            count = 0
            for seqNo in self.asknq[self.ask1p]:
                if count >= self.topK:
                    break
                ask_odrs.append(self.asknq[self.ask1p][seqNo])
                count += 1
            self.ask1Topq.append(ask_odrs + [0] * (self.topK - len(ask_odrs)))
            ######
            ####record these infos
            #&#
            bid_price_levels = 0
            ask_price_levels = 0
            bid_order_nums = 0
            ask_order_nums = 0
            bid_qty = 0
            ask_qty = 0
            bid_amount = 0
            ask_amount = 0
            for p in self.bidnq:
                for seqNo in self.bidnq[p]:
                    bid_qty += self.bidnq[p][seqNo]
                    bid_amount += self.bidnq[p][seqNo] * p
                    bid_order_nums += 1
                bid_price_levels += 1
            self.total_bid_qty.append(bid_qty)
            baq = 0 if bid_qty == 0 else bid_amount / bid_qty
            self.total_bid_vwap.append(baq)
            self.total_bid_levels.append(bid_price_levels)
            self.total_bid_orders_num.append(bid_order_nums)
            for p in self.asknq:
                for seqNo in self.asknq[p]:
                    ask_qty += self.asknq[p][seqNo]
                    ask_amount += self.asknq[p][seqNo] * p
                    ask_order_nums += 1
                ask_price_levels += 1
            self.total_ask_qty.append(ask_qty)
            aaq = 0 if ask_qty == 0 else ask_amount / ask_qty
            self.total_ask_vwap.append(aaq)
            self.total_ask_levels.append(ask_price_levels)
            self.total_ask_orders_num.append(ask_order_nums)
            #&#
            
    def getAllInfo(self):
        """Assemble every recorded snapshot into a single, typed DataFrame.

        Reads the per-snapshot lists kept on the instance (price, quantity and
        order-count ladders, level-1 top-K quantities, timestamps, cumulative
        volume/amount, close and the ``total_*`` aggregates) and lays them out
        with one row per snapshot.

        Returns:
            pandas.DataFrame: ``time, clockAtArrival, StockID, cum_volume,
            cum_amount, close``, followed by the ladder columns (bid side
            listed from the deepest level down to level 1, ask side from
            level 1 outwards), the top-K columns and the ``total_*``
            aggregates.  ``close`` and the ladder prices are multiplied by
            100, rounded and stored as ``int32``; quantities are ``int64``
            with missing values replaced by 0.

        Note:
            No open price is derived here: the column selection below never
            kept it, and looking it up failed with ``IndexError`` whenever no
            snapshot carried a positive close.
        """
        # Column names for the n-level book and for the top-K queue at level 1.
        depth = range(1, self.levels + 1)
        bp_names = ['bid{}p'.format(n) for n in depth]
        ap_names = ['ask{}p'.format(n) for n in depth]
        bq_names = ['bid{}q'.format(n) for n in depth]
        aq_names = ['ask{}q'.format(n) for n in depth]
        bn_names = ['bid{}n'.format(n) for n in depth]
        an_names = ['ask{}n'.format(n) for n in depth]
        queue = range(1, self.topK + 1)
        btopK_names = ['bid1Top{}q'.format(n) for n in queue]
        atopK_names = ['ask1Top{}q'.format(n) for n in queue]

        # One concat instead of growing the frame piece by piece.
        mdData = pd.concat(
            [pd.DataFrame({'clockAtArrival': self.localTime, 'time': self.exchangeTime,
                           'sequenceNo': self.sequenceNum, 'cum_volume': self.cum_volume,
                           'cum_amount': self.cum_amount, 'close': self.close}),
             pd.DataFrame(self.bidp, columns=bp_names),
             pd.DataFrame(self.bidq, columns=bq_names),
             pd.DataFrame(self.bidn, columns=bn_names),
             pd.DataFrame(self.bid1Topq, columns=btopK_names),
             pd.DataFrame(self.askp, columns=ap_names),
             pd.DataFrame(self.askq, columns=aq_names),
             pd.DataFrame(self.askn, columns=an_names),
             pd.DataFrame(self.ask1Topq, columns=atopK_names)],
            axis=1, sort=False)
        mdData['StockID'] = self.stockID

        targetCols = ['time', 'clockAtArrival', 'StockID', 'cum_volume', 'cum_amount', 'close'] + \
                     bp_names[::-1] + ap_names + bq_names[::-1] + aq_names + bn_names[::-1] + an_names + btopK_names[::-1] + atopK_names
        mdData = mdData[targetCols].reset_index(drop=True)

        # Whole-book aggregates: one list per metric, transposed into columns.
        aggDf = pd.DataFrame([self.total_bid_qty, self.total_ask_qty,
                              self.total_bid_vwap, self.total_ask_vwap,
                              self.total_bid_levels, self.total_ask_levels,
                              self.total_bid_orders_num, self.total_ask_orders_num]).T
        aggDf.columns = ['total_bid_quantity', 'total_ask_quantity',
                         'total_bid_vwap', 'total_ask_vwap',
                         'total_bid_levels', 'total_ask_levels',
                         'total_bid_orders', 'total_ask_orders']

        final_df = pd.concat([mdData, aggDf], axis=1)

        # Column-by-column casts so every column really gets the new dtype.
        # Prices: scale by 100 and round before the integer cast.
        for col in ['close'] + bp_names + ap_names:
            final_df[col] = np.round(final_df[col] * 100).astype('int32')
        # Quantities: a missing level counts as zero quantity.
        for col in ['cum_volume', 'total_bid_quantity', 'total_ask_quantity'] + bq_names + aq_names:
            final_df[col] = final_df[col].fillna(0).astype('int64')
        for col in ['time', 'StockID', 'total_bid_levels', 'total_ask_levels',
                    'total_bid_orders', 'total_ask_orders'] + bn_names + an_names + btopK_names + atopK_names:
            final_df[col] = final_df[col].astype('int32')
        # Any column whose name contains 'Vol' is stored as int32 as well.
        for col in final_df.columns:
            if 'Vol' in col:
                final_df[col] = final_df[col].astype('int32')
        return final_df
    
# Driver: for each calendar day in the window below, load the order and trade
# logs of symbol 2002973 from MongoDB and run `go` on them.
# The names bound here stay at module level after the loop.
if __name__ == '__main__':
    import multiprocessing as mp
    import time
    # NOTE(review): host and credentials are hard-coded; consider loading them
    # from configuration or the environment.
    db = DB("192.168.10.178", 'com_md_eq_cn', 'zhenyuy', 'bnONBrzSMGoE')
    # start date (inclusive); the loop below stops before 2020-01-07
    thisDate = datetime.date(2020, 1, 6)
    while thisDate < datetime.date(2020, 1, 7):
        # Day count since 1899-12-30; not used again in this block.
        intDate = (thisDate - datetime.date(1899, 12, 30)).days
        # YYYYMMDD string, the date format DBObj.build_query expects.
        thisDate_str = str(thisDate).replace('-', '')

        mdOrderLog = db.read('md_order', start_date=thisDate_str, end_date=thisDate_str, symbol=[2002973])
        mdTradeLog = db.read('md_trade', start_date=thisDate_str, end_date=thisDate_str, symbol=[2002973])
        
        # `go` receives the logs as dicts keyed by symbol.
        orders_data = {}
        trades_data = {}
        orders_data[2002973] = mdOrderLog
        trades_data[2002973] = mdTradeLog
        g = go(thisDate_str, orders_data, trades_data)
        g.run(2002973)

        print('finished ' + thisDate_str)
        thisDate = thisDate + datetime.timedelta(days=1)


o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o

o1
o1
c1
o1
o1
c1
o1
o1
c1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
c1
o1
c1
o1
o1
o1
o1
o1
o1
o1
o1
c1
c1
o1
c1
o1
o1
o1
o1
o1
o1
c1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
c1
o1
c1
o1
o1
o1
c1
o1
o1
o1
o1
o1
o1
o1
o1
c1
c1
o1
o1
c1
c1
c1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
c1
c1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
c1
o1
o1
o1
c1
o1
o1
o1
o1
c1
o1
o1
c1
o1
c1
o1
o1
o1
o1
c1
o1
o1
o1
o1
o1
c1
o1
c1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
c1
o1
o1
o1
c1
o1
o1
o1
c1
o1
o1
o1
o1
c1
c1
o1
o1
o1
o1
o1
o1
o1
o1
c1
o1
o1
o1
o1
o1
o1
o1
c1
o1
o1
o1
o1
o1
o1
o1
o1
o1
c1
c1
c1
c1
c1
c1
o1
o1
o1
c1
o1
c1
o1
c1
o1
o1
o1
c1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
c1
o1
c1
o1
c1
o1
o1
o1
c1
o1
o1
o1
o1
c1
c1
c1
c1
c1
o1
c1
c1
o1
c1
o1
o1
o1
c1
c1
o1
c1
c1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
c1
o1
o1
o1
o1
c1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
c1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
c1
o1
o1
c1
o1
o1
o1
o1
o1
o1
o1
o1
o1
o1
c1
o1
o1
o1
o1
o1
c1
o1
o1
o1
o1
o1
o1
o1
o1
c

{}

2002973

finished 20200106


In [60]:
# Load the trade and order log CSV dumps from the local result directory.
import pandas as pd
trade = pd.read_csv(r'/mnt/e/result/logs_20200106_zs_92_01_day_data/mdTradeLog_20200106_0912.csv')
order = pd.read_csv(r'/mnt/e/result/logs_20200106_zs_92_01_day_data/mdOrderLog_20200106_0912.csv')

In [20]:
# Widen the display limits, then show the first 200 messages with
# TransactTime >= 92501000.  The fully-qualified 'display.*' keys are used
# because the bare 'max_columns' / 'max_rows' patterns match several options
# on recent pandas releases and raise OptionError.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
g.stockMsg[g.stockMsg['TransactTime'] >= 92501000].head(200)

Unnamed: 0,clockAtArrival,TransactTime,ApplSeqNum,SecurityID,status,Side,OrderType,Price,OrderQty,ExecType,TradePrice,TradeQty,BidApplSeqNum,OfferApplSeqNum,BidOrderType,BidSide,OfferOrderType,OfferSide,BidOrderPrice,OfferOrderPrice,CancelApplSeqNum,CancelOrderType,CancelSide,isAuction
3461,1578274200000000,93000000.0,318133,2000001,order,2.0,2.0,17.27,5000.0,,,,,,,,,,,,,,,False
3462,1578274200010000,93000010.0,319076,2000001,cancel,,,,,4.0,17.45,2000.0,,,,,,,,,137261.0,2.0,2.0,False
3463,1578274200010000,93000010.0,319134,2000001,cancel,,,,,4.0,17.17,3600.0,,,,,,,,,101423.0,2.0,2.0,False
3464,1578274200010000,93000010.0,319494,2000001,order,1.0,2.0,17.03,500.0,,,,,,,,,,,,,,,False
3465,1578274200010000,93000010.0,319495,2000001,trade,,,,,1.0,17.03,500.0,319494.0,163452.0,2.0,1.0,2.0,2.0,17.03,17.03,,,,False
3466,1578274200010000,93000010.0,319638,2000001,order,1.0,2.0,16.78,5500.0,,,,,,,,,,,,,,,False
3467,1578274200010000,93000010.0,319695,2000001,order,2.0,2.0,17.3,100.0,,,,,,,,,,,,,,,False
3468,1578274200010000,93000010.0,319906,2000001,order,2.0,2.0,17.02,100.0,,,,,,,,,,,,,,,False
3469,1578274200010000,93000010.0,319928,2000001,order,1.0,2.0,16.91,3000.0,,,,,,,,,,,,,,,False
3470,1578274200010000,93000010.0,320082,2000001,order,2.0,2.0,17.01,2900.0,,,,,,,,,,,,,,,False


In [22]:
# Widen the display limits, then show the first 200 messages with
# TransactTime >= 92450000.  The fully-qualified 'display.*' keys are used
# because the bare 'max_columns' / 'max_rows' patterns match several options
# on recent pandas releases and raise OptionError.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
g.stockMsg[g.stockMsg['TransactTime'] >= 92450000].head(200)

Unnamed: 0,clockAtArrival,TransactTime,ApplSeqNum,SecurityID,status,Side,OrderType,Price,OrderQty,ExecType,TradePrice,TradeQty,BidApplSeqNum,OfferApplSeqNum,BidOrderType,BidSide,OfferOrderType,OfferSide,BidOrderPrice,OfferOrderPrice,CancelApplSeqNum,CancelOrderType,CancelSide,isAuction
5558,1578273890430000,92450430.0,259937,2002973,order,1.0,2.0,6.89,700.0,,,,,,,,,,,,,,,True
5559,1578273890540000,92450540.0,260016,2002973,order,1.0,2.0,6.9,100.0,,,,,,,,,,,,,,,True
5560,1578273890630000,92450630.0,260061,2002973,order,1.0,2.0,8.27,600.0,,,,,,,,,,,,,,,True
5561,1578273890690000,92450690.0,260104,2002973,order,1.0,2.0,6.89,1000.0,,,,,,,,,,,,,,,True
5562,1578273891000000,92451000.0,260299,2002973,order,1.0,2.0,8.27,300.0,,,,,,,,,,,,,,,True
5563,1578273891130000,92451130.0,260369,2002973,order,1.0,2.0,6.89,900.0,,,,,,,,,,,,,,,True
5564,1578273891430000,92451430.0,260552,2002973,order,1.0,2.0,6.89,1400.0,,,,,,,,,,,,,,,True
5565,1578273891560000,92451560.0,260651,2002973,order,1.0,2.0,6.89,900.0,,,,,,,,,,,,,,,True
5566,1578273891580000,92451580.0,260666,2002973,order,1.0,2.0,6.89,8000.0,,,,,,,,,,,,,,,True
5567,1578273891660000,92451660.0,260737,2002973,order,1.0,2.0,6.89,100.0,,,,,,,,,,,,,,,True


In [37]:
# Show the message row(s) whose ApplSeqNum equals 20.
g.stockMsg[g.stockMsg['ApplSeqNum'] == 20]

Unnamed: 0,clockAtArrival,TransactTime,ApplSeqNum,SecurityID,status,Side,OrderType,Price,OrderQty,ExecType,TradePrice,TradeQty,BidApplSeqNum,OfferApplSeqNum,BidOrderType,BidSide,OfferOrderType,OfferSide,BidOrderPrice,OfferOrderPrice,CancelApplSeqNum,CancelOrderType,CancelSide,isAuction
16,1578273300000000,91500000.0,20,2002973,order,1.0,2,7.58,1300.0,,,,,,,,,,,,,,,True


In [57]:
# Display g.simMarket.bid (the recorded output is a dict keyed by price).
g.simMarket.bid

{8.27: 48646600.0,
 6.89: 6089700.0,
 7.58: 259300.0,
 7.63: 12900.0,
 7.6: 20300.0,
 6.9: 619800.0,
 3.75: 500.0,
 8.0: 64100.0,
 8.26: 501900.0,
 8.25: 12400.0,
 8.2: 52800.0,
 7.59: 6400.0,
 7.61: 6400.0,
 7.62: 6700.0,
 7.64: 8900.0,
 7.65: 6700.0,
 7.66: 9000.0,
 7.67: 6700.0,
 7.68: 6700.0,
 7.69: 9300.0,
 7.7: 6400.0,
 7.71: 5900.0,
 7.72: 9700.0,
 7.73: 6500.0,
 7.74: 10100.0,
 7.75: 17100.0,
 7.76: 5900.0,
 7.77: 5900.0,
 7.78: 5900.0,
 7.79: 5900.0,
 7.8: 13000.0,
 7.81: 5900.0,
 7.82: 6700.0,
 7.83: 6300.0,
 7.84: 5900.0,
 7.85: 5900.0,
 7.86: 11000.0,
 7.87: 5900.0,
 7.88: 5900.0,
 7.89: 20500.0,
 7.9: 58500.0,
 7.91: 6400.0,
 7.92: 6400.0,
 7.93: 6400.0,
 7.94: 7500.0,
 7.95: 6400.0,
 7.96: 6400.0,
 7.97: 5400.0,
 7.98: 6400.0,
 7.99: 7400.0,
 8.01: 6400.0,
 8.02: 6400.0,
 8.03: 6400.0,
 8.04: 18300.0,
 4.79: 200.0,
 3.71: 400.0,
 4.99: 100.0,
 8.24: 9900.0,
 8.23: 7400.0,
 8.22: 6900.0,
 3.74: 100.0,
 8.21: 8900.0,
 8.18: 7900.0,
 8.17: 6900.0,
 8.16: 11900.0,
 8.15: 6900

In [35]:
# Display g.simMarket.ask (the recorded output is a dict keyed by price).
g.simMarket.ask

{8.27: 1000.0, 7.97: 500.0, 8.26: 500.0, 7.05: 500.0, 6.97: 500.0}

In [44]:
# Same lookup as with the integer 20, written with a float literal.
g.stockMsg[g.stockMsg['ApplSeqNum'] == 20.0]

Unnamed: 0,clockAtArrival,TransactTime,ApplSeqNum,SecurityID,status,Side,OrderType,Price,OrderQty,ExecType,TradePrice,TradeQty,BidApplSeqNum,OfferApplSeqNum,BidOrderType,BidSide,OfferOrderType,OfferSide,BidOrderPrice,OfferOrderPrice,CancelApplSeqNum,CancelOrderType,CancelSide,isAuction
16,1578273300000000,91500000.0,20,2002973,order,1.0,2,7.58,1300.0,,,,,,,,,,,,,,,True


In [59]:
# Show the first 20 messages whose Side equals 1.
g.stockMsg[g.stockMsg['Side'] == 1].head(20)

Unnamed: 0,clockAtArrival,TransactTime,ApplSeqNum,SecurityID,status,Side,OrderType,Price,OrderQty,ExecType,TradePrice,TradeQty,BidApplSeqNum,OfferApplSeqNum,BidOrderType,BidSide,OfferOrderType,OfferSide,BidOrderPrice,OfferOrderPrice,CancelApplSeqNum,CancelOrderType,CancelSide,isAuction
0,1578273300000000,91500000.0,2,2002973,order,1.0,2,8.27,390400.0,,,,,,,,,,,,,,,True
1,1578273300000000,91500000.0,3,2002973,order,1.0,2,6.89,2800.0,,,,,,,,,,,,,,,True
2,1578273300000000,91500000.0,4,2002973,order,1.0,2,8.27,359100.0,,,,,,,,,,,,,,,True
3,1578273300000000,91500000.0,5,2002973,order,1.0,2,8.27,53300.0,,,,,,,,,,,,,,,True
4,1578273300000000,91500000.0,6,2002973,order,1.0,2,8.27,53600.0,,,,,,,,,,,,,,,True
5,1578273300000000,91500000.0,9,2002973,order,1.0,2,8.27,181600.0,,,,,,,,,,,,,,,True
6,1578273300000000,91500000.0,10,2002973,order,1.0,2,8.27,149500.0,,,,,,,,,,,,,,,True
7,1578273300000000,91500000.0,11,2002973,order,1.0,2,8.27,60400.0,,,,,,,,,,,,,,,True
8,1578273300000000,91500000.0,12,2002973,order,1.0,2,6.89,39800.0,,,,,,,,,,,,,,,True
9,1578273300000000,91500000.0,13,2002973,order,1.0,2,8.27,724500.0,,,,,,,,,,,,,,,True


In [58]:
# Show the message row(s) whose ApplSeqNum equals 662.
g.stockMsg[g.stockMsg['ApplSeqNum'] == 662.0]

Unnamed: 0,clockAtArrival,TransactTime,ApplSeqNum,SecurityID,status,Side,OrderType,Price,OrderQty,ExecType,TradePrice,TradeQty,BidApplSeqNum,OfferApplSeqNum,BidOrderType,BidSide,OfferOrderType,OfferSide,BidOrderPrice,OfferOrderPrice,CancelApplSeqNum,CancelOrderType,CancelSide,isAuction
360,1578273300020000,91500020.0,662,2002973,order,1.0,2,7.76,1300.0,,,,,,,,,,,,,,,True


In [39]:
# Show the first 100 messages whose status is 'trade'.
g.stockMsg[g.stockMsg['status'] == 'trade'].head(100)

Unnamed: 0,clockAtArrival,TransactTime,ApplSeqNum,SecurityID,status,Side,OrderType,Price,OrderQty,ExecType,TradePrice,TradeQty,BidApplSeqNum,OfferApplSeqNum,BidOrderType,BidSide,OfferOrderType,OfferSide,BidOrderPrice,OfferOrderPrice,CancelApplSeqNum,CancelOrderType,CancelSide,isAuction
5606,1578273900000000,92500000.0,295880,2002973,trade,,,,,1,6.89,500.0,3.0,266119.0,2,1.0,2,2.0,6.89,6.88,,,,True
5607,1578273900000000,92500000.0,295881,2002973,trade,,,,,1,6.89,100.0,3.0,141266.0,2,1.0,2,2.0,6.89,6.89,,,,True
5608,1578273900000000,92500000.0,295882,2002973,trade,,,,,1,6.89,500.0,3.0,221586.0,2,1.0,2,2.0,6.89,6.89,,,,True
5609,1578274200000000,93000000.0,317475,2002973,trade,,,,,1,6.97,500.0,20.0,236319.0,2,1.0,2,2.0,7.58,6.97,,,,False
5610,1578274200000000,93000000.0,317476,2002973,trade,,,,,1,7.05,500.0,20.0,234077.0,2,1.0,2,2.0,7.58,7.05,,,,False
5792,1578274200230000,93000230.0,341180,2002973,trade,,,,,1,7.76,500.0,662.0,341179.0,2,1.0,2,2.0,7.76,6.89,,,,False
22960,1578276001000000,100001000.0,5111624,2002973,trade,,,,,1,8.27,400.0,2.0,5109328.0,2,1.0,2,2.0,8.27,7.0,,,,False
22961,1578276001000000,100001000.0,5111625,2002973,trade,,,,,1,8.27,500.0,2.0,5108650.0,2,1.0,2,2.0,8.27,7.7,,,,False
22962,1578276001000000,100001000.0,5111626,2002973,trade,,,,,1,8.27,500.0,2.0,5110614.0,2,1.0,2,2.0,8.27,7.7,,,,False
22963,1578276001000000,100001000.0,5111627,2002973,trade,,,,,1,8.27,500.0,2.0,5106876.0,2,1.0,2,2.0,8.27,7.71,,,,False


In [13]:
import pandas as pd
import datetime

# Fully-qualified option keys: the bare 'max_columns' / 'max_rows' patterns
# match several options on recent pandas releases and raise OptionError.
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

# Load one pickled snapshot frame and bring it to the target column layout.
data = pd.read_pickle('/mnt/e/mbd_data/20200102/2002290.pkl')
data = data.rename(columns={'StockID': "skey", 'sequenceNo': "ApplSeqNum"})
data['date'] = 20200102
# NOTE(review): fromtimestamp() converts using the machine's local time zone.
data['datetime'] = data["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))

# 30-level ladders: bid side from level 30 down to 1, ask side from 1 up to 30.
bid_levels = range(30, 0, -1)
ask_levels = range(1, 31)
price_cols = ['bid%dp' % n for n in bid_levels] + ['ask%dp' % n for n in ask_levels]
qty_cols = ['bid%dq' % n for n in bid_levels] + ['ask%dq' % n for n in ask_levels]
num_cols = ['bid%dn' % n for n in bid_levels] + ['ask%dn' % n for n in ask_levels]
# Top-50 quantities at level 1, bid side first, both in ascending rank.
top_cols = ['bid1Top%dq' % n for n in range(1, 51)] + ['ask1Top%dq' % n for n in range(1, 51)]

for cols in price_cols:
    data[cols] = data[cols].astype(float)
for cols in ['ApplSeqNum', 'date']:
    data[cols] = data[cols].astype('int32')

data = data[['skey', 'date', 'time', 'clockAtArrival', 'datetime', 'ApplSeqNum',
             'cum_volume', 'cum_amount', 'close']
            + price_cols + qty_cols + num_cols + top_cols
            + ['total_bid_quantity', 'total_ask_quantity',
               'total_bid_vwap', 'total_ask_vwap', 'total_bid_orders', 'total_ask_orders',
               'total_bid_levels', 'total_ask_levels']]

In [14]:
# Display the dtype of every column of `data`.
data.dtypes

skey                           int32
date                           int32
time                           int32
clockAtArrival                 int64
datetime              datetime64[ns]
ApplSeqNum                     int32
cum_volume                     int64
cum_amount                   float64
close                        float64
bid30p                       float64
bid29p                       float64
bid28p                       float64
bid27p                       float64
bid26p                       float64
bid25p                       float64
bid24p                       float64
bid23p                       float64
bid22p                       float64
bid21p                       float64
bid20p                       float64
bid19p                       float64
bid18p                       float64
bid17p                       float64
bid16p                       float64
bid15p                       float64
bid14p                       float64
bid13p                       float64
b

In [5]:
# Count the columns of the target layout: the fixed header fields, the
# price / quantity / order-count ladders (bid levels 30..1 followed by ask
# levels 1..30 for each kind), the level-1 top-50 quantities for both sides
# and the whole-book aggregates.
len(
    ['skey', 'date', 'time', 'clockAtArrival', 'datetime', 'ApplSeqNum',
     'cum_volume', 'cum_amount', 'close']
    + [name
       for kind in 'pqn'
       for name in (['bid%d%s' % (n, kind) for n in range(30, 0, -1)]
                    + ['ask%d%s' % (n, kind) for n in range(1, 31)])]
    + ['%s1Top%dq' % (side, n) for side in ('bid', 'ask') for n in range(1, 51)]
    + ['total_bid_quantity', 'total_ask_quantity',
       'total_bid_vwap', 'total_ask_vwap', 'total_bid_orders', 'total_ask_orders',
       'total_bid_levels', 'total_ask_levels']
)

296

In [None]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Create a :class:`DBObj` connected to ``db_name`` on ``host``.

    Args:
        host: MongoDB host (optionally ``host:port``).
        db_name: database to open; also the auth database unless ``user`` is
            ``'admin'`` or ``'root'``, which authenticate against ``admin``.
        user: user name.
        passwd: password.

    Returns:
        DBObj: wrapper bound to the requested database.
    """
    from urllib.parse import quote_plus

    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Credentials must be percent-escaped inside a MongoDB URI, otherwise
    # characters such as '@', ':' or '/' break the URI parsing.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store pandas DataFrames in MongoDB as compressed pickle chunks.

    A frame is split per (date, symbol) and then into chunks of at most
    ``chunk_size`` rows.  Each chunk becomes one document with the keys
    ``ver`` (serialisation version), ``data`` (compressed pickle), ``date``
    (string), ``symbol`` (int) and ``start`` (row offset of the chunk).
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Connect to ``uri`` and select the database ``db_name``.

        Args:
            uri: MongoDB connection URI.
            symbol_column: name of the DataFrame column holding the symbol.
            db_name: database to use.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split a ``mongodb://user:password@example.com`` URI on ':' and '@'.

        Returns the resulting list of string parts.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename the collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Replace the stored data for every (date, symbol) present in ``df``.

        Existing documents for a (date, symbol) pair are deleted before the
        new chunks are inserted.  An empty frame is a no-op.

        Raises:
            Exception: if ``df`` is non-empty and has no date column.
        """
        if len(df) == 0:
            return
        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        first_date = str(df[self.date_column].iloc[0])
        multi_date = len(df[self.date_column].unique()) > 1

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            groups = ((str(date), symbol, sub_df)
                      for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]))
        else:
            # Group by the bare column name: a one-element list makes newer
            # pandas yield 1-tuples as keys instead of scalars.
            groups = ((first_date, symbol, sub_df)
                      for symbol, sub_df in df.groupby(self.symbol_column))

        for date, symbol, sub_df in groups:
            # Always store a plain int so that the key matches what
            # build_query() produces for reads and deletes.
            symbol = int(symbol)
            collection.delete_many({'date': date, 'symbol': symbol})
            self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` in chunks of ``chunk_size`` rows."""
        version = 1
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter for a date range and/or symbol selection.

        Args:
            start_date, end_date: inclusive bounds; a ``YYYYMMDD`` string, an
                integer (Python or numpy) or a ``datetime.date`` /
                ``datetime.datetime``.
            symbol: a single symbol or a list/tuple of symbols; each one is
                converted with ``int()``.

        Returns:
            dict: the filter; empty when no argument was given.

        Raises:
            Exception: for a date string that is not 8 characters long or a
                date of an unsupported type.
        """
        import numbers

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            # datetime.datetime is a subclass of datetime.date.
            if isinstance(x, datetime.date):
                return x.strftime("%Y%m%d")
            # Integral also covers numpy integers, e.g. values taken from
            # Series.unique().
            if isinstance(x, numbers.Integral):
                return parse_date(str(int(x)))
            raise Exception("invalid `date` type: " + str(type(x)))

        query = {}

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the documents matching the filter.

        Refuses (prints a message, returns ``None``) when no filter is given.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load the matching chunks and return them as one DataFrame.

        Chunks are ordered by (symbol, date, start) before concatenation.
        Returns ``None`` when no filter is given or nothing matches.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for doc in collection.find(query):
            doc['data'] = self.deser(doc['data'], doc['ver'])
            segs.append(doc)
        if not segs:
            return None
        segs.sort(key=lambda doc: (doc['symbol'], doc['date'], doc['start']))
        return pd.concat([doc['data'] for doc in segs], ignore_index=True)

    def list_tables(self):
        """Return the names of the collections in the database."""
        # Database.collection_names() no longer exists in PyMongo 4.  The
        # check goes through the class because attribute access on a Database
        # instance returns a Collection for any unknown name.
        if hasattr(type(self.db), 'list_collection_names'):
            return self.db.list_collection_names()
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct dates stored for the given selection."""
        collection = self.db[table_name]
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        query = self.build_query(start_date, end_date, symbol)
        dates = {doc['date'] for doc in collection.find(query, {"date": 1, '_id': 0})}
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` and compress it (version 1: gzip, version 2: lzma).

        Raises:
            Exception: for an unknown ``version``.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of :meth:`ser`.

        Raises:
            Exception: for an unknown ``version``.
        """
        # NOTE(review): pickle.loads executes code embedded in the payload;
        # only read from databases whose writers are trusted.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    """Register a ``pandas.core.internals.managers`` alias on pandas < 0.24.

    On those versions, if that module is not already loaded, a stand-in
    module exposing ``BlockManager`` under that name is placed in
    ``sys.modules``.  On pandas 0.24 or newer, or when the version string
    cannot be parsed, nothing is changed.
    """
    import re
    import sys
    from types import ModuleType

    # Compare (major, minor) numerically: as plain strings '0.9' < '0.24'
    # is False, which would skip the patch on exactly the old versions it
    # targets.
    found = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if found is None or (int(found.group(1)), int(found.group(2))) >= (0, 24):
        return

    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m


patch_pandas_pickle()


import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Fully-qualified option key: the bare "max_columns" pattern matches several
# options on recent pandas releases and raises OptionError.
pd.set_option("display.max_columns", 200)

year = "2020"
startDate = '20200529'
endDate = '20200821'
database_name = 'com_md_eq_cn'
# NOTE(review): credentials are hard-coded; consider loading them from
# configuration or the environment.
user = "zhenyuy"
password = "bnONBrzSMGoE"

db1 = DB("192.168.10.178", database_name, user, password)
# (date, symbol) pairs for which no md_snapshot_mbd data came back.
save = {}
save['date'] = []
save['secid'] = []
# The dates to scan are those present in the order log of symbol 2000001.
mdOrderLog = db1.read('md_order', start_date=startDate, end_date=endDate, symbol=[2000001])
datelist = mdOrderLog['date'].unique()
for d in datelist:
    print(d)
    # All symbols that have order data on this date.
    sl1 = db1.read('md_order', start_date=str(d), end_date=str(d))['skey'].unique()
    for s in sl1:
        mbd = db1.read('md_snapshot_mbd', start_date=str(d), end_date=str(d), symbol=s)
        if mbd is None:
            save['date'].append(d)
            save['secid'].append(s)
            print(s)
            continue

        # Explicit checks instead of assert inside a bare except, so the
        # logic survives `python -O` and Ctrl-C is not swallowed.
        if mbd[mbd['time'] < 0].shape[0] == 0:
            # 'time' is clean: report every other column that holds negative
            # values, or that cannot be compared with 0 at all.
            for cols in mbd.columns[mbd.columns != 'datetime']:
                try:
                    clean = mbd[mbd[cols] < 0].shape[0] == 0
                except Exception:
                    clean = False
                if not clean:
                    print(cols)
                    print(s)
        else:
            print('negative column!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
            print(s)
            # Rebuild the negative 'time' values from the clock part of 'datetime'.
            mbd.loc[mbd['time'] < 0, 'time'] = mbd[mbd['time'] < 0]['datetime'].astype(str).apply(lambda x: int((x.split(' ')[1].replace(':', "")).replace(".", ""))).astype('int64') * 1000
            if mbd[mbd['time'] < 0].shape[0] != 0:
                raise AssertionError('negative time values remain after repair')
        # The frame is written back whether or not it was modified.
        db1.write('md_snapshot_mbd', mbd)
        del mbd
save = pd.DataFrame(save)
save.to_csv('/home/work516/Downloads/missing_sec_IC_2020.csv')

20200529
20200601
20200602
2002986
20200603
20200604
20200605
2300837
20200608
2300838
20200609
20200610
2002989
20200611
20200612
20200615
20200616
2300841
20200617
20200618
2300842
20200619
2300824
20200622
20200623
20200624
20200629
20200630
2300839
20200701
2300846
20200702
2300843
20200703
2300845
20200706
20200707
20200708
2300840
20200709
2300847
20200710
2300849
20200713
2300850
2300852
20200714
20200715
20200716
20200717
2300851
20200720
2300848
20200721
20200722
2300856
20200723
2300855
20200724
2300853
20200727


In [None]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Build a MongoDB connection URI and return a ``DBObj`` bound to ``db_name``.

    Args:
        host: MongoDB host (optionally ``host:port``).
        db_name: database to operate on.
        user: user name; ``admin`` and ``root`` authenticate against the
            ``admin`` database, every other user against ``db_name``.
        passwd: password of ``user``.

    Returns:
        A ``DBObj`` connected with the generated URI.
    """
    from urllib.parse import quote_plus

    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Credentials must be percent-escaped inside a MongoDB URI; otherwise
    # characters such as '@', ':', '/' or '%' make the URI invalid.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store and load pandas DataFrames in one MongoDB database.

    A DataFrame is split by (date, symbol); each slice is cut into chunks of
    at most ``chunk_size`` rows, and every chunk is pickled, compressed and
    inserted as one document with the keys ``ver``, ``data``, ``date``,
    ``symbol`` and ``start``.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Connect to ``uri`` and select the database ``db_name``.

        Args:
            uri: MongoDB connection URI.
            symbol_column: name of the DataFrame column holding the symbol id.
            db_name: database that holds the tables (collections).
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000  # maximum number of rows per stored document
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split ``mongodb://user:password@example.com`` into its pieces.

        The scheme is removed and the rest is split on ``:`` and ``@``, so the
        example yields ``['user', 'password', 'example.com']``.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename the collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Replace the stored data of every (date, symbol) slice found in ``df``.

        An empty ``df`` is ignored. For each slice the existing documents are
        deleted first and the slice is then inserted again in chunks.

        Raises:
            Exception: if ``df`` has no ``date`` column.
        """
        if len(df) == 0:
            return
        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if len(df[self.date_column].unique()) > 1:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            date = str(df[self.date_column].iloc[0])
            # Group by the bare column name: grouping by a one-element list
            # yields 1-tuples as keys on recent pandas. The symbol is cast to
            # int exactly as in the multi-date branch, because build_query()
            # always queries with int symbols.
            for symbol, sub_df in df.groupby(self.symbol_column):
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` as chunks of ``chunk_size`` rows.

        Each document stores the serialized chunk together with ``date``,
        ``symbol``, the serialization version and the row offset ``start``.
        """
        version = 1
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            # iloc makes the positional slicing explicit for any index type.
            df_seg = df.iloc[start:end]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter for a date range and/or symbol selection.

        Args:
            start_date, end_date: inclusive bounds given as ``YYYYMMDD``
                string, integer (Python or numpy) or ``datetime.date`` /
                ``datetime.datetime`` (subclasses included).
            symbol: a single symbol, or a list / tuple / numpy array / pandas
                Series of symbols. Falsy values and empty collections add no
                symbol filter.

        Returns:
            The filter dict; empty when no criterion was given.

        Raises:
            Exception: for a date string that is not 8 characters long or a
                date of an unsupported type.
        """
        query = {}

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            if isinstance(x, datetime.date):  # datetime.datetime is a subclass
                return x.strftime("%Y%m%d")
            if hasattr(x, '__index__'):  # Python and numpy integers
                return parse_date(str(x.__index__()))
            raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        # Normalize collections of symbols to a plain list; None means scalar.
        symbols = None
        if isinstance(symbol, (list, tuple)):
            symbols = list(symbol)
        elif getattr(symbol, 'ndim', 0) and hasattr(symbol, 'tolist'):
            symbols = symbol.tolist()  # numpy array / pandas Series

        if symbols is not None:
            if symbols:
                query['symbol'] = {'$in': [int(x) for x in symbols]}
        elif symbol:
            query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the documents matching the criteria.

        Refuses to run (prints a message and returns None) when no criterion
        is given, so the whole table cannot be wiped by accident.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load the matching chunks and return them as one DataFrame.

        Chunks are ordered by (symbol, date, start) before concatenation.
        Returns None when no criterion is given or nothing matches.

        NOTE(review): stored chunks are unpickled, so only read from
        databases whose content is trusted.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        """Return the names of the collections in the database."""
        # collection_names() is gone in PyMongo 4. The class is inspected
        # because attribute access on a Database instance falls back to
        # returning a Collection for unknown names.
        if hasattr(type(self.db), 'list_collection_names'):
            return self.db.list_collection_names()
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values of the matching documents.

        Missing bounds default to ``'00000000'`` and ``'99999999'``.
        """
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` (protocol 4) and compress it.

        Version 1 uses gzip (level 2), version 2 uses lzma (preset 1).

        Raises:
            Exception: for any other version.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of ``ser``: decompress ``s`` and unpickle the result.

        Raises:
            Exception: for an unknown version.
        """
        # NOTE(review): pickle.loads executes arbitrary code from the payload;
        # the stored data must come from a trusted writer.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle(version=None):
    """Register a stand-in ``pandas.core.internals.managers`` module on old pandas.

    When the pandas version is older than 0.24 and the module is not already
    present in ``sys.modules``, a module object exposing ``BlockManager`` is
    registered under that name (presumably so that pickles referencing
    ``pandas.core.internals.managers.BlockManager`` can be loaded; confirm
    against the data producer).

    Args:
        version: version string to evaluate; defaults to ``pd.__version__``.
    """
    import re
    import sys
    from types import ModuleType

    if version is None:
        version = pd.__version__
    # Compare numerically: as plain strings '0.100' sorts before '0.24' and
    # '0.9' sorts after it.
    parsed = re.match(r'(\d+)\.(\d+)', str(version))
    if parsed is None or (int(parsed.group(1)), int(parsed.group(2))) >= (0, 24):
        return

    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m
patch_pandas_pickle()


import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Let pandas display up to 200 columns.
pd.set_option("max_columns", 200)

# Run configuration: a single date in the com_md_eq_cn database.
year = "2019"
startDate = '20191105'
endDate = '20191105'
database_name = 'com_md_eq_cn'
user = "zhenyuy"
# NOTE(review): the credential is hard-coded in the notebook; consider loading
# it from the environment or a config file instead.
password = "bnONBrzSMGoE"

db1 = DB("192.168.10.178", database_name, user, password)
# Column-oriented record of every (date, symbol) pair with no snapshot data.
save = {}
save['date'] = []
save['secid'] = []
# The dates on which symbol 2000001 has order data define the dates to scan.
mdOrderLog = db1.read('md_order', start_date=startDate, end_date=endDate, symbol=[2000001])
datelist = mdOrderLog['date'].unique()
for d in datelist:
    print(d)
    # Symbols present in md_snapshot_l2 on this date, restricted to skey > 2000000.
    sl1 = db1.read('md_snapshot_l2', start_date=str(d), end_date=str(d))['skey'].unique()
    sl1 = sl1[sl1 > 2000000]
    for s in sl1:
        mbd = db1.read('md_snapshot_mbd', start_date=str(d), end_date=str(d), symbol=s)
        if mbd is None:
            # read() returns None when no document matches: log the gap.
            save['date'].append(d)
            save['secid'].append(s)
            print(s)
            continue
        # A frame must have either 166 columns (nothing to do) or 186 columns.
        # A 186-column frame is cut down to the 166 columns listed below and
        # written back; any other width raises AssertionError from the except
        # branch and stops the run.
        try:
            assert(mbd.shape[1] == 166)
        except:
            assert(mbd.shape[1] == 186)
            mbd = mbd[['skey', 'date', 'time', 'clockAtArrival', 'datetime', 'ordering', 'ApplSeqNum', 'bbo_improve', 'pass_filter', 'cum_volume', 'cum_amount',
                         'prev_close', 'open', 'close','bid10p', 'bid9p', 'bid8p', 'bid7p', 'bid6p', 'bid5p', 'bid4p', 'bid3p', 'bid2p', 'bid1p', 
                         'ask1p', 'ask2p', 'ask3p', 'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p', 
                         'bid10q', 'bid9q', 'bid8q', 'bid7q', 'bid6q', 'bid5q', 'bid4q', 'bid3q', 'bid2q', 'bid1q', 
                         'ask1q', 'ask2q', 'ask3q', 'ask4q', 'ask5q', 'ask6q', 'ask7q', 'ask8q', 'ask9q', 'ask10q', 
                         'bid10n', 'bid9n', 'bid8n', 'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 
                         'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 
                         'bid10qInsert', 'bid9qInsert', 'bid8qInsert', 'bid7qInsert', 'bid6qInsert', 'bid5qInsert', 'bid4qInsert', 'bid3qInsert', 'bid2qInsert', 'bid1qInsert',
                         'ask1qInsert', 'ask2qInsert', 'ask3qInsert', 'ask4qInsert', 'ask5qInsert', 'ask6qInsert', 'ask7qInsert', 'ask8qInsert', 'ask9qInsert', 'ask10qInsert', 
                         'bid10qCancel', 'bid9qCancel', 'bid8qCancel', 'bid7qCancel', 'bid6qCancel', 'bid5qCancel', 'bid4qCancel', 'bid3qCancel', 'bid2qCancel', 'bid1qCancel',
                         'ask1qCancel', 'ask2qCancel', 'ask3qCancel', 'ask4qCancel', 'ask5qCancel', 'ask6qCancel', 'ask7qCancel', 'ask8qCancel', 'ask9qCancel', 'ask10qCancel',
                         'bid10sCancel', 'bid9sCancel', 'bid8sCancel', 'bid7sCancel', 'bid6sCancel', 'bid5sCancel', 'bid4sCancel', 'bid3sCancel', 'bid2sCancel', 'bid1sCancel',
                         'ask1sCancel', 'ask2sCancel', 'ask3sCancel', 'ask4sCancel', 'ask5sCancel', 'ask6sCancel', 'ask7sCancel', 'ask8sCancel', 'ask9sCancel', 'ask10sCancel',
                         'total_bid_quantity', 'total_ask_quantity', 'total_bid_vwap', 'total_ask_vwap', 'total_bid_orders', 'total_ask_orders', 'total_bid_levels','total_ask_levels',
                         'cum_buy_market_order_volume', 'cum_sell_market_order_volume', 'cum_buy_market_order_amount', 'cum_sell_market_order_amount', 'cum_buy_market_trade_volume', 'cum_sell_market_trade_volume',
                         'cum_buy_market_trade_amount', 'cum_sell_market_trade_amount', 'cum_buy_aggLimit_onNBBO_order_volume', 'cum_sell_aggLimit_onNBBO_order_volume', 'cum_buy_aggLimit_onNBBO_order_amount',
                         'cum_sell_aggLimit_onNBBO_order_amount', 'cum_buy_aggLimit_onNBBO_trade_volume', 'cum_sell_aggLimit_onNBBO_trade_volume', 'cum_buy_aggLimit_onNBBO_trade_amount', 'cum_sell_aggLimit_onNBBO_trade_amount',
                         'cum_buy_aggLimit_improveNBBO_order_volume', 'cum_sell_aggLimit_improveNBBO_order_volume', 'cum_buy_aggLimit_improveNBBO_order_amount', 'cum_sell_aggLimit_improveNBBO_order_amount',
                         'cum_buy_aggLimit_improveNBBO_trade_volume', 'cum_sell_aggLimit_improveNBBO_trade_volume', 'cum_buy_aggLimit_improveNBBO_trade_amount', 'cum_sell_aggLimit_improveNBBO_trade_amount']]
            db1.write('md_snapshot_mbd', mbd)
        # A negative `time` sends control to the repair branch; otherwise the
        # frame is simply released.
        try:
            assert(mbd[mbd['time'] < 0].shape[0] == 0)
            del mbd
        except:
            print('negative column!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
            print(s)
            # Rebuild negative `time` values from `datetime`: take the text
            # after the space, drop ':' and '.', parse it as an integer and
            # multiply by 1000.
            mbd.loc[mbd['time'] < 0, 'time'] = mbd[mbd['time'] < 0]['datetime'].astype(str).apply(lambda x: int((x.split(' ')[1].replace(':', "")).replace(".", ""))).astype('int64') * 1000
            assert(mbd[mbd['time'] < 0].shape[0] == 0)
            db1.write('md_snapshot_mbd', mbd)
            del mbd
# Missing (date, symbol) pairs as a DataFrame; the CSV export is disabled.
save = pd.DataFrame(save)
# save.to_csv('/home/work516/Downloads/missing_sec_2020.csv')

20191105


In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Build a MongoDB connection URI and return a ``DBObj`` bound to ``db_name``.

    Args:
        host: MongoDB host (optionally ``host:port``).
        db_name: database to operate on.
        user: user name; ``admin`` and ``root`` authenticate against the
            ``admin`` database, every other user against ``db_name``.
        passwd: password of ``user``.

    Returns:
        A ``DBObj`` connected with the generated URI.
    """
    from urllib.parse import quote_plus

    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Credentials must be percent-escaped inside a MongoDB URI; otherwise
    # characters such as '@', ':', '/' or '%' make the URI invalid.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store and load pandas DataFrames in one MongoDB database.

    A DataFrame is split by (date, symbol); each slice is cut into chunks of
    at most ``chunk_size`` rows, and every chunk is pickled, compressed and
    inserted as one document with the keys ``ver``, ``data``, ``date``,
    ``symbol`` and ``start``.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Connect to ``uri`` and select the database ``db_name``.

        Args:
            uri: MongoDB connection URI.
            symbol_column: name of the DataFrame column holding the symbol id.
            db_name: database that holds the tables (collections).
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000  # maximum number of rows per stored document
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split ``mongodb://user:password@example.com`` into its pieces.

        The scheme is removed and the rest is split on ``:`` and ``@``, so the
        example yields ``['user', 'password', 'example.com']``.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename the collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Replace the stored data of every (date, symbol) slice found in ``df``.

        An empty ``df`` is ignored. For each slice the existing documents are
        deleted first and the slice is then inserted again in chunks.

        Raises:
            Exception: if ``df`` has no ``date`` column.
        """
        if len(df) == 0:
            return
        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if len(df[self.date_column].unique()) > 1:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            date = str(df[self.date_column].iloc[0])
            # Group by the bare column name: grouping by a one-element list
            # yields 1-tuples as keys on recent pandas. The symbol is cast to
            # int exactly as in the multi-date branch, because build_query()
            # always queries with int symbols.
            for symbol, sub_df in df.groupby(self.symbol_column):
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` as chunks of ``chunk_size`` rows.

        Each document stores the serialized chunk together with ``date``,
        ``symbol``, the serialization version and the row offset ``start``.
        """
        version = 1
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            # iloc makes the positional slicing explicit for any index type.
            df_seg = df.iloc[start:end]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter for a date range and/or symbol selection.

        Args:
            start_date, end_date: inclusive bounds given as ``YYYYMMDD``
                string, integer (Python or numpy) or ``datetime.date`` /
                ``datetime.datetime`` (subclasses included).
            symbol: a single symbol, or a list / tuple / numpy array / pandas
                Series of symbols. Falsy values and empty collections add no
                symbol filter.

        Returns:
            The filter dict; empty when no criterion was given.

        Raises:
            Exception: for a date string that is not 8 characters long or a
                date of an unsupported type.
        """
        query = {}

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            if isinstance(x, datetime.date):  # datetime.datetime is a subclass
                return x.strftime("%Y%m%d")
            if hasattr(x, '__index__'):  # Python and numpy integers
                return parse_date(str(x.__index__()))
            raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        # Normalize collections of symbols to a plain list; None means scalar.
        symbols = None
        if isinstance(symbol, (list, tuple)):
            symbols = list(symbol)
        elif getattr(symbol, 'ndim', 0) and hasattr(symbol, 'tolist'):
            symbols = symbol.tolist()  # numpy array / pandas Series

        if symbols is not None:
            if symbols:
                query['symbol'] = {'$in': [int(x) for x in symbols]}
        elif symbol:
            query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the documents matching the criteria.

        Refuses to run (prints a message and returns None) when no criterion
        is given, so the whole table cannot be wiped by accident.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load the matching chunks and return them as one DataFrame.

        Chunks are ordered by (symbol, date, start) before concatenation.
        Returns None when no criterion is given or nothing matches.

        NOTE(review): stored chunks are unpickled, so only read from
        databases whose content is trusted.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        """Return the names of the collections in the database."""
        # collection_names() is gone in PyMongo 4. The class is inspected
        # because attribute access on a Database instance falls back to
        # returning a Collection for unknown names.
        if hasattr(type(self.db), 'list_collection_names'):
            return self.db.list_collection_names()
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values of the matching documents.

        Missing bounds default to ``'00000000'`` and ``'99999999'``.
        """
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` (protocol 4) and compress it.

        Version 1 uses gzip (level 2), version 2 uses lzma (preset 1).

        Raises:
            Exception: for any other version.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of ``ser``: decompress ``s`` and unpickle the result.

        Raises:
            Exception: for an unknown version.
        """
        # NOTE(review): pickle.loads executes arbitrary code from the payload;
        # the stored data must come from a trusted writer.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle(version=None):
    """Register a stand-in ``pandas.core.internals.managers`` module on old pandas.

    When the pandas version is older than 0.24 and the module is not already
    present in ``sys.modules``, a module object exposing ``BlockManager`` is
    registered under that name (presumably so that pickles referencing
    ``pandas.core.internals.managers.BlockManager`` can be loaded; confirm
    against the data producer).

    Args:
        version: version string to evaluate; defaults to ``pd.__version__``.
    """
    import re
    import sys
    from types import ModuleType

    if version is None:
        version = pd.__version__
    # Compare numerically: as plain strings '0.100' sorts before '0.24' and
    # '0.9' sorts after it.
    parsed = re.match(r'(\d+)\.(\d+)', str(version))
    if parsed is None or (int(parsed.group(1)), int(parsed.group(2))) >= (0, 24):
        return

    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m
patch_pandas_pickle()



import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Let pandas display up to 200 columns. The fully qualified option name is
# used because the short pattern "max_columns" can match more than one option
# on newer pandas releases.
pd.set_option("display.max_columns", 200)

# Run configuration: September to December 2018 in the com_md_eq_cn database.
year = "2018"
startDate = '20180901'
endDate = '20181231'
database_name = 'com_md_eq_cn'
user = "zhenyuy"
# NOTE(review): the credential is hard-coded in the notebook; consider loading
# it from the environment or a config file instead.
password = "bnONBrzSMGoE"

db1 = DB("192.168.10.178", database_name, user, password)
# Column-oriented record of every unexplained (date, symbol) gap.
save = {'date': [], 'secid': []}
# The dates on which symbol 2000001 has order data define the dates to scan.
mdOrderLog = db1.read('md_order', start_date=startDate, end_date=endDate, symbol=[2000001])
datelist = mdOrderLog['date'].unique()
# Listing table: '证券代码' is the security code (e.g. ending in 'SZ') and
# '上市日期' the listing date. Codes ending in 'SZ' get the 2000000 offset,
# all others the 1000000 offset; dates become YYYYMMDD integers.
ss = pd.read_csv('/mnt/ShareWithServer/result/shangshi.csv')
ss['skey'] = np.where(ss['证券代码'].str[-2:] == 'SZ', ss['证券代码'].str[:6].astype(int) + 2000000, ss['证券代码'].str[:6].astype(int) + 1000000)
ss['date'] = (ss['上市日期'].str[:4] + ss['上市日期'].str[5:7] + ss['上市日期'].str[8:10]).astype(int)

for d in datelist:
    print(d)
    # Symbols that traded on this date, restricted to skey > 2000000.
    sl1 = db1.read('md_trade', start_date=str(d), end_date=str(d))['skey'].unique()
    sl1 = sl1[sl1 > 2000000]
    for s in sl1:
        mbd = db1.read('md_snapshot_mbd', start_date=str(d), end_date=str(d), symbol=s)
        if mbd is None:
            # A gap on the listing day itself is not recorded. A symbol that
            # is absent from the listing table is recorded as missing instead
            # of aborting the run with an IndexError.
            listing_dates = ss.loc[ss['skey'] == s, 'date']
            if len(listing_dates) > 0 and listing_dates.iloc[0] == d:
                continue
            save['date'].append(d)
            save['secid'].append(s)
            print(s)
            continue
        # Report frames that do not have the 166-column layout.
        if mbd.shape[1] != 166:
            print('mdb data column unupdated')
            print(s)

20180903
20180904
20180905
20180906
20180907
20180910
20180911
20180912
20180913
20180914
20180917
20180918
20180919
20180920
20180921
20180925
20180926
20180927
20180928
20181008
20181009
20181010
20181011
2000034
20181012
20181015
20181016
20181017
20181018
20181019
20181022
20181023
20181024
2002766
2300131
2300167
2300207
2300350
2300532
20181025
20181026
20181029
20181030
20181031
20181101
20181102
20181105
20181106
20181107
20181108
20181109
20181112
20181113
20181114
2002606
20181115
20181116
20181119
20181120
20181121
20181122
20181123
20181126
20181127
20181128
20181129
20181130
20181203
20181204
20181205
20181206
20181207
20181210
20181211
20181212
20181213
20181214
20181217
20181218
20181219
20181220
20181221
20181224
20181225
20181226
20181227
20181228
