In [1]:
import pymongo 
import io 
import pandas as pd 
import pickle 
import datetime 
import time 
import gzip 
import lzma 
import pytz 
import pyarrow as pa 
import pyarrow.parquet as pq 
import numpy as np 
import re

def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)

class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db', version=3): 
        self.db_name = db_name 
        self.uri = uri 
        self.client = pymongo.MongoClient(self.uri) 
        self.db = self.client[self.db_name] 
        self.chunk_size = 20000 
        self.symbol_column = symbol_column 
        self.date_column = 'date' 
        self.version = version

    def parse_uri(self, uri): 
        # mongodb://user:password@example.com 
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}
        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("date must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid date type: " + str(type(x)))
        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)
        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)
        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)
        return query

    def read_tick(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name] 
        query = self.build_query(start_date, end_date, symbol) 
        if not query: 
            print('cannot read the whole table') 
            return None  
        segs = [] 
        for x in collection.find(query): 
            x['data'] = self.deser(x['data'], x['ver']) 
            segs.append(x) 
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start'])) 
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def read_daily(self, table_name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, index_name=None, col=None, return_sdi=True): 
        collection = self.db[table_name]
        # Build projection 
        prj = {'_id': 0} 
        if col is not None: 
            if return_sdi: 
                col = ['skey', 'date', 'index_id'] + col 
            for col_name in col: 
                prj[col_name] = 1 
        # Build query 
        query = {} 
        if skey is not None: 
            query['skey'] = {'$in': skey} 
        if interval is not None: 
            query['interval'] = {'$in': interval} 
        if index_id is not None: 
            query['index_id'] = {'$in': index_id}    
        if index_name is not None:
            n = '' 
            for name in index_name: 
                try: 
                    name = re.compile('[\u4e00-\u9fff]+').findall(name)[0] 
                    if len(n) == 0: 
                        n = n = "|".join(name) 
                    else: 
                        n = n + '|' + "|".join(name) 
                except: 
                    if len(n) == 0: 
                        n = name 
                    else: 
                        n = n + '|' + name 
            query['index_name'] = {'$regex': n}
        if start_date is not None: 
            if end_date is not None: 
                query['date'] = {'$gte': start_date, '$lte': end_date} 
            else: 
                query['date'] = {'$gte': start_date} 
        elif end_date is not None: 
            query['date'] = {'$lte': end_date} 
        # Load data 
        cur = collection.find(query, prj) 
        df = pd.DataFrame.from_records(cur) 
        if df.empty: 
            df = pd.DataFrame() 
        else:
            if 'index_id' in df.columns:
                df = df.sort_values(by=['date', 'index_id', 'skey']).reset_index(drop=True)
            else:
                df = df.sort_values(by=['date','skey']).reset_index(drop=True)
        return df 
 

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = self.version
            ser_data = self.ser(df_seg, version)
            seg = {'ver': version, 'data': ser_data, 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None
        collection.delete_many(query)

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        elif version == 3:
            # 32-bit number needs more space than 64-bit for parquet
            for col_name in s.columns:
                col = s[col_name]
                if col.dtype == np.int32:
                    s[col_name] = s[col_name].astype(np.int64)
                elif col.dtype == np.uint32:
                    s[col_name] = s[col_name].astype(np.uint64)
            tbl = pa.Table.from_pandas(s)
            f = io.BytesIO()
            pq.write_table(tbl, f, use_dictionary=False, compression='ZSTD', compression_level=0)
            f.seek(0)
            data = f.read()
            return data
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        print(version)
        def unpickle(s):
            return pickle.loads(s)
        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        elif version == 3:
            f = io.BytesIO()
            f.write(s)
            f.seek(0)
            return pq.read_table(f, use_threads=False).to_pandas()
        else:
            raise Exception('unknown version')

def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()

In [236]:
database_name = 'com_md_eq_cn'
user = 'zhenyuy'
password = 'bnONBrzSMGoE'

import sys

pd.set_option('max_columns', 200)
db1 = DB("192.168.10.178", database_name, user, password)

pd.set_option('max_rows', 300)
l2 = db1.read_tick('md_snapshot_l2', start_date=20201105, end_date=20201105, symbol=2300062)
l2[(l2['cum_volume'] > 0) & (l2['time'] <= 145655000000) & (l2['ApplSeqNum'] == -1)][['skey', 'date', 'cum_volume', 'prev_close', 'open', 'close', 'cum_trades_cnt', 'bid10p', 'bid9p',
                   'bid8p', 'bid7p', 'bid6p', 'bid5p', 'bid4p', 'bid3p', 'bid2p', 'bid1p', 'ask1p', 'ask2p',
                   'ask3p', 'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p', 'bid10q', 'bid9q', 
                   'bid8q', 'bid7q', 'bid6q', 'bid5q', 'bid4q', 'bid3q', 'bid2q', 'bid1q', 'ask1q', 'ask2q', 'ask3q', 
                   'ask4q', 'ask5q', 'ask6q','ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid10n', 'bid9n', 'bid8n',
                   'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 'ask1n', 'ask2n', 'ask3n', 
                   'ask4n', 'ask5n', 'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'total_bid_quantity', 'total_ask_quantity']]

3


Unnamed: 0,skey,date,cum_volume,prev_close,open,close,cum_trades_cnt,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,total_bid_quantity,total_ask_quantity
3126,2300062,20201105,35157436,17.34,17.34,17.63,42156,17.54,17.55,17.56,17.57,17.58,17.59,17.6,17.61,17.62,17.63,17.64,17.66,17.67,17.68,17.69,17.7,17.71,17.72,17.75,17.76,1500,27300,13300,4300,8400,1100,87000,14000,4100,8200,100,100,15300,10800,11900,3500,2300,2700,8900,9100,4,16,6,3,12,1,68,18,7,4,1,1,10,2,4,4,1,5,3,4,1786900,5727000


In [238]:
mbd =db1.read_tick('md_snapshot_mbd', start_date=20201105, end_date=20201105, symbol=2300062)
mbd[mbd['cum_volume'] == 35157436][['ApplSeqNum', 'skey', 'date', 'cum_volume', 'prev_close', 'open', 'close', 'cum_trades_cnt', 'bid10p', 'bid9p',
                   'bid8p', 'bid7p', 'bid6p', 'bid5p', 'bid4p', 'bid3p', 'bid2p', 'bid1p', 'ask1p', 'ask2p',
                   'ask3p', 'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p', 'bid10q', 'bid9q', 
                   'bid8q', 'bid7q', 'bid6q', 'bid5q', 'bid4q', 'bid3q', 'bid2q', 'bid1q', 'ask1q', 'ask2q', 'ask3q', 
                   'ask4q', 'ask5q', 'ask6q','ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid10n', 'bid9n', 'bid8n',
                   'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 'ask1n', 'ask2n', 'ask3n', 
                   'ask4n', 'ask5n', 'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'total_bid_quantity', 'total_ask_quantity']].tail()

3
3
3
3
3
3
3


Unnamed: 0,ApplSeqNum,skey,date,cum_volume,prev_close,open,close,cum_trades_cnt,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,total_bid_quantity,total_ask_quantity
79892,16630580,2300062,20201105,35157436,17.34,17.34,17.63,42156,17.54,17.55,17.56,17.57,17.58,17.59,17.6,17.61,17.62,17.63,17.64,17.66,17.67,17.68,17.69,17.7,17.71,17.72,17.75,17.76,1500,27300,13300,4300,8400,1100,87000,14000,4100,8200,100,100,15300,10800,11900,3500,2300,2700,8900,9100,4,16,6,3,12,1,68,18,7,4,1,1,10,2,4,4,1,5,3,4,1786900,5727000


In [234]:
order[order['ApplSeqNum'] == 399625]

Unnamed: 0,skey,date,time,clockAtArrival,ApplSeqNum,order_side,order_type,order_price,order_qty
1984,2300062,20201105,93000490000,1604539800490000,399625,1,2,17.0,900


In [233]:
trade[trade['ApplSeqNum'] >= 3361498].head(10)

Unnamed: 0,skey,date,time,clockAtArrival,ApplSeqNum,trade_type,trade_flag,trade_price,trade_qty,BidApplSeqNum,OfferApplSeqNum
10053,2300062,20201105,93955100000,1604540395100000,3361498,4,0,0.0,900,399625,0
10054,2300062,20201105,93955170000,1604540395170000,3361827,1,0,17.61,400,3361826,1815085
10055,2300062,20201105,93955170000,1604540395170000,3361828,1,0,17.61,100,3361826,3352534
10056,2300062,20201105,93955170000,1604540395170000,3361829,1,0,17.61,100,3361826,3358450
10057,2300062,20201105,93955180000,1604540395180000,3361870,1,0,17.6,300,3361869,3361846
10058,2300062,20201105,93955230000,1604540395230000,3362044,4,0,0.0,300,221078,0
10059,2300062,20201105,93955250000,1604540395250000,3362204,4,0,0.0,500,0,612086
10060,2300062,20201105,93955320000,1604540395320000,3362627,1,0,17.6,400,3361869,3362626
10061,2300062,20201105,93955320000,1604540395320000,3362628,1,0,17.59,100,3361006,3362626
10062,2300062,20201105,93955320000,1604540395320000,3362629,1,0,17.57,5900,3343287,3362626


In [192]:
trade = db1.read_tick('md_trade', start_date=20201105, end_date=20201105, symbol=2300062)
order = db1.read_tick('md_order', start_date=20201105, end_date=20201105, symbol=2300062)

3
3
3
3
3
3
3
3
3
3


In [207]:
order[order['ApplSeqNum'] <= 16630580].tail()

Unnamed: 0,skey,date,time,clockAtArrival,ApplSeqNum,order_side,order_type,order_price,order_qty
65291,2300062,20201105,132733720000,1604554053720000,16629584,2,2,17.64,4000
65292,2300062,20201105,132734090000,1604554054090000,16630018,1,2,17.65,100
65293,2300062,20201105,132734110000,1604554054110000,16630050,2,2,17.65,100
65294,2300062,20201105,132734470000,1604554054470000,16630550,1,2,17.65,400
65295,2300062,20201105,132734490000,1604554054490000,16630580,2,2,17.64,1000


In [219]:
trade[trade['ApplSeqNum'] >= 16630581].head(7)

Unnamed: 0,skey,date,time,clockAtArrival,ApplSeqNum,trade_type,trade_flag,trade_price,trade_qty,BidApplSeqNum,OfferApplSeqNum
58632,2300062,20201105,132734490000,1604554054490000,16630581,1,0,17.65,200,16630550,16630580
58633,2300062,20201105,132734490000,1604554054490000,16630582,1,0,17.64,500,16618721,16630580
58634,2300062,20201105,132734490000,1604554054490000,16630583,1,0,17.64,200,16629260,16630580
58635,2300062,20201105,132734490000,1604554054490000,16630584,1,0,17.63,400,12225732,13512672
58636,2300062,20201105,132734490000,1604554054490000,16630585,1,0,17.63,100,12775953,13512672
58637,2300062,20201105,132734490000,1604554054490000,16630586,1,0,17.63,100,12775953,14168417
58638,2300062,20201105,132734680000,1604554054680000,16630778,1,0,17.64,100,16630777,16630580


In [212]:
trade[trade['OfferApplSeqNum'] == 16630580]

Unnamed: 0,skey,date,time,clockAtArrival,ApplSeqNum,trade_type,trade_flag,trade_price,trade_qty,BidApplSeqNum,OfferApplSeqNum
58632,2300062,20201105,132734490000,1604554054490000,16630581,1,0,17.65,200,16630550,16630580
58633,2300062,20201105,132734490000,1604554054490000,16630582,1,0,17.64,500,16618721,16630580
58634,2300062,20201105,132734490000,1604554054490000,16630583,1,0,17.64,200,16629260,16630580
58638,2300062,20201105,132734680000,1604554054680000,16630778,1,0,17.64,100,16630777,16630580


In [223]:
order[order['ApplSeqNum'] == 14168417]

Unnamed: 0,skey,date,time,clockAtArrival,ApplSeqNum,order_side,order_type,order_price,order_qty
59983,2300062,20201105,112613740000,1604546773740000,14168417,2,2,17.28,100


In [222]:
trade[trade['ApplSeqNum'] == 14168417]

Unnamed: 0,skey,date,time,clockAtArrival,ApplSeqNum,trade_type,trade_flag,trade_price,trade_qty,BidApplSeqNum,OfferApplSeqNum


In [170]:
import datetime
da_te = str(db1.read_daily('md_stock_sizefilter', skey=[1688160])['date'].iloc[0])
start = '2020-11-12'
da_te = datetime.datetime.strptime(da_te, "%Y%m%d")
start = datetime.datetime.strptime(start, "%Y-%m-%d")
print(start.weekday() + 1)
print((da_te - start).days)

4
11


In [175]:
db1.read_daily('md_stock_sizefilter', start_date=20201009, skey=[2000939])

Unnamed: 0,skey,date,size_filter
0,2000939,20201109,0.0
1,2000939,20201110,0.0
2,2000939,20201111,0.0
3,2000939,20201112,0.0
4,2000939,20201113,0.0
5,2000939,20201116,0.0
6,2000939,20201117,0.0
7,2000939,20201118,0.0
8,2000939,20201119,0.0
9,2000939,20201120,0.0


In [92]:
# l2 = db1.read_tick('md_snapshot_l2', start_date=20201023, end_date=20201023)
l2[l2['skey'] > 2300001]['skey'].unique()

array([2300002, 2300003, 2300004, 2300005, 2300006, 2300007, 2300008,
       2300009, 2300010, 2300011, 2300012, 2300013, 2300014, 2300015,
       2300016, 2300017, 2300018, 2300019, 2300020, 2300021, 2300022,
       2300023, 2300024, 2300025, 2300026, 2300027, 2300029, 2300030,
       2300031, 2300032, 2300033, 2300034, 2300035, 2300036, 2300037,
       2300038, 2300039, 2300040, 2300041, 2300042, 2300043, 2300044,
       2300045, 2300046, 2300047, 2300048, 2300049, 2300050, 2300051,
       2300052, 2300053, 2300054, 2300055, 2300056, 2300057, 2300058,
       2300059, 2300061, 2300062, 2300063, 2300064, 2300065, 2300066,
       2300067, 2300068, 2300069, 2300070, 2300071, 2300072, 2300073,
       2300074, 2300075, 2300076, 2300077, 2300078, 2300079, 2300080,
       2300081, 2300082, 2300083, 2300084, 2300085, 2300086, 2300087,
       2300088, 2300089, 2300091, 2300092, 2300093, 2300094, 2300095,
       2300096, 2300097, 2300098, 2300099, 2300100, 2300101, 2300102,
       2300103, 2300

In [68]:
order[order['ApplSeqNum'] == 1577824]

Unnamed: 0,skey,date,time,clockAtArrival,ApplSeqNum,order_side,order_type,order_price,order_qty
10557,2000001,20200902,93254190000,1599010374190000,1577824,1,1,0.0,5000


In [71]:
trade[trade['ApplSeqNum'] >= 1577824].head()

Unnamed: 0,skey,date,time,clockAtArrival,ApplSeqNum,trade_type,trade_flag,trade_price,trade_qty,BidApplSeqNum,OfferApplSeqNum
3934,2000001,20200902,93254190000,1599010374190000,1577825,1,0,15.15,3098,1577824,1576852
3935,2000001,20200902,93254190000,1599010374190000,1577826,1,0,15.16,900,1577824,1458521
3936,2000001,20200902,93254190000,1599010374190000,1577827,1,0,15.16,1002,1577824,1465043
3937,2000001,20200902,93254300000,1599010374300000,1578242,4,0,0.0,10000,411570,0
3938,2000001,20200902,93254470000,1599010374470000,1578997,4,0,0.0,900,0,1576967


In [73]:
display(test.shape[0])
display(data1.shape[0])
display(re.shape[0])

153153

153160

306290

In [38]:
data1 = pd.read_pickle('E:\\1202.pkl')
re = pd.merge(test, data1, on=list(data1.columns[~data1.columns.isin(['cum_trades_cnt', 'total_bid_vwap', 'total_ask_vwap', 'ordering'])]), how='outer')

In [57]:
order[order['ApplSeqNum'] == 275221]

Unnamed: 0,skey,date,time,clockAtArrival,ApplSeqNum,order_side,order_type,order_price,order_qty
4446,2000001,20200902,92220310000,1599009740310000,275221,1,2,15.0,400


In [58]:
test[(test['ApplSeqNum'] <= 390835)].tail()

Unnamed: 0,skey,date,time,clockAtArrival,datetime,ordering,ApplSeqNum,bbo_improve,pass_filter,cum_volume,cum_amount,prev_close,open,close,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,bid10qInsert,bid9qInsert,bid8qInsert,bid7qInsert,bid6qInsert,bid5qInsert,bid4qInsert,bid3qInsert,bid2qInsert,bid1qInsert,ask1qInsert,ask2qInsert,ask3qInsert,ask4qInsert,ask5qInsert,ask6qInsert,ask7qInsert,ask8qInsert,ask9qInsert,ask10qInsert,bid10qCancel,bid9qCancel,bid8qCancel,bid7qCancel,bid6qCancel,bid5qCancel,bid4qCancel,bid3qCancel,bid2qCancel,bid1qCancel,ask1qCancel,ask2qCancel,ask3qCancel,ask4qCancel,ask5qCancel,ask6qCancel,ask7qCancel,ask8qCancel,ask9qCancel,ask10qCancel,bid10sCancel,bid9sCancel,bid8sCancel,bid7sCancel,bid6sCancel,bid5sCancel,bid4sCancel,bid3sCancel,bid2sCancel,bid1sCancel,ask1sCancel,ask2sCancel,ask3sCancel,ask4sCancel,ask5sCancel,ask6sCancel,ask7sCancel,ask8sCancel,ask9sCancel,ask10sCancel,total_bid_quantity,total_ask_quantity,total_bid_vwap,total_ask_vwap,total_bid_orders,total_ask_orders,total_bid_levels,total_ask_levels,cum_buy_market_order_volume,cum_sell_market_order_volume,cum_buy_market_order_amount,cum_sell_market_order_amount,cum_buy_market_trade_volume,cum_sell_market_trade_volume,cum_buy_market_trade_amount,cum_sell_market_trade_amount,cum_buy_aggLimit_onNBBO_order_volume,cum_sell_aggLimit_onNBBO_order_volume,cum_buy_aggLimit_onNBBO_order_amount,cum_sell_aggLimit_onNBBO_order_amount,cum_buy_aggLimit_onNBBO_trade_volume,cum_sell_aggLimit_onNBBO_trade_volume,cum_buy_aggLimit_onNBBO_trade_amount,cum_sell_aggLimit_onNBBO_trade_amount,cum_buy_aggLimit_improveNBBO_order_volume,cum_sell_aggLimit_improveNBBO_order_volume,cum_buy_aggLimit_improveNBBO_order_amount,cum_sell_aggLimit_improveNBBO_order_amount,cum_buy_aggLimit_improveNBBO_trade_volume,cum_sell_aggLimit_improveNBBO_trade_volume,cum_buy_aggLimit_improveNBBO_trade_amount,cum_sell_aggLimit_improveNBBO_trade_amount
19,2000001,20200902,93000030000,1599010200030000,2020-09-02 09:30:00.030,20,388884,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,41800,30600,25800,25700,44900,744800,287168,374000,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,21,29,13,23,7,173,39,6,1,1,3,1,2,4,14,10,7,2,2,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5275068,6208519,14.656156,15.849836,1676,2754,128,157,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,60000,0,903800.0,0.0,18700,0,281474.0,0.0
20,2000001,20200902,93000040000,1599010200040000,2020-09-02 09:30:00.040,21,389714,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,41800,30600,25800,25700,44900,744800,287168,374000,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,21,29,13,23,7,173,39,6,1,1,3,1,2,4,14,10,7,2,2,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5275568,6208519,14.656147,15.849836,1677,2754,128,157,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,60000,0,903800.0,0.0,18700,0,281474.0,0.0
21,2000001,20200902,93000050000,1599010200050000,2020-09-02 09:30:00.050,22,390524,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,40800,30600,25800,25700,44900,744800,287168,374000,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,20,29,13,23,7,173,39,6,1,1,3,1,2,4,14,10,7,2,2,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14950.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5274568,6208519,14.656091,15.849836,1676,2754,128,157,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,60000,0,903800.0,0.0,18700,0,281474.0,0.0
22,2000001,20200902,93000050000,1599010200050000,2020-09-02 09:30:00.050,23,390696,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,40800,30600,25800,25700,44900,744800,287168,374000,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,20,29,13,23,7,173,39,6,1,1,3,1,2,4,14,10,7,2,2,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5274568,6209019,14.656091,15.849897,1676,2755,128,157,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,60000,0,903800.0,0.0,18700,0,281474.0,0.0
23,2000001,20200902,93000050000,1599010200050000,2020-09-02 09:30:00.050,24,390835,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,40800,30600,25800,25700,44900,744400,287168,374000,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,20,29,13,23,7,173,39,6,1,1,3,1,2,4,14,10,7,2,2,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,400,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,6000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5274168,6209019,14.656065,15.849897,1675,2755,128,157,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,60000,0,903800.0,0.0,18700,0,281474.0,0.0


In [59]:
test[(test['ApplSeqNum'] >= 390696) & (test['ApplSeqNum'] <= 391011)][list(data1.columns[~data1.columns.isin(['cum_trades_cnt', 'total_bid_vwap', 'total_ask_vwap', 'ordering'])])]

Unnamed: 0,skey,date,time,clockAtArrival,ApplSeqNum,bbo_improve,pass_filter,cum_volume,cum_amount,prev_close,open,close,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,total_bid_quantity,total_ask_quantity,total_bid_orders,total_ask_orders,total_bid_levels,total_ask_levels
22,2000001,20200902,93000050000,1599010200050000,390696,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,40800,30600,25800,25700,44900,744800,287168,374000,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,20,29,13,23,7,173,39,6,1,1,3,1,2,4,14,10,7,2,2,5,5274568,6209019,1676,2755,128,157
23,2000001,20200902,93000050000,1599010200050000,390835,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,40800,30600,25800,25700,44900,744400,287168,374000,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,20,29,13,23,7,173,39,6,1,1,3,1,2,4,14,10,7,2,2,5,5274168,6209019,1675,2755,128,157
24,2000001,20200902,93000050000,1599010200050000,390853,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,40800,30600,25800,25700,44900,744400,287168,374600,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,20,29,13,23,7,173,39,7,1,1,3,1,2,4,14,10,7,2,2,5,5274768,6209019,1676,2755,128,157
25,2000001,20200902,93000050000,1599010200050000,390861,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,40800,30600,25800,25700,44900,744400,287168,374600,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,20,29,13,23,7,173,39,7,1,1,3,1,2,4,14,10,7,2,2,5,5275268,6209019,1677,2755,128,157
26,2000001,20200902,93000050000,1599010200050000,390977,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,40800,30600,25800,25700,44900,744400,287168,374600,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,20,29,13,23,7,173,39,7,1,1,3,1,2,4,14,10,7,2,2,5,5275268,6209519,1677,2756,128,157
27,2000001,20200902,93000050000,1599010200050000,391011,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,40800,30600,25800,25700,44900,744400,287168,374600,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,20,29,13,23,7,173,39,7,1,1,3,1,2,4,14,10,7,2,2,5,5275668,6209519,1678,2756,128,157


In [60]:
data1[(data1['ApplSeqNum'] >= 390696) & (data1['ApplSeqNum'] <= 391011)][list(data1.columns[~data1.columns.isin(['cum_trades_cnt', 'total_bid_vwap', 'total_ask_vwap', 'ordering'])])]

Unnamed: 0,skey,date,time,clockAtArrival,ApplSeqNum,bbo_improve,pass_filter,cum_volume,cum_amount,prev_close,open,close,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,total_bid_quantity,total_ask_quantity,total_bid_orders,total_ask_orders,total_bid_levels,total_ask_levels
22,2000001,20200902,93000050000,1599010200050000,390696,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,40800,30600,25800,25700,44900,744800,287168,374000,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,20,29,13,23,7,173,39,6,1,1,3,1,2,4,14,10,7,2,2,5,5274568,6209019,1676,2755,128,157
23,2000001,20200902,93000050000,1599010200050000,390835,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,40800,30600,25800,25700,44900,744400,287168,374000,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,20,29,13,23,7,172,39,6,1,1,3,1,2,4,14,10,7,2,2,5,5274168,6209019,1675,2755,128,157
24,2000001,20200902,93000050000,1599010200050000,390853,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,40800,30600,25800,25700,44900,744400,287168,374600,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,20,29,13,23,7,172,39,7,1,1,3,1,2,4,14,10,7,2,2,5,5274768,6209019,1676,2755,128,157
25,2000001,20200902,93000050000,1599010200050000,390861,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,40800,30600,25800,25700,44900,744400,287168,374600,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,20,29,13,23,7,172,39,7,1,1,3,1,2,4,14,10,7,2,2,5,5275268,6209019,1677,2755,128,157
26,2000001,20200902,93000050000,1599010200050000,390977,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,40800,30600,25800,25700,44900,744400,287168,374600,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,20,29,13,23,7,172,39,7,1,1,3,1,2,4,14,10,7,2,2,5,5275268,6209519,1677,2756,128,157
27,2000001,20200902,93000050000,1599010200050000,391011,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,40800,30600,25800,25700,44900,744400,287168,374600,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,20,29,13,23,7,172,39,7,1,1,3,1,2,4,14,10,7,2,2,5,5275668,6209519,1678,2756,128,157


In [37]:
list(data1.columns[~data1.columns.isin(['cum_trades_cnt', 'total_bid_vwap', 'total_ask_vwap', 'ordering'])])

['skey',
 'date',
 'time',
 'clockAtArrival',
 'ApplSeqNum',
 'bbo_improve',
 'pass_filter',
 'cum_volume',
 'cum_amount',
 'prev_close',
 'open',
 'close',
 'bid10p',
 'bid9p',
 'bid8p',
 'bid7p',
 'bid6p',
 'bid5p',
 'bid4p',
 'bid3p',
 'bid2p',
 'bid1p',
 'ask1p',
 'ask2p',
 'ask3p',
 'ask4p',
 'ask5p',
 'ask6p',
 'ask7p',
 'ask8p',
 'ask9p',
 'ask10p',
 'bid10q',
 'bid9q',
 'bid8q',
 'bid7q',
 'bid6q',
 'bid5q',
 'bid4q',
 'bid3q',
 'bid2q',
 'bid1q',
 'ask1q',
 'ask2q',
 'ask3q',
 'ask4q',
 'ask5q',
 'ask6q',
 'ask7q',
 'ask8q',
 'ask9q',
 'ask10q',
 'bid10n',
 'bid9n',
 'bid8n',
 'bid7n',
 'bid6n',
 'bid5n',
 'bid4n',
 'bid3n',
 'bid2n',
 'bid1n',
 'ask1n',
 'ask2n',
 'ask3n',
 'ask4n',
 'ask5n',
 'ask6n',
 'ask7n',
 'ask8n',
 'ask9n',
 'ask10n',
 'total_bid_quantity',
 'total_ask_quantity',
 'total_bid_orders',
 'total_ask_orders',
 'total_bid_levels',
 'total_ask_levels']

In [101]:
cols = ['skey', 'date', 'cum_volume', 'prev_close', 'open', 'close', 'cum_trades_cnt', 'bid10p', 'bid9p',
       'bid8p', 'bid7p', 'bid6p', 'bid5p', 'bid4p', 'bid3p', 'bid2p', 'bid1p', 'ask1p', 'ask2p',
       'ask3p', 'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p', 'bid10q', 'bid9q', 
       'bid8q', 'bid7q', 'bid6q', 'bid5q', 'bid4q', 'bid3q', 'bid2q', 'bid1q', 'ask1q', 'ask2q', 'ask3q', 
       'ask4q', 'ask5q', 'ask6q','ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid10n', 'bid9n', 'bid8n',
       'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 'ask1n', 'ask2n', 'ask3n', 
       'ask4n', 'ask5n', 'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'total_bid_quantity', 'total_ask_quantity']
mbd1 = data2.drop_duplicates(cols, keep='first')
mbd = mbd1[cols+['ApplSeqNum']]
if 'ApplSeqNum' in l2.columns:
    l2 = l2[list(l2.columns[l2.columns != 'ApplSeqNum'])]
rl2 = pd.merge(l2, mbd, on=cols, how='left')
try:
    assert(rl2[(rl2['ApplSeqNum'].isnull()) & (rl2['cum_volume'] > 0) & (rl2['time'] <= 145655000000)].shape[0] == 0)
except:
    print(rl2[(rl2['ApplSeqNum'].isnull()) & (rl2['cum_volume'] > 0) & (rl2['time'] <= 145655000000)][['skey', 'date', 'time', 'cum_volume', 'close', 'bid1p', 'bid2p','bid1q', 'bid2q', 'ask1p', 'ask2p', 'ask1q', 'ask2q']])
    print(mbd1.tail(1)[['skey', 'date', 'time', 'cum_volume', 'close', 'bid1p', 'bid2p','bid1q', 'bid2q', 'ask1p', 'ask2p', 'ask1q', 'ask2q']])
rl2[(rl2['ApplSeqNum'].isnull()) & (rl2['cum_volume'] > 0) & (rl2['time'] <= 145655000000)]

Unnamed: 0,skey,date,time,clockAtArrival,ordering,has_missing,cum_trades_cnt,cum_volume,cum_amount,prev_close,open,high,low,close,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,bid1Top1q,bid1Top2q,bid1Top3q,bid1Top4q,bid1Top5q,bid1Top6q,bid1Top7q,bid1Top8q,bid1Top9q,bid1Top10q,bid1Top11q,bid1Top12q,bid1Top13q,bid1Top14q,bid1Top15q,bid1Top16q,bid1Top17q,bid1Top18q,bid1Top19q,bid1Top20q,bid1Top21q,bid1Top22q,bid1Top23q,bid1Top24q,bid1Top25q,bid1Top26q,bid1Top27q,bid1Top28q,bid1Top29q,bid1Top30q,bid1Top31q,bid1Top32q,bid1Top33q,bid1Top34q,bid1Top35q,bid1Top36q,bid1Top37q,bid1Top38q,bid1Top39q,bid1Top40q,bid1Top41q,bid1Top42q,bid1Top43q,bid1Top44q,bid1Top45q,bid1Top46q,bid1Top47q,bid1Top48q,bid1Top49q,bid1Top50q,ask1Top1q,ask1Top2q,ask1Top3q,ask1Top4q,ask1Top5q,ask1Top6q,ask1Top7q,ask1Top8q,ask1Top9q,ask1Top10q,ask1Top11q,ask1Top12q,ask1Top13q,ask1Top14q,ask1Top15q,ask1Top16q,ask1Top17q,ask1Top18q,ask1Top19q,ask1Top20q,ask1Top21q,ask1Top22q,ask1Top23q,ask1Top24q,ask1Top25q,ask1Top26q,ask1Top27q,ask1Top28q,ask1Top29q,ask1Top30q,ask1Top31q,ask1Top32q,ask1Top33q,ask1Top34q,ask1Top35q,ask1Top36q,ask1Top37q,ask1Top38q,ask1Top39q,ask1Top40q,ask1Top41q,ask1Top42q,ask1Top43q,ask1Top44q,ask1Top45q,ask1Top46q,ask1Top47q,ask1Top48q,ask1Top49q,ask1Top50q,total_bid_quantity,total_ask_quantity,total_bid_vwap,total_ask_vwap,total_bid_orders,total_ask_orders,total_bid_levels,total_ask_levels,bid_trade_max_duration,ask_trade_max_duration,cum_canceled_buy_orders,cum_canceled_buy_volume,cum_canceled_buy_amount,cum_canceled_sell_orders,cum_canceled_sell_volume,cum_canceled_sell_amount,ApplSeqNum


In [99]:
data1 = pd.read_pickle('E:\\1202.pkl')
data2 = pd.read_pickle('E:\\1203.pkl')
display(data1.shape[0])
display(data2.shape[0])

32127

32127

In [43]:
test[test['ApplSeqNum'] == 390835][list(data1.columns[data1.columns != 'cum_trades_cnt'])]

Unnamed: 0,skey,date,time,clockAtArrival,ordering,ApplSeqNum,bbo_improve,pass_filter,cum_volume,cum_amount,prev_close,open,close,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,total_bid_quantity,total_ask_quantity,total_bid_vwap,total_ask_vwap,total_bid_orders,total_ask_orders,total_bid_levels,total_ask_levels
23,2000001,20200902,93000050000,1599010200050000,24,390835,0,-1,7241032,108688700.0,15.14,15.01,15.07,14.95,14.96,14.97,14.98,14.99,15.0,15.01,15.02,15.03,15.07,15.08,15.09,15.1,15.12,15.13,15.14,15.15,15.16,15.17,15.18,40800,30600,25800,25700,44900,744400,287168,374000,6300,35000,8500,300,1600,10500,9520,6200,4600,11500,4100,4300,20,29,13,23,7,173,39,6,1,1,3,1,2,4,14,10,7,2,2,5,5274168,6209019,14.656065,15.849897,1675,2755,128,157


In [100]:
pd.merge(data1, data2, on=list(data1.columns[~data1.columns.isin(['total_bid_vwap', 'total_ask_vwap'])]), how='outer')

Unnamed: 0,skey,date,time,clockAtArrival,ordering,ApplSeqNum,bbo_improve,pass_filter,cum_trades_cnt,cum_volume,cum_amount,prev_close,open,close,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,total_bid_quantity,total_ask_quantity,total_bid_vwap_x,total_ask_vwap_x,total_bid_orders,total_ask_orders,total_bid_levels,total_ask_levels,datetime,total_bid_vwap_y,total_ask_vwap_y
0,2300824,20200810,92500000000,1597022700000000,1,429281,1,2,173,168900,5.151450e+06,30.96,30.5,30.50,30.02,30.03,30.08,30.18,30.20,30.21,30.28,30.37,30.42,30.47,30.50,30.58,30.59,30.60,30.61,30.69,30.70,30.71,30.77,30.80,400,200,3700,400,400,700,400,300,500,100,100,2700,2700,9900,2600,1100,300,100,2700,1100,2,2,2,1,2,2,2,1,1,1,1,2,1,7,2,1,1,1,1,3,59400,323358,28.942660,33.022090,103,287,50,115,2020-08-10 09:25:00.000,28.942660,33.022090
1,2300824,20200810,93000020000,1597023000020000,2,433088,0,-1,173,168900,5.151450e+06,30.96,30.5,30.50,30.02,30.03,30.08,30.18,30.20,30.21,30.28,30.37,30.42,30.47,30.50,30.58,30.59,30.60,30.61,30.69,30.70,30.71,30.77,30.80,500,200,3700,400,400,700,400,300,500,100,100,2700,2700,9900,2600,1100,300,100,2700,1100,3,2,2,1,2,2,2,1,1,1,1,2,1,7,2,1,1,1,1,3,59500,323358,28.944471,33.022090,104,287,50,115,2020-08-10 09:30:00.020,28.944471,33.022090
2,2300824,20200810,93000020000,1597023000020000,3,433751,0,-1,173,168900,5.151450e+06,30.96,30.5,30.50,30.02,30.03,30.08,30.18,30.20,30.21,30.28,30.37,30.42,30.47,30.50,30.58,30.59,30.60,30.61,30.69,30.70,30.71,30.77,30.80,400,200,3700,400,400,700,400,300,500,100,100,2700,2700,9900,2600,1100,300,100,2700,1100,2,2,2,1,2,2,2,1,1,1,1,2,1,7,2,1,1,1,1,3,59400,323358,28.942660,33.022090,103,287,50,115,2020-08-10 09:30:00.020,28.942660,33.022090
3,2300824,20200810,93000020000,1597023000020000,4,433821,0,-1,173,168900,5.151450e+06,30.96,30.5,30.50,30.02,30.03,30.08,30.18,30.20,30.21,30.28,30.37,30.42,30.47,30.50,30.58,30.59,30.60,30.61,30.69,30.70,30.71,30.77,30.80,400,200,3700,400,400,700,400,300,500,100,100,2700,2700,9900,2600,1100,300,100,2700,1100,2,2,2,1,2,2,2,1,1,1,1,2,1,7,2,1,1,1,1,3,59700,323358,28.946516,33.022090,104,287,51,115,2020-08-10 09:30:00.020,28.946516,33.022090
4,2300824,20200810,93000020000,1597023000020000,5,433832,1,2,174,169000,5.154497e+06,30.96,30.5,30.47,30.01,30.02,30.03,30.08,30.18,30.20,30.21,30.28,30.37,30.42,30.47,30.50,30.58,30.59,30.60,30.61,30.69,30.70,30.71,30.77,1400,400,200,3700,400,400,700,400,300,500,300,100,2700,2700,9900,2600,1100,300,100,2700,2,2,2,2,1,2,2,2,1,1,1,1,2,1,7,2,1,1,1,1,59600,323658,28.943960,33.019725,103,288,50,116,2020-08-10 09:30:00.020,28.943960,33.019725
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32122,2300824,20200810,145652970000,1597042612970000,32123,24563769,0,-1,16040,9133725,2.680432e+08,30.96,30.5,29.67,29.54,29.55,29.56,29.57,29.58,29.60,29.61,29.62,29.63,29.66,29.67,29.68,29.69,29.70,29.71,29.72,29.73,29.75,29.78,29.80,1500,14100,10100,4100,7000,500,1000,100,3800,100,1200,900,200,1200,100,500,100,700,100,1800,2,10,9,4,3,1,1,1,1,1,1,3,1,2,1,1,1,1,1,4,875551,618668,28.426773,32.137133,1118,677,148,206,2020-08-10 14:56:52.970,28.426773,32.137138
32123,2300824,20200810,145654000000,1597042614000000,32124,24565724,0,-1,16040,9133725,2.680432e+08,30.96,30.5,29.67,29.55,29.56,29.57,29.58,29.60,29.61,29.62,29.63,29.65,29.66,29.67,29.68,29.69,29.70,29.71,29.72,29.73,29.75,29.78,29.80,14100,10100,4100,7000,500,1000,100,3800,400,100,1200,900,200,1200,100,500,100,700,100,1800,10,9,4,3,1,1,1,1,1,1,1,3,1,2,1,1,1,1,1,4,875951,618668,28.427331,32.137133,1119,677,149,206,2020-08-10 14:56:54.000,28.427331,32.137138
32124,2300824,20200810,145654030000,1597042614030000,32125,24565748,1,2,16041,9133825,2.680462e+08,30.96,30.5,29.66,29.54,29.55,29.56,29.57,29.58,29.60,29.61,29.62,29.63,29.65,29.66,29.67,29.68,29.69,29.70,29.71,29.72,29.73,29.75,29.78,1500,14100,10100,4100,7000,500,1000,100,3800,400,400,1200,900,200,1200,100,500,100,700,100,2,10,9,4,3,1,1,1,1,1,1,1,3,1,2,1,1,1,1,1,875851,619068,28.427191,32.135533,1118,678,148,207,2020-08-10 14:56:54.030,28.427191,32.135538
32125,2300824,20200810,145654440000,1597042614440000,32126,24566503,1,2,16042,9134225,2.680580e+08,30.96,30.5,29.66,29.54,29.55,29.56,29.57,29.58,29.60,29.61,29.62,29.63,29.65,29.67,29.68,29.69,29.70,29.71,29.72,29.73,29.75,29.78,29.80,1500,14100,10100,4100,7000,500,1000,100,3800,400,1200,900,200,1200,100,500,100,700,100,1800,2,10,9,4,3,1,1,1,1,1,1,3,1,2,1,1,1,1,1,4,875851,618668,28.427191,32.137133,1118,677,148,206,2020-08-10 14:56:54.440,28.427191,32.137138
