In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Build a ``DBObj`` connected to ``db_name`` on ``host``.

    Args:
        host: MongoDB host (optionally ``host:port``) placed in the URI.
        db_name: Database to open; also used as the authentication database
            unless ``user`` is ``'admin'`` or ``'root'``, in which case the
            ``admin`` database is used for authentication.
        user: User name, given in plain (unescaped) form.
        passwd: Password, given in plain (unescaped) form.

    Returns:
        A ``DBObj`` bound to ``db_name``.
    """
    # Local import so the block does not depend on a file-level import.
    from urllib.parse import quote_plus

    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    # Credentials must be percent-escaped inside a MongoDB URI; otherwise a
    # password containing characters such as '@', ':', '/' or '%' yields an
    # invalid or mis-parsed URI.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(user), quote_plus(passwd), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store and retrieve pandas DataFrames in one MongoDB database.

    Every collection ("table") holds documents with the keys ``date``,
    ``symbol``, ``start``, ``ver`` and ``data``. ``data`` is a compressed
    pickle of at most ``chunk_size`` consecutive DataFrame rows and ``start``
    is the row offset of that segment within its (date, symbol) group.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Connect to ``uri`` and select database ``db_name``.

        Args:
            uri: MongoDB connection URI.
            symbol_column: DataFrame column used to group rows by symbol.
            db_name: Name of the database to operate on.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000  # maximum number of rows per stored segment
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split a ``mongodb://user:password@host`` URI into its parts.

        Returns:
            The list obtained by removing the scheme and surrounding slashes
            and splitting on ``:`` and ``@``.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Replace the stored rows of every (date, symbol) group found in ``df``.

        For each group the existing documents with that date and symbol are
        deleted and the group's rows are written again in segments.

        Args:
            table_name: Target collection.
            df: DataFrame containing the date column and the symbol column.
                An empty frame is a no-op.

        Raises:
            Exception: If ``df`` has no date column.
        """
        if len(df) == 0:
            return
        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        multi_date = len(df[self.date_column].unique()) > 1
        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            date = str(df[self.date_column].iloc[0])
            # Group by the column name itself, not a one-element list: a list
            # key makes newer pandas yield 1-tuples as group keys. Cast to a
            # plain int as the multi-date branch does, so the stored value is
            # BSON-encodable and matches what build_query() searches for.
            for symbol, sub_df in df.groupby(self.symbol_column):
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` as segments of ``chunk_size`` rows.

        Args:
            collection: Target collection object.
            date: Value stored under the ``date`` key.
            symbol: Value stored under the ``symbol`` key.
            df: Rows belonging to this (date, symbol) group.
        """
        version = 1
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            # Positional slice, independent of the frame's index labels.
            df_seg = df.iloc[start:end]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter for a date range and/or symbol selection.

        Args:
            start_date: Inclusive lower bound; an 8-character ``YYYYMMDD``
                string, an integer, or a ``datetime.date``/``datetime``
                (including subclasses such as ``pandas.Timestamp``).
            end_date: Inclusive upper bound, same accepted types.
            symbol: A single symbol or a list/tuple/set of symbols; each is
                converted with ``int``. Falsy values add no symbol filter.

        Returns:
            A dict usable as a ``find``/``delete_many`` filter; empty when no
            argument restricts the query.

        Raises:
            Exception: If a date is a string of the wrong length or has an
                unsupported type.
        """
        import numbers

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            # datetime.datetime is a subclass of datetime.date.
            if isinstance(x, datetime.date):
                return x.strftime("%Y%m%d")
            # numbers.Integral also covers numpy integer scalars; bool is
            # excluded because it is an int subclass but never a date.
            if isinstance(x, numbers.Integral) and not isinstance(x, bool):
                return parse_date(str(int(x)))
            raise Exception("invalid `date` type: " + str(type(x)))

        query = {}

        if start_date is not None or end_date is not None:
            date_range = {}
            if start_date is not None:
                date_range['$gte'] = parse_date(start_date)
            if end_date is not None:
                date_range['$lte'] = parse_date(end_date)
            query['date'] = date_range

        if symbol:
            if isinstance(symbol, (list, tuple, set, frozenset)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the documents matching the given date range / symbols.

        Refuses to run with an empty filter: prints a message and returns
        ``None`` without deleting anything.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load the rows matching the given date range / symbols.

        Returns:
            One DataFrame with all matching segments concatenated in
            (symbol, date, start) order and a fresh index, or ``None`` when
            the filter is empty (after printing a message) or nothing matches.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for doc in collection.find(query):
            doc['data'] = self.deser(doc['data'], doc['ver'])
            segs.append(doc)
        if not segs:
            return None
        segs.sort(key=lambda doc: (doc['symbol'], doc['date'], doc['start']))
        return pd.concat([doc['data'] for doc in segs], ignore_index=True)

    def list_tables(self):
        """Return the names of the collections in the database."""
        try:
            return self.db.list_collection_names()
        except TypeError:
            # On PyMongo releases that predate list_collection_names(), the
            # attribute lookup resolves to a (non-callable) Collection, so the
            # call raises TypeError; fall back to the legacy method there.
            return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values stored for the selection.

        Missing bounds default to ``'00000000'`` and ``'99999999'``.
        """
        collection = self.db[table_name]
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        query = self.build_query(start_date, end_date, symbol)
        dates = {doc['date'] for doc in collection.find(query, {"date": 1, '_id': 0})}
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` and compress it with the codec selected by ``version``.

        Version 1 uses gzip, version 2 uses lzma.

        Raises:
            Exception: If ``version`` is not 1 or 2.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of :meth:`ser`: decompress ``s`` and unpickle it.

        Raises:
            Exception: If ``version`` is not 1 or 2.
        """
        # NOTE(review): pickle.loads executes arbitrary code embedded in the
        # payload; only use this against a database whose content is trusted.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    """Register a stand-in ``pandas.core.internals.managers`` on old pandas.

    On pandas older than 0.24 a module named
    ``pandas.core.internals.managers`` exposing ``BlockManager`` is inserted
    into ``sys.modules`` if it is not already present. On newer pandas, or
    when the version string cannot be parsed, nothing is done.
    """
    # Compare numerically: a plain string comparison orders versions
    # lexicographically (e.g. '0.9' would not be "less than" '0.24').
    try:
        major, minor = (int(part) for part in pd.__version__.split('.')[:2])
    except ValueError:
        return
    if (major, minor) >= (0, 24):
        return

    import sys
    from types import ModuleType
    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m
patch_pandas_pickle()


In [None]:
import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Show wide frames in full. The fully qualified key is used because the bare
# "max_columns" pattern is ambiguous on pandas versions that also define
# "styler.render.max_columns", in which case set_option raises OptionError.
pd.set_option("display.max_columns", 200)

startDate = 20200102
endDate = 20200102
database_name = 'com_md_eq_cn'
# NOTE(review): credentials are hard-coded in the notebook; consider loading
# them from the environment or a config file kept out of version control.
user = "zhenyuy"
password = "bnONBrzSMGoE"

db = DB("192.168.10.178", database_name, user, password)
SH = db.read('md_snapshot_l2', start_date=startDate, end_date=endDate)
if SH is None:
    # read() returns None when no stored segment matches the query.
    raise ValueError('no md_snapshot_l2 data between %s and %s' % (startDate, endDate))

snapshot_cols = ['date', 'skey', 'time', 'cum_volume', 'cum_amount', "close",
                 "bid1p", "bid2p", "bid3p", "bid4p", "bid5p",
                 "bid1q", "bid2q", "bid3q", "bid4q", "bid5q",
                 "ask1p", "ask2p", "ask3p", "ask4p", "ask5p",
                 "ask1q", "ask2q", "ask3q", "ask4q", "ask5q",
                 "open", 'num']

# Row key combining skey and ordering. It is computed once on the full frame,
# before splitting, so no column is ever assigned onto a filtered slice.
SH['num'] = SH['skey'] * 10000 + SH['ordering']

# Split by skey range: above 2000000 goes to SZ, below 2000000 stays in SH.
SZ = SH.loc[SH['skey'] > 2000000, snapshot_cols].copy()
SH = SH.loc[SH['skey'] < 2000000, snapshot_cols].copy()

# Date window as YYYYMMDD strings, compared against the date token taken from
# each log directory name.
startDate = '20200102'
endDate = '20200102'

readPath = '/mnt/e/result/logs_***_zs_92_01_day_data'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(day_dir).split('_')[1] for day_dir in dataPathLs])
in_window = (dateLs >= startDate) & (dateLs <= endDate)
dataPathLs = dataPathLs[in_window]


for day_dir in dataPathLs:
    path1 = np.array(glob.glob(day_dir + '/mdLog_SH_***'))
    SH1 = pd.read_csv(path1[0])
    SH1 = SH1[SH1['source'] == 4]

    SH1['skey'] = SH1['StockID'] + 1000000
    SH1 = SH1.rename(columns={"openPrice": "open"})
    # Once volume has traded, use the per-skey maximum of `open`.
    max_open = SH1.groupby("skey")["open"].transform("max")
    SH1["open"] = np.where(SH1["cum_volume"] > 0, max_open, SH1["open"])
    # Strip ':' and '.' from the time text, then scale the integer by 1000.
    SH1["time"] = SH1["time"].apply(lambda t: int(t.replace(':', "").replace(".", "")) * 1000)

# Columns shared with the database snapshot; they are the merge keys below.
cols = ['skey', 'time', 'cum_volume', 'cum_amount', "close",
        "bid1p", "bid2p", "bid3p", "bid4p", "bid5p",
        "bid1q", "bid2q", "bid3q", "bid4q", "bid5q",
        "ask1p", "ask2p", "ask3p", "ask4p", "ask5p",
        "ask1q", "ask2q", "ask3q", "ask4q", "ask5q",
        "open"]
SH1 = SH1[['clockAtArrival', 'sequenceNo'] + cols]
rounded_cols = ['cum_amount', "close", 'open']
SH1[rounded_cols] = SH1[rounded_cols].round(2)
# Only keep log rows for skeys that also occur in the database frame.
known_skeys = SH['skey'].unique()
SH1 = SH1[SH1['skey'].isin(known_skeys)]
re = pd.merge(SH, SH1, on=cols, how='outer')

# Reconcile the outer merge `re` (database frame SH vs. log frame SH1) into
# `re1`, which is asserted below to have exactly one row per row of SH.

# Rows whose `date` is null. `date` is supplied only by SH in the merge, so
# these rows had no matching SH row.
p21 = re[(re['date'].isnull())][['clockAtArrival', 'sequenceNo', 'skey', 'time', 'cum_volume', 'cum_amount', "close", "bid1p", "bid2p", "bid3p", "bid4p", "bid5p", "bid1q", "bid2q",
           "bid3q", "bid4q", "bid5q", "ask1p", "ask2p", "ask3p", "ask4p", "ask5p", "ask1q", "ask2q", "ask3q",
           "ask4q", "ask5q", "open"]]
# Rows whose `sequenceNo` is null. `sequenceNo` is supplied only by SH1, so
# these rows had no matching SH1 row.
p22 = re[(re['sequenceNo'].isnull())][["skey", "date", "time", 'cum_volume', 'cum_amount', "close", "bid1p", "bid2p", "bid3p", "bid4p", "bid5p", "bid1q", "bid2q",
           "bid3q", "bid4q", "bid5q", "ask1p", "ask2p", "ask3p", "ask4p", "ask5p", "ask1q", "ask2q", "ask3q",
           "ask4q", "ask5q", "open", 'num']]

# Rows matched on both sides, split by whether their `num` occurs more than
# once (p11) or exactly once (p12) among the matched rows.
p11 = re[(~re['sequenceNo'].isnull()) & (~re['date'].isnull())][re[(~re['sequenceNo'].isnull()) 
                                                    & (~re['date'].isnull())]['num'].duplicated(keep=False)]
p12 = re[(~re['sequenceNo'].isnull()) & (~re['date'].isnull())].drop_duplicates(['num'], keep=False)
p11 = p11.sort_values(by=['num', 'sequenceNo'])
display(p11)
# Keep a duplicated row only when its running position within its `num`
# group equals its running position within its `sequenceNo` group.
p11["order1"] = p11.groupby(["num"]).cumcount()
p11["order2"] = p11.groupby(["sequenceNo"]).cumcount()
p11 = p11[p11['order1'] == p11['order2']]
p11.drop(['order1', 'order2'],axis=1,inplace=True)
p1 = pd.concat([p11, p12])
# Attach clockAtArrival/sequenceNo to the unmatched database rows by joining
# on (skey, time) only.
p2 = pd.merge(p22, p21[['skey', 'time', 'clockAtArrival', 'sequenceNo']], on=['skey', 'time'], how='left')
re1 = pd.concat([p1, p2])
re1 = re1.sort_values(by='num')
# seq1: sequenceNo forward-filled within each skey, then back-filled.
# NOTE(review): the trailing .bfill() is applied to the ungrouped result, so
# it can take a value from the following skey - confirm this is intended.
re1['seq1'] = re1.groupby('skey')['sequenceNo'].ffill().bfill()
# skeys present in SH but absent from SH1 get no seq1 at all.
sl = list(set(SH['skey'].unique()) - set(SH1['skey'].unique()))
re1.loc[re1['skey'].isin(sl), 'seq1'] = np.nan
# count1: running position inside each seq1 group; count2: number of distinct
# positions in that group.
re1['count1'] = re1.groupby(['seq1']).cumcount()
re1['count2'] = re1.groupby(['seq1'])['count1'].transform('nunique')
re1['min_seq'] = re1.groupby('skey')['sequenceNo'].transform('min')
# count: count1, except in the group of the skey's smallest sequenceNo, where
# it is shifted by (1 - count2).
re1['count'] = np.where(re1['seq1'] != re1['min_seq'], re1['count1'], re1['count1']+1-re1['count2'])
re1.drop(["min_seq"],axis=1,inplace=True)
re1.drop(["count1"],axis=1,inplace=True)
re1.drop(["count2"],axis=1,inplace=True)
# dup: running position among rows sharing a sequenceNo (0 when null).
# dup1: number of distinct `num` values sharing that sequenceNo (0 when null).
re1['dup'] = np.where(~re1["sequenceNo"].isnull(), re1.groupby(['sequenceNo']).cumcount(), 0)
re1['dup1'] = np.where(~re1["sequenceNo"].isnull(), re1.groupby(['sequenceNo'])['num'].transform('nunique'), 0)
# nan flag: 1 when the row has no sequenceNo or is not the first row of its
# sequenceNo group.
re1['nan'] = np.where((re1['sequenceNo'].isnull()) | (re1['dup'] != 0), 1, 0)
# When one sequenceNo covers several `num` values, only the first row keeps it.
re1.loc[(re1['dup1'] > 1) & (re1['dup'] > 0), 'sequenceNo'] = np.nan
# Sanity checks: the skeys without seq1 are exactly `sl`, and re1 has the
# same number of rows as SH.
assert((len(set(sl) - set(re1[re1['seq1'].isnull()]['skey'].unique())) == 0) & 
           (len(set(re1[re1['seq1'].isnull()]['skey'].unique()) - set(sl)) == 0))
assert(re1.shape[0] == SH.shape[0])

# Percentage of rows in re1 that end up without a sequenceNo. The original
# statement was cut off mid-expression; it is completed here following the
# identical report used for re2 and re3 elsewhere in this notebook.
display('%.2f%%' % (re1[re1['sequenceNo'].isnull()].shape[0]/re1.shape[0] * 100))

In [20]:
# First 100 rows whose sequenceNo is shared by exactly two distinct `num` values.
re1.loc[re1['dup1'] == 2].head(100)

Unnamed: 0,date,skey,time,cum_volume,cum_amount,close,bid1p,bid2p,bid3p,bid4p,bid5p,bid1q,bid2q,bid3q,bid4q,bid5q,ask1p,ask2p,ask3p,ask4p,ask5p,ask1q,ask2q,ask3q,ask4q,ask5q,open,num,clockAtArrival,sequenceNo,seq1,count,dup,dup1,nan
29,20200102.0,1600000,91400000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,16000000000.0,1577928000000000.0,327448.0,327448.0,0.0,0,2.0,0
30,20200102.0,1600000,91400000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,16000000000.0,1577928000000000.0,,327448.0,1.0,1,2.0,1
5195,20200102.0,1600004,91400000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,16000040000.0,1577928000000000.0,327908.0,327908.0,0.0,0,2.0,0
5196,20200102.0,1600004,91400000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,16000040000.0,1577928000000000.0,,327908.0,1.0,1,2.0,1
5216,20200102.0,1600004,91823000000,0,0.0,0.0,17.55,0.0,0.0,0.0,0.0,6800,0,0,0,0,17.55,0.0,0.0,0.0,0.0,6800,1700,0,0,0,0.0,16000040000.0,1577928000000000.0,1032980.0,1032980.0,0.0,0,2.0,0
5217,20200102.0,1600004,91823000000,0,0.0,0.0,17.55,0.0,0.0,0.0,0.0,6800,0,0,0,0,17.55,0.0,0.0,0.0,0.0,6800,1700,0,0,0,0.0,16000040000.0,1577928000000000.0,,1032980.0,1.0,1,2.0,1
10175,20200102.0,1600004,150000000000,17792404,311394000.0,17.52,17.51,17.5,17.49,17.48,17.47,96300,44600,15200,42020,39800,17.52,17.53,17.54,17.55,17.56,88234,29200,37600,25700,13700,17.56,16000050000.0,1577948000000000.0,113621755.0,113621755.0,0.0,0,2.0,0
10176,20200102.0,1600004,150000000000,17792404,311394000.0,17.52,17.51,17.5,17.49,17.48,17.47,96300,44600,15200,42020,39800,17.52,17.53,17.54,17.55,17.56,88234,29200,37600,25700,13700,17.56,16000050000.0,1577948000000000.0,,113621755.0,1.0,1,2.0,1
10267,20200102.0,1600006,91400000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,16000060000.0,1577928000000000.0,328381.0,328381.0,0.0,0,2.0,0
10268,20200102.0,1600006,91400000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,16000060000.0,1577928000000000.0,,328381.0,1.0,1,2.0,1


In [16]:
# Fully qualified option key: the bare 'max_rows' pattern is ambiguous on
# pandas versions that also define 'styler.render.max_rows', in which case
# set_option raises OptionError.
pd.set_option('display.max_rows', 200)
# First 100 log rows for skey 1600000.
SH1[(SH1['skey'] == 1600000)].head(100)

Unnamed: 0,clockAtArrival,sequenceNo,skey,time,cum_volume,cum_amount,close,bid1p,bid2p,bid3p,bid4p,bid5p,bid1q,bid2q,bid3q,bid4q,bid5q,ask1p,ask2p,ask3p,ask4p,ask5p,ask1q,ask2q,ask3q,ask4q,ask5q,open
28666,1577925915026470,63655,1600000,84506000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0
33403,1577925975115012,72764,1600000,84606000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0
38115,1577926034976420,81848,1600000,84706000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0
42819,1577926094906958,90924,1600000,84806000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0
47554,1577926154909240,100031,1600000,84906000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0
52290,1577926214908206,109139,1600000,85006000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0
57025,1577926274908715,118246,1600000,85106000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0
61773,1577926334929346,127366,1600000,85206000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0
66495,1577926394909108,136460,1600000,85307000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0
71229,1577926454910734,145566,1600000,85406000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0


In [3]:
# Number of rows in re1 without a sequenceNo.
len(re1[re1['sequenceNo'].isnull()])

97709

In [10]:
# First 10 rows whose `num` key is at least 16000000060.
re1.loc[re1['num'] >= 16000000060].head(10)

Unnamed: 0,date,skey,time,cum_volume,cum_amount,close,bid1p,bid2p,bid3p,bid4p,bid5p,bid1q,bid2q,bid3q,bid4q,bid5q,ask1p,ask2p,ask3p,ask4p,ask5p,ask1q,ask2q,ask3q,ask4q,ask5q,open,num,clockAtArrival,sequenceNo,seq1,count,dup,dup1,nan
59,20200102.0,1600000,91724000000,0,0.0,0.0,12.44,0.0,0.0,0.0,0.0,161429,21771,0,0,0,12.44,0.0,0.0,0.0,0.0,161429,0,0,0,0,0.0,16000000000.0,1577928000000000.0,973927.0,973927.0,0.0,0,1.0,0
1,20200102.0,1600000,91724000000,0,0.0,0.0,12.44,0.0,0.0,0.0,0.0,161429,22571,0,0,0,12.44,0.0,0.0,0.0,0.0,161429,0,0,0,0,0.0,16000000000.0,,,973927.0,1.0,0,0.0,1
61,20200102.0,1600000,91730000000,0,0.0,0.0,12.44,0.0,0.0,0.0,0.0,161429,22971,0,0,0,12.44,0.0,0.0,0.0,0.0,161429,0,0,0,0,0.0,16000000000.0,1577928000000000.0,979520.0,979520.0,0.0,0,1.0,0
62,20200102.0,1600000,91739000000,0,0.0,0.0,12.44,0.0,0.0,0.0,0.0,161429,29171,0,0,0,12.44,0.0,0.0,0.0,0.0,161429,0,0,0,0,0.0,16000000000.0,1577928000000000.0,987992.0,987992.0,0.0,0,1.0,0
63,20200102.0,1600000,91742000000,0,0.0,0.0,12.44,0.0,0.0,0.0,0.0,161429,27371,0,0,0,12.44,0.0,0.0,0.0,0.0,161429,0,0,0,0,0.0,16000000000.0,1577928000000000.0,991515.0,991515.0,0.0,0,1.0,0
64,20200102.0,1600000,91745000000,0,0.0,0.0,12.44,0.0,0.0,0.0,0.0,162429,26371,0,0,0,12.44,0.0,0.0,0.0,0.0,162429,0,0,0,0,0.0,16000000000.0,1577928000000000.0,994432.0,994432.0,0.0,0,1.0,0
65,20200102.0,1600000,91748000000,0,0.0,0.0,12.44,0.0,0.0,0.0,0.0,162429,46471,0,0,0,12.44,0.0,0.0,0.0,0.0,162429,0,0,0,0,0.0,16000000000.0,1577928000000000.0,996824.0,996824.0,0.0,0,1.0,0
66,20200102.0,1600000,91751000000,0,0.0,0.0,12.44,0.0,0.0,0.0,0.0,162629,50371,0,0,0,12.44,0.0,0.0,0.0,0.0,162629,0,0,0,0,0.0,16000000000.0,1577928000000000.0,1000322.0,1000322.0,0.0,0,1.0,0
67,20200102.0,1600000,91754000000,0,0.0,0.0,12.44,0.0,0.0,0.0,0.0,162629,50471,0,0,0,12.44,0.0,0.0,0.0,0.0,162629,0,0,0,0,0.0,16000000000.0,1577928000000000.0,1003317.0,1003317.0,0.0,0,1.0,0
68,20200102.0,1600000,91803000000,0,0.0,0.0,12.45,0.0,0.0,0.0,0.0,163400,0,0,0,0,12.45,0.0,0.0,0.0,0.0,163400,73130,0,0,0,0.0,16000000000.0,1577928000000000.0,1012705.0,1012705.0,0.0,0,1.0,0


In [None]:
# First 100 reconciled rows for skey 1600000.
re1.loc[re1['skey'] == 1600000].head(100)

In [None]:
# Date window as YYYYMMDD strings, compared against the date token taken from
# each log directory name.
startDate = '20200221'
endDate = '20200221'

readPath = '/mnt/e/result/logs_***_zs_92_01_day_data'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(day_dir).split('_')[1] for day_dir in dataPathLs])
in_window = (dateLs >= startDate) & (dateLs <= endDate)
dataPathLs = dataPathLs[in_window]


for day_dir in dataPathLs:
    path1 = np.array(glob.glob(day_dir + '/mdLog_SZ_***'))
    SZ1 = pd.read_csv(path1[0])
    SZ1 = SZ1[SZ1['source'] == 4]

    SZ1['skey'] = SZ1['StockID'] + 2000000
    SZ1 = SZ1.rename(columns={"openPrice": "open"})
    # Once volume has traded, use the per-skey maximum of `open`.
    max_open = SZ1.groupby("skey")["open"].transform("max")
    SZ1["open"] = np.where(SZ1["cum_volume"] > 0, max_open, SZ1["open"])
    # Strip ':' and '.' from the time text, then scale the integer by 1000.
    SZ1["time"] = SZ1["time"].apply(lambda t: int(t.replace(':', "").replace(".", "")) * 1000)

# Columns shared with the database snapshot; they are the merge keys below.
cols = ['skey', 'time', 'cum_volume', 'cum_amount', "close",
        "bid1p", "bid2p", "bid3p", "bid4p", "bid5p",
        "bid1q", "bid2q", "bid3q", "bid4q", "bid5q",
        "ask1p", "ask2p", "ask3p", "ask4p", "ask5p",
        "ask1q", "ask2q", "ask3q", "ask4q", "ask5q",
        "open"]
SZ1 = SZ1[['clockAtArrival', 'sequenceNo'] + cols]
SZ1['cum_amount'] = SZ1['cum_amount'].round(2)
# Only keep log rows for skeys that also occur in the database frame.
known_skeys = SZ['skey'].unique()
SZ1 = SZ1[SZ1['skey'].isin(known_skeys)]
re = pd.merge(SZ, SZ1, on=cols, how='outer')

# Row counts: merged total, rows with a log match, rows with a database
# match, and the two inputs.
display(re.shape[0])
display(re[~re['sequenceNo'].isnull()].shape[0])
display(re[~re['date'].isnull()].shape[0])
display(SZ.shape[0])
display(SZ1.shape[0])

# The database is complete when every merged row carries a `date`, i.e. no
# row exists only in the log. An explicit test replaces the former
# assert-inside-bare-except, which hid unrelated errors and would be
# skipped entirely when Python runs with assertions disabled.
total_rows = re.shape[0]
rows_in_db = re[~re['date'].isnull()].shape[0]
if total_rows == rows_in_db:
    print('SZ lv2 is complete')
else:
    display('%.2f%%' % (rows_in_db/total_rows * 100))
    # Fall back to a left merge so only database rows are kept.
    re = pd.merge(SZ, SZ1, on=cols, how='left')
    print('SZ lv2 is incomplete')

# Reconcile the merge `re` (database frame SZ vs. log frame SZ1) into `re2`,
# which is asserted below to have exactly one row per row of SZ.
#
# The two original branches ran the same reconciliation and differed only in
# the duplicate-resolution step, so that step is now the only conditional part.
if re.duplicated('num', keep=False).any():
    # Copy so the helper columns are not assigned onto a filtered slice.
    p1 = re[re['num'].duplicated(keep=False)].copy()
    p2 = re.drop_duplicates(['num'], keep=False)
    # Keep a duplicated row only when its running position within its `num`
    # group equals its running position within its `sequenceNo` group.
    p1["order1"] = p1.groupby(["num"]).cumcount()
    p1["order2"] = p1.groupby(["sequenceNo"]).cumcount()
    p1 = p1[p1['order1'] == p1['order2']]
    p1 = p1.drop(['order1', 'order2'], axis=1)
    re = pd.concat([p1, p2])

re2 = re.sort_values(by='num')
# seq1: sequenceNo back-filled within each skey, then forward-filled.
# NOTE(review): the trailing .ffill() is applied to the ungrouped result, so
# it can take a value from the preceding skey - confirm this is intended.
re2['seq1'] = re2.groupby('skey')['sequenceNo'].bfill().ffill()
# skeys present in SZ but absent from SZ1 get no seq1 at all.
sl = list(set(SZ['skey'].unique()) - set(SZ1['skey'].unique()))
re2.loc[re2['skey'].isin(sl), 'seq1'] = np.nan
# count1: running position inside each seq1 group; count2: number of distinct
# positions in that group.
re2['count1'] = re2.groupby(['seq1']).cumcount()
re2['count2'] = re2.groupby(['seq1'])['count1'].transform('nunique')
re2['max_seq'] = re2.groupby('skey')['sequenceNo'].transform('max')
# count: offset (count1 + 1 - count2), except for rows without a sequenceNo
# in the group of the skey's largest sequenceNo, which use (count1 - count2).
re2['count'] = np.where((re2['seq1'] != re2['max_seq'])|(~re2["sequenceNo"].isnull()), re2['count1'] + 1 - re2['count2'], re2['count1'] - re2['count2'])
re2.drop(["max_seq", "count1", "count2"], axis=1, inplace=True)
# dup: running position among rows sharing a sequenceNo (0 when null).
# dup1: number of distinct `num` values sharing that sequenceNo (0 when null).
re2['dup'] = np.where(~re2["sequenceNo"].isnull(), re2.groupby(['sequenceNo']).cumcount(), 0)
re2['dup1'] = np.where(~re2["sequenceNo"].isnull(), re2.groupby(['sequenceNo'])['num'].transform('nunique'), 0)
# nan flag: 1 when the row has no sequenceNo or is not the first row of its
# sequenceNo group.
re2['nan'] = np.where((re2['sequenceNo'].isnull()) | (re2['dup'] != 0), 1, 0)
# When one sequenceNo covers several `num` values, rows with a negative
# count lose it.
re2.loc[(re2['dup1'] > 1) & (re2['count'] < 0), 'sequenceNo'] = np.nan
# Sanity checks: the skeys without seq1 are exactly `sl`, and re2 has the
# same number of rows as SZ.
assert((len(set(sl) - set(re2[re2['seq1'].isnull()]['skey'].unique())) == 0) &
       (len(set(re2[re2['seq1'].isnull()]['skey'].unique()) - set(sl)) == 0))
assert(re2.shape[0] == SZ.shape[0])

# Percentage of rows in re2 that end up without a sequenceNo.
display('%.2f%%' % (re2[re2['sequenceNo'].isnull()].shape[0]/re2.shape[0] * 100))
    

In [2]:
import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Fully qualified option key: the bare "max_columns" pattern is ambiguous on
# pandas versions that also define "styler.render.max_columns", in which case
# set_option raises OptionError.
pd.set_option("display.max_columns", 200)

startDate = 20200430
endDate = 20200430
database_name = 'com_md_eq_cn'
# NOTE(review): credentials are hard-coded in the notebook; consider loading
# them from the environment or a config file kept out of version control.
user = "zhenyuy"
password = "bnONBrzSMGoE"

# Log rows that turn out to be missing from the database are collected here.
new_trade_data = []

db = DB("192.168.10.178", database_name, user, password)
trade = db.read('md_trade', start_date=startDate, end_date=endDate)
if trade is None:
    # read() returns None when no stored segment matches the query.
    raise ValueError('no md_trade data between %s and %s' % (startDate, endDate))
trade = trade[['skey', 'date', 'ApplSeqNum']]

# Date window as YYYYMMDD strings, compared against the date token taken from
# each log directory name.
startDate = '20200430'
endDate = '20200430'

readPath = '/mnt/e/result/logs_***_zs_92_01_day_data'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i).split('_')[1] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
for n in range(len(dataPathLs)):
    path1 = np.array(glob.glob(dataPathLs[n] + '/mdTradeLog***'))
    trade1 = pd.read_csv(path1[0])
# exchId 2 maps to the 2000000 skey range, everything else to 1000000.
trade1['skey'] = np.where(trade1['exchId'] == 2, trade1['SecurityID'] + 2000000, trade1['SecurityID'] + 1000000)
# Only keep log rows for skeys that also occur in the database frame.
trade1 = trade1[trade1['skey'].isin(trade['skey'].unique())]
re = pd.merge(trade, trade1[['skey', 'ApplSeqNum', 'sequenceNo', 'clockAtArrival']], on=['skey', 'ApplSeqNum'],
             how='outer')
# The database is complete when the outer merge added no rows beyond those of
# `trade`. An explicit test replaces the former assert-inside-bare-except,
# which hid unrelated errors and would be skipped entirely when Python runs
# with assertions disabled.
if re.shape[0] == trade.shape[0]:
    display('trade data is complete')
    k = 0
else:
    display('%.2f%%' % (trade.shape[0]/re.shape[0] * 100))
    k = 1
    display('trade data incomplete')
    # Full log records for the (skey, ApplSeqNum) pairs that have no `date`,
    # i.e. that matched no database row.
    k1 = pd.merge(trade1, re[re['date'].isnull()][['skey', 'ApplSeqNum']], on=['skey', 'ApplSeqNum'], how='right')
    display(k1.shape[0])
    display(k1['ExecType'].unique())
    display(k1['TransactTime'].unique())
    k1['date'] = trade['date'].iloc[0]
    new_trade_data += [k1[['clockAtArrival', 'sequenceNo', 'TransactTime', 'ApplSeqNum', 'date', 'skey', 'ExecType', 'TradeBSFlag',
                           'TradePrice', 'TradeQty', 'BidApplSeqNum', 'OfferApplSeqNum']]]
    # Redo the merge keeping database rows only.
    re = pd.merge(trade, trade1[['skey', 'ApplSeqNum', 'sequenceNo', 'clockAtArrival']], on=['skey', 'ApplSeqNum'],
                  how='left')
    assert(re.shape[0] == trade.shape[0])

# Reconcile the merge `re` (database frame `trade` vs. log frame `trade1`)
# into `re3`, ordered by (skey, ApplSeqNum).
re3 = re.sort_values(by=['skey', 'ApplSeqNum'])
# seq1: sequenceNo forward-filled within each skey, then back-filled.
# NOTE(review): the trailing .bfill() is applied to the ungrouped result, so
# it can take a value from the following skey - confirm this is intended.
re3['seq1'] = re3.groupby('skey')['sequenceNo'].ffill().bfill()
# skeys present in `trade` but absent from `trade1` get no seq1 at all.
sl = list(set(trade['skey'].unique()) - set(trade1['skey'].unique()))
re3.loc[re3['skey'].isin(sl), 'seq1'] = np.nan
# count1: running position inside each seq1 group; count2: number of distinct
# positions in that group.
re3['count1'] = re3.groupby(['seq1']).cumcount()
re3['count2'] = re3.groupby(['seq1'])['count1'].transform('nunique')
re3['min_seq'] = re3.groupby('skey')['sequenceNo'].transform('min')
# count: count1, except in the group of the skey's smallest sequenceNo, where
# it is shifted by (1 - count2).
re3['count'] = np.where(re3['seq1'] != re3['min_seq'], re3['count1'], re3['count1']+1-re3['count2'])
re3.drop(["min_seq"],axis=1,inplace=True)
re3.drop(["count1"],axis=1,inplace=True)
re3.drop(["count2"],axis=1,inplace=True)
# dup: running position among rows sharing a sequenceNo (0 when null).
# dup1: number of distinct ApplSeqNum values sharing that sequenceNo (0 when null).
re3['dup'] = np.where(~re3["sequenceNo"].isnull(), re3.groupby(['sequenceNo']).cumcount(), 0)
re3['dup1'] = np.where(~re3["sequenceNo"].isnull(), re3.groupby(['sequenceNo'])['ApplSeqNum'].transform('nunique'), 0)
# nan flag: 1 when the row has no sequenceNo or is not the first row of its
# sequenceNo group.
re3['nan'] = np.where((re3['sequenceNo'].isnull()) | (re3['dup'] != 0), 1, 0)
# When one sequenceNo covers several ApplSeqNum values, only the first row
# keeps it.
re3.loc[(re3['dup1'] > 1) & (re3['dup'] > 0), 'sequenceNo'] = np.nan
# Sanity checks: the skeys without seq1 are exactly `sl`, and re3 has the
# same number of rows as `trade`.
assert((len(set(sl) - set(re3[re3['seq1'].isnull()]['skey'].unique())) == 0) & 
       (len(set(re3[re3['seq1'].isnull()]['skey'].unique()) - set(sl)) == 0))
assert(re3.shape[0] == trade.shape[0])
# k == 1 means log rows were missing from the database; append them with
# seq1 equal to their own sequenceNo, count 0, nan 0 and dup1 1.
if k == 1:
    k1['seq1'] = k1['sequenceNo']
    k1['count'] = 0
    k1['nan'] = 0
    k1['dup1'] = 1
    re3 = pd.concat([re3, k1[['clockAtArrival', 'date', 'sequenceNo', 'skey', 'ApplSeqNum', 'seq1', 
                              'count', 'nan', 'dup1']]])

# Percentage of rows in re3 that end up without a sequenceNo.
display('%.2f%%' % (re3[re3['sequenceNo'].isnull()].shape[0]/re3.shape[0] * 100))

  interactivity=interactivity, compiler=compiler, result=result)


'trade data is complete'

'0.22%'

In [4]:
# All reconciled trade rows for skey 2002504.
re3.loc[re3['skey'] == 2002504]

Unnamed: 0,skey,date,ApplSeqNum,sequenceNo,clockAtArrival,seq1,count,dup,dup1,nan
33589207,2002504,20200430,38201,415653.0,1.588209e+15,415653.0,-1.0,0,1.0,0
33589208,2002504,20200430,103751,,,415653.0,0.0,0,0.0,1
33589209,2002504,20200430,118743,745846.0,1.588209e+15,745846.0,0.0,0,1.0,0
33589210,2002504,20200430,118746,745878.0,1.588209e+15,745878.0,0.0,0,1.0,0
33589211,2002504,20200430,119060,748176.0,1.588209e+15,748176.0,0.0,0,1.0,0
...,...,...,...,...,...,...,...,...,...,...
33601660,2002504,20200430,18160566,117172178.0,1.588230e+15,117172178.0,0.0,0,1.0,0
33601661,2002504,20200430,18160567,117172179.0,1.588230e+15,117172179.0,0.0,0,1.0,0
33601662,2002504,20200430,18160568,117172180.0,1.588230e+15,117172180.0,0.0,0,1.0,0
33601663,2002504,20200430,18160569,117172181.0,1.588230e+15,117172181.0,0.0,0,1.0,0


In [3]:
# All reconciled trade rows anchored to sequence number 415653.
re3.loc[re3['seq1'] == 415653.0]

Unnamed: 0,skey,date,ApplSeqNum,sequenceNo,clockAtArrival,seq1,count,dup,dup1,nan
33589207,2002504,20200430,38201,415653.0,1588209000000000.0,415653.0,-1.0,0,1.0,0
33589208,2002504,20200430,103751,,,415653.0,0.0,0,0.0,1


In [None]:
startDate = 20200221
endDate = 20200221
database_name = 'com_md_eq_cn'
# NOTE(review): credentials are hard-coded in the notebook.
user = "zhenyuy"
password = "bnONBrzSMGoE"

db = DB("192.168.10.178", database_name, user, password)
order_key_cols = ['skey', 'date', 'ApplSeqNum']
order = db.read('md_order', start_date=startDate, end_date=endDate)[order_key_cols]

# Date window as YYYYMMDD strings, compared against the date token taken from
# each log directory name.
startDate = '20200221'
endDate = '20200221'

# Log rows that turn out to be missing from the database are collected here.
new_order_data = []

readPath = '/mnt/e/result/logs_***_zs_92_01_day_data'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(day_dir).split('_')[1] for day_dir in dataPathLs])
in_window = (dateLs >= startDate) & (dateLs <= endDate)
dataPathLs = dataPathLs[in_window]
for day_dir in dataPathLs:
    path1 = np.array(glob.glob(day_dir + '/mdOrderLog***'))
    order1 = pd.read_csv(path1[0])
order1['skey'] = order1['SecurityID'] + 2000000
# Only keep log rows for skeys that also occur in the database frame.
known_skeys = order['skey'].unique()
order1 = order1[order1['skey'].isin(known_skeys)]
log_cols = ['skey', 'ApplSeqNum', 'sequenceNo', 'clockAtArrival']
re = pd.merge(order, order1[log_cols], on=['skey', 'ApplSeqNum'], how='outer')
# The database is complete when the outer merge added no rows beyond those of
# `order`. An explicit test replaces the former assert-inside-bare-except,
# which hid unrelated errors and would be skipped entirely when Python runs
# with assertions disabled.
if re.shape[0] == order.shape[0]:
    display('order data is complete')
    k = 0
else:
    display('%.2f%%' % (order.shape[0]/re.shape[0] * 100))
    k = 1
    display('order data incomplete')
    # Full log records for the (skey, ApplSeqNum) pairs that have no `date`,
    # i.e. that matched no database row.
    k2 = pd.merge(order1, re[re['date'].isnull()][['skey', 'ApplSeqNum']], on=['skey', 'ApplSeqNum'], how='right')
    display(k2.shape[0])
    display(k2['SecurityID'].unique())
    display(k2['TransactTime'].unique())
    k2['date'] = order['date'].iloc[0]
    new_order_data += [k2[['clockAtArrival', 'sequenceNo', 'TransactTime', 'ApplSeqNum', 'date', 'skey', 'Side',
                           'OrderType', 'Price', 'OrderQty']]]
    # Redo the merge keeping database rows only.
    re = pd.merge(order, order1[['skey', 'ApplSeqNum', 'sequenceNo', 'clockAtArrival']], on=['skey', 'ApplSeqNum'],
                  how='left')
    assert(re.shape[0] == order.shape[0])

# Reconcile the merge `re` (database frame `order` vs. log frame `order1`)
# into `re4`, ordered by (skey, ApplSeqNum).
re4 = re.sort_values(by=['skey', 'ApplSeqNum'])
# seq1: sequenceNo back-filled within each skey, then forward-filled.
# NOTE(review): the trailing .ffill() is applied to the ungrouped result, so
# it can take a value from the preceding skey - confirm this is intended.
re4['seq1'] = re4.groupby('skey')['sequenceNo'].bfill().ffill()
# skeys present in `order` but absent from `order1` get no seq1 at all.
sl = list(set(order['skey'].unique()) - set(order1['skey'].unique()))
re4.loc[re4['skey'].isin(sl), 'seq1'] = np.nan
# count1: running position inside each seq1 group; count2: number of distinct
# positions in that group.
re4['count1'] = re4.groupby(['seq1']).cumcount()
re4['count2'] = re4.groupby(['seq1'])['count1'].transform('nunique')
re4['max_seq'] = re4.groupby('skey')['sequenceNo'].transform('max')
# count: offset (count1 + 1 - count2), except for rows without a sequenceNo
# in the group of the skey's largest sequenceNo, which use (count1 - count2).
re4['count'] = np.where((re4['seq1'] != re4['max_seq'])|(~re4['sequenceNo'].isnull()), re4['count1'] + 1 - re4['count2'], re4['count1'] - re4['count2'])
re4.drop(["max_seq"],axis=1,inplace=True)
re4.drop(["count1"],axis=1,inplace=True)
re4.drop(["count2"],axis=1,inplace=True)
# dup: running position among rows sharing a sequenceNo (0 when null).
# dup1: number of distinct ApplSeqNum values sharing that sequenceNo (0 when null).
re4['dup'] = np.where(~re4["sequenceNo"].isnull(), re4.groupby(['sequenceNo']).cumcount(), 0)
re4['dup1'] = np.where(~re4["sequenceNo"].isnull(), re4.groupby(['sequenceNo'])['ApplSeqNum'].transform('nunique'), 0)
# nan flag: 1 when the row has no sequenceNo or is not the first row of its
# sequenceNo group.
re4['nan'] = np.where((re4['sequenceNo'].isnull()) | (re4['dup'] != 0), 1, 0)
# When one sequenceNo covers several ApplSeqNum values, rows with a negative
# count lose it.
re4.loc[(re4['dup1'] > 1) & (re4['count'] < 0), 'sequenceNo'] = np.nan
# Sanity checks: the skeys without seq1 are exactly `sl`, and re4 has the
# same number of rows as `order`.
assert((len(set(sl) - set(re4[re4['seq1'].isnull()]['skey'].unique())) == 0) &
       (len(set(re4[re4['seq1'].isnull()]['skey'].unique()) - set(sl)) == 0))
assert(re4.shape[0] == order.shape[0])
# k == 1 means log rows were missing from the database; append them with
# count 0, nan 0 and dup1 1.
if k == 1:
    # seq1 carries sequenceNo values for every other row of re4, so the
    # appended rows use their own sequenceNo as well (this previously copied
    # ApplSeqNum, mixing two unrelated numbering schemes in one column).
    k2['seq1'] = k2['sequenceNo']
    k2['count'] = 0
    k2['nan'] = 0
    k2['dup1'] = 1
    re4 = pd.concat([re4, k2[['clockAtArrival', 'date', 'sequenceNo', 'skey', 'ApplSeqNum', 'seq1', 'count', 'nan', 'dup1']]])


# Percentage of rows in re4 that end up without a sequenceNo.
display('%.2f%%' % (re4[re4['sequenceNo'].isnull()].shape[0]/re4.shape[0] * 100))

In [None]:
startDate = 20200221
endDate = 20200221
database_name = 'com_md_eq_cn'
# NOTE(review): credentials are hard-coded in the notebook.
user = "zhenyuy"
password = "bnONBrzSMGoE"

db = DB("192.168.10.178", database_name, user, password)
index = db.read('md_index', start_date=startDate, end_date=endDate)

# Date window as YYYYMMDD strings, compared against the date token taken from
# each log directory name.
startDate = '20200221'
endDate = '20200221'

readPath = '/mnt/e/result/logs_***_zs_92_01_day_data'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(day_dir).split('_')[1] for day_dir in dataPathLs])
in_window = (dateLs >= startDate) & (dateLs <= endDate)
dataPathLs = dataPathLs[in_window]


for day_dir in dataPathLs:
    path1 = np.array(glob.glob(day_dir + '/mdLog_SH_***'))
    index1 = pd.read_csv(path1[0])
    # Keep only the four StockIDs of interest.
    index1 = index1[index1['StockID'].isin([16, 300, 852, 905])]

    index1['skey'] = index1['StockID'] + 1000000
    index1 = index1.rename(columns={"openPrice": "open"})
    # Once volume has traded, use the per-skey maximum of `open`.
    max_open = index1.groupby("skey")["open"].transform("max")
    index1["open"] = np.where(index1["cum_volume"] > 0, max_open, index1["open"])
    # `close` is zeroed while no volume has traded.
    no_volume = index1['cum_volume'] == 0
    index1['close'] = np.where(no_volume, 0, index1['close'])
    # Strip ':' and '.' from the time text, then scale the integer by 1000.
    index1["time"] = index1["time"].apply(lambda t: int(t.replace(':', "").replace(".", "")) * 1000)

# Same `close` rule on the database side, plus the skey/ordering row key.
index['close'] = np.where(index['cum_volume'] == 0, 0, index['close'])
index['num'] = index['skey'] * 10000 + index['ordering']

# Columns shared by both frames; they are the merge keys below.
cols = ['skey', 'cum_volume', 'cum_amount', "close", "open"]
index = index[['skey', 'date', 'cum_volume', 'cum_amount', "close", "open", 'num']]
index1 = index1[['clockAtArrival', 'sequenceNo', 'skey', 'cum_volume', 'cum_amount', "close", "open", "time"]]
index1['cum_amount'] = index1['cum_amount'].round(1)
# Only keep log rows for skeys that also occur in the database frame.
known_skeys = index['skey'].unique()
index1 = index1[index1['skey'].isin(known_skeys)]
re = pd.merge(index, index1, on=cols, how='outer')

# Merge diagnostics: total rows, rows with a log match (sequenceNo present),
# rows with a database match (date present), and the two input sizes.
display(re.shape[0])
display(re[~re['sequenceNo'].isnull()].shape[0])
display(re[~re['date'].isnull()].shape[0])
display(index.shape[0])
display(index1.shape[0])

# Every merged row must come with a database 'date'; rows without one exist
# only in the log. An explicit comparison is used instead of try/assert/except
# so the check still runs under `python -O` and unrelated errors are not
# swallowed by a bare except.
rows_with_date = re[~re['date'].isnull()].shape[0]
if re.shape[0] == rows_with_date:
    print('index data is complete')
else:
    display('%.2f%%' % (rows_with_date/re.shape[0] * 100))
    # Fall back to a left merge so only database rows are kept.
    re = pd.merge(index, index1, on=cols, how='left')
    print('index data is not complete')

# Resolve database rows ('num') that matched more than one log row so that
# every 'num' appears exactly once in `re`.
dup_mask = re.duplicated('num', keep=False)

# p11: rows whose 'num' occurs more than once; p2: rows with a unique 'num'.
# `.copy()` makes p11 independent so the helper columns are not written to a
# slice of `re`.
p11 = re[dup_mask].copy()
p2 = re.drop_duplicates('num', keep=False)
# Keep a duplicated row only when its position within its 'num' group equals
# its position within its 'sequenceNo' group.
p11["order1"] = p11.groupby(["num"]).cumcount()
p11["order2"] = p11.groupby(["sequenceNo"]).cumcount()
p11 = p11[p11['order1'] == p11['order2']]

# p12: one row for each duplicated 'num' that has no surviving row in p11;
# such rows are kept without a sequenceNo.
p12 = re[dup_mask].drop_duplicates('num')
p12 = pd.merge(p12, p11[['num', 'order1']], on='num', how='left')
p12 = p12[p12['order1'].isnull()].copy()
p12['sequenceNo'] = np.nan

# Remove the helper columns (non-inplace, so no filtered slice is mutated).
p11 = p11.drop(['order1', 'order2'], axis=1)
p12 = p12.drop(['order1'], axis=1)
p1 = pd.concat([p11, p12])

re = pd.concat([p1, p2])
# After the resolution no 'num' may occur twice.
assert(re[re.duplicated('num', keep=False)].shape[0] == 0)

# Build `re5`: the merged index rows ordered by 'num', with a filled-in
# sequence number ('seq1') and helper flags ('count', 'dup', 'dup1', 'nan').
# The original had two branches that differed only in how 'seq1' is derived;
# everything else is shared below.
re5 = re.sort_values(by='num')

if re5['sequenceNo'].isnull().any():
    # Back-fill missing sequence numbers within each skey, then forward-fill
    # what is still missing.
    # NOTE(review): the ffill runs on the whole column, not per skey — kept
    # exactly as in the original.
    re5['seq1'] = re5.groupby('skey')['sequenceNo'].bfill().ffill()
else:
    re5['seq1'] = re5['sequenceNo']

# skeys present in the database frame but absent from the log frame get no
# seq1 at all.
sl = list(set(index['skey'].unique()) - set(index1['skey'].unique()))
re5.loc[re5['skey'].isin(sl), 'seq1'] = np.nan

# count1: running position inside each seq1 group; count2: number of distinct
# positions in that group; max_seq: largest sequenceNo seen for the skey.
re5['count1'] = re5.groupby(['seq1']).cumcount()
re5['count2'] = re5.groupby(['seq1'])['count1'].transform('nunique')
re5['max_seq'] = re5.groupby('skey')['sequenceNo'].transform('max')
# count = count1 + 1 - count2, except for rows that have no own sequenceNo and
# whose seq1 equals the skey's max_seq, which get count1 - count2.
re5['count'] = np.where((re5['seq1'] != re5['max_seq'])|(~re5['sequenceNo'].isnull()), re5['count1'] + 1 - re5['count2'], re5['count1'] - re5['count2'])
re5.drop(["max_seq", "count1", "count2"], axis=1, inplace=True)

# dup: position among the rows sharing a sequenceNo (0 when it is missing);
# dup1: number of distinct 'num' values sharing that sequenceNo (0 when missing).
re5['dup'] = np.where(~re5["sequenceNo"].isnull(), re5.groupby(['sequenceNo']).cumcount(), 0)
re5['dup1'] = np.where(~re5["sequenceNo"].isnull(), re5.groupby(['sequenceNo'])['num'].transform('nunique'), 0)
# nan = 1 when the row has no sequenceNo or is not the first row of its
# sequenceNo group.
re5['nan'] = np.where((re5['sequenceNo'].isnull()) | (re5['dup'] != 0), 1, 0)
# Blank sequence numbers shared by several 'num' values on rows with a
# negative count.
re5.loc[(re5['dup1'] > 1) & (re5['count'] < 0), 'sequenceNo'] = np.nan

# The skeys without seq1 must be exactly `sl`, and no row may have been
# gained or lost relative to the database frame.
assert((len(set(sl) - set(re5[re5['seq1'].isnull()]['skey'].unique())) == 0) &
       (len(set(re5[re5['seq1'].isnull()]['skey'].unique()) - set(sl)) == 0))
assert(re5.shape[0] == index.shape[0])

# Percentage of rows left without a sequenceNo.
display('%.2f%%' % (re5[re5['sequenceNo'].isnull()].shape[0]/re5.shape[0] * 100))
