# In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Build a ``DBObj`` connected to ``db_name`` on ``host``.

    Args:
        host: MongoDB host (optionally ``host:port``) placed after the ``@``
            in the connection URI.
        db_name: Database to open; also used as the authentication database
            unless ``user`` is ``'admin'`` or ``'root'``.
        user: MongoDB user name.
        passwd: MongoDB password.

    Returns:
        A ``DBObj`` bound to ``db_name``.
    """
    from urllib.parse import quote_plus

    # 'admin' and 'root' authenticate against the 'admin' database; every
    # other user authenticates against the database being opened.
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    # Credentials must be percent-escaped, otherwise characters such as
    # '@', ':', '/' or '%' corrupt the URI.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store and retrieve pandas DataFrames in MongoDB as compressed pickles.

    Each document holds one chunk of at most ``chunk_size`` rows for a single
    (date, symbol) pair, with the fields ``ver``, ``data``, ``date``,
    ``symbol`` and ``start`` (row offset of the chunk inside its group).
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Connect to ``uri`` and select database ``db_name``.

        Args:
            uri: MongoDB connection URI.
            symbol_column: DataFrame column used as the symbol key on write.
            db_name: Name of the database to operate on.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000  # maximum number of rows per stored document
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split a ``mongodb://user:password@host`` URI into its parts.

        Returns:
            The list obtained by splitting on ':' and '@' after removing the
            scheme, e.g. ``['user', 'password', 'example.com']``.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Replace the stored data of every (date, symbol) group found in ``df``.

        Existing documents for each (date, symbol) pair present in ``df`` are
        deleted before the new chunks are inserted. An empty ``df`` is a no-op.

        Args:
            table_name: Target collection.
            df: DataFrame containing the date column and the symbol column.

        Raises:
            Exception: If ``df`` has no date column.
        """
        if len(df) == 0:
            return
        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        dates = df[self.date_column]
        if len(dates.unique()) > 1:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                self._replace_group(collection, str(date), int(symbol), sub_df)
        else:
            date = str(dates.iloc[0])
            # Group by the plain column name so the key is a scalar, and
            # convert it to a built-in int exactly like the multi-date branch
            # does; numpy scalars are not valid BSON values.
            for symbol, sub_df in df.groupby(self.symbol_column):
                self._replace_group(collection, date, int(symbol), sub_df)

    def _replace_group(self, collection, date, symbol, df):
        """Delete the stored chunks of one (date, symbol) pair and write ``df``."""
        collection.delete_many({'date': date, 'symbol': symbol})
        self.write_single(collection, date, symbol, df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` in chunks of ``chunk_size`` rows.

        Each chunk is serialized with format version 1 and stored together
        with its ``date``, ``symbol`` and starting row offset.
        """
        version = 1
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df.iloc[start:end]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter for a date range and/or symbol selection.

        Args:
            start_date: Inclusive lower bound; ``YYYYMMDD`` string, integer
                (built-in or numpy), ``datetime.date`` or ``datetime.datetime``.
            end_date: Inclusive upper bound, same accepted types.
            symbol: A single symbol or a list/tuple of symbols; each is
                converted with ``int``. Falsy values add no symbol filter.

        Returns:
            A filter dict; empty when no argument constrains the query.

        Raises:
            ValueError: If a date string is not 8 characters long.
            TypeError: If a date has an unsupported type.
        """
        import numbers

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise ValueError("`date` must be YYYYMMDD format")
                return x
            if isinstance(x, datetime.date):  # also covers datetime.datetime
                return x.strftime("%Y%m%d")
            if isinstance(x, numbers.Integral):  # int and numpy integer types
                return parse_date(str(int(x)))
            raise TypeError("invalid `date` type: " + str(type(x)))

        query = {}

        if start_date is not None or end_date is not None:
            date_range = {}
            if start_date is not None:
                date_range['$gte'] = parse_date(start_date)
            if end_date is not None:
                date_range['$lte'] = parse_date(end_date)
            query['date'] = date_range

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the documents matching the given date range and/or symbols.

        Refuses to run (prints a message and returns None) when no filter is
        given, so the whole collection cannot be removed through this method.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load the rows matching the given date range and/or symbols.

        Returns:
            A DataFrame built from all matching chunks, ordered by
            (symbol, date, start) with a fresh index; ``None`` when no filter
            is given or when nothing matches.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for doc in collection.find(query):
            doc['data'] = self.deser(doc['data'], doc['ver'])
            segs.append(doc)
        if not segs:
            return None
        segs.sort(key=lambda doc: (doc['symbol'], doc['date'], doc['start']))
        return pd.concat([doc['data'] for doc in segs], ignore_index=True)

    def list_tables(self):
        """Return the names of the collections in the database."""
        try:
            return self.db.list_collection_names()
        except TypeError:
            # Older pymongo has no list_collection_names(); the attribute
            # lookup then yields a Collection, which is not callable.
            return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values stored in ``table_name``.

        Missing bounds default to ``'00000000'`` and ``'99999999'``.
        """
        collection = self.db[table_name]
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        query = self.build_query(start_date, end_date, symbol)
        dates = {doc['date'] for doc in collection.find(query, {"date": 1, '_id': 0})}
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` and compress it according to ``version``.

        Version 1 uses gzip (level 2); version 2 uses lzma (preset 1).

        Raises:
            ValueError: If ``version`` is not 1 or 2.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise ValueError('unknown version')

    def deser(self, s, version):
        """Decompress and unpickle a payload produced by ``ser``.

        Raises:
            ValueError: If ``version`` is not 1 or 2.
        """
        if version == 1:
            raw = gzip.decompress(s)
        elif version == 2:
            raw = lzma.decompress(s)
        else:
            raise ValueError('unknown version')
        # NOTE(security): pickle.loads can execute arbitrary code; only read
        # collections whose writers are trusted.
        return pickle.loads(raw)


def patch_pandas_pickle():
    """Register a stub ``pandas.core.internals.managers`` module on old pandas.

    When the installed pandas is older than 0.24 and the module
    ``pandas.core.internals.managers`` is not loaded, a placeholder module
    exposing ``BlockManager`` is inserted into ``sys.modules`` under that
    name. Presumably this lets pickles that reference the newer module path
    be loaded by the older pandas -- TODO confirm.
    """
    def release(version):
        """Return the leading (major, minor) numbers of ``version`` as ints."""
        numbers = []
        for piece in str(version).split('.')[:2]:
            digits = ''
            for ch in piece:
                if not ch.isdigit():
                    break
                digits += ch
            numbers.append(int(digits) if digits else 0)
        while len(numbers) < 2:
            numbers.append(0)
        return tuple(numbers)

    # Compare numerically: as strings, '0.9.0' < '0.24' is False.
    if release(pd.__version__) < (0, 24):
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m


patch_pandas_pickle()


import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Use the full option key: the abbreviation "max_columns" is ambiguous on
# pandas versions that also define "styler.render.max_columns".
pd.set_option("display.max_columns", 200)


# Inclusive date range (YYYYMMDD) to process.
startDate = 20200909
endDate = 20200918
database_name = 'com_md_eq_cn'
# NOTE(review): credentials are hard-coded in the source; consider loading
# them from the environment or a secrets store instead.
user = "zhenyuy"
password = "bnONBrzSMGoE"

db = DB("192.168.10.178", database_name, user, password)
# Read one index (symbol 1000300) over the range to obtain the dates that
# have data.
test = db.read('md_index', start_date=startDate, end_date=endDate, symbol=[1000300])
if test is None:
    # db.read returns None when no stored segment matches the query.
    raise RuntimeError('no md_index data for symbol 1000300 between %s and %s' % (startDate, endDate))
date_list = test['date'].unique()
del test

new_trade_data = []
new_order_data = []

for i in date_list:
    print('--------------------------------------------------------------------------------------------')
    print(i)
    print('SH lv2')
    startDate = str(i)
    endDate = str(i)
    db = DB("192.168.10.178", database_name, user, password)
    SH = db.read('md_snapshot_l2', start_date=startDate, end_date=endDate)
    SZ = SH[SH['skey'] > 2000000]
    SH = SH[SH['skey'] < 2000000]
    assert(SH['ordering'].max() < 10000)
    assert(SZ['ordering'].max() < 10000)
    SH['num'] = SH['skey'] * 10000 + SH['ordering']
    SZ['num'] = SZ['skey'] * 10000 + SZ['ordering']
    
    SH = SH[['date', 'skey', 'time', 'cum_volume', 'cum_amount', 'cum_trades_cnt', "close", "bid1p", "bid2p", "bid3p", "bid4p", "bid5p", "bid1q", "bid2q",
               "bid3q", "bid4q", "bid5q", "ask1p", "ask2p", "ask3p", "ask4p", "ask5p", "ask1q", "ask2q", "ask3q",
               "ask4q", "ask5q", "open", 'num']]
    SH = SH.rename(columns={'cum_trades_cnt': 'numTrades'})
    SZ = SZ[['date', 'skey', 'time', 'cum_volume', 'cum_amount', 'cum_trades_cnt', "close", "bid1p", "bid2p", "bid3p", "bid4p", "bid5p", "bid1q", "bid2q",
               "bid3q", "bid4q", "bid5q", "ask1p", "ask2p", "ask3p", "ask4p", "ask5p", "ask1q", "ask2q", "ask3q",
               "ask4q", "ask5q", "open", 'num']]
    SZ = SZ.rename(columns={'cum_trades_cnt': 'numTrades'})

    startDate = str(i)
    endDate = str(i)

    readPath = '/mnt/e/new_record_data/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([os.path.basename(i) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

    path1 = np.array(glob.glob(dataPathLs[0] + '/***_zt_88_03_day_88data.tar.gz/mdLog_SH_***'))
    SH1 = pd.read_csv(path1[0])
    index1 = SH1[SH1['StockID'].isin([16, 300, 852, 905])]
    SH1 = SH1[SH1['source'] == 23]

    SH1['skey'] = SH1['StockID'] + 1000000
    SH1 = SH1.rename(columns={"openPrice":"open"})
    SH1["open"] = np.where(SH1["cum_volume"] > 0, SH1.groupby("skey")["open"].transform("max"), SH1["open"])
    SH1["time"] = SH1["time"].apply(lambda x: int((x.replace(':', "")).replace(".", "")) * 1000)

    SH1 = SH1[['clockAtArrival', 'sequenceNo', 'skey', 'time', 'cum_volume', 'cum_amount', "close", "bid1p", "bid2p", "bid3p", "bid4p", "bid5p", "bid1q", "bid2q",
               "bid3q", "bid4q", "bid5q", "ask1p", "ask2p", "ask3p", "ask4p", "ask5p", "ask1q", "ask2q", "ask3q",
               "ask4q", "ask5q", "open", 'numTrades']]
    for cols in ['cum_amount', "close", 'open']:
        SH1[cols] = SH1[cols].round(2)
    cols = ['skey', 'time', 'cum_volume', 'cum_amount', "close", "bid1p", "bid2p", "bid3p", "bid4p", "bid5p", "bid1q", "bid2q",
            "bid3q", "bid4q", "bid5q", "ask1p", "ask2p", "ask3p", "ask4p", "ask5p", "ask1q", "ask2q", "ask3q",
            "ask4q", "ask5q", 'numTrades']
    SH1 = SH1[SH1['skey'].isin(SH['skey'].unique())]
    re = pd.merge(SH, SH1, on=cols, how='left')
    display(re[re['sequenceNo'].isnull()].groupby('skey')['date'].size().sort_values())
    assert(re.shape[0] == re[~re['date'].isnull()].shape[0])
    sl = list(set(SH['skey'].unique()) - set(SH1['skey'].unique()))
    assert((len(set(sl) - set(re[re['sequenceNo'].isnull()]['skey'].unique())) == 0) & 
               (len(set(re[re['sequenceNo'].isnull()]['skey'].unique()) - set(sl)) == 0))
    
    if re[re.duplicated('num', keep=False)].shape[0] != 0:
        p1 = re[re['num'].duplicated(keep=False)]
        p2 = re.drop_duplicates(['num'], keep=False)
        p1["order1"] = p1.groupby(["num"]).cumcount()
        p1["order2"] = p1.groupby(["sequenceNo"]).cumcount()
        p1 = p1[p1['order1'] == p1['order2']]
        p1.drop(['order1', 'order2'],axis=1,inplace=True)
        re = pd.concat([p1, p2])

    re1 = re.sort_values(by='num')
    assert(re1.shape[0] == SH.shape[0])   
    assert(re1[~re1['sequenceNo'].isnull()].shape[0] == re1[~re1['sequenceNo'].isnull()]['sequenceNo'].nunique())
    re1.loc[re['sequenceNo'].isnull(), 'sequenceNo'] = -1
    re1['nan'] = 0
    re1['count'] = 0
    re1 = re1[['skey', 'date', 'num', 'sequenceNo', 'clockAtArrival', 'nan', 'count']]
    print('zt_88 finished')
    
    
    
    
    print('-------------------------------------------------------------------------------------------')
    print('SZ lv2')
    startDate = str(i)
    endDate = str(i)

    readPath = '/mnt/e/new_record_data/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([os.path.basename(i) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

    path1 = np.array(glob.glob(dataPathLs[0] + '/***_zt_88_03_day_88data.tar.gz/mdLog_SZ_***'))
    SZ1 = pd.read_csv(path1[0])
    SZ1 = SZ1[SZ1['source'] == 12]
    SZ1['skey'] = SZ1['StockID'] + 2000000
    SZ1 = SZ1.rename(columns={"openPrice":"open"})
    SZ1["open"] = np.where(SZ1["cum_volume"] > 0, SZ1.groupby("skey")["open"].transform("max"), SZ1["open"])
    SZ1["time"] = SZ1["time"].apply(lambda x: int(x.replace(':', "")) * 1000000)

    SZ1 = SZ1[['clockAtArrival', 'sequenceNo', 'skey', 'time', 'cum_volume', 'cum_amount', "close", "bid1p", "bid2p", "bid3p", "bid4p", "bid5p", "bid1q", "bid2q",
               "bid3q", "bid4q", "bid5q", "ask1p", "ask2p", "ask3p", "ask4p", "ask5p", "ask1q", "ask2q", "ask3q",
               "ask4q", "ask5q", "open", 'numTrades']]
    for cols in ['cum_amount']:
        SZ1[cols] = SZ1[cols].round(2)
    cols = ['skey', 'time', 'cum_volume', "close", "bid1p", "bid2p", "bid3p", "bid4p", "bid5p", "bid1q", "bid2q",
               "bid3q", "bid4q", "bid5q", "ask1p", "ask2p", "ask3p", "ask4p", "ask5p", "ask1q", "ask2q", "ask3q",
               "ask4q", "ask5q", "open", 'numTrades']
    SZ1 = SZ1[SZ1['skey'].isin(SZ['skey'].unique())]
    re = pd.merge(SZ, SZ1, on=cols, how='left')
    try:
        assert(re[re['sequenceNo'].isnull()].shape[0] == 0)
    except:
        print(re[re['sequenceNo'].isnull()])
        assert(SZ[SZ['skey'].isin(re[re['sequenceNo'].isnull()]['skey'].unique())]['cum_volume'].unique() == [0])
        
    if re[re.duplicated('num', keep=False)].shape[0] != 0:
        p1 = re[re['num'].duplicated(keep=False)]
        p2 = re.drop_duplicates(['num'], keep=False)
        p1["order1"] = p1.groupby(["num"]).cumcount()
        p1["order2"] = p1.groupby(["sequenceNo"]).cumcount()
        p1 = p1[p1['order1'] == p1['order2']]
        p1.drop(['order1', 'order2'],axis=1,inplace=True)
        re = pd.concat([p1, p2])
    re2 = re.sort_values(by='num')
    assert(re2.shape[0] == SZ.shape[0])
    assert(re2[~re2['sequenceNo'].isnull()].shape[0] == re2[~re2['sequenceNo'].isnull()]['sequenceNo'].nunique())
    re2['nan'] = 0
    re2['count'] = 0
    re2.loc[re2['sequenceNo'].isnull(), 'sequenceNo'] = -1
    re2 = re2[['skey', 'date', 'num', 'sequenceNo', 'clockAtArrival', 'nan', 'count']]
    print('zt_88 finished')
    
    
    
    
    
    
    print('----------------------------------------------------------------------------------------------')
    print('SH & SZ trade')
    
    startDate = str(i)
    endDate = str(i)
    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db = DB("192.168.10.178", database_name, user, password)
    trade = db.read('md_trade', start_date=startDate, end_date=endDate)[['skey', 'date', 'ApplSeqNum']]

    startDate = str(i)
    endDate = str(i)

    readPath = '/mnt/e/new_record_data/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([os.path.basename(i) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

    path1 = np.array(glob.glob(dataPathLs[0] + '/***_zt_88_03_day_88data.tar.gz/mdTradeLog***'))
    trade1 = pd.read_csv(path1[0])
    trade1['skey'] = np.where(trade1['exchId'] == 2, trade1['SecurityID'] + 2000000, trade1['SecurityID'] + 1000000)
    trade1 = trade1[trade1['skey'].isin(trade['skey'].unique())]
    re = pd.merge(trade, trade1[['skey', 'ApplSeqNum', 'sequenceNo', 'clockAtArrival']], on=['skey', 'ApplSeqNum'],
                 how='left')
    display(re[re['sequenceNo'].isnull()].groupby('skey')['date'].size().sort_values())
    assert(re.shape[0] == trade.shape[0])
    re3 = re.sort_values(by=['skey', 'ApplSeqNum'])
    sl = list(set(trade['skey'].unique()) - set(trade1['skey'].unique()))
    assert((len(set(sl) - set(re3[re3['sequenceNo'].isnull()]['skey'].unique())) == 0) & 
       (len(set(re3[re3['sequenceNo'].isnull()]['skey'].unique()) - set(sl)) == 0))
    assert(re3[~re3['sequenceNo'].isnull()].shape[0] == re3[~re3['sequenceNo'].isnull()]['sequenceNo'].nunique())
    re3.loc[re3['sequenceNo'].isnull(), 'sequenceNo'] = -1
    re3['nan'] = 0
    re3['count'] = 0
    re3 = re3[['clockAtArrival', 'date', 'sequenceNo', 'skey', 'ApplSeqNum', 'nan', 'count']]
    print('zt_88 finished')
    
    
    
    
    
    
    print('--------------------------------------------------------------------------------------------------')
    print('SZ order data')

    startDate = str(i)
    endDate = str(i)
    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db = DB("192.168.10.178", database_name, user, password)
    order = db.read('md_order', start_date=startDate, end_date=endDate)[['skey', 'date', 'ApplSeqNum']]

    startDate = str(i)
    endDate = str(i)

    readPath = '/mnt/e/new_record_data/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([os.path.basename(i) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

    path1 = np.array(glob.glob(dataPathLs[0] + '/***_zt_88_03_day_88data.tar.gz/mdOrderLog***'))
    order1 = pd.read_csv(path1[0])
    order1['skey'] = order1['SecurityID'] + 2000000
    order1 = order1[order1['skey'].isin(order['skey'].unique())]
    re = pd.merge(order, order1[['skey', 'ApplSeqNum', 'sequenceNo', 'clockAtArrival']], on=['skey', 'ApplSeqNum'],
                 how='left')
    try:
        assert(re[re['sequenceNo'].isnull()].shape[0] == 0)
        re4 = re.sort_values(by=['skey', 'ApplSeqNum'])
        assert(re4.shape[0] == order.shape[0])
        assert(re4[~re4['sequenceNo'].isnull()].shape[0] == re4[~re4['sequenceNo'].isnull()]['sequenceNo'].nunique())
        re4['nan'] = 0
        re4['count'] = 0
    except:
        print('Attention!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!There are ticks missing in order data')
        print(re[re['sequenceNo'].isnull()])
        re4 = re.sort_values(by=['skey', 'ApplSeqNum'])
        assert(re4.shape[0] == order.shape[0])
        assert(re4[~re4['sequenceNo'].isnull()].shape[0] == re4[~re4['sequenceNo'].isnull()]['sequenceNo'].nunique())
        re4['nan'] = np.where(re4['sequenceNo'].isnull(), 1, 0)
        re4['sequenceNo'] = re4.groupby('skey')['sequenceNo'].ffill().bfill()
        re4['count1'] = re4.groupby(['sequenceNo']).cumcount()
        re4['count2'] = re4.groupby(['sequenceNo'])['count1'].transform('nunique')
        re4['min_seq'] = re4.groupby('sequenceNo')['sequenceNo'].transform('min')
        re4['count'] = np.where(re4['sequenceNo'] != re4['min_seq'], re4['count1'], re4['count1']+1-re4['count2'])

    re4 = re4[['skey', 'date', 'ApplSeqNum', 'sequenceNo', 'clockAtArrival', 'nan', 'count']]
    print('zt_88 finished')   
    
    
    
    
    
    print('-----------------------------------------------------------------------------------------------------')
    print('SH index data')
    
    startDate = str(i)
    endDate = str(i)
    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db = DB("192.168.10.178", database_name, user, password)
    index = db.read('md_index', start_date=startDate, end_date=endDate)
    index = index[index['skey'].isin([1000016, 1000300, 1000852, 1000905])]

    index1['skey'] = index1['StockID'] + 1000000
    index1 = index1.rename(columns={"openPrice":"open"})
    index1["open"] = np.where(index1["cum_volume"] > 0, index1.groupby("skey")["open"].transform("max"), index1["open"])
    index1['close'] = np.where(index1['cum_volume'] == 0, 0, index1['close'])
    index1["time"] = index1["time"].apply(lambda x: int((x.replace(':', "")).replace(".", "")) * 1000)
    index['close'] = np.where(index['cum_volume'] == 0, 0, index['close'])
    index['num'] = index['skey'] * 10000 + index['ordering']
    index = index[['skey', 'date', 'cum_volume', 'cum_amount', "close", "open", 'num', 'time']]
    index1 = index1[['clockAtArrival', 'sequenceNo', 'skey', 'cum_volume', 'cum_amount', "close", "open", "time"]]
    for cols in ['cum_amount']:
        index1[cols] = index1[cols].round(1)
    cols = ['skey', 'cum_volume', 'cum_amount', "close", "open"]
    index1 = index1[index1['skey'].isin(index['skey'].unique())]
    re = pd.merge(index, index1, on=cols, how='outer')
    try:
        assert(re[re['date'].isnull()]['cum_volume'].unique() == [0])
    except:
        print('Attention here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! api data have more ticks in SH index')
        print(re[(re['date'].isnull()) & (re['cum_volume'] > 0)])
        print(index1[index1['sequenceNo'].isin(re[(re['date'].isnull()) & (re['cum_volume'] > 0)]['sequenceNo'].unique())])
        index1 = index1[~index1['sequenceNo'].isin(re[(re['date'].isnull()) & (re['cum_volume'] > 0)]['sequenceNo'].unique())]
    assert(re[re['sequenceNo'].isnull()].shape[0] == 0)
    index1 = index1[index1['cum_volume'] > 0]
    re = pd.concat([index1, index])
    re = re.sort_values(by=['skey', 'time'])
    re['sequenceNo'] = re.groupby('skey')['sequenceNo'].ffill()
    re['clockAtArrival'] = re.groupby('skey')['clockAtArrival'].ffill()
    re['count'] = re.groupby(['skey', 'sequenceNo']).cumcount()
    re['sequenceNo'] = re['sequenceNo'].fillna(-1)
    re.loc[re['count'] > 1, 'sequenceNo'] = -1
    re.loc[re['count'] > 1, 'clockAtArrival'] = np.nan
    re5 = re[~re['date'].isnull()]
    assert(re5.shape[0] == index.shape[0])
    re5['nan'] = 0
    re5['count'] = 0
    re5 = re5[['skey', 'date', 'num', 'sequenceNo', 'clockAtArrival', 'nan', 'count']]
    print('zt_88 finished')
   
    
    
    
    print('----------------------------------------------------------------------------------------------------')
    print('final concat zt88')
    try:
        assert(len(set(SZ1['sequenceNo']) & set(SH1['sequenceNo'])) == 0)
    except:
        display(SZ1[SZ1['sequenceNo'].isin(list(set(SZ1['sequenceNo']) & set(SH1['sequenceNo'])))])
        display(SH1[SH1['sequenceNo'].isin(list(set(SZ1['sequenceNo']) & set(SH1['sequenceNo'])))])
    try:
        assert(len(set(SZ1['sequenceNo']) & set(trade1['sequenceNo'])) == 0)
    except:
        display(SZ1[SZ1['sequenceNo'].isin(list(set(SZ1['sequenceNo']) & set(trade1['sequenceNo'])))])
        display(trade1[trade1['sequenceNo'].isin(list(set(SZ1['sequenceNo']) & set(trade1['sequenceNo'])))])
    try:
        assert(len(set(SZ1['sequenceNo']) & set(order1['sequenceNo'])) == 0)
    except:
        display(SZ1[SZ1['sequenceNo'].isin(list(set(SZ1['sequenceNo']) & set(order1['sequenceNo'])))])
        display(order1[order1['sequenceNo'].isin(list(set(SZ1['sequenceNo']) & set(order1['sequenceNo'])))])
    try:
        assert(len(set(SZ1['sequenceNo']) & set(index1['sequenceNo'])) == 0)
    except:
        display(SZ1[SZ1['sequenceNo'].isin(list(set(SZ1['sequenceNo']) & set(index1['sequenceNo'])))])
        display(index1[index1['sequenceNo'].isin(list(set(SZ1['sequenceNo']) & set(index1['sequenceNo'])))])
    try:
        assert(len(set(SH1['sequenceNo']) & set(index1['sequenceNo'])) == 0)
    except:
        display(SH1[SH1['sequenceNo'].isin(list(set(SH1['sequenceNo']) & set(index1['sequenceNo'])))])
        display(index1[index1['sequenceNo'].isin(list(set(SH1['sequenceNo']) & set(index1['sequenceNo'])))])
    try:
        assert(len(set(SH1['sequenceNo']) & set(trade1['sequenceNo'])) == 0)
    except:
        display(SH1[SH1['sequenceNo'].isin(list(set(SH1['sequenceNo']) & set(trade1['sequenceNo'])))])
        display(trade1[trade1['sequenceNo'].isin(list(set(SH1['sequenceNo']) & set(trade1['sequenceNo'])))])
    try:
        assert(len(set(SH1['sequenceNo']) & set(order1['sequenceNo'])) == 0)
    except:
        display(SH1[SH1['sequenceNo'].isin(list(set(SH1['sequenceNo']) & set(order1['sequenceNo'])))])
        display(order1[order1['sequenceNo'].isin(list(set(SH1['sequenceNo']) & set(order1['sequenceNo'])))])
    try:
        assert(len(set(trade1['sequenceNo']) & set(order1['sequenceNo'])) == 0)
    except:
        display(trade1[trade1['sequenceNo'].isin(list(set(trade1['sequenceNo']) & set(order1['sequenceNo'])))])
        display(order1[order1['sequenceNo'].isin(list(set(trade1['sequenceNo']) & set(order1['sequenceNo'])))])
    try:
        assert(len(set(trade1['sequenceNo']) & set(index1['sequenceNo'])) == 0)
    except:
        display(trade1[trade1['sequenceNo'].isin(list(set(trade1['sequenceNo']) & set(index1['sequenceNo'])))])
        display(index1[index1['sequenceNo'].isin(list(set(trade1['sequenceNo']) & set(index1['sequenceNo'])))])
    try:
        assert(len(set(index1['sequenceNo']) & set(order1['sequenceNo'])) == 0)
    except:
        display(index1[index1['sequenceNo'].isin(list(set(index1['sequenceNo']) & set(order1['sequenceNo'])))])
        display(order1[order1['sequenceNo'].isin(list(set(index1['sequenceNo']) & set(order1['sequenceNo'])))])
        
        
        
        
    
    print('-------------------------------------------------------------------------------------------')
    print('SH lv2')
    
    path1 = np.array(glob.glob(dataPathLs[0] + '/***_zs_96_03_day_96data.tar.gz/mdLog_SH_***'))
    SH1 = pd.read_csv(path1[0])
    index11 = SH1[SH1['StockID'].isin([16, 300, 852, 905])]
    SH1 = SH1[SH1['source'] == 13]

    SH1['skey'] = SH1['StockID'] + 1000000
    SH1 = SH1.rename(columns={"openPrice":"open"})
    SH1["open"] = np.where(SH1["cum_volume"] > 0, SH1.groupby("skey")["open"].transform("max"), SH1["open"])
    SH1["time"] = SH1["time"].apply(lambda x: int((x.replace(':', "")).replace(".", "")) * 1000)

    SH1 = SH1[['clockAtArrival', 'sequenceNo', 'skey', 'time', 'cum_volume', 'cum_amount', "close", "bid1p", "bid2p", "bid3p", "bid4p", "bid5p", "bid1q", "bid2q",
               "bid3q", "bid4q", "bid5q", "ask1p", "ask2p", "ask3p", "ask4p", "ask5p", "ask1q", "ask2q", "ask3q",
               "ask4q", "ask5q", "open", 'numTrades']]
    for cols in ['cum_amount', "close", 'open']:
        SH1[cols] = SH1[cols].round(2)
    cols = ['skey', 'time', 'cum_volume', 'cum_amount', "close", "bid1p", "bid2p", "bid3p", "bid4p", "bid5p", "bid1q", "bid2q",
            "bid3q", "bid4q", "bid5q", "ask1p", "ask2p", "ask3p", "ask4p", "ask5p", "ask1q", "ask2q", "ask3q",
            "ask4q", "ask5q", 'open', 'numTrades']
    SH1 = SH1[SH1['skey'].isin(SH['skey'].unique())]
    re = pd.merge(SH, SH1, on=cols, how='left')
    display(re[re['sequenceNo'].isnull()].groupby('skey')['date'].size().sort_values())
    # For KCB orders, zs96 shows snapshot data after 15:00 which is different from the data shows in database.
    assert(re.shape[0] == re[~re['date'].isnull()].shape[0])
    sl = list(set(SH['skey'].unique()) - set(SH1['skey'].unique()))
    
    if re[re.duplicated('num', keep=False)].shape[0] != 0:
        p1 = re[re['num'].duplicated(keep=False)]
        p2 = re.drop_duplicates(['num'], keep=False)
        p1["order1"] = p1.groupby(["num"]).cumcount()
        p1["order2"] = p1.groupby(["sequenceNo"]).cumcount()
        p1 = p1[p1['order1'] == p1['order2']]
        p1.drop(['order1', 'order2'],axis=1,inplace=True)
        re = pd.concat([p1, p2])

    ree1 = re.sort_values(by='num')
    assert(ree1.shape[0] == SH.shape[0])
    assert(ree1[~ree1['sequenceNo'].isnull()].shape[0] == ree1[~ree1['sequenceNo'].isnull()]['sequenceNo'].nunique())
    ree1['nan'] = np.where(ree1['sequenceNo'].isnull(), 1, 0)
    ree1['sequenceNo'] = ree1.groupby('skey')['sequenceNo'].ffill().bfill()
    ree1.loc[ree1['skey'].isin(sl), 'sequenceNo'] = np.nan
    
    assert((len(set(sl) - set(ree1[ree1['sequenceNo'].isnull()]['skey'].unique())) == 0) & 
           (len(set(ree1[ree1['sequenceNo'].isnull()]['skey'].unique()) - set(sl)) == 0))
    ree1.loc[ree1['sequenceNo'].isnull(), 'sequenceNo'] = -1
    ree1 = ree1[['skey', 'date', 'num', 'sequenceNo', 'clockAtArrival', 'nan']]
    print('zs_96 finished')
    
    
    
    
    print('-------------------------------------------------------------------------------------------')
    print('SZ lv2')    
    
    path1 = np.array(glob.glob(dataPathLs[0] + '/***_zs_96_03_day_96data.tar.gz/mdLog_SZ_***'))
    SZ1 = pd.read_csv(path1[0])
    SZ1 = SZ1[SZ1['source'] == 24]
    SZ1['skey'] = SZ1['StockID'] + 2000000
    SZ1 = SZ1.rename(columns={"openPrice":"open"})
    SZ1["open"] = np.where(SZ1["cum_volume"] > 0, SZ1.groupby("skey")["open"].transform("max"), SZ1["open"])
    SZ1["time"] = SZ1["time"].apply(lambda x: int((x.replace(':', "")).replace(".", "")) * 1000)

    SZ1 = SZ1[['clockAtArrival', 'sequenceNo', 'skey', 'time', 'cum_volume', 'cum_amount', "close", "bid1p", "bid2p", "bid3p", "bid4p", "bid5p", "bid1q", "bid2q",
               "bid3q", "bid4q", "bid5q", "ask1p", "ask2p", "ask3p", "ask4p", "ask5p", "ask1q", "ask2q", "ask3q",
               "ask4q", "ask5q", "open", 'numTrades']]
    for cols in ['cum_amount']:
        SZ1[cols] = SZ1[cols].round(2)
    cols = ['skey', 'time', 'cum_volume', 'cum_amount', "close", "bid1p", "bid2p", "bid3p", "bid4p", "bid5p", "bid1q", "bid2q",
               "bid3q", "bid4q", "bid5q", "ask1p", "ask2p", "ask3p", "ask4p", "ask5p", "ask1q", "ask2q", "ask3q",
               "ask4q", "ask5q", "open", 'numTrades']
    SZ1 = SZ1[SZ1['skey'].isin(SZ['skey'].unique())]
    re = pd.merge(SZ, SZ1, on=cols, how='left')
    display(re[re['sequenceNo'].isnull()].groupby('skey')['date'].size().sort_values())
    assert(re.shape[0] == re[~re['date'].isnull()].shape[0])
    sl = list(set(SZ['skey'].unique()) - set(SZ1['skey'].unique()))
    assert((len(set(sl) - set(re[re['sequenceNo'].isnull()]['skey'].unique())) == 0) & 
               (len(set(re[re['sequenceNo'].isnull()]['skey'].unique()) - set(sl)) == 0))
    
    if re[re.duplicated('num', keep=False)].shape[0] != 0:
        p1 = re[re['num'].duplicated(keep=False)]
        p2 = re.drop_duplicates(['num'], keep=False)
        p1["order1"] = p1.groupby(["num"]).cumcount()
        p1["order2"] = p1.groupby(["sequenceNo"]).cumcount()
        p1 = p1[p1['order1'] == p1['order2']]
        p1.drop(['order1', 'order2'],axis=1,inplace=True)
        re = pd.concat([p1, p2])

    ree2 = re.sort_values(by='num')
    assert(ree2.shape[0] == SZ.shape[0])
    assert(ree2[~ree2['sequenceNo'].isnull()].shape[0] == ree2[~ree2['sequenceNo'].isnull()]['sequenceNo'].nunique())
    ree2['nan'] = 0
    ree2.loc[ree2['sequenceNo'].isnull(), 'sequenceNo'] = -1
    ree2 = ree2[['skey', 'date', 'num', 'sequenceNo', 'clockAtArrival', 'nan']]
    print('zs_96 finished')
    
    
    
    
    print('-------------------------------------------------------------------------------------------')
    print('SH & SZ trade')     
        
    path1 = np.array(glob.glob(dataPathLs[0] + '/***_zs_96_03_day_96data.tar.gz/mdTradeLog***'))
    trade1 = pd.read_csv(path1[0])
    trade1['skey'] = np.where(trade1['exchId'] == 2, trade1['SecurityID'] + 2000000, trade1['SecurityID'] + 1000000)
    trade1 = trade1[trade1['skey'].isin(trade['skey'].unique())]
    assert(trade1[trade1['ChannelNo'] == 103]['TransactTime'].min() > 150000000)
    assert(trade1[trade1['ChannelNo'] == 103]['skey'].min() > 1688000)
    trade1 = trade1[trade1['ChannelNo'] != 103]
    
    re = pd.merge(trade, trade1[['skey', 'ApplSeqNum', 'sequenceNo', 'clockAtArrival']], on=['skey', 'ApplSeqNum'],
                 how='left')
    display(re[re['sequenceNo'].isnull()].groupby('skey')['date'].size().sort_values())
    assert(re.shape[0] == trade.shape[0])
    ree3 = re.sort_values(by=['skey', 'ApplSeqNum'])
    sl = list(set(trade['skey'].unique()) - set(trade1['skey'].unique()))
    assert((len(set(sl) - set(ree3[ree3['sequenceNo'].isnull()]['skey'].unique())) == 0) & 
       (len(set(ree3[ree3['sequenceNo'].isnull()]['skey'].unique()) - set(sl)) == 0))
    assert(ree3[~ree3['sequenceNo'].isnull()].shape[0] == ree3[~ree3['sequenceNo'].isnull()]['sequenceNo'].nunique())
    ree3.loc[ree3['sequenceNo'].isnull(), 'sequenceNo'] = -1
    ree3['nan'] = 0
    ree3 = ree3[['clockAtArrival', 'date', 'sequenceNo', 'skey', 'ApplSeqNum', 'nan']]
    print('zs_96 finished')
    
    
    
    
    print('-------------------------------------------------------------------------------------------')
    print('SZ order data') 
    
    path1 = np.array(glob.glob(dataPathLs[0] + '/***_zs_96_03_day_96data.tar.gz/mdOrderLog***'))
    order1 = pd.read_csv(path1[0])
    order1['skey'] = order1['SecurityID'] + 2000000
    order1 = order1[order1['skey'].isin(order['skey'].unique())]
    re = pd.merge(order, order1[['skey', 'ApplSeqNum', 'sequenceNo', 'clockAtArrival']], on=['skey', 'ApplSeqNum'],
                 how='left')
    display(re[re['sequenceNo'].isnull()].groupby('skey')['date'].size().sort_values())
    assert(re.shape[0] == order.shape[0])
    ree4 = re.sort_values(by=['skey', 'ApplSeqNum'])
    sl = list(set(order['skey'].unique()) - set(order1['skey'].unique()))
    assert((len(set(sl) - set(ree4[ree4['sequenceNo'].isnull()]['skey'].unique())) == 0) & 
       (len(set(ree4[ree4['sequenceNo'].isnull()]['skey'].unique()) - set(sl)) == 0))
    assert(ree4[~ree4['sequenceNo'].isnull()].shape[0] == ree4[~ree4['sequenceNo'].isnull()]['sequenceNo'].nunique())
    ree4['nan'] = 0
    ree4.loc[ree4['sequenceNo'].isnull(), 'sequenceNo'] = -1
    ree4 = ree4[['clockAtArrival', 'date', 'sequenceNo', 'skey', 'ApplSeqNum', 'nan']]
    print('zs_96 finished')
    
    
    
    print('-------------------------------------------------------------------------------------------')
    print('SH index data')
    
    index11['skey'] = index11['StockID'] + 1000000
    index11 = index11.rename(columns={"openPrice":"open"})
    index11["open"] = np.where(index11["cum_volume"] > 0, index11.groupby("skey")["open"].transform("max"), index11["open"])
    index11['close'] = np.where(index11['cum_volume'] == 0, 0, index11['close'])
    index11["time"] = index11["time"].apply(lambda x: int((x.replace(':', "")).replace(".", "")) * 1000)
    index11 = index11[['clockAtArrival', 'sequenceNo', 'skey', 'cum_volume', 'cum_amount', "close", "open", "time"]]
    for cols in ['cum_amount']:
        index11[cols] = index11[cols].round(1)
    cols = ['skey', 'cum_volume', 'cum_amount', "close", "open"]
    index11 = index11[index11['skey'].isin(index['skey'].unique())]
    re = pd.merge(index, index11, on=cols, how='outer')
    try:
        assert(re[re['date'].isnull()]['cum_volume'].unique() == [0])
    except:
        print('Attention here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! api data have more ticks in SH index')
        print(re[(re['date'].isnull()) & (re['cum_volume'] > 0)])
        print(index11[index11['sequenceNo'].isin(re[(re['date'].isnull()) & (re['cum_volume'] > 0)]['sequenceNo'].unique())])
        index11 = index11[~index11['sequenceNo'].isin(re[(re['date'].isnull()) & (re['cum_volume'] > 0)]['sequenceNo'].unique())]
    assert(re[re['sequenceNo'].isnull()].shape[0] == 0)
    index11 = index11[index11['cum_volume'] > 0]
    re = pd.concat([index11, index])
    re = re.sort_values(by=['skey', 'time'])
    re['sequenceNo'] = re.groupby('skey')['sequenceNo'].ffill()
    re['clockAtArrival'] = re.groupby('skey')['clockAtArrival'].ffill()
    re['count'] = re.groupby(['skey', 'sequenceNo']).cumcount()
    re['sequenceNo'] = re['sequenceNo'].fillna(-1)
    re.loc[re['count'] > 1, 'sequenceNo'] = -1
    re.loc[re['count'] > 1, 'clockAtArrival'] = np.nan
    ree5 = re[~re['date'].isnull()]
    assert(ree5.shape[0] == index.shape[0])
    ree5['nan'] = 0
    ree5 = ree5[['skey', 'date', 'num', 'sequenceNo', 'clockAtArrival', 'nan']]
    print('zs_96 finished')
    

    
    
    
    print('----------------------------------------------------------------------------------------------------')
    print('final concat zs96')
    try:
        assert(len(set(SZ1['sequenceNo']) & set(SH1['sequenceNo'])) == 0)
    except:
        display(SZ1[SZ1['sequenceNo'].isin(list(set(SZ1['sequenceNo']) & set(SH1['sequenceNo'])))])
        display(SH1[SH1['sequenceNo'].isin(list(set(SZ1['sequenceNo']) & set(SH1['sequenceNo'])))])
    try:
        assert(len(set(SZ1['sequenceNo']) & set(trade1['sequenceNo'])) == 0)
    except:
        display(SZ1[SZ1['sequenceNo'].isin(list(set(SZ1['sequenceNo']) & set(trade1['sequenceNo'])))])
        display(trade1[trade1['sequenceNo'].isin(list(set(SZ1['sequenceNo']) & set(trade1['sequenceNo'])))])
    try:
        assert(len(set(SZ1['sequenceNo']) & set(order1['sequenceNo'])) == 0)
    except:
        display(SZ1[SZ1['sequenceNo'].isin(list(set(SZ1['sequenceNo']) & set(order1['sequenceNo'])))])
        display(order1[order1['sequenceNo'].isin(list(set(SZ1['sequenceNo']) & set(order1['sequenceNo'])))])
    try:
        assert(len(set(SZ1['sequenceNo']) & set(index1['sequenceNo'])) == 0)
    except:
        display(SZ1[SZ1['sequenceNo'].isin(list(set(SZ1['sequenceNo']) & set(index11['sequenceNo'])))])
        display(index11[index11['sequenceNo'].isin(list(set(SZ1['sequenceNo']) & set(index11['sequenceNo'])))])
    try:
        assert(len(set(SH1['sequenceNo']) & set(index1['sequenceNo'])) == 0)
    except:
        display(SH1[SH1['sequenceNo'].isin(list(set(SH1['sequenceNo']) & set(index11['sequenceNo'])))])
        display(index11[index11['sequenceNo'].isin(list(set(SH1['sequenceNo']) & set(index11['sequenceNo'])))])
    try:
        assert(len(set(SH1['sequenceNo']) & set(trade1['sequenceNo'])) == 0)
    except:
        display(SH1[SH1['sequenceNo'].isin(list(set(SH1['sequenceNo']) & set(trade1['sequenceNo'])))])
        display(trade1[trade1['sequenceNo'].isin(list(set(SH1['sequenceNo']) & set(trade1['sequenceNo'])))])
    try:
        assert(len(set(SH1['sequenceNo']) & set(order1['sequenceNo'])) == 0)
    except:
        display(SH1[SH1['sequenceNo'].isin(list(set(SH1['sequenceNo']) & set(order1['sequenceNo'])))])
        display(order1[order1['sequenceNo'].isin(list(set(SH1['sequenceNo']) & set(order1['sequenceNo'])))])
    try:
        assert(len(set(trade1['sequenceNo']) & set(order1['sequenceNo'])) == 0)
    except:
        display(trade1[trade1['sequenceNo'].isin(list(set(trade1['sequenceNo']) & set(order1['sequenceNo'])))])
        display(order1[order1['sequenceNo'].isin(list(set(trade1['sequenceNo']) & set(order1['sequenceNo'])))])
    try:
        assert(len(set(trade1['sequenceNo']) & set(index1['sequenceNo'])) == 0)
    except:
        display(trade1[trade1['sequenceNo'].isin(list(set(trade1['sequenceNo']) & set(index11['sequenceNo'])))])
        display(index11[index11['sequenceNo'].isin(list(set(trade1['sequenceNo']) & set(index11['sequenceNo'])))])
    try:
        assert(len(set(index11['sequenceNo']) & set(order1['sequenceNo'])) == 0)
    except:
        display(index11[index11['sequenceNo'].isin(list(set(index11['sequenceNo']) & set(order1['sequenceNo'])))])
        display(order1[order1['sequenceNo'].isin(list(set(index11['sequenceNo']) & set(order1['sequenceNo'])))])
        
    

    del SH
    del SH1
    del SZ
    del SZ1
    del trade
    del trade1
    del order
    del order1
    del index
    del index1
    
    
    
    
    
    
    
    
    
    
    
    ree1['tag'] = 'SH'
    ree2['tag'] = 'SZ'
    ree3['tag'] = 'trade'
    ree4['tag'] = 'order'
    ree5['tag'] = 'index'
    re1['tag'] = 'SH'
    re2['tag'] = 'SZ'
    re3['tag'] = 'trade'
    re4['tag'] = 'order'
    re5['tag'] = 'index'

    fr1 = []
    fr2 = []
    fr1 += [ree1]
    fr2 += [re1]
    del ree1
    del re1
    display('1. here~')
    fr1 += [ree2]
    fr2 += [re2]
    del ree2
    del re2
    display('2. here~')
    fr1 += [ree3]
    fr2 += [re3]
    del ree3
    del re3
    display('3. here~')
    fr1 += [ree4]
    fr2 += [re4]
    del ree4
    del re4
    display('4. here~')
    fr1 += [ree5]
    fr2 += [re5]
    del ree5
    del re5
    display('5. here~')
    fr1 = pd.concat(fr1).reset_index(drop=True)
    fr1 = fr1.sort_values(by=['sequenceNo', 'num'])
    fr1['sum_nan'] = fr1['nan'].cumsum()
    fr1['sequenceNo'] = np.where(fr1['sequenceNo'] != -1, fr1['sequenceNo'] + fr1['sum_nan'], fr1['sequenceNo'])
    
    fr2 = pd.concat(fr2).reset_index(drop=True)
    fr2 = fr2.sort_values(by=['sequenceNo', 'num'])
    fr2['sum_nan'] = fr2['nan'].cumsum()
    fr2['sequenceNo'] = np.where(fr2['sequenceNo'] != -1, fr2['sequenceNo'] + fr2['sum_nan'] + fr2['count'], fr2['sequenceNo'])

    assert(fr1[fr1['sequenceNo'] != -1][fr1[fr1['sequenceNo'] != -1].duplicated('sequenceNo', keep=False)].shape[0] == 0)
    assert(fr2[fr2['sequenceNo'] != -1][fr2[fr2['sequenceNo'] != -1].duplicated('sequenceNo', keep=False)].shape[0] == 0)
    assert(sum(~fr1[fr1['sequenceNo'] == -1]['clockAtArrival'].isnull()) == 0)
    assert(sum(~fr2[fr2['sequenceNo'] == -1]['clockAtArrival'].isnull()) == 0)
    
    
    import pickle
    os.mkdir('/mnt/e/result/' + startDate)
    SH = fr1[fr1['tag'] == 'SH'][["skey", "date", "num", 'sequenceNo', "clockAtArrival"]]
    SH = SH.rename(columns={'sequenceNo':'sequenceNo_96', 'clockAtArrival':"clockAtArrival_96"})
    SH1 = fr2[fr2['tag'] == 'SH'][["skey", "date", "num", 'sequenceNo', "clockAtArrival"]]
    SH1 = SH1.rename(columns={'sequenceNo':'sequenceNo_88', 'clockAtArrival':"clockAtArrival_88"})
    SH = pd.merge(SH, SH1, on=['skey', 'date', 'num'])
    assert(SH.shape[0] == SH1.shape[0])
    SH.to_pickle('/mnt/e/result/' + startDate + '/SH.pkl')
    del SH

    SZ = fr1[fr1['tag'] == 'SZ'][["skey", "date", "num", 'sequenceNo', "clockAtArrival"]]
    SZ = SZ.rename(columns={'sequenceNo':'sequenceNo_96', 'clockAtArrival':"clockAtArrival_96"})
    SZ1 = fr2[fr2['tag'] == 'SZ'][["skey", "date", "num", 'sequenceNo', "clockAtArrival"]]
    SZ1 = SZ1.rename(columns={'sequenceNo':'sequenceNo_88', 'clockAtArrival':"clockAtArrival_88"})
    SZ = pd.merge(SZ, SZ1, on=['skey', 'date', 'num'])
    assert(SZ.shape[0] == SZ1.shape[0])
    SZ.to_pickle('/mnt/e/result/' + startDate + '/SZ.pkl')
    del SZ
    
    trade = fr1[fr1['tag'] == 'trade'][["skey", "date", "ApplSeqNum", 'sequenceNo', "clockAtArrival"]]
    trade = trade.rename(columns={'sequenceNo':'sequenceNo_96', 'clockAtArrival':"clockAtArrival_96"})
    trade1 = fr2[fr2['tag'] == 'trade'][["skey", "date", "ApplSeqNum", 'sequenceNo', "clockAtArrival"]]
    trade1 = trade1.rename(columns={'sequenceNo':'sequenceNo_88', 'clockAtArrival':"clockAtArrival_88"})
    trade = pd.merge(trade, trade1, on=["skey", "date", "ApplSeqNum"])
    assert(trade.shape[0] == trade1.shape[0])
    trade.to_pickle('/mnt/e/result/' + startDate + '/trade.pkl')
    del trade
    
    
    order = fr1[fr1['tag'] == 'order'][["skey", "date", "ApplSeqNum", 'sequenceNo', "clockAtArrival"]]
    order = order.rename(columns={'sequenceNo':'sequenceNo_96', 'clockAtArrival':"clockAtArrival_96"})
    order1 = fr2[fr2['tag'] == 'order'][["skey", "date", "ApplSeqNum", 'sequenceNo', "clockAtArrival"]]
    order1 = order1.rename(columns={'sequenceNo':'sequenceNo_88', 'clockAtArrival':"clockAtArrival_88"})
    order = pd.merge(order, order1, on=["skey", "date", "ApplSeqNum"])
    assert(order.shape[0] == order1.shape[0])
    order.to_pickle('/mnt/e/result/' + startDate + '/order.pkl')
    del order
    
    index = fr1[fr1['tag'] == 'index'][["skey", "date", "num", 'sequenceNo', "clockAtArrival"]]
    index = index.rename(columns={'sequenceNo':'sequenceNo_96', 'clockAtArrival':"clockAtArrival_96"})
    index1 = fr2[fr2['tag'] == 'index'][["skey", "date", "num", 'sequenceNo', "clockAtArrival"]]
    index1 = index1.rename(columns={'sequenceNo':'sequenceNo_88', 'clockAtArrival':"clockAtArrival_88"})
    index = pd.merge(index, index1, on=["skey", "date", "num"])
    assert(index.shape[0] == index1.shape[0])
    index.to_pickle('/mnt/e/result/' + startDate + '/index.pkl')
    del index
    del fr2
    del fr1
    
    print(str(i) + ' finished')



--------------------------------------------------------------------------------------------
20200909
SH lv2


skey
1688095    4858
1601702    4992
1605003    5136
1688559    5147
Name: date, dtype: int64

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


zt_88 finished
-------------------------------------------------------------------------------------------
SZ lv2
             date     skey          time  cum_volume  cum_amount_x  numTrades  \
88834    20200909  2000029   93200000000           0           0.0          0   
88835    20200909  2000029   93300000000           0           0.0          0   
88836    20200909  2000029   93400000000           0           0.0          0   
88837    20200909  2000029   93500000000           0           0.0          0   
88838    20200909  2000029   93600000000           0           0.0          0   
...           ...      ...           ...         ...           ...        ...   
8567680  20200909  2300678  145403000000           0           0.0          0   
8567681  20200909  2300678  145503000000           0           0.0          0   
8567682  20200909  2300678  145603000000           0           0.0          0   
8567684  20200909  2300678  145803000000           0           0.0          

  interactivity=interactivity, compiler=compiler, result=result)


skey
2002912        1
2002946        1
2002950        1
2300745        1
2002982        1
           ...  
2300441       27
1601702      716
1688095    10983
1605003    32716
1688559    71521
Name: date, Length: 371, dtype: int64

AssertionError: 

In [4]:
# Rows of re3 for skey 1600000 that have no sequenceNo.
re3.loc[re3['skey'].eq(1600000) & re3['sequenceNo'].isna()]

Unnamed: 0,skey,date,ApplSeqNum,sequenceNo,clockAtArrival


In [2]:
# Row count of `re`.
len(re)

68874433

In [3]:
# Row count of `trade`.
len(trade)

68874432

In [12]:
# Distinct skeys that have at least one row with a missing clockAtArrival.
re.loc[re['clockAtArrival'].isna(), 'skey'].unique()

array([1601702, 1605003, 1688095], dtype=int32)

In [15]:
# Every trade1 row whose (skey, ApplSeqNum) pair occurs more than once;
# keep=False marks all occurrences, not just the later ones.
trade1.loc[trade1.duplicated(subset=['skey', 'ApplSeqNum'], keep=False)]

Unnamed: 0,clockAtArrival,sequenceNo,exchId,securityType,__isRepeated,TransactTime,ChannelNo,ApplSeqNum,SecurityID,secid,mdSource,ExecType,TradeBSFlag,__origTickSeq,TradePrice,TradeQty,TradeMoney,BidApplSeqNum,OfferApplSeqNum,skey
40452,1599528315255486,3345407,1,1,0,92500000,6,58,688012,1688012,13,F,N,-1,1638000,300,491400000,147264,148633,1688012
68845317,1599549079728565,271026861,1,1,0,151104470,103,58,688012,1688012,13,F,S,-1,1617600,200,323520000,113,284,1688012


In [26]:
# Smallest skey among the trade1 rows received on ChannelNo 103.
trade1.loc[trade1['ChannelNo'].eq(103), 'skey'].min()

1688002

In [16]:
# Number of trade1 rows per ChannelNo (counted on the secid column).
# NOTE(review): the cell previously read `trade1['']`, which looks up an empty
# column name; this expression is reconstructed from the recorded output
# (indexed by ChannelNo, named secid) — confirm it is the intended query.
trade1.groupby('ChannelNo')['secid'].count()

ChannelNo
1        3103361
2        3191336
3        3479197
4        1986083
5        3607769
6        3219022
103          120
2011    12691691
2012    12955625
2013    12525553
2014    12085628
Name: secid, dtype: int64

In [None]:
# Every trade row whose (skey, ApplSeqNum) pair occurs more than once;
# keep=False marks all occurrences, not just the later ones.
trade.loc[trade.duplicated(subset=['skey', 'ApplSeqNum'], keep=False)]

In [9]:
# Preview the first five rows of `re`.
re.iloc[:5]

Unnamed: 0,skey,date,ApplSeqNum,sequenceNo,clockAtArrival
0,1600000,20200908,1449,3644786.0,1599528000000000.0
1,1600000,20200908,1450,3644787.0,1599528000000000.0
2,1600000,20200908,1451,3644788.0,1599528000000000.0
3,1600000,20200908,1452,3644789.0,1599528000000000.0
4,1600000,20200908,1453,3644790.0,1599528000000000.0


In [None]:
startDate = 20200901
endDate = 20200918
database_name = 'com_md_eq_cn'
# NOTE(review): credentials are hard-coded in the notebook. They should be read
# from the environment or a secrets store, and this password should be rotated.
user = "zhenyuy"
password = "bnONBrzSMGoE"

# Directory holding one sub-directory per trading day with the pickles
# (SH.pkl, SZ.pkl, trade.pkl, order.pkl, index.pkl) read below.
RESULT_ROOT = '/mnt/e/result/'


def _attach_seq_columns(frame, pickle_path, keys, sort_cols, drop_num=False):
    """Merge the pickled per-row columns at ``pickle_path`` onto ``frame``.

    Parameters
    ----------
    frame : DataFrame read from the database for one day.
    pickle_path : path of the pickle holding the extra columns for that day.
    keys : column names used as the merge key.
    sort_cols : column names the merged result is sorted by.
    drop_num : drop the helper ``num`` column after merging.

    Returns
    -------
    The merged DataFrame, sorted by ``sort_cols``.

    Raises
    ------
    AssertionError
        If the two inputs differ in row count, or the outer merge leaves a row
        without ``sequenceNo_88`` (database row with no pickle match) or
        without ``time`` (pickle row with no database match).
    """
    extra = pd.read_pickle(pickle_path)
    assert frame.shape[0] == extra.shape[0], \
        'row count mismatch for %s: %d vs %d' % (pickle_path, frame.shape[0], extra.shape[0])

    # An outer merge keeps unmatched rows from both sides so that the two
    # null checks below can detect them instead of silently dropping them.
    merged = pd.merge(frame, extra, on=keys, how='outer')
    assert merged[merged['sequenceNo_88'].isnull()].shape[0] == 0, \
        'database rows without a match in %s' % pickle_path
    assert merged[merged['time'].isnull()].shape[0] == 0, \
        'rows of %s without a match in the database' % pickle_path

    if drop_num:
        merged = merged.drop(['num'], axis=1)
    return merged.sort_values(by=sort_cols)


print('start to add four columns in database')
# The original call placed a positional argument after keyword arguments,
# which is a SyntaxError; the instrument filter is now passed by keyword.
# NOTE(review): assumes the fourth parameter of db.read is named `symbol` — confirm.
dl = db.read('md_index', start_date=startDate, end_date=endDate, symbol=1000300)['date'].unique()
for d in dl:
    day = str(d)
    day_dir = RESULT_ROOT + day

    # Build the merge key on the full snapshot before splitting it, so no
    # column is assigned onto a filtered slice (avoids SettingWithCopyWarning).
    snapshot = db.read('md_snapshot_l2', start_date=day, end_date=day)
    snapshot['num'] = snapshot['skey'] * 10000 + snapshot['ordering']
    SH = snapshot[snapshot['skey'] < 2000000]
    SZ = snapshot[snapshot['skey'] > 2000000]
    del snapshot

    SH = _attach_seq_columns(SH, day_dir + '/SH.pkl', ['skey', 'date', 'num'],
                             ['skey', 'ordering'], drop_num=True)
    db.write('md_snapshot_l2', SH)
    print('finish write snapshot SH data')

    SZ = _attach_seq_columns(SZ, day_dir + '/SZ.pkl', ['skey', 'date', 'num'],
                             ['skey', 'ordering'], drop_num=True)
    db.write('md_snapshot_l2', SZ)
    print('finish write snapshot SZ data')

    trade = db.read('md_trade', start_date=day, end_date=day)
    trade = _attach_seq_columns(trade, day_dir + '/trade.pkl', ['skey', 'date', 'ApplSeqNum'],
                                ['skey', 'ApplSeqNum'])
    db.write('md_trade', trade)
    print('finish write trade data')

    order = db.read('md_order', start_date=day, end_date=day)
    order = _attach_seq_columns(order, day_dir + '/order.pkl', ['skey', 'date', 'ApplSeqNum'],
                                ['skey', 'ApplSeqNum'])
    db.write('md_order', order)
    print('finish write order data')

    index = db.read('md_index', start_date=day, end_date=day)
    index['num'] = index['skey'] * 10000 + index['ordering']
    index = _attach_seq_columns(index, day_dir + '/index.pkl', ['skey', 'date', 'num'],
                                ['skey', 'ordering'], drop_num=True)
    db.write('md_index', index)
    print('finish write index data')

    
    
    
    