In [13]:
# 2020 version
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()

import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime

columns1 = ["Date","OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID","PreClosePx",
                   "PxChnage1","PXChange2","openPrice","HighPx","LowPx","close","NumTrades","cum_volume","cum_amount","PE1","PE2","TradingPhase",
                   "totalofferqty", "wa_offerPrice", "totalbidqty", "wa_bidPrice", "PreNAV", "RealTimeNAV", "WarrantPremiumRate", "UpLimitPx",
                   "DownLimitPx", "TotalLongPosition", "unknown1", "unknown2", "unknown3"]
columns2 = ['Date',"OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID",'ask1p','bid1p',
                   "ask1q","bid1q", 'ask2p','bid2p',"ask2q","bid2q",'ask3p','bid3p',"ask3q","bid3q",'ask4p','bid4p',"ask4q","bid4q",'ask5p',
                    'bid5p',"ask5q","bid5q",'ask6p','bid6p',"ask6q","bid6q",'ask7p','bid7p',"ask7q","bid7q",'ask8p','bid8p',"ask8q","bid8q",
                   'ask9p','bid9p',"ask9q","bid9q",'ask10p','bid10p',"ask10q","bid10q","NUMORDERS_B1","NOORDERS_B1","ORDERQTY_B1",
                    "NUMORDERS_S1","NOORDERS_S1","ORDERQTY_S1"]
columns3 =  ["Date","OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID","PreClosePx",
                   "PxChnage1","PXChange2","openPrice","HighPx","LowPx","close","NumTrades","cum_volume","cum_amount","PE1","PE2","TradingPhase",
                   "totalofferqty", "wa_offerPrice", "totalbidqty", "wa_bidPrice", "PreNAV", "RealTimeNAV", "WarrantPremiumRate", "UpLimitPx",
                   "DownLimitPx", "TotalLongPosition"]

# startTm = datetime.datetime.now()
# readPath = r'\\192.168.10.30\Kevin_zhenyu\day_stock\***'
# dataPathLs = np.array(glob.glob(readPath))
# dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SZ' for i in dataPathLs])]]
# db = pd.DataFrame()
# for p in dataPathLs:
#     dayData = pd.read_csv(p, compression='gzip')
#     db = pd.concat([db, dayData])
# print(datetime.datetime.now() - startTm)

startTm = datetime.datetime.now()
startDate = "20200327"
endDate = "20200327"
df = []
bad = []
readPath = 'A:\\rawData\\logs_***_zs_92_01_day_data'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i).split('_')[1] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

for data in dataPathLs:
    readPath = data + '\\mdOrderLog***'
    dataPathLs = np.array(glob.glob(readPath))
    OrderLog1 = pd.read_csv(dataPathLs[0], encoding="utf-8").loc[:, ["clockAtArrival", "sequenceNo", "exchId", "TransactTime",
                                                 "ApplSeqNum", "SecurityID", "Side", "OrderType", "Price",
                                                 "OrderQty"]]
    OrderLog1 = OrderLog1[(OrderLog1["SecurityID"] < 4000) | (OrderLog1["SecurityID"] > 300000)]
    OrderLog1 = OrderLog1.rename(columns={"Side":"order_side", "OrderType":"order_type", "Price":"order_price",
                                             "OrderQty":'order_qty'})
    OrderLog1['date'] = int(os.path.basename(dataPathLs[0]).split('_')[1])
    OrderLog1["skey"] = OrderLog1["SecurityID"] + 2000000
    OrderLog1["time"] = OrderLog1['TransactTime'].astype(np.int64)*1000
    OrderLog1['TransactTime'] = OrderLog1['date'] * 1000000000 + OrderLog1['TransactTime']
    OrderLog1["clockAtArrival"] = OrderLog1["TransactTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    OrderLog1['datetime'] = OrderLog1["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    OrderLog1["order_type"] =np.where(OrderLog1["order_type"] == 'U', 3, OrderLog1["order_type"])
    for col in ["skey", "date", "ApplSeqNum", "order_qty", "order_side", "order_type"]:
        OrderLog1[col] = OrderLog1[col].astype('int32')
    OrderLog1['order_price'] = OrderLog1['order_price']/10000
    display(OrderLog1["order_price"].astype(str).apply(lambda x: len(x.split('.')[1])).unique())
    
    assert(OrderLog1[((OrderLog1["order_side"] != 1) & (OrderLog1["order_side"] != 2)) | (OrderLog1["order_type"].isnull())].shape[0] == 0)
    da_te = str(OrderLog1["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    sl = (db1["ID"].str[2:].astype(int) + 2000000).unique()
    del db1
    try:
        assert(len(set(sl) - set(OrderLog1["skey"].unique())) == 0)
    except:
        print(set(sl) - set(OrderLog1["skey"].unique()))
    
    OrderLog1 = OrderLog1[["skey", "date", "time", "clockAtArrival", "datetime", "ApplSeqNum", "order_side", "order_type", "order_price",
                                                 "order_qty"]]
    
    print(OrderLog1["date"].iloc[0])
    print("order finished")
    
    print(datetime.datetime.now() - startTm)

startDate = 20200327
endDate = 20200327
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"
db1 = DB("192.168.10.223", database_name, user, password)
trade = db1.read('md_trade', start_date=startDate, end_date=endDate)
trade = trade[trade['skey'] > 2000000]
t1 = trade.groupby('skey')['BidApplSeqNum'].unique().reset_index()
t2 = trade.groupby('skey')['OfferApplSeqNum'].unique().reset_index()
t3 = OrderLog1.groupby('skey')['ApplSeqNum'].unique().reset_index()
t = pd.merge(t1, t2, on='skey', how='outer')
display(t[(t['BidApplSeqNum'].isnull()) | (t['OfferApplSeqNum'].isnull())])
t['union'] = [list(set(a) | set(b)) for a, b in zip(t.BidApplSeqNum, t.OfferApplSeqNum)]
t = pd.merge(t, t3, on='skey', how='outer')
display(t[(t['BidApplSeqNum'].isnull()) | (t['OfferApplSeqNum'].isnull()) | (t['ApplSeqNum'].isnull())])
t['less'] = [len(set(a) - set(b)) for a, b in zip(t.union, t.ApplSeqNum)]
t['less1'] = [list(set(a) - set(b))[0] for a, b in zip(t.union, t.ApplSeqNum)]
display(t['less1'].unique())
t[t['less'] > 1]

array([2, 1], dtype=int64)

{2002977, 2300819, 2300821, 2300822, 2300823, 2300825}
20200327
order finished
0:09:00.531404


Unnamed: 0,skey,BidApplSeqNum,OfferApplSeqNum


Unnamed: 0,skey,BidApplSeqNum,OfferApplSeqNum,union,ApplSeqNum
1398,2002977,"[694, 1092, 54381, 16564, 0, 93550, 105808, 11...","[0, 55123, 57506, 134503, 131765, 46842, 5960,...","[0, 393216, 2, 3, 786436, 5, 6, 7, 8, 9, 78643...",
2192,2300819,"[22161, 95881, 0, 134998, 143208, 151795, 1168...","[0, 21933, 51189, 135659, 117144, 150273, 1539...","[0, 12189696, 917504, 13959168, 2686982, 59637...",
2194,2300821,"[0, 82924, 85511, 72015, 141174, 81984, 148629...","[108453, 76094, 0, 34335, 140114, 145657, 1365...","[0, 6815744, 12189698, 13762561, 2621443, 6160...",
2195,2300822,"[10589, 0, 49813, 129419, 130898, 131353, 1375...","[0, 26772, 62504, 71638, 135411, 133882, 13582...","[0, 16908288, 4587529, 4587530, 11403276, 1310...",
2196,2300823,"[5882, 5883, 5884, 5903, 5902, 5904, 11665, 13...","[0, 90705, 101163, 99091, 134440, 50522, 14015...","[0, 1, 2, 4, 5, 12976134, 4194311, 13762567, 9...",
2197,2300825,"[248, 37340, 42273, 45625, 60095, 60017, 61741...","[0, 142805, 165160, 207855, 217078, 179760, 18...","[0, 1, 2, 3, 4, 196613, 6, 8, 10, 11, 12, 13, ...",


TypeError: 'float' object is not iterable

In [14]:
t = pd.merge(t1, t2, on='skey')
t['union'] = [list(set(a) | set(b)) for a, b in zip(t.BidApplSeqNum, t.OfferApplSeqNum)]
t = pd.merge(t, t3, on='skey')
t['less'] = [len(set(a) - set(b)) for a, b in zip(t.union, t.ApplSeqNum)]
t['less1'] = [list(set(a) - set(b))[0] for a, b in zip(t.union, t.ApplSeqNum)]
display(t['less1'].unique())
t[t['less'] > 1]

array([0], dtype=int64)

Unnamed: 0,skey,BidApplSeqNum,OfferApplSeqNum,union,ApplSeqNum,less,less1
12,2000016,"[0, 84818, 36514, 138993, 139668, 139126, 1440...","[5031, 64213, 73250, 0, 93332, 21514, 126063, ...","[0, 12582913, 2621449, 12582924, 17825807, 104...","[436, 709, 992, 995, 1568, 1596, 1835, 1838, 1...",2,0
208,2000651,"[27137, 27136, 37406, 44478, 0, 73088, 81731, ...","[0, 65212, 65435, 55626, 47667, 47668, 95164, ...","[0, 1572867, 2883589, 2097160, 5767178, 262161...","[125, 130, 131, 140, 149, 201, 269, 270, 272, ...",2,0
264,2000725,"[48, 19557, 30114, 0, 38845, 38854, 46441, 497...","[0, 13512, 42058, 9914, 67107, 37566, 76448, 8...","[0, 1572865, 3670022, 12582918, 14155782, 1048...","[18, 48, 53, 65, 85, 111, 120, 121, 122, 123, ...",2,0
302,2000789,"[0, 46957, 127386, 128405, 146109, 128396, 467...","[37428, 107711, 0, 106598, 104751, 86892, 1935...","[0, 8192002, 2949124, 10420229, 10289158, 6094...","[511, 984, 1076, 1156, 1494, 1783, 1983, 2678,...",2,0
303,2000790,"[0, 42444, 95209, 130793, 122137, 142155, 2097...","[112615, 0, 86160, 97131, 103406, 138810, 1402...","[0, 131072, 13369346, 2359300, 13598725, 56361...","[2810, 3091, 3099, 4221, 4222, 4223, 4397, 494...",2,0
420,2000961,"[40952, 33796, 33795, 102154, 47901, 128036, 0...","[0, 152713, 127053, 127055, 127069, 258175, 20...","[0, 11403273, 13238289, 8650769, 9502739, 1638...","[1356, 1455, 2452, 3012, 3044, 3190, 3191, 348...",2,0
442,2000988,"[0, 75928, 93528, 94756, 103219, 117321, 53167...","[35425, 35426, 0, 109262, 125310, 125539, 1259...","[0, 16580608, 14876673, 16646147, 7733252, 347...","[1544, 1668, 2501, 2603, 2704, 2984, 3071, 313...",2,0
478,2002024,"[1912, 0, 108015, 48653, 132291, 85992, 47299,...","[0, 67918, 100170, 37175, 7619, 54471, 11873, ...","[0, 12320773, 851977, 9306123, 589838, 1369704...","[1462, 1543, 1636, 1840, 1846, 1912, 2252, 334...",2,0
484,2002030,"[0, 62637, 86610, 120606, 126463, 130471, 5150...","[60657, 0, 71947, 72085, 78606, 88562, 40162, ...","[0, 3801088, 11403265, 13238272, 2752518, 9306...","[851, 1090, 1209, 1348, 1370, 1371, 1374, 1375...",2,0
497,2002043,"[64386, 47599, 119054, 0, 146388, 148647, 1986...","[0, 130950, 293461, 147382, 226666, 299730, 27...","[0, 8585216, 17806254, 786443, 18808844, 16285...","[2043, 2185, 3193, 4031, 5791, 5792, 5807, 905...",2,0


In [15]:
# 2020 ftp data version
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()

import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime

columns1 = ["Date","OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID","PreClosePx",
                   "PxChnage1","PXChange2","openPrice","HighPx","LowPx","close","NumTrades","cum_volume","cum_amount","PE1","PE2","TradingPhase",
                   "totalofferqty", "wa_offerPrice", "totalbidqty", "wa_bidPrice", "PreNAV", "RealTimeNAV", "WarrantPremiumRate", "UpLimitPx",
                   "DownLimitPx", "TotalLongPosition", "unknown1", "unknown2", "unknown3"]
columns2 = ['Date',"OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID",'ask1p','bid1p',
                   "ask1q","bid1q", 'ask2p','bid2p',"ask2q","bid2q",'ask3p','bid3p',"ask3q","bid3q",'ask4p','bid4p',"ask4q","bid4q",'ask5p',
                    'bid5p',"ask5q","bid5q",'ask6p','bid6p',"ask6q","bid6q",'ask7p','bid7p',"ask7q","bid7q",'ask8p','bid8p',"ask8q","bid8q",
                   'ask9p','bid9p',"ask9q","bid9q",'ask10p','bid10p',"ask10q","bid10q","NUMORDERS_B1","NOORDERS_B1","ORDERQTY_B1",
                    "NUMORDERS_S1","NOORDERS_S1","ORDERQTY_S1"]
columns3 =  ["Date","OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID","PreClosePx",
                   "PxChnage1","PXChange2","openPrice","HighPx","LowPx","close","NumTrades","cum_volume","cum_amount","PE1","PE2","TradingPhase",
                   "totalofferqty", "wa_offerPrice", "totalbidqty", "wa_bidPrice", "PreNAV", "RealTimeNAV", "WarrantPremiumRate", "UpLimitPx",
                   "DownLimitPx", "TotalLongPosition"]

# startTm = datetime.datetime.now()
# readPath = r'\\192.168.10.30\Kevin_zhenyu\day_stock\***'
# dataPathLs = np.array(glob.glob(readPath))
# dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SZ' for i in dataPathLs])]]
# db = pd.DataFrame()
# for p in dataPathLs:
#     dayData = pd.read_csv(p, compression='gzip')
#     db = pd.concat([db, dayData])
# print(datetime.datetime.now() - startTm)

startTm = datetime.datetime.now()
year = "2020"
startDate = "0316"
endDate = "0316"
df = []
bad = []
readPath = 'L:\\backup_data\\' + year + '\\***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

for data in dataPathLs:
    if len(np.array(glob.glob(data +'\\***'))) == 0:
        continue
    
    if len(np.array(glob.glob(data +'\\am_hq_order_spot.7z'))) == 1:
        date = os.path.basename(data)
        path = 'L:\\backup_data\\' + year 
        os.chdir(data)
        try:
            a = py7zr.SevenZipFile(data + '\\am_hq_order_spot.7z','r',filters=None)
        except:
            print("Bad unzip here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(data + '\\am_hq_order_spot.7z')
            bad.append(data + '\\am_hq_order_spot.7z')
            continue
        path1 = path + '\\' + date
        a.extractall(path = path1)
        a.close()
        try:
            a = py7zr.SevenZipFile(data + '\\pm_hq_order_spot.7z','r',filters=None)
        except:
            print("Bad unzip here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(data + '\\pm_hq_order_spot.7z')
            bad.append(data + '\\pm_hq_order_spot.7z')
            continue
        a.extractall(path = path1)
        a.close()
        
        am_order = pd.read_table(path1 + '\\am_hq_order_spot.txt',header=None)
        am_order.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "order_price",
                   "order_qty","TransactTime","order_side","order_type","ConfirmID","Contactor","ContactInfo","ExpirationDays","ExpirationType"]
        pm_order = pd.read_table(path1 + '\\pm_hq_order_spot.txt',header=None)
        pm_order.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "order_price",
                   "order_qty","TransactTime","order_side","order_type","ConfirmID","Contactor","ContactInfo","ExpirationDays","ExpirationType"]
        OrderLog1 = pd.concat([am_order, pm_order])
        del am_order
        del pm_order
  
    
    elif len(np.array(glob.glob(data +'\\am_hq_order_spot.7z.001'))) == 1:
        date = os.path.basename(data)
        path = 'L:\\backup_data\\' + year 
        os.chdir(data)
        os.system("copy /b am_hq_order_spot.7z.* am_hq_order_spot.7z")
        try:
            a = py7zr.SevenZipFile(data + '\\am_hq_order_spot.7z','r',filters=None)
        except:
            print("Bad unzip here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(data + '\\am_hq_order_spot.7z')
            bad.append(data + '\\am_hq_order_spot.7z')
            continue
        path1 = path + '\\' + date
        a.extractall(path = path1)
        a.close()
        os.system("copy /b pm_hq_order_spot.7z.* pm_hq_order_spot.7z")
        try:
            a = py7zr.SevenZipFile(data + '\\pm_hq_order_spot.7z','r',filters=None)
        except:
            print("Bad unzip here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(data + '\\pm_hq_order_spot.7z')
            bad.append(data + '\\pm_hq_order_spot.7z')
            continue
        a.extractall(path = path1)
        a.close()
        
        am_order = pd.read_table(path1 + '\\am_hq_order_spot.txt',header=None)
        am_order.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "order_price",
                   "order_qty","TransactTime","order_side","order_type","ConfirmID","Contactor","ContactInfo","ExpirationDays","ExpirationType"]
        pm_order = pd.read_table(path1 + '\\pm_hq_order_spot.txt',header=None)
        pm_order.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "order_price",
                   "order_qty","TransactTime","order_side","order_type","ConfirmID","Contactor","ContactInfo","ExpirationDays","ExpirationType"]
        OrderLog1 = pd.concat([am_order, pm_order])
        del am_order
        del pm_order

    elif len(np.array(glob.glob(data +'\\hq_order.7z'))) == 1:
        date = os.path.basename(data)
        path = 'L:\\backup_data\\' + year 
        os.chdir(data)
        try:
            a = py7zr.SevenZipFile(data + '\\hq_order.7z','r',filters=None)
        except:
            print("Bad unzip here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(data + '\\hq_order.7z')
            bad.append(data + '\\hq_order.7z')
            continue
        path1 = path + '\\' + date
        a.extractall(path = path1)
        a.close()
        OrderLog1 = pd.read_table(path1 + '\\hq_order.txt',header=None)
        OrderLog1.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "order_price",
                   "order_qty","TransactTime","order_side","order_type","ConfirmID","Contactor","ContactInfo","ExpirationDays","ExpirationType"]
    
    
    OrderLog1 = OrderLog1[(OrderLog1["SecurityID"] < 4000) | (OrderLog1["SecurityID"] > 300000)]
    OrderLog1["skey"] = OrderLog1["SecurityID"] + 2000000
    OrderLog1["clockAtArrival"] = OrderLog1["TransactTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    OrderLog1['datetime'] = OrderLog1["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    OrderLog1["time"] = (OrderLog1['TransactTime'] - int(OrderLog1['TransactTime'].iloc[0]//1000000000*1000000000)).astype(np.int64)*1000
    OrderLog1["order_type"] =np.where(OrderLog1["order_type"] == 'U', 3, OrderLog1["order_type"])
    for col in ["skey", "date", "ApplSeqNum", "order_qty", "order_side", "order_type"]:
        OrderLog1[col] = OrderLog1[col].astype('int32')
    display(OrderLog1["order_price"].astype(str).apply(lambda x: len(x.split('.')[1])).unique())
    
    assert(OrderLog1[((OrderLog1["order_side"] != 1) & (OrderLog1["order_side"] != 2)) | (OrderLog1["order_type"].isnull())].shape[0] == 0)
    da_te = str(OrderLog1["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    sl = (db1["ID"].str[2:].astype(int) + 2000000).unique()
    del db1
    try:
        assert(len(set(sl) - set(OrderLog1["skey"].unique())) == 0)
    except:
        print(set(sl) - set(OrderLog1["skey"].unique()))
    
    OrderLog1 = OrderLog1[["skey", "date", "time", "clockAtArrival", "datetime", "ApplSeqNum", "order_side", "order_type", "order_price",
                                                 "order_qty"]]
    
    print(OrderLog1.dtypes)
    print(OrderLog1["date"].iloc[0])
    print("order finished")
    
    print(datetime.datetime.now() - startTm)

  interactivity=interactivity, compiler=compiler, result=result)


array([2, 1], dtype=int64)

skey                       int32
date                       int32
time                       int64
clockAtArrival             int64
datetime          datetime64[ns]
ApplSeqNum                 int32
order_side                 int32
order_type                 int32
order_price              float64
order_qty                  int32
dtype: object
20200316
order finished
0:36:31.944359


ServerSelectionTimeoutError: 192.168.10.223:27017: [WinError 10061] 由于目标计算机积极拒绝，无法连接。

In [16]:
year = "2020"
startDate = '20200316'
endDate = '20200316'
readPath = 'K:\\data\\' + year + '\\***\\***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

for data in dataPathLs:
    
    startTm = datetime.datetime.now()
    
    readPath = data + '\\SZ\\tick\\***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs < 4000) | ((dateLs > 300000) & (dateLs < 310000))]
    trade = []
    ll = []
    
    for i in dataPathLs:
        try:
            df = pd.read_csv(i)
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["SecurityID"] = int(os.path.basename(i).split('.')[0])
        trade += [df]
    trade = pd.concat(trade).reset_index(drop=True)
    trade = trade[trade["ChannelNo"] != 4001]

    trade["skey"] = trade["SecurityID"] + 2000000

t1 = trade.groupby('skey')['BidApplSeqNum'].unique().reset_index()
t2 = trade.groupby('skey')['OfferApplSeqNum'].unique().reset_index()
t3 = OrderLog1.groupby('skey')['ApplSeqNum'].unique().reset_index()
t = pd.merge(t1, t2, on='skey', how='outer')
display(t[(t['BidApplSeqNum'].isnull()) | (t['OfferApplSeqNum'].isnull())])
t['union'] = [list(set(a) | set(b)) for a, b in zip(t.BidApplSeqNum, t.OfferApplSeqNum)]
t = pd.merge(t, t3, on='skey', how='outer')
display(t[(t['BidApplSeqNum'].isnull()) | (t['OfferApplSeqNum'].isnull()) | (t['ApplSeqNum'].isnull())])
t['less'] = [len(set(a) - set(b)) for a, b in zip(t.union, t.ApplSeqNum)]
t['less1'] = [list(set(a) - set(b))[0] for a, b in zip(t.union, t.ApplSeqNum)]
display(t['less1'].unique())
t[t['less'] > 1]

Unnamed: 0,skey,BidApplSeqNum,OfferApplSeqNum


Unnamed: 0,skey,BidApplSeqNum,OfferApplSeqNum,union,ApplSeqNum


array([0], dtype=int64)

Unnamed: 0,skey,BidApplSeqNum,OfferApplSeqNum,union,ApplSeqNum,less,less1


In [None]:
startDate = 20200316
endDate = 20200316
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"
db1 = DB("192.168.10.223", database_name, user, password)
trade = db1.read('md_trade', start_date=startDate, end_date=endDate)
trade = trade[trade['skey'] > 2000000]
t1 = trade.groupby('skey')['BidApplSeqNum'].unique().reset_index()
t2 = trade.groupby('skey')['OfferApplSeqNum'].unique().reset_index()
t3 = OrderLog1.groupby('skey')['ApplSeqNum'].unique().reset_index()
t = pd.merge(t1, t2, on='skey', how='outer')
display(t[(t['BidApplSeqNum'].isnull()) | (t['OfferApplSeqNum'].isnull())])
t['union'] = [list(set(a) | set(b)) for a, b in zip(t.BidApplSeqNum, t.OfferApplSeqNum)]
t = pd.merge(t, t3, on='skey', how='outer')
display(t[(t['BidApplSeqNum'].isnull()) | (t['OfferApplSeqNum'].isnull()) | (t['ApplSeqNum'].isnull())])
t['less'] = [len(set(a) - set(b)) for a, b in zip(t.union, t.ApplSeqNum)]
t['less1'] = [list(set(a) - set(b))[0] for a, b in zip(t.union, t.ApplSeqNum)]
display(t['less1'].unique())
t[t['less'] > 1]

In [19]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()

import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime

columns1 = ["Date","OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID","PreClosePx",
                   "PxChnage1","PXChange2","openPrice","HighPx","LowPx","close","NumTrades","cum_volume","cum_amount","PE1","PE2","TradingPhase",
                   "totalofferqty", "wa_offerPrice", "totalbidqty", "wa_bidPrice", "PreNAV", "RealTimeNAV", "WarrantPremiumRate", "UpLimitPx",
                   "DownLimitPx", "TotalLongPosition", "unknown1", "unknown2", "unknown3"]
columns2 = ['Date',"OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID",'ask1p','bid1p',
                   "ask1q","bid1q", 'ask2p','bid2p',"ask2q","bid2q",'ask3p','bid3p',"ask3q","bid3q",'ask4p','bid4p',"ask4q","bid4q",'ask5p',
                    'bid5p',"ask5q","bid5q",'ask6p','bid6p',"ask6q","bid6q",'ask7p','bid7p',"ask7q","bid7q",'ask8p','bid8p',"ask8q","bid8q",
                   'ask9p','bid9p',"ask9q","bid9q",'ask10p','bid10p',"ask10q","bid10q","NUMORDERS_B1","NOORDERS_B1","ORDERQTY_B1",
                    "NUMORDERS_S1","NOORDERS_S1","ORDERQTY_S1"]
columns3 =  ["Date","OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID","PreClosePx",
                   "PxChnage1","PXChange2","openPrice","HighPx","LowPx","close","NumTrades","cum_volume","cum_amount","PE1","PE2","TradingPhase",
                   "totalofferqty", "wa_offerPrice", "totalbidqty", "wa_bidPrice", "PreNAV", "RealTimeNAV", "WarrantPremiumRate", "UpLimitPx",
                   "DownLimitPx", "TotalLongPosition"]

# startTm = datetime.datetime.now()
# readPath = r'\\192.168.10.30\Kevin_zhenyu\day_stock\***'
# dataPathLs = np.array(glob.glob(readPath))
# dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SZ' for i in dataPathLs])]]
# db = pd.DataFrame()
# for p in dataPathLs:
#     dayData = pd.read_csv(p, compression='gzip')
#     db = pd.concat([db, dayData])
# print(datetime.datetime.now() - startTm)

startTm = datetime.datetime.now()
year = "2019"
startDate = "1113"
endDate = "1113"
df = []
bad = []
readPath = 'L:\\backup_data\\' + year + '\\***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

for data in dataPathLs:
    am_order = pd.read_table(data + '\\am_hq_order_spot.txt',header=None)
    am_order.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "order_price",
                   "order_qty","TransactTime","order_side","order_type","ConfirmID","Contactor","ContactInfo","ExpirationDays","ExpirationType"]
    pm_order = pd.read_table(data + '\\pm_hq_order_spot.txt',header=None)
    pm_order.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "order_price",
                   "order_qty","TransactTime","order_side","order_type","ConfirmID","Contactor","ContactInfo","ExpirationDays","ExpirationType"]
    OrderLog1 = pd.concat([am_order, pm_order])
    del am_order
    del pm_order

    OrderLog1 = OrderLog1[(OrderLog1["SecurityID"] < 4000) | (OrderLog1["SecurityID"] > 300000)]
    OrderLog1["skey"] = OrderLog1["SecurityID"] + 2000000
    OrderLog1["clockAtArrival"] = OrderLog1["TransactTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    OrderLog1['datetime'] = OrderLog1["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    OrderLog1["time"] = (OrderLog1['TransactTime'] - int(OrderLog1['TransactTime'].iloc[0]//1000000000*1000000000)).astype(np.int64)*1000
    OrderLog1["order_type"] =np.where(OrderLog1["order_type"] == 'U', 3, OrderLog1["order_type"])
    for col in ["skey", "date", "ApplSeqNum", "order_qty", "order_side", "order_type"]:
        OrderLog1[col] = OrderLog1[col].astype('int32')
    display(OrderLog1["order_price"].astype(str).apply(lambda x: len(x.split('.')[1])).unique())
    
    assert(OrderLog1[((OrderLog1["order_side"] != 1) & (OrderLog1["order_side"] != 2)) | (OrderLog1["order_type"].isnull())].shape[0] == 0)
    da_te = str(OrderLog1["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    sl = (db1["ID"].str[2:].astype(int) + 2000000).unique()
    del db1
    try:
        assert(len(set(sl) - set(OrderLog1["skey"].unique())) == 0)
    except:
        print(set(sl) - set(OrderLog1["skey"].unique()))
    
    OrderLog1 = OrderLog1[["skey", "date", "time", "clockAtArrival", "datetime", "ApplSeqNum", "order_side", "order_type", "order_price",
                                                 "order_qty"]]
    
    print(OrderLog1.dtypes)
    print(OrderLog1["date"].iloc[0])
    print("order finished")
    
    print(datetime.datetime.now() - startTm)

    
startDate = 20191113
endDate = 20191113
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"
db1 = DB("192.168.10.178", database_name, user, password)
trade = db1.read('md_trade', start_date=startDate, end_date=endDate)
trade = trade[trade['skey'] > 2000000]
t1 = trade.groupby('skey')['BidApplSeqNum'].unique().reset_index()
t2 = trade.groupby('skey')['OfferApplSeqNum'].unique().reset_index()
t3 = OrderLog1.groupby('skey')['ApplSeqNum'].unique().reset_index()
t = pd.merge(t1, t2, on='skey')
t['union'] = [list(set(a) | set(b)) for a, b in zip(t.BidApplSeqNum, t.OfferApplSeqNum)]
t = pd.merge(t, t3, on='skey')
t['less'] = [len(set(a) - set(b)) for a, b in zip(t.union, t.ApplSeqNum)]
t[t['less'] > 1]



0:03:41.638007


array([2, 1], dtype=int64)

{2001914}
skey                       int32
date                       int32
time                       int64
clockAtArrival             int64
datetime          datetime64[ns]
ApplSeqNum                 int32
order_side                 int32
order_type                 int32
order_price              float64
order_qty                  int32
dtype: object
20191113
order finished
0:07:15.950505


Unnamed: 0,skey,BidApplSeqNum,OfferApplSeqNum,union,ApplSeqNum,less


In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()

import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime

columns1 = ["Date","OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID","PreClosePx",
                   "PxChnage1","PXChange2","openPrice","HighPx","LowPx","close","NumTrades","cum_volume","cum_amount","PE1","PE2","TradingPhase",
                   "totalofferqty", "wa_offerPrice", "totalbidqty", "wa_bidPrice", "PreNAV", "RealTimeNAV", "WarrantPremiumRate", "UpLimitPx",
                   "DownLimitPx", "TotalLongPosition", "unknown1", "unknown2", "unknown3"]
columns2 = ['Date',"OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID",'ask1p','bid1p',
                   "ask1q","bid1q", 'ask2p','bid2p',"ask2q","bid2q",'ask3p','bid3p',"ask3q","bid3q",'ask4p','bid4p',"ask4q","bid4q",'ask5p',
                    'bid5p',"ask5q","bid5q",'ask6p','bid6p',"ask6q","bid6q",'ask7p','bid7p',"ask7q","bid7q",'ask8p','bid8p',"ask8q","bid8q",
                   'ask9p','bid9p',"ask9q","bid9q",'ask10p','bid10p',"ask10q","bid10q","NUMORDERS_B1","NOORDERS_B1","ORDERQTY_B1",
                    "NUMORDERS_S1","NOORDERS_S1","ORDERQTY_S1"]
columns3 =  ["Date","OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID","PreClosePx",
                   "PxChnage1","PXChange2","openPrice","HighPx","LowPx","close","NumTrades","cum_volume","cum_amount","PE1","PE2","TradingPhase",
                   "totalofferqty", "wa_offerPrice", "totalbidqty", "wa_bidPrice", "PreNAV", "RealTimeNAV", "WarrantPremiumRate", "UpLimitPx",
                   "DownLimitPx", "TotalLongPosition"]

startTm = datetime.datetime.now()
readPath = r'\\192.168.10.30\Kevin_zhenyu\day_stock\***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SZ' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

startTm = datetime.datetime.now()
year = "2020"
startDate = "0316"
endDate = "0316"
df = []
bad = []
readPath = 'L:\\backup_data\\' + year + '\\***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

for data in dataPathLs:
    
    path1 = data
    am_order = pd.read_table(path1 + '\\am_hq_order_spot.txt',header=None)
    am_order.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "order_price",
               "order_qty","TransactTime","order_side","order_type","ConfirmID","Contactor","ContactInfo","ExpirationDays","ExpirationType"]
    pm_order = pd.read_table(path1 + '\\pm_hq_order_spot.txt',header=None)
    pm_order.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "order_price",
               "order_qty","TransactTime","order_side","order_type","ConfirmID","Contactor","ContactInfo","ExpirationDays","ExpirationType"]
    OrderLog1 = pd.concat([am_order, pm_order])
    del am_order
    del pm_order

    OrderLog1 = OrderLog1[(OrderLog1["SecurityID"] < 4000) | (OrderLog1["SecurityID"] > 300000)]
    OrderLog1["skey"] = OrderLog1["SecurityID"] + 2000000
    OrderLog1["clockAtArrival"] = OrderLog1["TransactTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    OrderLog1['datetime'] = OrderLog1["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    OrderLog1["time"] = (OrderLog1['TransactTime'] - int(OrderLog1['TransactTime'].iloc[0]//1000000000*1000000000)).astype(np.int64)*1000
    OrderLog1["order_type"] =np.where(OrderLog1["order_type"] == 'U', 3, OrderLog1["order_type"])
    for col in ["skey", "date", "ApplSeqNum", "order_qty", "order_side", "order_type"]:
        OrderLog1[col] = OrderLog1[col].astype('int32')
    display(OrderLog1["order_price"].astype(str).apply(lambda x: len(x.split('.')[1])).unique())
    
    assert(OrderLog1[((OrderLog1["order_side"] != 1) & (OrderLog1["order_side"] != 2)) | (OrderLog1["order_type"].isnull())].shape[0] == 0)
    da_te = str(OrderLog1["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    sl = (db1["ID"].str[2:].astype(int) + 2000000).unique()
    del db1
    try:
        assert(len(set(sl) - set(OrderLog1["skey"].unique())) == 0)
    except:
        print(set(sl) - set(OrderLog1["skey"].unique()))
    
    OrderLog1 = OrderLog1[["skey", "date", "time", "clockAtArrival", "datetime", "ApplSeqNum", "order_side", "order_type", "order_price",
                                                 "order_qty"]]
    
    print(OrderLog1.dtypes)
    print(OrderLog1["date"].iloc[0])
    print("order finished")
    
    print(datetime.datetime.now() - startTm)
    
startDate = 20200316
endDate = 20200316
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"
db1 = DB("192.168.10.178", database_name, user, password)
trade = db1.read('md_trade', start_date=startDate, end_date=endDate)
trade = trade[trade['skey'] > 2000000]
t1 = trade.groupby('skey')['BidApplSeqNum'].unique().reset_index()
t2 = trade.groupby('skey')['OfferApplSeqNum'].unique().reset_index()
t3 = OrderLog1.groupby('skey')['ApplSeqNum'].unique().reset_index()
t = pd.merge(t1, t2, on='skey')
t['union'] = [list(set(a) | set(b)) for a, b in zip(t.BidApplSeqNum, t.OfferApplSeqNum)]
t = pd.merge(t, t3, on='skey')
t['less'] = [len(set(a) - set(b)) for a, b in zip(t.union, t.ApplSeqNum)]
t[t['less'] > 1]



0:04:47.195677


  interactivity=interactivity, compiler=compiler, result=result)


array([2, 1], dtype=int64)

skey                       int32
date                       int32
time                       int64
clockAtArrival             int64
datetime          datetime64[ns]
ApplSeqNum                 int32
order_side                 int32
order_type                 int32
order_price              float64
order_qty                  int32
dtype: object
20200316
order finished
0:17:38.436389


Unnamed: 0,skey,BidApplSeqNum,OfferApplSeqNum,union,ApplSeqNum,less


In [5]:
# 2017-2019 version
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()

import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime

columns1 = ["Date","OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID","PreClosePx",
                   "PxChnage1","PXChange2","openPrice","HighPx","LowPx","close","NumTrades","cum_volume","cum_amount","PE1","PE2","TradingPhase",
                   "totalofferqty", "wa_offerPrice", "totalbidqty", "wa_bidPrice", "PreNAV", "RealTimeNAV", "WarrantPremiumRate", "UpLimitPx",
                   "DownLimitPx", "TotalLongPosition", "unknown1", "unknown2", "unknown3"]
columns2 = ['Date',"OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID",'ask1p','bid1p',
                   "ask1q","bid1q", 'ask2p','bid2p',"ask2q","bid2q",'ask3p','bid3p',"ask3q","bid3q",'ask4p','bid4p',"ask4q","bid4q",'ask5p',
                    'bid5p',"ask5q","bid5q",'ask6p','bid6p',"ask6q","bid6q",'ask7p','bid7p',"ask7q","bid7q",'ask8p','bid8p',"ask8q","bid8q",
                   'ask9p','bid9p',"ask9q","bid9q",'ask10p','bid10p',"ask10q","bid10q","NUMORDERS_B1","NOORDERS_B1","ORDERQTY_B1",
                    "NUMORDERS_S1","NOORDERS_S1","ORDERQTY_S1"]
columns3 =  ["Date","OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID","PreClosePx",
                   "PxChnage1","PXChange2","openPrice","HighPx","LowPx","close","NumTrades","cum_volume","cum_amount","PE1","PE2","TradingPhase",
                   "totalofferqty", "wa_offerPrice", "totalbidqty", "wa_bidPrice", "PreNAV", "RealTimeNAV", "WarrantPremiumRate", "UpLimitPx",
                   "DownLimitPx", "TotalLongPosition"]

# startTm = datetime.datetime.now()
# readPath = r'\\192.168.10.30\Kevin_zhenyu\day_stock\***'
# dataPathLs = np.array(glob.glob(readPath))
# dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SZ' for i in dataPathLs])]]
# db = pd.DataFrame()
# for p in dataPathLs:
#     dayData = pd.read_csv(p, compression='gzip')
#     db = pd.concat([db, dayData])
# print(datetime.datetime.now() - startTm)

startTm = datetime.datetime.now()
year = "2017"
startDate = "0804"
endDate = "0804"
df = []
bad = []
readPath = 'J:\\LEVEL2_shenzhen\\' + year + '\\***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

for data in dataPathLs:
    if len(np.array(glob.glob(data +'\\***'))) == 0:
        continue
    
    if len(np.array(glob.glob(data +'\\pm_hq_order_spot.7z'))) == 1:
        date = os.path.basename(data)
        path = 'L:\\backup_data\\' + year 
        os.chdir(data)
        try:
            a = py7zr.SevenZipFile(data + '\\am_hq_order_spot.7z','r',filters=None)
        except:
            print("Bad unzip here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(data + '\\am_hq_order_spot.7z')
            bad.append(data + '\\am_hq_order_spot.7z')
            continue
        path1 = path + '\\' + date
        a.extractall(path = path1)
        a.close()
        try:
            a = py7zr.SevenZipFile(data + '\\pm_hq_order_spot.7z','r',filters=None)
        except:
            print("Bad unzip here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(data + '\\pm_hq_order_spot.7z')
            bad.append(data + '\\pm_hq_order_spot.7z')
            continue
        a.extractall(path = path1)
        a.close()
        
        am_order = pd.read_table(path1 + '\\am_hq_order_spot.txt',header=None)
        am_order.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "order_price",
                   "order_qty","TransactTime","order_side","order_type","ConfirmID","Contactor","ContactInfo","ExpirationDays","ExpirationType"]
        pm_order = pd.read_table(path1 + '\\pm_hq_order_spot.txt',header=None)
        pm_order.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "order_price",
                   "order_qty","TransactTime","order_side","order_type","ConfirmID","Contactor","ContactInfo","ExpirationDays","ExpirationType"]
        OrderLog1 = pd.concat([am_order, pm_order])
        del am_order
        del pm_order
  
    
    elif len(np.array(glob.glob(data +'\\pm_hq_order_spot.7z.001'))) == 1:
        date = os.path.basename(data)
        path = 'L:\\backup_data\\' + year 
        os.chdir(data)
        os.system("copy /b am_hq_order_spot.7z.* am_hq_order_spot.7z")
        try:
            a = py7zr.SevenZipFile(data + '\\am_hq_order_spot.7z','r',filters=None)
        except:
            print("Bad unzip here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(data + '\\am_hq_order_spot.7z')
            bad.append(data + '\\am_hq_order_spot.7z')
            continue
        path1 = path + '\\' + date
        a.extractall(path = path1)
        a.close()
        os.system("copy /b pm_hq_order_spot.7z.* pm_hq_order_spot.7z")
        try:
            a = py7zr.SevenZipFile(data + '\\pm_hq_order_spot.7z','r',filters=None)
        except:
            print("Bad unzip here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(data + '\\pm_hq_order_spot.7z')
            bad.append(data + '\\pm_hq_order_spot.7z')
            continue
        a.extractall(path = path1)
        a.close()
        
        am_order = pd.read_table(path1 + '\\am_hq_order_spot.txt',header=None)
        am_order.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "order_price",
                   "order_qty","TransactTime","order_side","order_type","ConfirmID","Contactor","ContactInfo","ExpirationDays","ExpirationType"]
        pm_order = pd.read_table(path1 + '\\pm_hq_order_spot.txt',header=None)
        pm_order.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "order_price",
                   "order_qty","TransactTime","order_side","order_type","ConfirmID","Contactor","ContactInfo","ExpirationDays","ExpirationType"]
        OrderLog1 = pd.concat([am_order, pm_order])
        del am_order
        del pm_order

    elif len(np.array(glob.glob(data +'\\hq_order.7z'))) == 1:
        date = os.path.basename(data)
        path = 'L:\\backup_data\\' + year 
        os.chdir(data)
        try:
            a = py7zr.SevenZipFile(data + '\\hq_order.7z','r',filters=None)
        except:
            print("Bad unzip here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(data + '\\hq_order.7z')
            bad.append(data + '\\hq_order.7z')
            continue
        path1 = path + '\\' + date
        a.extractall(path = path1)
        a.close()
        OrderLog1 = pd.read_table(path1 + '\\hq_order.txt',header=None)
        OrderLog1.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "order_price",
                   "order_qty","TransactTime","order_side","order_type","ConfirmID","Contactor","ContactInfo","ExpirationDays","ExpirationType"]
    
    
    OrderLog1 = OrderLog1[(OrderLog1["SecurityID"] < 4000) | (OrderLog1["SecurityID"] > 300000)]
    OrderLog1["skey"] = OrderLog1["SecurityID"] + 2000000
    OrderLog1["clockAtArrival"] = OrderLog1["TransactTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    OrderLog1['datetime'] = OrderLog1["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    OrderLog1["time"] = (OrderLog1['TransactTime'] - int(OrderLog1['TransactTime'].iloc[0]//1000000000*1000000000)).astype(np.int64)*1000
    OrderLog1["order_type"] =np.where(OrderLog1["order_type"] == 'U', 3, OrderLog1["order_type"])
    for col in ["skey", "date", "ApplSeqNum", "order_qty", "order_side", "order_type"]:
        OrderLog1[col] = OrderLog1[col].astype('int32')
    display(OrderLog1["order_price"].astype(str).apply(lambda x: len(x.split('.')[1])).unique())
    
    assert(OrderLog1[((OrderLog1["order_side"] != 1) & (OrderLog1["order_side"] != 2)) | (OrderLog1["order_type"].isnull())].shape[0] == 0)
    da_te = str(OrderLog1["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    sl = (db1["ID"].str[2:].astype(int) + 2000000).unique()
    del db1
    try:
        assert(len(set(sl) - set(OrderLog1["skey"].unique())) == 0)
    except:
        print(set(sl) - set(OrderLog1["skey"].unique()))
    
    OrderLog1 = OrderLog1[["skey", "date", "time", "clockAtArrival", "datetime", "ApplSeqNum", "order_side", "order_type", "order_price",
                                                 "order_qty"]]
    
    print(OrderLog1.dtypes)
    print(OrderLog1["date"].iloc[0])
    print("order finished")
    
    print(datetime.datetime.now() - startTm)

    
startDate = 20170804
endDate = 20170804
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"
db1 = DB("192.168.10.178", database_name, user, password)
trade = db1.read('md_trade', start_date=startDate, end_date=endDate)
trade = trade[trade['skey'] > 2000000]
t1 = trade.groupby('skey')['BidApplSeqNum'].unique().reset_index()
t2 = trade.groupby('skey')['OfferApplSeqNum'].unique().reset_index()
t3 = OrderLog1.groupby('skey')['ApplSeqNum'].unique().reset_index()
t = pd.merge(t1, t2, on='skey')
t['union'] = [list(set(a) | set(b)) for a, b in zip(t.BidApplSeqNum, t.OfferApplSeqNum)]
t = pd.merge(t, t3, on='skey')
t['less'] = [len(set(a) - set(b)) for a, b in zip(t.union, t.ApplSeqNum)]
t['less1'] = [list(set(a) - set(b))[0] for a, b in zip(t.union, t.ApplSeqNum)]
display(t['less1'].unique())
t[t['less'] > 1]

array([2, 1], dtype=int64)

{2001872, 2001914}
skey                       int32
date                       int32
time                       int64
clockAtArrival             int64
datetime          datetime64[ns]
ApplSeqNum                 int32
order_side                 int32
order_type                 int32
order_price              float64
order_qty                  int32
dtype: object
20170804
order finished
0:13:04.956434


array([0], dtype=int64)

Unnamed: 0,skey,BidApplSeqNum,OfferApplSeqNum,union,ApplSeqNum,less,less1


In [6]:
OrderLog1.head(5)

Unnamed: 0,skey,date,time,clockAtArrival,datetime,ApplSeqNum,order_side,order_type,order_price,order_qty
0,2002889,20170804,91500000000,1501809300000000,2017-08-04 09:15:00,1,1,2,27.27,94700
1,2002889,20170804,91500000000,1501809300000000,2017-08-04 09:15:00,2,1,2,27.27,36700
2,2002889,20170804,91500000000,1501809300000000,2017-08-04 09:15:00,3,1,2,27.27,33700
3,2002889,20170804,91500000000,1501809300000000,2017-08-04 09:15:00,4,1,2,27.27,15500
4,2002890,20170804,91500000000,1501809300000000,2017-08-04 09:15:00,5,1,2,22.23,34700


In [7]:
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"

db1 = DB("192.168.10.178", database_name, user, password)
db1.write('md_order', OrderLog1)

In [7]:
import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime

columns1 = ["Date","OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID","PreClosePx",
                   "PxChnage1","PXChange2","openPrice","HighPx","LowPx","close","NumTrades","cum_volume","cum_amount","PE1","PE2","TradingPhase",
                   "totalofferqty", "wa_offerPrice", "totalbidqty", "wa_bidPrice", "PreNAV", "RealTimeNAV", "WarrantPremiumRate", "UpLimitPx",
                   "DownLimitPx", "TotalLongPosition", "unknown1", "unknown2", "unknown3"]
columns2 = ['Date',"OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID",'ask1p','bid1p',
                   "ask1q","bid1q", 'ask2p','bid2p',"ask2q","bid2q",'ask3p','bid3p',"ask3q","bid3q",'ask4p','bid4p',"ask4q","bid4q",'ask5p',
                    'bid5p',"ask5q","bid5q",'ask6p','bid6p',"ask6q","bid6q",'ask7p','bid7p',"ask7q","bid7q",'ask8p','bid8p',"ask8q","bid8q",
                   'ask9p','bid9p',"ask9q","bid9q",'ask10p','bid10p',"ask10q","bid10q","NUMORDERS_B1","NOORDERS_B1","ORDERQTY_B1",
                    "NUMORDERS_S1","NOORDERS_S1","ORDERQTY_S1"]
columns3 =  ["Date","OrigTime","SendTime","ercvtime","dbtime","ChannelNo","SecurityID","SecurityIDsource", "MDStreamID","PreClosePx",
                   "PxChnage1","PXChange2","openPrice","HighPx","LowPx","close","NumTrades","cum_volume","cum_amount","PE1","PE2","TradingPhase",
                   "totalofferqty", "wa_offerPrice", "totalbidqty", "wa_bidPrice", "PreNAV", "RealTimeNAV", "WarrantPremiumRate", "UpLimitPx",
                   "DownLimitPx", "TotalLongPosition"]

# startTm = datetime.datetime.now()
# readPath = r'\\192.168.10.30\Kevin_zhenyu\day_stock\***'
# dataPathLs = np.array(glob.glob(readPath))
# dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SZ' for i in dataPathLs])]]
# db = pd.DataFrame()
# for p in dataPathLs:
#     dayData = pd.read_csv(p, compression='gzip')
#     db = pd.concat([db, dayData])
# print(datetime.datetime.now() - startTm)

startTm = datetime.datetime.now()
year = "2017"
startDate = "0804"
endDate = "0804"
df = []
bad = []
readPath = 'J:\\LEVEL2_shenzhen\\' + year + '\\***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

for data in dataPathLs:
    if len(np.array(glob.glob(data +'\\***'))) == 0:
        continue
    
    if len(np.array(glob.glob(data +'\\am_hq_trade_spot.7z'))) == 1:
        date = os.path.basename(data)
        path = 'L:\\backup data\\' + year 
        os.chdir(data)
        try:
            a = py7zr.SevenZipFile(data + '\\am_hq_trade_spot.7z','r',filters=None)
        except:
            print("Bad unzip here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(data + '\\am_hq_trade_spot.7z')
            bad.append(data + '\\am_hq_trade_spot.7z')
            continue
        path1 = path + '\\' + date
        a.extractall(path = path1)
        a.close()
        try:
            a = py7zr.SevenZipFile(data + '\\pm_hq_trade_spot.7z','r',filters=None)
        except:
            print("Bad unzip here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(data + '\\pm_hq_trade_spot.7z')
            bad.append(data + '\\pm_hq_trade_spot.7z')
            continue
        a.extractall(path = path1)
        a.close()

        am_trade = pd.read_table(path1 + "\\am_hq_trade_spot.txt",header=None)
        am_trade.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "BidApplSeqNum",
                   "OfferApplSeqNum","trade_price","trade_qty","trade_type","TransactTime"]
        pm_trade = pd.read_table(path1 + "\\pm_hq_trade_spot.txt",header=None)
        pm_trade.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "BidApplSeqNum",
                   "OfferApplSeqNum","trade_price","trade_qty","trade_type","TransactTime"]
        TradeLogSZ1 = pd.concat([am_trade, pm_trade])
        del am_trade
        del pm_trade
        
    elif len(np.array(glob.glob(data +'\\am_hq_trade_spot.7z.001'))) == 1:
        date = os.path.basename(data)
        path = 'L:\\backup data\\' + year 
        os.chdir(data)
        os.system("copy /b am_hq_trade_spot.7z.* am_hq_trade_spot.7z")
        try:
            a = py7zr.SevenZipFile(data + '\\am_hq_trade_spot.7z','r',filters=None)
        except:
            print("Bad unzip here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(data + '\\am_hq_trade_spot.7z')
            bad.append(data + '\\am_hq_trade_spot.7z')
            continue
        path1 = path + '\\' + date
        a.extractall(path = path1)
        a.close()
        os.system("copy /b pm_hq_trade_spot.7z.* pm_hq_trade_spot.7z")
        try:
            a = py7zr.SevenZipFile(data + '\\pm_hq_trade_spot.7z','r',filters=None)
        except:
            print("Bad unzip here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(data + '\\pm_hq_trade_spot.7z')
            bad.append(data + '\\pm_hq_trade_spot.7z')
            continue
        a.extractall(path = path1)
        a.close()
        
        am_trade = pd.read_table(path1 + "\\am_hq_trade_spot.txt",header=None)
        am_trade.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "BidApplSeqNum",
                   "OfferApplSeqNum","trade_price","trade_qty","trade_type","TransactTime"]
        pm_trade = pd.read_table(path1 + "\\pm_hq_trade_spot.txt",header=None)
        pm_trade.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "BidApplSeqNum",
                   "OfferApplSeqNum","trade_price","trade_qty","trade_type","TransactTime"]
        TradeLogSZ1 = pd.concat([am_trade, pm_trade])
        del am_trade
        del pm_trade
        
    elif len(np.array(glob.glob(data +'\\hq_trade.7z'))) == 1:
        date = os.path.basename(data)
        path = 'L:\\backup data\\' + year 
        os.chdir(data)
        try:
            a = py7zr.SevenZipFile(data + '\\hq_trade.7z','r',filters=None)
        except:
            print("Bad unzip here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(data + '\\hq_trade.7z')
            bad.append(data + '\\hq_trade.7z')
            continue
        path1 = path + '\\' + date
        a.extractall(path = path1)
        a.close()
        
        TradeLogSZ1 = pd.read_table(path1 + "\\hq_trade.txt",header=None)
        TradeLogSZ1.columns = ["date","OrigTime","SendTime","recvtime","dbtime","ChannelNo","MDStreamID","ApplSeqNum", "SecurityID","SecurityIDSource", "BidApplSeqNum",
                   "OfferApplSeqNum","trade_price","trade_qty","trade_type","TransactTime"]

    
    TradeLogSZ1 = TradeLogSZ1[(TradeLogSZ1["SecurityID"] < 4000) | (TradeLogSZ1["SecurityID"] > 300000)]
    TradeLogSZ1["trade_money"] = TradeLogSZ1["trade_price"] * TradeLogSZ1["trade_qty"]
    TradeLogSZ1["trade_flag"] = 0
    TradeLogSZ1["skey"] = TradeLogSZ1["SecurityID"] + 2000000
    TradeLogSZ1["clockAtArrival"] = TradeLogSZ1["TransactTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    TradeLogSZ1['datetime'] = TradeLogSZ1["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    TradeLogSZ1["time"] = (TradeLogSZ1['TransactTime'] - int(TradeLogSZ1['TransactTime'].iloc[0]//1000000000*1000000000)).astype(np.int64)*1000
    TradeLogSZ1["trade_type"] = np.where(TradeLogSZ1["trade_type"] == 'F', 1, TradeLogSZ1["trade_type"])
    for col in ["skey", "date", "ApplSeqNum", "BidApplSeqNum", "OfferApplSeqNum", "trade_qty", "trade_type", "trade_flag"]:
        TradeLogSZ1[col] = TradeLogSZ1[col].astype('int32')
    for cols in ["trade_money"]:
        TradeLogSZ1[cols] = TradeLogSZ1[cols].round(2)
    display(TradeLogSZ1["trade_price"].astype(str).apply(lambda x: len(x.split('.')[1])).unique())
    
    da_te = str(TradeLogSZ1["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    sl = (db1["ID"].str[2:].astype(int) + 2000000).unique()
    db1["max_volume"] = db1.groupby("ID")["d_volume"].transform("max")
    db1["max_amount"] = db1.groupby("ID")["d_amount"].transform("max")
    t1 = db1.groupby("ID")["max_volume", "max_amount"].first().reset_index()
    del db1
    t1["skey"] = t1["ID"].str[2:].astype(int) + 2000000
    trade1 = TradeLogSZ1[TradeLogSZ1["trade_type"] == 1].groupby("skey")["trade_qty"].sum().reset_index()
    trade1.columns=["skey", "cum_volume"]
    trade2 = TradeLogSZ1[TradeLogSZ1["trade_type"] == 1].groupby("skey")["trade_money"].sum().reset_index()
    trade2.columns=["skey", "cum_amount"]
    t2 = pd.merge(trade1, trade2, on="skey")
    re = pd.merge(t1, t2, on="skey", how="outer")
    try:
        assert(t1.shape[0] == t2.shape[0])
        assert(re[re["cum_volume"] != re["max_volume"]].shape[0] == 0)
        assert(re[re["cum_amount"].round(2) != re["max_amount"]].shape[0] == 0)
    except:
        display(set(t1["skey"]) - set(t2["skey"]))
        display(re[re["cum_volume"] != re["max_volume"]])
        display(re[re["cum_amount"].round(2) != re["max_amount"]])
    del t1
    del t2
    del re
 
    TradeLogSZ1 = TradeLogSZ1[["skey", "date", "time", "clockAtArrival", "datetime", "ApplSeqNum", "trade_type", "trade_flag",
                                                 "trade_price", "trade_qty", "BidApplSeqNum", "OfferApplSeqNum"]]
    print(da_te)
    print("trade finished")

    print(datetime.datetime.now() - startTm)

array([1, 2], dtype=int64)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


{2001872, 2001914}

Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
417,SZ001872,13885068.0,410572800.0,2001872,,
419,SZ001914,4722353.0,49910920.0,2001914,,


Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
417,SZ001872,13885068.0,410572800.0,2001872,,
419,SZ001914,4722353.0,49910920.0,2001914,,


2017-08-04
trade finished
0:12:30.713481


In [9]:
TradeLogSZ1.dtypes

skey                        int32
date                        int32
time                        int64
clockAtArrival              int64
datetime           datetime64[ns]
ApplSeqNum                  int32
trade_type                  int32
trade_flag                  int32
trade_price               float64
trade_qty                   int32
BidApplSeqNum               int32
OfferApplSeqNum             int32
dtype: object

In [11]:
# database_name = 'com_md_eq_cn'
# user = "zhenyuy"
# password = "bnONBrzSMGoE"

# db1 = DB("192.168.10.178", database_name, user, password)
# db1.write('md_trade', TradeLogSZ1)