In [8]:
import os
import glob
import pymongo
import numpy as np
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz

class DB(object):
    def __init__(self, uri, symbol_column='ID', clock_column='clockAtArrival'):
        self.db_name = 'white_db'
        user, passwd, host = self.parse_uri(uri)
        auth_db = 'admin' if user in ('admin', 'root') else self.db_name
        self.uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)

        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.clock_column = clock_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def write(self, table_name, df):
        if len(df) == 0: return

        if self.clock_column in df.columns:
            date = datetime.datetime.fromtimestamp(df.head(1)[self.clock_column].iloc[0] / 1e6, pytz.timezone('Asia/Shanghai')).strftime('%Y%m%d')
        elif self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
        else:
            raise Exception('DataFrame should contain either one of columns: `%s`, `%s`' % (self.clock_column, self.date_column))

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        for symbol, sub_df in df.groupby([self.symbol_column]):
            collection.delete_many({'date': date, 'symbol': symbol})
            self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date and end_date:
            query['date'] = {'$gte': parse_date(start_date), '$lte': parse_date(end_date)}

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat(x['data'] for x in segs) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        if version == 1:
            return gzip.compress(pickle.dumps(s), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')

#### upload data

#### 1. Upload SH 2018 data

In [1]:
import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime
import time
pd.set_option("max_columns", 200)



startTm = datetime.datetime.now()
    
colnames=['StockID', 'DateTime', 'prevClose', 'open','HighPx','LowPx','close','cum_volume','cum_amount','InstrumentStatus',
         'bid1p','bid2p','bid3p','bid4p','bid5p','bid6p','bid7p','bid8p','bid9p','bid10p','bid1q','bid2q','bid3q','bid4q','bid5q','bid6q','bid7q',
         'bid8q','bid9q','bid10q','b1','b2','b3','b4','b5','b6','b7','b8','b9','b10','b11','b12','b13','b14','b15','b16','b17','b18','b19','b20',
          'b21','b22','b23','b24','b25','b26','b27','b28','b29','b30','b31','b32','b33','b34','b35','b36','b37','b38','b39','b40','b41','b42','b43',
          'b44','b45','b46','b47','b48','b49','b50','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p','ask1q','ask2q',
          'ask3q','ask4q','ask5q','ask6q','ask7q','ask8q','ask9q','ask10q','a1','a2','a3','a4','a5','a6','a7','a8','a9','a10','a11','a12','a13','a14',
         'a15','a16','a17','a18','a19','a20','a21','a22','a23','a24','a25','a26','a27','a28','a29','a30','a31','a32','a33','a34','a35','a36','a37','a38',
         'a39','a40','a41','a42','a43','a44','a45','a46','a47','a48','a49','a50','NumTrades','IOPV','TotalBidQty','TotalOfferQty','WeightedAvgBidPx',
         'WeightedAvgOfferPx','TotalBidNumber','TotalOfferNumber','BidTradeMaxDuration','OfferTradeMaxDuration','NumBidOrders','NumOfferOrders','WithdrawBuyNumber',
         'WithdrawBuyAmount','WithdrawBuyMoney','WithdrawSellNumber','WithdrawSellAmount','WithdrawSellMoney','ETFBuyNumber','ETFBuyAmount','ETFBuyMoney',
         'ETFSellNumber','ETFSellAmount','ETFSellMoney'] 
colnames1=['StockID', 'DateTime', 'prevClose', 'open','HighPx','LowPx','close','cum_volume','cum_amount','InstrumentStatus',
         'bid1p','bid2p','bid3p','bid4p','bid5p','bid6p','bid7p','bid8p','bid9p','bid10p','bid1q','bid2q','bid3q','bid4q','bid5q','bid6q','bid7q',
         'bid8q','bid9q','bid10q','bid1n', 'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n', 'bid9n', 'bid10n',
           'bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p','ask1q','ask2q',
          'ask3q','ask4q','ask5q','ask6q','ask7q','ask8q','ask9q','ask10q','ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n',
           'ask7n', 'ask8n', 'ask9n', 'ask10n','ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q','NumTrades','IOPV','TotalBidQty','TotalOfferQty','WeightedAvgBidPx',
         'WeightedAvgOfferPx','TotalBidNumber','TotalOfferNumber','BidTradeMaxDuration','OfferTradeMaxDuration','NumBidOrders','NumOfferOrders','WithdrawBuyNumber',
         'WithdrawBuyAmount','WithdrawBuyMoney','WithdrawSellNumber','WithdrawSellAmount','WithdrawSellMoney','ETFBuyNumber','ETFBuyAmount','ETFBuyMoney',
         'ETFSellNumber','ETFSellAmount','ETFSellMoney'] 
colnames2=['SecurityID', 'DateTime', 'Price', 'Qty', 'LeaveQty', "Side"]

year = "2018"
df = []
bad = []

readPath = 'G:\\' + year + '\\***' 
dataPathLs = np.array(glob.glob(readPath))

for data in dataPathLs:
    try1 = pd.read_csv(data +'\\Auction.csv', nrows=1, header=None)
    assert(try1.shape[1] == 2)

AssertionError: 

In [40]:
import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime
import time
pd.set_option("max_columns", 200)



startTm = datetime.datetime.now()
    
colnames=['StockID', 'DateTime', 'prevClose', 'open','HighPx','LowPx','close','cum_volume','cum_amount','InstrumentStatus',
         'bid1p','bid2p','bid3p','bid4p','bid5p','bid6p','bid7p','bid8p','bid9p','bid10p','bid1q','bid2q','bid3q','bid4q','bid5q','bid6q','bid7q',
         'bid8q','bid9q','bid10q','b1','b2','b3','b4','b5','b6','b7','b8','b9','b10','b11','b12','b13','b14','b15','b16','b17','b18','b19','b20',
          'b21','b22','b23','b24','b25','b26','b27','b28','b29','b30','b31','b32','b33','b34','b35','b36','b37','b38','b39','b40','b41','b42','b43',
          'b44','b45','b46','b47','b48','b49','b50','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p','ask1q','ask2q',
          'ask3q','ask4q','ask5q','ask6q','ask7q','ask8q','ask9q','ask10q','a1','a2','a3','a4','a5','a6','a7','a8','a9','a10','a11','a12','a13','a14',
         'a15','a16','a17','a18','a19','a20','a21','a22','a23','a24','a25','a26','a27','a28','a29','a30','a31','a32','a33','a34','a35','a36','a37','a38',
         'a39','a40','a41','a42','a43','a44','a45','a46','a47','a48','a49','a50','NumTrades','IOPV','TotalBidQty','TotalOfferQty','WeightedAvgBidPx',
         'WeightedAvgOfferPx','TotalBidNumber','TotalOfferNumber','BidTradeMaxDuration','OfferTradeMaxDuration','NumBidOrders','NumOfferOrders','WithdrawBuyNumber',
         'WithdrawBuyAmount','WithdrawBuyMoney','WithdrawSellNumber','WithdrawSellAmount','WithdrawSellMoney','ETFBuyNumber','ETFBuyAmount','ETFBuyMoney',
         'ETFSellNumber','ETFSellAmount','ETFSellMoney'] 
colnames1=['StockID', 'DateTime', 'prevClose', 'open','HighPx','LowPx','close','cum_volume','cum_amount','InstrumentStatus',
         'bid1p','bid2p','bid3p','bid4p','bid5p','bid6p','bid7p','bid8p','bid9p','bid10p','bid1q','bid2q','bid3q','bid4q','bid5q','bid6q','bid7q',
         'bid8q','bid9q','bid10q','bid1n', 'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n', 'bid9n', 'bid10n',
           'bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p','ask1q','ask2q',
          'ask3q','ask4q','ask5q','ask6q','ask7q','ask8q','ask9q','ask10q','ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n',
           'ask7n', 'ask8n', 'ask9n', 'ask10n','ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q','NumTrades','IOPV','TotalBidQty','TotalOfferQty','WeightedAvgBidPx',
         'WeightedAvgOfferPx','TotalBidNumber','TotalOfferNumber','BidTradeMaxDuration','OfferTradeMaxDuration','NumBidOrders','NumOfferOrders','WithdrawBuyNumber',
         'WithdrawBuyAmount','WithdrawBuyMoney','WithdrawSellNumber','WithdrawSellAmount','WithdrawSellMoney','ETFBuyNumber','ETFBuyAmount','ETFBuyMoney',
         'ETFSellNumber','ETFSellAmount','ETFSellMoney'] 
colnames2=['SecurityID', 'DateTime', 'Price', 'Qty', 'LeaveQty', "Side"]

year = "2017"
df = []
bad = []

readPath = 'G:\\' + year + '\\***' 
dataPathLs = np.array(glob.glob(readPath))

for data in dataPathLs:
    startTm = datetime.datetime.now()
    path = np.array(glob.glob(data +'\\***'))[[os.path.basename(i).split('.')[0] == 'Snapshot' \
                                                               for i in np.array(glob.glob(data +'\\***'))]][0]
    
    if len(np.array(glob.glob(data +'\\***'))) == 0:
        continue

    else:
        try1 = pd.read_csv(path, nrows=1, header=None)
        if try1.shape[1] == 2:
            print("native lv2 data")
            snapshot = pd.read_csv(path, header=None, names=colnames1).iloc[1:, :]
            snapshot["DateTime"] = snapshot["DateTime"].apply(lambda x: int(x))
            snapshot['time'] = ((snapshot["DateTime"] - int(snapshot["DateTime"].iloc[0]//1000000 * 1000000))*1000000).astype(np.int64)
        else:
            snapshot = pd.read_csv(path, header=None)
            assert(snapshot.shape[1] == len(colnames))
            print("XTP lv2 data")
            snapshot.columns = colnames
            snapshot["DateTime"] = snapshot["DateTime"].apply(lambda x: int(x))
            snapshot['time'] = ((snapshot["DateTime"] - int(snapshot["DateTime"].iloc[0]//1000000 * 1000000))*1000000).astype(np.int64)           
        print("csv is loaded")
        
        snapshot["StockID"] = snapshot["StockID"].astype(int)
        snapshot = snapshot[(snapshot["StockID"] >= 600000) & (snapshot["StockID"] < 700000)].reset_index(drop=True)
        snapshot = snapshot.rename(columns={"StockID": "ID"})
        snapshot["date"] = int(snapshot["DateTime"].iloc[0]//1000000)
        snapshot["clockAtArrival"] = snapshot["DateTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
        # snapshot["clockAtArrival"] = pd.to_datetime(snapshot["DateTime"].astype(str), format = '%Y%m%d%H%M%S').apply(lambda x: int(x.timestamp()*1e6))
        snapshot['ID'] = 1000000 + snapshot["ID"]
        snapshot = snapshot.rename(columns={"NumTrades": "cum_tradesCnt", "HighPx":"high", "LowPx":"low", "TotalBidQty":"totalBidQuantity",
                                   "TotalOfferQty":"totalAskQuantity", "WeightedAvgBidPx":"vwapBid", "WeightedAvgOfferPx":"vwapAsk",
                                   "TotalBidNumber":"totalBidOrders", "TotalOfferNumber":"totalAskOrders", "NumBidOrders":"totalBidLevels",
                                   "NumOfferOrders":"totalAskLevels", "WithdrawBuyNumber":"cum_canceledBuyOrders",
                                   "WithdrawBuyAmount":"cum_canceledBuyVolume", "WithdrawBuyMoney":"cum_canceledBuyAmount",
                                   "WithdrawSellNumber":"cum_canceledSellOrders", "WithdrawSellAmount":"cum_canceledSellVolume",
                                   "WithdrawSellMoney":"cum_canceledSellAmount", "BidTradeMaxDuration":"bidTradeMaxDuration",
                                    "OfferTradeMaxDuration":"askTradeMaxDuration"})
        
        try1 = pd.read_csv(data +'\\Auction.csv', nrows=1, header=None)
        if try1.shape[1] == 2:
            print("native auction data")
            aucData = pd.read_csv(data +'\\Auction.csv', header=None, names=colnames2).iloc[1:, :]
        else:
            aucData = pd.read_csv(data +'\\Auction.csv', header=None, names=colnames2)
        aucData['bid1p'] = aucData['Price']
        aucData['ask1p'] = aucData['Price']
        aucData['bid1q'] = aucData['Qty']
        aucData['ask1q'] = aucData['Qty']
        aucData['bid2q'] = np.where(aucData['Side'] == 1, aucData['LeaveQty'], 0)
        aucData['ask2q'] = np.where(aucData['Side'] == 2, aucData['LeaveQty'], 0)
        aucData["SecurityID"] = aucData["SecurityID"].astype(int)
        aucData = aucData[(aucData["SecurityID"] >= 600000) & (aucData["SecurityID"] < 700000)].reset_index(drop=True)
        aucData["ID"] = aucData["SecurityID"] + 1000000
        aucData["DateTime"] = aucData["DateTime"].apply(lambda x: int(x))
        aucData["date"] = int(aucData["DateTime"].iloc[0]//1000000)
        m_in = 91500 + int(aucData["DateTime"].iloc[0]//1000000 * 1000000)
        m_ax = 92459 + int(aucData["DateTime"].iloc[0]//1000000 * 1000000)
        aucData["DateTime"] = aucData['DateTime'].clip(m_in, m_ax)
        aucData['time'] = ((aucData["DateTime"] - int(aucData["DateTime"].iloc[0]//1000000 * 1000000))*1000000).astype(np.int64)
        aucData["clockAtArrival"] = aucData["DateTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
        aucData = aucData[['date', 'time', 'clockAtArrival', 'ID', 'bid1p', 'ask1p', 'bid2q', 'bid1q', 'ask1q', 'ask2q']].reset_index(drop=True)
        
        test1 = aucData.groupby("ID")["time"].max().reset_index()
        test2 = snapshot.groupby("ID")["time"].min().reset_index()
        test = pd.merge(test1, test2, on="ID")
        assert(test[test["time_x"] >= test["time_y"]].shape[0]==0)
        
        snapshot = pd.concat([aucData, snapshot]).reset_index(drop=True)
        snapshot = snapshot.fillna(0)
        
        snapshot["ordering"] = snapshot.groupby("ID").cumcount()
        snapshot["ordering"] = snapshot["ordering"] + 1
        
        for col in ["ID", "cum_tradesCnt", "cum_volume", 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q','ask7q','ask8q','ask9q',
                        'ask10q', 'bid1q','bid2q','bid3q','bid4q','bid5q','bid6q','bid7q','bid8q','bid9q','bid10q', 'bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q', "totalBidQuantity", "totalAskQuantity", "totalBidOrders",
        'totalAskOrders', 'totalBidLevels', 'totalAskLevels', 'cum_canceledBuyOrders', 'cum_canceledBuyVolume', 'cum_canceledSellOrders',
        'cum_canceledSellVolume', "ordering", 'bid1n', 'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n', 'bid9n', 'bid10n', 'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n',
        'ask7n', 'ask8n', 'ask9n', 'ask10n', 'bidTradeMaxDuration', 'askTradeMaxDuration']:
            snapshot[col] = snapshot[col].astype(np.int64)

        
        for cols in ["prevClose", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
             'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'vwapBid', "vwapAsk"]:
            snapshot[cols] = (snapshot[cols] * 10000).round(0).astype(np.int64)
        
        for cols in ["cum_amount", "cum_canceledBuyAmount", "cum_canceledSellAmount"]:
            snapshot[cols] = (snapshot[cols] * 10000).round(0).astype(np.int64)
        
        
        display(snapshot["date"].iloc[0])
        display(snapshot.groupby("ID")["time"].min().max())
        
        snapshot['datetime'] = snapshot["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
        
        snapshot["prevClose"] = np.where(snapshot["time"] >= 91500000000, snapshot.groupby("ID")["prevClose"].transform("max"), snapshot["prevClose"]) 
        snapshot["open"] = np.where(snapshot["cum_volume"] > 0, snapshot.groupby("ID")["open"].transform("max"), snapshot["open"])
        assert(sum(snapshot[snapshot["open"] != 0].groupby("ID")["open"].nunique() != 1) == 0)
        assert(sum(snapshot[snapshot["prevClose"] != 0].groupby("ID")["prevClose"].nunique() != 1) == 0)
        assert(snapshot[snapshot["cum_volume"] > 0]["open"].min() > 0)
        
        snapshot = snapshot[["date", "time", "clockAtArrival", "datetime", "ID", "ordering", "cum_tradesCnt", "cum_volume", "cum_amount", "prevClose",
                            "open", "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p','bid2p','bid1p',
                            'ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'bid10q','bid9q','bid8q',
                             'bid7q','bid6q','bid5q','bid4q','bid3q','bid2q','bid1q', 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q',
                             'ask7q','ask8q','ask9q','ask10q', 'bid10n', 'bid9n', 'bid8n', 'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 
                             'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n','ask7n', 'ask8n', 'ask9n', 'ask10n','bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q',"totalBidQuantity", "totalAskQuantity","vwapBid", "vwapAsk",
        "totalBidOrders",'totalAskOrders','totalBidLevels', 'totalAskLevels', 'bidTradeMaxDuration', 'askTradeMaxDuration', 'cum_canceledBuyOrders', 'cum_canceledBuyVolume',
        "cum_canceledBuyAmount", "cum_canceledSellOrders", 'cum_canceledSellVolume',"cum_canceledSellAmount"]]
        
        
        db = DB("mongodb://user_rw:faa96dfc@192.168.10.223")
        db.write('snapshot', snapshot)
        
        print(datetime.datetime.now() - startTm)

native lv2 data
csv is loaded


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




20181224

92541000000

0:06:17.973817
native lv2 data
csv is loaded


20181225

92536000000

0:06:38.910107
native lv2 data
csv is loaded


20181226

92552000000

0:05:49.608443
native lv2 data
csv is loaded


20181227

92549000000

0:06:17.043008
native lv2 data
csv is loaded


20181228

92535000000

0:06:01.347330


In [30]:
startTm = datetime.datetime.now()
snapshot['datetime'] = snapshot["date"].astype(str) + snapshot["time"].astype(str).apply(lambda x: x.rjust(12, '0'))
snapshot["datetime"] = snapshot["datetime"].apply(lambda x: datetime.datetime(int(x[:4]), int(x[4:6]), int(x[6:8]),
                  int(x[8:10]), int(x[10:12]), int(x[12:14]),int(x[14:])))
display(snapshot["datetime"].iloc[0])
display(snapshot["datetime"].dt.minute)
print(datetime.datetime.now() - startTm)

startTm = datetime.datetime.now()
snapshot['datetime'] = snapshot["date"].astype(str) + snapshot["time"].astype(str).apply(lambda x: x.rjust(12, '0'))
snapshot["datetime"] = snapshot["datetime"].apply(lambda x: datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f'))
display(snapshot["datetime"].iloc[0])
display(snapshot["datetime"].dt.minute)
print(datetime.datetime.now() - startTm)

startTm = datetime.datetime.now()
snapshot['datetime'] = snapshot["date"].astype(str) + snapshot["time"].astype(str).apply(lambda x: x.rjust(12, '0'))
snapshot["datetime"] = pd.to_datetime(snapshot["datetime"], format='%Y%m%d%H%M%S%f')
display(snapshot["datetime"].iloc[0])
display(snapshot["datetime"].dt.minute)
print(datetime.datetime.now() - startTm)

startTm = datetime.datetime.now()
snapshot['datetime'] = snapshot["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
display(snapshot["datetime"].iloc[0])
display(snapshot["datetime"].dt.minute)
print(datetime.datetime.now() - startTm)

Timestamp('2018-01-02 09:15:00')

0          15
1          15
2          15
3          15
4          15
           ..
4414993     0
4414994     0
4414995     0
4414996     0
4414997     0
Name: datetime, Length: 4414998, dtype: int64

0:00:11.611807


Timestamp('2018-01-02 09:15:00')

0          15
1          15
2          15
3          15
4          15
           ..
4414993     0
4414994     0
4414995     0
4414996     0
4414997     0
Name: datetime, Length: 4414998, dtype: int64

0:00:41.642430


Timestamp('2018-01-02 09:15:00')

0          15
1          15
2          15
3          15
4          15
           ..
4414993     0
4414994     0
4414995     0
4414996     0
4414997     0
Name: datetime, Length: 4414998, dtype: int64

0:00:05.208686


Timestamp('2018-01-02 09:14:54')

0          14
1          14
2          14
3          14
4          14
           ..
4414993     0
4414994     0
4414995     0
4414996     0
4414997     0
Name: datetime, Length: 4414998, dtype: int64

0:00:02.434567


In [47]:
datetime.datetime.fromtimestamp(1589870769000843044/1e6).strftime("%Y-%m-%d %H:%M:%S %f")

'2018-01-02 15:00:00 000000'

In [14]:
pd.set_option("max_rows", 200)
snapshot.dtypes

date                               int64
time                               int64
clockAtArrival                     int64
datetime                  datetime64[ns]
ID                                 int64
ordering                           int64
cum_tradesCnt                      int64
cum_volume                         int64
cum_amount                         int64
prevClose                          int64
open                               int64
high                               int64
low                                int64
close                              int64
bid10p                             int64
bid9p                              int64
bid8p                              int64
bid7p                              int64
bid6p                              int64
bid5p                              int64
bid4p                              int64
bid3p                              int64
bid2p                              int64
bid1p                              int64
ask1p           

#### 2. Upload SH 2019 data (20190102-20190225未解决)

In [10]:
import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime
import time
pd.set_option("max_columns", 200)

    
colnames=['StockID', 'DateTime', 'prevClose', 'open','HighPx','LowPx','close','cum_volume','cum_amount','InstrumentStatus',
         'bid1p','bid2p','bid3p','bid4p','bid5p','bid6p','bid7p','bid8p','bid9p','bid10p','bid1q','bid2q','bid3q','bid4q','bid5q','bid6q','bid7q',
         'bid8q','bid9q','bid10q','b1','b2','b3','b4','b5','b6','b7','b8','b9','b10','b11','b12','b13','b14','b15','b16','b17','b18','b19','b20',
          'b21','b22','b23','b24','b25','b26','b27','b28','b29','b30','b31','b32','b33','b34','b35','b36','b37','b38','b39','b40','b41','b42','b43',
          'b44','b45','b46','b47','b48','b49','b50','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p','ask1q','ask2q',
          'ask3q','ask4q','ask5q','ask6q','ask7q','ask8q','ask9q','ask10q','a1','a2','a3','a4','a5','a6','a7','a8','a9','a10','a11','a12','a13','a14',
         'a15','a16','a17','a18','a19','a20','a21','a22','a23','a24','a25','a26','a27','a28','a29','a30','a31','a32','a33','a34','a35','a36','a37','a38',
         'a39','a40','a41','a42','a43','a44','a45','a46','a47','a48','a49','a50','NumTrades','IOPV','TotalBidQty','TotalOfferQty','WeightedAvgBidPx',
         'WeightedAvgOfferPx','TotalBidNumber','TotalOfferNumber','BidTradeMaxDuration','OfferTradeMaxDuration','NumBidOrders','NumOfferOrders','WithdrawBuyNumber',
         'WithdrawBuyAmount','WithdrawBuyMoney','WithdrawSellNumber','WithdrawSellAmount','WithdrawSellMoney','ETFBuyNumber','ETFBuyAmount','ETFBuyMoney',
         'ETFSellNumber','ETFSellAmount','ETFSellMoney'] 
colnames1=['StockID', 'DateTime', 'prevClose', 'open','HighPx','LowPx','close','cum_volume','cum_amount','InstrumentStatus',
         'bid1p','bid2p','bid3p','bid4p','bid5p','bid6p','bid7p','bid8p','bid9p','bid10p','bid1q','bid2q','bid3q','bid4q','bid5q','bid6q','bid7q',
         'bid8q','bid9q','bid10q','bid1n', 'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n', 'bid9n', 'bid10n',
           'bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p','ask1q','ask2q',
          'ask3q','ask4q','ask5q','ask6q','ask7q','ask8q','ask9q','ask10q','ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n',
           'ask7n', 'ask8n', 'ask9n', 'ask10n','ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q','NumTrades','IOPV','TotalBidQty','TotalOfferQty','WeightedAvgBidPx',
         'WeightedAvgOfferPx','TotalBidNumber','TotalOfferNumber','BidTradeMaxDuration','OfferTradeMaxDuration','NumBidOrders','NumOfferOrders','WithdrawBuyNumber',
         'WithdrawBuyAmount','WithdrawBuyMoney','WithdrawSellNumber','WithdrawSellAmount','WithdrawSellMoney','ETFBuyNumber','ETFBuyAmount','ETFBuyMoney',
         'ETFSellNumber','ETFSellAmount','ETFSellMoney'] 
colnames2=['SecurityID', 'DateTime', 'Price', 'Qty', 'LeaveQty', "Side"]

year = "2019"
df = []
bad = []

readPath = 'G:\\' + year +'\\***' 
# readPath = 'F:\\' + year +'\\***'
dataPathLs = np.array(glob.glob(readPath))

for data in dataPathLs[106:]:
    startTm = datetime.datetime.now()
    
    if len(np.array(glob.glob(data +'\\***'))) == 0:
        continue

    else:
        try1 = pd.read_csv(data + "\\Snapshot.csv", nrows=1, header=None)
        if try1.shape[1] == 2:
            print("native lv2 data")
            snapshot = pd.read_csv(data + "\\Snapshot.csv", header=None, names=colnames1).iloc[1:, :]
            snapshot["DateTime"] = snapshot["DateTime"].apply(lambda x: int(x))
            snapshot['time'] = ((snapshot["DateTime"] - int(snapshot["DateTime"].iloc[0]//1000000 * 1000000))*1000000).astype(np.int64)
        else:
            snapshot = pd.read_csv(data + "\\Snapshot.csv", header=None)
            assert(snapshot.shape[1] == len(colnames))
            print("XTP lv2 data")
            snapshot.columns = colnames
            snapshot["DateTime"] = snapshot["DateTime"].apply(lambda x: int(x))
            snapshot['time'] = ((snapshot["DateTime"] - int(snapshot["DateTime"].iloc[0]//1000000 * 1000000))*1000000).astype(np.int64)                 
        print("csv is loaded")
        
        snapshot["StockID"] = snapshot["StockID"].astype(int)
        snapshot = snapshot[(snapshot["StockID"] >= 600000) & (snapshot["StockID"] < 700000)].reset_index(drop=True)
        snapshot = snapshot.rename(columns={"StockID": "ID"})
        snapshot["date"] = int(snapshot["DateTime"].iloc[0]//1000000)
        snapshot["clockAtArrival"] = snapshot["DateTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
        # snapshot["clockAtArrival"] = pd.to_datetime(snapshot["DateTime"].astype(str), format = '%Y%m%d%H%M%S').apply(lambda x: int(x.timestamp()*1e6))
        snapshot['ID'] = 1000000 + snapshot["ID"]
        
        snapshot = snapshot.rename(columns={"NumTrades": "cum_tradesCnt", "HighPx":"high", "LowPx":"low", "TotalBidQty":"totalBidQuantity",
                                   "TotalOfferQty":"totalAskQuantity", "WeightedAvgBidPx":"vwapBid", "WeightedAvgOfferPx":"vwapAsk",
                                   "TotalBidNumber":"totalBidOrders", "TotalOfferNumber":"totalAskOrders", "NumBidOrders":"totalBidLevels",
                                   "NumOfferOrders":"totalAskLevels", "WithdrawBuyNumber":"cum_canceledBuyOrders",
                                   "WithdrawBuyAmount":"cum_canceledBuyVolume", "WithdrawBuyMoney":"cum_canceledBuyAmount",
                                   "WithdrawSellNumber":"cum_canceledSellOrders", "WithdrawSellAmount":"cum_canceledSellVolume",
                                   "WithdrawSellMoney":"cum_canceledSellAmount", "BidTradeMaxDuration":"bidTradeMaxDuration",
                                    "OfferTradeMaxDuration":"askTradeMaxDuration"})
        
        snapshot['datetime'] = snapshot["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
        
        if snapshot["date"].iloc[0] <= 20190225:
            F1 = open("\\\\192.168.10.30\\Kevin_zhenyu\\auction\\mdAucLog_SH_" + str(snapshot["date"].iloc[0]) + ".pkl", 'rb')
            aucData = pickle.load(F1)
            aucData = aucData.rename(columns={"time":"datetime"})
            aucData["DateTime"] = aucData["datetime"].apply(lambda x: x.strftime("%Y%m%d%H%M%S"))
            aucData["DateTime"] = aucData["DateTime"].astype("int64")
            m_ax = 92459 + int(aucData["DateTime"].iloc[0]//1000000 * 1000000)
            aucData["DateTime"] = aucData['DateTime'].clip(upper=m_ax)
            aucData['time'] = ((aucData["DateTime"] - int(aucData["DateTime"].iloc[0]//1000000 * 1000000))*1000000).astype(np.int64)
            aucData["clockAtArrival"] = aucData["DateTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
            aucData['datetime'] = aucData["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
            aucData = aucData[['date', 'time', 'datetime', 'clockAtArrival', 'ID', 'bid1p', 'ask1p', 'bid2q', 'bid1q', 'ask1q', 'ask2q']].reset_index(drop=True)

            test1 = aucData.groupby("ID")["time"].max().reset_index()
            test2 = snapshot.groupby("ID")["time"].min().reset_index()
            test = pd.merge(test1, test2, on="ID")
            assert(test[test["time_x"] >= test["time_y"]].shape[0]==0)
            
            print("aucData is loaded")

            snapshot = pd.concat([aucData, snapshot]).reset_index(drop=True)
        snapshot = snapshot.fillna(0)
        
        snapshot = snapshot[~((snapshot["bid1p"] == 0) & (snapshot["ask1p"] == 0))]
        snapshot["ordering"] = snapshot.groupby("ID").cumcount()
        snapshot["ordering"] = snapshot["ordering"] + 1
    
        
        for col in ["ID", "cum_tradesCnt", "cum_volume", 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q','ask7q','ask8q','ask9q',
                        'ask10q', 'bid1q','bid2q','bid3q','bid4q','bid5q','bid6q','bid7q','bid8q','bid9q','bid10q', 'bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q', "totalBidQuantity", "totalAskQuantity", "totalBidOrders",
        'totalAskOrders', 'totalBidLevels', 'totalAskLevels', 'cum_canceledBuyOrders', 'cum_canceledBuyVolume', 'cum_canceledSellOrders',
        'cum_canceledSellVolume', "ordering", 'bid1n', 'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n', 'bid9n', 'bid10n', 'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n',
        'ask7n', 'ask8n', 'ask9n', 'ask10n', 'bidTradeMaxDuration', 'askTradeMaxDuration']:
            snapshot[col] = snapshot[col].astype(np.int64)
        
        for cols in ["prevClose", "open", "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
             'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'vwapBid', "vwapAsk"]:
            snapshot[cols] = (snapshot[cols] * 10000).round(0).astype(np.int64)
        
        for cols in ["cum_amount", "cum_canceledBuyAmount", "cum_canceledSellAmount"]:
            snapshot[cols] = (snapshot[cols] * 10000).round(0).astype(np.int64)
        
        
        display(snapshot["date"].iloc[0])
        display(snapshot.groupby("ID")["time"].min().max())
        
        
        
        snapshot["prevClose"] = np.where(snapshot["time"] >= 91500000000, snapshot.groupby("ID")["prevClose"].transform("max"), snapshot["prevClose"]) 
        snapshot["open"] = np.where(snapshot["cum_volume"] > 0, snapshot.groupby("ID")["open"].transform("max"), snapshot["open"])
        assert(sum(snapshot[snapshot["open"] != 0].groupby("ID")["open"].nunique() != 1) == 0)
        assert(sum(snapshot[snapshot["prevClose"] != 0].groupby("ID")["prevClose"].nunique() != 1) == 0)
        assert(snapshot[snapshot["cum_volume"] > 0]["open"].min() > 0)
        
        
        snapshot = snapshot[["date", "time", "clockAtArrival", "datetime", "ID", "ordering", "cum_tradesCnt", "cum_volume", "cum_amount", "prevClose",
                            "open", "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p','bid2p','bid1p',
                            'ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'bid10q','bid9q','bid8q',
                             'bid7q','bid6q','bid5q','bid4q','bid3q','bid2q','bid1q', 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q',
                             'ask7q','ask8q','ask9q','ask10q', 'bid10n', 'bid9n', 'bid8n', 'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 
                             'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n','ask7n', 'ask8n', 'ask9n', 'ask10n','bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q',"totalBidQuantity", "totalAskQuantity","vwapBid", "vwapAsk",
        "totalBidOrders",'totalAskOrders','totalBidLevels', 'totalAskLevels', 'bidTradeMaxDuration', 'askTradeMaxDuration', 'cum_canceledBuyOrders', 'cum_canceledBuyVolume',
        "cum_canceledBuyAmount", "cum_canceledSellOrders", 'cum_canceledSellVolume',"cum_canceledSellAmount"]]
        
        db = DB("mongodb://user_rw:faa96dfc@192.168.10.223")
        db.write('snapshot', snapshot)
        
        print(datetime.datetime.now() - startTm)

native lv2 data


  interactivity=interactivity, compiler=compiler, result=result)


csv is loaded


20191206

92501000000

0:06:36.199828
native lv2 data
csv is loaded


20191209

92500000000

0:07:01.258469
native lv2 data
csv is loaded


20191210

92503000000

0:07:21.265207
native lv2 data
csv is loaded


20191211

92501000000

0:07:10.671327
native lv2 data
csv is loaded


20191212

92501000000

0:07:10.120166
native lv2 data
csv is loaded


20191213

92501000000

0:07:42.784473
native lv2 data
csv is loaded


20191216

92501000000

0:08:20.136170
native lv2 data
csv is loaded


20191217

92500000000

0:08:38.173318
native lv2 data
csv is loaded


20191218

92500000000

0:08:08.897070
native lv2 data
csv is loaded


20191219

92502000000

0:08:07.991096
native lv2 data
csv is loaded


20191220

92501000000

0:08:09.444132
native lv2 data
csv is loaded


20191223

92501000000

0:08:13.336759
native lv2 data
csv is loaded


20191224

92501000000

0:07:35.370387
native lv2 data
csv is loaded


20191225

92501000000

0:07:36.531969
native lv2 data
csv is loaded


20191226

92501000000

0:07:49.837337
native lv2 data
csv is loaded


20191227

92501000000

0:08:23.171111
native lv2 data
csv is loaded


20191230

92501000000

0:08:25.115669
native lv2 data
csv is loaded


20191231

92501000000

0:08:12.251851


#### 3. Upload SH 2020 data

In [2]:
import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime
import time
pd.set_option("max_columns", 200)

    
colnames=['StockID', 'DateTime', 'prevClose', 'open','HighPx','LowPx','close','cum_volume','cum_amount','InstrumentStatus',
         'bid1p','bid2p','bid3p','bid4p','bid5p','bid6p','bid7p','bid8p','bid9p','bid10p','bid1q','bid2q','bid3q','bid4q','bid5q','bid6q','bid7q',
         'bid8q','bid9q','bid10q','bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p','ask1q','ask2q',
          'ask3q','ask4q','ask5q','ask6q','ask7q','ask8q','ask9q','ask10q','ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q','NumTrades','IOPV','TotalBidQty','TotalOfferQty','WeightedAvgBidPx',
         'WeightedAvgOfferPx','TotalBidNumber','TotalOfferNumber','BidTradeMaxDuration','OfferTradeMaxDuration','NumBidOrders','NumOfferOrders','WithdrawBuyNumber',
         'WithdrawBuyAmount','WithdrawBuyMoney','WithdrawSellNumber','WithdrawSellAmount','WithdrawSellMoney','ETFBuyNumber','ETFBuyAmount','ETFBuyMoney',
         'ETFSellNumber','ETFSellAmount','ETFSellMoney'] 
colnames1=['StockID', 'DateTime', 'prevClose', 'open','HighPx','LowPx','close','cum_volume','cum_amount','InstrumentStatus',
         'bid1p','bid2p','bid3p','bid4p','bid5p','bid6p','bid7p','bid8p','bid9p','bid10p','bid1q','bid2q','bid3q','bid4q','bid5q','bid6q','bid7q',
         'bid8q','bid9q','bid10q','bid1n', 'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n', 'bid9n', 'bid10n',
           'bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p','ask1q','ask2q',
          'ask3q','ask4q','ask5q','ask6q','ask7q','ask8q','ask9q','ask10q','ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n',
           'ask7n', 'ask8n', 'ask9n', 'ask10n','ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q','NumTrades','IOPV','TotalBidQty','TotalOfferQty','WeightedAvgBidPx',
         'WeightedAvgOfferPx','TotalBidNumber','TotalOfferNumber','BidTradeMaxDuration','OfferTradeMaxDuration','NumBidOrders','NumOfferOrders','WithdrawBuyNumber',
         'WithdrawBuyAmount','WithdrawBuyMoney','WithdrawSellNumber','WithdrawSellAmount','WithdrawSellMoney','ETFBuyNumber','ETFBuyAmount','ETFBuyMoney',
         'ETFSellNumber','ETFSellAmount','ETFSellMoney'] 
colnames2=['SecurityID', 'DateTime', 'Price', 'Qty', 'LeaveQty', "Side"]


df = []
bad = []

readPath = 'L:\\2020 data\\SH\\***' 
dataPathLs = np.array(glob.glob(readPath))

for data in dataPathLs[2:]:
    startTm = datetime.datetime.now()
    
    if len(np.array(glob.glob(data +'\\***'))) == 0:
        continue

    else:
        try:
            snapshot = pd.read_csv(data +'\\Snapshot.csv', header=None)
            assert(snapshot.shape[1] == len(colnames))
            print("XTP lv2 data")
            snapshot.columns = colnames
            snapshot["DateTime"] = snapshot["DateTime"].apply(lambda x: int(x))
            snapshot['time'] = ((snapshot["DateTime"] - int(snapshot["DateTime"].iloc[0]//1000000 * 1000000))*1000000).astype(np.int64)
        except:
            try1 = pd.read_csv(data +'\\Snapshot.csv', nrows=1, header=None)
            if try1.shape[1] == 2:
                print("native lv2 data")
                snapshot = pd.read_csv(data +'\\Snapshot.csv', header=None, names=colnames1).iloc[1:, :]
                snapshot["DateTime"] = snapshot["DateTime"].apply(lambda x: int(x))
                snapshot['time'] = ((snapshot["DateTime"] - int(snapshot["DateTime"].iloc[0]//1000000 * 1000000))*1000000).astype(np.int64)                
        print("csv is loaded")
        
        snapshot["StockID"] = snapshot["StockID"].astype(int)
        snapshot = snapshot[(snapshot["StockID"] >= 600000) & (snapshot["StockID"] < 700000)].reset_index(drop=True)
        snapshot = snapshot.rename(columns={"StockID": "ID"})
        snapshot["date"] = int(snapshot["DateTime"].iloc[0]//1000000)
        snapshot["clockAtArrival"] = snapshot["DateTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
        snapshot['ID'] = 1000000 + snapshot["ID"]
        snapshot = snapshot.rename(columns={"NumTrades": "cum_tradesCnt", "HighPx":"high", "LowPx":"low", "TotalBidQty":"totalBidQuantity",
                                   "TotalOfferQty":"totalAskQuantity", "WeightedAvgBidPx":"vwapBid", "WeightedAvgOfferPx":"vwapAsk",
                                   "TotalBidNumber":"totalBidOrders", "TotalOfferNumber":"totalAskOrders", "NumBidOrders":"totalBidLevels",
                                   "NumOfferOrders":"totalAskLevels", "WithdrawBuyNumber":"cum_canceledBuyOrders",
                                   "WithdrawBuyAmount":"cum_canceledBuyVolume", "WithdrawBuyMoney":"cum_canceledBuyAmount",
                                   "WithdrawSellNumber":"cum_canceledSellOrders", "WithdrawSellAmount":"cum_canceledSellVolume",
                                   "WithdrawSellMoney":"cum_canceledSellAmount", "BidTradeMaxDuration":"bidTradeMaxDuration",
                                    "OfferTradeMaxDuration":"askTradeMaxDuration"})
        
        try1 = pd.read_csv(data +'\\Auction.csv', nrows=1, header=None)
        if try1.shape[1] == 2:
            print("native auction data")
            aucData = pd.read_csv(data +'\\Auction.csv', header=None, names=colnames2).iloc[1:, :]
        else:
            aucData = pd.read_csv(data +'\\Auction.csv', header=None)
            assert(aucData.shape[1] == len(colnames2))
            print("native auction data")
            aucData.columns = colnames2
            
        aucData['bid1p'] = aucData['Price']
        aucData['ask1p'] = aucData['Price']
        aucData['bid1q'] = aucData['Qty']
        aucData['ask1q'] = aucData['Qty']
        aucData['bid2q'] = np.where(aucData['Side'] == 1, aucData['LeaveQty'], 0)
        aucData['ask2q'] = np.where(aucData['Side'] == 2, aucData['LeaveQty'], 0)
        aucData["SecurityID"] = aucData["SecurityID"].astype(int)
        aucData = aucData[(aucData["SecurityID"] >= 600000) & (aucData["SecurityID"] < 700000)].reset_index(drop=True)
        aucData["ID"] = aucData["SecurityID"] + 1000000
        aucData["DateTime"] = aucData["DateTime"].apply(lambda x: int(x))
        aucData["date"] = int(aucData["DateTime"].iloc[0]//1000000)
        m_ax = 92459 + int(aucData["DateTime"].iloc[0]//1000000 * 1000000)
        aucData["DateTime"] = aucData['DateTime'].clip(upper=m_ax)
        aucData['time'] = ((aucData["DateTime"] - int(aucData["DateTime"].iloc[0]//1000000 * 1000000))*1000000).astype(np.int64)
        aucData["clockAtArrival"] = aucData["DateTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
        aucData = aucData[['date', 'time', 'clockAtArrival', 'ID', 'bid1p', 'ask1p', 'bid2q', 'bid1q', 'ask1q', 'ask2q']].reset_index(drop=True)
        
        snapshot = snapshot[~((snapshot["bid1p"] == 0) & (snapshot["ask1p"] == 0))]
        test1 = aucData.groupby("ID")["time"].max().reset_index()
        test2 = snapshot.groupby("ID")["time"].min().reset_index()
        test = pd.merge(test1, test2, on="ID")
        assert(test[test["time_x"] >= test["time_y"]].shape[0]==0)
        
        snapshot = pd.concat([aucData, snapshot]).reset_index(drop=True)
        snapshot = snapshot.fillna(0)
        snapshot = snapshot[~((snapshot["bid1p"] == 0) & (snapshot["ask1p"] == 0))]
        
        snapshot["ordering"] = snapshot.groupby("ID").cumcount()
        snapshot["ordering"] = snapshot["ordering"] + 1
        
        for col in ["ID", "cum_tradesCnt", "cum_volume", 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q','ask7q','ask8q','ask9q',
                        'ask10q', 'bid1q','bid2q','bid3q','bid4q','bid5q','bid6q','bid7q','bid8q','bid9q','bid10q', 'bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q', "totalBidQuantity", "totalAskQuantity", "totalBidOrders",
        'totalAskOrders', 'totalBidLevels', 'totalAskLevels', 'cum_canceledBuyOrders', 'cum_canceledBuyVolume', 'cum_canceledSellOrders',
        'cum_canceledSellVolume', "ordering", 'bidTradeMaxDuration', 'askTradeMaxDuration']:
            snapshot[col] = snapshot[col].astype(np.int64)

        
        for cols in ["prevClose", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
             'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'vwapBid', "vwapAsk"]:
            snapshot[cols] = (snapshot[cols] * 10000).round(0).astype(np.int64)
        
        for cols in ["cum_amount", "cum_canceledBuyAmount", "cum_canceledSellAmount"]:
            snapshot[cols] = (snapshot[cols] * 10000).round(0).astype(np.int64)
        
        for cols in ['bid1n', 'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n', 'bid9n', 'bid10n', 'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n',
        'ask7n', 'ask8n', 'ask9n', 'ask10n']:
            snapshot[cols] = 0
        
        display(snapshot["date"].iloc[0])
        display(snapshot.groupby("ID")["time"].min().max())
        
        snapshot['datetime'] = snapshot["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
        
        assert(sum(snapshot[snapshot["open"] != 0].groupby("ID")["open"].nunique() != 1) == 0)
        assert(sum(snapshot[snapshot["prevClose"] != 0].groupby("ID")["prevClose"].nunique() != 1) == 0)
        snapshot["prevClose"] = np.where(snapshot["time"] >= 91500000000, snapshot.groupby("ID")["prevClose"].transform("max"), snapshot["prevClose"]) 
        snapshot["open"] = np.where(snapshot["cum_volume"] > 0, snapshot.groupby("ID")["open"].transform("max"), snapshot["open"])
        assert(sum(snapshot[snapshot["open"] != 0].groupby("ID")["open"].nunique() != 1) == 0)
        assert(sum(snapshot[snapshot["prevClose"] != 0].groupby("ID")["prevClose"].nunique() != 1) == 0)
        assert(snapshot[snapshot["cum_volume"] > 0]["open"].min() > 0)
        
        snapshot = snapshot[["date", "time", "clockAtArrival", "datetime", "ID", "ordering", "cum_tradesCnt", "cum_volume", "cum_amount", "prevClose",
                            "open", "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p','bid2p','bid1p',
                            'ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'bid10q','bid9q','bid8q',
                             'bid7q','bid6q','bid5q','bid4q','bid3q','bid2q','bid1q', 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q',
                             'ask7q','ask8q','ask9q','ask10q', 'bid10n', 'bid9n', 'bid8n', 'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 
                             'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n','ask7n', 'ask8n', 'ask9n', 'ask10n','bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q',"totalBidQuantity", "totalAskQuantity","vwapBid", "vwapAsk",
        "totalBidOrders",'totalAskOrders','totalBidLevels', 'totalAskLevels', 'bidTradeMaxDuration', 'askTradeMaxDuration', 'cum_canceledBuyOrders', 'cum_canceledBuyVolume',
        "cum_canceledBuyAmount", "cum_canceledSellOrders", 'cum_canceledSellVolume',"cum_canceledSellAmount"]]
        
        
        db = DB("mongodb://user_rw:faa96dfc@192.168.10.223")
        db.write('snapshot', snapshot)
        
        print(datetime.datetime.now() - startTm)

  interactivity=interactivity, compiler=compiler, result=result)


XTP lv2 data
csv is loaded
native auction data


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




20200106

92459000000

0:08:39.598756
XTP lv2 data
csv is loaded
native auction data


20200107

92459000000

0:08:54.113808
XTP lv2 data
csv is loaded
native auction data


20200108

92527000000

0:09:36.175646
XTP lv2 data
csv is loaded
native auction data


20200109

92459000000

0:09:24.744605
XTP lv2 data
csv is loaded
native auction data


20200110

92459000000

0:09:13.736625
XTP lv2 data
csv is loaded
native auction data


20200113

92459000000

0:09:43.965205
XTP lv2 data
csv is loaded
native auction data


20200114

92459000000

0:09:56.639565
XTP lv2 data
csv is loaded
native auction data


20200115

92459000000

0:09:10.761935
XTP lv2 data
csv is loaded
native auction data


20200116

92459000000

0:09:17.899166
XTP lv2 data
csv is loaded
native auction data


20200117

92459000000

0:09:14.383018
XTP lv2 data
csv is loaded
native auction data


20200120

92459000000

0:09:19.768174
XTP lv2 data
csv is loaded
native auction data


20200121

92532000000

0:09:23.203480
XTP lv2 data
csv is loaded
native auction data


20200122

92459000000

0:09:40.603453
XTP lv2 data
csv is loaded
native auction data


20200123

92558000000

0:10:26.755053
XTP lv2 data
csv is loaded
native auction data


20200203

91850000000

0:07:31.371794
XTP lv2 data
csv is loaded
native auction data


20200204

92459000000

0:10:38.591020
XTP lv2 data
csv is loaded
native auction data


20200205

92459000000

0:10:54.783460
XTP lv2 data
csv is loaded
native auction data


20200206

92552000000

0:10:35.575452
XTP lv2 data
csv is loaded
native auction data


20200207

92459000000

0:10:12.997821
XTP lv2 data
csv is loaded
native auction data


20200210

92459000000

0:10:17.880766
XTP lv2 data
csv is loaded
native auction data


20200211

92459000000

0:10:05.121885
XTP lv2 data
csv is loaded
native auction data


20200212

92605000000

0:10:03.076354
XTP lv2 data
csv is loaded
native auction data


20200213

92459000000

0:10:16.341882
XTP lv2 data
csv is loaded
native auction data


20200214

92459000000

0:10:04.149513
XTP lv2 data
csv is loaded
native auction data


20200217

92459000000

0:10:33.177365
XTP lv2 data
csv is loaded
native auction data


20200218

92459000000

0:10:40.249220
XTP lv2 data
csv is loaded
native auction data


20200219

92459000000

0:10:36.831839
XTP lv2 data
csv is loaded
native auction data


20200220

92459000000

0:10:50.762584
XTP lv2 data
csv is loaded
native auction data


20200221

92459000000

0:10:56.525437
XTP lv2 data
csv is loaded
native auction data


20200224

92559000000

0:10:59.181920
XTP lv2 data
csv is loaded
native auction data


20200225

92459000000

0:11:27.206720
XTP lv2 data
csv is loaded
native auction data


20200226

92502000000

0:11:25.220204
XTP lv2 data
csv is loaded
native auction data


20200227

92459000000

0:10:59.563849
XTP lv2 data
csv is loaded
native auction data


20200228

92459000000

0:11:38.007494
XTP lv2 data
csv is loaded
native auction data


20200302

92459000000

0:11:34.085991
XTP lv2 data
csv is loaded
native auction data


20200303

92459000000

0:11:30.898515
XTP lv2 data
csv is loaded
native auction data


20200304

92459000000

0:11:10.647694
XTP lv2 data
csv is loaded
native auction data


20200305

92459000000

0:11:47.170979
XTP lv2 data
csv is loaded
native auction data


20200306

92459000000

0:11:26.377636
XTP lv2 data
csv is loaded
native auction data


20200309

92459000000

0:11:43.403072
XTP lv2 data
csv is loaded
native auction data


20200310

92459000000

0:11:40.424017
XTP lv2 data
csv is loaded
native auction data


20200311

92459000000

0:11:29.849321
XTP lv2 data
csv is loaded
native auction data


20200312

92459000000

0:11:18.161573
XTP lv2 data
csv is loaded
native auction data


20200313

92459000000

0:11:48.073617
XTP lv2 data
csv is loaded
native auction data


20200316

92501000000

0:11:35.874181
XTP lv2 data
csv is loaded
native auction data


20200317

92459000000

0:11:41.077296
XTP lv2 data
csv is loaded
native auction data


20200318

92459000000

0:11:30.117631
XTP lv2 data
csv is loaded
native auction data


20200319

92459000000

0:11:28.162803
XTP lv2 data
csv is loaded
native auction data


20200320

92551000000

0:11:27.069782
XTP lv2 data
csv is loaded
native auction data


20200323

92459000000

0:11:13.857081
XTP lv2 data
csv is loaded
native auction data


20200324

92459000000

0:11:09.315202
XTP lv2 data
csv is loaded
native auction data


20200325

92533000000

0:11:15.827815
XTP lv2 data
csv is loaded
native auction data


20200326

92459000000

0:11:27.927459
XTP lv2 data
csv is loaded
native auction data


20200327

92459000000

0:11:13.030327
XTP lv2 data
csv is loaded
native auction data


20200330

92459000000

0:11:15.768940
XTP lv2 data
csv is loaded
native auction data


20200331

92459000000

0:10:54.628530
XTP lv2 data
csv is loaded
native auction data


20200401

92548000000

0:10:58.199708
XTP lv2 data
csv is loaded
native auction data


20200402

92459000000

0:11:10.957381
XTP lv2 data
csv is loaded
native auction data


20200403

92459000000

0:11:05.892360
XTP lv2 data
csv is loaded
native auction data


20200407

92459000000

0:11:29.744800
XTP lv2 data
csv is loaded
native auction data


20200408

92544000000

0:11:33.153246
XTP lv2 data
csv is loaded
native auction data


20200409

92459000000

0:11:53.430267
XTP lv2 data
csv is loaded
native auction data


20200410

92556000000

0:11:33.879660
XTP lv2 data
csv is loaded
native auction data


20200413

92459000000

0:10:46.706377
XTP lv2 data
csv is loaded
native auction data


20200414

92606000000

0:11:03.856150
XTP lv2 data
csv is loaded
native auction data


20200415

92459000000

0:11:09.566120
XTP lv2 data
csv is loaded
native auction data


20200416

92459000000

0:11:07.301381
XTP lv2 data
csv is loaded
native auction data


20200417

92606000000

0:11:22.001861
XTP lv2 data
csv is loaded
native auction data


20200420

92459000000

0:11:14.297719
XTP lv2 data
csv is loaded
native auction data


20200421

92459000000

0:11:29.759911
XTP lv2 data
csv is loaded
native auction data


20200422

92558000000

0:11:27.365112
XTP lv2 data
csv is loaded
native auction data


20200423

92554000000

0:12:02.752698
XTP lv2 data
csv is loaded
native auction data


20200424

92556000000

0:11:50.059819
XTP lv2 data
csv is loaded
native auction data


20200427

92459000000

0:11:56.798906
XTP lv2 data
csv is loaded
native auction data


20200428

92459000000

0:12:12.531531
XTP lv2 data
csv is loaded
native auction data


20200429

92459000000

0:11:23.044253
XTP lv2 data
csv is loaded
native auction data


20200430

92459000000

0:12:14.224154
XTP lv2 data
csv is loaded
native auction data


20200506

92459000000

0:12:14.370024
XTP lv2 data
csv is loaded
native auction data


20200507

92459000000

0:11:45.023672
XTP lv2 data
csv is loaded
native auction data


20200508

92459000000

0:12:13.129585


#### Upload SH 2017 data

In [6]:
import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime
import time
pd.set_option("max_columns", 200)

    
colnames=['StockID', 'DateTime', 'prevClose', 'open','HighPx','LowPx','close','cum_volume','cum_amount','InstrumentStatus',
         'bid1p','bid2p','bid3p','bid4p','bid5p','bid6p','bid7p','bid8p','bid9p','bid10p','bid1q','bid2q','bid3q','bid4q','bid5q','bid6q','bid7q',
         'bid8q','bid9q','bid10q','bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p','ask1q','ask2q',
          'ask3q','ask4q','ask5q','ask6q','ask7q','ask8q','ask9q','ask10q','ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q','NumTrades','IOPV','TotalBidQty','TotalOfferQty','WeightedAvgBidPx',
         'WeightedAvgOfferPx','TotalBidNumber','TotalOfferNumber','BidTradeMaxDuration','OfferTradeMaxDuration','NumBidOrders','NumOfferOrders','WithdrawBuyNumber',
         'WithdrawBuyAmount','WithdrawBuyMoney','WithdrawSellNumber','WithdrawSellAmount','WithdrawSellMoney','ETFBuyNumber','ETFBuyAmount','ETFBuyMoney',
         'ETFSellNumber','ETFSellAmount','ETFSellMoney'] 
colnames1=['StockID', 'DateTime', 'prevClose', 'open','HighPx','LowPx','close','cum_volume','cum_amount','InstrumentStatus',
         'bid1p','bid2p','bid3p','bid4p','bid5p','bid6p','bid7p','bid8p','bid9p','bid10p','bid1q','bid2q','bid3q','bid4q','bid5q','bid6q','bid7q',
         'bid8q','bid9q','bid10q','bid1n', 'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n', 'bid9n', 'bid10n',
           'bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p','ask1q','ask2q',
          'ask3q','ask4q','ask5q','ask6q','ask7q','ask8q','ask9q','ask10q','ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n',
           'ask7n', 'ask8n', 'ask9n', 'ask10n','ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q','NumTrades','IOPV','TotalBidQty','TotalOfferQty','WeightedAvgBidPx',
         'WeightedAvgOfferPx','TotalBidNumber','TotalOfferNumber','BidTradeMaxDuration','OfferTradeMaxDuration','NumBidOrders','NumOfferOrders','WithdrawBuyNumber',
         'WithdrawBuyAmount','WithdrawBuyMoney','WithdrawSellNumber','WithdrawSellAmount','WithdrawSellMoney','ETFBuyNumber','ETFBuyAmount','ETFBuyMoney',
         'ETFSellNumber','ETFSellAmount','ETFSellMoney'] 
colnames2=['SecurityID', 'DateTime', 'Price', 'Qty', 'LeaveQty', "Side"]


year = "2017"
df = []
bad = []

readPath = 'G:\\' + year +'\\***' 
dataPathLs = np.array(glob.glob(readPath))

for data in dataPathLs[176:]:
    startTm = datetime.datetime.now()
    
    if len(np.array(glob.glob(data +'\\***'))) == 0:
        continue

    else:
        try:
            snapshot = pd.read_csv(data +'\\Snapshot.csv', header=None)
            assert(snapshot.shape[1] == len(colnames))
            print("XTP lv2 data")
            snapshot.columns = colnames
            snapshot["DateTime"] = snapshot["DateTime"].apply(lambda x: int(x))
            snapshot['time'] = ((snapshot["DateTime"] - int(snapshot["DateTime"].iloc[0]//1000000 * 1000000))*1000000).astype(np.int64)
        except:
            try1 = pd.read_csv(data +'\\Snapshot.csv', nrows=1, header=None)
            if try1.shape[1] == 2:
                print("native lv2 data")
                snapshot = pd.read_csv(data +'\\Snapshot.csv', header=None, names=colnames1).iloc[1:, :]
                snapshot["DateTime"] = snapshot["DateTime"].apply(lambda x: int(x))
                snapshot['time'] = ((snapshot["DateTime"] - int(snapshot["DateTime"].iloc[0]//1000000 * 1000000))*1000000).astype(np.int64)                
        print("csv is loaded")
        
        snapshot["StockID"] = snapshot["StockID"].astype(int)
        snapshot = snapshot[(snapshot["StockID"] >= 600000) & (snapshot["StockID"] < 700000)].reset_index(drop=True)
        snapshot = snapshot.rename(columns={"StockID": "ID"})
        snapshot["date"] = int(snapshot["DateTime"].iloc[0]//1000000)
        snapshot["clockAtArrival"] = snapshot["DateTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
        snapshot['ID'] = 1000000 + snapshot["ID"]
        snapshot = snapshot.rename(columns={"NumTrades": "cum_tradesCnt", "HighPx":"high", "LowPx":"low", "TotalBidQty":"totalBidQuantity",
                                   "TotalOfferQty":"totalAskQuantity", "WeightedAvgBidPx":"vwapBid", "WeightedAvgOfferPx":"vwapAsk",
                                   "TotalBidNumber":"totalBidOrders", "TotalOfferNumber":"totalAskOrders", "NumBidOrders":"totalBidLevels",
                                   "NumOfferOrders":"totalAskLevels", "WithdrawBuyNumber":"cum_canceledBuyOrders",
                                   "WithdrawBuyAmount":"cum_canceledBuyVolume", "WithdrawBuyMoney":"cum_canceledBuyAmount",
                                   "WithdrawSellNumber":"cum_canceledSellOrders", "WithdrawSellAmount":"cum_canceledSellVolume",
                                   "WithdrawSellMoney":"cum_canceledSellAmount", "BidTradeMaxDuration":"bidTradeMaxDuration",
                                    "OfferTradeMaxDuration":"askTradeMaxDuration"})
        
        try1 = pd.read_csv(data +'\\Auction.csv', nrows=1, header=None)
        if try1.shape[1] == 2:
            print("native auction data")
            aucData = pd.read_csv(data +'\\Auction.csv', header=None, names=colnames2).iloc[1:, :]
        else:
            aucData = pd.read_csv(data +'\\Auction.csv', header=None)
            assert(aucData.shape[1] == len(colnames2))
            print("native auction data")
            aucData.columns = colnames2
            
        aucData['bid1p'] = aucData['Price']
        aucData['ask1p'] = aucData['Price']
        aucData['bid1q'] = aucData['Qty']
        aucData['ask1q'] = aucData['Qty']
        aucData['bid2q'] = np.where(aucData['Side'] == 1, aucData['LeaveQty'], 0)
        aucData['ask2q'] = np.where(aucData['Side'] == 2, aucData['LeaveQty'], 0)
        aucData["SecurityID"] = aucData["SecurityID"].astype(int)
        aucData = aucData[(aucData["SecurityID"] >= 600000) & (aucData["SecurityID"] < 700000)].reset_index(drop=True)
        aucData["ID"] = aucData["SecurityID"] + 1000000
        aucData["DateTime"] = aucData["DateTime"].apply(lambda x: int(x))
        aucData["date"] = int(aucData["DateTime"].iloc[0]//1000000)
        m_ax = 92459 + int(aucData["DateTime"].iloc[0]//1000000 * 1000000)
        aucData["DateTime"] = aucData['DateTime'].clip(upper=m_ax)
        aucData['time'] = ((aucData["DateTime"] - int(aucData["DateTime"].iloc[0]//1000000 * 1000000))*1000000).astype(np.int64)
        aucData["clockAtArrival"] = aucData["DateTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
        aucData = aucData[['date', 'time', 'clockAtArrival', 'ID', 'bid1p', 'ask1p', 'bid2q', 'bid1q', 'ask1q', 'ask2q']].reset_index(drop=True)
        
        snapshot = snapshot[~((snapshot["bid1p"] == 0) & (snapshot["ask1p"] == 0))]
        test1 = aucData.groupby("ID")["time"].max().reset_index()
        test2 = snapshot.groupby("ID")["time"].min().reset_index()
        test = pd.merge(test1, test2, on="ID")
        assert(test[test["time_x"] >= test["time_y"]].shape[0]==0)
        
        snapshot = pd.concat([aucData, snapshot]).reset_index(drop=True)
        snapshot = snapshot.fillna(0)
        snapshot = snapshot[~((snapshot["bid1p"] == 0) & (snapshot["ask1p"] == 0))]
        
        snapshot["ordering"] = snapshot.groupby("ID").cumcount()
        snapshot["ordering"] = snapshot["ordering"] + 1
        
        for col in ["ID", "cum_tradesCnt", "cum_volume", 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q','ask7q','ask8q','ask9q',
                        'ask10q', 'bid1q','bid2q','bid3q','bid4q','bid5q','bid6q','bid7q','bid8q','bid9q','bid10q', 'bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q', "totalBidQuantity", "totalAskQuantity", "totalBidOrders",
        'totalAskOrders', 'totalBidLevels', 'totalAskLevels', 'cum_canceledBuyOrders', 'cum_canceledBuyVolume', 'cum_canceledSellOrders',
        'cum_canceledSellVolume', "ordering", 'bidTradeMaxDuration', 'askTradeMaxDuration']:
            snapshot[col] = snapshot[col].astype(np.int64)

        
        for cols in ["prevClose", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
             'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'vwapBid', "vwapAsk"]:
            snapshot[cols] = (snapshot[cols] * 10000).round(0).astype(np.int64)
        
        for cols in ["cum_amount", "cum_canceledBuyAmount", "cum_canceledSellAmount"]:
            snapshot[cols] = (snapshot[cols] * 10000).round(0).astype(np.int64)
        
        for cols in ['bid1n', 'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n', 'bid9n', 'bid10n', 'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n',
        'ask7n', 'ask8n', 'ask9n', 'ask10n']:
            snapshot[cols] = 0
        
        display(snapshot["date"].iloc[0])
        display(snapshot.groupby("ID")["time"].min().max())
        
        snapshot['datetime'] = snapshot["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
        
        assert(sum(snapshot[snapshot["open"] != 0].groupby("ID")["open"].nunique() != 1) == 0)
        assert(sum(snapshot[snapshot["prevClose"] != 0].groupby("ID")["prevClose"].nunique() != 1) == 0)
        snapshot["prevClose"] = np.where(snapshot["time"] >= 91500000000, snapshot.groupby("ID")["prevClose"].transform("max"), snapshot["prevClose"]) 
        snapshot["open"] = np.where(snapshot["cum_volume"] > 0, snapshot.groupby("ID")["open"].transform("max"), snapshot["open"])
        assert(sum(snapshot[snapshot["open"] != 0].groupby("ID")["open"].nunique() != 1) == 0)
        assert(sum(snapshot[snapshot["prevClose"] != 0].groupby("ID")["prevClose"].nunique() != 1) == 0)
        assert(snapshot[snapshot["cum_volume"] > 0]["open"].min() > 0)
        
        snapshot = snapshot[["date", "time", "clockAtArrival", "datetime", "ID", "ordering", "cum_tradesCnt", "cum_volume", "cum_amount", "prevClose",
                            "open", "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p','bid2p','bid1p',
                            'ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'bid10q','bid9q','bid8q',
                             'bid7q','bid6q','bid5q','bid4q','bid3q','bid2q','bid1q', 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q',
                             'ask7q','ask8q','ask9q','ask10q', 'bid10n', 'bid9n', 'bid8n', 'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 
                             'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n','ask7n', 'ask8n', 'ask9n', 'ask10n','bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q',"totalBidQuantity", "totalAskQuantity","vwapBid", "vwapAsk",
        "totalBidOrders",'totalAskOrders','totalBidLevels', 'totalAskLevels', 'bidTradeMaxDuration', 'askTradeMaxDuration', 'cum_canceledBuyOrders', 'cum_canceledBuyVolume',
        "cum_canceledBuyAmount", "cum_canceledSellOrders", 'cum_canceledSellVolume',"cum_canceledSellAmount"]]
        
        
        db = DB("mongodb://user_rw:faa96dfc@192.168.10.223")
        db.write('snapshot', snapshot)
        
        print(datetime.datetime.now() - startTm)

native lv2 data


  interactivity=interactivity, compiler=compiler, result=result)


csv is loaded
native auction data


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




20170920

92509000000

0:06:06.558090
native lv2 data
csv is loaded
native auction data


20170921

92510000000

0:06:29.051651
native lv2 data
csv is loaded
native auction data


20170922

92512000000

0:06:27.117824
native lv2 data
csv is loaded
native auction data


20170925

92512000000

0:06:20.417766
native lv2 data
csv is loaded
native auction data


20170926

92510000000

0:05:58.397815
native lv2 data
csv is loaded
native auction data


20170927

92509000000

0:05:54.894988
native lv2 data
csv is loaded
native auction data


20170928

92506000000

0:06:11.302169
native lv2 data
csv is loaded
native auction data


20170929

92508000000

0:06:00.747339
native lv2 data
csv is loaded
native auction data


20171009

92511000000

0:06:08.866627
native lv2 data
csv is loaded
native auction data


20171010

92511000000

0:06:32.946237
native lv2 data
csv is loaded
native auction data


20171011

92506000000

0:06:44.329796
native lv2 data
csv is loaded
native auction data


20171012

92506000000

0:06:30.981490
native lv2 data
csv is loaded
native auction data


20171013

92511000000

0:06:17.022817
native lv2 data
csv is loaded
native auction data


20171016

92511000000

0:06:49.697394
native lv2 data
csv is loaded
native auction data


20171017

92508000000

0:06:20.336594
native lv2 data
csv is loaded
native auction data


20171018

92511000000

0:06:18.363232
native lv2 data
csv is loaded
native auction data


20171019

92512000000

0:06:18.858906
native lv2 data
csv is loaded
native auction data


20171020

92514000000

0:06:06.951996
native lv2 data
csv is loaded
native auction data


20171023

92509000000

0:06:11.780834
native lv2 data
csv is loaded
native auction data


20171024

92510000000

0:06:04.281886
native lv2 data
csv is loaded
native auction data


20171025

92507000000

0:06:19.519142
native lv2 data
csv is loaded
native auction data


20171026

92513000000

0:06:47.857363
native lv2 data
csv is loaded
native auction data


20171027

92513000000

0:06:40.039268
native lv2 data
csv is loaded
native auction data


20171030

92514000000

0:06:46.273599
native lv2 data
csv is loaded
native auction data


20171031

92510000000

0:06:36.444882
native lv2 data
csv is loaded
native auction data


20171101

92506000000

0:06:40.563865
native lv2 data
csv is loaded
native auction data


20171102

92508000000

0:06:46.038365
native lv2 data
csv is loaded
native auction data


20171103

92507000000

0:06:29.988394
native lv2 data
csv is loaded
native auction data


20171106

92509000000

0:06:37.212828
native lv2 data
csv is loaded
native auction data


20171107

92512000000

0:06:40.140997
native lv2 data
csv is loaded
native auction data


20171108

92506000000

0:06:58.944742
native lv2 data
csv is loaded
native auction data


20171109

92511000000

0:06:22.225876
native lv2 data
csv is loaded
native auction data


20171110

92506000000

0:06:52.117971
native lv2 data
csv is loaded
native auction data


20171113

92515000000

0:06:50.335736
native lv2 data
csv is loaded
native auction data


20171114

92507000000

0:07:01.838976
native lv2 data
csv is loaded
native auction data


20171115

92507000000

0:06:33.042008
native lv2 data
csv is loaded
native auction data


20171116

92511000000

0:06:37.836133
native lv2 data
csv is loaded
native auction data


20171117

92507000000

0:07:15.662012
native lv2 data
csv is loaded
native auction data


20171120

92510000000

0:06:44.777599
native lv2 data
csv is loaded
native auction data


20171121

92514000000

0:06:32.606146
native lv2 data
csv is loaded
native auction data


20171122

92510000000

0:06:34.609788
native lv2 data
csv is loaded
native auction data


20171123

92507000000

0:06:44.611045
native lv2 data
csv is loaded
native auction data


20171124

92512000000

0:06:08.516562
native lv2 data
csv is loaded
native auction data


20171127

92508000000

0:05:58.262983
native lv2 data
csv is loaded
native auction data


20171128

92510000000

0:06:08.143560
native lv2 data
csv is loaded
native auction data


20171129

92511000000

0:06:18.516822
native lv2 data
csv is loaded
native auction data


20171130

92513000000

0:06:16.152145
native lv2 data
csv is loaded
native auction data


20171201

92515000000

0:06:15.235596
native lv2 data
csv is loaded
native auction data


20171204

92511000000

0:06:18.947670
native lv2 data
csv is loaded
native auction data


20171205

92508000000

0:06:50.977021
native lv2 data
csv is loaded
native auction data


20171206

92507000000

0:06:20.908426
native lv2 data
csv is loaded
native auction data


20171207

92508000000

0:06:02.654268
native lv2 data
csv is loaded
native auction data


20171208

92512000000

0:06:24.335235
native lv2 data
csv is loaded
native auction data


20171211

92512000000

0:06:04.798506
native lv2 data
csv is loaded
native auction data


20171212

92509000000

0:06:14.929415
native lv2 data
csv is loaded
native auction data


20171213

92510000000

0:05:55.133350
native lv2 data
csv is loaded
native auction data


20171214

92508000000

0:05:53.639346
native lv2 data
csv is loaded
native auction data


20171215

92507000000

0:06:15.236593
native lv2 data
csv is loaded
native auction data


20171218

92513000000

0:05:54.775309
native lv2 data
csv is loaded
native auction data


20171219

92510000000

0:05:59.527600
native lv2 data
csv is loaded
native auction data


20171220

92512000000

0:06:09.292488
native lv2 data
csv is loaded
native auction data


20171221

92511000000

0:06:26.154399
native lv2 data
csv is loaded
native auction data


20171222

92515000000

0:06:05.769908
native lv2 data
csv is loaded
native auction data


20171225

92511000000

0:06:07.633923
native lv2 data
csv is loaded
native auction data


20171226

92512000000

0:06:29.190281
native lv2 data
csv is loaded
native auction data


20171227

92507000000

0:06:30.245458
native lv2 data
csv is loaded
native auction data


20171228

92512000000

0:06:24.408069
native lv2 data
csv is loaded
native auction data


20171229

92511000000

0:06:20.205307


In [15]:
# need date, ID, clockAtArrival three columns when upload
db = DB("mongodb://user_rw:faa96dfc@192.168.10.223")
db.write('snapshot', snapshot)

#### download data

In [10]:
startDate = 20170101
endDate = 20181231
targetStockLs = [1600000]

db = DB("mongodb://user_rw:faa96dfc@192.168.10.223")

# pick all stocks from certain period
mdData = db.read('order', start_date=startDate, end_date=endDate)

# # pick certain stocks from whole period
# mdData = db.read('trade', symbol=targetStockLs)

# # pick certain stocks from certain period
# mdData = db.read('snapshot', start_date=startDate, end_date=endDate, symbol=targetStockLs)

In [11]:
mdData

In [17]:
pd.set_option("max_rows", 200)
mdData.dtypes

skey                                 int32
date                                 int32
time                                 int64
clockAtArrival                       int64
datetime                    datetime64[ns]
ordering                             int32
has_missing                          int32
cum_trades_cnt                       int32
cum_volume                           int64
cum_amount                         float64
prev_close                         float64
open                               float64
high                               float64
low                                float64
close                              float64
bid10p                             float64
bid9p                              float64
bid8p                              float64
bid7p                              float64
bid6p                              float64
bid5p                              float64
bid4p                              float64
bid3p                              float64
bid2p      

In [9]:
startDate = 20200103
endDate = 20200113
targetStockLs = [2000001]
mdData = db.read('snapshot', start_date=startDate, end_date=endDate, symbol=targetStockLs)
l = mdData["date"].unique()
print(l)

[20200103 20200106 20200107 20200108 20200109 20200110 20200113]


In [13]:
startDate = 20200113
endDate = 20200113
db = DB("mongodb://user_rw:faa96dfc@192.168.10.223")
mdData = db.read('snapshot', start_date=startDate, end_date=endDate)
for cols in ['bid10n', 'bid9n', 'bid8n', 'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 
                             'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n','ask7n', 'ask8n', 'ask9n', 'ask10n','bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q']:
    mdData[cols] = mdData[cols].astype('int32')
pd.set_option("max_rows", 200)
display(mdData.dtypes)
db = DB("mongodb://user_rw:faa96dfc@192.168.10.223")
db.write('snapshot', mdData)
del mdData

ID                                   int32
date                                 int32
time                                 int64
clockAtArrival                       int64
datetime                    datetime64[ns]
ordering                             int32
has_missing                          int32
cum_trades_cnt                       int32
cum_volume                           int64
cum_amount                         float64
prev_close                         float64
open                               float64
high                               float64
low                                float64
close                              float64
bid10p                             float64
bid9p                              float64
bid8p                              float64
bid7p                              float64
bid6p                              float64
bid5p                              float64
bid4p                              float64
bid3p                              float64
bid2p      

#### delete data

In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz

class DB(object):
    def __init__(self, uri, symbol_column='skey'):
        self.db_name = 'white_db'
        user, passwd, host = self.parse_uri(uri)
        auth_db = 'admin' if user in ('admin', 'root') else self.db_name
        self.uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)

        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        if version == 1:
            return gzip.compress(pickle.dumps(s), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')

In [5]:
startDate = 20170101
endDate = 20181231
targetStockLs = [1600004, 1600006]

db = DB("mongodb://user_rw:faa96dfc@192.168.10.223")

# delete all stocks from certain period
db.delete('order', start_date=startDate, end_date=endDate)

# delete certain stocks from whole period
# db.delete('snapshot', symbol=targetStockLs)

# # delete certain stocks from certain period
# db.delete('snapshot', start_date=startDate, end_date=endDate, symbol=targetStockLs)

In [14]:
startDate = 20200101
endDate = 20201231
targetStockLs = [1600004, 1600006]

db = DB("mongodb://user_rw:faa96dfc@192.168.10.223")

# delete all stocks from certain period
db.delete('order', start_date=startDate, end_date=endDate)

# delete certain stocks from whole period
# db.delete('snapshot', symbol=targetStockLs)

# # delete certain stocks from certain period
# db.delete('snapshot', start_date=startDate, end_date=endDate, symbol=targetStockLs)