In [1]:
import os
import glob
import pymongo
import numpy as np
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz

class DB(object):
    """Store and retrieve pandas DataFrames in the MongoDB database ``white_db``.

    Each (date, symbol) slice of a frame is cut into chunks of ``chunk_size``
    rows; every chunk is pickled, compressed and inserted as one document of
    the form ``{'ver', 'data', 'date', 'symbol', 'start'}``.

    Parameters
    ----------
    uri : str
        ``mongodb://user:password@host[:port]``. User and password must
        already be percent-encoded, as the MongoDB URI format requires.
    symbol_column : str
        DataFrame column used to split a frame per symbol when writing.
    clock_column : str
        DataFrame column holding microsecond epoch timestamps; when present
        it determines the trading date (Asia/Shanghai) of a written frame.
    """

    def __init__(self, uri, symbol_column='ID', clock_column='clockAtArrival'):
        self.db_name = 'white_db'
        user, passwd, host = self.parse_uri(uri)
        # Administrative accounts authenticate against `admin`, everybody
        # else against the data database itself.
        auth_db = 'admin' if user in ('admin', 'root') else self.db_name
        self.uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)

        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.clock_column = clock_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split ``mongodb://user:password@host[:port]`` into ``[user, password, host]``.

        The host part keeps an optional ``:port`` suffix. Raises ``ValueError``
        when the credentials are missing or malformed.
        """
        body = uri.strip()
        if body.startswith('mongodb://'):
            body = body[len('mongodb://'):]
        body = body.strip('/')

        credentials, at_sign, host = body.partition('@')
        user, colon, passwd = credentials.partition(':')
        if not at_sign or not colon or not user or not host:
            raise ValueError('uri must look like mongodb://user:password@host[:port]')
        return [user, passwd, host]

    def drop_table(self, table_name):
        """Drop the whole collection ``table_name``."""
        self.db.drop_collection(table_name)

    def write(self, table_name, df):
        """Replace the stored data of every symbol in ``df`` for the frame's date.

        The date is taken from the first row only (clock column if present,
        otherwise the ``date`` column), so ``df`` is expected to cover a single
        day. Raises ``Exception`` when neither column exists.
        """
        if len(df) == 0:
            return

        if self.clock_column in df.columns:
            first_clock = df[self.clock_column].iloc[0]
            date = datetime.datetime.fromtimestamp(
                first_clock / 1e6, pytz.timezone('Asia/Shanghai')).strftime('%Y%m%d')
        elif self.date_column in df.columns:
            date = str(df[self.date_column].iloc[0])
        else:
            raise Exception('DataFrame should contain either one of columns: `%s`, `%s`' % (self.clock_column, self.date_column))

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        # Group by the bare column name: grouping by a one-element list makes
        # pandas >= 2.0 yield 1-tuples as keys, which would be stored as arrays.
        for symbol, sub_df in df.groupby(self.symbol_column):
            if isinstance(symbol, np.generic):
                # BSON cannot encode numpy scalars; store the plain Python value.
                symbol = symbol.item()
            collection.delete_many({'date': date, 'symbol': symbol})
            self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` as consecutive chunks of at most ``chunk_size`` rows."""
        version = 1
        for start in range(0, len(df), self.chunk_size):
            # iloc keeps the slice positional whatever the index type is.
            df_seg = df.iloc[start:start + self.chunk_size]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter for a date range and/or symbol selection.

        ``start_date`` / ``end_date`` accept ``YYYYMMDD`` strings, integers or
        ``datetime.date`` / ``datetime.datetime`` objects; either bound may be
        omitted for an open-ended range. ``symbol`` is a single value or a
        list/tuple/array of values, each converted with ``int``. Returns ``{}``
        when no criterion is given.
        """
        query = {}

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise ValueError("`date` must be YYYYMMDD format")
                return x
            if isinstance(x, datetime.date):  # also covers datetime.datetime
                return x.strftime("%Y%m%d")
            if isinstance(x, (int, np.integer)) and not isinstance(x, bool):
                return parse_date(str(int(x)))
            raise TypeError("invalid `date` type: " + str(type(x)))

        # A single bound must still restrict the query; dropping it would let
        # delete() wipe every date of the selected symbols.
        date_range = {}
        if start_date:
            date_range['$gte'] = parse_date(start_date)
        if end_date:
            date_range['$lte'] = parse_date(end_date)
        if date_range:
            query['date'] = date_range

        if isinstance(symbol, (list, tuple, np.ndarray)):
            if len(symbol) > 0:
                query['symbol'] = {'$in': [int(x) for x in symbol]}
        elif symbol is not None and symbol != '':
            # `is not None` rather than truthiness so that symbol 0 is a valid filter.
            query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the chunks matching the criteria; refuses to run without any criterion."""
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the matching chunks concatenated in (symbol, date, start) order.

        Returns ``None`` when no criterion is given or nothing matches.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        if not segs:
            return None
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs])

    def list_tables(self):
        """Return the names of all collections in the database."""
        # `collection_names()` was removed in PyMongo 4.
        return self.db.list_collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct dates stored for the given criteria."""
        collection = self.db[table_name]
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        query = self.build_query(start_date, end_date, symbol)
        dates = {x['date'] for x in collection.find(query, {"date": 1, '_id': 0})}
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` and compress it (version 1: gzip, version 2: lzma)."""
        if version == 1:
            return gzip.compress(pickle.dumps(s), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of :meth:`ser`.

        NOTE(review): this unpickles bytes read from the database, which
        executes arbitrary code if the stored data is not trusted.
        """
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')

#### upload data 

#### 1. upload SZ 2018 data

In [7]:
import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime

# Column layouts of the header-less, tab-separated SZ level-2 text files.
# The nine leading fields are shared by the quote rows (columns1 / columns5)
# and the order-book rows (columns2).
_szLeadingFields = ["Date", "OrigTime", "SendTime", "ercvtime", "dbtime", "ChannelNo",
                    "SecurityID", "SecurityIDsource", "MDStreamID"]
_levelOneQueue = ["bid1n", "NOORDERS_B1", "ORDERQTY_B1", "ask1n", "NOORDERS_S1", "ORDERQTY_S1"]

# Quote file, 32-column variant.
columns5 = _szLeadingFields + [
    "PreClosePx", "PxChnage1", "PXChange2", "openPrice", "HighPx", "LowPx", "close",
    "NumTrades", "cum_volume", "cum_amount", "PE1", "PE2", "TradingPhase",
    "totalofferqty", "vwapAsk", "totalbidqty", "vwapBid",
    "PreNAV", "RealTimeNAV", "WarrantPremiumRate", "UpLimitPx", "DownLimitPx",
    "TotalLongPosition",
]

# Quote file, 35-column variant: the same fields followed by three unnamed ones.
columns1 = columns5 + ["unknown1", "unknown2", "unknown3"]

# Order-book file: per level ask price, bid price, ask quantity, bid quantity
# for levels 1..10, then the level-1 order queue fields.
columns2 = (
    _szLeadingFields
    + [side + str(level) + kind
       for level in range(1, 11)
       for kind in ("p", "q")
       for side in ("ask", "bid")]
    + _levelOneQueue
)

# columns3 / columns4 are not referenced in the cells visible here; they look
# like an alternative quote / order-book layout — TODO confirm where they are used.
columns3 = [
    "Date", "SENDTIME", "SecurityID", "DATATIMESTAMP",
    "PreClosePx", "openPrice", "HighPx", "LowPx", "close",
    "NumTrades", "cum_volume", "cum_amount",
    "TOTALLONGPOSITION", "PERATIO1", "PERATIO2", "ENDOFDAYMAKER",
    "totalofferqty", "vwapAsk", "totalbidqty", "vwapBid",
]
columns4 = (
    ["Date", "SENDTIME", "SecurityID", "DATATIMESTAMP"]
    + ["ask%dp" % level for level in range(10, 0, -1)]
    + ["bid%dp" % level for level in range(1, 11)]
    + ["ask%dq" % level for level in range(10, 0, -1)]
    + ["bid%dq" % level for level in range(1, 11)]
    + ["bid1n", "NOORDERS _B1", "ORDERQTY_B1", "ask1n", "NOORDERS _S1", "ORDERQTY_S1"]
)


# ---------------------------------------------------------------------------
# Upload SZ level-2 snapshots for one year.
#
# For every day folder under `readPath` the raw .7z archives are extracted to
# `extractRoot\<day>`, the quote ("hq") and order-book ("level") text files
# are merged tick by tick, converted to the common snapshot schema and written
# to the `snapshot` collection through `DB`. Days that cannot be processed are
# collected in `bad`.
# ---------------------------------------------------------------------------
year = "2018"
df = []
bad = []
readPath = 'J:\\LEVEL2_shenzhen\\' + year + '\\***'
extractRoot = 'F:\\SZ\\' + year

# NOTE(review): the fallback keeps the credential that used to be hard-coded
# here so the cell still runs unchanged; set WHITE_DB_URI instead, then remove
# the literal and rotate the password.
dbUri = os.environ.get("WHITE_DB_URI", "mongodb://user_rw:faa96dfc@192.168.10.223")

# Columns identifying one tick in both the quote and the order-book file.
mergeKeys = ['Date', "OrigTime", "ChannelNo", "SecurityID", "SecurityIDsource", "MDStreamID", "time"]

# Files expected for the two archive layouts handled below.
halfDayLayout = {
    "archives": ['am_snap_level_spot.7z', 'pm_snap_level_spot.7z'],
    "quotes": ['am_hq_snap_spot.txt', 'pm_hq_snap_spot.txt'],
    "levels": ['am_snap_level_spot.txt', 'pm_snap_level_spot.txt'],
    "encoding": None,
}
fullDayLayout = {
    "archives": ['snap_level.7z'],
    "quotes": ['hq_snap.txt'],
    # NOTE(review): the original cell never loaded the order-book file for
    # this layout (it used `snapshot2` without defining it). The file name is
    # assumed by analogy with the am/pm layout — TODO confirm.
    "levels": ['snap_level.txt'],
    "encoding": "UTF-8-sig",
}

bidLevels = range(10, 0, -1)
askLevels = range(1, 11)
unavailableCols = ["totalBidOrders", 'totalAskOrders', 'totalBidLevels', 'totalAskLevels',
                   'bidTradeMaxDuration', 'askTradeMaxDuration',
                   'cum_canceledBuyOrders', 'cum_canceledBuyVolume', "cum_canceledBuyAmount",
                   "cum_canceledSellOrders", 'cum_canceledSellVolume', "cum_canceledSellAmount"]
# Fields of the target schema that this feed does not provide; stored as 0.
zeroFillCols = (['bid%dn' % i for i in range(10, 1, -1)]
                + ['ask%dn' % i for i in range(2, 11)]
                + unavailableCols)
intCols = (["cum_tradesCnt", "cum_volume"]
           + ['ask%dq' % i for i in askLevels]
           + ['bid%dq' % i for i in askLevels]
           + ["totalBidQuantity", "totalAskQuantity", "bid1n", "ask1n"])
priceCols = (["prevClose", 'open', "high", "low", "close"]
             + ['bid%dp' % i for i in bidLevels]
             + ['ask%dp' % i for i in askLevels]
             + ['vwapBid', "vwapAsk", "cum_amount"])
# Final column order of the uploaded frame.
uploadCols = (["date", "time", "clockAtArrival", "datetime", "ID", "ordering",
               "cum_tradesCnt", "cum_volume", "cum_amount", "prevClose", "open", "high", "low", "close"]
              + ['bid%dp' % i for i in bidLevels] + ['ask%dp' % i for i in askLevels]
              + ['bid%dq' % i for i in bidLevels] + ['ask%dq' % i for i in askLevels]
              + ['bid%dn' % i for i in bidLevels] + ['ask%dn' % i for i in askLevels]
              + ['bid1Top%dq' % i for i in range(1, 51)]
              + ['ask1Top%dq' % i for i in range(1, 51)]
              + ["totalBidQuantity", "totalAskQuantity", "vwapBid", "vwapAsk"]
              + unavailableCols)


def _join_volumes(archivePath):
    """Concatenate `<archive>.001`, `.002`, ... into `<archive>` (binary copy)."""
    parts = sorted(p for p in glob.glob(glob.escape(archivePath) + '.*')
                   if p.rsplit('.', 1)[-1].isdigit())
    if not parts:
        return
    with open(archivePath, 'wb') as joined:
        for part in parts:
            with open(part, 'rb') as volume:
                for chunk in iter(lambda: volume.read(1 << 20), b''):
                    joined.write(chunk)


def _extract_archive(archivePath, destDir, failures):
    """Extract one .7z archive; on any failure record it in `failures` and return False."""
    try:
        with py7zr.SevenZipFile(archivePath, 'r', filters=None) as archive:
            archive.extractall(path=destDir)
    except Exception as exc:
        print("Bad unzip here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print(archivePath)
        print(repr(exc))
        failures.append(archivePath)
        return False
    return True


def _read_raw_table(filePath, layouts, encoding=None):
    """Read a header-less tab-separated file and name its columns.

    `layouts` lists the accepted column-name lists; the one whose length
    matches the file is used. Raises ValueError when none matches.
    """
    frame = pd.read_table(filePath, header=None, encoding=encoding)
    for names in layouts:
        if frame.shape[1] == len(names):
            frame.columns = names
            return frame
    raise ValueError("%s has %d columns, expected one of %s"
                     % (filePath, frame.shape[1], [len(names) for names in layouts]))


def _prepare_side(frame):
    """Keep the wanted securities, order the ticks and add the intraday `time` column."""
    frame = frame.assign(SecurityID=frame["SecurityID"].astype(np.int64))
    wanted = (frame["SecurityID"] < 4000) | (frame["SecurityID"] > 300000)
    frame = frame[wanted].sort_values(by=["SecurityID", "OrigTime"])
    # OrigTime is YYYYMMDDHHMMSSmmm; dropping the date part leaves HHMMSSmmm.
    dayBase = int(frame['OrigTime'].iloc[0] // 1000000000 * 1000000000)
    return frame.assign(time=(frame['OrigTime'] - dayBase).astype(np.int64))


def _report_unmatched_ticks(merged, missingCol):
    """Print when the ticks present on only one side of the merge occurred."""
    times = merged.loc[merged[missingCol].isnull(), "time"].unique()
    if (times > 150000000).all():
        print("More ticks happens after 15:00")
    elif (times < 93000000).all():
        print("More ticks happens before 9:30")
    else:
        print("There are ticks happens before 15:00, after 9:30")


def _merge_quotes_and_levels(quotes, levels):
    """Join quote and order-book ticks; fall back to the common ticks when they differ."""
    merged = pd.merge(quotes, levels, on=mergeKeys, how="outer")
    nMerged, nQuotes, nLevels = merged.shape[0], quotes.shape[0], levels.shape[0]
    if nMerged == nQuotes and nMerged == nLevels:
        return merged

    # `dbtime` exists on both sides, so it is suffixed _x (quotes) / _y (levels)
    # and is null for ticks missing on that side.
    if nMerged == nQuotes:
        print("snapshot1 have more ticks than snapshot2")
        _report_unmatched_ticks(merged, "dbtime_y")
    elif nMerged == nLevels:
        print("snapshot2 have more ticks than snapshot1")
        _report_unmatched_ticks(merged, "dbtime_x")
    else:
        print("snapshot2 don't join with snapshot1")
    return pd.merge(quotes, levels, on=mergeKeys)


def _parse_order_queue(value):
    """Turn a '|'-separated quantity string into a list of ints; keep nulls."""
    if pd.isnull(value):
        return value
    return [int(q) for q in value.split('|')]


def _to_upload_frame(snapshot):
    """Convert the merged raw ticks to the snapshot schema expected by the database."""
    snapshot = snapshot.rename(columns={"Date": "date", "NumTrades": "cum_tradesCnt", "HighPx": "high", "LowPx": "low",
                                        "totalofferqty": "totalAskQuantity", "totalbidqty": "totalBidQuantity",
                                        "PreClosePx": "prevClose", "openPrice": "open"})
    # NOTE(review): strptime(...).timestamp() interprets OrigTime in the local
    # timezone of this machine — presumably Asia/Shanghai; confirm.
    snapshot["clockAtArrival"] = snapshot["OrigTime"].astype(str).apply(
        lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    snapshot["ID"] = snapshot["StockID"] + 2000000
    snapshot["ordering"] = snapshot.groupby("ID").cumcount() + 1
    snapshot["time"] = snapshot["time"].astype('int64') * 1000

    # First 50 order quantities queued at the best bid / ask, one column each.
    topCols = {}
    for side, countCol, queueCol in (("bid", "bid1n", "ORDERQTY_B1"), ("ask", "ask1n", "ORDERQTY_S1")):
        queues = snapshot[queueCol].map(_parse_order_queue)
        for i in range(1, 51):
            column = pd.Series(0, index=snapshot.index)
            hasOrder = i <= snapshot[countCol]
            if hasOrder.any():
                column.loc[hasOrder] = queues.loc[hasOrder].apply(lambda x: x[i-1])
            topCols[side + "1Top" + str(i) + 'q'] = column
    snapshot = pd.concat([snapshot, pd.DataFrame(topCols)], axis=1)

    snapshot = snapshot.astype({col: 'int64' for col in intCols})
    # Prices and amounts are stored as integers in units of 1/10000.
    snapshot[priceCols] = (snapshot[priceCols] * 10000).round(0).astype('int64')
    snapshot['datetime'] = snapshot["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    snapshot = snapshot.assign(**{col: 0 for col in zeroFillCols})

    # Use one prevClose / open per security once they are known
    # (from 09:15:00 on, respectively once volume has traded).
    snapshot["prevClose"] = np.where(snapshot["time"] >= 91500000000,
                                     snapshot.groupby("ID")["prevClose"].transform("max"), snapshot["prevClose"])
    snapshot["open"] = np.where(snapshot["cum_volume"] > 0,
                                snapshot.groupby("ID")["open"].transform("max"), snapshot["open"])
    assert(sum(snapshot[snapshot["open"] != 0].groupby("ID")["open"].nunique() != 1) == 0)
    assert(sum(snapshot[snapshot["prevClose"] != 0].groupby("ID")["prevClose"].nunique() != 1) == 0)
    assert(snapshot[snapshot["time"] >= 91500000000]["prevClose"].min() > 0)
    assert(snapshot[snapshot["cum_volume"] > 0]["open"].min() > 0)

    return snapshot[uploadCols]


dataPathLs = np.array(glob.glob(readPath))
db = None
for data in dataPathLs[-1:]:
    if len(glob.glob(data + '\\***')) == 0:
        continue

    startTm = datetime.datetime.now()
    date = os.path.basename(data)
    path1 = extractRoot + '\\' + date

    # Pick the archive layout of this day folder.
    if os.path.isfile(data + '\\am_snap_level_spot.7z'):
        layout, isSplit, label = halfDayLayout, False, "am & pm data 未分卷"
    elif os.path.isfile(data + '\\am_snap_level_spot.7z.001'):
        layout, isSplit, label = halfDayLayout, True, "am & pm data 分卷"
    elif os.path.isfile(data + '\\snap_level.7z'):
        layout, isSplit, label = fullDayLayout, False, "深交所数据"
    else:
        # Without this guard the previous day's `snapshot` would be uploaded again.
        print("Unknown archive layout, skipped: " + data)
        bad.append(data)
        continue

    extracted = True
    for archiveName in layout["archives"]:
        archivePath = data + '\\' + archiveName
        if isSplit:
            _join_volumes(archivePath)
        if not _extract_archive(archivePath, path1, bad):
            extracted = False
            break
    if not extracted:
        continue

    missing = [name for name in layout["quotes"] + layout["levels"]
               if not os.path.isfile(path1 + '\\' + name)]
    if missing:
        print("Less data!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print(path1, missing)
        bad.extend(data + '\\' + name for name in missing)
        continue

    snapshot1 = _prepare_side(pd.concat(
        [_read_raw_table(path1 + '\\' + name, (columns1, columns5), layout["encoding"]) for name in layout["quotes"]]))
    snapshot2 = _prepare_side(pd.concat(
        [_read_raw_table(path1 + '\\' + name, (columns2,), layout["encoding"]) for name in layout["levels"]]))
    snapshot = _merge_quotes_and_levels(snapshot1, snapshot2)
    del snapshot1
    del snapshot2

    snapshot = snapshot.rename(columns={"SecurityID": "StockID"})
    print(label)

    snapshot = _to_upload_frame(snapshot)

    print(snapshot["date"].iloc[0])
    print(snapshot.groupby("ID")["time"].min().max())

    if db is None:
        # One connection is enough for the whole run.
        db = DB(dbUri)
    db.write('snapshot', snapshot)
    print(datetime.datetime.now() - startTm)

print(bad)

  interactivity=interactivity, compiler=compiler, result=result)


am & pm data 未分卷
20181228
90003000000
0:07:39.709732
[]


#### 2. upload SZ 2019 data

In [9]:
import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime

year = "2019"
df = []
bad = []
readPath = 'E:\\SZ\\' + year + '\\***'
# Day folders left out of the run. NOTE(review): presumably duplicates of
# another folder for the same day — confirm why they are excluded.
excludedDirs = {
    'E:\\SZ\\2019\\0109(深交所数据)',
    'E:\\SZ\\2019\\0314',
    'E:\\SZ\\2019\\0315(深交所数据)',
}
# Filtering by membership does not fail when an excluded folder is absent
# (np.argwhere(...)[0] raised IndexError), and sorting makes the positional
# slice below independent of the order in which the OS lists the folders.
dataPathLs = np.array([p for p in sorted(glob.glob(readPath)) if p not in excludedDirs])
dataPathLs[46:47]

array(['E:\\SZ\\2019\\0314(深交所数据)'], dtype='<U22')

In [11]:
import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime

# Upload SZ 2019 snapshots: for every selected day folder, load Snapshot.pkl,
# normalise it to the common snapshot schema and write it to the 'snapshot'
# collection through DB.write.
year = "2019"
df = []
bad = []
readPath = 'E:\\SZ\\' + year + '\\***'

# Day folders that are deliberately left out of the upload.
# NOTE(review): the reason for excluding them is not recorded in this notebook -- confirm.
excludedPaths = {
    'E:\\SZ\\2019\\0109(深交所数据)',
    'E:\\SZ\\2019\\0314',
    'E:\\SZ\\2019\\0315(深交所数据)',
}
# Membership filtering does not raise IndexError when an excluded folder is absent.
dataPathLs = np.array([p for p in glob.glob(readPath) if p not in excludedPaths])

# Column-name groups of the output schema (bid side listed 10 -> 1, ask side 1 -> 10).
bidAskPriceCols = ['bid%dp' % lv for lv in range(10, 0, -1)] + ['ask%dp' % lv for lv in range(1, 11)]
bidAskQtyCols = ['bid%dq' % lv for lv in range(10, 0, -1)] + ['ask%dq' % lv for lv in range(1, 11)]
bidAskCntCols = ['bid%dn' % lv for lv in range(10, 0, -1)] + ['ask%dn' % lv for lv in range(1, 11)]
top50QtyCols = ['bid1Top%dq' % k for k in range(1, 51)] + ['ask1Top%dq' % k for k in range(1, 51)]
# Fields that this source does not provide; they are written as constant 0.
zeroFilledCols = ["totalBidOrders", 'totalAskOrders', 'totalBidLevels', 'totalAskLevels',
                  'bidTradeMaxDuration', 'askTradeMaxDuration', 'cum_canceledBuyOrders',
                  'cum_canceledBuyVolume', "cum_canceledBuyAmount", "cum_canceledSellOrders",
                  'cum_canceledSellVolume', "cum_canceledSellAmount"]
outputCols = (["date", "time", "clockAtArrival", "datetime", "ID", "ordering", "cum_tradesCnt",
               "cum_volume", "cum_amount", "prevClose", "open", "high", "low", "close"]
              + bidAskPriceCols + bidAskQtyCols + bidAskCntCols + top50QtyCols
              + ["totalBidQuantity", "totalAskQuantity", "vwapBid", "vwapAsk"]
              + zeroFilledCols)

# One client for the whole run instead of a new, never-closed one per day.
# NOTE(review): credentials are hard-coded in the notebook; move them to an
# environment variable or secret store.
db = DB("mongodb://user_rw:faa96dfc@192.168.10.223")

for data in dataPathLs[46:47]:

    # A day folder is only processed when it holds exactly four .txt files.
    if len(glob.glob(data + '\\***.txt')) != 4:
        print("Less data!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print(data + '\\Snapshot.pkl')
        bad.append(data + '\\Snapshot.pkl')
        continue

    print('---------------------------------------------------------------------------------------------')

    # 20190610 is uploaded by a dedicated cell from a different source.
    if os.path.basename(data) == "0610":
        print("skip 20190610")
        continue

    startTm = datetime.datetime.now()

    # `with` guarantees the file handle is closed (the handle used to leak).
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(data + "\\Snapshot.pkl", 'rb') as F1:
        snapshot = pickle.load(F1)
    # Strip the date part (everything above the lowest 9 digits) from OrigTime,
    # using the first row's date as the base.
    snapshot['time'] = (snapshot['OrigTime'] - int(snapshot["OrigTime"].iloc[0]//1000000000 * 1000000000)).astype(int)
    # Keep only stock codes below 4000 or above 300000; .copy() makes the
    # filtered frame independent so the assignments below do not trigger
    # SettingWithCopyWarning.
    snapshot = snapshot[(snapshot["StockID"] < 4000) | (snapshot["StockID"] > 300000)].copy()

    # Epoch microseconds derived from OrigTime, interpreted in the machine's local timezone.
    # NOTE(review): np.int64(...) truncates the float product; confirm that an
    # occasional off-by-one microsecond is acceptable.
    snapshot["clockAtArrival"] = snapshot["OrigTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    snapshot = snapshot.rename(columns={"Date":"date", "NumTrades":"cum_tradesCnt", "HighPx":"high", "LowPx":"low", "totalofferqty":
                                   "totalAskQuantity", "totalbidqty":"totalBidQuantity", "NUMORDERS_B1":"bid1n", "NUMORDERS_S1":"ask1n",
                                    "wa_offerPrice": "vwapAsk", "wa_bidPrice":"vwapBid", "PreClosePx":"prevClose", "openPrice":"open"})
    snapshot["ID"] = snapshot["StockID"] + 2000000
    snapshot["time"] = snapshot["time"].astype(np.int64) * 1000

    # Turn the '|'-separated order-quantity strings into lists of ints.
    for qtyListCol in ("ORDERQTY_B1", "ORDERQTY_S1"):
        hasQtyList = snapshot[qtyListCol].notnull()
        snapshot.loc[hasQtyList, qtyListCol] = snapshot.loc[hasQtyList, qtyListCol].apply(lambda x: [int(i) for i in x.split('|')])

    # <side>1Top<k>q = k-th entry of the list for rows whose <side>1n is at
    # least k; all other rows keep 0.
    for i in range(1, 51):
        for side, qtyListCol in (("bid", "ORDERQTY_B1"), ("ask", "ORDERQTY_S1")):
            topCol = side + "1Top" + str(i) + 'q'
            hasKthOrder = i <= snapshot[side + "1n"]
            snapshot[topCol] = 0
            snapshot.loc[hasKthOrder, topCol] = snapshot.loc[hasKthOrder, qtyListCol].apply(lambda x: x[i-1])

    snapshot = snapshot.fillna(0)
    # Drop rows where both best bid and best ask prices are zero.
    snapshot = snapshot[~((snapshot["bid1p"] == 0) & (snapshot["ask1p"] == 0))].copy()
    # 1-based running row number per ID.
    snapshot["ordering"] = snapshot.groupby("ID").cumcount() + 1

    for col in ["cum_tradesCnt", "cum_volume", "totalBidQuantity", "totalAskQuantity", "bid1n", "ask1n"] + bidAskQtyCols:
        snapshot[col] = snapshot[col].astype('int64')

    # Prices and turnover are stored as integers scaled by 10000.
    for col in ["prevClose", 'open', "high", "low", "close", 'vwapBid', "vwapAsk", "cum_amount"] + bidAskPriceCols:
        snapshot[col] = (snapshot[col] * 10000).round(0).astype('int64')

    snapshot['datetime'] = snapshot["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))

    # Per-level order counts other than level 1 are zero-filled here, together
    # with the fields this source does not provide.
    for col in [c for c in bidAskCntCols if c not in ("bid1n", "ask1n")] + zeroFilledCols:
        snapshot[col] = 0

    # Every ID must carry a single non-zero open / prevClose value; that value
    # is then propagated to rows with time >= 91500000000 (prevClose) and to
    # rows with cum_volume > 0 (open).
    assert(sum(snapshot[snapshot["open"] != 0].groupby("ID")["open"].nunique() != 1) == 0)
    assert(sum(snapshot[snapshot["prevClose"] != 0].groupby("ID")["prevClose"].nunique() != 1) == 0)
    snapshot["prevClose"] = np.where(snapshot["time"] >= 91500000000, snapshot.groupby("ID")["prevClose"].transform("max"), snapshot["prevClose"])
    snapshot["open"] = np.where(snapshot["cum_volume"] > 0, snapshot.groupby("ID")["open"].transform("max"), snapshot["open"])
    assert(sum(snapshot[snapshot["open"] != 0].groupby("ID")["open"].nunique() != 1) == 0)
    assert(sum(snapshot[snapshot["prevClose"] != 0].groupby("ID")["prevClose"].nunique() != 1) == 0)
    assert(snapshot[snapshot["time"] >= 91500000000]["prevClose"].min() > 0)
    assert(snapshot[snapshot["cum_volume"] > 0]["open"].min() > 0)

    snapshot = snapshot[outputCols]

    print(snapshot["date"].iloc[0])
    print(snapshot.groupby("ID")["time"].min().max())

    db.write('snapshot', snapshot)
    print(datetime.datetime.now() - startTm)

# Paths of the day folders that were skipped because files were missing.
print(bad)

    

---------------------------------------------------------------------------------------------
20190314
92503000000
0:11:28.531866
[]


In [5]:
# Sanity check: decode an epoch value given in microseconds (the unit the upload
# cells use for `clockAtArrival`) into a readable local-time string.
datetime.datetime.fromtimestamp(1577928618000000/1e6).strftime("%Y-%m-%d %H:%M:%S %f")

'2020-01-02 09:30:18 000000'

#### 3. upload SZ 20190610 data

In [13]:
# Upload SZ 20190610 snapshots from the per-stock csv files: load and
# concatenate the files, translate the Chinese column headers to the common
# snapshot schema, normalise the values and write to the 'snapshot' collection.
y = '20190610'

readPath = '\\\\192.168.10.30\\Kevin_zhenyu\\temp\\kuanrui\\szse_20190610_csv\\szse_' + y + '\\snap\\***'
dataPathLs = np.array(glob.glob(readPath))
# Each file name starts with the numeric stock code; despite its name,
# `dateLs` holds those stock codes.
dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs < 4000) | ((dateLs > 300000) & (dateLs < 310000))]
# Fail with a clear message instead of an obscure NameError / ValueError from
# `del df` / `pd.concat` when nothing matched.
if len(dataPathLs) == 0:
    raise FileNotFoundError("no snapshot csv files found for " + readPath)

logSZ1 = []
startTm = datetime.datetime.now()
for i in dataPathLs:
    df = pd.read_csv(i)
    df["StockID"] = int(os.path.basename(i).split('.')[0])
    logSZ1.append(df)
del df
logSZ1 = pd.concat(logSZ1).reset_index(drop=True)
print("finish load data")
print(datetime.datetime.now() - startTm)

# Build the whole header translation once and rename in a single pass
# (previously 61 separate renames, each copying the full frame).
renameMap = {"成交总量":"cum_volume", "成交总金额":"cum_amount", "最新价":"close", "开始价":"open",
             "昨收价":"prevClose", "成交笔数":"cum_tradesCnt", "最高价":"high", "最低价":"low",
             "买入总量":"totalBidQuantity", "卖出总量":"totalAskQuantity", "委买加权平均价":"vwapBid",
             "委卖加权平均价":"vwapAsk"}
for i in range(1, 11):
    renameMap.update({"申买价" + str(i): "bid" + str(i) + "p", "申卖价" + str(i): "ask" + str(i) + "p",
                      "申买量" + str(i): "bid" + str(i) + "q", "申卖量" + str(i): "ask" + str(i) + "q",
                      "申买价位总委托笔数" + str(i): "bid" + str(i) + "n", "申卖价位总委托笔数" + str(i): "ask" + str(i) + "n"})
for i in range(1, 51):
    renameMap.update({"买一价前50笔订单" + str(i): "bid1Top" + str(i) + "q",
                      "卖一价前50笔订单" + str(i): "ask1Top" + str(i) + "q"})
logSZ1 = logSZ1.rename(columns=renameMap)

# Time of day = timestamp minus the date part; the date base is derived from
# `y` (int(y) * 10**9 == 20190610000000000) instead of being repeated as a literal.
dayBase = int(y) * 1000000000
logSZ1["time"] = ((logSZ1["日期时间"] - dayBase)*1000).astype("int64")
logSZ1["date"] = logSZ1["日期时间"]//1000000000
# Epoch microseconds, interpreted in the machine's local timezone.
# NOTE(review): np.int64(...) truncates the float product; confirm that an
# occasional off-by-one microsecond is acceptable.
logSZ1["clockAtArrival"] = logSZ1["日期时间"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
logSZ1["ID"] = logSZ1["StockID"] + 2000000

logSZ1 = logSZ1.fillna(0)
# Drop rows where both best bid and best ask prices are zero; .copy() keeps
# the assignments below from triggering SettingWithCopyWarning.
logSZ1 = logSZ1[~((logSZ1["bid1p"] == 0) & (logSZ1["ask1p"] == 0))].copy()
# 1-based running row number per ID.
logSZ1["ordering"] = logSZ1.groupby("ID").cumcount() + 1

# Column-name groups of the output schema (bid side listed 10 -> 1, ask side 1 -> 10).
bidAskPriceCols = ['bid%dp' % lv for lv in range(10, 0, -1)] + ['ask%dp' % lv for lv in range(1, 11)]
bidAskQtyCols = ['bid%dq' % lv for lv in range(10, 0, -1)] + ['ask%dq' % lv for lv in range(1, 11)]
bidAskCntCols = ['bid%dn' % lv for lv in range(10, 0, -1)] + ['ask%dn' % lv for lv in range(1, 11)]
top50QtyCols = ['bid1Top%dq' % k for k in range(1, 51)] + ['ask1Top%dq' % k for k in range(1, 51)]
# Fields that this source does not provide; they are written as constant 0.
zeroFilledCols = ["totalBidOrders", 'totalAskOrders', 'totalBidLevels', 'totalAskLevels',
                  'bidTradeMaxDuration', 'askTradeMaxDuration', 'cum_canceledBuyOrders',
                  'cum_canceledBuyVolume', "cum_canceledBuyAmount", "cum_canceledSellOrders",
                  'cum_canceledSellVolume', "cum_canceledSellAmount"]
outputCols = (["date", "time", "clockAtArrival", "datetime", "ID", "ordering", "cum_tradesCnt",
               "cum_volume", "cum_amount", "prevClose", "open", "high", "low", "close"]
              + bidAskPriceCols + bidAskQtyCols + bidAskCntCols + top50QtyCols
              + ["totalBidQuantity", "totalAskQuantity", "vwapBid", "vwapAsk"]
              + zeroFilledCols)

for col in ["cum_tradesCnt", "cum_volume", "totalBidQuantity", "totalAskQuantity"] + bidAskQtyCols + bidAskCntCols:
    logSZ1[col] = logSZ1[col].astype('int64')

# Prices and turnover are scaled by 10 here and rounded before the integer
# cast, as the other upload cells do, so float noise is not truncated.
# NOTE(review): the factor is 10 here versus 10000 in the other upload cells;
# presumably this csv already stores prices in 1/1000 units -- confirm.
for col in ["prevClose", 'open', "high", "low", "close", 'vwapBid', "vwapAsk", "cum_amount"] + bidAskPriceCols:
    logSZ1[col] = (logSZ1[col] * 10).round(0).astype('int64')

logSZ1['datetime'] = logSZ1["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
for col in zeroFilledCols:
    logSZ1[col] = 0

# Every ID must carry a single non-zero open / prevClose value; that value is
# then propagated to rows with time >= 91500000000 (prevClose) and to rows
# with cum_volume > 0 (open).
assert(sum(logSZ1[logSZ1["open"] != 0].groupby("ID")["open"].nunique() != 1) == 0)
assert(sum(logSZ1[logSZ1["prevClose"] != 0].groupby("ID")["prevClose"].nunique() != 1) == 0)
logSZ1["prevClose"] = np.where(logSZ1["time"] >= 91500000000, logSZ1.groupby("ID")["prevClose"].transform("max"), logSZ1["prevClose"])
logSZ1["open"] = np.where(logSZ1["cum_volume"] > 0, logSZ1.groupby("ID")["open"].transform("max"), logSZ1["open"])
assert(sum(logSZ1[logSZ1["open"] != 0].groupby("ID")["open"].nunique() != 1) == 0)
assert(sum(logSZ1[logSZ1["prevClose"] != 0].groupby("ID")["prevClose"].nunique() != 1) == 0)
assert(logSZ1[logSZ1["time"] >= 91500000000]["prevClose"].min() > 0)
assert(logSZ1[logSZ1["cum_volume"] > 0]["open"].min() > 0)

logSZ1 = logSZ1[outputCols]

print(logSZ1["date"].iloc[0])
print(logSZ1.groupby("ID")["time"].min().max())

# NOTE(review): credentials are hard-coded in the notebook; move them to an
# environment variable or secret store.
db = DB("mongodb://user_rw:faa96dfc@192.168.10.223")
db.write('snapshot', logSZ1)
print(datetime.datetime.now() - startTm)

finish load data
0:07:00.356552
20190610
92503000000
0:15:26.115033


#### 4. upload SZ 2020 data

In [8]:
import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime

# Header-less column layouts for the raw SZ text files.
# columns5: narrow layout of the *_hq_snap_spot.txt files (32 columns).
columns5 = ["Date", "OrigTime", "SendTime", "ercvtime", "dbtime", "ChannelNo", "SecurityID",
            "SecurityIDsource", "MDStreamID", "PreClosePx", "PxChnage1", "PXChange2", "openPrice",
            "HighPx", "LowPx", "close", "NumTrades", "cum_volume", "cum_amount", "PE1", "PE2",
            "TradingPhase", "totalofferqty", "vwapAsk", "totalbidqty", "vwapBid", "PreNAV",
            "RealTimeNAV", "WarrantPremiumRate", "UpLimitPx", "DownLimitPx", "TotalLongPosition"]
# columns1: wide layout of the same files -- the narrow layout plus three trailing columns.
columns1 = columns5 + ["unknown1", "unknown2", "unknown3"]
# columns2: layout of the *_snap_level_spot.txt files -- nine header fields,
# then (ask price, bid price, ask qty, bid qty) for levels 1..10, then the
# level-1 order-count / order-quantity fields.
columns2 = (['Date', "OrigTime", "SendTime", "ercvtime", "dbtime", "ChannelNo", "SecurityID",
             "SecurityIDsource", "MDStreamID"]
            + [name for lv in range(1, 11)
               for name in ('ask%dp' % lv, 'bid%dp' % lv, 'ask%dq' % lv, 'bid%dq' % lv)]
            + ["bid1n", "NOORDERS_B1", "ORDERQTY_B1", "ask1n", "NOORDERS_S1", "ORDERQTY_S1"])
# columns3 / columns4: alternative layouts that the 2020 upload cell does not reference.
columns3 = ["Date", "SENDTIME", "SecurityID", "DATATIMESTAMP", "PreClosePx", "openPrice", "HighPx",
            "LowPx", "close", "NumTrades", "cum_volume", "cum_amount", "TOTALLONGPOSITION",
            "PERATIO1", "PERATIO2", "ENDOFDAYMAKER", "totalofferqty", "vwapAsk", "totalbidqty",
            "vwapBid"]
columns4 = (["Date", "SENDTIME", "SecurityID", "DATATIMESTAMP"]
            + ['ask%dp' % lv for lv in range(10, 0, -1)]
            + ['bid%dp' % lv for lv in range(1, 11)]
            + ['ask%dq' % lv for lv in range(10, 0, -1)]
            + ['bid%dq' % lv for lv in range(1, 11)]
            + ["bid1n", "NOORDERS _B1", "ORDERQTY_B1", "ask1n", "NOORDERS _S1", "ORDERQTY_S1"])

# Upload SZ 2020 snapshots: for every day folder, read the am/pm quote and
# order-book text files, join them, normalise to the common snapshot schema
# and write to the 'snapshot' collection through DB.write.
df = []
bad = []
readPath = 'L:\\2020 data\\SZ\\***'
dataPathLs = np.array(glob.glob(readPath))

# Column-name groups of the output schema (bid side listed 10 -> 1, ask side 1 -> 10).
bidAskPriceCols = ['bid%dp' % lv for lv in range(10, 0, -1)] + ['ask%dp' % lv for lv in range(1, 11)]
bidAskQtyCols = ['bid%dq' % lv for lv in range(10, 0, -1)] + ['ask%dq' % lv for lv in range(1, 11)]
bidAskCntCols = ['bid%dn' % lv for lv in range(10, 0, -1)] + ['ask%dn' % lv for lv in range(1, 11)]
top50QtyCols = ['bid1Top%dq' % k for k in range(1, 51)] + ['ask1Top%dq' % k for k in range(1, 51)]
# Fields that this source does not provide; they are written as constant 0.
zeroFilledCols = ["totalBidOrders", 'totalAskOrders', 'totalBidLevels', 'totalAskLevels',
                  'bidTradeMaxDuration', 'askTradeMaxDuration', 'cum_canceledBuyOrders',
                  'cum_canceledBuyVolume', "cum_canceledBuyAmount", "cum_canceledSellOrders",
                  'cum_canceledSellVolume', "cum_canceledSellAmount"]
outputCols = (["date", "time", "clockAtArrival", "datetime", "ID", "ordering", "cum_tradesCnt",
               "cum_volume", "cum_amount", "prevClose", "open", "high", "low", "close"]
              + bidAskPriceCols + bidAskQtyCols + bidAskCntCols + top50QtyCols
              + ["totalBidQuantity", "totalAskQuantity", "vwapBid", "vwapAsk"]
              + zeroFilledCols)
# Keys on which the quote file and the order-book file are joined.
mergeKeys = ['Date', "OrigTime", "ChannelNo", "SecurityID", "SecurityIDsource", "MDStreamID", "time"]

# One client for the whole run instead of a new, never-closed one per day.
# NOTE(review): credentials are hard-coded in the notebook; move them to an
# environment variable or secret store.
db = DB("mongodb://user_rw:faa96dfc@192.168.10.223")

# NOTE(review): the positional slice relies on the order returned by glob -- confirm it is stable.
for data in dataPathLs[62:]:
    dayFiles = glob.glob(data + '\\***')
    if len(dayFiles) == 0:
        continue

    # am & pm
    startTm = datetime.datetime.now()

    # Both file families must be present as an am/pm pair; otherwise the day is
    # recorded in `bad` and skipped.
    if len(glob.glob(data + '\\***_hq_snap_spot.txt')) != 2:
        print("Less hq_snap_spot data!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        bad.append(data + '\\***_hq_snap_spot.txt')
        print(dayFiles[0])
        continue

    if len(glob.glob(data + '\\***_snap_level_spot.txt')) != 2:
        print("Less snap_level_spot data!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        bad.append(data + '\\***_snap_level_spot.txt')
        print(dayFiles[0])
        continue

    # Quote files: pick the layout by column count. An explicit check replaces
    # the previous assert-inside-bare-except, which also swallowed
    # KeyboardInterrupt and disappears under `python -O`.
    hqParts = []
    for half in ("am", "pm"):
        part = pd.read_table(data + "\\" + half + "_hq_snap_spot.txt", header=None)
        if part.shape[1] == len(columns1):
            part.columns = columns1
        elif part.shape[1] == len(columns5):
            part.columns = columns5
        else:
            raise ValueError("unexpected column count %d in %s_hq_snap_spot.txt of %s" % (part.shape[1], half, data))
        hqParts.append(part)
    snapshot1 = pd.concat(hqParts).sort_values(by=["SecurityID", "OrigTime"])

    # Order-book files: a single known layout.
    levelParts = []
    for half in ("am", "pm"):
        part = pd.read_table(data + "\\" + half + "_snap_level_spot.txt", header=None)
        if part.shape[1] != len(columns2):
            raise ValueError("unexpected column count %d in %s_snap_level_spot.txt of %s" % (part.shape[1], half, data))
        part.columns = columns2
        levelParts.append(part)
    snapshot2 = pd.concat(levelParts).sort_values(by=["SecurityID", "OrigTime"])
    # Release the per-file frames before the merge.
    del part, hqParts, levelParts

    # Keep only security codes below 4000 or above 300000; .copy() keeps the
    # assignments below from triggering SettingWithCopyWarning.
    snapshot1 = snapshot1[(snapshot1["SecurityID"] < 4000) | (snapshot1["SecurityID"] > 300000)].copy()
    snapshot2 = snapshot2[(snapshot2["SecurityID"] < 4000) | (snapshot2["SecurityID"] > 300000)].copy()
    # Strip the date part (everything above the lowest 9 digits) from OrigTime.
    snapshot1['time'] = (snapshot1['OrigTime'] - int(snapshot1['OrigTime'].iloc[0]//1000000000*1000000000)).astype(int)
    snapshot2['time'] = (snapshot2['OrigTime'] - int(snapshot2['OrigTime'].iloc[0]//1000000000*1000000000)).astype(int)

    # The outer join is used only to detect ticks present in one file but not
    # the other; when the row counts disagree a diagnostic is printed and the
    # inner join is used instead.
    snapshot = pd.merge(snapshot1, snapshot2, on=mergeKeys, how="outer")
    if not ((snapshot.shape[0] == snapshot1.shape[0]) and (snapshot.shape[0] == snapshot2.shape[0])):
        if snapshot.shape[0] == snapshot1.shape[0]:
            print("snapshot1 have more ticks than snapshot2")
            unmatchedFlagCol = "dbtime_y"
        elif snapshot.shape[0] == snapshot2.shape[0]:
            print("snapshot2 have more ticks than snapshot1")
            unmatchedFlagCol = "dbtime_x"
        else:
            print("snapshot2 don't join with snapshot1")
            unmatchedFlagCol = None
        if unmatchedFlagCol is not None:
            # Times of the rows that found no partner on the other side.
            unmatchedTimes = snapshot[(snapshot[unmatchedFlagCol].isnull()) & ((snapshot["SecurityID"] < 4000) | (snapshot["SecurityID"] > 200000))]["time"].unique()
            if all(unmatchedTimes > 150000000):
                print("More ticks happens after 15:00")
            elif all(unmatchedTimes < 93000000):
                print("More ticks happens before 9:30")
            else:
                print("There are ticks happens before 15:00, after 9:30")
        snapshot = pd.merge(snapshot1, snapshot2, on=mergeKeys)
    del snapshot1
    del snapshot2

    snapshot = snapshot.rename(columns={"SecurityID":"StockID"})

    print("am & pm data txt")

    # Epoch microseconds derived from OrigTime, interpreted in the machine's local timezone.
    # NOTE(review): np.int64(...) truncates the float product; confirm that an
    # occasional off-by-one microsecond is acceptable.
    snapshot["clockAtArrival"] = snapshot["OrigTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    snapshot = snapshot.rename(columns={"Date":"date", "NumTrades":"cum_tradesCnt", "HighPx":"high", "LowPx":"low", "totalofferqty":
                                   "totalAskQuantity", "totalbidqty":"totalBidQuantity", "PreClosePx":"prevClose", "openPrice":"open"})
    snapshot["ID"] = snapshot["StockID"] + 2000000
    snapshot["time"] = snapshot["time"].astype('int64') * 1000

    # Turn the '|'-separated order-quantity strings into lists of ints.
    for qtyListCol in ("ORDERQTY_B1", "ORDERQTY_S1"):
        hasQtyList = snapshot[qtyListCol].notnull()
        snapshot.loc[hasQtyList, qtyListCol] = snapshot.loc[hasQtyList, qtyListCol].apply(lambda x: [int(i) for i in x.split('|')])

    # <side>1Top<k>q = k-th entry of the list for rows whose <side>1n is at
    # least k; all other rows keep 0.
    for i in range(1, 51):
        for side, qtyListCol in (("bid", "ORDERQTY_B1"), ("ask", "ORDERQTY_S1")):
            topCol = side + "1Top" + str(i) + 'q'
            hasKthOrder = i <= snapshot[side + "1n"]
            snapshot[topCol] = 0
            snapshot.loc[hasKthOrder, topCol] = snapshot.loc[hasKthOrder, qtyListCol].apply(lambda x: x[i-1])

    snapshot = snapshot.fillna(0)
    # Drop rows where both best bid and best ask prices are zero.
    snapshot = snapshot[~((snapshot["bid1p"] == 0) & (snapshot["ask1p"] == 0))].copy()
    # 1-based running row number per ID.
    snapshot["ordering"] = snapshot.groupby("ID").cumcount() + 1

    for col in ["cum_tradesCnt", "cum_volume", "totalBidQuantity", "totalAskQuantity", "bid1n", "ask1n"] + bidAskQtyCols:
        snapshot[col] = snapshot[col].astype('int64')

    # Prices and turnover are stored as integers scaled by 10000.
    for col in ["prevClose", 'open', "high", "low", "close", 'vwapBid', "vwapAsk", "cum_amount"] + bidAskPriceCols:
        snapshot[col] = (snapshot[col] * 10000).round(0).astype('int64')

    snapshot['datetime'] = snapshot["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))

    # Per-level order counts other than level 1 are zero-filled here, together
    # with the fields this source does not provide.
    for col in [c for c in bidAskCntCols if c not in ("bid1n", "ask1n")] + zeroFilledCols:
        snapshot[col] = 0

    # Every ID must carry a single non-zero open / prevClose value; that value
    # is then propagated to rows with time >= 91500000000 (prevClose) and to
    # rows with cum_volume > 0 (open).
    assert(sum(snapshot[snapshot["open"] != 0].groupby("ID")["open"].nunique() != 1) == 0)
    assert(sum(snapshot[snapshot["prevClose"] != 0].groupby("ID")["prevClose"].nunique() != 1) == 0)
    snapshot["prevClose"] = np.where(snapshot["time"] >= 91500000000, snapshot.groupby("ID")["prevClose"].transform("max"), snapshot["prevClose"])
    snapshot["open"] = np.where(snapshot["cum_volume"] > 0, snapshot.groupby("ID")["open"].transform("max"), snapshot["open"])
    assert(sum(snapshot[snapshot["open"] != 0].groupby("ID")["open"].nunique() != 1) == 0)
    assert(sum(snapshot[snapshot["prevClose"] != 0].groupby("ID")["prevClose"].nunique() != 1) == 0)
    assert(snapshot[snapshot["time"] >= 91500000000]["prevClose"].min() > 0)
    assert(snapshot[snapshot["cum_volume"] > 0]["open"].min() > 0)

    snapshot = snapshot[outputCols]

    print(snapshot["date"].iloc[0])
    print(snapshot.groupby("ID")["time"].min().max())

    db.write('snapshot', snapshot)
    print(datetime.datetime.now() - startTm)

# Glob patterns of the day folders that were skipped because files were missing.
print(bad)

  interactivity=interactivity, compiler=compiler, result=result)


am & pm data txt
20200408
92503000000
0:08:50.904691
am & pm data txt
20200409
92603000000
0:08:42.617013
am & pm data txt
20200410
92503000000
0:08:55.426669
am & pm data txt
20200413
92503000000
0:07:53.871584
am & pm data txt
20200414
92503000000
0:08:23.667178
am & pm data txt
20200415
92503000000
0:08:39.927697
am & pm data txt
20200416
92503000000
0:08:34.254867
am & pm data txt
20200417
92503000000
0:09:21.884571
am & pm data txt
20200420
92503000000
0:08:34.978929
am & pm data txt
20200421
92503000000
0:08:48.597511
am & pm data txt
20200422
92503000000
0:08:38.780289
am & pm data txt
20200423
92503000000
0:08:41.442645
am & pm data txt
20200424
92503000000
0:08:49.585872
am & pm data txt
20200427
92503000000
0:08:18.297537
am & pm data txt
20200428
92503000000
0:08:34.640841
am & pm data txt
20200429
92503000000
0:13:23.752430
am & pm data txt
20200430
92503000000
0:15:34.215146
am & pm data txt
20200506
92603000000
0:16:16.290470
am & pm data txt
20200507
92503000000
0:08:37.

#### 5. upload SZ 2017 data

In [None]:
import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime


# Column layouts of the raw, header-less SZ Level-2 text files.
# Each list names the columns of one file layout, in file order.

# Leading columns shared by the "hq" (quote) and "level" (order book) layouts.
_message_header = ["Date", "OrigTime", "SendTime", "ercvtime", "dbtime", "ChannelNo", "SecurityID",
                   "SecurityIDsource", "MDStreamID"]

# Quote layout with three trailing columns whose meaning is not known.
columns1 = _message_header + [
    "PreClosePx", "PxChnage1", "PXChange2", "openPrice", "HighPx", "LowPx", "close", "NumTrades",
    "cum_volume", "cum_amount", "PE1", "PE2", "TradingPhase",
    "totalofferqty", "vwapAsk", "totalbidqty", "vwapBid",
    "PreNAV", "RealTimeNAV", "WarrantPremiumRate", "UpLimitPx", "DownLimitPx", "TotalLongPosition",
    "unknown1", "unknown2", "unknown3",
]

# Order-book layout: for each of the 10 levels the file stores
# ask price, bid price, ask quantity, bid quantity (in that order).
columns2 = list(_message_header)
for _level in range(1, 11):
    columns2.extend(["ask%dp" % _level, "bid%dp" % _level, "ask%dq" % _level, "bid%dq" % _level])
columns2.extend(["bid1n", "NOORDERS_B1", "ORDERQTY_B1", "ask1n", "NOORDERS_S1", "ORDERQTY_S1"])

# Alternative quote layout keyed by SENDTIME / DATATIMESTAMP.
columns3 = [
    "Date", "SENDTIME", "SecurityID", "DATATIMESTAMP",
    "PreClosePx", "openPrice", "HighPx", "LowPx", "close", "NumTrades", "cum_volume", "cum_amount",
    "TOTALLONGPOSITION", "PERATIO1", "PERATIO2", "ENDOFDAYMAKER",
    "totalofferqty", "vwapAsk", "totalbidqty", "vwapBid",
]

# Alternative order-book layout: ask10..ask1 prices, bid1..bid10 prices,
# then the quantities in the same order.
# The embedded space in "NOORDERS _B1" / "NOORDERS _S1" is intentional here: it is kept as found.
columns4 = (
    ["Date", "SENDTIME", "SecurityID", "DATATIMESTAMP"]
    + ["ask%dp" % _level for _level in range(10, 0, -1)]
    + ["bid%dp" % _level for _level in range(1, 11)]
    + ["ask%dq" % _level for _level in range(10, 0, -1)]
    + ["bid%dq" % _level for _level in range(1, 11)]
    + ["bid1n", "NOORDERS _B1", "ORDERQTY_B1", "ask1n", "NOORDERS _S1", "ORDERQTY_S1"]
)

# Same as columns1 but without the three trailing "unknown" columns.
columns5 = columns1[:-3]

# Run configuration and the list of per-day source directories.
year = "2017"
df = []
bad = []   # paths that could not be processed; printed at the end of the run

# One sub-directory per trading day under the yearly folder.
readPath = 'J:\\LEVEL2_shenzhen\\' + year + '\\***'

# Directories that are deliberately left out of this run (the reason is not recorded here).
# Filtering by membership keeps the glob order and, unlike np.argwhere(...)[0] followed by
# np.delete, does not raise IndexError when one of these directories is absent.
_excluded_dirs = {
    'J:\\LEVEL2_shenzhen\\2017\\0620(深交所数据)',
    'J:\\LEVEL2_shenzhen\\2017\\0622(深交所数据)',
}
dataPathLs = np.array([day_dir for day_dir in glob.glob(readPath) if day_dir not in _excluded_dirs])

# ---------------------------------------------------------------------------
# Upload loop: for every day directory, unpack the 7z archive(s), load the
# quote ("hq") and order-book ("level") tables, join them, reshape the result
# to the database layout and write it to the `snapshot` collection.
# ---------------------------------------------------------------------------

# Directory into which the daily archives are unpacked (one sub-directory per day).
EXTRACT_ROOT = r'F:\SZ\2017'

# Position in `dataPathLs` at which this run starts; earlier entries are skipped.
START_INDEX = 196

# Columns on which the quote table and the order-book table are joined.
MERGE_KEYS = ['Date', "OrigTime", "ChannelNo", "SecurityID", "SecurityIDsource", "MDStreamID", "time"]

BAD_UNZIP_BANNER = "Bad unzip here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
LESS_DATA_BANNER = "Less data!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
NOT_INSIDE_BANNER = "Not inside!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"

# Columns that this feed does not provide; they are written as constant 0.
PLACEHOLDER_TOTAL_COLUMNS = [
    "totalBidOrders", 'totalAskOrders', 'totalBidLevels', 'totalAskLevels', 'bidTradeMaxDuration',
    'askTradeMaxDuration', 'cum_canceledBuyOrders', 'cum_canceledBuyVolume', "cum_canceledBuyAmount",
    "cum_canceledSellOrders", 'cum_canceledSellVolume', "cum_canceledSellAmount",
]
ZERO_FILLED_COLUMNS = (
    ['bid%dn' % lvl for lvl in range(10, 1, -1)]
    + ['ask%dn' % lvl for lvl in range(2, 11)]
    + PLACEHOLDER_TOTAL_COLUMNS
)

# Columns cast to int64 as-is.
INTEGER_COLUMNS = (
    ["cum_tradesCnt", "cum_volume"]
    + ['ask%dq' % lvl for lvl in range(1, 11)]
    + ['bid%dq' % lvl for lvl in range(1, 11)]
    + ["totalBidQuantity", "totalAskQuantity", "bid1n", "ask1n"]
)

# Columns multiplied by 10000, rounded and stored as int64.
SCALED_COLUMNS = (
    ["prevClose", 'open', "high", "low", "close"]
    + ['bid%dp' % lvl for lvl in range(10, 0, -1)]
    + ['ask%dp' % lvl for lvl in range(1, 11)]
    + ['vwapBid', "vwapAsk", "cum_amount"]
)

# Final column order of the frame handed to DB.write.
OUTPUT_COLUMNS = (
    ["date", "time", "clockAtArrival", "datetime", "ID", "ordering", "cum_tradesCnt", "cum_volume",
     "cum_amount", "prevClose", "open", "high", "low", "close"]
    + ['bid%dp' % lvl for lvl in range(10, 0, -1)] + ['ask%dp' % lvl for lvl in range(1, 11)]
    + ['bid%dq' % lvl for lvl in range(10, 0, -1)] + ['ask%dq' % lvl for lvl in range(1, 11)]
    + ['bid%dn' % lvl for lvl in range(10, 0, -1)] + ['ask%dn' % lvl for lvl in range(1, 11)]
    + ['bid1Top%dq' % rank for rank in range(1, 51)]
    + ['ask1Top%dq' % rank for rank in range(1, 51)]
    + ["totalBidQuantity", "totalAskQuantity", "vwapBid", "vwapAsk"]
    + PLACEHOLDER_TOTAL_COLUMNS
)


def _extract_archive(archive_path, target_dir):
    """Unpack one 7z archive into `target_dir`.

    Returns True on success. If the archive cannot be opened, the path is
    printed and appended to the global `bad` list and False is returned.
    Errors raised during extraction itself are not caught, but the archive
    handle is always closed.
    """
    try:
        archive = py7zr.SevenZipFile(archive_path, 'r', filters=None)
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C still stops the run.
        print(BAD_UNZIP_BANNER)
        print(archive_path)
        bad.append(archive_path)
        return False
    try:
        archive.extractall(path=target_dir)
    finally:
        archive.close()
    return True


def _report_missing(data, file_pattern, target_dir):
    """Record that an expected extracted file is missing for day directory `data`."""
    print(LESS_DATA_BANNER)
    bad.append(data + '\\' + file_pattern)
    extracted = glob.glob(target_dir + '\\***')
    # Show one extracted entry as a hint; fall back to the directory if nothing was extracted.
    print(extracted[0] if extracted else target_dir)


def _read_hq(file_path, **read_kwargs):
    """Read a header-less quote file and name its columns.

    The file must have as many columns as `columns1` or as `columns5`;
    otherwise ValueError is raised.
    """
    frame = pd.read_table(file_path, header=None, **read_kwargs)
    if frame.shape[1] == len(columns1):
        frame.columns = columns1
    elif frame.shape[1] == len(columns5):
        frame.columns = columns5
    else:
        raise ValueError('%s: unexpected number of columns: %d' % (file_path, frame.shape[1]))
    return frame


def _read_level(file_path, **read_kwargs):
    """Read a header-less order-book file laid out as `columns2` (ValueError otherwise)."""
    frame = pd.read_table(file_path, header=None, **read_kwargs)
    if frame.shape[1] != len(columns2):
        raise ValueError('%s: unexpected number of columns: %d' % (file_path, frame.shape[1]))
    frame.columns = columns2
    return frame


def _keep_universe_and_add_time(frame):
    """Keep SecurityID < 4000 or > 300000 and add the intraday `time` column.

    `time` is OrigTime with the date part removed (HHMMSSmmm), the date part
    being taken from the first remaining row.
    NOTE(review): the ID ranges presumably select main-board and ChiNext stocks - confirm.
    """
    kept = frame[(frame["SecurityID"] < 4000) | (frame["SecurityID"] > 300000)]
    day_base = int(kept['OrigTime'].iloc[0] // 1000000000 * 1000000000)
    # int64 explicitly: plain `int` is platform dependent; the column is cast to int64 later anyway.
    kept['time'] = (kept['OrigTime'] - day_base).astype(np.int64)
    return kept


def _describe_unmatched(merged, marker_column):
    """Print when the rows lacking a partner (null `marker_column`) occurred."""
    in_range = (merged["SecurityID"] < 4000) | (merged["SecurityID"] > 200000)
    times = merged[merged[marker_column].isnull() & in_range]["time"].unique()
    if all(times > 150000000):
        print("More ticks happens after 15:00")
    elif all(times < 93000000):
        print("More ticks happens before 9:30")
    else:
        print("There are ticks happens before 15:00, after 9:30")


def _join_hq_and_level(hq, level):
    """Filter both tables, join them on MERGE_KEYS and rename SecurityID to StockID.

    An outer join is tried first. If it has exactly as many rows as both
    inputs it is used; otherwise a diagnostic is printed and the inner join
    is used instead.
    """
    hq = _keep_universe_and_add_time(hq)
    level = _keep_universe_and_add_time(level)
    merged = pd.merge(hq, level, on=MERGE_KEYS, how="outer")
    n_rows = merged.shape[0]
    if not (n_rows == hq.shape[0] and n_rows == level.shape[0]):
        # `dbtime` exists in both inputs and is not a join key, so the merge exposes it
        # as dbtime_x (hq side) and dbtime_y (level side); a null marks a missing partner.
        if n_rows == hq.shape[0]:
            print("snapshot1 have more ticks than snapshot2")
            _describe_unmatched(merged, "dbtime_y")
        elif n_rows == level.shape[0]:
            print("snapshot2 have more ticks than snapshot1")
            _describe_unmatched(merged, "dbtime_x")
        else:
            print("snapshot2 don't join with snapshot1")
        merged = pd.merge(hq, level, on=MERGE_KEYS)
    return merged.rename(columns={"SecurityID": "StockID"})


def _load_am_pm_day(data, target_dir, split_volumes):
    """Load a day delivered as separate am/pm archives.

    With `split_volumes` the multi-volume parts (`*.7z.001`, ...) are first
    concatenated into a single archive with the Windows `copy /b` command.
    Returns the joined frame, or None when the day had to be skipped
    (the reason is then recorded in `bad`).
    """
    os.chdir(data)   # `copy /b` below works on relative file names
    for half in ("am", "pm"):
        archive_name = half + '_snap_level_spot.7z'
        if split_volumes:
            os.system("copy /b " + archive_name + ".* " + archive_name)
        if not _extract_archive(data + '\\' + archive_name, target_dir):
            return None

    if len(glob.glob(target_dir + '\\***_hq_snap_spot.txt')) != 2:
        _report_missing(data, '***_hq_snap_spot.txt', target_dir)
        return None

    hq = pd.concat([
        _read_hq(target_dir + "\\am_hq_snap_spot.txt"),
        _read_hq(target_dir + "\\pm_hq_snap_spot.txt"),
    ]).sort_values(by=["SecurityID", "OrigTime"])
    level = pd.concat([
        _read_level(target_dir + "\\am_snap_level_spot.txt"),
        _read_level(target_dir + "\\pm_snap_level_spot.txt"),
    ]).sort_values(by=["SecurityID", "OrigTime"])
    return _join_hq_and_level(hq, level)


def _load_single_archive_day(data, target_dir):
    """Load a day delivered as one `snap_level.7z` archive.

    Returns the joined frame, or None when the day had to be skipped
    (the reason is then recorded in `bad`).
    """
    os.chdir(data)
    if not _extract_archive(data + '\\snap_level.7z', target_dir):
        return None
    if len(glob.glob(target_dir + '\\hq_snap.txt')) != 1:
        _report_missing(data, 'hq_snap.txt', target_dir)
        return None

    # NOTE(review): the previous code never loaded the order-book table in this branch and
    # then used an undefined/stale `snapshot2`. The file name and encoding below are
    # inferred by analogy with the am/pm archives - confirm against a real archive.
    # If the file is absent the day is recorded in `bad` and skipped.
    level_path = target_dir + "\\snap_level.txt"
    if not os.path.exists(level_path):
        _report_missing(data, 'snap_level.txt', target_dir)
        return None

    hq = _read_hq(target_dir + "\\hq_snap.txt", encoding="UTF-8-sig")
    level = _read_level(level_path, encoding="UTF-8-sig")
    hq["SecurityID"] = hq["SecurityID"].astype(int)
    level["SecurityID"] = level["SecurityID"].astype(int)
    return _join_hq_and_level(hq, level)


def _one_value_per_id(frame, column):
    """True when every ID has exactly one distinct non-zero value in `column`."""
    distinct = frame[frame[column] != 0].groupby("ID")[column].nunique()
    return (distinct != 1).sum() == 0


def _to_db_layout(snapshot):
    """Reshape a joined day frame into the column layout written to the database.

    Adds clockAtArrival / ID / ordering / datetime, expands the level-1 order
    queues into bid1Top1q..bid1Top50q and ask1Top1q..ask1Top50q, converts
    prices and amounts to integers scaled by 10000, zero-fills columns this
    feed does not provide, validates open / prevClose and returns the frame
    restricted to OUTPUT_COLUMNS.
    """
    # NOTE(review): .timestamp() on a naive datetime uses the machine's local time zone,
    # while DB.write interprets clockAtArrival in Asia/Shanghai - confirm the host's zone.
    snapshot["clockAtArrival"] = snapshot["OrigTime"].astype(str).apply(
        lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    snapshot = snapshot.rename(columns={
        "Date": "date", "NumTrades": "cum_tradesCnt", "HighPx": "high", "LowPx": "low",
        "totalofferqty": "totalAskQuantity", "totalbidqty": "totalBidQuantity",
        "NUMORDERS_B1": "bid1n", "NUMORDERS_S1": "ask1n",
        "wa_offerPrice": "vwapAsk", "wa_bidPrice": "vwapBid",
        "PreClosePx": "prevClose", "openPrice": "open"})
    snapshot["ID"] = snapshot["StockID"] + 2000000
    snapshot["time"] = snapshot["time"].astype(np.int64) * 1000

    # The order queues arrive as '|'-separated strings; turn them into lists of ints.
    for queue_col in ("ORDERQTY_B1", "ORDERQTY_S1"):
        present = ~snapshot[queue_col].isnull()
        snapshot.loc[present, queue_col] = snapshot.loc[present, queue_col].apply(
            lambda x: [int(i) for i in x.split('|')])

    for rank in range(1, 51):
        snapshot["bid1Top" + str(rank) + 'q'] = 0
        snapshot["ask1Top" + str(rank) + 'q'] = 0
    for rank in range(1, 51):
        # Only rows whose queue holds at least `rank` orders get a value; the rest stay 0.
        has_bid = rank <= snapshot["bid1n"]
        has_ask = rank <= snapshot["ask1n"]
        snapshot.loc[has_bid, "bid1Top" + str(rank) + 'q'] = \
            snapshot.loc[has_bid, "ORDERQTY_B1"].apply(lambda x: x[rank-1])
        snapshot.loc[has_ask, "ask1Top" + str(rank) + 'q'] = \
            snapshot.loc[has_ask, "ORDERQTY_S1"].apply(lambda x: x[rank-1])

    snapshot = snapshot.fillna(0)
    # Drop ticks that carry neither a best bid nor a best ask price.
    snapshot = snapshot[~((snapshot["bid1p"] == 0) & (snapshot["ask1p"] == 0))]
    snapshot["ordering"] = snapshot.groupby("ID").cumcount() + 1

    for col in INTEGER_COLUMNS:
        snapshot[col] = snapshot[col].astype('int64')
    for col in SCALED_COLUMNS:
        snapshot[col] = (snapshot[col] * 10000).round(0).astype('int64')
    snapshot['datetime'] = snapshot["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    for col in ZERO_FILLED_COLUMNS:
        snapshot[col] = 0

    assert _one_value_per_id(snapshot, "open")
    assert _one_value_per_id(snapshot, "prevClose")
    # Back-fill prevClose from 09:15 onwards, and open once anything has traded,
    # with the per-ID maximum (i.e. the single non-zero value checked above).
    snapshot["prevClose"] = np.where(snapshot["time"] >= 91500000000,
                                     snapshot.groupby("ID")["prevClose"].transform("max"),
                                     snapshot["prevClose"])
    snapshot["open"] = np.where(snapshot["cum_volume"] > 0,
                                snapshot.groupby("ID")["open"].transform("max"),
                                snapshot["open"])
    assert _one_value_per_id(snapshot, "open")
    assert _one_value_per_id(snapshot, "prevClose")
    assert snapshot[snapshot["time"] >= 91500000000]["prevClose"].min() > 0
    assert snapshot[snapshot["cum_volume"] > 0]["open"].min() > 0

    return snapshot[OUTPUT_COLUMNS]


# One connection for the whole run instead of a new client per day.
# The URI can be supplied through the WHITE_DB_URI environment variable; it must have the
# form mongodb://user:password@host, which is what DB.parse_uri expects.
# NOTE(review): the literal fallback keeps a credential in the notebook - rotate it and
# remove the fallback once the environment variable is set everywhere this runs.
db = DB(os.environ.get("WHITE_DB_URI", "mongodb://user_rw:faa96dfc@192.168.10.223"))

for data in dataPathLs[START_INDEX:]:

    if len(glob.glob(data + '\\***')) == 0:
        continue

    startTm = datetime.datetime.now()
    date = os.path.basename(data)
    path1 = EXTRACT_ROOT + '\\' + date

    # Pick the loader from the archive naming found in the day directory.
    if os.path.exists(data + '\\pm_snap_level_spot.7z'):
        snapshot = _load_am_pm_day(data, path1, split_volumes=False)
        source_label = "am & pm data 未分卷"
    elif os.path.exists(data + '\\am_snap_level_spot.7z.001'):
        snapshot = _load_am_pm_day(data, path1, split_volumes=True)
        source_label = "am & pm data 分卷"
    elif os.path.exists(data + '\\snap_level.7z'):
        snapshot = _load_single_archive_day(data, path1)
        source_label = "深交所数据"
    else:
        # Unknown layout: record it and move on rather than falling through
        # and processing whatever `snapshot` was left over from an earlier day.
        print(NOT_INSIDE_BANNER)
        bad.append(data)
        continue

    if snapshot is None:
        continue
    print(source_label)

    snapshot = _to_db_layout(snapshot)

    print(snapshot["date"].iloc[0])
    print(snapshot.groupby("ID")["time"].min().max())

    db.write('snapshot', snapshot)
    print(datetime.datetime.now() - startTm)

print(bad)

    

  interactivity=interactivity, compiler=compiler, result=result)


am & pm data 未分卷
