In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz

class DB(object):
    def __init__(self, uri, symbol_column='skey'):
        self.db_name = 'white_db'
        user, passwd, host = self.parse_uri(uri)
        auth_db = 'admin' if user in ('admin', 'root') else self.db_name
        self.uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)

        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        if version == 1:
            return gzip.compress(pickle.dumps(s), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')

In [2]:
import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = r'\\192.168.10.30\Kevin_zhenyu\day_stock_20200424\***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SH' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

year = "2019"
startDate = '20190114'
endDate = '20190930'
readPath = 'G:\\KR\\' + year + '\\SH\\***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

for data in dataPathLs:
    
    readPath = data + '\\snapshot\\***2\\***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[((dateLs >= 600000) & (dateLs <= 700000))]
    SH = []
    ll = []
    startTm = datetime.datetime.now()
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, usecols = [0,1,3,5,7,9,10,11,15,17,18,19,20,21,22,23,25,26,28,29,30,31,32,33,37,39,40,41,
                                          42,46,47,49,50])
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["StockID"] = int(os.path.basename(i).split('.')[0])
        SH += [df]
    del df
    SH = pd.concat(SH).reset_index(drop=True)
    print(datetime.datetime.now() - startTm)
    
    startTm = datetime.datetime.now()
    SH["skey"] = SH["StockID"] + 1000000
    SH.drop(["StockID"],axis=1,inplace=True)
    SH["date"] = int(SH["QuotTime"].iloc[0]//1000000000)
    SH["time"] = (SH['QuotTime'] - int(SH['QuotTime'].iloc[0]//1000000000*1000000000)).astype(np.int64) * 1000
    SH["clockAtArrival"] = SH["QuotTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH.drop(["QuotTime"],axis=1,inplace=True)
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    print(datetime.datetime.now() - startTm)
    
    
#     startTm = datetime.datetime.now()

#     BidPrice = np.array([i[1:-1].split(',') for i in SH['BidPrice'].values])
#     SH.drop(["BidPrice"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["bid%sp"%i] = BidPrice[:, i-1]
#     del BidPrice
#     print("1")
    
#     OfferPrice = np.array([i[1:-1].split(',') for i in SH['OfferPrice'].values])
#     SH.drop(["OfferPrice"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["ask%sp"%i] = OfferPrice[:, i-1]
#     del OfferPrice
#     print("2")
    
#     BidOrderQty = np.array([i[1:-1].split(',') for i in SH['BidOrderQty'].values]).astype(np.int64)
#     SH.drop(["BidOrderQty"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["bid%sq"%i] = BidOrderQty[:, i-1]
#     del BidOrderQty
#     print("3")
    
#     OfferOrderQty = np.array([i[1:-1].split(',') for i in SH['OfferOrderQty'].values]).astype(np.int64)
#     SH.drop(["OfferOrderQty"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["ask%sq"%i] = OfferOrderQty[:, i-1]
#     del OfferOrderQty
#     print("4")
    
#     BidNumOrders = np.array([i[1:-1].split(',') for i in SH['BidNumOrders'].values]).astype(np.int32)
#     SH.drop(["BidNumOrders"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["bid%sn"%i] = BidNumOrders[:, i-1]
#     del BidNumOrders
#     print("5")
    
#     OfferNumOrders = np.array([i[1:-1].split(',') for i in SH['OfferNumOrders'].values]).astype(np.int32)
#     SH.drop(["OfferNumOrders"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["ask%sn"%i] = OfferNumOrders[:, i-1]
#     del OfferNumOrders
#     print("6")
    
#     bidOrders = np.array([i[1:-1].split(',') for i in SH['BidOrders'].values]).astype(np.int32)
#     SH.drop(["BidOrders"],axis=1,inplace=True)
#     for i in range(1, 51):
#         SH["bid1Top%sq"%i] = bidOrders[:, i-1]
#     del bidOrders
#     print("7")
    
#     offerOrders = np.array([i[1:-1].split(',') for i in SH['OfferOrders'].values]).astype(np.int32)
#     SH.drop(["OfferOrders"],axis=1,inplace=True)
#     for i in range(1, 51):
#         SH["ask1Top%sq"%i] = offerOrders[:, i-1]
#     del offerOrders
#     print("8")
#     print(datetime.datetime.now() - startTm)

    startTm = datetime.datetime.now()
    SH["BidPrice"] = SH["BidPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["OfferPrice"] = SH["OfferPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["BidOrderQty"] = SH["BidOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrderQty"] = SH["OfferOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["BidNumOrders"] = SH["BidNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferNumOrders"] = SH["OfferNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 11):
        SH["bid" + str(i) + 'p'] = SH["BidPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["BidPrice"],axis=1,inplace=True)
    print("1")
    for i in range(1, 11):
        SH["ask" + str(i) + 'p'] = SH["OfferPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["OfferPrice"],axis=1,inplace=True)
    print("2")
    for i in range(1, 11):
        SH["bid" + str(i) + 'q'] = SH["BidOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["BidOrderQty"],axis=1,inplace=True)
    print("3")
    for i in range(1, 11):
        SH["ask" + str(i) + 'q'] = SH["OfferOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["OfferOrderQty"],axis=1,inplace=True)
    print("4")
    for i in range(1, 11):
        SH["bid" + str(i) + 'n'] = SH["BidNumOrders"].apply(lambda x: x[i-1])
        SH["bid" + str(i) + 'n'] = SH["bid" + str(i) + 'n'].astype('int32')
    SH.drop(["BidNumOrders"],axis=1,inplace=True)
    print("5")
    for i in range(1, 11):
        SH["ask" + str(i) + 'n'] = SH["OfferNumOrders"].apply(lambda x: x[i-1])
        SH["ask" + str(i) + 'n'] = SH["ask" + str(i) + 'n'].astype('int32') 
    SH.drop(["OfferNumOrders"],axis=1,inplace=True)
    print("6")
    
    SH["BidOrders"] = SH["BidOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrders"] = SH["OfferOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 51):
        SH["bid1Top" + str(i) + 'q'] = SH["BidOrders"].apply(lambda x: x[i-1])
        SH["bid1Top" + str(i) + 'q'] = SH["bid1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["BidOrders"],axis=1,inplace=True)
    print("7")
    
    for i in range(1, 51):
        SH["ask1Top" + str(i) + 'q'] = SH["OfferOrders"].apply(lambda x: x[i-1])
        SH["ask1Top" + str(i) + 'q'] = SH["ask1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["OfferOrders"],axis=1,inplace=True)
    print("8")
    print(datetime.datetime.now() - startTm)
    
    
    startTm = datetime.datetime.now()
#     SH = SH.rename(columns={"NumTrades":"cum_trades_cnt", "HighPx":"high", "OpenPx":"open", "PreClosePx":"prev_close", "LowPx":"low",
#                             "WeightedAvgBidPx":"total_bid_vwap", "WithdrawSellNumber":"cum_canceled_sell_orders", "TotalOfferNumber":"total_ask_orders",
#                            "OfferTradeMaxDuration":"ask_trade_max_duration", "TotalBidNumber":"total_bid_orders", "WithdrawSellMoney":"cum_canceled_sell_amount",
#                            "TotalOfferQty":"total_ask_quantity", "WithdrawBuyNumber":"cum_canceled_buy_orders", "WeightedAvgOfferPx":"total_ask_vwap",
#                            "WithdrawSellAmount":"cum_canceled_sell_volume", "Volume":"cum_volume", "NumOfferOrders":"total_ask_levels", "TotalBidQty":"total_bid_quantity",
#                            "WithdrawBuyAmount":"cum_canceled_buy_volume", "LastPx":"close", "BidTradeMaxDuration":"bid_trade_max_duration", 
#                            "NumBidOrders":"total_bid_levels", "Amount":"cum_amount", "WithdrawBuyMoney":"cum_canceled_buy_amount"})
    SH.columns = ['cum_trades_cnt', 'ask_trade_max_duration', 'total_bid_orders',
       'cum_canceled_sell_amount', 'total_ask_quantity', 'cum_canceled_buy_orders',
       'total_ask_vwap', 'cum_canceled_sell_volume', 'cum_volume', 'open',
       'high', 'prev_close', 'low', 'total_bid_vwap',
       'cum_canceled_sell_orders', 'total_ask_orders', 'total_ask_levels',
       'total_bid_quantity', 'cum_canceled_buy_volume', 'bid_trade_max_duration',
       'total_bid_levels', 'close', 'cum_amount', 'cum_canceled_buy_amount', 'skey', 'date', 'time', 'clockAtArrival',
       'datetime', 'bid1p', 'bid2p', 'bid3p', 'bid4p', 'bid5p', 'bid6p',
       'bid7p', 'bid8p', 'bid9p', 'bid10p', 'ask1p', 'ask2p', 'ask3p',
       'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p',
       'bid1q', 'bid2q', 'bid3q', 'bid4q', 'bid5q', 'bid6q', 'bid7q',
       'bid8q', 'bid9q', 'bid10q', 'ask1q', 'ask2q', 'ask3q', 'ask4q',
       'ask5q', 'ask6q', 'ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid1n',
       'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n',
       'bid9n', 'bid10n', 'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n',
       'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'bid1Top1q',
       'bid1Top2q', 'bid1Top3q', 'bid1Top4q', 'bid1Top5q', 'bid1Top6q',
       'bid1Top7q', 'bid1Top8q', 'bid1Top9q', 'bid1Top10q', 'bid1Top11q',
       'bid1Top12q', 'bid1Top13q', 'bid1Top14q', 'bid1Top15q',
       'bid1Top16q', 'bid1Top17q', 'bid1Top18q', 'bid1Top19q',
       'bid1Top20q', 'bid1Top21q', 'bid1Top22q', 'bid1Top23q',
       'bid1Top24q', 'bid1Top25q', 'bid1Top26q', 'bid1Top27q',
       'bid1Top28q', 'bid1Top29q', 'bid1Top30q', 'bid1Top31q',
       'bid1Top32q', 'bid1Top33q', 'bid1Top34q', 'bid1Top35q',
       'bid1Top36q', 'bid1Top37q', 'bid1Top38q', 'bid1Top39q',
       'bid1Top40q', 'bid1Top41q', 'bid1Top42q', 'bid1Top43q',
       'bid1Top44q', 'bid1Top45q', 'bid1Top46q', 'bid1Top47q',
       'bid1Top48q', 'bid1Top49q', 'bid1Top50q', 'ask1Top1q', 'ask1Top2q',
       'ask1Top3q', 'ask1Top4q', 'ask1Top5q', 'ask1Top6q', 'ask1Top7q',
       'ask1Top8q', 'ask1Top9q', 'ask1Top10q', 'ask1Top11q', 'ask1Top12q',
       'ask1Top13q', 'ask1Top14q', 'ask1Top15q', 'ask1Top16q',
       'ask1Top17q', 'ask1Top18q', 'ask1Top19q', 'ask1Top20q',
       'ask1Top21q', 'ask1Top22q', 'ask1Top23q', 'ask1Top24q',
       'ask1Top25q', 'ask1Top26q', 'ask1Top27q', 'ask1Top28q',
       'ask1Top29q', 'ask1Top30q', 'ask1Top31q', 'ask1Top32q',
       'ask1Top33q', 'ask1Top34q', 'ask1Top35q', 'ask1Top36q',
       'ask1Top37q', 'ask1Top38q', 'ask1Top39q', 'ask1Top40q',
       'ask1Top41q', 'ask1Top42q', 'ask1Top43q', 'ask1Top44q',
       'ask1Top45q', 'ask1Top46q', 'ask1Top47q', 'ask1Top48q',
       'ask1Top49q', 'ask1Top50q']
    SH = SH.fillna(0)
    SH["p1"] = SH["bid1p"] + SH["ask1p"]
    tt = SH[(SH["cum_volume"] > 0) & (SH["time"] < 145700000000)].groupby("skey")['p1'].min()
    SH.drop("p1", axis=1, inplace=True)
    try:
        assert(tt[tt == 0].shape[0] == 0)
    except:
        display(tt[tt == 0])
    SH = SH[~((SH["bid1p"] == 0) & (SH["ask1p"] == 0))]
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    
    SH["has_missing"] = 0
    
    for col in ["skey", "date", "cum_trades_cnt", "total_bid_orders",
        'total_ask_orders', 'total_bid_levels', 'total_ask_levels', 'cum_canceled_buy_orders','cum_canceled_sell_orders',
            "ordering", 'bid_trade_max_duration', 'ask_trade_max_duration','has_missing']:
        SH[col] = SH[col].astype('int32')
    
#     for cols in ["prev_close", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
#              'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p']:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
    
#     for cols in ['cum_amount', "cum_canceled_sell_amount", "cum_canceled_buy_amount"]:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
        
    for cols in ['total_bid_vwap', "total_ask_vwap"]:
        SH[cols] = SH[cols].apply(lambda x: round(x, 3))
        
   
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"]) 
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)
    print(datetime.datetime.now() - startTm)
    
    
    # check 1
    startTm = datetime.datetime.now()
    da_te = str(SH["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    db1["ID"] = db1["ID"].str[2:].astype(int) + 1000000
    db1["date"] = (db1["date"].str[:4] + db1["date"].str[5:7] + db1["date"].str[8:]).astype(int)
    SH["cum_max"] = SH.groupby("skey")["cum_volume"].transform(max)
    s2 = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey").first().reset_index()
    dd = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey")["time"].first().reset_index()
    SH.drop("cum_max", axis=1, inplace=True)
    s2 = s2.rename(columns={"skey": "ID", 'open':"d_open", "prev_close":"d_yclose","high":"d_high", "low":"d_low", "close":"d_close", "cum_volume":"d_volume", "cum_amount":"d_amount"})
    if SH["date"].iloc[0] < 20180820:
        s2["auction"] = 0
    else:
        dd["auction"] = np.where(dd["time"]<=145700000000, 0, 1)
        dd = dd.rename(columns={"skey": "ID"})
        s2 = pd.merge(s2, dd[["ID", "auction"]], on="ID")
    s2 = s2[["ID", "date", "d_open", "d_yclose", "d_high", "d_low", "d_close", "d_volume", "d_amount", "auction"]]
    re = pd.merge(db1, s2, on=["ID", "date", "d_open", "d_yclose","d_high", "d_low", "d_volume"], how="outer")
    try:
        assert(sum(re["d_amount_y"].isnull()) == 0)
    except:
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print(re[re["d_amount_y"].isnull()])
    print(datetime.datetime.now() - startTm)
    
    # check 2
    # first part
    startTm = datetime.datetime.now()
    date = pd.DataFrame(pd.date_range(start='2019-06-10 08:30:00', end='2019-06-10 18:00:00', freq='s'), columns=["Orig"])
    date["time"] = date["Orig"].apply(lambda x: int(x.strftime("%H%M%S"))*1000)
    date["group"] = date["time"]//30000
    SH["group"] = SH["time"]//30000000
    gl = date[((date["time"] >= 93000000) & (date["time"] <= 113000000))|((date["time"] >= 130000000) & (date["time"] <= 150000000))]["group"].unique()
    l = set(gl) - set(SH["group"].unique())
    SH["has_missing1"] = 0 
    if len(l) != 0:
        print("massive missing")
        print(l)
        SH["order"] = SH.groupby(["skey", "time"]).cumcount()
        for i in l:
            SH["t"] = SH[SH["group"] > i].groupby("StockID")["time"].transform("min")
            SH["has_missing1"] = np.where((SH["time"] == SH["t"]) & (SH["order"] == 0), 1, 0)
        SH.drop(["order", "t", "group"], axis=1, inplace=True)   
    else:
        print("no massive missing")
        SH.drop(["group"], axis=1, inplace=True)
    



    # second part

    SH["time_interval"] = SH.groupby("skey")["datetime"].apply(lambda x: x - x.shift(1))
    SH["time_interval"] = SH["time_interval"].apply(lambda x: x.seconds)
    SH["tn_update"] = SH.groupby("skey")["cum_trades_cnt"].apply(lambda x: x-x.shift(1))

    f1 = SH[(SH["time"] >= 93000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f1 = f1.rename(columns={"time": "time1"})
    f2 = SH[(SH["time"] >= 130000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f2 = f2.rename(columns={"time": "time2"})
    f3 = SH[(SH["time"] >= 150000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f3 = f3.rename(columns={"time": "time3"})
    SH = pd.merge(SH, f1, on="skey", how="left")
    del f1
    SH = pd.merge(SH, f2, on="skey", how="left")
    del f2
    SH = pd.merge(SH, f3, on="skey", how="left")
    del f3
    p99 = SH[(SH["time"] > 93000000000) & (SH["time"] < 145700000000) & (SH["time"] != SH["time2"]) & (SH["tn_update"] != 0)]\
    .groupby("skey")["tn_update"].apply(lambda x: x.describe([0.99])["99%"]).reset_index()
    p99 = p99.rename(columns={"tn_update":"99%"})
    SH = pd.merge(SH, p99, on="skey", how="left")

    SH["has_missing2"] = 0
    SH["has_missing2"] = np.where((SH["time_interval"] > 60) & (SH["tn_update"] > SH["99%"]) & 
         (SH["time"] > SH["time1"]) & (SH["time"] != SH["time2"]) & (SH["time"] != SH["time3"]), 1, 0)
    SH.drop(["time_interval", "tn_update", "time1", "time2", "time3", "99%"], axis=1, inplace=True) 

    SH["has_missing"] = np.where((SH["has_missing1"] == 1) | (SH["has_missing2"] == 1), 1, 0)
    SH.drop(["has_missing1", "has_missing2"], axis=1, inplace=True) 
    if SH[SH["has_missing"] == 1].shape[0] != 0:
        print("has missing!!!!!!!!!!!!!!!!!!!!!!!")
        print(SH[SH["has_missing"] == 1].shape[0])
    print(datetime.datetime.now() - startTm)
    
    
    
    startTm = datetime.datetime.now()
    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "has_missing", "cum_trades_cnt", "cum_volume", "cum_amount", "prev_close",
                            "open", "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p','bid2p','bid1p',
                            'ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'bid10q','bid9q','bid8q',
                             'bid7q','bid6q','bid5q','bid4q','bid3q','bid2q','bid1q', 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q',
                             'ask7q','ask8q','ask9q','ask10q', 'bid10n', 'bid9n', 'bid8n', 'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 
                             'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n','ask7n', 'ask8n', 'ask9n', 'ask10n','bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q',"total_bid_quantity", "total_ask_quantity","total_bid_vwap", "total_ask_vwap",
        "total_bid_orders",'total_ask_orders','total_bid_levels', 'total_ask_levels', 'bid_trade_max_duration', 'ask_trade_max_duration', 'cum_canceled_buy_orders', 'cum_canceled_buy_volume',
        "cum_canceled_buy_amount", "cum_canceled_sell_orders", 'cum_canceled_sell_volume',"cum_canceled_sell_amount"]]
    
    display(SH["date"].iloc[0])
    print("SH finished")
    print(datetime.datetime.now() - startTm)
    
    
    startDate = str(SH["date"].iloc[0])
    endDate = str(SH["date"].iloc[0])
    db1 = DB("mongodb://user_rw:faa96dfc@192.168.10.223")
    # delete all stocks from certain period
    db1.delete('snapshot', start_date=startDate, end_date=endDate)
    db1 = DB("mongodb://user_rw:faa96dfc@192.168.10.223")
    db1.write('snapshot', SH)
    
    del SH

  app.launch_new_instance()


0:02:10.720242
0:00:40.418855
0:00:55.078630
1
2
3
4
5
6
7
8
0:06:43.009699


skey
1600353    0.0
Name: p1, dtype: float64

0:00:32.012693


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
          ID      date  d_open  d_high  d_low  d_close_x  d_yclose  \
378  1600353  20190114     4.8    4.83   4.66       4.69      4.86   

     d_cumprodCAA  d_dayReturn  d_5dayReturn  d_ICDayReturn  d_CSIDayReturn  \
378      0.989619    -0.034979     -0.004246      -0.006565       -0.005933   

       d_volume  d_amount_x    TORate  allZT  hasZT  isZT  allDT  hasDT  isDT  \
378  10882631.0  51416487.0  0.020015    0.0    0.0   0.0    0.0    0.0   0.0   

     tmrHalted  haltedDays  marketShares  totalShares  d_close_y  d_amount_y  \
378        0.0         0.0   543720000.0  543720000.0        NaN         NaN   

     auction  
378      NaN  
0:00:02.599046
no massive missing
0:01:34.554150


20190114

SH finished
0:00:02.082429
0:02:01.913804
0:00:56.467913
1
2
3
4
5
6
7
8
0:06:53.814787
0:00:35.155936
0:00:02.628966
no massive missing
0:01:30.204647


20190115

SH finished
0:00:01.937815
0:01:46.174888
0:00:57.068307
1
2
3
4
5
6
7
8
0:07:08.260709


skey
1601298    0.0
Name: p1, dtype: float64

0:00:34.897627
0:00:02.790535
no massive missing
0:01:29.396807


20190121

SH finished
0:00:01.762311
0:01:34.397427
0:00:56.073967
1
2
3
4
5
6
7
8
0:06:53.041514


skey
1603700    0.0
Name: p1, dtype: float64

0:00:37.820350
0:00:02.774403
no massive missing
0:01:27.699320


20190122

SH finished
0:00:01.739346
0:01:23.994007
0:00:58.886442
1
2
3
4
5
6
7
8
0:06:57.998593


skey
1601615    0.0
Name: p1, dtype: float64

0:00:34.431873
0:00:02.645921
no massive missing
0:01:27.423596


20190123

SH finished
0:00:01.806167
0:01:23.796791
0:00:56.643443
1
2
3
4
5
6
7
8
0:07:07.274774
0:00:38.093077
0:00:02.877304
no massive missing
0:01:32.792719


20190124

SH finished
0:00:01.754306
0:02:51.590887
0:00:56.291386
1
2
3
4
5
6
7
8
0:06:50.013957
0:00:36.592121
0:00:02.847354
no massive missing
0:01:28.716628


20190125

SH finished
0:00:01.746328
0:01:23.540476
0:00:56.758137
1
2
3
4
5
6
7
8
0:07:06.320328
0:00:34.878678
0:00:02.726704
no massive missing
0:01:28.501204


20190128

SH finished
0:00:01.744333
0:01:25.088336
0:00:57.996823
1
2
3
4
5
6
7
8
0:07:03.020158
0:00:35.310521
0:00:02.770587
no massive missing
0:01:31.136154


20190129

SH finished
0:00:01.850050
0:01:14.363033
0:00:55.093287
1
2
3
4
5
6
7
8
0:06:43.588151


skey
1603351    0.0
Name: p1, dtype: float64

0:00:32.628697
0:00:02.527238
no massive missing
0:01:24.984613


20190130

SH finished
0:00:02.003640
0:01:28.300740
0:00:58.238122
1
2
3
4
5
6
7
8
0:07:02.933112
0:00:35.343434
0:00:02.770587
no massive missing
0:01:30.193676


20190131

SH finished
0:00:01.773255
0:01:23.282169
0:00:55.269120
1
2
3
4
5
6
7
8
0:06:51.419225
0:00:33.536242
0:00:02.878332
no massive missing
0:01:27.674383


20190201

SH finished
0:00:02.130300
0:02:00.508564
0:00:57.617836
1
2
3
4
5
6
7
8
0:07:05.437689
0:00:35.487050
0:00:03.186474
no massive missing
0:01:38.814608


20190211

SH finished
0:00:01.922855
0:01:42.670293
0:00:59.784040
1
2
3
4
5
6
7
8
0:07:27.034903
0:00:35.132000
0:00:02.747649
no massive missing
0:01:30.471930


20190212

SH finished
0:00:01.822125
0:01:56.153218
0:01:03.924961
1
2
3
4
5
6
7
8
0:07:59.077800
0:00:39.837410
0:00:02.994986
no massive missing
0:01:39.039009


20190213

SH finished
0:00:01.989676
0:01:29.686891
0:01:00.405274
1
2
3
4
5
6
7
8
0:07:25.293994
0:00:36.266964
0:00:02.853365
no massive missing
0:01:41.558268


20190214

SH finished
0:00:01.843068
0:01:36.244957
0:01:00.769336
1
2
3
4
5
6
7
8
0:07:23.224709
0:00:38.922858
0:00:03.047845
no massive missing
0:01:33.957604


20190215

SH finished
0:00:01.857031
0:01:33.661396
0:01:05.400014
1
2
3
4
5
6
7
8
0:08:13.430766
0:00:40.079315
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
           ID      date  d_open  d_high  d_low  d_close_x  d_yclose  \
1414  1600128  20190218     7.2    7.58   7.14       7.38      7.13   

      d_cumprodCAA  d_dayReturn  d_5dayReturn  d_ICDayReturn  d_CSIDayReturn  \
1414      0.982475     0.035063      0.080527       0.035148        0.035479   

        d_volume   d_amount_x    TORate  allZT  hasZT  isZT  allDT  hasDT  \
1414  21832880.0  160322361.0  0.088476    0.0    0.0   0.0    0.0    0.0   

      isDT  tmrHalted  haltedDays  marketShares  totalShares  d_close_y  \
1414   0.0        0.0         0.0   246767500.0  246767500.0        NaN   

      d_amount_y  auction  
1414         NaN      NaN  
0:00:03.144587
no massive missing
0:01:42.892731


20190218

SH finished
0:00:02.029570
0:02:16.211549
0:01:06.990758
1
2
3
4
5
6
7
8
0:08:03.163238
0:00:41.784200
0:00:03.104693
no massive missing
0:01:51.229393


20190219

SH finished
0:00:01.994662
0:01:42.065240
0:01:03.183944
1
2
3
4
5
6
7
8
0:07:46.588585
0:00:39.442467
0:00:03.175503
no massive missing
0:01:40.795310


20190220

SH finished
0:00:01.925847
0:02:24.963442
0:01:07.459503
1
2
3
4
5
6
7
8
0:08:23.742413
0:00:39.561150
0:00:03.117658
no massive missing
0:01:43.269408


20190221

SH finished
0:00:02.027875
0:01:32.186344
0:01:05.495758
1
2
3
4
5
6
7
8
0:08:04.226393


skey
1603956    0.0
Name: p1, dtype: float64

0:00:41.472036
0:00:03.275237
no massive missing
0:01:47.423575


20190222

SH finished
0:00:02.028572
0:01:44.864422
0:01:13.692826
1
2
3
4
5
6
7
8
0:09:05.816356


skey
1600522    0.0
Name: p1, dtype: float64

0:00:45.870268
0:00:03.446778
no massive missing
0:01:58.517891


20190225

SH finished
0:00:02.336748
0:01:51.258287
0:01:13.484383
1
2
3
4
5
6
7
8
0:09:10.411308
0:00:43.910511
0:00:03.521577
no massive missing
0:01:56.843375


20190226

SH finished
0:00:02.504469
0:01:41.661990
0:01:12.261654
1
2
3
4
5
6
7
8
0:08:39.843145


skey
1600604    0.0
Name: p1, dtype: float64

0:00:48.240926
0:00:04.591714
no massive missing
0:01:56.354678


20190227

SH finished
0:00:02.891265
0:01:39.541663
0:01:07.464491
1
2
3
4
5
6
7
8
0:08:13.216846
0:00:41.983669
0:00:03.482681
no massive missing
0:01:45.898655


20190228

SH finished
0:00:02.851371
0:01:39.689269
0:01:06.947872
1
2
3
4
5
6
7
8
0:08:19.092617


skey
1600928    0.0
Name: p1, dtype: float64

0:00:41.536862
0:00:03.076768
no massive missing
0:01:46.055908


20190301

SH finished
0:00:02.171202
0:01:42.598485
0:01:13.757051
1
2
3
4
5
6
7
8
0:09:07.608229
0:00:51.559048
0:00:03.394915
no massive missing
0:02:11.715580


20190304

SH finished
0:00:02.545189
0:02:07.564686
0:01:11.116717
1
2
3
4
5
6
7
8
0:08:45.782302
0:00:42.980003
0:00:03.352030
no massive missing
0:01:51.867685


20190305

SH finished
0:00:02.171190
0:02:03.175428
0:01:15.572344
1
2
3
4
5
6
7
8
0:09:09.936789


skey
1603032    0.0
Name: p1, dtype: float64

0:00:45.552090
0:00:03.465726
no massive missing
0:01:57.998693


20190306

SH finished
0:00:02.367665
0:01:50.374680
0:01:14.718082
1
2
3
4
5
6
7
8
0:09:16.494032
0:00:43.851669
0:00:03.375968
no massive missing
0:01:58.031145


20190307

SH finished
0:00:02.370657
0:01:53.918817
0:01:14.930220
1
2
3
4
5
6
7
8
0:09:07.262731


skey
1600635    0.0
1600909    0.0
1601311    0.0
Name: p1, dtype: float64

0:00:48.144184
0:00:03.494650
no massive missing
0:02:10.723234


20190308

SH finished
0:00:02.587078
0:02:28.309181
0:01:11.263326
1
2
3
4
5
6
7
8
0:08:35.571525


skey
1601228    0.0
Name: p1, dtype: float64

0:00:42.461389
0:00:03.312138
no massive missing
0:01:50.274946


20190311

SH finished
0:00:02.223051
0:01:58.866957
0:01:15.549857
1
2
3
4
5
6
7
8
0:09:08.459529
0:00:45.803447
0:00:03.517589
no massive missing
0:01:56.426486


20190312

SH finished
0:00:02.279901
0:02:05.282790
0:01:13.915231
1
2
3
4
5
6
7
8
0:09:00.523762
0:00:43.027874
0:00:03.345050
no massive missing
0:01:51.005990


20190313

SH finished
0:00:02.393596
0:01:53.958092
0:01:11.537592
1
2
3
4
5
6
7
8
0:08:26.159707
0:00:48.837330
0:00:03.610339
no massive missing
0:01:54.159553


20190314

SH finished
0:00:02.585084
0:02:19.115777
0:01:10.016662
1
2
3
4
5
6
7
8
0:08:28.447586
0:00:40.861669
0:00:03.322112
no massive missing
0:01:52.109038


20190315

SH finished
0:00:02.266934
0:02:05.984912
0:01:12.642635
1
2
3
4
5
6
7
8
0:08:51.212675
0:00:43.328070
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
          ID      date  d_open  d_high  d_low  d_close_x  d_yclose  \
197  1600328  20190318    8.54     8.6   8.44       8.55      8.46   

     d_cumprodCAA  d_dayReturn  d_5dayReturn  d_ICDayReturn  d_CSIDayReturn  \
197      0.979131     0.010638      0.046512        0.02653        0.024101   

       d_volume   d_amount_x    TORate  allZT  hasZT  isZT  allDT  hasDT  \
197  17237240.0  146753910.0  0.039352    0.0    0.0   0.0    0.0    0.0   

     isDT  tmrHalted  haltedDays  marketShares  totalShares  d_close_y  \
197   0.0        0.0         0.0   438031073.0  438031073.0        NaN   

     d_amount_y  auction  
197         NaN      NaN  
0:00:03.288202
no massive missing
0:01:52.407240


20190318

SH finished
0:00:02.339740
0:02:04.887847
0:01:11.294243
1
2
3
4
5
6
7
8
0:08:53.054746
0:00:44.681449
0:00:03.296182
no massive missing
0:01:53.790539


20190319

SH finished
0:00:02.303835
0:02:04.633527
0:01:11.711129
1
2
3
4
5
6
7
8
0:08:49.636491
0:00:46.953370
0:00:03.478693
no massive missing
0:02:01.256564


20190320

SH finished
0:00:02.552171
0:01:48.061866
0:01:13.649940
1
2
3
4
5
6
7
8
0:09:05.629102


skey
1600177    0.0
Name: p1, dtype: float64

0:00:48.452360
0:00:03.684142
no massive missing
0:01:54.679163


20190321

SH finished
0:00:02.225046
0:01:44.802588
0:01:13.941161
1
2
3
4
5
6
7
8
0:09:02.664036


skey
1601375    0.0
Name: p1, dtype: float64

0:00:42.884258
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
          ID      date  d_open  d_high  d_low  d_close_x  d_yclose  \
542  1603099  20190322   10.58   11.21  10.53       11.2     10.57   

     d_cumprodCAA  d_dayReturn  d_5dayReturn  d_ICDayReturn  d_CSIDayReturn  \
542      0.985802     0.059603      0.108911       0.005844        0.003991   

      d_volume  d_amount_x    TORate  allZT  hasZT  isZT  allDT  hasDT  isDT  \
542  7846042.0  85941553.0  0.029422    0.0    0.0   0.0    0.0    0.0   0.0   

     tmrHalted  haltedDays  marketShares  totalShares  d_close_y  d_amount_y  \
542        0.0         0.0   266670000.0  266670000.0        NaN         NaN   

     auction  
542      NaN  
0:00:03.265263
no massive missing
0:01:53.376647


20190322

SH finished
0:00:02.223052
0:01:45.600453
0:01:13.148283
1
2
3
4
5
6
7
8
0:08:50.506564


skey
1600855    0.0
1603700    0.0
Name: p1, dtype: float64

0:00:47.392196
0:00:03.709076
no massive missing
0:01:57.793828


20190325

SH finished
0:00:02.232028
0:03:19.450347
0:01:13.725737
1
2
3
4
5
6
7
8
0:08:49.400523


skey
1601989    0.0
1603681    0.0
Name: p1, dtype: float64

0:00:44.171813
0:00:03.407882
no massive missing
0:01:59.075399


20190326

SH finished
0:00:02.457425
0:01:44.806577
0:01:10.574170
1
2
3
4
5
6
7
8
0:08:31.598156
0:00:43.726006
0:00:03.563464
no massive missing
0:01:56.891244


20190327

SH finished
0:00:02.373649
0:01:48.096774
0:01:09.584817
1
2
3
4
5
6
7
8
0:08:30.672632
0:00:41.563791
0:00:03.387935
no massive missing
0:01:51.392955


20190328

SH finished
0:00:02.199116
0:01:42.069899
0:01:12.889973
1
2
3
4
5
6
7
8
0:08:58.468262
0:00:48.678754
0:00:04.214723
no massive missing
0:01:56.190120


20190329

SH finished
0:00:02.508288
0:01:57.635252
0:01:15.911888
1
2
3
4
5
6
7
8
0:09:12.687217
0:00:45.004585
0:00:03.501631
no massive missing
0:01:58.863965


20190401

SH finished
0:00:02.562144
0:02:10.698301
0:01:15.065154
1
2
3
4
5
6
7
8
0:09:14.077497
0:00:47.357290
0:00:03.956414
no massive missing
0:02:01.690403


20190402

SH finished
0:00:02.364672
0:01:45.819866
0:01:13.054533
1
2
3
4
5
6
7
8
0:09:08.733795
0:00:44.737300
0:00:03.298175
no massive missing
0:01:59.154189


20190403

SH finished
0:00:02.539206
0:01:49.830136
0:01:13.226073
1
2
3
4
5
6
7
8
0:09:08.546297
0:00:52.759835
0:00:03.719049
no massive missing
0:01:56.544172


20190404

SH finished
0:00:02.284887
0:02:08.296727
0:01:16.116342
1
2
3
4
5
6
7
8
0:09:15.823822
0:00:45.248931
0:00:03.564463
no massive missing
0:02:03.212330


20190408

SH finished
0:00:02.345724
0:01:55.920840
0:01:13.029600
1
2
3
4
5
6
7
8
0:08:46.140248


skey
1600422    0.0
Name: p1, dtype: float64

0:00:43.229334
0:00:03.269253
no massive missing
0:01:58.244621


20190409

SH finished
0:00:02.290870
0:01:42.009062
0:01:12.938843
1
2
3
4
5
6
7
8
0:08:50.127579


skey
1601949    0.0
Name: p1, dtype: float64

0:00:47.154831
0:00:03.403892
no massive missing
0:01:53.238018


20190410

SH finished
0:00:02.252972
0:01:39.034023
0:01:11.191518
1
2
3
4
5
6
7
8
0:08:47.780857
0:00:41.823097
0:00:03.291193
no massive missing
0:01:47.286941


20190411

SH finished
0:00:02.139276
0:01:34.551045
0:01:09.582795
1
2
3
4
5
6
7
8
0:08:23.230545
0:00:41.719374
0:00:03.109680
no massive missing
0:01:56.754609


20190412

SH finished
0:00:02.101377
0:03:31.989795
0:01:09.993724
1
2
3
4
5
6
7
8
0:08:32.453866


skey
1603606    0.0
Name: p1, dtype: float64

0:00:41.394244
0:00:03.177499
no massive missing
0:01:46.925906


20190415

SH finished
0:00:02.146257
0:01:36.120817
0:01:12.034263
1
2
3
4
5
6
7
8
0:08:37.777622


skey
1603317    0.0
Name: p1, dtype: float64

0:00:43.308124
0:00:03.305157
no massive missing
0:01:47.755686


20190416

SH finished
0:00:02.200113
0:01:53.250983
0:01:12.196828
1
2
3
4
5
6
7
8
0:08:45.850024
0:00:40.984341
0:00:03.189467
no massive missing
0:01:49.544898


20190417

SH finished
0:00:02.166204
0:02:03.081680
0:01:09.209820
1
2
3
4
5
6
7
8
0:08:31.271031
0:00:44.519882
0:00:03.147578
no massive missing
0:01:49.146964


20190418

SH finished
0:00:02.127308
0:02:45.398456
0:01:08.476781
1
2
3
4
5
6
7
8
0:08:21.972910


skey
1600758    0.0
Name: p1, dtype: float64

0:00:42.722690
0:00:04.006281
no massive missing
0:01:52.173892


20190419

SH finished
0:00:02.229009
0:01:35.806658
0:01:09.797248
1
2
3
4
5
6
7
8
0:08:23.786059
0:00:44.188767
0:00:03.291195
no massive missing
0:01:52.328451


20190422

SH finished
0:00:02.181164
0:01:32.713932
0:01:09.213809
1
2
3
4
5
6
7
8
0:08:31.210195
0:00:41.568778
0:00:03.094719
no massive missing
0:01:48.733072


20190423

SH finished
0:00:02.130300
0:01:30.948656
0:01:07.399664
1
2
3
4
5
6
7
8
0:08:13.279170
0:00:43.442764
0:00:03.819780
no massive missing
0:01:52.468077


20190424

SH finished
0:00:02.134290
0:02:15.615146
0:01:08.386024
1
2
3
4
5
6
7
8
0:08:24.537049
0:00:45.085368
0:00:03.237339
no massive missing
0:01:47.149308


20190425

SH finished
0:00:02.170194
0:01:31.191007
0:01:09.115073
1
2
3
4
5
6
7
8
0:08:09.296826
0:00:39.770588
0:00:03.077766
no massive missing
0:01:44.618081


20190426

SH finished
0:00:02.052508
0:02:09.128473
0:01:07.797598
1
2
3
4
5
6
7
8
0:08:19.692013
0:00:41.324431
0:00:03.136607
no massive missing
0:01:52.945801


20190429

SH finished
0:00:02.223052
0:01:39.212545
0:01:04.071568
1
2
3
4
5
6
7
8
0:07:51.832555
0:00:38.129977
0:00:03.012939
no massive missing
0:01:39.067932


20190430

SH finished
0:00:01.934822
0:01:45.654309
0:01:08.438884
1
2
3
4
5
6
7
8
0:08:08.016252


skey
1603858    0.0
Name: p1, dtype: float64

0:00:46.798784
0:00:03.489664
no massive missing
0:01:53.890272


20190506

SH finished
0:00:03.368986
0:02:15.406703
0:01:05.510718
1
2
3
4
5
6
7
8
0:07:57.550256
0:00:39.095395
0:00:03.095717
no massive missing
0:01:41.615117


20190507

SH finished
0:00:02.152240
0:01:55.549833
0:01:06.129062
1
2
3
4
5
6
7
8
0:07:53.694572
0:00:41.304485
0:00:03.195450
no massive missing
0:01:43.184916


20190508

SH finished
0:00:02.030567
0:01:39.165670
0:01:02.358153
1
2
3
4
5
6
7
8
0:07:37.383106
0:00:36.428530
0:00:02.914203
no massive missing
0:01:33.840917


20190509

SH finished
0:00:01.887949
0:02:22.240418
0:01:07.771668
1
2
3
4
5
6
7
8
0:08:07.929484
0:00:41.022240
0:00:03.413866
no massive missing
0:01:52.889948


20190510

SH finished
0:00:02.085421
0:03:11.833726
0:01:02.459880
1
2
3
4
5
6
7
8
0:07:33.062775
0:00:38.450122
0:00:02.996981
no massive missing
0:01:39.868788


20190513

SH finished
0:00:01.908894
0:01:45.627381
0:01:02.642392
1
2
3
4
5
6
7
8
0:07:38.017518
0:00:38.554841
0:00:03.246314
no massive missing
0:01:39.049980


20190514

SH finished
0:00:02.019596
0:02:12.311983
0:01:03.961862
1
2
3
4
5
6
7
8
0:07:55.853794
0:00:37.078792
0:00:03.089732
no massive missing
0:01:38.427672


20190515

SH finished
0:00:02.056471
0:01:52.907902
0:01:03.993776
1
2
3
4
5
6
7
8
0:07:52.946965


skey
1600989    0.0
Name: p1, dtype: float64

0:00:40.672177
0:00:03.447775
no massive missing
0:01:46.692531


20190516

SH finished
0:00:01.949783
0:03:16.307755
0:01:06.635707
1
2
3
4
5
6
7
8
0:08:05.781233
0:00:39.832423
0:00:03.043856
no massive missing
0:01:45.917604


20190517

SH finished
0:00:02.093399
0:02:07.551720
0:01:01.845525
1
2
3
4
5
6
7
8
0:07:24.375020
0:00:40.600368
0:00:03.055824
no massive missing
0:01:36.308315


20190520

SH finished
0:00:01.876978
0:01:40.969842
0:01:02.125774
1
2
3
4
5
6
7
8
0:07:42.751850
0:00:39.119332
0:00:03.383946
no massive missing
0:01:34.009465


20190521

SH finished
0:00:01.913880
0:01:40.193920
0:01:01.988142
1
2
3
4
5
6
7
8
0:07:20.207172


skey
1600082    0.0
1603982    0.0
Name: p1, dtype: float64

0:00:36.423544
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
          ID      date  d_open  d_high  d_low  d_close_x  d_yclose  \
493  1600082  20190522     4.7    4.77   4.56       4.63      4.73   

     d_cumprodCAA  d_dayReturn  d_5dayReturn  d_ICDayReturn  d_CSIDayReturn  \
493      0.997696    -0.021142     -0.021142      -0.006683       -0.004174   

      d_volume  d_amount_x    TORate  allZT  hasZT  isZT  allDT  hasDT  isDT  \
493  5296386.0  24730844.0  0.008356    0.0    0.0   0.0    0.0    0.0   0.0   

     tmrHalted  haltedDays  marketShares  totalShares  d_close_y  d_amount_y  \
493        0.0         0.0   633852202.0  646115826.0        NaN         NaN   

     auction  
493      NaN  
0:00:02.797515
no massive missing
0:01:44.023672


20190522

SH finished
0:00:02.071456
0:03:01.072518
0:01:00.728513
1
2
3
4
5
6
7
8
0:07:21.596454


skey
1603327    0.0
Name: p1, dtype: float64

0:00:36.845416
0:00:03.470714
no massive missing
0:01:39.727167


20190523

SH finished
0:00:01.916871
0:01:21.183783
0:00:57.832262
1
2
3
4
5
6
7
8
0:07:11.546820
0:00:36.427534
0:00:02.738672
no massive missing
0:01:35.647085


20190524

SH finished
0:00:01.880967
0:01:25.839326
0:01:02.653363
1
2
3
4
5
6
7
8
0:07:37.327580
0:00:42.130831
0:00:03.357026
no massive missing
0:01:47.894307


20190527

SH finished
0:00:01.889943
0:02:25.041922
0:01:02.441928
1
2
3
4
5
6
7
8
0:07:38.661795
0:00:38.428180
0:00:02.966064
no massive missing
0:01:37.289690


20190528

SH finished
0:00:01.940807
0:01:33.471905
0:01:00.158039
1
2
3
4
5
6
7
8
0:07:26.615502
0:00:37.446807
0:00:02.834416
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:36.950597


20190529

SH finished
0:00:01.901911
0:01:35.895420
0:01:01.238149
1
2
3
4
5
6
7
8
0:07:42.197335


skey
1603189    0.0
Name: p1, dtype: float64

0:00:36.941160
0:00:02.838404
no massive missing
0:01:34.069307


20190530

SH finished
0:00:01.973718
0:01:28.986904
0:01:01.764741
1
2
3
4
5
6
7
8
0:07:29.262941
0:00:37.028925
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
          ID      date  d_open  d_high  d_low  d_close_x  d_yclose  \
179  1600054  20190531    9.09     9.2   9.07       9.12      9.09   

     d_cumprodCAA  d_dayReturn  d_5dayReturn  d_ICDayReturn  d_CSIDayReturn  \
179      0.640882       0.0033      0.012209      -0.002924       -0.001448   

      d_volume  d_amount_x    TORate  allZT  hasZT  isZT  allDT  hasDT  isDT  \
179  1465362.0  13399973.0  0.002855    0.0    0.0   0.0    0.0    0.0   0.0   

     tmrHalted  haltedDays  marketShares  totalShares  d_close_y  d_amount_y  \
179        0.0         0.0   513300000.0  729379440.0        NaN         NaN   

     auction  
179      NaN  
0:00:02.908218
no massive missing
0:01:41.924290


20190531

SH finished
0:00:01.879969
0:02:09.483551
0:01:02.738136
1
2
3
4
5
6
7
8
0:07:34.431625
0:00:36.986039
0:00:02.932155
no massive missing
0:01:41.514385


20190603

SH finished
0:00:01.954770
0:01:26.088107
0:01:02.340201
1
2
3
4
5
6
7
8
0:07:31.884204
0:00:39.015609
0:00:02.856357
no massive missing
0:01:37.187962


20190604

SH finished
0:00:01.882962
0:01:25.631881
0:01:01.119936
1
2
3
4
5
6
7
8
0:07:32.471852
0:00:39.266937
0:00:02.840400
no massive missing
0:01:33.001783


20190605

SH finished
0:00:01.848055
0:02:06.787764
0:01:01.184294
1
2
3
4
5
6
7
8
0:07:20.940629
0:00:36.914231
0:00:02.854363
no massive missing
0:01:35.079603


20190606

SH finished
0:00:01.919863
0:01:43.550937
0:00:59.796008
1
2
3
4
5
6
7
8
0:07:14.464537
0:00:35.359391
0:00:02.780560
no massive missing
0:01:34.706601


20190610

SH finished
0:00:01.887948
0:01:32.590263
0:01:07.282977
1
2
3
4
5
6
7
8
0:08:14.547284
0:00:48.356615
0:00:03.422842
no massive missing
0:01:42.099819


20190611

SH finished
0:00:02.050515
0:01:55.595710
0:01:02.925634
1
2
3
4
5
6
7
8
0:07:37.054192
0:00:38.246361
0:00:03.014933
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:38.626621


20190612

SH finished
0:00:01.952775
0:01:32.743852
0:01:02.403033
1
2
3
4
5
6
7
8
0:07:39.679988
0:00:40.913530
0:00:02.984016
no massive missing
0:01:41.880407


20190613

SH finished
0:00:02.156231
0:01:28.103269
0:01:03.122109
1
2
3
4
5
6
7
8
0:07:45.497966


skey
1603915    0.0
Name: p1, dtype: float64

0:00:39.713741
0:00:02.931157
no massive missing
0:01:36.287601


20190614

SH finished
0:00:01.945794
0:01:39.179633
0:00:59.830916
1
2
3
4
5
6
7
8
0:07:14.157358
0:00:35.771290
0:00:02.837408
no massive missing
0:01:33.457942


20190617

SH finished
0:00:02.159222
0:02:09.359881
0:01:00.604845
1
2
3
4
5
6
7
8
0:07:11.020750
0:00:37.320146
0:00:02.872315
no massive missing
0:01:33.194646


20190618

SH finished
0:00:01.812151
0:01:28.725604
0:01:03.924382
1
2
3
4
5
6
7
8
0:07:49.158888
0:00:37.300199
0:00:02.896251
no massive missing
0:01:35.833585


20190619

SH finished
0:00:01.891937
0:01:42.111788
0:01:08.579506
1
2
3
4
5
6
7
8
0:08:19.340077


skey
1603217    0.0
Name: p1, dtype: float64

0:00:45.110302
0:00:03.071781
no massive missing
0:01:55.835069


20190620

SH finished
0:00:02.768593
0:02:32.907877
0:01:10.131322
1
2
3
4
5
6
7
8
0:08:17.738523


skey
1603863    0.0
Name: p1, dtype: float64

0:00:40.683148
0:00:03.134613
no massive missing
0:01:44.508879


20190621

SH finished
0:00:02.074450
0:01:27.073025
0:01:05.882118
1
2
3
4
5
6
7
8
0:07:51.236149
0:00:40.183616
0:00:03.106687
no massive missing
0:01:41.524359


20190624

SH finished
0:00:01.973720
0:01:32.505491
0:01:04.874420
1
2
3
4
5
6
7
8
0:07:58.537615
0:00:46.285170
0:00:03.063803
no massive missing
0:01:40.938926


20190625

SH finished
0:00:02.156230
0:01:57.803802
0:01:01.314943
1
2
3
4
5
6
7
8
0:07:24.438357


skey
1600968    0.0
Name: p1, dtype: float64

0:00:36.204131
0:00:02.889270
no massive missing
0:01:34.963912


20190626

SH finished
0:00:01.935820
0:02:12.982238
0:01:03.589857
1
2
3
4
5
6
7
8
0:07:40.317364
0:00:38.640613
0:00:02.953099
no massive missing
0:01:36.702261


20190627

SH finished
0:00:01.882961
0:01:31.490207
0:01:03.249767
1
2
3
4
5
6
7
8
0:07:40.663900


skey
1601698    0.0
Name: p1, dtype: float64

0:00:41.277557
0:00:03.599369
no massive missing
0:01:38.020737


20190628

SH finished
0:00:02.749643
0:01:39.296321
0:01:06.676692
1
2
3
4
5
6
7
8
0:08:09.986156
0:00:41.613658
0:00:03.410874
no massive missing
0:01:47.591127


20190701

SH finished
0:00:02.003638
0:02:28.957446
0:01:05.360121
1
2
3
4
5
6
7
8
0:07:55.809384


skey
1600239    0.0
Name: p1, dtype: float64

0:00:42.894239
0:00:03.005949
no massive missing
0:01:40.822238


20190702

SH finished
0:00:01.980700
0:01:39.610480
0:01:02.630424
1
2
3
4
5
6
7
8
0:07:48.723338
0:00:39.090408
0:00:02.990001
no massive missing
0:01:43.571881


20190703

SH finished
0:00:01.922855
0:01:51.717087
0:01:01.413680
1
2
3
4
5
6
7
8
0:07:37.955263
0:00:36.720730
0:00:02.893258
no massive missing
0:01:34.588917


20190704

SH finished
0:00:01.861020
0:01:29.858572
0:01:00.368477
1
2
3
4
5
6
7
8
0:07:26.231054


skey
1601236    0.0
Name: p1, dtype: float64

0:00:40.299175
0:00:02.766597
no massive missing
0:01:40.021381


20190705

SH finished
0:00:02.083426
0:02:19.010062
0:01:04.157338
1
2
3
4
5
6
7
8
0:07:48.021751
0:00:39.460449
0:00:03.357016
no massive missing
0:01:40.267693


20190708

SH finished
0:00:02.430496
0:01:21.613657
0:01:00.240794
1
2
3
4
5
6
7
8
0:07:17.108275
0:00:34.920565
0:00:02.732689
no massive missing
0:01:32.618546


20190709

SH finished
0:00:01.876978
0:01:56.593040
0:00:58.328934
1
2
3
4
5
6
7
8
0:07:12.700257
0:00:36.822477
0:00:02.729696
no massive missing
0:01:28.144159


20190710

SH finished
0:00:01.773256
0:01:21.478993
0:00:58.662005
1
2
3
4
5
6
7
8
0:07:12.199588
0:00:34.948491
0:00:02.747649
no massive missing
0:01:34.051354


20190711

SH finished
0:00:01.910887
0:01:31.180036
0:00:57.710588
1
2
3
4
5
6
7
8
0:07:04.410438
0:00:34.731072
0:00:02.624977
no massive missing
0:01:31.903102


20190712

SH finished
0:00:01.807165
0:01:40.387401
0:01:02.714201
1
2
3
4
5
6
7
8
0:07:44.456123
0:00:37.267287
0:00:02.914202
no massive missing
0:01:35.539372


20190715

SH finished
0:00:02.054504
0:01:36.507782
0:00:58.786708
1
2
3
4
5
6
7
8
0:07:18.439714


skey
1603236    0.0
Name: p1, dtype: float64

0:00:37.288551
0:00:02.858352
no massive missing
0:01:46.151088


20190716

SH finished
0:00:02.247985
0:01:33.283607
0:00:59.519141
1
2
3
4
5
6
7
8
0:07:15.888078
0:00:35.173561
0:00:02.788540
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:33.317291


20190717

SH finished
0:00:01.859026
0:02:03.153487
0:00:58.252139
1
2
3
4
5
6
7
8
0:07:13.382432
0:00:33.659938
0:00:02.769590
no massive missing
0:01:31.030437


20190718

SH finished
0:00:01.935821
0:01:26.282142
0:00:58.814633
1
2
3
4
5
6
7
8
0:07:08.280366


skey
1603256    0.0
Name: p1, dtype: float64

0:00:39.767596
0:00:03.154560
no massive missing
0:01:38.556301


20190719

SH finished
0:00:01.764280
0:02:20.688571
0:01:02.251111
1
2
3
4
5
6
7
8
0:07:30.034927


skey
1688001    0.0
1688002    0.0
1688003    0.0
1688005    0.0
1688006    0.0
1688007    0.0
1688009    0.0
1688010    0.0
1688011    0.0
1688012    0.0
1688015    0.0
1688016    0.0
1688018    0.0
1688019    0.0
1688022    0.0
1688028    0.0
1688033    0.0
1688066    0.0
1688088    0.0
1688122    0.0
1688333    0.0
1688388    0.0
Name: p1, dtype: float64

0:00:37.601393
0:00:02.891263
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
29
0:01:41.136398


20190722

SH finished
0:00:01.990673
0:01:31.700116
0:00:58.451605
1
2
3
4
5
6
7
8
0:07:07.736843


skey
1688010    0.0
Name: p1, dtype: float64

0:00:36.964098
0:00:02.842394
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:32.575304


20190723

SH finished
0:00:01.765277
0:01:28.931037
0:01:00.771398
1
2
3
4
5
6
7
8
0:07:33.964362


skey
1688010    0.0
1688028    0.0
Name: p1, dtype: float64

0:00:36.320820
0:00:02.882287
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
2
0:01:37.476792


20190724

SH finished
0:00:01.812151
0:01:22.457374
0:00:59.429988
1
2
3
4
5
6
7
8
0:07:19.084265
0:00:35.478074
0:00:02.831424
no massive missing
0:01:42.462848


20190725

SH finished
0:00:01.831100
0:01:26.155479
0:00:58.456593
1
2
3
4
5
6
7
8
0:07:11.673588


skey
1603687    0.0
Name: p1, dtype: float64

0:00:34.468747
0:00:02.765600
no massive missing
0:01:33.390415


20190726

SH finished
0:00:01.836088
0:01:23.472659
0:00:57.985851
1
2
3
4
5
6
7
8
0:07:08.899427
0:00:34.565515
0:00:02.722715
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:30.314353


20190729

SH finished
0:00:01.749320
0:01:26.795767
0:00:59.455919
1
2
3
4
5
6
7
8
0:07:33.161673


skey
1603613    0.0
Name: p1, dtype: float64

0:00:53.500866
0:00:04.206744
no massive missing
0:02:02.872240


20190730

SH finished
0:00:02.619990
0:01:25.445670
0:00:58.799646
1
2
3
4
5
6
7
8
0:07:15.458876
0:00:35.751343
0:00:02.816464
no massive missing
0:01:40.435273


20190731

SH finished
0:00:02.157228
0:01:32.727896
0:01:00.862155
1
2
3
4
5
6
7
8
0:07:21.272322
0:00:34.840779
0:00:02.803498
no massive missing
0:01:33.756144


20190801

SH finished
0:00:01.903906
0:01:29.774797
0:01:02.568589
1
2
3
4
5
6
7
8
0:07:43.035094
0:00:37.798865
0:00:03.561469
no massive missing
0:01:43.167962


20190802

SH finished
0:00:02.338742
0:01:28.054399
0:01:00.723527
1
2
3
4
5
6
7
8
0:07:33.751930
0:00:37.231384
0:00:03.887598
no massive missing
0:01:37.685630


20190805

SH finished
0:00:01.958759
0:01:27.761184
0:01:05.198554
1
2
3
4
5
6
7
8
0:07:53.562924
0:00:40.012940
0:00:03.118656
no massive missing
0:01:51.460773


20190806

SH finished
0:00:02.636945
0:01:27.026151
0:00:59.647406
1
2
3
4
5
6
7
8
0:07:16.480144
0:00:34.579478
0:00:02.769590
no massive missing
0:01:35.577271


20190807

SH finished
0:00:01.956763
0:01:19.364650
0:00:58.362842
1
2
3
4
5
6
7
8
0:07:12.438955
0:00:34.583468
0:00:02.790533
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:33.142785


20190808

SH finished
0:00:01.751314
0:01:29.215293
0:00:58.678997
1
2
3
4
5
6
7
8
0:07:09.977543
0:00:39.536215
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
          ID      date  d_open  d_high  d_low  d_close_x  d_yclose  \
854  1603997  20190809    7.39     7.5   7.25       7.33      7.35   

     d_cumprodCAA  d_dayReturn  d_5dayReturn  d_ICDayReturn  d_CSIDayReturn  \
854      0.616824    -0.002721     -0.039318      -0.011973       -0.010874   

     d_volume  d_amount_x    TORate  allZT  hasZT  isZT  allDT  hasDT  isDT  \
854  945500.0   6924395.0  0.001497    0.0    0.0   0.0    0.0    0.0   0.0   

     tmrHalted  haltedDays  marketShares  totalShares  d_close_y  d_amount_y  \
854        0.0         0.0   631503350.0  639413200.0        NaN         NaN   

     auction  
854      NaN  
0:00:03.079760
no massive missing
0:01:34.528079


20190809

SH finished
0:00:01.926844
0:02:04.688381
0:00:58.202272
1
2
3
4
5
6
7
8
0:07:00.678423
0:00:35.601743
0:00:02.753633
no massive missing
0:01:32.765825


20190812

SH finished
0:00:02.048487
0:01:23.404841
0:00:57.829270
1
2
3
4
5
6
7
8
0:07:02.229274
0:00:34.792907
0:00:03.329093
no massive missing
0:01:34.846226


20190813

SH finished
0:00:01.784227
0:01:22.525194
0:00:57.967900
1
2
3
4
5
6
7
8
0:07:14.979159
0:00:41.391253
0:00:02.829429
no massive missing
0:01:33.384139


20190814

SH finished
0:00:01.774253
0:01:26.707005
0:01:00.998790
1
2
3
4
5
6
7
8
0:07:27.609397
0:00:38.900915
0:00:02.930160
no massive missing
0:01:43.408329


20190815

SH finished
0:00:02.204103
0:01:34.107205
0:01:02.646094
1
2
3
4
5
6
7
8
0:07:29.496824
0:00:35.856062
0:00:02.852398
no massive missing
0:01:34.029383


20190816

SH finished
0:00:01.909890
0:01:35.469560
0:01:06.729457
1
2
3
4
5
6
7
8
0:08:11.879915
0:00:40.026902
0:00:03.347045
no massive missing
0:01:43.717519


20190819

SH finished
0:00:02.387575
0:01:36.460907
0:01:03.935932
1
2
3
4
5
6
7
8
0:07:57.339819
0:00:37.301195
0:00:02.928166
no massive missing
0:01:37.653715


20190820

SH finished
0:00:01.934823
0:01:27.204673
0:01:02.522712
1
2
3
4
5
6
7
8
0:07:40.655460
0:00:42.070435
0:00:03.180491
no massive missing
0:01:34.250821


20190821

SH finished
0:00:01.895926
0:01:32.356888
0:01:02.652365
1
2
3
4
5
6
7
8
0:07:33.950400
0:00:38.111030
0:00:03.519583
no massive missing
0:01:41.780673


20190822

SH finished
0:00:02.181164
0:01:26.121572
0:01:02.540664
1
2
3
4
5
6
7
8
0:07:29.860344
0:00:36.233053
0:00:02.881291
no massive missing
0:01:32.893452


20190823

SH finished
0:00:01.950780
0:01:23.772855
0:01:01.570261
1
2
3
4
5
6
7
8
0:07:36.790801
0:00:38.112025
0:00:02.960081
no massive missing
0:01:39.520720


20190826

SH finished
0:00:01.966737
0:01:31.446324
0:01:06.052269
1
2
3
4
5
6
7
8
0:08:08.841045
0:00:45.139225
0:00:03.131620
no massive missing
0:01:40.922969


20190827

SH finished
0:00:02.011618
0:01:29.171410
0:01:04.232139
1
2
3
4
5
6
7
8
0:07:49.601523
0:00:40.472710
0:00:04.347369
no massive missing
0:01:38.506434


20190828

SH finished
0:00:02.012615
0:01:28.605924
0:01:04.844500
1
2
3
4
5
6
7
8
0:07:46.585600
0:00:40.511598
0:00:03.139600
no massive missing
0:01:38.390744


20190829

SH finished
0:00:01.936817
0:01:27.859920
0:01:06.318556
1
2
3
4
5
6
7
8
0:08:06.044529
0:00:38.227717
0:00:03.024906
no massive missing
0:01:37.883101


20190830

SH finished
0:00:02.002642
0:01:33.026098
0:01:06.948869
1
2
3
4
5
6
7
8
0:08:09.364645
0:00:38.804174
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
          ID      date  d_open  d_high  d_low  d_close_x  d_yclose  \
483  1601881  20190902    10.5   10.87  10.47      10.75      10.5   

     d_cumprodCAA  d_dayReturn  d_5dayReturn  d_ICDayReturn  d_CSIDayReturn  \
483      0.963331      0.02381      0.024786       0.024926        0.025965   

       d_volume   d_amount_x    TORate  allZT  hasZT  isZT  allDT  hasDT  \
483  41064656.0  440405540.0  0.033426    0.0    0.0   0.0    0.0    0.0   

     isDT  tmrHalted  haltedDays  marketShares   totalShares  d_close_y  \
483   0.0        0.0         0.0  1.228531e+09  1.013726e+10        NaN   

     d_amount_y  auction  
483         NaN      NaN  
0:00:03.025905
no massive missing
0:01:52.445138


20190902

SH finished
0:00:02.024583
0:02:10.401096
0:01:03.774364
1
2
3
4
5
6
7
8
0:07:47.623815
0:00:38.693470
0:00:03.025905
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:42.720159


20190903

SH finished
0:00:02.478370
0:01:31.560020
0:01:05.844823
1
2
3
4
5
6
7
8
0:08:08.954742
0:00:41.728350
0:00:03.078763
no massive missing
0:01:39.915663


20190904

SH finished
0:00:01.988679
0:01:37.321604
0:01:11.356078
1
2
3
4
5
6
7
8
0:08:47.439770
0:00:40.817787
0:00:03.448772
no massive missing
0:01:51.106721


20190905

SH finished
0:00:02.162214
0:01:30.709269
0:01:05.611447
1
2
3
4
5
6
7
8
0:08:16.198360
0:00:42.608994
0:00:03.188469
no massive missing
0:01:43.608782


20190906

SH finished
0:00:02.014610
0:01:34.016447
0:01:07.913289
1
2
3
4
5
6
7
8
0:08:24.196958


skey
1603927    0.0
Name: p1, dtype: float64

0:00:40.534545
0:00:03.178496
no massive missing
0:01:53.815473


20190909

SH finished
0:00:02.328770
0:01:39.861808
0:01:07.840484
1
2
3
4
5
6
7
8
0:08:14.199708
0:00:38.842073
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
         ID      date  d_open  d_high  d_low  d_close_x  d_yclose  \
56  1600612  20190910    51.0    52.0  50.63      51.68     52.06   

    d_cumprodCAA  d_dayReturn  d_5dayReturn  d_ICDayReturn  d_CSIDayReturn  \
56      0.911015    -0.007299     -0.037616      -0.003449        0.002202   

     d_volume   d_amount_x    TORate  allZT  hasZT  isZT  allDT  hasDT  isDT  \
56  2992201.0  153585949.0  0.009436    0.0    0.0   0.0    0.0    0.0   0.0   

    tmrHalted  haltedDays  marketShares  totalShares  d_close_y  d_amount_y  \
56        0.0         0.0   317109630.0  523117764.0        NaN         NaN   

    auction  
56      NaN  
0:00:03.129627
no massive missing
0:01:45.518671


20190910

SH finished
0:00:02.101378
0:01:47.574171
0:01:06.824204
1
2
3
4
5
6
7
8
0:08:13.822717
0:00:42.575084
0:00:03.043856
no massive missing
0:01:50.194161


20190911

SH finished
0:00:02.088413
0:01:46.304569
0:01:04.266048
1
2
3
4
5
6
7
8
0:08:07.306152
0:00:39.447454
0:00:03.070783
no massive missing
0:01:38.892401


20190912

SH finished
0:00:01.952776
0:01:45.961487
0:01:05.972482
1
2
3
4
5
6
7
8
0:08:06.650905
0:00:49.579345
0:00:03.404889
no massive missing
0:01:43.429263


20190916

SH finished
0:00:02.240006
0:02:20.829194
0:01:09.043266
1
2
3
4
5
6
7
8
0:08:16.393837
0:00:40.393921
0:00:03.161541
no massive missing
0:01:46.269663


20190917

SH finished
0:00:02.102374
0:01:43.259715
0:01:04.979141
1
2
3
4
5
6
7
8
0:07:51.747781
0:00:39.514274
0:00:03.786868
no massive missing
0:01:45.564549


20190918

SH finished
0:00:02.016605
0:01:40.588862
0:01:03.448236
1
2
3
4
5
6
7
8
0:07:57.859428
0:00:37.554518
0:00:02.945120
no massive missing
0:01:36.606516


20190919

SH finished
0:00:01.918866
0:01:47.584145
0:01:03.953883
1
2
3
4
5
6
7
8
0:07:51.698912
0:00:38.649589
0:00:02.990996
no massive missing
0:01:45.490749


20190920

SH finished
0:00:03.239332
0:01:47.015667
0:01:04.497428
1
2
3
4
5
6
7
8
0:07:50.618802
0:00:40.611339
0:00:03.173509
no massive missing
0:01:37.534036


20190923

SH finished
0:00:01.930833
0:01:47.049576
0:01:04.592176
1
2
3
4
5
6
7
8
0:08:00.178223
0:00:38.120006
0:00:02.941131
no massive missing
0:01:39.229499


20190924

SH finished
0:00:01.925847
0:01:45.183569
0:01:05.474814
1
2
3
4
5
6
7
8
0:07:47.118168
0:00:38.286559
0:00:02.974043
no massive missing
0:01:43.923938


20190925

SH finished
0:00:01.912882
0:01:51.120682
0:01:06.239768
1
2
3
4
5
6
7
8
0:07:56.659638
0:00:37.904582
0:00:02.989002
no massive missing
0:01:40.053295


20190926

SH finished
0:00:02.063479
0:01:43.382387
0:01:01.693930
1
2
3
4
5
6
7
8
0:07:30.972368
0:00:35.578805
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
          ID      date  d_open  d_high  d_low  d_close_x  d_yclose  \
165  1603363  20190927   17.21   18.22   16.8      17.99     17.19   

     d_cumprodCAA  d_dayReturn  d_5dayReturn  d_ICDayReturn  d_CSIDayReturn  \
165      0.997205     0.046539     -0.160523        0.00712        0.009031   

       d_volume   d_amount_x    TORate  allZT  hasZT  isZT  allDT  hasDT  \
165  11649255.0  208271297.0  0.063314    0.0    0.0   0.0    0.0    0.0   

     isDT  tmrHalted  haltedDays  marketShares  totalShares  d_close_y  \
165   0.0        0.0         0.0   183992944.0  434205750.0        NaN   

     d_amount_y  auction  
165         NaN      NaN  
0:00:02.791558
no massive missing
0:01:33.892751


20190927

SH finished
0:00:02.012615
0:01:41.788653
0:00:59.028062
1
2
3
4
5
6
7
8
0:07:03.951666


skey
1688036    0.0
Name: p1, dtype: float64

0:00:39.367666
0:00:03.467722
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:33.219579


20190930

SH finished
0:00:01.786220


In [2]:
import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = r'\\192.168.10.30\Kevin_zhenyu\day_stock_20200424\***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SH' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

year = "2019"
startDate = '20190102'
endDate = '20190930'
readPath = 'G:\\KR\\' + year + '\\SH\\***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

for data in dataPathLs:
    
    readPath = data + '\\snapshot\\***2\\***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[((dateLs >= 600000) & (dateLs <= 700000))]
    SH = []
    ll = []
    startTm = datetime.datetime.now()
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, usecols = [0,1,3,5,7,9,10,11,15,17,18,19,20,21,22,23,25,26,28,29,30,31,32,33,37,39,40,41,
                                          42,46,47,49,50])
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["StockID"] = int(os.path.basename(i).split('.')[0])
        SH += [df]
    del df
    SH = pd.concat(SH).reset_index(drop=True)
    print(datetime.datetime.now() - startTm)
    
    startTm = datetime.datetime.now()
    SH["skey"] = SH["StockID"] + 1000000
    SH.drop(["StockID"],axis=1,inplace=True)
    SH["date"] = int(SH["QuotTime"].iloc[0]//1000000000)
    SH["time"] = (SH['QuotTime'] - int(SH['QuotTime'].iloc[0]//1000000000*1000000000)).astype(np.int64) * 1000
    SH["clockAtArrival"] = SH["QuotTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH.drop(["QuotTime"],axis=1,inplace=True)
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    print(datetime.datetime.now() - startTm)
    
    
#     startTm = datetime.datetime.now()

#     BidPrice = np.array([i[1:-1].split(',') for i in SH['BidPrice'].values])
#     SH.drop(["BidPrice"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["bid%sp"%i] = BidPrice[:, i-1]
#     del BidPrice
#     print("1")
    
#     OfferPrice = np.array([i[1:-1].split(',') for i in SH['OfferPrice'].values])
#     SH.drop(["OfferPrice"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["ask%sp"%i] = OfferPrice[:, i-1]
#     del OfferPrice
#     print("2")
    
#     BidOrderQty = np.array([i[1:-1].split(',') for i in SH['BidOrderQty'].values]).astype(np.int64)
#     SH.drop(["BidOrderQty"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["bid%sq"%i] = BidOrderQty[:, i-1]
#     del BidOrderQty
#     print("3")
    
#     OfferOrderQty = np.array([i[1:-1].split(',') for i in SH['OfferOrderQty'].values]).astype(np.int64)
#     SH.drop(["OfferOrderQty"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["ask%sq"%i] = OfferOrderQty[:, i-1]
#     del OfferOrderQty
#     print("4")
    
#     BidNumOrders = np.array([i[1:-1].split(',') for i in SH['BidNumOrders'].values]).astype(np.int32)
#     SH.drop(["BidNumOrders"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["bid%sn"%i] = BidNumOrders[:, i-1]
#     del BidNumOrders
#     print("5")
    
#     OfferNumOrders = np.array([i[1:-1].split(',') for i in SH['OfferNumOrders'].values]).astype(np.int32)
#     SH.drop(["OfferNumOrders"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["ask%sn"%i] = OfferNumOrders[:, i-1]
#     del OfferNumOrders
#     print("6")
    
#     bidOrders = np.array([i[1:-1].split(',') for i in SH['BidOrders'].values]).astype(np.int32)
#     SH.drop(["BidOrders"],axis=1,inplace=True)
#     for i in range(1, 51):
#         SH["bid1Top%sq"%i] = bidOrders[:, i-1]
#     del bidOrders
#     print("7")
    
#     offerOrders = np.array([i[1:-1].split(',') for i in SH['OfferOrders'].values]).astype(np.int32)
#     SH.drop(["OfferOrders"],axis=1,inplace=True)
#     for i in range(1, 51):
#         SH["ask1Top%sq"%i] = offerOrders[:, i-1]
#     del offerOrders
#     print("8")
#     print(datetime.datetime.now() - startTm)

    startTm = datetime.datetime.now()
    SH["BidPrice"] = SH["BidPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["OfferPrice"] = SH["OfferPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["BidOrderQty"] = SH["BidOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrderQty"] = SH["OfferOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["BidNumOrders"] = SH["BidNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferNumOrders"] = SH["OfferNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 11):
        SH["bid" + str(i) + 'p'] = SH["BidPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["BidPrice"],axis=1,inplace=True)
    print("1")
    for i in range(1, 11):
        SH["ask" + str(i) + 'p'] = SH["OfferPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["OfferPrice"],axis=1,inplace=True)
    print("2")
    for i in range(1, 11):
        SH["bid" + str(i) + 'q'] = SH["BidOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["BidOrderQty"],axis=1,inplace=True)
    print("3")
    for i in range(1, 11):
        SH["ask" + str(i) + 'q'] = SH["OfferOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["OfferOrderQty"],axis=1,inplace=True)
    print("4")
    for i in range(1, 11):
        SH["bid" + str(i) + 'n'] = SH["BidNumOrders"].apply(lambda x: x[i-1])
        SH["bid" + str(i) + 'n'] = SH["bid" + str(i) + 'n'].astype('int32')
    SH.drop(["BidNumOrders"],axis=1,inplace=True)
    print("5")
    for i in range(1, 11):
        SH["ask" + str(i) + 'n'] = SH["OfferNumOrders"].apply(lambda x: x[i-1])
        SH["ask" + str(i) + 'n'] = SH["ask" + str(i) + 'n'].astype('int32') 
    SH.drop(["OfferNumOrders"],axis=1,inplace=True)
    print("6")
    
    SH["BidOrders"] = SH["BidOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrders"] = SH["OfferOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 51):
        SH["bid1Top" + str(i) + 'q'] = SH["BidOrders"].apply(lambda x: x[i-1])
        SH["bid1Top" + str(i) + 'q'] = SH["bid1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["BidOrders"],axis=1,inplace=True)
    print("7")
    
    for i in range(1, 51):
        SH["ask1Top" + str(i) + 'q'] = SH["OfferOrders"].apply(lambda x: x[i-1])
        SH["ask1Top" + str(i) + 'q'] = SH["ask1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["OfferOrders"],axis=1,inplace=True)
    print("8")
    print(datetime.datetime.now() - startTm)
    
    
    startTm = datetime.datetime.now()
#     SH = SH.rename(columns={"NumTrades":"cum_trades_cnt", "HighPx":"high", "OpenPx":"open", "PreClosePx":"prev_close", "LowPx":"low",
#                             "WeightedAvgBidPx":"total_bid_vwap", "WithdrawSellNumber":"cum_canceled_sell_orders", "TotalOfferNumber":"total_ask_orders",
#                            "OfferTradeMaxDuration":"ask_trade_max_duration", "TotalBidNumber":"total_bid_orders", "WithdrawSellMoney":"cum_canceled_sell_amount",
#                            "TotalOfferQty":"total_ask_quantity", "WithdrawBuyNumber":"cum_canceled_buy_orders", "WeightedAvgOfferPx":"total_ask_vwap",
#                            "WithdrawSellAmount":"cum_canceled_sell_volume", "Volume":"cum_volume", "NumOfferOrders":"total_ask_levels", "TotalBidQty":"total_bid_quantity",
#                            "WithdrawBuyAmount":"cum_canceled_buy_volume", "LastPx":"close", "BidTradeMaxDuration":"bid_trade_max_duration", 
#                            "NumBidOrders":"total_bid_levels", "Amount":"cum_amount", "WithdrawBuyMoney":"cum_canceled_buy_amount"})
    SH.columns = ['cum_trades_cnt', 'ask_trade_max_duration', 'total_bid_orders',
       'cum_canceled_sell_amount', 'total_ask_quantity', 'cum_canceled_buy_orders',
       'total_ask_vwap', 'cum_canceled_sell_volume', 'cum_volume', 'open',
       'high', 'prev_close', 'low', 'total_bid_vwap',
       'cum_canceled_sell_orders', 'total_ask_orders', 'total_ask_levels',
       'total_bid_quantity', 'cum_canceled_buy_volume', 'bid_trade_max_duration',
       'total_bid_levels', 'close', 'cum_amount', 'cum_canceled_buy_amount', 'skey', 'date', 'time', 'clockAtArrival',
       'datetime', 'bid1p', 'bid2p', 'bid3p', 'bid4p', 'bid5p', 'bid6p',
       'bid7p', 'bid8p', 'bid9p', 'bid10p', 'ask1p', 'ask2p', 'ask3p',
       'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p',
       'bid1q', 'bid2q', 'bid3q', 'bid4q', 'bid5q', 'bid6q', 'bid7q',
       'bid8q', 'bid9q', 'bid10q', 'ask1q', 'ask2q', 'ask3q', 'ask4q',
       'ask5q', 'ask6q', 'ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid1n',
       'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n',
       'bid9n', 'bid10n', 'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n',
       'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'bid1Top1q',
       'bid1Top2q', 'bid1Top3q', 'bid1Top4q', 'bid1Top5q', 'bid1Top6q',
       'bid1Top7q', 'bid1Top8q', 'bid1Top9q', 'bid1Top10q', 'bid1Top11q',
       'bid1Top12q', 'bid1Top13q', 'bid1Top14q', 'bid1Top15q',
       'bid1Top16q', 'bid1Top17q', 'bid1Top18q', 'bid1Top19q',
       'bid1Top20q', 'bid1Top21q', 'bid1Top22q', 'bid1Top23q',
       'bid1Top24q', 'bid1Top25q', 'bid1Top26q', 'bid1Top27q',
       'bid1Top28q', 'bid1Top29q', 'bid1Top30q', 'bid1Top31q',
       'bid1Top32q', 'bid1Top33q', 'bid1Top34q', 'bid1Top35q',
       'bid1Top36q', 'bid1Top37q', 'bid1Top38q', 'bid1Top39q',
       'bid1Top40q', 'bid1Top41q', 'bid1Top42q', 'bid1Top43q',
       'bid1Top44q', 'bid1Top45q', 'bid1Top46q', 'bid1Top47q',
       'bid1Top48q', 'bid1Top49q', 'bid1Top50q', 'ask1Top1q', 'ask1Top2q',
       'ask1Top3q', 'ask1Top4q', 'ask1Top5q', 'ask1Top6q', 'ask1Top7q',
       'ask1Top8q', 'ask1Top9q', 'ask1Top10q', 'ask1Top11q', 'ask1Top12q',
       'ask1Top13q', 'ask1Top14q', 'ask1Top15q', 'ask1Top16q',
       'ask1Top17q', 'ask1Top18q', 'ask1Top19q', 'ask1Top20q',
       'ask1Top21q', 'ask1Top22q', 'ask1Top23q', 'ask1Top24q',
       'ask1Top25q', 'ask1Top26q', 'ask1Top27q', 'ask1Top28q',
       'ask1Top29q', 'ask1Top30q', 'ask1Top31q', 'ask1Top32q',
       'ask1Top33q', 'ask1Top34q', 'ask1Top35q', 'ask1Top36q',
       'ask1Top37q', 'ask1Top38q', 'ask1Top39q', 'ask1Top40q',
       'ask1Top41q', 'ask1Top42q', 'ask1Top43q', 'ask1Top44q',
       'ask1Top45q', 'ask1Top46q', 'ask1Top47q', 'ask1Top48q',
       'ask1Top49q', 'ask1Top50q']
    SH = SH.fillna(0)
#     SH["p1"] = SH["bid1p"] + SH["ask1p"]
#     tt = SH[(SH["cum_volume"] > 0) & (SH["time"] < 145700000000)].groupby("skey")['p1'].min()
#     SH.drop("p1", axis=1, inplace=True)
#     try:
#         assert(tt[tt == 0].shape[0] == 0)
#     except:
#         display(tt[tt == 0])
#     SH = SH[~((SH["bid1p"] == 0) & (SH["ask1p"] == 0))]
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    
    SH["has_missing"] = 0
    
    for col in ["skey", "date", "cum_trades_cnt", "total_bid_orders",
        'total_ask_orders', 'total_bid_levels', 'total_ask_levels', 'cum_canceled_buy_orders','cum_canceled_sell_orders',
            "ordering", 'bid_trade_max_duration', 'ask_trade_max_duration','has_missing']:
        SH[col] = SH[col].astype('int32')
    
#     for cols in ["prev_close", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
#              'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p']:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
    
#     for cols in ['cum_amount', "cum_canceled_sell_amount", "cum_canceled_buy_amount"]:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
        
    for cols in ['total_bid_vwap', "total_ask_vwap"]:
        SH[cols] = SH[cols].apply(lambda x: round(x, 3))
        
   
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"]) 
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)
    print(datetime.datetime.now() - startTm)
    
    
    # check 1
    startTm = datetime.datetime.now()
    da_te = str(SH["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    db1["ID"] = db1["ID"].str[2:].astype(int) + 1000000
    db1["date"] = (db1["date"].str[:4] + db1["date"].str[5:7] + db1["date"].str[8:]).astype(int)
    SH["cum_max"] = SH.groupby("skey")["cum_volume"].transform(max)
    s2 = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey").first().reset_index()
    dd = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey")["time"].first().reset_index()
    SH.drop("cum_max", axis=1, inplace=True)
    s2 = s2.rename(columns={"skey": "ID", 'open':"d_open", "prev_close":"d_yclose","high":"d_high", "low":"d_low", "close":"d_close", "cum_volume":"d_volume", "cum_amount":"d_amount"})
    if SH["date"].iloc[0] < 20180820:
        s2["auction"] = 0
    else:
        dd["auction"] = np.where(dd["time"]<=145700000000, 0, 1)
        dd = dd.rename(columns={"skey": "ID"})
        s2 = pd.merge(s2, dd[["ID", "auction"]], on="ID")
    s2 = s2[["ID", "date", "d_open", "d_yclose", "d_high", "d_low", "d_close", "d_volume", "d_amount", "auction"]]
    re = pd.merge(db1, s2, on=["ID", "date", "d_open", "d_yclose","d_high", "d_low", "d_volume"], how="outer")
    try:
        assert(sum(re["d_amount_y"].isnull()) == 0)
    except:
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print(re[re["d_amount_y"].isnull()])
    print(datetime.datetime.now() - startTm)
    
    # check 2
    # first part
    startTm = datetime.datetime.now()
    date = pd.DataFrame(pd.date_range(start='2019-06-10 08:30:00', end='2019-06-10 18:00:00', freq='s'), columns=["Orig"])
    date["time"] = date["Orig"].apply(lambda x: int(x.strftime("%H%M%S"))*1000)
    date["group"] = date["time"]//30000
    SH["group"] = SH["time"]//30000000
    gl = date[((date["time"] >= 93000000) & (date["time"] <= 113000000))|((date["time"] >= 130000000) & (date["time"] <= 150000000))]["group"].unique()
    l = set(gl) - set(SH["group"].unique())
    SH["has_missing1"] = 0 
    if len(l) != 0:
        print("massive missing")
        print(l)
        SH["order"] = SH.groupby(["skey", "time"]).cumcount()
        for i in l:
            SH["t"] = SH[SH["group"] > i].groupby("StockID")["time"].transform("min")
            SH["has_missing1"] = np.where((SH["time"] == SH["t"]) & (SH["order"] == 0), 1, 0)
        SH.drop(["order", "t", "group"], axis=1, inplace=True)   
    else:
        print("no massive missing")
        SH.drop(["group"], axis=1, inplace=True)
    



    # second part

    SH["time_interval"] = SH.groupby("skey")["datetime"].apply(lambda x: x - x.shift(1))
    SH["time_interval"] = SH["time_interval"].apply(lambda x: x.seconds)
    SH["tn_update"] = SH.groupby("skey")["cum_trades_cnt"].apply(lambda x: x-x.shift(1))

    f1 = SH[(SH["time"] >= 93000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f1 = f1.rename(columns={"time": "time1"})
    f2 = SH[(SH["time"] >= 130000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f2 = f2.rename(columns={"time": "time2"})
    f3 = SH[(SH["time"] >= 150000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f3 = f3.rename(columns={"time": "time3"})
    SH = pd.merge(SH, f1, on="skey", how="left")
    del f1
    SH = pd.merge(SH, f2, on="skey", how="left")
    del f2
    SH = pd.merge(SH, f3, on="skey", how="left")
    del f3
    p99 = SH[(SH["time"] > 93000000000) & (SH["time"] < 145700000000) & (SH["time"] != SH["time2"]) & (SH["tn_update"] != 0)]\
    .groupby("skey")["tn_update"].apply(lambda x: x.describe([0.99])["99%"]).reset_index()
    p99 = p99.rename(columns={"tn_update":"99%"})
    SH = pd.merge(SH, p99, on="skey", how="left")

    SH["has_missing2"] = 0
    SH["has_missing2"] = np.where((SH["time_interval"] > 60) & (SH["tn_update"] > SH["99%"]) & 
         (SH["time"] > SH["time1"]) & (SH["time"] != SH["time2"]) & (SH["time"] != SH["time3"]), 1, 0)
    SH.drop(["time_interval", "tn_update", "time1", "time2", "time3", "99%"], axis=1, inplace=True) 

    SH["has_missing"] = np.where((SH["has_missing1"] == 1) | (SH["has_missing2"] == 1), 1, 0)
    SH.drop(["has_missing1", "has_missing2"], axis=1, inplace=True) 
    if SH[SH["has_missing"] == 1].shape[0] != 0:
        print("has missing!!!!!!!!!!!!!!!!!!!!!!!")
        print(SH[SH["has_missing"] == 1].shape[0])
    print(datetime.datetime.now() - startTm)
    
    
    
    startTm = datetime.datetime.now()
    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "has_missing", "cum_trades_cnt", "cum_volume", "cum_amount", "prev_close",
                            "open", "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p','bid2p','bid1p',
                            'ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'bid10q','bid9q','bid8q',
                             'bid7q','bid6q','bid5q','bid4q','bid3q','bid2q','bid1q', 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q',
                             'ask7q','ask8q','ask9q','ask10q', 'bid10n', 'bid9n', 'bid8n', 'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 
                             'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n','ask7n', 'ask8n', 'ask9n', 'ask10n','bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q',"total_bid_quantity", "total_ask_quantity","total_bid_vwap", "total_ask_vwap",
        "total_bid_orders",'total_ask_orders','total_bid_levels', 'total_ask_levels', 'bid_trade_max_duration', 'ask_trade_max_duration', 'cum_canceled_buy_orders', 'cum_canceled_buy_volume',
        "cum_canceled_buy_amount", "cum_canceled_sell_orders", 'cum_canceled_sell_volume',"cum_canceled_sell_amount"]]
    
    display(SH["date"].iloc[0])
    print("SH finished")
    print(datetime.datetime.now() - startTm)
    
    db1 = DB("mongodb://user_rw:faa96dfc@192.168.10.223")
    db1.write('snapshot', SH)
    
    del SH

  app.launch_new_instance()


0:02:55.135685
0:01:33.981595
0:00:53.102916
1
2
3
4
5
6
7
8
0:06:01.759890
0:00:25.034221


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0:00:02.440498
no massive missing
0:01:20.939264


20190102

SH finished
0:00:01.567805
0:01:27.699336
0:00:51.203000
1
2
3
4
5
6
7
8
0:06:16.496529
0:00:24.422135
0:00:02.394593
no massive missing
0:01:22.154861


20190103

SH finished
0:00:01.753282
0:01:19.334150
0:00:57.107320
1
2
3
4
5
6
7
8
0:07:01.484334
0:00:32.261028
0:00:03.054795
no massive missing
0:01:30.544562


20190104

SH finished
0:00:01.784761
0:01:28.316725
0:00:56.620479
1
2
3
4
5
6
7
8
0:06:56.525311
0:00:28.388044
0:00:02.682822
no massive missing
0:01:32.049776


20190107

SH finished
0:00:01.819134
0:01:15.883671
0:00:55.355140
1
2
3
4
5
6
7
8
0:06:39.375939
0:00:28.108560
0:00:02.942131
no massive missing
0:01:29.519910


20190108

SH finished
0:00:01.768241
0:01:23.341657
0:00:58.949246
1
2
3
4
5
6
7
8
0:07:22.526854
0:00:28.314242
0:00:02.845359
no massive missing
0:01:31.576624


20190109

SH finished
0:00:01.809160
0:01:17.908545
0:00:55.268154
1
2
3
4
5
6
7
8
0:06:48.010847
0:00:30.793608
0:00:02.815496
no massive missing
0:01:30.561720


20190110

SH finished
0:00:01.949755
0:01:25.406228
0:00:55.079783
1
2
3
4
5
6
7
8
0:06:46.208425
0:00:26.565952
0:00:02.648907
no massive missing
0:01:28.071417


20190111

SH finished
0:00:01.777245
0:01:18.349515
0:00:54.062349
1
2
3
4
5
6
7
8
0:06:41.713166
0:00:26.233897
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
          ID      date  d_open  d_high  d_low  d_close_x  d_yclose  \
378  1600353  20190114     4.8    4.83   4.66       4.69      4.86   

     d_cumprodCAA  d_dayReturn  d_5dayReturn  d_ICDayReturn  d_CSIDayReturn  \
378      0.989619    -0.034979     -0.004246      -0.006565       -0.005933   

       d_volume  d_amount_x    TORate  allZT  hasZT  isZT  allDT  hasDT  isDT  \
378  10882631.0  51416487.0  0.020015    0.0    0.0   0.0    0.0    0.0   0.0   

     tmrHalted  haltedDays  marketShares  totalShares  d_close_y  d_amount_y  \
378        0.0         0.0   543720000.0  543720000.0        NaN         NaN   

     auction  
378      NaN  
0:00:02.518289
no massive missing
0:01:26.049424


20190114

SH finished
0:00:01.698455
0:01:22.671700
0:00:56.315551
1
2
3
4
5
6
7
8
0:06:53.623521
0:00:28.361696
0:00:02.908246
no massive missing
0:01:37.972195


20190115

SH finished
0:00:01.747322
0:02:15.989406
0:00:57.002759
1
2
3
4
5
6
7
8
0:06:54.889433
0:00:28.605209
0:00:02.823419
no massive missing
0:01:32.277129


20190121

SH finished
0:00:01.784225
0:01:23.062182
0:00:56.114063
1
2
3
4
5
6
7
8
0:06:57.694797
0:00:29.884062
0:00:02.791503
no massive missing
0:01:31.112012


20190122

SH finished
0:00:01.804166
0:01:14.833588
0:00:54.083262
1
2
3
4
5
6
7
8
0:06:47.101245
0:00:25.825092
0:00:02.637942
no massive missing
0:01:24.314545


20190123

SH finished
0:00:01.732336
0:01:20.350053
0:00:56.366025
1
2
3
4
5
6
7
8
0:06:53.944672
0:00:27.479446
0:00:02.768101
no massive missing
0:01:33.265871


20190124

SH finished
0:00:01.794169
0:01:24.987982
0:00:55.498254
1
2
3
4
5
6
7
8
0:06:52.228879
0:00:27.523142
0:00:02.724723
no massive missing
0:01:30.928525


20190125

SH finished
0:00:01.847054
0:01:25.017743
0:00:56.470882
1
2
3
4
5
6
7
8
0:07:00.891544
0:00:28.145723
0:00:02.784582
no massive missing
0:01:30.558099


20190128

SH finished
0:00:01.944797
0:01:25.271738
0:00:57.332570
1
2
3
4
5
6
7
8
0:07:06.204622
0:00:31.445864
0:00:03.252297
no massive missing
0:01:31.402441


20190129

SH finished
0:00:01.846061
0:01:24.238729
0:00:53.846893
1
2
3
4
5
6
7
8
0:06:33.780516
0:00:26.412327
0:00:02.778538
no massive missing
0:01:24.930876


20190130

SH finished
0:00:01.745299
0:01:20.504142
0:00:58.175261
1
2
3
4
5
6
7
8
0:06:58.408025
0:00:28.238417
0:00:02.801504
no massive missing
0:01:27.929733


20190131

SH finished
0:00:01.762314
0:01:21.059193
0:00:54.497285
1
2
3
4
5
6
7
8
0:06:48.737734
0:00:27.265080
0:00:02.651874
no massive missing
0:01:24.793146


20190201

SH finished
0:00:01.710424
0:01:19.459428
0:00:56.593576
1
2
3
4
5
6
7
8
0:07:01.698607
0:00:27.900527
0:00:02.740637
no massive missing
0:01:38.575311


20190211

SH finished
0:00:02.034556
0:01:53.496594
0:00:59.444560
1
2
3
4
5
6
7
8
0:07:18.093305
0:00:28.977949
0:00:02.903232
no massive missing
0:01:33.812064


20190212

SH finished
0:00:01.901912
0:01:37.593573
0:01:02.687240
1
2
3
4
5
6
7
8
0:07:51.215244
0:00:31.575543
0:00:03.069535
no massive missing
0:01:37.969378


20190213

SH finished
0:00:01.953800
0:01:29.572130
0:00:59.865552
1
2
3
4
5
6
7
8
0:07:29.957878
0:00:28.927629
0:00:02.836383
no massive missing
0:01:35.493148


20190214

SH finished
0:00:01.950780
0:01:31.800779
0:00:59.717801
1
2
3
4
5
6
7
8
0:07:28.583837
0:00:28.958568
0:00:02.844390
no massive missing
0:01:34.495194


20190215

SH finished
0:00:01.824092
0:01:33.833936
0:01:05.793484
1
2
3
4
5
6
7
8
0:08:11.223679
0:00:34.276419
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
           ID      date  d_open  d_high  d_low  d_close_x  d_yclose  \
1414  1600128  20190218     7.2    7.58   7.14       7.38      7.13   

      d_cumprodCAA  d_dayReturn  d_5dayReturn  d_ICDayReturn  d_CSIDayReturn  \
1414      0.982475     0.035063      0.080527       0.035148        0.035479   

        d_volume   d_amount_x    TORate  allZT  hasZT  isZT  allDT  hasDT  \
1414  21832880.0  160322361.0  0.088476    0.0    0.0   0.0    0.0    0.0   

      isDT  tmrHalted  haltedDays  marketShares  totalShares  d_close_y  \
1414   0.0        0.0         0.0   246767500.0  246767500.0        NaN   

      d_amount_y  auction  
1414         NaN      NaN  
0:00:03.084560
no massive missing
0:01:55.423475


20190218

SH finished
0:00:02.307826
0:02:20.169458
0:01:04.524256
1
2
3
4
5
6
7
8
0:07:59.835888
0:00:32.192864
0:00:03.126677
no massive missing
0:01:44.447533


20190219

SH finished
0:00:02.456390
0:01:37.669701
0:01:02.145710
1
2
3
4
5
6
7
8
0:07:41.680731
0:00:30.405187
0:00:02.962075
no massive missing
0:01:38.932369


20190220

SH finished
0:00:01.960726
0:01:36.982796
0:01:07.127727
1
2
3
4
5
6
7
8
0:08:13.269223
0:00:33.624036
0:00:03.531384
no massive missing
0:01:47.394453


20190221

SH finished
0:00:02.077872
0:01:31.858243
0:01:04.467645
1
2
3
4
5
6
7
8
0:08:03.029107
0:00:32.174315
0:00:03.022907
no massive missing
0:01:41.448385


20190222

SH finished
0:00:01.979673
0:01:41.746807
0:01:12.685493
1
2
3
4
5
6
7
8
0:09:03.717576
0:00:42.202111
0:00:03.970349
no massive missing
0:01:54.421850


20190225

SH finished
0:00:02.248018
0:01:55.960410
0:01:12.454139
1
2
3
4
5
6
7
8
0:08:56.737885
0:00:39.043601
0:00:03.824794
no massive missing
0:01:59.851897


20190226

SH finished
0:00:02.475344
0:02:10.546292
0:01:10.115573
1
2
3
4
5
6
7
8
0:08:37.880068
0:00:34.337127
0:00:03.469747
no massive missing
0:01:50.567548


20190227

SH finished
0:00:02.221054
0:01:38.256104
0:01:08.328295
1
2
3
4
5
6
7
8
0:08:12.091938
0:00:33.431687
0:00:03.200465
no massive missing
0:01:43.335142


20190228

SH finished
0:00:02.175181
0:01:38.926563
0:01:06.121217
1
2
3
4
5
6
7
8
0:08:15.227314
0:00:32.739510
0:00:03.146581
no massive missing
0:01:44.117472


20190301

SH finished
0:00:02.213079
0:01:40.436897
0:01:12.705621
1
2
3
4
5
6
7
8
0:09:05.157911
0:00:39.285921
0:00:03.745942
no massive missing
0:01:59.162016


20190304

SH finished
0:00:02.268901
0:02:05.026154
0:01:10.135317
1
2
3
4
5
6
7
8
0:08:45.793008
0:00:35.028245
0:00:03.520585
no massive missing
0:01:53.948508


20190305

SH finished
0:00:02.678792
0:01:57.088118
0:01:13.559240
1
2
3
4
5
6
7
8
0:08:58.506476
0:00:35.154192
0:00:03.377957
no massive missing
0:01:53.260236


20190306

SH finished
0:00:02.301842
0:01:41.826760
0:01:14.182317
1
2
3
4
5
6
7
8
0:09:14.591443
0:00:36.853677
0:00:03.450767
no massive missing
0:02:00.934261


20190307

SH finished
0:00:02.383650
0:01:51.320177
0:01:12.250749
1
2
3
4
5
6
7
8
0:09:02.932329
0:00:35.066199
0:00:03.428824
no massive missing
0:01:53.043597


20190308

SH finished
0:00:02.270897
0:01:50.153302
0:01:08.523626
1
2
3
4
5
6
7
8
0:08:26.707291
0:00:33.945176
0:00:03.289199
no massive missing
0:01:57.446757


20190311

SH finished
0:00:02.181164
0:01:49.298640
0:01:12.177941
1
2
3
4
5
6
7
8
0:08:56.539538
0:00:35.099089
0:00:03.388931
no massive missing
0:01:53.746750


20190312

SH finished
0:00:02.320793
0:01:40.152941
0:01:10.998005
1
2
3
4
5
6
7
8
0:08:48.076331
0:00:33.951159
0:00:03.286178
no massive missing
0:01:52.479080


20190313

SH finished
0:00:02.207064
0:01:37.616842
0:01:07.947171
1
2
3
4
5
6
7
8
0:08:18.997996
0:00:33.380687
0:00:03.725032
no massive missing
0:01:58.959708


20190314

SH finished
0:00:02.170193
0:02:19.668300
0:01:08.869703
1
2
3
4
5
6
7
8
0:08:23.423762
0:00:34.336128
0:00:03.248337
no massive missing
0:01:51.754983


20190315

SH finished
0:00:02.254939
0:01:41.996181
0:01:09.552902
1
2
3
4
5
6
7
8
0:08:37.653953
0:00:35.367398
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
          ID      date  d_open  d_high  d_low  d_close_x  d_yclose  \
197  1600328  20190318    8.54     8.6   8.44       8.55      8.46   

     d_cumprodCAA  d_dayReturn  d_5dayReturn  d_ICDayReturn  d_CSIDayReturn  \
197      0.979131     0.010638      0.046512        0.02653        0.024101   

       d_volume   d_amount_x    TORate  allZT  hasZT  isZT  allDT  hasDT  \
197  17237240.0  146753910.0  0.039352    0.0    0.0   0.0    0.0    0.0   

     isDT  tmrHalted  haltedDays  marketShares  totalShares  d_close_y  \
197   0.0        0.0         0.0   438031073.0  438031073.0        NaN   

     d_amount_y  auction  
197         NaN      NaN  
0:00:03.393916
no massive missing
0:01:53.342713


20190318

SH finished
0:00:02.440498
0:01:36.915750
0:01:09.065207
1
2
3
4
5
6
7
8
0:08:38.072834
0:00:33.528322
0:00:03.302160
no massive missing
0:01:47.236049


20190319

SH finished
0:00:02.157259
0:01:37.101194
0:01:10.618052
1
2
3
4
5
6
7
8
0:08:41.747002
0:00:35.106069
0:00:03.363002
no massive missing
0:01:56.108338


20190320

SH finished
0:00:02.248013
0:01:43.850140
0:01:10.100410
1
2
3
4
5
6
7
8
0:08:39.896012
0:00:33.858412
0:00:03.458743
no massive missing
0:01:50.292903


20190321

SH finished
0:00:02.235014
0:01:39.287343
0:01:10.296883
1
2
3
4
5
6
7
8
0:08:38.773168
0:00:33.144291
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
          ID      date  d_open  d_high  d_low  d_close_x  d_yclose  \
542  1603099  20190322   10.58   11.21  10.53       11.2     10.57   

     d_cumprodCAA  d_dayReturn  d_5dayReturn  d_ICDayReturn  d_CSIDayReturn  \
542      0.985802     0.059603      0.108911       0.005844        0.003991   

      d_volume  d_amount_x    TORate  allZT  hasZT  isZT  allDT  hasDT  isDT  \
542  7846042.0  85941553.0  0.029422    0.0    0.0   0.0    0.0    0.0   0.0   

     tmrHalted  haltedDays  marketShares  totalShares  d_close_y  d_amount_y  \
542        0.0         0.0   266670000.0  266670000.0        NaN         NaN   

     auction  
542      NaN  
0:00:03.576458
no massive missing
0:01:48.565518


20190322

SH finished
0:00:02.238014
0:01:40.741453
0:01:08.829810
1
2
3
4
5
6
7
8
0:08:29.417085
0:00:32.954826
0:00:03.420876
no massive missing
0:02:01.082998


20190325

SH finished
0:00:02.504300
0:02:18.785442
0:01:08.780996
1
2
3
4
5
6
7
8
0:08:33.871168
0:00:33.955178
0:00:03.241299
no massive missing
0:01:50.377671


20190326

SH finished
0:00:02.232055
0:01:37.080222
0:01:07.848463
1
2
3
4
5
6
7
8
0:08:21.814335
0:00:33.369714
0:00:03.298204
no massive missing
0:01:48.589456


20190327

SH finished
0:00:02.113332
0:01:36.984478
0:01:08.100787
1
2
3
4
5
6
7
8
0:08:16.581336
0:00:34.517644
0:00:03.260276
no massive missing
0:02:00.638217


20190328

SH finished
0:00:02.569129
0:01:55.788166
0:01:11.108739
1
2
3
4
5
6
7
8
0:08:44.952425
0:00:34.717110
0:00:03.339094
no massive missing
0:01:54.801806


20190329

SH finished
0:00:02.287880
0:01:46.879007
0:01:14.299204
1
2
3
4
5
6
7
8
0:09:17.687838
0:00:36.001700
0:00:03.810804
no massive missing
0:01:59.403493


20190401

SH finished
0:00:02.372653
0:01:42.904665
0:01:12.705438
1
2
3
4
5
6
7
8
0:09:06.224509
0:00:40.308151
0:00:03.693118
no massive missing
0:01:52.543903


20190402

SH finished
0:00:02.278902
0:01:56.813770
0:01:09.606759
1
2
3
4
5
6
7
8
0:08:35.116741
0:00:35.175916
0:00:03.354020
no massive missing
0:01:52.091059


20190403

SH finished
0:00:02.267932
0:01:46.825177
0:01:11.236368
1
2
3
4
5
6
7
8
0:08:44.643371
0:00:35.805197
0:00:03.434783
no massive missing
0:01:53.410588


20190404

SH finished
0:00:02.295825
0:01:44.648163
0:01:12.525017
1
2
3
4
5
6
7
8
0:08:59.349114
0:00:34.625328
0:00:03.422842
no massive missing
0:01:51.567488


20190408

SH finished
0:00:02.297852
0:01:42.995538
0:01:09.506121
1
2
3
4
5
6
7
8
0:08:33.155198
0:00:38.410228
0:00:03.367963
no massive missing
0:02:02.759540


20190409

SH finished
0:00:02.622983
0:01:57.088712
0:01:10.665897
1
2
3
4
5
6
7
8
0:08:36.708484
0:00:34.104748
0:00:03.312165
no massive missing
0:01:52.881943


20190410

SH finished
0:00:02.231059
0:01:37.124161
0:01:11.672205
1
2
3
4
5
6
7
8
0:08:47.315129
0:00:35.536921
0:00:03.373941
no massive missing
0:01:56.039522


20190411

SH finished
0:00:02.278902
0:01:31.022459
0:01:08.060894
1
2
3
4
5
6
7
8
0:08:28.323953
0:00:36.150267
0:00:03.261275
no massive missing
0:01:48.163568


20190412

SH finished
0:00:02.183187
0:01:38.009763
0:01:09.153942
1
2
3
4
5
6
7
8
0:08:25.884527
0:00:33.411577
0:00:03.220382
no massive missing
0:01:48.125819


20190415

SH finished
0:00:02.136282
0:01:33.701320
0:01:09.433285
1
2
3
4
5
6
7
8
0:08:26.825352
0:00:36.258985
0:00:03.380952
no massive missing
0:01:51.951461


20190416

SH finished
0:00:02.472355
0:01:35.382819
0:01:09.414336
1
2
3
4
5
6
7
8
0:08:40.671939
0:00:33.390660
0:00:03.390926
no massive missing
0:01:48.985448


20190417

SH finished
0:00:02.152278
0:01:33.500827
0:01:06.984744
1
2
3
4
5
6
7
8
0:08:16.278382
0:00:33.067518
0:00:03.253304
no massive missing
0:01:54.889596


20190418

SH finished
0:00:02.561119
0:01:37.306677
0:01:07.449498
1
2
3
4
5
6
7
8
0:08:20.698599
0:00:33.945176
0:00:03.459771
no massive missing
0:01:47.283921


20190419

SH finished
0:00:02.166203
0:01:34.438291
0:01:08.890675
1
2
3
4
5
6
7
8
0:08:34.004740
0:00:34.149633
0:00:03.222349
no massive missing
0:01:48.881673


20190422

SH finished
0:00:02.349714
0:01:32.684010
0:01:09.197853
1
2
3
4
5
6
7
8
0:08:24.344610
0:00:40.503655
0:00:03.699075
no massive missing
0:01:57.638274


20190423

SH finished
0:00:02.649880
0:01:34.452282
0:01:06.405324
1
2
3
4
5
6
7
8
0:08:11.925792
0:00:32.175937
0:00:03.147553
no massive missing
0:01:46.245724


20190424

SH finished
0:00:02.185181
0:01:31.004533
0:01:08.511662
1
2
3
4
5
6
7
8
0:08:28.305963
0:00:33.366723
0:00:03.396941
no massive missing
0:01:47.952130


20190425

SH finished
0:00:02.190140
0:01:29.756915
0:01:04.436563
1
2
3
4
5
6
7
8
0:08:02.252705
0:00:37.052856
0:00:03.467722
no massive missing
0:01:44.902321


20190426

SH finished
0:00:02.060459
0:02:20.735553
0:01:04.847493
1
2
3
4
5
6
7
8
0:08:00.579151
0:00:32.197851
0:00:03.171514
no massive missing
0:01:45.165616


20190429

SH finished
0:00:02.124316
0:01:41.678987
0:01:02.494757
1
2
3
4
5
6
7
8
0:07:45.753968
0:00:31.269307
0:00:03.022941
no massive missing
0:01:40.879084


20190430

SH finished
0:00:02.049489
0:01:43.565009
0:01:05.140678
1
2
3
4
5
6
7
8
0:08:03.864388
0:00:35.188851
0:00:03.527561
no massive missing
0:01:51.875634


20190506

SH finished
0:00:02.094396
0:02:45.489689
0:01:05.316238
1
2
3
4
5
6
7
8
0:07:57.370736
0:00:31.597487
0:00:03.140565
no massive missing
0:01:45.237499


20190507

SH finished
0:00:02.074449
0:01:39.181695
0:01:04.762719
1
2
3
4
5
6
7
8
0:08:00.705875
0:00:31.247425
0:00:03.046818
no massive missing
0:01:44.616115


20190508

SH finished
0:00:02.061458
0:01:36.186669
0:01:01.554274
1
2
3
4
5
6
7
8
0:07:25.794223
0:00:30.775771
0:00:03.012912
no massive missing
0:01:43.968818


20190509

SH finished
0:00:01.940807
0:01:48.071840
0:01:06.576867
1
2
3
4
5
6
7
8
0:08:09.467425
0:00:33.581150
0:00:03.154560
no massive missing
0:01:44.597165


20190510

SH finished
0:00:02.140246
0:01:39.297375
0:01:01.790672
1
2
3
4
5
6
7
8
0:07:34.032212
0:00:29.654651
0:00:02.886280
no massive missing
0:01:36.372144


20190513

SH finished
0:00:01.890912
0:01:39.512769
0:01:00.565943
1
2
3
4
5
6
7
8
0:07:26.160391
0:00:31.270333
0:00:03.008949
no massive missing
0:01:36.740252


20190514

SH finished
0:00:01.925845
0:01:44.071571
0:01:02.714173
1
2
3
4
5
6
7
8
0:07:36.909512
0:00:30.726784
0:00:03.046850
no massive missing
0:01:38.678972


20190515

SH finished
0:00:02.053477
0:01:43.275766
0:01:02.295292
1
2
3
4
5
6
7
8
0:07:57.953852
0:00:31.452843
0:00:03.001968
no massive missing
0:01:38.118774


20190516

SH finished
0:00:01.973715
0:02:08.641090
0:01:05.287178
1
2
3
4
5
6
7
8
0:08:07.620321
0:00:33.380144
0:00:03.223403
no massive missing
0:01:46.342661


20190517

SH finished
0:00:02.069435
0:01:45.142743
0:01:00.964914
1
2
3
4
5
6
7
8
0:07:25.705821
0:00:30.421286
0:00:02.962075
no massive missing
0:01:40.884523


20190520

SH finished
0:00:01.999650
0:01:42.161518
0:01:00.962166
1
2
3
4
5
6
7
8
0:07:37.355674
0:00:33.935202
0:00:03.471743
no massive missing
0:01:46.599446


20190521

SH finished
0:00:01.958732
0:02:23.293332
0:01:00.224232
1
2
3
4
5
6
7
8
0:07:21.404406
0:00:28.573752
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
          ID      date  d_open  d_high  d_low  d_close_x  d_yclose  \
493  1600082  20190522     4.7    4.77   4.56       4.63      4.73   

     d_cumprodCAA  d_dayReturn  d_5dayReturn  d_ICDayReturn  d_CSIDayReturn  \
493      0.997696    -0.021142     -0.021142      -0.006683       -0.004174   

      d_volume  d_amount_x    TORate  allZT  hasZT  isZT  allDT  hasDT  isDT  \
493  5296386.0  24730844.0  0.008356    0.0    0.0   0.0    0.0    0.0   0.0   

     tmrHalted  haltedDays  marketShares  totalShares  d_close_y  d_amount_y  \
493        0.0         0.0   633852202.0  646115826.0        NaN         NaN   

     auction  
493      NaN  
0:00:02.895226
no massive missing
0:01:36.093426


20190522

SH finished
0:00:01.939317
0:01:26.344471
0:01:00.429314
1
2
3
4
5
6
7
8
0:07:36.156391
0:00:31.717641
0:00:02.949110
no massive missing
0:01:43.326902


20190523

SH finished
0:00:01.952775
0:01:24.480211
0:00:58.835837
1
2
3
4
5
6


MemoryError: 

In [None]:
import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = r'\\192.168.10.30\Kevin_zhenyu\day_stock_20200424\***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SH' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

year = "2019"
startDate = '20190530'
endDate = '20190930'
readPath = 'G:\\KR\\' + year + '\\SH\\***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

for data in dataPathLs:
    
    readPath = data + '\\snapshot\\***2\\***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[((dateLs >= 600000) & (dateLs <= 700000))]
    SH = []
    ll = []
    startTm = datetime.datetime.now()
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, usecols = [0,1,3,5,7,9,10,11,15,17,18,19,20,21,22,23,25,26,28,29,30,31,32,33,37,39,40,41,
                                          42,46,47,49,50])
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["StockID"] = int(os.path.basename(i).split('.')[0])
        SH += [df]
    del df
    SH = pd.concat(SH).reset_index(drop=True)
    print(datetime.datetime.now() - startTm)
    
    startTm = datetime.datetime.now()
    SH["skey"] = SH["StockID"] + 1000000
    SH.drop(["StockID"],axis=1,inplace=True)
    SH["date"] = int(SH["QuotTime"].iloc[0]//1000000000)
    SH["time"] = (SH['QuotTime'] - int(SH['QuotTime'].iloc[0]//1000000000*1000000000)).astype(np.int64) * 1000
    SH["clockAtArrival"] = SH["QuotTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH.drop(["QuotTime"],axis=1,inplace=True)
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    print(datetime.datetime.now() - startTm)
    
    
#     startTm = datetime.datetime.now()

#     BidPrice = np.array([i[1:-1].split(',') for i in SH['BidPrice'].values])
#     SH.drop(["BidPrice"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["bid%sp"%i] = BidPrice[:, i-1]
#     del BidPrice
#     print("1")
    
#     OfferPrice = np.array([i[1:-1].split(',') for i in SH['OfferPrice'].values])
#     SH.drop(["OfferPrice"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["ask%sp"%i] = OfferPrice[:, i-1]
#     del OfferPrice
#     print("2")
    
#     BidOrderQty = np.array([i[1:-1].split(',') for i in SH['BidOrderQty'].values]).astype(np.int64)
#     SH.drop(["BidOrderQty"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["bid%sq"%i] = BidOrderQty[:, i-1]
#     del BidOrderQty
#     print("3")
    
#     OfferOrderQty = np.array([i[1:-1].split(',') for i in SH['OfferOrderQty'].values]).astype(np.int64)
#     SH.drop(["OfferOrderQty"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["ask%sq"%i] = OfferOrderQty[:, i-1]
#     del OfferOrderQty
#     print("4")
    
#     BidNumOrders = np.array([i[1:-1].split(',') for i in SH['BidNumOrders'].values]).astype(np.int32)
#     SH.drop(["BidNumOrders"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["bid%sn"%i] = BidNumOrders[:, i-1]
#     del BidNumOrders
#     print("5")
    
#     OfferNumOrders = np.array([i[1:-1].split(',') for i in SH['OfferNumOrders'].values]).astype(np.int32)
#     SH.drop(["OfferNumOrders"],axis=1,inplace=True)
#     for i in range(1, 11):
#         SH["ask%sn"%i] = OfferNumOrders[:, i-1]
#     del OfferNumOrders
#     print("6")
    
#     bidOrders = np.array([i[1:-1].split(',') for i in SH['BidOrders'].values]).astype(np.int32)
#     SH.drop(["BidOrders"],axis=1,inplace=True)
#     for i in range(1, 51):
#         SH["bid1Top%sq"%i] = bidOrders[:, i-1]
#     del bidOrders
#     print("7")
    
#     offerOrders = np.array([i[1:-1].split(',') for i in SH['OfferOrders'].values]).astype(np.int32)
#     SH.drop(["OfferOrders"],axis=1,inplace=True)
#     for i in range(1, 51):
#         SH["ask1Top%sq"%i] = offerOrders[:, i-1]
#     del offerOrders
#     print("8")
#     print(datetime.datetime.now() - startTm)

    startTm = datetime.datetime.now()
    SH["BidPrice"] = SH["BidPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["OfferPrice"] = SH["OfferPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["BidOrderQty"] = SH["BidOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrderQty"] = SH["OfferOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["BidNumOrders"] = SH["BidNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferNumOrders"] = SH["OfferNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 11):
        SH["bid" + str(i) + 'p'] = SH["BidPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["BidPrice"],axis=1,inplace=True)
    print("1")
    for i in range(1, 11):
        SH["ask" + str(i) + 'p'] = SH["OfferPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["OfferPrice"],axis=1,inplace=True)
    print("2")
    for i in range(1, 11):
        SH["bid" + str(i) + 'q'] = SH["BidOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["BidOrderQty"],axis=1,inplace=True)
    print("3")
    for i in range(1, 11):
        SH["ask" + str(i) + 'q'] = SH["OfferOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["OfferOrderQty"],axis=1,inplace=True)
    print("4")
    for i in range(1, 11):
        SH["bid" + str(i) + 'n'] = SH["BidNumOrders"].apply(lambda x: x[i-1])
        SH["bid" + str(i) + 'n'] = SH["bid" + str(i) + 'n'].astype('int32')
    SH.drop(["BidNumOrders"],axis=1,inplace=True)
    print("5")
    for i in range(1, 11):
        SH["ask" + str(i) + 'n'] = SH["OfferNumOrders"].apply(lambda x: x[i-1])
        SH["ask" + str(i) + 'n'] = SH["ask" + str(i) + 'n'].astype('int32') 
    SH.drop(["OfferNumOrders"],axis=1,inplace=True)
    print("6")
    
    SH["BidOrders"] = SH["BidOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrders"] = SH["OfferOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 51):
        SH["bid1Top" + str(i) + 'q'] = SH["BidOrders"].apply(lambda x: x[i-1])
        SH["bid1Top" + str(i) + 'q'] = SH["bid1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["BidOrders"],axis=1,inplace=True)
    print("7")
    
    for i in range(1, 51):
        SH["ask1Top" + str(i) + 'q'] = SH["OfferOrders"].apply(lambda x: x[i-1])
        SH["ask1Top" + str(i) + 'q'] = SH["ask1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["OfferOrders"],axis=1,inplace=True)
    print("8")
    print(datetime.datetime.now() - startTm)
    
    
    startTm = datetime.datetime.now()
#     SH = SH.rename(columns={"NumTrades":"cum_trades_cnt", "HighPx":"high", "OpenPx":"open", "PreClosePx":"prev_close", "LowPx":"low",
#                             "WeightedAvgBidPx":"total_bid_vwap", "WithdrawSellNumber":"cum_canceled_sell_orders", "TotalOfferNumber":"total_ask_orders",
#                            "OfferTradeMaxDuration":"ask_trade_max_duration", "TotalBidNumber":"total_bid_orders", "WithdrawSellMoney":"cum_canceled_sell_amount",
#                            "TotalOfferQty":"total_ask_quantity", "WithdrawBuyNumber":"cum_canceled_buy_orders", "WeightedAvgOfferPx":"total_ask_vwap",
#                            "WithdrawSellAmount":"cum_canceled_sell_volume", "Volume":"cum_volume", "NumOfferOrders":"total_ask_levels", "TotalBidQty":"total_bid_quantity",
#                            "WithdrawBuyAmount":"cum_canceled_buy_volume", "LastPx":"close", "BidTradeMaxDuration":"bid_trade_max_duration", 
#                            "NumBidOrders":"total_bid_levels", "Amount":"cum_amount", "WithdrawBuyMoney":"cum_canceled_buy_amount"})
    SH.columns = ['cum_trades_cnt', 'ask_trade_max_duration', 'total_bid_orders',
       'cum_canceled_sell_amount', 'total_ask_quantity', 'cum_canceled_buy_orders',
       'total_ask_vwap', 'cum_canceled_sell_volume', 'cum_volume', 'open',
       'high', 'prev_close', 'low', 'total_bid_vwap',
       'cum_canceled_sell_orders', 'total_ask_orders', 'total_ask_levels',
       'total_bid_quantity', 'cum_canceled_buy_volume', 'bid_trade_max_duration',
       'total_bid_levels', 'close', 'cum_amount', 'cum_canceled_buy_amount', 'skey', 'date', 'time', 'clockAtArrival',
       'datetime', 'bid1p', 'bid2p', 'bid3p', 'bid4p', 'bid5p', 'bid6p',
       'bid7p', 'bid8p', 'bid9p', 'bid10p', 'ask1p', 'ask2p', 'ask3p',
       'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p',
       'bid1q', 'bid2q', 'bid3q', 'bid4q', 'bid5q', 'bid6q', 'bid7q',
       'bid8q', 'bid9q', 'bid10q', 'ask1q', 'ask2q', 'ask3q', 'ask4q',
       'ask5q', 'ask6q', 'ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid1n',
       'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n',
       'bid9n', 'bid10n', 'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n',
       'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'bid1Top1q',
       'bid1Top2q', 'bid1Top3q', 'bid1Top4q', 'bid1Top5q', 'bid1Top6q',
       'bid1Top7q', 'bid1Top8q', 'bid1Top9q', 'bid1Top10q', 'bid1Top11q',
       'bid1Top12q', 'bid1Top13q', 'bid1Top14q', 'bid1Top15q',
       'bid1Top16q', 'bid1Top17q', 'bid1Top18q', 'bid1Top19q',
       'bid1Top20q', 'bid1Top21q', 'bid1Top22q', 'bid1Top23q',
       'bid1Top24q', 'bid1Top25q', 'bid1Top26q', 'bid1Top27q',
       'bid1Top28q', 'bid1Top29q', 'bid1Top30q', 'bid1Top31q',
       'bid1Top32q', 'bid1Top33q', 'bid1Top34q', 'bid1Top35q',
       'bid1Top36q', 'bid1Top37q', 'bid1Top38q', 'bid1Top39q',
       'bid1Top40q', 'bid1Top41q', 'bid1Top42q', 'bid1Top43q',
       'bid1Top44q', 'bid1Top45q', 'bid1Top46q', 'bid1Top47q',
       'bid1Top48q', 'bid1Top49q', 'bid1Top50q', 'ask1Top1q', 'ask1Top2q',
       'ask1Top3q', 'ask1Top4q', 'ask1Top5q', 'ask1Top6q', 'ask1Top7q',
       'ask1Top8q', 'ask1Top9q', 'ask1Top10q', 'ask1Top11q', 'ask1Top12q',
       'ask1Top13q', 'ask1Top14q', 'ask1Top15q', 'ask1Top16q',
       'ask1Top17q', 'ask1Top18q', 'ask1Top19q', 'ask1Top20q',
       'ask1Top21q', 'ask1Top22q', 'ask1Top23q', 'ask1Top24q',
       'ask1Top25q', 'ask1Top26q', 'ask1Top27q', 'ask1Top28q',
       'ask1Top29q', 'ask1Top30q', 'ask1Top31q', 'ask1Top32q',
       'ask1Top33q', 'ask1Top34q', 'ask1Top35q', 'ask1Top36q',
       'ask1Top37q', 'ask1Top38q', 'ask1Top39q', 'ask1Top40q',
       'ask1Top41q', 'ask1Top42q', 'ask1Top43q', 'ask1Top44q',
       'ask1Top45q', 'ask1Top46q', 'ask1Top47q', 'ask1Top48q',
       'ask1Top49q', 'ask1Top50q']
    SH = SH.fillna(0)
#     SH["p1"] = SH["bid1p"] + SH["ask1p"]
#     tt = SH[(SH["cum_volume"] > 0) & (SH["time"] < 145700000000)].groupby("skey")['p1'].min()
#     SH.drop("p1", axis=1, inplace=True)
#     try:
#         assert(tt[tt == 0].shape[0] == 0)
#     except:
#         display(tt[tt == 0])
#     SH = SH[~((SH["bid1p"] == 0) & (SH["ask1p"] == 0))]
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    
    SH["has_missing"] = 0
    
    for col in ["skey", "date", "cum_trades_cnt", "total_bid_orders",
        'total_ask_orders', 'total_bid_levels', 'total_ask_levels', 'cum_canceled_buy_orders','cum_canceled_sell_orders',
            "ordering", 'bid_trade_max_duration', 'ask_trade_max_duration','has_missing']:
        SH[col] = SH[col].astype('int32')
    
#     for cols in ["prev_close", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
#              'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p']:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
    
#     for cols in ['cum_amount', "cum_canceled_sell_amount", "cum_canceled_buy_amount"]:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
        
    for cols in ['total_bid_vwap', "total_ask_vwap"]:
        SH[cols] = SH[cols].apply(lambda x: round(x, 3))
        
   
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"]) 
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)
    print(datetime.datetime.now() - startTm)
    
    
    # check 1
    startTm = datetime.datetime.now()
    da_te = str(SH["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    db1["ID"] = db1["ID"].str[2:].astype(int) + 1000000
    db1["date"] = (db1["date"].str[:4] + db1["date"].str[5:7] + db1["date"].str[8:]).astype(int)
    SH["cum_max"] = SH.groupby("skey")["cum_volume"].transform(max)
    s2 = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey").first().reset_index()
    dd = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey")["time"].first().reset_index()
    SH.drop("cum_max", axis=1, inplace=True)
    s2 = s2.rename(columns={"skey": "ID", 'open':"d_open", "prev_close":"d_yclose","high":"d_high", "low":"d_low", "close":"d_close", "cum_volume":"d_volume", "cum_amount":"d_amount"})
    if SH["date"].iloc[0] < 20180820:
        s2["auction"] = 0
    else:
        dd["auction"] = np.where(dd["time"]<=145700000000, 0, 1)
        dd = dd.rename(columns={"skey": "ID"})
        s2 = pd.merge(s2, dd[["ID", "auction"]], on="ID")
    s2 = s2[["ID", "date", "d_open", "d_yclose", "d_high", "d_low", "d_close", "d_volume", "d_amount", "auction"]]
    re = pd.merge(db1, s2, on=["ID", "date", "d_open", "d_yclose","d_high", "d_low", "d_volume"], how="outer")
    try:
        assert(sum(re["d_amount_y"].isnull()) == 0)
    except:
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print(re[re["d_amount_y"].isnull()])
    print(datetime.datetime.now() - startTm)
    
    # check 2
    # first part
    startTm = datetime.datetime.now()
    date = pd.DataFrame(pd.date_range(start='2019-06-10 08:30:00', end='2019-06-10 18:00:00', freq='s'), columns=["Orig"])
    date["time"] = date["Orig"].apply(lambda x: int(x.strftime("%H%M%S"))*1000)
    date["group"] = date["time"]//30000
    SH["group"] = SH["time"]//30000000
    gl = date[((date["time"] >= 93000000) & (date["time"] <= 113000000))|((date["time"] >= 130000000) & (date["time"] <= 150000000))]["group"].unique()
    l = set(gl) - set(SH["group"].unique())
    SH["has_missing1"] = 0 
    if len(l) != 0:
        print("massive missing")
        print(l)
        SH["order"] = SH.groupby(["skey", "time"]).cumcount()
        for i in l:
            SH["t"] = SH[SH["group"] > i].groupby("StockID")["time"].transform("min")
            SH["has_missing1"] = np.where((SH["time"] == SH["t"]) & (SH["order"] == 0), 1, 0)
        SH.drop(["order", "t", "group"], axis=1, inplace=True)   
    else:
        print("no massive missing")
        SH.drop(["group"], axis=1, inplace=True)
    



    # second part

    SH["time_interval"] = SH.groupby("skey")["datetime"].apply(lambda x: x - x.shift(1))
    SH["time_interval"] = SH["time_interval"].apply(lambda x: x.seconds)
    SH["tn_update"] = SH.groupby("skey")["cum_trades_cnt"].apply(lambda x: x-x.shift(1))

    f1 = SH[(SH["time"] >= 93000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f1 = f1.rename(columns={"time": "time1"})
    f2 = SH[(SH["time"] >= 130000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f2 = f2.rename(columns={"time": "time2"})
    f3 = SH[(SH["time"] >= 150000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f3 = f3.rename(columns={"time": "time3"})
    SH = pd.merge(SH, f1, on="skey", how="left")
    del f1
    SH = pd.merge(SH, f2, on="skey", how="left")
    del f2
    SH = pd.merge(SH, f3, on="skey", how="left")
    del f3
    p99 = SH[(SH["time"] > 93000000000) & (SH["time"] < 145700000000) & (SH["time"] != SH["time2"]) & (SH["tn_update"] != 0)]\
    .groupby("skey")["tn_update"].apply(lambda x: x.describe([0.99])["99%"]).reset_index()
    p99 = p99.rename(columns={"tn_update":"99%"})
    SH = pd.merge(SH, p99, on="skey", how="left")

    SH["has_missing2"] = 0
    SH["has_missing2"] = np.where((SH["time_interval"] > 60) & (SH["tn_update"] > SH["99%"]) & 
         (SH["time"] > SH["time1"]) & (SH["time"] != SH["time2"]) & (SH["time"] != SH["time3"]), 1, 0)
    SH.drop(["time_interval", "tn_update", "time1", "time2", "time3", "99%"], axis=1, inplace=True) 

    SH["has_missing"] = np.where((SH["has_missing1"] == 1) | (SH["has_missing2"] == 1), 1, 0)
    SH.drop(["has_missing1", "has_missing2"], axis=1, inplace=True) 
    if SH[SH["has_missing"] == 1].shape[0] != 0:
        print("has missing!!!!!!!!!!!!!!!!!!!!!!!")
        print(SH[SH["has_missing"] == 1].shape[0])
    print(datetime.datetime.now() - startTm)
    
    
    
    startTm = datetime.datetime.now()
    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "has_missing", "cum_trades_cnt", "cum_volume", "cum_amount", "prev_close",
                            "open", "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p','bid2p','bid1p',
                            'ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'bid10q','bid9q','bid8q',
                             'bid7q','bid6q','bid5q','bid4q','bid3q','bid2q','bid1q', 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q',
                             'ask7q','ask8q','ask9q','ask10q', 'bid10n', 'bid9n', 'bid8n', 'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 
                             'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n','ask7n', 'ask8n', 'ask9n', 'ask10n','bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q',"total_bid_quantity", "total_ask_quantity","total_bid_vwap", "total_ask_vwap",
        "total_bid_orders",'total_ask_orders','total_bid_levels', 'total_ask_levels', 'bid_trade_max_duration', 'ask_trade_max_duration', 'cum_canceled_buy_orders', 'cum_canceled_buy_volume',
        "cum_canceled_buy_amount", "cum_canceled_sell_orders", 'cum_canceled_sell_volume',"cum_canceled_sell_amount"]]
    
    display(SH["date"].iloc[0])
    print("SH finished")
    print(datetime.datetime.now() - startTm)
    
    db1 = DB("mongodb://user_rw:faa96dfc@192.168.10.223")
    db1.write('snapshot', SH)
    
    del SH

  app.launch_new_instance()


0:02:48.454751
0:01:33.273985
0:01:01.984531


In [3]:
SH[SH['has_missing'] == 1]

Unnamed: 0,skey,date,time,clockAtArrival,datetime,ordering,has_missing,cum_trades_cnt,cum_volume,cum_amount,prev_close,open,high,low,close,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,bid1Top1q,bid1Top2q,bid1Top3q,bid1Top4q,bid1Top5q,bid1Top6q,bid1Top7q,bid1Top8q,bid1Top9q,bid1Top10q,bid1Top11q,bid1Top12q,bid1Top13q,bid1Top14q,bid1Top15q,bid1Top16q,bid1Top17q,bid1Top18q,bid1Top19q,bid1Top20q,bid1Top21q,bid1Top22q,bid1Top23q,bid1Top24q,bid1Top25q,bid1Top26q,bid1Top27q,bid1Top28q,bid1Top29q,bid1Top30q,bid1Top31q,bid1Top32q,bid1Top33q,bid1Top34q,bid1Top35q,bid1Top36q,bid1Top37q,bid1Top38q,bid1Top39q,bid1Top40q,bid1Top41q,bid1Top42q,bid1Top43q,bid1Top44q,bid1Top45q,bid1Top46q,bid1Top47q,bid1Top48q,bid1Top49q,bid1Top50q,ask1Top1q,ask1Top2q,ask1Top3q,ask1Top4q,ask1Top5q,ask1Top6q,ask1Top7q,ask1Top8q,ask1Top9q,ask1Top10q,ask1Top11q,ask1Top12q,ask1Top13q,ask1Top14q,ask1Top15q,ask1Top16q,ask1Top17q,ask1Top18q,ask1Top19q,ask1Top20q,ask1Top21q,ask1Top22q,ask1Top23q,ask1Top24q,ask1Top25q,ask1Top26q,ask1Top27q,ask1Top28q,ask1Top29q,ask1Top30q,ask1Top31q,ask1Top32q,ask1Top33q,ask1Top34q,ask1Top35q,ask1Top36q,ask1Top37q,ask1Top38q,ask1Top39q,ask1Top40q,ask1Top41q,ask1Top42q,ask1Top43q,ask1Top44q,ask1Top45q,ask1Top46q,ask1Top47q,ask1Top48q,ask1Top49q,ask1Top50q,total_bid_quantity,total_ask_quantity,total_bid_vwap,total_ask_vwap,total_bid_orders,total_ask_orders,total_bid_levels,total_ask_levels,bid_trade_max_duration,ask_trade_max_duration,cum_canceled_buy_orders,cum_canceled_buy_volume,cum_canceled_buy_amount,cum_canceled_sell_orders,cum_canceled_sell_volume,cum_canceled_sell_amount
5120210,1688001,20190722,110143000000,1563764503000000,2019-07-22 11:01:43,1836,1,39630,21651644,1080342000.0,24.26,55.4,72.02,39.59,72.02,71.0,71.02,71.1,71.51,71.9,71.92,71.96,72.0,72.01,72.02,72.05,72.18,72.25,72.28,72.45,72.5,72.51,72.57,72.68,72.78,2500,1400,4500,500,1093,500,500,9739,200,1304,200,200,200,500,500,900,200,200,500,3500,4,3,1,1,1,1,1,11,1,3,1,1,1,1,1,3,1,1,1,9,104,1000,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9291275,327022,14.42,173.729,3575,555,1014,219,723,4897,11137,17677314,145530000000.0,7394,7278508,88553790000.0
5123599,1688002,20190722,94355000000,1563759835000000,2019-07-22 09:43:55,280,1,11417,7830262,439936100.0,20.0,59.1,60.28,41.36,41.36,41.17,41.18,41.2,41.23,41.28,41.29,41.3,41.33,41.35,41.36,41.37,41.38,41.4,41.48,41.5,41.6,41.75,41.8,41.82,41.97,500,1369,500,200,500,200,9000,400,2515,4287,21375,500,200,500,20704,200,200,8732,200,500,1,2,1,1,1,1,6,2,1,4,49,2,1,1,4,1,1,20,1,1,787,500,500,2500,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,375,500,500,500,500,500,500,200,500,200,500,250,500,500,300,500,500,1000,500,200,300,500,200,500,500,500,500,400,1000,500,300,500,300,500,400,200,200,500,500,200,500,500,500,500,300,300,500,500,250,0,11359564,1388876,14.34,64.704,3011,1771,596,448,600,600,3190,7139106,38760170000.0,4142,3924661,46411920000.0
5128545,1688003,20190722,94346000000,1563759826000000,2019-07-22 09:43:46,276,1,6996,3838962,189655900.0,25.5,55.8,55.8,39.02,39.02,38.8,38.86,38.88,38.89,38.9,38.96,38.98,38.99,39.0,39.01,39.02,39.04,39.1,39.12,39.24,39.27,39.47,39.48,39.5,39.8,6129,3477,400,700,1000,500,400,200,14400,200,187159,500,300,500,500,500,500,100,3114,6000,5,1,2,2,1,2,2,1,36,1,279,1,1,1,1,1,1,1,8,1,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13361,500,500,200,500,500,500,500,500,500,249,500,200,200,500,500,500,500,500,200,200,200,500,200,300,200,500,500,500,500,500,500,500,500,500,300,500,2431,12918,500,500,500,500,500,500,300,500,500,500,200,6600985,1498616,9.777,77.469,1598,2353,419,513,600,599,1638,2810739,15751530000.0,6413,4634342,49589170000.0
5135067,1688005,20190722,110217000000,1563764537000000,2019-07-22 11:02:17,1850,1,38566,24394059,1020530000.0,26.62,42.58,55.69,35.12,55.69,55.5,55.51,55.54,55.55,55.56,55.58,55.6,55.65,55.66,55.68,55.69,55.71,55.75,55.8,55.88,55.98,56.0,56.09,56.14,56.2,42087,1500,1500,6683,400,482,16058,1502,2800,2000,106,500,200,200,200,900,4100,500,200,200,37,2,2,9,2,2,7,2,2,1,1,1,1,1,1,3,10,1,1,1,2000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,106,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6186121,199661,15.76,122.061,2798,515,805,243,598,4935,9704,14970310,118030000000.0,6032,5477419,55070450000.0
5135105,1688005,20190722,111407000000,1563765247000000,2019-07-22 11:14:07,1888,1,39700,24863222,1050875000.0,26.62,42.58,68.13,35.12,68.0,66.3,66.55,66.62,66.66,66.88,66.99,67.0,67.13,67.5,67.99,68.0,68.01,68.02,68.03,68.04,68.05,68.06,68.07,68.08,68.09,200,1277,1000,200,200,300,500,2001,107,700,159545,1000,1200,400,700,600,200,451,2950,700,1,3,1,1,1,1,2,1,1,2,80,2,3,2,2,3,1,2,8,2,500,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,200,500,200,100,500,9091,9091,9091,9091,9091,9091,9091,9091,9091,9091,9091,3465,3465,3465,3465,3465,3465,3465,3465,3465,3465,3465,100,350,426,350,350,350,350,331,200,350,500,500,500,500,500,300,500,500,200,500,300,100,5700820,466182,12.809,92.634,2382,787,805,214,600,5039,11455,17466955,147617600000.0,6265,5573031,57213760000.0
5138198,1688006,20190722,94123000000,1563759683000000,2019-07-22 09:41:23,230,1,3451,3202370,165387200.0,27.43,49.0,63.7,48.6,58.88,55.4,55.8,56.88,56.89,57.0,57.3,57.5,57.88,58.0,58.8,58.88,59.0,59.8,59.84,59.88,59.9,59.97,59.98,59.99,60.0,200,200,1000,200,2153,200,200,1200,8500,700,20790,2000,500,200,600,700,250,350,700,49552,1,1,1,1,5,1,1,3,5,2,54,5,1,1,2,2,1,1,2,127,200,500,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1106,893,893,782,820,756,744,737,705,693,605,733,569,518,518,595,569,505,505,505,493,482,469,444,407,322,305,356,405,290,207,207,207,205,195,193,193,205,182,158,156,144,93,93,93,93,69,82,58,56,6329280,1473530,17.477,76.467,2560,2339,593,331,599,587,4527,6970637,62726410000.0,852,519094,9020269000.0
5144701,1688007,20190722,105909000000,1563764349000000,2019-07-22 10:59:09,1788,1,46542,30784614,977016100.0,17.5,35.02,45.54,26.45,45.54,45.3,45.43,45.45,45.47,45.49,45.5,45.51,45.52,45.53,45.54,45.55,45.57,45.58,45.59,45.6,45.62,45.66,45.67,45.68,45.71,1800,25105,255,500,27838,5708,1000,700,200,69363,500,200,3743,500,2000,200,200,100,400,200,1,1,1,1,3,9,1,2,1,73,1,1,7,1,6,1,1,1,2,1,1391,1000,1618,200,300,200,401,486,200,2520,1000,6570,465,1000,600,666,200,200,200,2304,200,736,200,1559,500,200,200,200,200,350,400,299,200,3700,500,900,200,200,200,200,300,1000,898,1000,500,500,600,200,500,200,500,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7769195,266667,10.912,132.075,2613,673,807,285,753,4741,9109,18118157,106133300000.0,8687,8887474,67443060000.0
5153513,1688009,20190722,95408000000,1563760448000000,2019-07-22 09:54:08,491,1,170181,241443126,2502029000.0,5.85,11.7,13.21,8.19,8.19,8.1,8.11,8.12,8.13,8.14,8.15,8.16,8.17,8.18,8.19,8.2,8.21,8.22,8.23,8.24,8.25,8.26,8.27,8.28,8.29,574638,141017,42116,48654,12700,188820,138338,95652,535153,8586675,2783647,38200,131701,2000,6000,34300,123000,2800,274464,29041,179,56,26,25,9,61,55,39,224,3662,348,74,69,4,11,55,50,6,66,55,31841,300,200,1000,2000,1000,1107,400,200,500,2000,10000,2000,2000,200,1000,500,200,1500,600,500,1000,1000,1000,300,2000,200,200,1000,1200,1000,12213,10000,5000,1000,10000,201,1000,1000,200,500,500,586,500,200,1000,200,300,7642,203,500,500,500,500,500,500,500,500,500,500,500,500,500,500,500,500,500,755,8284,20711,18793,29599,500,500,200,500,500,500,200,500,500,500,300,500,300,300,1000,500,500,500,500,500,500,500,500,500,1000,500,500,500,66842703,43184484,6.091,15.531,13662,24759,474,1629,848,600,16711,94103508,162849200000.0,70867,120454134,297622600000.0
5154821,1688009,20190722,110929000000,1563764969000000,2019-07-22 11:09:29,1799,1,453765,688561614,6860069000.0,5.85,11.7,15.21,7.69,15.2,15.09,15.1,15.11,15.13,15.15,15.16,15.17,15.18,15.19,15.2,15.21,15.22,15.23,15.24,15.25,15.26,15.27,15.28,15.29,15.3,1500,16907,20680,400,10441,6232,32891,50088,119299,250083,2716452,14179,10000,2500,22850,9000,5800,26150,12295,36600,2,12,10,2,8,4,17,24,46,59,5258,34,23,5,52,22,12,54,27,86,65871,3700,7000,2000,3000,1000,800,5000,200,500,1600,1000,400,1000,500,20000,1174,1000,5000,1000,5000,300,300,600,20000,1427,200,2000,1000,500,400,250,400,1300,200,500,2000,1200,400,1000,200,24878,17826,500,200,700,10143,200,1000,15836,200,500,500,500,500,500,500,300,500,200,200,500,500,250,500,500,500,200,500,500,500,1000,200,500,500,500,500,500,200,500,200,500,500,500,300,500,200,500,500,200,500,500,500,500,500,400,500,500,500,500,73309631,8925937,7.018,37.949,23024,14994,1142,1078,1036,4755,81052,395727841,774315400000.0,132113,200116038,492457400000.0
5159594,1688010,20190722,110017000000,1563764417000000,2019-07-22 11:00:17,1812,1,33105,18326997,775787800.0,25.22,40.03,52.5,36.12,52.5,52.2,52.21,52.22,52.28,52.3,52.35,52.37,52.38,52.48,52.5,52.52,52.58,52.62,52.66,52.68,52.78,52.88,52.98,53.0,53.01,23334,1608,2500,300,4775,200,200,500,746,15078,1000,300,250,200,500,200,200,200,4000,300,15,2,2,1,4,1,1,2,1,18,2,1,1,1,1,1,1,1,10,1,707,400,200,3000,2000,1000,816,500,300,245,200,400,200,400,610,1000,500,2600,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,500,500,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6281900,298393,14.373,123.571,2448,674,744,296,1389,4817,12282,17007450,134637100000.0,6301,5013327,52085830000.0


In [7]:
startDate = 20190111
endDate = 20190111
targetStockLs = [1600000]

db = DB("mongodb://user_rw:faa96dfc@192.168.10.223")

# pick all stocks from certain period
mdData = db.read('snapshot', start_date=startDate, end_date=endDate)
mdData

Unnamed: 0,skey,date,time,clockAtArrival,datetime,ordering,has_missing,cum_trades_cnt,cum_volume,cum_amount,...,total_bid_levels,total_ask_levels,bid_trade_max_duration,ask_trade_max_duration,cum_canceled_buy_orders,cum_canceled_buy_volume,cum_canceled_buy_amount,cum_canceled_sell_orders,cum_canceled_sell_volume,cum_canceled_sell_amount
0,2000001,20190111,91509000000,1547169309000000,2019-01-11 09:15:09,1,0,0,0,0.000000e+00,...,0,0,0,0,0,0,0.0,0,0,0.0
1,2000001,20190111,91518000000,1547169318000000,2019-01-11 09:15:18,2,0,0,0,0.000000e+00,...,0,0,0,0,0,0,0.0,0,0,0.0
2,2000001,20190111,91536000000,1547169336000000,2019-01-11 09:15:36,3,0,0,0,0.000000e+00,...,0,0,0,0,0,0,0.0,0,0,0.0
3,2000001,20190111,91545000000,1547169345000000,2019-01-11 09:15:45,4,0,0,0,0.000000e+00,...,0,0,0,0,0,0,0.0,0,0,0.0
4,2000001,20190111,91603000000,1547169363000000,2019-01-11 09:16:03,5,0,0,0,0.000000e+00,...,0,0,0,0,0,0,0.0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6819322,2300760,20190111,152303000000,1547191383000000,2019-01-11 15:23:03,4425,0,7310,2072275,2.151257e+08,...,0,0,0,0,0,0,0.0,0,0,0.0
6819323,2300760,20190111,152403000000,1547191443000000,2019-01-11 15:24:03,4426,0,7310,2072275,2.151257e+08,...,0,0,0,0,0,0,0.0,0,0,0.0
6819324,2300760,20190111,152503000000,1547191503000000,2019-01-11 15:25:03,4427,0,7310,2072275,2.151257e+08,...,0,0,0,0,0,0,0.0,0,0,0.0
6819325,2300760,20190111,152603000000,1547191563000000,2019-01-11 15:26:03,4428,0,7310,2072275,2.151257e+08,...,0,0,0,0,0,0,0.0,0,0,0.0


In [8]:
mdData[mdData["skey"] < 2000000]

Unnamed: 0,skey,date,time,clockAtArrival,datetime,ordering,has_missing,cum_trades_cnt,cum_volume,cum_amount,...,total_bid_levels,total_ask_levels,bid_trade_max_duration,ask_trade_max_duration,cum_canceled_buy_orders,cum_canceled_buy_volume,cum_canceled_buy_amount,cum_canceled_sell_orders,cum_canceled_sell_volume,cum_canceled_sell_amount


In [157]:
mdData["date"].unique()

array([20190102, 20190103, 20190104, 20190107, 20190108, 20190109,
       20190110, 20190111, 20190116, 20190117, 20190118, 20190722,
       20190723, 20190724, 20190725, 20190726, 20190729, 20190730,
       20190731, 20190801, 20190802, 20190805, 20190806, 20190807,
       20190808, 20190809, 20190812, 20190813, 20190814, 20190815,
       20190816, 20190819, 20190820, 20190821, 20190822, 20190823,
       20190826, 20190827, 20190828, 20190829, 20190830, 20190902,
       20190903, 20190904, 20190905, 20190906, 20190909, 20190910,
       20190911, 20190912, 20190916, 20190917, 20190918, 20190919,
       20190920, 20190923, 20190924, 20190925, 20190926, 20190927,
       20190930], dtype=int64)

In [142]:
mdData[(mdData["skey"] == 1600552) & (mdData["time"] <= 145817000000)]

TypeError: 'NoneType' object is not subscriptable

In [12]:
kk = pd.read_csv(r"G:\KR\2019\SH\20190225\snapshot\Level_2\600522.csv")
pd.set_option("max_rows", 200)
kk["bid1p"] = kk["BidPrice"].str[1:-1].apply(lambda x: float(x.split(',')[0]))
kk["ask1p"] = kk["OfferPrice"].str[1:-1].apply(lambda x: float(x.split(',')[0]))
display(kk[(kk["Volume"] > 0) & (kk["ask1p"] == 0) & (kk["bid1p"] == 0) & (kk["QuotTime"] < 20190225145700000)])
kk[kk["QuotTime"] >= 20190225092450000].head(20)[["NumTrades", "QuotTime", "Volume", "Amount", "BidPrice", "OfferPrice", 
                                                 "OpenPx", "PreClosePx", "LastPx", "InstrumentStatus"]]
# kk[kk["QuotTime"] >= 20190722110200000].head(50)[["NumTrades", "QuotTime", "Volume", "Amount", "BidPrice", "BidOrderQty", "OfferPrice", "OfferOrderQty",
#                                                  "OpenPx", "HighPx", "PreClosePx", "LowPx", "LastPx", "InstrumentStatus"]]

Unnamed: 0,NumTrades,OfferTradeMaxDuration,ImageStatus,TotalBidNumber,TotalWarrantExecQty,WithdrawSellMoney,IOPV,BidOrders,ETFSellAmount,TotalOfferQty,WithdrawBuyNumber,WeightedAvgOfferPx,ETFBuyNumber,WarLowerPx,MsgSeqNum,WithdrawSellAmount,ETFSellMoney,Volume,BidOrderQty,OpenPx,HighPx,PreClosePx,LowPx,WeightedAvgBidPx,ETFSellNumber,OfferNumOrders,WithdrawSellNumber,ETFBuyAmount,TotalOfferNumber,OfferPrice,NumOfferOrders,BidPrice,OfferOrderQty,TotalBidQty,SendingTime,ETFBuyMoney,InstrumentStatus,WithdrawBuyAmount,ClosePx,BidTradeMaxDuration,NumBidOrders,LastPx,Amount,AveragePx,WarUpperPx,YieldToMaturity,BidNumOrders,WithdrawBuyMoney,TradingPhaseCode,QuotTime,OfferOrders,bid1p,ask1p
230,229,0,1,0,0,0.0,0,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",0,0,0,0.0,0,0,10577,0,0,340300,"[0,0,0,0,0,0,0,0,0,0]",10.3,10.3,9.9,10.3,0.0,0,"[0,0,0,0,0,0,0,0,0,0]",0,0,0,"[0,0,0,0,0,0,0,0,0,0]",0,"[0,0,0,0,0,0,0,0,0,0]","[0,0,0,0,0,0,0,0,0,0]",0,20190225092502000,0,OCALL,0,0.0,0,0,10.3,3505090.0,10.3,0,0,"[0,0,0,0,0,0,0,0,0,0]",0.0,T111,20190225092501000,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",0.0,0.0


Unnamed: 0,NumTrades,QuotTime,Volume,Amount,BidPrice,OfferPrice,OpenPx,PreClosePx,LastPx,InstrumentStatus
226,0,20190225092450000,0,0.0,"[10.3,0,0,0,0,0,0,0,0,0]","[10.3,0,0,0,0,0,0,0,0,0]",0.0,9.9,0.0,OCALL
227,0,20190225092453000,0,0.0,"[10.3,0,0,0,0,0,0,0,0,0]","[10.3,0,0,0,0,0,0,0,0,0]",0.0,9.9,0.0,OCALL
228,0,20190225092456000,0,0.0,"[10.3,0,0,0,0,0,0,0,0,0]","[10.3,0,0,0,0,0,0,0,0,0]",0.0,9.9,0.0,OCALL
229,0,20190225092459000,0,0.0,"[10.3,0,0,0,0,0,0,0,0,0]","[10.3,0,0,0,0,0,0,0,0,0]",0.0,9.9,0.0,OCALL
230,229,20190225092501000,340300,3505090.0,"[0,0,0,0,0,0,0,0,0,0]","[0,0,0,0,0,0,0,0,0,0]",10.3,9.9,10.3,OCALL
231,1073,20190225092501000,2494778,25696213.4,"[10.3,10.29,10.28,10.27,10.26,10.25,10.24,10.2...","[10.31,10.32,10.33,10.34,10.35,10.36,10.37,10....",10.3,9.9,10.3,TRADE
232,1073,20190225092601000,2494778,25696213.4,"[10.3,10.29,10.28,10.27,10.26,10.25,10.24,10.2...","[10.31,10.32,10.33,10.34,10.35,10.36,10.37,10....",10.3,9.9,10.3,TRADE
233,1073,20190225092701000,2494778,25696213.4,"[10.3,10.29,10.28,10.27,10.26,10.25,10.24,10.2...","[10.31,10.32,10.33,10.34,10.35,10.36,10.37,10....",10.3,9.9,10.3,TRADE
234,1073,20190225092801000,2494778,25696213.4,"[10.3,10.29,10.28,10.27,10.26,10.25,10.24,10.2...","[10.31,10.32,10.33,10.34,10.35,10.36,10.37,10....",10.3,9.9,10.3,TRADE
235,1073,20190225092901000,2494778,25696213.4,"[10.3,10.29,10.28,10.27,10.26,10.25,10.24,10.2...","[10.31,10.32,10.33,10.34,10.35,10.36,10.37,10....",10.3,9.9,10.3,TRADE


In [67]:
kk = pd.read_csv(r"G:\KR\2019\SH\20190225\snapshot\Level_2\603217.csv")
pd.set_option("max_rows", 200)
kk["bid1p"] = kk["BidPrice"].str[1:-1].apply(lambda x: float(x.split(',')[0]))
kk["ask1p"] = kk["OfferPrice"].str[1:-1].apply(lambda x: float(x.split(',')[0]))
display(kk[(kk["Volume"] > 0) & (kk["ask1p"] == 0) & (kk["bid1p"] == 0)])
kk[kk["QuotTime"] >= 20190225092450000].head(20)[["NumTrades", "QuotTime", "Volume", "Amount", "BidPrice", "BidOrderQty", "OfferPrice", "OfferOrderQty",
                                                 "OpenPx", "HighPx", "PreClosePx", "LowPx", "LastPx"]]

Unnamed: 0,NumTrades,OfferTradeMaxDuration,ImageStatus,TotalBidNumber,TotalWarrantExecQty,WithdrawSellMoney,IOPV,BidOrders,ETFSellAmount,TotalOfferQty,WithdrawBuyNumber,WeightedAvgOfferPx,ETFBuyNumber,WarLowerPx,MsgSeqNum,WithdrawSellAmount,ETFSellMoney,Volume,BidOrderQty,OpenPx,HighPx,PreClosePx,LowPx,WeightedAvgBidPx,ETFSellNumber,OfferNumOrders,WithdrawSellNumber,ETFBuyAmount,TotalOfferNumber,OfferPrice,NumOfferOrders,BidPrice,OfferOrderQty,TotalBidQty,SendingTime,ETFBuyMoney,InstrumentStatus,WithdrawBuyAmount,ClosePx,BidTradeMaxDuration,NumBidOrders,LastPx,Amount,AveragePx,WarUpperPx,YieldToMaturity,BidNumOrders,WithdrawBuyMoney,TradingPhaseCode,QuotTime,OfferOrders,bid1p,ask1p
230,229,0,1,0,0,0.0,0,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",0,0,0,0.0,0,0,10577,0,0,340300,"[0,0,0,0,0,0,0,0,0,0]",10.3,10.3,9.9,10.3,0.0,0,"[0,0,0,0,0,0,0,0,0,0]",0,0,0,"[0,0,0,0,0,0,0,0,0,0]",0,"[0,0,0,0,0,0,0,0,0,0]","[0,0,0,0,0,0,0,0,0,0]",0,20190225092502000,0,OCALL,0,0.0,0,0,10.3,3505090.0,10.3,0,0,"[0,0,0,0,0,0,0,0,0,0]",0.0,T111,20190225092501000,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",0.0,0.0
5066,76398,8022,1,3310,0,475459100.0,0,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",0,14418559,14551,10.651,0,0,94253,45721672,0,162076888,"[0,0,0,0,0,0,0,0,0,0]",10.3,10.52,9.9,10.05,10.046,0,"[0,0,0,0,0,0,0,0,0,0]",9970,0,5137,"[0,0,0,0,0,0,0,0,0,0]",60,"[0,0,0,0,0,0,0,0,0,0]","[0,0,0,0,0,0,0,0,0,0]",8274674,20190225145659000,0,CCALL,51680138,0.0,3336,121,10.3,1667605000.0,10.28897,0,0,"[0,0,0,0,0,0,0,0,0,0]",526549500.0,T111,20190225145700000,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",0.0,0.0


Unnamed: 0,NumTrades,QuotTime,Volume,Amount,BidPrice,BidOrderQty,OfferPrice,OfferOrderQty,OpenPx,HighPx,PreClosePx,LowPx,LastPx
226,0,20190225092450000,0,0.0,"[10.3,0,0,0,0,0,0,0,0,0]","[2339178,794422,0,0,0,0,0,0,0,0]","[10.3,0,0,0,0,0,0,0,0,0]","[2339178,0,0,0,0,0,0,0,0,0]",0.0,0.0,9.9,0.0,0.0
227,0,20190225092453000,0,0.0,"[10.3,0,0,0,0,0,0,0,0,0]","[2392378,788822,0,0,0,0,0,0,0,0]","[10.3,0,0,0,0,0,0,0,0,0]","[2392378,0,0,0,0,0,0,0,0,0]",0.0,0.0,9.9,0.0,0.0
228,0,20190225092456000,0,0.0,"[10.3,0,0,0,0,0,0,0,0,0]","[2395678,844022,0,0,0,0,0,0,0,0]","[10.3,0,0,0,0,0,0,0,0,0]","[2395678,0,0,0,0,0,0,0,0,0]",0.0,0.0,9.9,0.0,0.0
229,0,20190225092459000,0,0.0,"[10.3,0,0,0,0,0,0,0,0,0]","[2396678,917622,0,0,0,0,0,0,0,0]","[10.3,0,0,0,0,0,0,0,0,0]","[2396678,0,0,0,0,0,0,0,0,0]",0.0,0.0,9.9,0.0,0.0
230,229,20190225092501000,340300,3505090.0,"[0,0,0,0,0,0,0,0,0,0]","[0,0,0,0,0,0,0,0,0,0]","[0,0,0,0,0,0,0,0,0,0]","[0,0,0,0,0,0,0,0,0,0]",10.3,10.3,9.9,10.3,10.3
231,1073,20190225092501000,2494778,25696213.4,"[10.3,10.29,10.28,10.27,10.26,10.25,10.24,10.2...","[834422,349300,361300,95900,351200,254900,4900...","[10.31,10.32,10.33,10.34,10.35,10.36,10.37,10....","[34300,42900,26000,25700,242200,17500,200,2830...",10.3,10.3,9.9,10.3,10.3
232,1073,20190225092601000,2494778,25696213.4,"[10.3,10.29,10.28,10.27,10.26,10.25,10.24,10.2...","[834422,349300,361300,95900,351200,254900,4900...","[10.31,10.32,10.33,10.34,10.35,10.36,10.37,10....","[34300,42900,26000,25700,242200,17500,200,2830...",10.3,10.3,9.9,10.3,10.3
233,1073,20190225092701000,2494778,25696213.4,"[10.3,10.29,10.28,10.27,10.26,10.25,10.24,10.2...","[834422,349300,361300,95900,351200,254900,4900...","[10.31,10.32,10.33,10.34,10.35,10.36,10.37,10....","[34300,42900,26000,25700,242200,17500,200,2830...",10.3,10.3,9.9,10.3,10.3
234,1073,20190225092801000,2494778,25696213.4,"[10.3,10.29,10.28,10.27,10.26,10.25,10.24,10.2...","[834422,349300,361300,95900,351200,254900,4900...","[10.31,10.32,10.33,10.34,10.35,10.36,10.37,10....","[34300,42900,26000,25700,242200,17500,200,2830...",10.3,10.3,9.9,10.3,10.3
235,1073,20190225092901000,2494778,25696213.4,"[10.3,10.29,10.28,10.27,10.26,10.25,10.24,10.2...","[834422,349300,361300,95900,351200,254900,4900...","[10.31,10.32,10.33,10.34,10.35,10.36,10.37,10....","[34300,42900,26000,25700,242200,17500,200,2830...",10.3,10.3,9.9,10.3,10.3


In [43]:
kk.columns

Index(['NumTrades', 'OfferTradeMaxDuration', 'ImageStatus', 'TotalBidNumber',
       'TotalWarrantExecQty', 'WithdrawSellMoney', 'IOPV', 'BidOrders',
       'ETFSellAmount', 'TotalOfferQty', 'WithdrawBuyNumber',
       'WeightedAvgOfferPx', 'ETFBuyNumber', 'WarLowerPx', 'MsgSeqNum',
       'WithdrawSellAmount', 'ETFSellMoney', 'Volume', 'BidOrderQty', 'OpenPx',
       'HighPx', 'PreClosePx', 'LowPx', 'WeightedAvgBidPx', 'ETFSellNumber',
       'OfferNumOrders', 'WithdrawSellNumber', 'ETFBuyAmount',
       'TotalOfferNumber', 'OfferPrice', 'NumOfferOrders', 'BidPrice',
       'OfferOrderQty', 'TotalBidQty', 'SendingTime', 'ETFBuyMoney',
       'InstrumentStatus', 'WithdrawBuyAmount', 'ClosePx',
       'BidTradeMaxDuration', 'NumBidOrders', 'LastPx', 'Amount', 'AveragePx',
       'WarUpperPx', 'YieldToMaturity', 'BidNumOrders', 'WithdrawBuyMoney',
       'TradingPhaseCode', 'QuotTime', 'OfferOrders'],
      dtype='object')

In [36]:
SH["bid1p"] = SH["BidPrice"].str[1:-1].apply(lambda x: float(x.split(',')[0]))
SH["ask1p"] = SH["OfferPrice"].str[1:-1].apply(lambda x: float(x.split(',')[0]))

In [60]:
SH["p1"] = SH["bid1p"] + SH["ask1p"]
tt = SH[(SH["Volume"] > 0) & (SH["time"] < 145700000000)].groupby("skey")['p1'].min()
tt[tt == 0]

skey
1600353    0.0
Name: p1, dtype: float64

In [41]:
re = SH[(SH["Volume"] > 0) & (SH["p1"] == 0)].groupby("skey")["time"].unique()

skey
1600000    [145700000000, 145703000000]
1600004    [145700000000, 145703000000]
1600006    [145700000000, 145703000000]
1600007    [145700000000, 145703000000]
1600008    [145700000000, 145703000000]
                       ...             
1603993    [145700000000, 145703000000]
1603996    [145700000000, 145703000000]
1603997    [145700000000, 145703000000]
1603998    [145700000000, 145703000000]
1603999    [145700000000, 145703000000]
Name: time, Length: 1440, dtype: object

In [53]:
SH[(SH["NumTrades"] >= 8031) & (SH["skey"] == 1600000)].head(10)

Unnamed: 0,NumTrades,OfferTradeMaxDuration,TotalBidNumber,WithdrawSellMoney,BidOrders,TotalOfferQty,WithdrawBuyNumber,WeightedAvgOfferPx,WithdrawSellAmount,Volume,BidOrderQty,OpenPx,HighPx,PreClosePx,LowPx,WeightedAvgBidPx,OfferNumOrders,WithdrawSellNumber,TotalOfferNumber,OfferPrice,NumOfferOrders,BidPrice,OfferOrderQty,TotalBidQty,WithdrawBuyAmount,BidTradeMaxDuration,NumBidOrders,LastPx,Amount,BidNumOrders,WithdrawBuyMoney,QuotTime,OfferOrders,skey,date,time,bid1p,ask1p,p1
4428,8031,5515,505,73970236.29,"[22615,3500,500,200,5000,2000,200,73900,0,0,0,...",5938334,1641,10.308,7328364,20131840,"[107915,278914,329000,67200,175550,167900,1680...",10.03,10.1,10.05,10.01,9.845,"[44,71,87,114,37,29,26,22,104,47]",1340,1574,"[10.07,10.08,10.09,10.1,10.11,10.12,10.13,10.1...",96,"[10.06,10.05,10.04,10.03,10.02,10.01,10,9.99,9...","[166360,430247,500533,700755,297543,178700,416...",2434479,6747881,10466,67,10.06,202435800.0,"[8,18,10,16,21,31,67,16,26,17]",66634991.75,20190114145653000,"[54960,600,2000,2000,20000,2000,100,4000,500,1...",1600000,20190114,145653000000,10.06,10.07,20.13
4429,8031,5515,505,73980306.29,"[22615,3500,500,200,5000,2000,200,73900,0,0,0,...",5937334,1641,10.308,7329364,20131840,"[107915,278914,329000,67200,175550,167900,1680...",10.03,10.1,10.05,10.01,9.845,"[43,71,87,114,37,29,26,22,104,47]",1341,1573,"[10.07,10.08,10.09,10.1,10.11,10.12,10.13,10.1...",96,"[10.06,10.05,10.04,10.03,10.02,10.01,10,9.99,9...","[165360,430247,500533,700755,297543,178700,416...",2434479,6747881,10466,67,10.06,202435800.0,"[8,18,10,16,21,31,67,16,26,17]",66634991.75,20190114145656000,"[54960,600,2000,2000,20000,2000,100,4000,500,1...",1600000,20190114,145656000000,10.06,10.07,20.13
4430,8031,5515,505,73983330.29,"[22615,3500,500,200,5000,2000,200,73900,0,0,0,...",5937534,1641,10.308,7329664,20131840,"[107915,278914,329000,67200,175550,167900,1680...",10.03,10.1,10.05,10.01,9.845,"[44,70,87,114,37,29,26,22,104,47]",1342,1573,"[10.07,10.08,10.09,10.1,10.11,10.12,10.13,10.1...",96,"[10.06,10.05,10.04,10.03,10.02,10.01,10,9.99,9...","[165860,429947,500533,700755,297543,178700,416...",2434479,6747881,10466,67,10.06,202435800.0,"[8,18,10,16,21,31,67,16,26,17]",66634991.75,20190114145659000,"[54960,600,2000,2000,20000,2000,100,4000,500,1...",1600000,20190114,145659000000,10.06,10.07,20.13
4431,8031,5515,505,73983330.29,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",5937534,1641,10.308,7329664,20131840,"[0,0,0,0,0,0,0,0,0,0]",10.03,10.1,10.05,10.01,9.845,"[0,0,0,0,0,0,0,0,0,0]",1342,1573,"[0,0,0,0,0,0,0,0,0,0]",96,"[0,0,0,0,0,0,0,0,0,0]","[0,0,0,0,0,0,0,0,0,0]",2434479,6747881,10466,67,10.06,202435800.0,"[0,0,0,0,0,0,0,0,0,0]",66634991.75,20190114145700000,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",1600000,20190114,145700000000,0.0,0.0,0.0
4432,8031,5515,505,73983330.29,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",5937534,1641,10.308,7329664,20131840,"[0,0,0,0,0,0,0,0,0,0]",10.03,10.1,10.05,10.01,9.845,"[0,0,0,0,0,0,0,0,0,0]",1342,1573,"[0,0,0,0,0,0,0,0,0,0]",96,"[0,0,0,0,0,0,0,0,0,0]","[0,0,0,0,0,0,0,0,0,0]",2434479,6747881,10466,67,10.06,202435800.0,"[0,0,0,0,0,0,0,0,0,0]",66634991.75,20190114145703000,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",1600000,20190114,145703000000,0.0,0.0,0.0
4433,8031,5515,505,73983330.29,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",5937534,1641,10.308,7329664,20131840,"[25500,113715,0,0,0,0,0,0,0,0]",10.03,10.1,10.05,10.01,9.845,"[0,0,0,0,0,0,0,0,0,0]",1342,1573,"[10.06,0,0,0,0,0,0,0,0,0]",96,"[10.06,0,0,0,0,0,0,0,0,0]","[25500,0,0,0,0,0,0,0,0,0]",2434479,6747881,10466,67,10.06,202435800.0,"[0,0,0,0,0,0,0,0,0,0]",66634991.75,20190114145709000,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",1600000,20190114,145709000000,10.06,10.06,20.12
4434,8031,5515,505,73983330.29,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",5937534,1641,10.308,7329664,20131840,"[36000,114615,0,0,0,0,0,0,0,0]",10.03,10.1,10.05,10.01,9.845,"[0,0,0,0,0,0,0,0,0,0]",1342,1573,"[10.06,0,0,0,0,0,0,0,0,0]",96,"[10.06,0,0,0,0,0,0,0,0,0]","[36000,0,0,0,0,0,0,0,0,0]",2434479,6747881,10466,67,10.06,202435800.0,"[0,0,0,0,0,0,0,0,0,0]",66634991.75,20190114145712000,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",1600000,20190114,145712000000,10.06,10.06,20.12
4435,8031,5515,505,73983330.29,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",5937534,1641,10.308,7329664,20131840,"[43200,107415,0,0,0,0,0,0,0,0]",10.03,10.1,10.05,10.01,9.845,"[0,0,0,0,0,0,0,0,0,0]",1342,1573,"[10.06,0,0,0,0,0,0,0,0,0]",96,"[10.06,0,0,0,0,0,0,0,0,0]","[43200,0,0,0,0,0,0,0,0,0]",2434479,6747881,10466,67,10.06,202435800.0,"[0,0,0,0,0,0,0,0,0,0]",66634991.75,20190114145715000,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",1600000,20190114,145715000000,10.06,10.06,20.12
4436,8031,5515,505,73983330.29,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",5937534,1641,10.308,7329664,20131840,"[47700,108015,0,0,0,0,0,0,0,0]",10.03,10.1,10.05,10.01,9.845,"[0,0,0,0,0,0,0,0,0,0]",1342,1573,"[10.06,0,0,0,0,0,0,0,0,0]",96,"[10.06,0,0,0,0,0,0,0,0,0]","[47700,0,0,0,0,0,0,0,0,0]",2434479,6747881,10466,67,10.06,202435800.0,"[0,0,0,0,0,0,0,0,0,0]",66634991.75,20190114145718000,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",1600000,20190114,145718000000,10.06,10.06,20.12
4437,8031,5515,505,73983330.29,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",5937534,1641,10.308,7329664,20131840,"[51700,104015,0,0,0,0,0,0,0,0]",10.03,10.1,10.05,10.01,9.845,"[0,0,0,0,0,0,0,0,0,0]",1342,1573,"[10.06,0,0,0,0,0,0,0,0,0]",96,"[10.06,0,0,0,0,0,0,0,0,0]","[51700,0,0,0,0,0,0,0,0,0]",2434479,6747881,10466,67,10.06,202435800.0,"[0,0,0,0,0,0,0,0,0,0]",66634991.75,20190114145721000,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",1600000,20190114,145721000000,10.06,10.06,20.12


In [38]:
SH[SH["Volume"] > 0]

Unnamed: 0,NumTrades,OfferTradeMaxDuration,TotalBidNumber,WithdrawSellMoney,BidOrders,TotalOfferQty,WithdrawBuyNumber,WeightedAvgOfferPx,WithdrawSellAmount,Volume,BidOrderQty,OpenPx,HighPx,PreClosePx,LowPx,WeightedAvgBidPx,OfferNumOrders,WithdrawSellNumber,TotalOfferNumber,OfferPrice,NumOfferOrders,BidPrice,OfferOrderQty,TotalBidQty,WithdrawBuyAmount,BidTradeMaxDuration,NumBidOrders,LastPx,Amount,BidNumOrders,WithdrawBuyMoney,QuotTime,OfferOrders,skey,date,time,bid1p,ask1p,p1
87,40,0,277,52417.0,"[14500,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",1539582,11,10.521,5200,99900,"[14500,32800,20900,54600,5200,20700,2800,42600...",0.00,10.03,10.05,10.03,9.870,"[3,2,5,3,1,3,4,1,5,5]",6,562,"[10.04,10.05,10.06,10.07,10.08,10.09,10.1,10.1...",93,"[10.03,10.02,10.01,10,9.99,9.98,9.97,9.96,9.95...","[1001,2100,54200,7200,500,4700,12200,1400,1110...",913500,111600,0,57,10.03,1001997.0,"[1,2,7,28,5,13,3,19,15,6]",1162006.0,20190114092500000,"[858,43,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",1600000,20190114,92500000000,10.03,10.04,20.07
88,40,0,277,52417.0,"[14500,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",1539582,11,10.521,5200,99900,"[14500,32800,20900,54600,5200,20700,2800,42600...",10.03,10.03,10.05,10.03,9.870,"[3,2,5,3,1,3,4,1,5,5]",6,562,"[10.04,10.05,10.06,10.07,10.08,10.09,10.1,10.1...",93,"[10.03,10.02,10.01,10,9.99,9.98,9.97,9.96,9.95...","[1001,2100,54200,7200,500,4700,12200,1400,1110...",913500,111600,0,57,10.03,1001997.0,"[1,2,7,28,5,13,3,19,15,6]",1162006.0,20190114092624000,"[858,43,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",1600000,20190114,92624000000,10.03,10.04,20.07
89,40,0,277,52417.0,"[14500,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",1539582,11,10.521,5200,99900,"[14500,32800,20900,54600,5200,20700,2800,42600...",10.03,10.03,10.05,10.03,9.870,"[3,2,5,3,1,3,4,1,5,5]",6,562,"[10.04,10.05,10.06,10.07,10.08,10.09,10.1,10.1...",93,"[10.03,10.02,10.01,10,9.99,9.98,9.97,9.96,9.95...","[1001,2100,54200,7200,500,4700,12200,1400,1110...",913500,111600,0,57,10.03,1001997.0,"[1,2,7,28,5,13,3,19,15,6]",1162006.0,20190114092724000,"[858,43,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",1600000,20190114,92724000000,10.03,10.04,20.07
90,40,0,277,52417.0,"[14500,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",1539582,11,10.521,5200,99900,"[14500,32800,20900,54600,5200,20700,2800,42600...",10.03,10.03,10.05,10.03,9.870,"[3,2,5,3,1,3,4,1,5,5]",6,562,"[10.04,10.05,10.06,10.07,10.08,10.09,10.1,10.1...",93,"[10.03,10.02,10.01,10,9.99,9.98,9.97,9.96,9.95...","[1001,2100,54200,7200,500,4700,12200,1400,1110...",913500,111600,0,57,10.03,1001997.0,"[1,2,7,28,5,13,3,19,15,6]",1162006.0,20190114092824000,"[858,43,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",1600000,20190114,92824000000,10.03,10.04,20.07
91,40,0,277,52417.0,"[14500,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",1539582,11,10.521,5200,99900,"[14500,32800,20900,54600,5200,20700,2800,42600...",10.03,10.03,10.05,10.03,9.870,"[3,2,5,3,1,3,4,1,5,5]",6,562,"[10.04,10.05,10.06,10.07,10.08,10.09,10.1,10.1...",93,"[10.03,10.02,10.01,10,9.99,9.98,9.97,9.96,9.95...","[1001,2100,54200,7200,500,4700,12200,1400,1110...",913500,111600,0,57,10.03,1001997.0,"[1,2,7,28,5,13,3,19,15,6]",1162006.0,20190114092924000,"[858,43,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",1600000,20190114,92924000000,10.03,10.04,20.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4725408,2272,3942,162,9429833.6,"[3400,2000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",1122900,308,5.235,1875640,3123280,"[5400,166300,45400,22100,58600,4800,31200,600,...",5.04,5.04,5.04,4.93,4.868,"[17,6,4,4,5,6,2,3,8,7]",458,318,"[4.95,4.96,4.97,4.98,4.99,5,5.01,5.02,5.03,5.04]",56,"[4.94,4.93,4.92,4.91,4.9,4.89,4.88,4.87,4.86,4...","[85200,16800,7800,13100,9400,34200,10800,11400...",478900,948020,10246,25,4.94,15548082.2,"[2,38,12,15,27,5,9,2,1,3]",4684580.8,20190114151451000,"[12300,8700,1300,8500,600,18800,17300,900,100,...",1603999,20190114,151451000000,4.94,4.95,9.89
4725409,2272,3942,162,9429833.6,"[3400,2000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",1122900,308,5.235,1875640,3123280,"[5400,166300,45400,22100,58600,4800,31200,600,...",5.04,5.04,5.04,4.93,4.868,"[17,6,4,4,5,6,2,3,8,7]",458,318,"[4.95,4.96,4.97,4.98,4.99,5,5.01,5.02,5.03,5.04]",56,"[4.94,4.93,4.92,4.91,4.9,4.89,4.88,4.87,4.86,4...","[85200,16800,7800,13100,9400,34200,10800,11400...",478900,948020,10246,25,4.94,15548082.2,"[2,38,12,15,27,5,9,2,1,3]",4684580.8,20190114151451000,"[12300,8700,1300,8500,600,18800,17300,900,100,...",1603999,20190114,151451000000,4.94,4.95,9.89
4725410,2272,3942,162,9429833.6,"[3400,2000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",1122900,308,5.235,1875640,3123280,"[5400,166300,45400,22100,58600,4800,31200,600,...",5.04,5.04,5.04,4.93,4.868,"[17,6,4,4,5,6,2,3,8,7]",458,318,"[4.95,4.96,4.97,4.98,4.99,5,5.01,5.02,5.03,5.04]",56,"[4.94,4.93,4.92,4.91,4.9,4.89,4.88,4.87,4.86,4...","[85200,16800,7800,13100,9400,34200,10800,11400...",478900,948020,10246,25,4.94,15548082.2,"[2,38,12,15,27,5,9,2,1,3]",4684580.8,20190114151451000,"[12300,8700,1300,8500,600,18800,17300,900,100,...",1603999,20190114,151451000000,4.94,4.95,9.89
4725411,2272,3942,162,9429833.6,"[3400,2000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",1122900,308,5.235,1875640,3123280,"[5400,166300,45400,22100,58600,4800,31200,600,...",5.04,5.04,5.04,4.93,4.868,"[17,6,4,4,5,6,2,3,8,7]",458,318,"[4.95,4.96,4.97,4.98,4.99,5,5.01,5.02,5.03,5.04]",56,"[4.94,4.93,4.92,4.91,4.9,4.89,4.88,4.87,4.86,4...","[85200,16800,7800,13100,9400,34200,10800,11400...",478900,948020,10246,25,4.94,15548082.2,"[2,38,12,15,27,5,9,2,1,3]",4684580.8,20190114151451000,"[12300,8700,1300,8500,600,18800,17300,900,100,...",1603999,20190114,151451000000,4.94,4.95,9.89


In [5]:
s2[s2["ID"] == 1600353]

Unnamed: 0,ID,date,d_open,d_yclose,d_high,d_low,d_close,d_volume,d_amount,auction


In [16]:
test.columns

Index(['NumTrades', 'OfferTradeMaxDuration', 'ImageStatus', 'TotalBidNumber',
       'TotalWarrantExecQty', 'WithdrawSellMoney', 'IOPV', 'BidOrders',
       'ETFSellAmount', 'TotalOfferQty', 'WithdrawBuyNumber',
       'WeightedAvgOfferPx', 'ETFBuyNumber', 'WarLowerPx', 'MsgSeqNum',
       'WithdrawSellAmount', 'ETFSellMoney', 'Volume', 'BidOrderQty', 'OpenPx',
       'HighPx', 'PreClosePx', 'LowPx', 'WeightedAvgBidPx', 'ETFSellNumber',
       'OfferNumOrders', 'WithdrawSellNumber', 'ETFBuyAmount',
       'TotalOfferNumber', 'OfferPrice', 'NumOfferOrders', 'BidPrice',
       'OfferOrderQty', 'TotalBidQty', 'SendingTime', 'ETFBuyMoney',
       'InstrumentStatus', 'WithdrawBuyAmount', 'ClosePx',
       'BidTradeMaxDuration', 'NumBidOrders', 'LastPx', 'Amount', 'AveragePx',
       'WarUpperPx', 'YieldToMaturity', 'BidNumOrders', 'WithdrawBuyMoney',
       'TradingPhaseCode', 'QuotTime', 'OfferOrders'],
      dtype='object')

In [50]:
test = pd.read_csv(r"G:\KR\2019\SZ\20190114\snapshot\Level_2\000004.csv")
test["bid1p"] = test["BidPrice"].str[1:-1].apply(lambda x: float(x.split(',')[0]))
test["ask1p"] = test["OfferPrice"].str[1:-1].apply(lambda x: float(x.split(',')[0]))
test["p1"] = test["bid1p"] + test["ask1p"]
test[test['NumTrades'] >= 508].head(5)

Unnamed: 0,NumTrades,OfferNumOrders,LowerLimitPx,ImageStatus,OfferPrice,BidPrice,BidOrders,OfferOrderQty,PeRatio2,TotalBidQty,SendingTime,PeRatio1,TotalOfferQty,ClosePx,WeightedAvgPxChg,Change2,Change1,LastPx,WeightedAvgOfferPx,Amount,UpperLimitPx,AveragePx,TotalLongPosition,MsgSeqNum,Volume,BidNumOrders,BidOrderQty,TradingPhaseCode,QuotTime,OpenPx,OfferOrders,PreWeightedAvgPx,HighPx,PreClosePx,LowPx,WeightedAvgBidPx,bid1p,ask1p,p1
1131,508,"[0,0,0,0,0,0,0,0,0,0]",15.02,1,"[0,0,0,0,0,0,0,0,0,0]","[0,0,0,0,0,0,0,0,0,0]","[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...","[0,0,0,0,0,0,0,0,0,0]",0,0,20190114145700000,162.16,0,0.0,0,0.0,-0.15,16.54,0.0,3417304.28,18.36,16.61305,0,10191166,205700,"[0,0,0,0,0,0,0,0,0,0]","[0,0,0,0,0,0,0,0,0,0]",C0,20190114145700000,16.92,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",0,16.92,16.69,16.49,0.0,0.0,0.0,0.0
1132,508,"[0,0,0,0,0,0,0,0,0,0]",15.02,1,"[16.54,0,0,0,0,0,0,0,0,0]","[16.54,0,0,0,0,0,0,0,0,0]","[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...","[700,2800,0,0,0,0,0,0,0,0]",0,0,20190114145726000,162.16,0,0.0,0,0.0,-0.15,16.54,0.0,3417304.28,18.36,16.61305,0,10209680,205700,"[0,0,0,0,0,0,0,0,0,0]","[700,0,0,0,0,0,0,0,0,0]",C0,20190114145727000,16.92,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",0,16.92,16.69,16.49,0.0,16.54,16.54,33.08
1133,508,"[0,0,0,0,0,0,0,0,0,0]",15.02,1,"[16.54,0,0,0,0,0,0,0,0,0]","[16.54,0,0,0,0,0,0,0,0,0]","[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...","[1400,2100,0,0,0,0,0,0,0,0]",0,0,20190114145735000,162.16,0,0.0,0,0.0,-0.15,16.54,0.0,3417304.28,18.36,16.61305,0,10215532,205700,"[0,0,0,0,0,0,0,0,0,0]","[1400,0,0,0,0,0,0,0,0,0]",C0,20190114145736000,16.92,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",0,16.92,16.69,16.49,0.0,16.54,16.54,33.08
1134,508,"[0,0,0,0,0,0,0,0,0,0]",15.02,1,"[16.54,0,0,0,0,0,0,0,0,0]","[16.54,0,0,0,0,0,0,0,0,0]","[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...","[1400,2200,0,0,0,0,0,0,0,0]",0,0,20190114145744000,162.16,0,0.0,0,0.0,-0.15,16.54,0.0,3417304.28,18.36,16.61305,0,10221100,205700,"[0,0,0,0,0,0,0,0,0,0]","[1400,0,0,0,0,0,0,0,0,0]",C0,20190114145745000,16.92,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",0,16.92,16.69,16.49,0.0,16.54,16.54,33.08
1135,508,"[0,0,0,0,0,0,0,0,0,0]",15.02,1,"[16.54,0,0,0,0,0,0,0,0,0]","[16.54,0,0,0,0,0,0,0,0,0]","[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...","[1400,2200,0,0,0,0,0,0,0,0]",0,0,20190114145844000,162.16,0,0.0,0,0.0,-0.15,16.54,0.0,3417304.28,18.36,16.61305,0,10255420,205700,"[0,0,0,0,0,0,0,0,0,0]","[1400,0,0,0,0,0,0,0,0,0]",C0,20190114145845000,16.92,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...",0,16.92,16.69,16.49,0.0,16.54,16.54,33.08


In [7]:
SH.dtypes

skey                                 int32
date                                 int32
time                                 int64
clockAtArrival                       int64
datetime                    datetime64[ns]
ordering                             int32
has_missing                          int32
cum_trades_cnt                       int32
cum_volume                           int64
cum_amount                         float64
prev_close                         float64
open                               float64
high                               float64
low                                float64
close                              float64
bid10p                             float64
bid9p                              float64
bid8p                              float64
bid7p                              float64
bid6p                              float64
bid5p                              float64
bid4p                              float64
bid3p                              float64
bid2p      

In [6]:
for i in ["ID", "date", "d_open", "d_yclose","d_high", "d_low", "d_close", "d_volume"]:
    print(i)
    print(re[re["d_amount_x"].isnull()][i].values[0])

ID
1601199
date
20200106
d_open
3.82
d_yclose
3.81
d_high
3.84
d_low
3.79
d_close
3.82
d_volume
2797484.0


In [7]:
for i in ["ID", "date", "d_open", "d_yclose","d_high", "d_low", "d_close", "d_volume"]:
    print(i)
    print(re[re["d_amount_y"].isnull()][i].values[0])

ID
1601199
date
20200106
d_open
3.82
d_yclose
3.81
d_high
3.84
d_low
3.79
d_close
3.81
d_volume
2797484.0


In [12]:
import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime
import time
pd.set_option("max_columns", 200)

year = "2019"
startDate = '20190311'
endDate = '20190311'
readPath = 'G:\\KR\\' + year + '\\SH\\***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

for data in dataPathLs:
    
    readPath = data + '\\snapshot\\Level_2\\***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[((dateLs >= 600000) & (dateLs <= 700000))]
    SH = []
    ll = []
    startTm = datetime.datetime.now()
    for i in dataPathLs[:500]:
        try:
            df = pd.read_csv(i, usecols = [0,1,3,5,7,9,10,11,15,17,18,19,20,21,22,23,25,26,28,29,30,31,32,33,37,39,40,41,
                                          42,46,47,49,50])
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["StockID"] = int(os.path.basename(i).split('.')[0])
        SH += [df]
    del df
    SH = pd.concat(SH).reset_index(drop=True)
    print(datetime.datetime.now() - startTm)
    
    startTm = datetime.datetime.now()
    SH["skey"] = SH["StockID"] + 1000000
    SH.drop(["StockID"],axis=1,inplace=True)
    SH["date"] = int(SH["QuotTime"].iloc[0]//1000000000)
    SH["time"] = (SH['QuotTime'] - int(SH['QuotTime'].iloc[0]//1000000000*1000000000)).astype(np.int64) * 1000
    SH["clockAtArrival"] = SH["QuotTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH.drop(["QuotTime"],axis=1,inplace=True)
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    print(datetime.datetime.now() - startTm)

    startTm = datetime.datetime.now()
    SH["BidPrice"] = SH["BidPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["OfferPrice"] = SH["OfferPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["BidOrderQty"] = SH["BidOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrderQty"] = SH["OfferOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["BidNumOrders"] = SH["BidNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferNumOrders"] = SH["OfferNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 11):
        SH["bid" + str(i) + 'p'] = SH["BidPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["BidPrice"],axis=1,inplace=True)
    print("1")
    for i in range(1, 11):
        SH["ask" + str(i) + 'p'] = SH["OfferPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["OfferPrice"],axis=1,inplace=True)
    print("2")
    for i in range(1, 11):
        SH["bid" + str(i) + 'q'] = SH["BidOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["BidOrderQty"],axis=1,inplace=True)
    print("3")
    for i in range(1, 11):
        SH["ask" + str(i) + 'q'] = SH["OfferOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["OfferOrderQty"],axis=1,inplace=True)
    print("4")
    for i in range(1, 11):
        SH["bid" + str(i) + 'n'] = SH["BidNumOrders"].apply(lambda x: x[i-1])
        SH["bid" + str(i) + 'n'] = SH["bid" + str(i) + 'n'].astype('int32')
    SH.drop(["BidNumOrders"],axis=1,inplace=True)
    print("5")
    for i in range(1, 11):
        SH["ask" + str(i) + 'n'] = SH["OfferNumOrders"].apply(lambda x: x[i-1])
        SH["ask" + str(i) + 'n'] = SH["ask" + str(i) + 'n'].astype('int32') 
    SH.drop(["OfferNumOrders"],axis=1,inplace=True)
    print("6")
    
    SH["BidOrders"] = SH["BidOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrders"] = SH["OfferOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 51):
        SH["bid1Top" + str(i) + 'q'] = SH["BidOrders"].apply(lambda x: x[i-1])
        SH["bid1Top" + str(i) + 'q'] = SH["bid1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["BidOrders"],axis=1,inplace=True)
    print("7")
    
    for i in range(1, 51):
        SH["ask1Top" + str(i) + 'q'] = SH["OfferOrders"].apply(lambda x: x[i-1])
        SH["ask1Top" + str(i) + 'q'] = SH["ask1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["OfferOrders"],axis=1,inplace=True)
    print("8")
    print(datetime.datetime.now() - startTm)
    
    
    startTm = datetime.datetime.now()

    SH.columns = ['cum_trades_cnt', 'ask_trade_max_duration', 'total_bid_orders',
       'cum_canceled_sell_amount', 'total_ask_quantity', 'cum_canceled_buy_orders',
       'total_ask_vwap', 'cum_canceled_sell_volume', 'cum_volume', 'open',
       'high', 'prev_close', 'low', 'total_bid_vwap',
       'cum_canceled_sell_orders', 'total_ask_orders', 'total_ask_levels',
       'total_bid_quantity', 'cum_canceled_buy_volume', 'bid_trade_max_duration',
       'total_bid_levels', 'close', 'cum_amount', 'cum_canceled_buy_amount', 'skey', 'date', 'time', 'clockAtArrival',
       'datetime', 'bid1p', 'bid2p', 'bid3p', 'bid4p', 'bid5p', 'bid6p',
       'bid7p', 'bid8p', 'bid9p', 'bid10p', 'ask1p', 'ask2p', 'ask3p',
       'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p',
       'bid1q', 'bid2q', 'bid3q', 'bid4q', 'bid5q', 'bid6q', 'bid7q',
       'bid8q', 'bid9q', 'bid10q', 'ask1q', 'ask2q', 'ask3q', 'ask4q',
       'ask5q', 'ask6q', 'ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid1n',
       'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n',
       'bid9n', 'bid10n', 'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n',
       'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'bid1Top1q',
       'bid1Top2q', 'bid1Top3q', 'bid1Top4q', 'bid1Top5q', 'bid1Top6q',
       'bid1Top7q', 'bid1Top8q', 'bid1Top9q', 'bid1Top10q', 'bid1Top11q',
       'bid1Top12q', 'bid1Top13q', 'bid1Top14q', 'bid1Top15q',
       'bid1Top16q', 'bid1Top17q', 'bid1Top18q', 'bid1Top19q',
       'bid1Top20q', 'bid1Top21q', 'bid1Top22q', 'bid1Top23q',
       'bid1Top24q', 'bid1Top25q', 'bid1Top26q', 'bid1Top27q',
       'bid1Top28q', 'bid1Top29q', 'bid1Top30q', 'bid1Top31q',
       'bid1Top32q', 'bid1Top33q', 'bid1Top34q', 'bid1Top35q',
       'bid1Top36q', 'bid1Top37q', 'bid1Top38q', 'bid1Top39q',
       'bid1Top40q', 'bid1Top41q', 'bid1Top42q', 'bid1Top43q',
       'bid1Top44q', 'bid1Top45q', 'bid1Top46q', 'bid1Top47q',
       'bid1Top48q', 'bid1Top49q', 'bid1Top50q', 'ask1Top1q', 'ask1Top2q',
       'ask1Top3q', 'ask1Top4q', 'ask1Top5q', 'ask1Top6q', 'ask1Top7q',
       'ask1Top8q', 'ask1Top9q', 'ask1Top10q', 'ask1Top11q', 'ask1Top12q',
       'ask1Top13q', 'ask1Top14q', 'ask1Top15q', 'ask1Top16q',
       'ask1Top17q', 'ask1Top18q', 'ask1Top19q', 'ask1Top20q',
       'ask1Top21q', 'ask1Top22q', 'ask1Top23q', 'ask1Top24q',
       'ask1Top25q', 'ask1Top26q', 'ask1Top27q', 'ask1Top28q',
       'ask1Top29q', 'ask1Top30q', 'ask1Top31q', 'ask1Top32q',
       'ask1Top33q', 'ask1Top34q', 'ask1Top35q', 'ask1Top36q',
       'ask1Top37q', 'ask1Top38q', 'ask1Top39q', 'ask1Top40q',
       'ask1Top41q', 'ask1Top42q', 'ask1Top43q', 'ask1Top44q',
       'ask1Top45q', 'ask1Top46q', 'ask1Top47q', 'ask1Top48q',
       'ask1Top49q', 'ask1Top50q']
    SH = SH.fillna(0)
    SH = SH[~((SH["bid1p"] == 0) & (SH["ask1p"] == 0))]
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    
    SH["has_missing"] = 0

0:00:36.628087
0:00:25.139736
1
2
3
4
5
6
7
8
0:03:28.369023


In [17]:
    startTm = datetime.datetime.now()
    for cols in ["prev_close", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
             'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p','cum_amount', "cum_canceled_sell_amount", "cum_canceled_buy_amount"]:
#         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
        SH[cols] = SH[cols].round(2)
    print(datetime.datetime.now() - startTm)


0:00:00.398911


In [16]:
    startTm = datetime.datetime.now()
    for cols in ["prev_close", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
             'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p','cum_amount', "cum_canceled_sell_amount", "cum_canceled_buy_amount"]:
#         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
        SH[cols] = SH[cols].astype('float64')
    print(datetime.datetime.now() - startTm)

0:00:00.251327


In [15]:
SH["WeightedAvgBidPx1"] = SH["WeightedAvgBidPx"].round(3)
display(SH[SH["WeightedAvgBidPx1"] != SH["WeightedAvgBidPx"]][["WeightedAvgBidPx1", "WeightedAvgBidPx"]].iloc[0])

WeightedAvgBidPx1    40.373
WeightedAvgBidPx     40.373
Name: 74, dtype: float64

In [9]:
SH["WeightedAvgBidPx"][SH["WeightedAvgBidPx"].round(3) != SH["WeightedAvgBidPx"]].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique()

array([15, 16, 14], dtype=int64)

In [3]:
SH[SH["time"] > 93000000000].head(5)

Unnamed: 0,skey,date,time,clockAtArrival,datetime,ordering,has_missing,cum_trades_cnt,cum_volume,cum_amount,prev_close,open,high,low,close,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,bid1Top1q,bid1Top2q,bid1Top3q,bid1Top4q,bid1Top5q,bid1Top6q,bid1Top7q,bid1Top8q,bid1Top9q,bid1Top10q,bid1Top11q,bid1Top12q,bid1Top13q,bid1Top14q,bid1Top15q,bid1Top16q,bid1Top17q,bid1Top18q,bid1Top19q,bid1Top20q,bid1Top21q,bid1Top22q,bid1Top23q,bid1Top24q,bid1Top25q,bid1Top26q,bid1Top27q,bid1Top28q,bid1Top29q,bid1Top30q,bid1Top31q,bid1Top32q,bid1Top33q,bid1Top34q,bid1Top35q,bid1Top36q,bid1Top37q,bid1Top38q,bid1Top39q,bid1Top40q,bid1Top41q,bid1Top42q,bid1Top43q,bid1Top44q,bid1Top45q,bid1Top46q,bid1Top47q,bid1Top48q,bid1Top49q,bid1Top50q,ask1Top1q,ask1Top2q,ask1Top3q,ask1Top4q,ask1Top5q,ask1Top6q,ask1Top7q,ask1Top8q,ask1Top9q,ask1Top10q,ask1Top11q,ask1Top12q,ask1Top13q,ask1Top14q,ask1Top15q,ask1Top16q,ask1Top17q,ask1Top18q,ask1Top19q,ask1Top20q,ask1Top21q,ask1Top22q,ask1Top23q,ask1Top24q,ask1Top25q,ask1Top26q,ask1Top27q,ask1Top28q,ask1Top29q,ask1Top30q,ask1Top31q,ask1Top32q,ask1Top33q,ask1Top34q,ask1Top35q,ask1Top36q,ask1Top37q,ask1Top38q,ask1Top39q,ask1Top40q,ask1Top41q,ask1Top42q,ask1Top43q,ask1Top44q,ask1Top45q,ask1Top46q,ask1Top47q,ask1Top48q,ask1Top49q,ask1Top50q,total_bid_quantity,total_ask_quantity,total_bid_vwap,total_ask_vwap,total_bid_orders,total_ask_orders,total_bid_levels,total_ask_levels,bid_trade_max_duration,ask_trade_max_duration,cum_canceled_buy_orders,cum_canceled_buy_volume,cum_canceled_buy_amount,cum_canceled_sell_orders,cum_canceled_sell_volume,cum_canceled_sell_amount
35,1603713,20200103,93003000000,1578015003000000,2020-01-03 09:30:03,36,0,17,3000,123884.0,41.02,41.3,41.3,41.23,41.23,40.86,40.9,40.91,40.92,41.0,41.02,41.03,41.05,41.06,41.15,41.23,41.28,41.29,41.3,41.32,41.33,41.34,41.35,41.38,41.5,600,6200,5700,600,3600,400,200,700,1600,300,100,600,200,500,600,1000,1500,5000,600,1400,1,2,2,1,4,1,1,2,3,1,1,2,1,2,1,1,1,1,3,3,300,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,112300,60200,40.558,42.673,47,65,38,44,1,0,0,0,0.0,5,16600,632450.0
36,1603713,20200103,93006000000,1578015006000000,2020-01-03 09:30:06,37,0,23,4200,173358.0,41.02,41.3,41.3,41.15,41.15,40.86,40.9,40.91,40.92,41.0,41.02,41.03,41.05,41.06,41.15,41.25,41.28,41.29,41.3,41.32,41.33,41.34,41.35,41.38,41.5,600,6200,5700,600,3900,400,200,700,1600,100,100,600,4100,600,600,1000,1500,5000,600,1400,1,2,2,1,5,1,1,2,3,1,1,2,7,3,1,1,1,1,3,3,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,113600,66000,40.555,42.597,53,77,39,44,1,3,0,0,0.0,5,16600,632450.0
37,1603713,20200103,93009000000,1578015009000000,2020-01-03 09:30:09,38,0,24,4300,177473.0,41.02,41.3,41.3,41.15,41.15,40.82,40.83,40.9,40.91,40.92,41.0,41.02,41.03,41.05,41.06,41.25,41.28,41.29,41.3,41.32,41.33,41.34,41.35,41.38,41.5,100,7000,6200,5700,600,3900,400,200,700,1600,100,600,4100,1100,600,1000,1500,5000,600,1400,1,1,2,2,1,5,1,1,2,3,1,2,7,4,1,1,1,1,3,3,1000,300,300,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,113700,66500,40.553,42.587,52,78,37,44,5,3,1,600,24516.0,5,16600,632450.0
38,1603713,20200103,93012000000,1578015012000000,2020-01-03 09:30:12,39,0,24,4300,177473.0,41.02,41.3,41.3,41.15,41.15,40.82,40.83,40.9,40.91,40.92,41.0,41.02,41.03,41.05,41.06,41.25,41.28,41.29,41.3,41.32,41.33,41.34,41.35,41.38,41.5,100,7000,6200,5700,600,3900,400,200,700,1600,300,600,4100,1100,600,1000,1500,5000,600,1400,1,1,2,2,1,5,1,1,2,3,2,2,7,4,1,1,1,1,3,3,1000,300,300,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,113700,66700,40.553,42.583,52,79,37,44,5,3,1,600,24516.0,5,16600,632450.0
39,1603713,20200103,93014000000,1578015014000000,2020-01-03 09:30:14,40,0,24,4300,177473.0,41.02,41.3,41.3,41.15,41.15,40.82,40.83,40.9,40.91,40.92,41.0,41.02,41.03,41.05,41.06,41.25,41.28,41.29,41.3,41.32,41.33,41.34,41.35,41.38,41.5,100,7000,6200,5700,600,3900,400,200,700,1600,300,600,4100,1100,600,1000,1500,5000,600,1400,1,1,2,2,1,5,1,1,2,3,2,2,7,4,1,1,1,1,3,3,1000,300,300,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,113700,67300,40.553,42.59,52,80,37,45,5,3,1,600,24516.0,5,16600,632450.0


In [11]:
readPath = r'\\192.168.10.30\Kevin_zhenyu\day_stock_20200424\***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SH' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])

  This is separate from the ipykernel package so we can avoid doing imports until


In [62]:
# check 1
startTm = datetime.datetime.now()
da_te = str(SH["date"].iloc[0]) 
da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
db1 = db[db["date"] == da_te]
db1["ID"] = db1["ID"].str[2:].astype(int) + 1000000
db1["date"] = (db1["date"].str[:4] + db1["date"].str[5:7] + db1["date"].str[8:]).astype(int)
SH["cum_max"] = SH.groupby("skey")["cum_volume"].transform(max)
s2 = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey").first().reset_index()
SH.drop("cum_max", axis=1, inplace=True)
s2 = s2.rename(columns={"skey": "ID", 'open':"d_open", "prev_close":"d_yclose","high":"d_high", "low":"d_low", "close":"d_close", "cum_volume":"d_volume", "cum_amount":"d_amount"})
s2 = s2[["ID", "date", "d_open", "d_yclose", "d_high", "d_low", "d_close", "d_volume", "d_amount"]]
re = pd.merge(db1, s2, on=["ID", "date", "d_open", "d_yclose","d_high", "d_low", "d_close", "d_volume"], how="outer")
assert((sum(re["d_amount_y"].isnull()) == 0) & (sum(re["d_amount_x"].isnull()) == 0))
print(datetime.datetime.now() - startTm)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


0:00:00.925523


In [59]:
# check 2
# first part
startTm = datetime.datetime.now()
date = pd.DataFrame(pd.date_range(start='2019-06-10 08:30:00', end='2019-06-10 18:00:00', freq='s'), columns=["Orig"])
date["time"] = date["Orig"].apply(lambda x: int(x.strftime("%H%M%S"))*1000)
date["group"] = date["time"]//30000
SH["group"] = SH["time"]//30000000
gl = date[((date["time"] >= 93000000) & (date["time"] <= 113000000))|((date["time"] >= 130000000) & (date["time"] <= 150000000))]["group"].unique()
l = set(gl) - set(SH["group"].unique())
SH["has_missing1"] = 0 
if len(l) != 0:
    print("massive missing")
    print(l)
    SH["order"] = SH.groupby(["skey", "time"]).cumcount()
    for i in l:
        SH["t"] = SH[SH["group"] > i].groupby("StockID")["time"].transform("min")
        SH["has_missing1"] = np.where((SH["time"] == SH["t"]) & (SH["order"] == 0), 1, 0)
    SH.drop(["order", "t", "group"], axis=1, inplace=True)   
else:
    print("no massive missing")
    SH.drop(["order", "t", "group"], axis=1, inplace=True)
print(datetime.datetime.now() - startTm)



# second part
startTm = datetime.datetime.now()

SH["time_interval"] = SH.groupby("skey")["datetime"].apply(lambda x: x - x.shift(1))
SH["time_interval"] = SH["time_interval"].apply(lambda x: x.seconds)
SH["tn_update"] = SH.groupby("skey")["cum_trades_cnt"].apply(lambda x: x-x.shift(1))

f1 = SH[(SH["time"] >= 93000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
f1 = f1.rename(columns={"time": "time1"})
f2 = SH[(SH["time"] >= 130000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
f2 = f2.rename(columns={"time": "time2"})
f3 = SH[(SH["time"] >= 150000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
f3 = f3.rename(columns={"time": "time3"})
SH = pd.merge(SH, f1, on="skey", how="left")
del f1
SH = pd.merge(SH, f2, on="skey", how="left")
del f2
SH = pd.merge(SH, f3, on="skey", how="left")
del f3
p95 = SH[(SH["time"] > 93000000000) & (SH["time"] < 145700000000) & (SH["time"] != SH["time2"]) & (SH["tn_update"] != 0)]\
.groupby("skey")["tn_update"].apply(lambda x: x.describe([0.95])["95%"]).reset_index()
p95 = p95.rename(columns={"tn_update":"95%"})
SH = pd.merge(SH, p95, on="skey", how="left")

SH["has_missing2"] = 0
SH["has_missing2"] = np.where((SH["time_interval"] > 60) & (SH["tn_update"] > SH["95%"]) & 
         (SH["time"] > SH["time1"]) & (SH["time"] != SH["time2"]) & (SH["time"] != SH["time3"]), 1, 0)
SH.drop(["time_interval", "tn_update", "time1", "time2", "time3", "95%"], axis=1, inplace=True) 

SH["has_missing"] = np.where((SH["has_missing1"] == 1) | (SH["has_missing2"] == 1), 1, 0)
SH.drop(["has_missing1", "has_missing2"], axis=1, inplace=True) 
print(datetime.datetime.now() - startTm)

no massive missing
0:00:00.367018
0:01:51.810912


In [61]:
pd.set_option("max_rows", 200)
SH.dtypes

skey                                 int32
date                                 int32
time                                 int64
clockAtArrival                       int64
datetime                    datetime64[ns]
ordering                             int32
has_missing                          int32
cum_trades_cnt                       int32
cum_volume                           int64
cum_amount                         float64
prev_close                         float64
open                               float64
high                               float64
low                                float64
close                              float64
bid10p                             float64
bid9p                              float64
bid8p                              float64
bid7p                              float64
bid6p                              float64
bid5p                              float64
bid4p                              float64
bid3p                              float64
bid2p      

In [13]:
pd.set_option("max_rows", 200)
SH.columns.values

array(['cum_trades_cnt', 'ask_trade_max_duration', 'total_bid_orders',
       'cum_canceled_sell_amount', 'total_ask_quantity',
       'cum_canceled_buy_orders', 'total_ask_vwap',
       'cum_canceled_sell_volume', 'cum_volume', 'open', 'high',
       'prev_close', 'low', 'total_bid_vwap', 'cum_canceled_sell_orders',
       'total_ask_orders', 'total_ask_levels', 'total_bid_quantity',
       'cum_canceled_buy_volume', 'bid_trade_max_duration',
       'total_bid_levels', 'close', 'cum_amount', 'AveragePx',
       'cum_canceled_buy_amount', 'skey', 'date', 'time',
       'clockAtArrival', 'datetime', 'bid1p', 'bid2p', 'bid3p', 'bid4p',
       'bid5p', 'bid6p', 'bid7p', 'bid8p', 'bid9p', 'bid10p', 'ask1p',
       'ask2p', 'ask3p', 'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p',
       'ask9p', 'ask10p', 'bid1q', 'bid2q', 'bid3q', 'bid4q', 'bid5q',
       'bid6q', 'bid7q', 'bid8q', 'bid9q', 'bid10q', 'ask1q', 'ask2q',
       'ask3q', 'ask4q', 'ask5q', 'ask6q', 'ask7q', 'ask8q', 'ask9q',
     

In [None]:
SH = SH.rename(columns={"NumTrades":"cum_trades_cnt", "HighPx":"high", "OpenPx":"open", "PreClosePx":"prev_close", "LowPx":"low",
                            "WeightedAvgBidPx":"total_bid_vwap", "WithdrawSellNumber":"cum_canceled_sell_orders", "TotalOfferNumber":"total_ask_orders",
                           "OfferTradeMaxDuration":"ask_trade_max_duration", "TotalBidNumber":"total_bid_orders", "WithdrawSellMoney":"cum_canceled_sell_amount",
                           "TotalOfferQty":"total_ask_quantity", "WithdrawBuyNumber":"cum_canceled_buy_orders", "WeightedAvgOfferPx":"total_ask_vwap",
                           "WithdrawSellAmount":"cum_canceled_sell_volume", "Volume":"cum_volume", "NumOfferOrders":"total_ask_levels", "TotalBidQty":"total_bid_quantity",
                           "WithdrawBuyAmount":"cum_canceled_buy_volume", "LastPx":"close", "BidTradeMaxDuration":"bid_trade_max_duration", 
                           "NumBidOrders":"total_bid_levels", "Amount":"cum_amount", "WithdrawBuyMoney":"cum_canceled_buy_amount"})

In [2]:
for cols in ["prev_close", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
             'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p']:
    print(cols)
    print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())

prev_close
[2 1]
open
[1 2]
high
[1 2]
low
[1 2]
close
[1 2]
bid10p
[1 2]
bid9p
[1 2]
bid8p
[1 2]
bid7p
[1 2]
bid6p
[1 2]
bid5p
[1 2]
bid4p
[1 2]
bid3p
[1 2]
bid2p
[1 2]
bid1p
[2 1]
ask1p
[2 1]
ask2p
[1 2]
ask3p
[1 2]
ask4p
[1 2]
ask5p
[1 2]
ask6p
[1 2]
ask7p
[1 2]
ask8p
[1 2]
ask9p
[1 2]
ask10p
[1 2]


In [5]:
for cols in ['cum_amount', "cum_canceled_sell_amount", "cum_canceled_buy_amount"]:
    print(cols)
    print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())

cum_amount
[1 2]
cum_canceled_sell_amount
[1 2]
cum_canceled_buy_amount
[1 2]


In [6]:
for cols in ['total_bid_vwap', "total_ask_vwap"]:
    print(cols)
    print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())

total_bid_vwap
[ 1 15  3  2 16 14 17 13]
total_ask_vwap
[ 1  3 15  2 14 16 17 13]


In [8]:
SH['total_bid_vwap'][SH['total_bid_vwap'].astype(str).apply(lambda x: len(str(x.split('.')[1]))) == 13].round(3)

6084036    1093.216
6084046    1094.331
6084047    1094.592
6084052    1094.314
6084054    1094.341
             ...   
6088849    1056.689
6088853    1057.791
6088854    1057.717
6088859    1058.206
6088866    1057.936
Name: total_bid_vwap, Length: 683, dtype: float64

In [2]:
import pandas as pd
import random
import numpy as np
import glob
import os
from unrar import rarfile
import py7zr
import pickle
import datetime
import time
pd.set_option("max_columns", 200)




year = "2020"
startDate = '20200103'
endDate = '20200103'
readPath = 'L:\\DB\\' + year + '\\SZ\\***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

for data in dataPathLs:    
    
    readPath = data + '\\snapshot\\Level_2\\***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs < 4000) | ((dateLs > 300000) & (dateLs < 310000))]
    SZ = []
    ll = []

    for i in dataPathLs:
        try:
            df = pd.read_csv(i, usecols = [0,1,4,5,6,7,9,12,17,18,19,24,25,26,28,29,30,32,33,34,35])
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["StockID"] = int(os.path.basename(i).split('.')[0])
        SZ += [df]
    del df
    SZ = pd.concat(SZ).reset_index(drop=True)

    
    startTm = datetime.datetime.now()

    BidPrice = np.array([i[1:-1].split(',') for i in SZ['BidPrice'].values])
    for i in range(1, 11):
        SZ["bid%sp"%i] = BidPrice[:, i-1]
    del BidPrice
    print("1")
    
    OfferPrice = np.array([i[1:-1].split(',') for i in SZ['OfferPrice'].values])
    for i in range(1, 11):
        SZ["ask%sp"%i] = OfferPrice[:, i-1]
    del OfferPrice
    print("2")
    
    BidOrderQty = np.array([i[1:-1].split(',') for i in SZ['BidOrderQty'].values])
    for i in range(1, 11):
        SZ["bid%sq"%i] = BidOrderQty[:, i-1]
    del BidOrderQty
    print("3")
    
    OfferOrderQty = np.array([i[1:-1].split(',') for i in SZ['OfferOrderQty'].values])
    for i in range(1, 11):
        SZ["ask%sq"%i] = OfferOrderQty[:, i-1]
    del OfferOrderQty
    print("4")
    
    BidNumOrders = np.array([i[1:-1].split(',') for i in SZ['BidNumOrders'].values]).astype(np.int32)
    for i in range(1, 11):
        SZ["bid%sn"%i] = BidNumOrders[:, i-1]
    del BidNumOrders
    print("5")
    
    OfferNumOrders = np.array([i[1:-1].split(',') for i in SZ['OfferNumOrders'].values]).astype(np.int32)
    for i in range(1, 11):
        SZ["ask%sn"%i] = OfferNumOrders[:, i-1]
    del OfferNumOrders
    print("6")
    
    bidOrders = np.array([i[1:-1].split(',') for i in SZ['BidOrders'].values]).astype(np.int32)
    for i in range(1, 51):
        SZ["bid1Top%sq"%i] = bidOrders[:, i-1]
    del bidOrders
    print("7")
    
    offerOrders = np.array([i[1:-1].split(',') for i in SZ['OfferOrders'].values]).astype(np.int32)
    for i in range(1, 51):
        SZ["ask1Top%sq"%i] = offerOrders[:, i-1]
    del offerOrders
    print("8")
    print(datetime.datetime.now() - startTm)

1
2
3
4
5
6
7
8
0:09:03.525120
