In [None]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()











import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = '/home/work516/day_stock/***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SH' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

year = "2019"
startDate = '20190101'
endDate = '20191231'
readPath = '/mnt/usb/data/' + year + '/***/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i).split('_')[0] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
date_list = pd.read_csv("/home/work516/KR_upload_code/trading_days.csv")
wr_ong = []
mi_ss = []
less = []

for data in dataPathLs:
    if len(np.array(glob.glob(data + '/SH/***'))) == 0:
        if int(os.path.basename(data)) not in date_list["Date"].values:
            continue
        else:
            print(os.path.basename(data) + " less data!!!!!!!!!!!!!!!!!")
            less.append(data)
            continue
    startTm = datetime.datetime.now()
    date = os.path.basename(data)
    rar_path = data + '/SH/snapshot.7z'
    path = '/mnt/e/unzip_data/2019/SH'
    path1 = path + '/' + date
    un_path = path1
    cmd = '7za x {} -o{}'.format(rar_path, un_path)
    os.system(cmd)
    print(datetime.datetime.now() - startTm)
    print(date + ' unzip finished')

    readPath = path1 + '/snapshot/***2/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[((dateLs >= 600000) & (dateLs <= 700000))]
    SH = []
    ll = []
    startTm = datetime.datetime.now()
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, usecols = [0,1,3,5,7,9,10,11,15,17,18,19,20,21,22,23,25,26,28,29,30,31,32,33,37,39,40,41,
                                          42,46,47,49,50])
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["StockID"] = int(os.path.basename(i).split('.')[0])
        SH += [df]
    del df
    SH = pd.concat(SH).reset_index(drop=True)
    print(datetime.datetime.now() - startTm)
    
    startTm = datetime.datetime.now()
    SH["skey"] = SH["StockID"] + 1000000
    SH.drop(["StockID"],axis=1,inplace=True)
    SH["date"] = int(SH["QuotTime"].iloc[0]//1000000000)
    SH["time"] = (SH['QuotTime'] - int(SH['QuotTime'].iloc[0]//1000000000*1000000000)).astype(np.int64) * 1000
    SH["clockAtArrival"] = SH["QuotTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH.drop(["QuotTime"],axis=1,inplace=True)
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    print(datetime.datetime.now() - startTm)

    startTm = datetime.datetime.now()
    SH["BidPrice"] = SH["BidPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["OfferPrice"] = SH["OfferPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["BidOrderQty"] = SH["BidOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrderQty"] = SH["OfferOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["BidNumOrders"] = SH["BidNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferNumOrders"] = SH["OfferNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 11):
        SH["bid" + str(i) + 'p'] = SH["BidPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["BidPrice"],axis=1,inplace=True)
    print("1")
    for i in range(1, 11):
        SH["ask" + str(i) + 'p'] = SH["OfferPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["OfferPrice"],axis=1,inplace=True)
    print("2")
    for i in range(1, 11):
        SH["bid" + str(i) + 'q'] = SH["BidOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["BidOrderQty"],axis=1,inplace=True)
    print("3")
    for i in range(1, 11):
        SH["ask" + str(i) + 'q'] = SH["OfferOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["OfferOrderQty"],axis=1,inplace=True)
    print("4")
    for i in range(1, 11):
        SH["bid" + str(i) + 'n'] = SH["BidNumOrders"].apply(lambda x: x[i-1])
        SH["bid" + str(i) + 'n'] = SH["bid" + str(i) + 'n'].astype('int32')
    SH.drop(["BidNumOrders"],axis=1,inplace=True)
    print("5")
    for i in range(1, 11):
        SH["ask" + str(i) + 'n'] = SH["OfferNumOrders"].apply(lambda x: x[i-1])
        SH["ask" + str(i) + 'n'] = SH["ask" + str(i) + 'n'].astype('int32') 
    SH.drop(["OfferNumOrders"],axis=1,inplace=True)
    print("6")
    
    SH["BidOrders"] = SH["BidOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrders"] = SH["OfferOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 51):
        SH["bid1Top" + str(i) + 'q'] = SH["BidOrders"].apply(lambda x: x[i-1])
        SH["bid1Top" + str(i) + 'q'] = SH["bid1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["BidOrders"],axis=1,inplace=True)
    print("7")
    
    for i in range(1, 51):
        SH["ask1Top" + str(i) + 'q'] = SH["OfferOrders"].apply(lambda x: x[i-1])
        SH["ask1Top" + str(i) + 'q'] = SH["ask1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["OfferOrders"],axis=1,inplace=True)
    print("8")
    print(datetime.datetime.now() - startTm)
    
    
    startTm = datetime.datetime.now()
    SH.columns = ['cum_trades_cnt', 'ask_trade_max_duration', 'total_bid_orders',
       'cum_canceled_sell_amount', 'total_ask_quantity', 'cum_canceled_buy_orders',
       'total_ask_vwap', 'cum_canceled_sell_volume', 'cum_volume', 'open',
       'high', 'prev_close', 'low', 'total_bid_vwap',
       'cum_canceled_sell_orders', 'total_ask_orders', 'total_ask_levels',
       'total_bid_quantity', 'cum_canceled_buy_volume', 'bid_trade_max_duration',
       'total_bid_levels', 'close', 'cum_amount', 'cum_canceled_buy_amount', 'skey', 'date', 'time', 'clockAtArrival',
       'datetime', 'bid1p', 'bid2p', 'bid3p', 'bid4p', 'bid5p', 'bid6p',
       'bid7p', 'bid8p', 'bid9p', 'bid10p', 'ask1p', 'ask2p', 'ask3p',
       'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p',
       'bid1q', 'bid2q', 'bid3q', 'bid4q', 'bid5q', 'bid6q', 'bid7q',
       'bid8q', 'bid9q', 'bid10q', 'ask1q', 'ask2q', 'ask3q', 'ask4q',
       'ask5q', 'ask6q', 'ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid1n',
       'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n',
       'bid9n', 'bid10n', 'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n',
       'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'bid1Top1q',
       'bid1Top2q', 'bid1Top3q', 'bid1Top4q', 'bid1Top5q', 'bid1Top6q',
       'bid1Top7q', 'bid1Top8q', 'bid1Top9q', 'bid1Top10q', 'bid1Top11q',
       'bid1Top12q', 'bid1Top13q', 'bid1Top14q', 'bid1Top15q',
       'bid1Top16q', 'bid1Top17q', 'bid1Top18q', 'bid1Top19q',
       'bid1Top20q', 'bid1Top21q', 'bid1Top22q', 'bid1Top23q',
       'bid1Top24q', 'bid1Top25q', 'bid1Top26q', 'bid1Top27q',
       'bid1Top28q', 'bid1Top29q', 'bid1Top30q', 'bid1Top31q',
       'bid1Top32q', 'bid1Top33q', 'bid1Top34q', 'bid1Top35q',
       'bid1Top36q', 'bid1Top37q', 'bid1Top38q', 'bid1Top39q',
       'bid1Top40q', 'bid1Top41q', 'bid1Top42q', 'bid1Top43q',
       'bid1Top44q', 'bid1Top45q', 'bid1Top46q', 'bid1Top47q',
       'bid1Top48q', 'bid1Top49q', 'bid1Top50q', 'ask1Top1q', 'ask1Top2q',
       'ask1Top3q', 'ask1Top4q', 'ask1Top5q', 'ask1Top6q', 'ask1Top7q',
       'ask1Top8q', 'ask1Top9q', 'ask1Top10q', 'ask1Top11q', 'ask1Top12q',
       'ask1Top13q', 'ask1Top14q', 'ask1Top15q', 'ask1Top16q',
       'ask1Top17q', 'ask1Top18q', 'ask1Top19q', 'ask1Top20q',
       'ask1Top21q', 'ask1Top22q', 'ask1Top23q', 'ask1Top24q',
       'ask1Top25q', 'ask1Top26q', 'ask1Top27q', 'ask1Top28q',
       'ask1Top29q', 'ask1Top30q', 'ask1Top31q', 'ask1Top32q',
       'ask1Top33q', 'ask1Top34q', 'ask1Top35q', 'ask1Top36q',
       'ask1Top37q', 'ask1Top38q', 'ask1Top39q', 'ask1Top40q',
       'ask1Top41q', 'ask1Top42q', 'ask1Top43q', 'ask1Top44q',
       'ask1Top45q', 'ask1Top46q', 'ask1Top47q', 'ask1Top48q',
       'ask1Top49q', 'ask1Top50q']
    SH = SH.fillna(0)
#     SH["p1"] = SH["bid1p"] + SH["ask1p"]
#     tt = SH[(SH["cum_volume"] > 0) & (SH["time"] < 145700000000)].groupby("skey")['p1'].min()
#     SH.drop("p1", axis=1, inplace=True)
#     try:
#         assert(tt[tt == 0].shape[0] == 0)
#     except:
#         display(tt[tt == 0])
#     SH = SH[~((SH["bid1p"] == 0) & (SH["ask1p"] == 0))]
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    
    SH["has_missing"] = 0
    
    for col in ["skey", "date", "cum_trades_cnt", "total_bid_orders",
        'total_ask_orders', 'total_bid_levels', 'total_ask_levels', 'cum_canceled_buy_orders','cum_canceled_sell_orders',
            "ordering", 'bid_trade_max_duration', 'ask_trade_max_duration','has_missing']:
        SH[col] = SH[col].astype('int32')
    
#     for cols in ["prev_close", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
#              'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p']:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
    
#     for cols in ['cum_amount', "cum_canceled_sell_amount", "cum_canceled_buy_amount"]:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
        
    for cols in ['total_bid_vwap', "total_ask_vwap"]:
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
        SH[cols] = SH[cols].apply(lambda x: round(x, 3))
        
   
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"]) 
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)
    print(datetime.datetime.now() - startTm)
    
    
    # check 1
    startTm = datetime.datetime.now()
    da_te = str(SH["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    db1["ID"] = db1["ID"].str[2:].astype(int) + 1000000
    db1["date"] = (db1["date"].str[:4] + db1["date"].str[5:7] + db1["date"].str[8:]).astype(int)
    SH["cum_max"] = SH.groupby("skey")["cum_volume"].transform(max)
    s2 = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey").first().reset_index()
    dd = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey")["time"].first().reset_index()
    SH.drop("cum_max", axis=1, inplace=True)
    s2 = s2.rename(columns={"skey": "ID", 'open':"d_open", "prev_close":"d_yclose","high":"d_high", "low":"d_low", "close":"d_close", "cum_volume":"d_volume", "cum_amount":"d_amount"})
    if SH["date"].iloc[0] < 20180820:
        s2["auction"] = 0
    else:
        dd["auction"] = np.where(dd["time"]<=145700000000, 0, 1)
        dd = dd.rename(columns={"skey": "ID"})
        s2 = pd.merge(s2, dd[["ID", "auction"]], on="ID")
    s2 = s2[["ID", "date", "d_open", "d_yclose", "d_high", "d_low", "d_close", "d_volume", "d_amount", "auction"]]
    re = pd.merge(db1, s2, on=["ID", "date", "d_open", "d_yclose","d_high", "d_low", "d_volume"], how="outer")
    try:
        assert(sum(re["d_amount_y"].isnull()) == 0)
    except:
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print(re[re["d_amount_y"].isnull()])
        wr_ong += [re[re["d_amount_y"].isnull()]]
    print(datetime.datetime.now() - startTm)
    
    # check 2
    # first part
    startTm = datetime.datetime.now()
    date = pd.DataFrame(pd.date_range(start='2019-06-10 08:30:00', end='2019-06-10 18:00:00', freq='s'), columns=["Orig"])
    date["time"] = date["Orig"].apply(lambda x: int(x.strftime("%H%M%S"))*1000)
    date["group"] = date["time"]//10000
    SH["group"] = SH["time"]//10000000
    gl = date[((date["time"] >= 93000000) & (date["time"] <= 113000000))|((date["time"] >= 130000000) & (date["time"] <= 150000000))]["group"].unique()
    l = set(gl) - set(SH["group"].unique())
    SH["has_missing1"] = 0 
    if len(l) != 0:
        print("massive missing")
        print(l)
        SH["order"] = SH.groupby(["skey", "time"]).cumcount()
        for i in l:
            SH["t"] = SH[SH["group"] > i].groupby("StockID")["time"].transform("min")
            SH["has_missing1"] = np.where((SH["time"] == SH["t"]) & (SH["order"] == 0), 1, 0)
        SH.drop(["order", "t", "group"], axis=1, inplace=True)   
    else:
        print("no massive missing")
        SH.drop(["group"], axis=1, inplace=True)
    



    # second part

    SH["time_interval"] = SH.groupby("skey")["datetime"].apply(lambda x: x - x.shift(1))
    SH["time_interval"] = SH["time_interval"].apply(lambda x: x.seconds)
    SH["tn_update"] = SH.groupby("skey")["cum_trades_cnt"].apply(lambda x: x-x.shift(1))

    f1 = SH[(SH["time"] >= 93000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f1 = f1.rename(columns={"time": "time1"})
    f2 = SH[(SH["time"] >= 130000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f2 = f2.rename(columns={"time": "time2"})
    f3 = SH[(SH["time"] >= 150000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f3 = f3.rename(columns={"time": "time3"})
    SH = pd.merge(SH, f1, on="skey", how="left")
    del f1
    SH = pd.merge(SH, f2, on="skey", how="left")
    del f2
    SH = pd.merge(SH, f3, on="skey", how="left")
    del f3
    p99 = SH[(SH["time"] > 93000000000) & (SH["time"] < 145700000000) & (SH["time"] != SH["time2"]) & (SH["tn_update"] != 0)]\
    .groupby("skey")["tn_update"].apply(lambda x: x.describe([0.99])["99%"]).reset_index()
    p99 = p99.rename(columns={"tn_update":"99%"})
    SH = pd.merge(SH, p99, on="skey", how="left")

    SH["has_missing2"] = 0
    SH["has_missing2"] = np.where((SH["time_interval"] > 60) & (SH["tn_update"] > SH["99%"]) & 
         (SH["time"] > SH["time1"]) & (SH["time"] != SH["time2"]) & (SH["time"] != SH["time3"]) & (SH["time"] != 100000000000), 1, 0)
    SH.drop(["time_interval", "tn_update", "time1", "time2", "time3", "99%"], axis=1, inplace=True) 

    SH["has_missing"] = np.where((SH["has_missing1"] == 1) | (SH["has_missing2"] == 1), 1, 0)
    SH.drop(["has_missing1", "has_missing2"], axis=1, inplace=True) 
    if SH[SH["has_missing"] == 1].shape[0] != 0:
        print("has missing!!!!!!!!!!!!!!!!!!!!!!!")
        print(SH[SH["has_missing"] == 1].shape[0])
        mi_ss += [SH[SH["has_missing"] == 1]]
    print(datetime.datetime.now() - startTm)
    
    
    
    startTm = datetime.datetime.now()
    SH["has_missing"] = SH["has_missing"].astype('int32')
    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "has_missing", "cum_trades_cnt", "cum_volume", "cum_amount", "prev_close",
                            "open", "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p','bid2p','bid1p',
                            'ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'bid10q','bid9q','bid8q',
                             'bid7q','bid6q','bid5q','bid4q','bid3q','bid2q','bid1q', 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q',
                             'ask7q','ask8q','ask9q','ask10q', 'bid10n', 'bid9n', 'bid8n', 'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 
                             'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n','ask7n', 'ask8n', 'ask9n', 'ask10n','bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q',"total_bid_quantity", "total_ask_quantity","total_bid_vwap", "total_ask_vwap",
        "total_bid_orders",'total_ask_orders','total_bid_levels', 'total_ask_levels', 'bid_trade_max_duration', 'ask_trade_max_duration', 'cum_canceled_buy_orders', 'cum_canceled_buy_volume',
        "cum_canceled_buy_amount", "cum_canceled_sell_orders", 'cum_canceled_sell_volume',"cum_canceled_sell_amount"]]
    
    display(SH["date"].iloc[0])
    print("SH finished")
    
    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.223", database_name, user, password)
    db1.write('md_snapshot_l2', SH)
    
    del SH
    print(datetime.datetime.now() - startTm)

wr_ong = pd.concat(wr_ong).reset_index(drop=True)
print(wr_ong)
mi_ss = pd.concat(mi_ss).reset_index(drop=True)
print(mi_ss)
print(less)



0:02:45.984537
0:00:33.841669
20190102 unzip finished
0:00:43.560642
0:01:05.921339
1
2
3
4
5
6
7
8
0:08:44.763251
0:00:30.660770


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0:00:03.001463
no massive missing
0:01:47.504351


20190102

SH finished
0:01:00.025820
0:00:34.239178
20190103 unzip finished
0:00:45.921740
0:01:10.714202
1
2
3
4
5
6
7
8
0:08:57.652666
0:00:31.642739
0:00:03.152455
no massive missing
0:01:45.283587


20190103

SH finished
0:00:53.146966
0:00:46.573862
20190104 unzip finished
0:00:53.639910
0:01:15.000744
1
2
3
4
5
6
7
8
0:09:58.459681
0:00:36.982629
0:00:03.524324
no massive missing
0:01:57.626434


20190104

SH finished
0:00:53.500348
0:00:41.798848
20190107 unzip finished
0:00:49.163369
0:01:17.648687
1
2
3
4
5
6
7
8
0:10:07.260687
0:00:37.993502
0:00:03.390918
no massive missing
0:02:19.672149


20190107

SH finished
0:01:02.538487
0:01:20.969300
20190108 unzip finished
0:00:48.212476
0:01:09.669859
1
2
3
4
5
6
7
8
0:09:39.776289
0:00:35.204770
0:00:03.615784
no massive missing
0:01:51.527356


20190108

SH finished
0:00:51.774512
0:00:58.562927
20190109 unzip finished
0:00:51.663963
0:01:18.108653
1
2
3
4
5
6
7
8
0:10:33.712160
0:00:38.923074
0:00:03.828584
no massive missing
0:02:04.435076


20190109

SH finished
0:00:57.367732
0:00:46.325992
20190110 unzip finished
0:00:47.318049
0:01:14.120923
1
2
3
4
5
6
7
8
0:09:54.943642
0:00:34.307670
0:00:03.315368
no massive missing
0:02:02.792536


20190110

SH finished
0:00:56.568453
0:00:34.760278
20190111 unzip finished
0:00:47.735262
0:01:12.261136
1
2
3
4
5
6
7
8
0:09:58.218760
0:00:46.214046
0:00:03.319891
no massive missing
0:01:54.913931


20190111

SH finished
0:00:55.421452
0:00:40.544814
20190114 unzip finished
0:00:47.323716
0:01:11.356472
1
2
3
4
5
6
7
8
0:09:51.436820
0:00:33.543315
0:00:03.295977
no massive missing
0:01:52.377864


20190114

SH finished
0:00:55.122061
0:00:41.991049
20190115 unzip finished
0:00:50.084752
0:01:13.572987
1
2
3
4
5
6
7
8
0:09:57.961863
0:00:35.490900
0:00:03.523785
no massive missing
0:02:20.376810


20190115

SH finished
0:00:58.778289
0:00:42.971410
20190116 unzip finished
0:00:51.111281
0:01:14.030373
1
2
3
4
5
6
7
8
0:10:06.684940
0:00:35.079690
0:00:03.512448
no massive missing
0:01:54.940473


20190116

SH finished
0:01:00.820420
0:00:43.479462
20190117 unzip finished
0:00:53.447839
0:01:15.367850
1
2
3
4
5
6
7
8
0:10:04.095791
0:00:39.924835
0:00:03.892494
no massive missing
0:02:01.339959


20190117

SH finished
0:00:58.305213
0:00:41.853579
20190118 unzip finished
0:00:51.789398
0:01:15.194699
1
2
3
4
5
6
7
8
0:10:22.155529
0:00:46.213244
0:00:03.525210
no massive missing
0:02:24.239531


20190118

SH finished
0:00:52.614943
0:00:40.922228
20190121 unzip finished
0:00:50.648644
0:01:16.601965
1
2
3
4
5
6
7
8
0:10:14.286730
0:00:35.460695
0:00:04.245376
no massive missing
0:01:52.451651


20190121

SH finished
0:00:56.017454
0:00:42.652202
20190122 unzip finished
0:01:28.504796
0:01:15.575280
1
2
3
4
5
6
7
8
0:09:56.304878
0:00:36.950585
0:00:03.650371
no massive missing
0:01:58.852388


20190122

SH finished
0:00:56.445936
0:00:48.649601
20190123 unzip finished
0:00:47.965847
0:01:15.209873
1
2
3
4
5
6
7
8
0:10:05.216043
0:00:47.432279
0:00:04.316615
no massive missing
0:02:05.967453


20190123

SH finished
0:00:47.360547
0:00:51.514876
20190124 unzip finished
0:00:49.846707
0:01:14.128699
1
2
3
4
5
6
7
8
0:10:16.363401
0:00:42.331633
0:00:03.894810
no massive missing
0:02:03.195172


20190124

SH finished
0:01:06.177360
0:00:41.225798
20190125 unzip finished
0:00:51.816792
0:01:17.840703
1
2
3
4
5
6
7
8
0:10:13.652094
0:00:34.453189
0:00:03.495564
no massive missing
0:01:51.632266


20190125

SH finished
0:01:02.826015
0:00:45.778265
20190128 unzip finished
0:00:51.560450
0:01:14.809191
1
2
3
4
5
6
7
8
0:10:35.678018
0:00:38.741532
0:00:03.665574
no massive missing
0:01:57.009592


20190128

SH finished
0:00:54.864005
0:00:41.279607
20190129 unzip finished
0:00:49.271007
0:01:19.822848
1
2
3
4
5
6
7
8
0:10:13.842107
0:00:35.723109
0:00:04.395274
no massive missing
0:02:16.600238


20190129

SH finished
0:01:11.323909
0:00:36.592427
20190130 unzip finished
0:00:48.707559
0:01:12.197984
1
2
3
4
5
6
7
8
0:09:51.758839
0:00:35.708949
0:00:03.585133
no massive missing
0:01:50.976828


20190130

SH finished
0:01:00.787257
0:00:41.358792
20190131 unzip finished
0:00:54.779245
0:01:17.855764
1
2
3
4
5
6
7
8
0:10:16.317892
0:00:36.652977
0:00:03.779393
no massive missing
0:02:03.678260


20190131

SH finished
0:01:04.507389
0:00:40.255558
20190201 unzip finished
0:00:48.909609
0:01:13.536638
1
2
3
4
5
6
7
8
0:09:51.859938
0:00:34.040086
0:00:03.431051
no massive missing
0:01:55.914691


20190201

SH finished
0:00:53.513977
0:01:02.298861
20190211 unzip finished
0:00:51.520887
0:01:17.296483
1
2
3
4
5
6
7
8
0:10:18.867982
0:00:36.933072
0:00:03.744542
no massive missing
0:02:06.364624


20190211

SH finished
0:01:04.142593
0:00:48.501699
20190212 unzip finished
0:00:54.445138
0:01:21.726556
1
2
3
4
5
6
7
8
0:10:48.411095
0:00:36.191451
0:00:03.921313
no massive missing
0:01:59.729548


20190212

SH finished
0:00:59.985067
0:00:49.522806
20190213 unzip finished
0:00:57.832036
0:01:26.802130
1
2
3
4
5
6
7
8
0:11:18.753690
0:00:42.056242
0:00:04.095586
no massive missing
0:02:32.988362


20190213

SH finished
0:01:12.559951
0:00:56.064423
20190214 unzip finished
0:00:54.692361
0:01:19.549742
1
2
3
4
5
6
7
8
0:10:41.278030
0:00:40.378683
0:00:03.819867
no massive missing
0:02:19.317955


20190214

SH finished
0:01:04.817877
0:00:47.686291
20190215 unzip finished
0:00:52.181199
0:01:20.875058
1
2
3
4
5
6
7
8
0:11:05.480544
0:00:40.351011
0:00:03.919106
no massive missing
0:02:25.749925


20190215

SH finished
0:01:05.258970
0:00:56.888968
20190218 unzip finished
0:00:55.988169
0:01:26.206695
1
2
3
4
5
6
7
8
0:12:27.897163
0:00:41.909578
0:00:04.157175
no massive missing
0:02:17.734738


20190218

SH finished
0:01:17.931353
0:01:01.176531
20190219 unzip finished
0:00:58.450487
0:01:27.013116
1
2
3
4
5
6
7
8
0:12:06.439192
0:00:42.914452
0:00:04.027947
no massive missing
0:02:10.279807


20190219

SH finished
0:01:02.376786
0:00:47.533531
20190220 unzip finished
0:00:54.772412
0:01:24.176498
1
2
3
4
5
6
7
8
0:11:41.373820
0:00:40.279348
0:00:03.942291
no massive missing
0:02:05.549757


20190220

SH finished
0:00:59.261674
0:00:54.383751
20190221 unzip finished
0:01:02.001735
0:01:29.696119
1
2
3
4
5
6
7
8
0:12:11.969460
0:00:43.702106
0:00:04.132642
no massive missing
0:02:19.574896


20190221

SH finished
0:01:20.137664
0:00:57.695764
20190222 unzip finished
0:00:54.008651
0:01:25.275801
1
2
3
4
5
6
7
8
0:11:50.455490
0:00:41.125291
0:00:04.134363
no massive missing
0:02:15.146807


20190222

SH finished
0:01:01.349748
0:01:03.985086
20190225 unzip finished
0:01:17.430013
0:01:36.213121
1
2
3
4
5
6
7
8
0:13:52.466767
0:00:53.771300
0:00:04.404259
no massive missing
0:02:28.320305


20190225

SH finished
0:01:32.041726
0:01:06.210841
20190226 unzip finished
0:01:07.341209
0:01:38.906534
1
2
3
4
5
6
7
8
0:13:10.097639
0:00:44.813499
0:00:04.527017
no massive missing
0:02:32.808467


20190226

SH finished
0:01:35.068344
0:00:52.373781
20190227 unzip finished
0:01:04.177360
0:01:35.229524
1
2
3
4
5
6
7
8
0:12:41.405144
0:00:45.303236
0:00:04.590643
no massive missing
0:02:42.207251


20190227

SH finished
0:01:13.117852
0:00:56.342222
20190228 unzip finished
0:00:57.027257
0:01:33.153947
1
2
3
4
5
6
7
8
0:12:17.633224
0:00:43.333673
0:00:04.310095
no massive missing
0:02:29.409967


20190228

SH finished
0:01:06.822508
0:00:58.818688
20190301 unzip finished
0:00:59.787652
0:01:31.517664
1
2
3
4
5
6
7
8
0:11:56.096308
0:00:42.449342
0:00:04.142125
no massive missing
0:02:18.255491


20190301

SH finished
0:01:15.307485
0:01:08.002547
20190304 unzip finished
0:01:09.116092
0:01:39.022474
1
2
3
4
5
6
7
8
0:14:51.580659
0:00:51.528609
0:00:04.555144
no massive missing
0:02:46.433641


20190304

SH finished
0:01:15.969425
0:00:52.278947
20190305 unzip finished
0:01:02.811176
0:01:36.994744
1
2
3
4
5
6
7
8
0:12:50.121480
0:00:47.150147
0:00:04.565923
no massive missing
0:02:32.482171


20190305

SH finished
0:01:09.261110
0:01:02.021399
20190306 unzip finished
0:01:10.316418
0:01:41.871397
1
2
3
4
5
6
7
8
0:13:29.254216
0:00:47.431598
0:00:04.421499
no massive missing
0:02:31.162628


20190306

SH finished
0:01:15.128354
0:01:10.513474
20190307 unzip finished
0:01:10.996408
0:01:39.421326
1
2
3
4
5
6
7
8
0:13:35.154775
0:00:46.563553
0:00:04.516704
no massive missing
0:02:44.799176


20190307

SH finished
0:01:31.663909
0:00:58.404471
20190308 unzip finished
0:01:07.211702
0:01:38.694658
1
2
3
4
5
6
7
8
0:14:03.156434
0:00:52.089011
0:00:04.605328
no massive missing
0:03:10.526532


20190308

SH finished
0:01:33.111020
0:01:10.055364
20190311 unzip finished
0:01:04.680973
0:01:45.487273
1
2
3
4
5
6
7
8
0:12:52.384479
0:00:43.515227
0:00:04.335994
no massive missing
0:02:58.377502


20190311

SH finished
0:01:25.064226
0:01:14.677722
20190312 unzip finished
0:01:07.744961
0:01:53.146467
1
2
3
4
5
6
7
8
0:13:46.684668
0:00:46.831292
0:00:04.422855
no massive missing
0:02:30.528612


20190312

SH finished
0:01:28.515950
0:00:59.434072
20190313 unzip finished
0:01:04.268569
0:01:38.180483
1
2
3
4
5
6
7
8
0:13:37.970230
0:00:48.000640
0:00:04.545102
no massive missing
0:02:43.319100


20190313

SH finished
0:01:14.426785
0:01:08.445770
20190314 unzip finished
0:01:01.664551
0:01:37.224214
1
2
3
4
5
6
7
8
0:12:55.324572
0:00:44.496257
0:00:04.377166
no massive missing
0:02:30.923008


20190314

SH finished
0:01:10.829585
0:01:40.166280
20190315 unzip finished
0:00:58.422265
0:01:35.425616


In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()











import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = '/home/work516/day_stock/***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SH' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

year = "2019"
startDate = '20190404'
endDate = '20191231'
readPath = '/mnt/usb/data/' + year + '/***/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i).split('_')[0] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
date_list = pd.read_csv("/home/work516/KR_upload_code/trading_days.csv")
wr_ong = []
mi_ss = []
less = []

for data in dataPathLs:
    if len(np.array(glob.glob(data + '/SH/***'))) == 0:
        if int(os.path.basename(data)) not in date_list["Date"].values:
            continue
        else:
            print(os.path.basename(data) + " less data!!!!!!!!!!!!!!!!!")
            less.append(data)
            continue
    startTm = datetime.datetime.now()
    date = os.path.basename(data)
    rar_path = data + '/SH/snapshot.7z'
    path = '/mnt/e/unzip_data/2019/SH'
    path1 = path + '/' + date
    un_path = path1
    cmd = '7za x {} -o{}'.format(rar_path, un_path)
    os.system(cmd)
    print(datetime.datetime.now() - startTm)
    print(date + ' unzip finished')

    readPath = path1 + '/snapshot/***2/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[((dateLs >= 600000) & (dateLs <= 700000))]
    SH = []
    ll = []
    startTm = datetime.datetime.now()
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, usecols = [0,1,3,5,7,9,10,11,15,17,18,19,20,21,22,23,25,26,28,29,30,31,32,33,37,39,40,41,
                                          42,46,47,49,50])
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["StockID"] = int(os.path.basename(i).split('.')[0])
        SH += [df]
    del df
    SH = pd.concat(SH).reset_index(drop=True)
    print(datetime.datetime.now() - startTm)
    
    startTm = datetime.datetime.now()
    SH["skey"] = SH["StockID"] + 1000000
    SH.drop(["StockID"],axis=1,inplace=True)
    SH["date"] = int(SH["QuotTime"].iloc[0]//1000000000)
    SH["time"] = (SH['QuotTime'] - int(SH['QuotTime'].iloc[0]//1000000000*1000000000)).astype(np.int64) * 1000
    SH["clockAtArrival"] = SH["QuotTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH.drop(["QuotTime"],axis=1,inplace=True)
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    print(datetime.datetime.now() - startTm)

    startTm = datetime.datetime.now()
    SH["BidPrice"] = SH["BidPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["OfferPrice"] = SH["OfferPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["BidOrderQty"] = SH["BidOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrderQty"] = SH["OfferOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["BidNumOrders"] = SH["BidNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferNumOrders"] = SH["OfferNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 11):
        SH["bid" + str(i) + 'p'] = SH["BidPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["BidPrice"],axis=1,inplace=True)
    print("1")
    for i in range(1, 11):
        SH["ask" + str(i) + 'p'] = SH["OfferPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["OfferPrice"],axis=1,inplace=True)
    print("2")
    for i in range(1, 11):
        SH["bid" + str(i) + 'q'] = SH["BidOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["BidOrderQty"],axis=1,inplace=True)
    print("3")
    for i in range(1, 11):
        SH["ask" + str(i) + 'q'] = SH["OfferOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["OfferOrderQty"],axis=1,inplace=True)
    print("4")
    for i in range(1, 11):
        SH["bid" + str(i) + 'n'] = SH["BidNumOrders"].apply(lambda x: x[i-1])
        SH["bid" + str(i) + 'n'] = SH["bid" + str(i) + 'n'].astype('int32')
    SH.drop(["BidNumOrders"],axis=1,inplace=True)
    print("5")
    for i in range(1, 11):
        SH["ask" + str(i) + 'n'] = SH["OfferNumOrders"].apply(lambda x: x[i-1])
        SH["ask" + str(i) + 'n'] = SH["ask" + str(i) + 'n'].astype('int32') 
    SH.drop(["OfferNumOrders"],axis=1,inplace=True)
    print("6")
    
    SH["BidOrders"] = SH["BidOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrders"] = SH["OfferOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 51):
        SH["bid1Top" + str(i) + 'q'] = SH["BidOrders"].apply(lambda x: x[i-1])
        SH["bid1Top" + str(i) + 'q'] = SH["bid1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["BidOrders"],axis=1,inplace=True)
    print("7")
    
    for i in range(1, 51):
        SH["ask1Top" + str(i) + 'q'] = SH["OfferOrders"].apply(lambda x: x[i-1])
        SH["ask1Top" + str(i) + 'q'] = SH["ask1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["OfferOrders"],axis=1,inplace=True)
    print("8")
    print(datetime.datetime.now() - startTm)
    
    
    startTm = datetime.datetime.now()
    SH.columns = ['cum_trades_cnt', 'ask_trade_max_duration', 'total_bid_orders',
       'cum_canceled_sell_amount', 'total_ask_quantity', 'cum_canceled_buy_orders',
       'total_ask_vwap', 'cum_canceled_sell_volume', 'cum_volume', 'open',
       'high', 'prev_close', 'low', 'total_bid_vwap',
       'cum_canceled_sell_orders', 'total_ask_orders', 'total_ask_levels',
       'total_bid_quantity', 'cum_canceled_buy_volume', 'bid_trade_max_duration',
       'total_bid_levels', 'close', 'cum_amount', 'cum_canceled_buy_amount', 'skey', 'date', 'time', 'clockAtArrival',
       'datetime', 'bid1p', 'bid2p', 'bid3p', 'bid4p', 'bid5p', 'bid6p',
       'bid7p', 'bid8p', 'bid9p', 'bid10p', 'ask1p', 'ask2p', 'ask3p',
       'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p',
       'bid1q', 'bid2q', 'bid3q', 'bid4q', 'bid5q', 'bid6q', 'bid7q',
       'bid8q', 'bid9q', 'bid10q', 'ask1q', 'ask2q', 'ask3q', 'ask4q',
       'ask5q', 'ask6q', 'ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid1n',
       'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n',
       'bid9n', 'bid10n', 'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n',
       'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'bid1Top1q',
       'bid1Top2q', 'bid1Top3q', 'bid1Top4q', 'bid1Top5q', 'bid1Top6q',
       'bid1Top7q', 'bid1Top8q', 'bid1Top9q', 'bid1Top10q', 'bid1Top11q',
       'bid1Top12q', 'bid1Top13q', 'bid1Top14q', 'bid1Top15q',
       'bid1Top16q', 'bid1Top17q', 'bid1Top18q', 'bid1Top19q',
       'bid1Top20q', 'bid1Top21q', 'bid1Top22q', 'bid1Top23q',
       'bid1Top24q', 'bid1Top25q', 'bid1Top26q', 'bid1Top27q',
       'bid1Top28q', 'bid1Top29q', 'bid1Top30q', 'bid1Top31q',
       'bid1Top32q', 'bid1Top33q', 'bid1Top34q', 'bid1Top35q',
       'bid1Top36q', 'bid1Top37q', 'bid1Top38q', 'bid1Top39q',
       'bid1Top40q', 'bid1Top41q', 'bid1Top42q', 'bid1Top43q',
       'bid1Top44q', 'bid1Top45q', 'bid1Top46q', 'bid1Top47q',
       'bid1Top48q', 'bid1Top49q', 'bid1Top50q', 'ask1Top1q', 'ask1Top2q',
       'ask1Top3q', 'ask1Top4q', 'ask1Top5q', 'ask1Top6q', 'ask1Top7q',
       'ask1Top8q', 'ask1Top9q', 'ask1Top10q', 'ask1Top11q', 'ask1Top12q',
       'ask1Top13q', 'ask1Top14q', 'ask1Top15q', 'ask1Top16q',
       'ask1Top17q', 'ask1Top18q', 'ask1Top19q', 'ask1Top20q',
       'ask1Top21q', 'ask1Top22q', 'ask1Top23q', 'ask1Top24q',
       'ask1Top25q', 'ask1Top26q', 'ask1Top27q', 'ask1Top28q',
       'ask1Top29q', 'ask1Top30q', 'ask1Top31q', 'ask1Top32q',
       'ask1Top33q', 'ask1Top34q', 'ask1Top35q', 'ask1Top36q',
       'ask1Top37q', 'ask1Top38q', 'ask1Top39q', 'ask1Top40q',
       'ask1Top41q', 'ask1Top42q', 'ask1Top43q', 'ask1Top44q',
       'ask1Top45q', 'ask1Top46q', 'ask1Top47q', 'ask1Top48q',
       'ask1Top49q', 'ask1Top50q']
    SH = SH.fillna(0)
#     SH["p1"] = SH["bid1p"] + SH["ask1p"]
#     tt = SH[(SH["cum_volume"] > 0) & (SH["time"] < 145700000000)].groupby("skey")['p1'].min()
#     SH.drop("p1", axis=1, inplace=True)
#     try:
#         assert(tt[tt == 0].shape[0] == 0)
#     except:
#         display(tt[tt == 0])
#     SH = SH[~((SH["bid1p"] == 0) & (SH["ask1p"] == 0))]
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    
    SH["has_missing"] = 0
    
    for col in ["skey", "date", "cum_trades_cnt", "total_bid_orders",
        'total_ask_orders', 'total_bid_levels', 'total_ask_levels', 'cum_canceled_buy_orders','cum_canceled_sell_orders',
            "ordering", 'bid_trade_max_duration', 'ask_trade_max_duration','has_missing']:
        SH[col] = SH[col].astype('int32')
    
#     for cols in ["prev_close", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
#              'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p']:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
    
#     for cols in ['cum_amount', "cum_canceled_sell_amount", "cum_canceled_buy_amount"]:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
        
    for cols in ['total_bid_vwap', "total_ask_vwap"]:
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
        SH[cols] = SH[cols].apply(lambda x: round(x, 3))
        
   
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"]) 
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)
    print(datetime.datetime.now() - startTm)
    
    
    # check 1
    startTm = datetime.datetime.now()
    da_te = str(SH["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    db1["ID"] = db1["ID"].str[2:].astype(int) + 1000000
    db1["date"] = (db1["date"].str[:4] + db1["date"].str[5:7] + db1["date"].str[8:]).astype(int)
    SH["cum_max"] = SH.groupby("skey")["cum_volume"].transform(max)
    s2 = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey").first().reset_index()
    dd = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey")["time"].first().reset_index()
    SH.drop("cum_max", axis=1, inplace=True)
    s2 = s2.rename(columns={"skey": "ID", 'open':"d_open", "prev_close":"d_yclose","high":"d_high", "low":"d_low", "close":"d_close", "cum_volume":"d_volume", "cum_amount":"d_amount"})
    if SH["date"].iloc[0] < 20180820:
        s2["auction"] = 0
    else:
        dd["auction"] = np.where(dd["time"]<=145700000000, 0, 1)
        dd = dd.rename(columns={"skey": "ID"})
        s2 = pd.merge(s2, dd[["ID", "auction"]], on="ID")
    s2 = s2[["ID", "date", "d_open", "d_yclose", "d_high", "d_low", "d_close", "d_volume", "d_amount", "auction"]]
    re = pd.merge(db1, s2, on=["ID", "date", "d_open", "d_yclose","d_high", "d_low", "d_volume"], how="outer")
    try:
        assert(sum(re["d_amount_y"].isnull()) == 0)
    except:
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print(re[re["d_amount_y"].isnull()])
        wr_ong += [re[re["d_amount_y"].isnull()]]
    print(datetime.datetime.now() - startTm)
    
    # check 2
    # first part
    startTm = datetime.datetime.now()
    date = pd.DataFrame(pd.date_range(start='2019-06-10 08:30:00', end='2019-06-10 18:00:00', freq='s'), columns=["Orig"])
    date["time"] = date["Orig"].apply(lambda x: int(x.strftime("%H%M%S"))*1000)
    date["group"] = date["time"]//10000
    SH["group"] = SH["time"]//10000000
    gl = date[((date["time"] >= 93000000) & (date["time"] <= 113000000))|((date["time"] >= 130000000) & (date["time"] <= 150000000))]["group"].unique()
    l = set(gl) - set(SH["group"].unique())
    SH["has_missing1"] = 0 
    if len(l) != 0:
        print("massive missing")
        print(l)
        SH["order"] = SH.groupby(["skey", "time"]).cumcount()
        for i in l:
            SH["t"] = SH[SH["group"] > i].groupby("StockID")["time"].transform("min")
            SH["has_missing1"] = np.where((SH["time"] == SH["t"]) & (SH["order"] == 0), 1, 0)
        SH.drop(["order", "t", "group"], axis=1, inplace=True)   
    else:
        print("no massive missing")
        SH.drop(["group"], axis=1, inplace=True)
    



    # second part

    SH["time_interval"] = SH.groupby("skey")["datetime"].apply(lambda x: x - x.shift(1))
    SH["time_interval"] = SH["time_interval"].apply(lambda x: x.seconds)
    SH["tn_update"] = SH.groupby("skey")["cum_trades_cnt"].apply(lambda x: x-x.shift(1))

    f1 = SH[(SH["time"] >= 93000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f1 = f1.rename(columns={"time": "time1"})
    f2 = SH[(SH["time"] >= 130000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f2 = f2.rename(columns={"time": "time2"})
    f3 = SH[(SH["time"] >= 150000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f3 = f3.rename(columns={"time": "time3"})
    SH = pd.merge(SH, f1, on="skey", how="left")
    del f1
    SH = pd.merge(SH, f2, on="skey", how="left")
    del f2
    SH = pd.merge(SH, f3, on="skey", how="left")
    del f3
    p99 = SH[(SH["time"] > 93000000000) & (SH["time"] < 145700000000) & (SH["time"] != SH["time2"]) & (SH["tn_update"] != 0)]\
    .groupby("skey")["tn_update"].apply(lambda x: x.describe([0.99])["99%"]).reset_index()
    p99 = p99.rename(columns={"tn_update":"99%"})
    SH = pd.merge(SH, p99, on="skey", how="left")

    SH["has_missing2"] = 0
    SH["has_missing2"] = np.where((SH["time_interval"] > 60) & (SH["tn_update"] > SH["99%"]) & 
         (SH["time"] > SH["time1"]) & (SH["time"] != SH["time2"]) & (SH["time"] != SH["time3"]) & (SH["time"] != 100000000000), 1, 0)
    SH.drop(["time_interval", "tn_update", "time1", "time2", "time3", "99%"], axis=1, inplace=True) 

    SH["has_missing"] = np.where((SH["has_missing1"] == 1) | (SH["has_missing2"] == 1), 1, 0)
    SH.drop(["has_missing1", "has_missing2"], axis=1, inplace=True) 
    if SH[SH["has_missing"] == 1].shape[0] != 0:
        print("has missing!!!!!!!!!!!!!!!!!!!!!!!")
        print(SH[SH["has_missing"] == 1].shape[0])
        mi_ss += [SH[SH["has_missing"] == 1]]
    print(datetime.datetime.now() - startTm)
    
    
    
    startTm = datetime.datetime.now()
    SH["has_missing"] = SH["has_missing"].astype('int32')
    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "has_missing", "cum_trades_cnt", "cum_volume", "cum_amount", "prev_close",
                            "open", "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p','bid2p','bid1p',
                            'ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'bid10q','bid9q','bid8q',
                             'bid7q','bid6q','bid5q','bid4q','bid3q','bid2q','bid1q', 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q',
                             'ask7q','ask8q','ask9q','ask10q', 'bid10n', 'bid9n', 'bid8n', 'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 
                             'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n','ask7n', 'ask8n', 'ask9n', 'ask10n','bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q',"total_bid_quantity", "total_ask_quantity","total_bid_vwap", "total_ask_vwap",
        "total_bid_orders",'total_ask_orders','total_bid_levels', 'total_ask_levels', 'bid_trade_max_duration', 'ask_trade_max_duration', 'cum_canceled_buy_orders', 'cum_canceled_buy_volume',
        "cum_canceled_buy_amount", "cum_canceled_sell_orders", 'cum_canceled_sell_volume',"cum_canceled_sell_amount"]]
    
    display(SH["date"].iloc[0])
    print("SH finished")
    
    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.223", database_name, user, password)
    db1.write('md_snapshot_l2', SH)
    
    del SH
    print(datetime.datetime.now() - startTm)

wr_ong = pd.concat(wr_ong).reset_index(drop=True)
print(wr_ong)
mi_ss = pd.concat(mi_ss).reset_index(drop=True)
print(mi_ss)
print(less)



0:02:44.737086
0:00:46.278985
20190404 unzip finished
0:01:03.732189
0:01:42.145168
1
2
3
4
5
6
7
8
0:12:54.793987
0:00:49.827541


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0:00:04.317053
no massive missing
0:02:27.564455


20190404

SH finished
0:01:24.926164
0:00:47.661270
20190408 unzip finished
0:01:03.468443
0:01:45.135777
1
2
3
4
5
6
7
8
0:12:49.612126
0:00:45.379315
0:00:04.459657
no massive missing
0:02:28.437998


20190408

SH finished
0:01:27.075206
0:00:46.131886
20190409 unzip finished
0:01:05.294475
0:01:42.781908
1
2
3
4
5
6
7
8
0:12:21.938244
0:00:51.226768
0:00:04.627472
no massive missing
0:02:22.532368


20190409

SH finished
0:01:32.584276
0:00:45.009972
20190410 unzip finished
0:01:00.512704
0:01:40.938759
1
2
3
4
5
6
7
8
0:12:34.011283
0:00:46.650469
0:00:04.282685
no massive missing
0:02:26.699949


20190410

SH finished
0:01:15.209660
0:00:45.173687
20190411 unzip finished
0:01:00.472527
0:01:41.739439
1
2
3
4
5
6
7
8
0:12:33.283835
0:00:42.997688
0:00:04.286310
no massive missing
0:02:23.499989


20190411

SH finished
0:01:09.504753
0:00:44.223090
20190412 unzip finished
0:00:59.307537
0:01:38.647604
1
2
3
4
5
6
7
8
0:11:51.814128
0:00:42.263547
0:00:04.286566
no massive missing
0:02:20.129593


20190412

SH finished
0:01:05.948166
0:00:44.061311
20190415 unzip finished
0:01:00.320900
0:01:40.480783
1
2
3
4
5
6
7
8
0:11:59.333821
0:00:43.908883
0:00:04.185500
no massive missing
0:02:18.697451


20190415

SH finished
0:01:09.246871
0:00:45.902015
20190416 unzip finished
0:01:03.226203
0:01:42.504117
1
2
3
4
5
6
7
8
0:12:19.032859
0:00:42.967412
0:00:04.341413
no massive missing
0:02:22.017195


20190416

SH finished
0:01:30.419283
0:00:55.519503
20190417 unzip finished
0:00:59.761428
0:01:41.371895
1
2
3
4
5
6
7
8
0:12:30.907860
0:00:45.607213
0:00:04.503319
no massive missing
0:02:26.476750


20190417

SH finished
0:01:21.077790
0:00:46.584933
20190418 unzip finished
0:00:59.960018
0:01:40.721263
1
2
3
4
5
6
7
8
0:12:06.128015
0:00:43.694033
0:00:04.629444
no massive missing
0:02:44.974655


20190418

SH finished
0:01:10.699541
0:00:43.173262
20190419 unzip finished
0:00:58.004310
0:01:44.213876
1
2
3
4
5
6
7
8
0:12:28.960156
0:00:47.153221
0:00:04.746797
no massive missing
0:02:37.899732


20190419

SH finished
0:01:10.840862
0:00:47.190677
20190422 unzip finished
0:01:02.398595
0:01:38.757822
1
2
3
4
5
6
7
8
0:12:14.566310
0:00:45.483416
0:00:04.649754
no massive missing
0:02:30.631588


20190422

SH finished
0:01:10.299564
0:00:47.880109
20190423 unzip finished
0:01:01.589322
0:01:41.262475
1
2
3
4
5
6
7
8
0:12:09.076378
0:00:45.278169
0:00:04.579237
no massive missing
0:02:38.228676


20190423

SH finished
0:01:22.341567
0:00:44.713789
20190424 unzip finished
0:00:57.021447
0:01:35.810124
1
2
3
4
5
6
7
8
0:11:41.635782
0:00:43.414567
0:00:04.383456
no massive missing
0:02:40.023823


20190424

SH finished
0:01:10.519565
0:00:43.062073
20190425 unzip finished
0:01:02.261895
0:01:41.898379
1
2
3
4
5
6
7
8
0:12:04.922073
0:00:43.290271
0:00:04.260365
no massive missing
0:02:33.218485


20190425

SH finished
0:01:26.538522
0:00:46.670395
20190426 unzip finished
0:01:01.635443
0:01:38.719289
1
2
3
4
5
6
7
8
0:11:42.781720
0:00:44.129704
0:00:04.417841
no massive missing
0:02:26.838503


20190426

SH finished
0:01:23.599784
0:00:43.952056
20190429 unzip finished
0:00:59.099280
0:01:46.259085
1
2
3
4
5
6
7
8
0:11:38.902237
0:00:44.469567
0:00:05.107945
no massive missing
0:02:26.793999


20190429

SH finished
0:01:19.441313
0:00:40.788032
20190430 unzip finished
0:00:55.835066
0:01:46.187529
1
2
3
4
5
6
7
8
0:11:13.410849
0:00:41.988429
0:00:04.042118
no massive missing
0:02:17.165371


20190430

SH finished
0:01:07.301662
0:00:43.704190
20190506 unzip finished
0:00:57.833918
0:01:49.911020
1
2
3
4
5
6
7
8
0:11:58.157709
0:00:44.673223
0:00:04.222155
no massive missing
0:02:33.386601


20190506

SH finished
0:01:22.800104
0:00:43.779672
20190507 unzip finished
0:00:57.720211
0:01:35.698369
1
2
3
4
5
6
7
8
0:11:06.568463
0:00:47.672390
0:00:04.393496
no massive missing
0:02:27.581739


20190507

SH finished
0:01:06.656068
0:00:39.880622
20190508 unzip finished
0:00:57.586799
0:01:35.150756
1
2
3
4
5
6
7
8
0:11:03.773746
0:00:40.804596
0:00:04.469971
no massive missing
0:02:23.079472


20190508

SH finished
0:01:26.869879
0:00:40.090253
20190509 unzip finished
0:00:55.092674
0:01:29.904602
1
2
3
4
5
6
7
8
0:10:39.900210
0:00:38.145700
0:00:03.980153
no massive missing
0:02:06.426172


20190509

SH finished
0:00:59.749151
0:00:43.666275
20190510 unzip finished
0:01:04.903579
0:01:36.539230
1
2
3
4
5
6
7
8
0:11:19.936118
0:00:42.724252
0:00:04.429182
no massive missing
0:02:27.649862


20190510

SH finished
0:01:17.625113
0:00:36.667851
20190513 unzip finished
0:00:51.231684
0:01:27.247639
1
2
3
4
5
6
7
8
0:10:45.514821
0:00:40.771804
0:00:03.953454
no massive missing
0:02:11.343120


20190513

SH finished
0:01:11.136376
0:00:42.924923
20190514 unzip finished
0:00:57.487620
0:01:34.295889
1
2
3
4
5
6
7
8
0:11:02.387810
0:00:42.393539
0:00:04.218906
no massive missing
0:02:10.154520


20190514

SH finished
0:01:07.918076
0:00:39.746229
20190515 unzip finished
0:00:54.962221
0:01:35.071519
1
2
3
4
5
6
7
8
0:11:24.811962
0:00:40.555852
0:00:03.958397
no massive missing
0:02:12.896857


20190515

SH finished
0:01:17.375018
0:00:40.909000
20190516 unzip finished
0:00:57.376196
0:01:34.399037
1
2
3
4
5
6
7
8
0:11:14.328069
0:00:38.860188
0:00:03.955805
no massive missing
0:02:08.313173


20190516

SH finished
0:00:59.404403
0:00:46.585044
20190517 unzip finished
0:00:53.509952
0:01:33.806753
1
2
3
4
5
6
7
8
0:11:33.900032
0:00:40.376331
0:00:04.053795
no massive missing
0:02:17.021799


20190517

SH finished
0:01:26.221307
0:00:38.405468
20190520 unzip finished
0:00:53.034520
0:01:27.787971
1
2
3
4
5
6
7
8
0:10:37.789233
0:00:42.409810
0:00:03.909095
no massive missing
0:02:09.233029


20190520

SH finished
0:00:59.009634
0:00:38.979410
20190521 unzip finished
0:00:52.815266
0:01:30.203402
1
2
3
4
5
6
7
8
0:10:59.957378
0:00:40.243299
0:00:04.039688
no massive missing
0:02:11.337044


20190521

SH finished
0:01:18.040730
0:00:37.487121
20190522 unzip finished
0:00:51.263340
0:01:25.982094
1
2
3
4
5
6
7
8
0:10:36.905613
0:00:38.067247
0:00:03.853521
no massive missing
0:02:04.554999


20190522

SH finished
0:01:04.937659
0:00:39.311537
20190523 unzip finished
0:00:52.141165
0:01:26.764293
1
2
3
4
5
6
7
8
0:10:28.700324
0:00:36.164984
0:00:03.640758
no massive missing
0:02:02.890228


20190523

SH finished
0:01:05.152301
0:00:36.059137
20190524 unzip finished
0:00:51.106707
0:01:22.773149
1
2
3
4
5
6
7
8
0:10:00.596296
0:00:35.044173
0:00:03.531098
no massive missing
0:01:57.707771


20190524

SH finished
0:01:06.137447
0:00:00.117599
20190525 unzip finished


NameError: name 'df' is not defined

In [None]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()











import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = '/home/work516/day_stock/***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SH' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

year = "2019"
startDate = '20190525'
endDate = '20191231'
readPath = '/mnt/usb/data/' + year + '/***/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i).split('_')[0] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
date_list = pd.read_csv("/home/work516/KR_upload_code/trading_days.csv")
wr_ong = []
mi_ss = []
less = []

for data in dataPathLs:
    if len(np.array(glob.glob(data + '/SH/snapshot***'))) == 0:
        if int(os.path.basename(data)) not in date_list["Date"].values:
            continue
        else:
            print(os.path.basename(data) + " less data!!!!!!!!!!!!!!!!!")
            less.append(data)
            continue
    startTm = datetime.datetime.now()
    date = os.path.basename(data)
    rar_path = data + '/SH/snapshot.7z'
    path = '/mnt/e/unzip_data/2019/SH'
    path1 = path + '/' + date
    un_path = path1
    cmd = '7za x {} -o{}'.format(rar_path, un_path)
    os.system(cmd)
    print(datetime.datetime.now() - startTm)
    print(date + ' unzip finished')

    readPath = path1 + '/snapshot/***2/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[((dateLs >= 600000) & (dateLs <= 700000))]
    SH = []
    ll = []
    startTm = datetime.datetime.now()
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, usecols = [0,1,3,5,7,9,10,11,15,17,18,19,20,21,22,23,25,26,28,29,30,31,32,33,37,39,40,41,
                                          42,46,47,49,50])
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["StockID"] = int(os.path.basename(i).split('.')[0])
        SH += [df]
    del df
    SH = pd.concat(SH).reset_index(drop=True)
    print(datetime.datetime.now() - startTm)
    
    startTm = datetime.datetime.now()
    SH["skey"] = SH["StockID"] + 1000000
    SH.drop(["StockID"],axis=1,inplace=True)
    SH["date"] = int(SH["QuotTime"].iloc[0]//1000000000)
    SH["time"] = (SH['QuotTime'] - int(SH['QuotTime'].iloc[0]//1000000000*1000000000)).astype(np.int64) * 1000
    SH["clockAtArrival"] = SH["QuotTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH.drop(["QuotTime"],axis=1,inplace=True)
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    print(datetime.datetime.now() - startTm)

    startTm = datetime.datetime.now()
    SH["BidPrice"] = SH["BidPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["OfferPrice"] = SH["OfferPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["BidOrderQty"] = SH["BidOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrderQty"] = SH["OfferOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["BidNumOrders"] = SH["BidNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferNumOrders"] = SH["OfferNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 11):
        SH["bid" + str(i) + 'p'] = SH["BidPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["BidPrice"],axis=1,inplace=True)
    print("1")
    for i in range(1, 11):
        SH["ask" + str(i) + 'p'] = SH["OfferPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["OfferPrice"],axis=1,inplace=True)
    print("2")
    for i in range(1, 11):
        SH["bid" + str(i) + 'q'] = SH["BidOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["BidOrderQty"],axis=1,inplace=True)
    print("3")
    for i in range(1, 11):
        SH["ask" + str(i) + 'q'] = SH["OfferOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["OfferOrderQty"],axis=1,inplace=True)
    print("4")
    for i in range(1, 11):
        SH["bid" + str(i) + 'n'] = SH["BidNumOrders"].apply(lambda x: x[i-1])
        SH["bid" + str(i) + 'n'] = SH["bid" + str(i) + 'n'].astype('int32')
    SH.drop(["BidNumOrders"],axis=1,inplace=True)
    print("5")
    for i in range(1, 11):
        SH["ask" + str(i) + 'n'] = SH["OfferNumOrders"].apply(lambda x: x[i-1])
        SH["ask" + str(i) + 'n'] = SH["ask" + str(i) + 'n'].astype('int32') 
    SH.drop(["OfferNumOrders"],axis=1,inplace=True)
    print("6")
    
    SH["BidOrders"] = SH["BidOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrders"] = SH["OfferOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 51):
        SH["bid1Top" + str(i) + 'q'] = SH["BidOrders"].apply(lambda x: x[i-1])
        SH["bid1Top" + str(i) + 'q'] = SH["bid1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["BidOrders"],axis=1,inplace=True)
    print("7")
    
    for i in range(1, 51):
        SH["ask1Top" + str(i) + 'q'] = SH["OfferOrders"].apply(lambda x: x[i-1])
        SH["ask1Top" + str(i) + 'q'] = SH["ask1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["OfferOrders"],axis=1,inplace=True)
    print("8")
    print(datetime.datetime.now() - startTm)
    
    
    startTm = datetime.datetime.now()
    SH.columns = ['cum_trades_cnt', 'ask_trade_max_duration', 'total_bid_orders',
       'cum_canceled_sell_amount', 'total_ask_quantity', 'cum_canceled_buy_orders',
       'total_ask_vwap', 'cum_canceled_sell_volume', 'cum_volume', 'open',
       'high', 'prev_close', 'low', 'total_bid_vwap',
       'cum_canceled_sell_orders', 'total_ask_orders', 'total_ask_levels',
       'total_bid_quantity', 'cum_canceled_buy_volume', 'bid_trade_max_duration',
       'total_bid_levels', 'close', 'cum_amount', 'cum_canceled_buy_amount', 'skey', 'date', 'time', 'clockAtArrival',
       'datetime', 'bid1p', 'bid2p', 'bid3p', 'bid4p', 'bid5p', 'bid6p',
       'bid7p', 'bid8p', 'bid9p', 'bid10p', 'ask1p', 'ask2p', 'ask3p',
       'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p',
       'bid1q', 'bid2q', 'bid3q', 'bid4q', 'bid5q', 'bid6q', 'bid7q',
       'bid8q', 'bid9q', 'bid10q', 'ask1q', 'ask2q', 'ask3q', 'ask4q',
       'ask5q', 'ask6q', 'ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid1n',
       'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n',
       'bid9n', 'bid10n', 'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n',
       'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'bid1Top1q',
       'bid1Top2q', 'bid1Top3q', 'bid1Top4q', 'bid1Top5q', 'bid1Top6q',
       'bid1Top7q', 'bid1Top8q', 'bid1Top9q', 'bid1Top10q', 'bid1Top11q',
       'bid1Top12q', 'bid1Top13q', 'bid1Top14q', 'bid1Top15q',
       'bid1Top16q', 'bid1Top17q', 'bid1Top18q', 'bid1Top19q',
       'bid1Top20q', 'bid1Top21q', 'bid1Top22q', 'bid1Top23q',
       'bid1Top24q', 'bid1Top25q', 'bid1Top26q', 'bid1Top27q',
       'bid1Top28q', 'bid1Top29q', 'bid1Top30q', 'bid1Top31q',
       'bid1Top32q', 'bid1Top33q', 'bid1Top34q', 'bid1Top35q',
       'bid1Top36q', 'bid1Top37q', 'bid1Top38q', 'bid1Top39q',
       'bid1Top40q', 'bid1Top41q', 'bid1Top42q', 'bid1Top43q',
       'bid1Top44q', 'bid1Top45q', 'bid1Top46q', 'bid1Top47q',
       'bid1Top48q', 'bid1Top49q', 'bid1Top50q', 'ask1Top1q', 'ask1Top2q',
       'ask1Top3q', 'ask1Top4q', 'ask1Top5q', 'ask1Top6q', 'ask1Top7q',
       'ask1Top8q', 'ask1Top9q', 'ask1Top10q', 'ask1Top11q', 'ask1Top12q',
       'ask1Top13q', 'ask1Top14q', 'ask1Top15q', 'ask1Top16q',
       'ask1Top17q', 'ask1Top18q', 'ask1Top19q', 'ask1Top20q',
       'ask1Top21q', 'ask1Top22q', 'ask1Top23q', 'ask1Top24q',
       'ask1Top25q', 'ask1Top26q', 'ask1Top27q', 'ask1Top28q',
       'ask1Top29q', 'ask1Top30q', 'ask1Top31q', 'ask1Top32q',
       'ask1Top33q', 'ask1Top34q', 'ask1Top35q', 'ask1Top36q',
       'ask1Top37q', 'ask1Top38q', 'ask1Top39q', 'ask1Top40q',
       'ask1Top41q', 'ask1Top42q', 'ask1Top43q', 'ask1Top44q',
       'ask1Top45q', 'ask1Top46q', 'ask1Top47q', 'ask1Top48q',
       'ask1Top49q', 'ask1Top50q']
    SH = SH.fillna(0)
#     SH["p1"] = SH["bid1p"] + SH["ask1p"]
#     tt = SH[(SH["cum_volume"] > 0) & (SH["time"] < 145700000000)].groupby("skey")['p1'].min()
#     SH.drop("p1", axis=1, inplace=True)
#     try:
#         assert(tt[tt == 0].shape[0] == 0)
#     except:
#         display(tt[tt == 0])
#     SH = SH[~((SH["bid1p"] == 0) & (SH["ask1p"] == 0))]
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    
    SH["has_missing"] = 0
    
    for col in ["skey", "date", "cum_trades_cnt", "total_bid_orders",
        'total_ask_orders', 'total_bid_levels', 'total_ask_levels', 'cum_canceled_buy_orders','cum_canceled_sell_orders',
            "ordering", 'bid_trade_max_duration', 'ask_trade_max_duration','has_missing']:
        SH[col] = SH[col].astype('int32')
    
#     for cols in ["prev_close", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
#              'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p']:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
    
#     for cols in ['cum_amount', "cum_canceled_sell_amount", "cum_canceled_buy_amount"]:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
        
    for cols in ['total_bid_vwap', "total_ask_vwap"]:
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
        SH[cols] = SH[cols].apply(lambda x: round(x, 3))
        
   
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"]) 
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)
    print(datetime.datetime.now() - startTm)
    
    
    # check 1
    startTm = datetime.datetime.now()
    da_te = str(SH["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    db1["ID"] = db1["ID"].str[2:].astype(int) + 1000000
    db1["date"] = (db1["date"].str[:4] + db1["date"].str[5:7] + db1["date"].str[8:]).astype(int)
    SH["cum_max"] = SH.groupby("skey")["cum_volume"].transform(max)
    s2 = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey").first().reset_index()
    dd = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey")["time"].first().reset_index()
    SH.drop("cum_max", axis=1, inplace=True)
    s2 = s2.rename(columns={"skey": "ID", 'open':"d_open", "prev_close":"d_yclose","high":"d_high", "low":"d_low", "close":"d_close", "cum_volume":"d_volume", "cum_amount":"d_amount"})
    if SH["date"].iloc[0] < 20180820:
        s2["auction"] = 0
    else:
        dd["auction"] = np.where(dd["time"]<=145700000000, 0, 1)
        dd = dd.rename(columns={"skey": "ID"})
        s2 = pd.merge(s2, dd[["ID", "auction"]], on="ID")
    s2 = s2[["ID", "date", "d_open", "d_yclose", "d_high", "d_low", "d_close", "d_volume", "d_amount", "auction"]]
    re = pd.merge(db1, s2, on=["ID", "date", "d_open", "d_yclose","d_high", "d_low", "d_volume"], how="outer")
    try:
        assert(sum(re["d_amount_y"].isnull()) == 0)
    except:
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print(re[re["d_amount_y"].isnull()])
        wr_ong += [re[re["d_amount_y"].isnull()]]
    print(datetime.datetime.now() - startTm)
    
    # check 2
    # first part
    startTm = datetime.datetime.now()
    date = pd.DataFrame(pd.date_range(start='2019-06-10 08:30:00', end='2019-06-10 18:00:00', freq='s'), columns=["Orig"])
    date["time"] = date["Orig"].apply(lambda x: int(x.strftime("%H%M%S"))*1000)
    date["group"] = date["time"]//10000
    SH["group"] = SH["time"]//10000000
    gl = date[((date["time"] >= 93000000) & (date["time"] <= 113000000))|((date["time"] >= 130000000) & (date["time"] <= 150000000))]["group"].unique()
    l = set(gl) - set(SH["group"].unique())
    SH["has_missing1"] = 0 
    if len(l) != 0:
        print("massive missing")
        print(l)
        SH["order"] = SH.groupby(["skey", "time"]).cumcount()
        for i in l:
            SH["t"] = SH[SH["group"] > i].groupby("StockID")["time"].transform("min")
            SH["has_missing1"] = np.where((SH["time"] == SH["t"]) & (SH["order"] == 0), 1, 0)
        SH.drop(["order", "t", "group"], axis=1, inplace=True)   
    else:
        print("no massive missing")
        SH.drop(["group"], axis=1, inplace=True)
    



    # second part

    SH["time_interval"] = SH.groupby("skey")["datetime"].apply(lambda x: x - x.shift(1))
    SH["time_interval"] = SH["time_interval"].apply(lambda x: x.seconds)
    SH["tn_update"] = SH.groupby("skey")["cum_trades_cnt"].apply(lambda x: x-x.shift(1))

    f1 = SH[(SH["time"] >= 93000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f1 = f1.rename(columns={"time": "time1"})
    f2 = SH[(SH["time"] >= 130000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f2 = f2.rename(columns={"time": "time2"})
    f3 = SH[(SH["time"] >= 150000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f3 = f3.rename(columns={"time": "time3"})
    SH = pd.merge(SH, f1, on="skey", how="left")
    del f1
    SH = pd.merge(SH, f2, on="skey", how="left")
    del f2
    SH = pd.merge(SH, f3, on="skey", how="left")
    del f3
    p99 = SH[(SH["time"] > 93000000000) & (SH["time"] < 145700000000) & (SH["time"] != SH["time2"]) & (SH["tn_update"] != 0)]\
    .groupby("skey")["tn_update"].apply(lambda x: x.describe([0.99])["99%"]).reset_index()
    p99 = p99.rename(columns={"tn_update":"99%"})
    SH = pd.merge(SH, p99, on="skey", how="left")

    SH["has_missing2"] = 0
    SH["has_missing2"] = np.where((SH["time_interval"] > 60) & (SH["tn_update"] > SH["99%"]) & 
         (SH["time"] > SH["time1"]) & (SH["time"] != SH["time2"]) & (SH["time"] != SH["time3"]) & (SH["time"] != 100000000000), 1, 0)
    SH.drop(["time_interval", "tn_update", "time1", "time2", "time3", "99%"], axis=1, inplace=True) 

    SH["has_missing"] = np.where((SH["has_missing1"] == 1) | (SH["has_missing2"] == 1), 1, 0)
    SH.drop(["has_missing1", "has_missing2"], axis=1, inplace=True) 
    if SH[SH["has_missing"] == 1].shape[0] != 0:
        print("has missing!!!!!!!!!!!!!!!!!!!!!!!")
        print(SH[SH["has_missing"] == 1].shape[0])
        mi_ss += [SH[SH["has_missing"] == 1]]
    print(datetime.datetime.now() - startTm)
    
    
    
    startTm = datetime.datetime.now()
    SH["has_missing"] = SH["has_missing"].astype('int32')
    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "has_missing", "cum_trades_cnt", "cum_volume", "cum_amount", "prev_close",
                            "open", "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p','bid2p','bid1p',
                            'ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'bid10q','bid9q','bid8q',
                             'bid7q','bid6q','bid5q','bid4q','bid3q','bid2q','bid1q', 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q',
                             'ask7q','ask8q','ask9q','ask10q', 'bid10n', 'bid9n', 'bid8n', 'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 
                             'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n','ask7n', 'ask8n', 'ask9n', 'ask10n','bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q',"total_bid_quantity", "total_ask_quantity","total_bid_vwap", "total_ask_vwap",
        "total_bid_orders",'total_ask_orders','total_bid_levels', 'total_ask_levels', 'bid_trade_max_duration', 'ask_trade_max_duration', 'cum_canceled_buy_orders', 'cum_canceled_buy_volume',
        "cum_canceled_buy_amount", "cum_canceled_sell_orders", 'cum_canceled_sell_volume',"cum_canceled_sell_amount"]]
    
    display(SH["date"].iloc[0])
    print("SH finished")
    
    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.223", database_name, user, password)
    db1.write('md_snapshot_l2', SH)
    
    del SH
    print(datetime.datetime.now() - startTm)

wr_ong = pd.concat(wr_ong).reset_index(drop=True)
print(wr_ong)
mi_ss = pd.concat(mi_ss).reset_index(drop=True)
print(mi_ss)
print(less)



0:02:57.707707
0:00:40.057232
20190527 unzip finished
0:00:52.226121
0:01:29.592609
1
2
3
4
5
6
7
8
0:10:30.418661
0:00:40.780117


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0:00:04.193168
no massive missing
0:02:06.289264


20190527

SH finished
0:01:18.228015
0:00:42.506151
20190528 unzip finished
0:00:52.794362
0:01:30.427240
1
2
3
4
5
6
7
8
0:10:33.243365
0:00:39.585516
0:00:04.210347
no massive missing
0:02:10.630289


20190528

SH finished
0:00:59.154527
0:00:37.091933
20190529 unzip finished
0:00:51.831947
0:01:23.538522
1
2
3
4
5
6
7
8
0:10:07.137957
0:00:37.229706
0:00:03.922981
no massive missing
0:02:12.334137


20190529

SH finished
0:01:09.643000
0:00:37.703799
20190530 unzip finished
0:00:54.427807
0:01:37.571803
1
2
3
4
5
6
7
8
0:10:26.928351
0:00:38.718840
0:00:04.084201
no massive missing
0:02:04.398854


20190530

SH finished
0:00:57.335476
0:00:38.569913
20190531 unzip finished
0:00:51.640409
0:01:27.264068
1
2
3
4
5
6
7
8
0:10:21.194872
0:00:38.049308
0:00:03.810380
no massive missing
0:02:06.597370


20190531

SH finished
0:01:06.097591
0:00:38.082572
20190603 unzip finished
0:00:57.750131
0:01:31.312410
1
2
3
4
5
6
7
8
0:10:29.803799
0:00:38.667579
0:00:04.080691
no massive missing
0:02:11.344397


20190603

SH finished
0:01:02.841474
0:00:36.032298
20190604 unzip finished
0:00:53.062491
0:01:30.955574
1
2
3
4
5
6
7
8
0:10:09.313502
0:00:38.581019
0:00:03.986889
no massive missing
0:02:10.219364


20190604

SH finished
0:01:23.501268
0:00:35.427915
20190605 unzip finished
0:00:50.069064
0:01:26.201747
1
2
3
4
5
6
7
8
0:10:03.540718
0:00:39.112931
0:00:03.988150
no massive missing
0:02:05.457001


20190605

SH finished
0:01:05.943947
0:00:37.388285
20190606 unzip finished
0:00:54.575160
0:01:29.369265
1
2
3
4
5
6
7
8
0:10:20.915814
0:00:36.765545
0:00:03.881667
no massive missing
0:02:03.097931


20190606

SH finished
0:01:04.509070
0:00:36.159091
20190610 unzip finished
0:00:53.576718
0:01:25.090416
1
2
3
4
5
6
7
8
0:10:02.965350
0:00:35.235128
0:00:03.744307
no massive missing
0:01:58.258142


20190610

SH finished
0:00:55.721974
0:00:40.708079
20190611 unzip finished
0:00:54.969812
0:01:36.865543
1
2
3
4
5
6
7
8
0:11:30.487050
0:00:40.807183
0:00:04.020296
no massive missing
0:02:20.589129


20190611

SH finished
0:01:12.707534
0:00:37.366344
20190612 unzip finished
0:00:54.203833
0:01:28.300195
1
2
3
4
5
6
7
8
0:10:44.935086
0:00:38.364219
0:00:03.824379
no massive missing
0:02:09.534318


20190612

SH finished
0:01:05.006434
0:00:38.345576
20190613 unzip finished
0:00:54.487204
0:01:27.478859
1
2
3
4
5
6
7
8
0:10:53.770989
0:00:37.975151
0:00:03.987782
no massive missing
0:02:09.404406


20190613

SH finished
0:01:03.962180
0:00:36.623950
20190614 unzip finished
0:00:53.660657
0:01:28.692850
1
2
3
4
5
6
7
8
0:10:28.884164
0:00:40.128028
0:00:04.384057
no massive missing
0:02:10.068850


20190614

SH finished
0:00:59.305885
0:00:35.895430
20190617 unzip finished
0:00:50.053240
0:01:33.951729
1
2
3
4
5
6
7
8
0:09:50.010081
0:00:38.698592
0:00:03.850044
no massive missing
0:02:12.208972


20190617

SH finished
0:00:57.157722
0:00:38.808990
20190618 unzip finished
0:00:49.046543
0:01:22.462655
1
2
3
4
5
6
7
8
0:09:46.011613
0:00:41.713761
0:00:03.798736
no massive missing
0:02:18.744689


20190618

SH finished
0:01:08.074750
0:00:37.358704
20190619 unzip finished
0:00:53.265743
0:01:30.095176
1
2
3
4
5
6
7
8
0:10:39.372161
0:00:37.844606
0:00:03.895166
no massive missing
0:02:08.052347


20190619

SH finished
0:00:58.570024
0:00:40.052684
20190620 unzip finished
0:00:57.366631
0:01:39.057552
1
2
3
4
5
6
7
8
0:11:37.344368
0:00:41.769304
0:00:04.262652
no massive missing
0:02:18.563184


20190620

SH finished
0:01:18.499565
0:00:42.661987
20190621 unzip finished
0:00:56.639511
0:01:38.736698
1
2
3
4
5
6
7
8
0:11:46.470169
0:00:42.574635
0:00:04.299234
no massive missing
0:02:20.940815


20190621

SH finished
0:01:06.922408
0:00:36.747188
20190624 unzip finished
0:00:54.059201
0:01:33.739745
1
2
3
4
5
6
7
8
0:10:56.119095
0:00:43.183356
0:00:03.950233
no massive missing
0:02:10.032868


20190624

SH finished
0:01:00.501196
0:00:38.560857
20190625 unzip finished
0:00:54.637425
0:01:34.680275
1
2
3
4
5
6
7
8
0:11:10.391835
0:00:38.797362
0:00:03.947475
no massive missing
0:02:05.145521


20190625

SH finished
0:00:58.517256
0:00:36.600202
20190626 unzip finished
0:00:50.603949
0:01:26.775951
1
2
3
4
5
6
7
8
0:10:22.760621
0:00:39.843593
0:00:03.749294
no massive missing
0:02:04.978553


20190626

SH finished
0:01:01.512695
0:00:49.398424
20190627 unzip finished
0:00:52.491301
0:01:30.908893
1
2
3
4
5
6
7
8
0:10:47.667213
0:00:41.780172
0:00:03.977319
no massive missing
0:02:22.350460


20190627

SH finished
0:01:08.473925
0:00:38.074658
20190628 unzip finished
0:00:53.413594
0:01:31.272127
1
2
3
4
5
6
7
8
0:10:47.805144
0:00:38.910947
0:00:04.214161
no massive missing
0:02:07.574414


20190628

SH finished
0:00:59.721649
0:00:39.663234
20190701 unzip finished
0:00:53.503300
0:01:37.632233
1
2
3
4
5
6
7
8
0:11:34.446423
0:00:39.989856
0:00:04.552235
no massive missing
0:02:12.313325


20190701

SH finished
0:01:10.972329
0:00:39.605303
20190702 unzip finished
0:00:57.017209
0:01:34.297263
1
2
3
4
5
6
7
8
0:11:15.317211
0:00:38.876562
0:00:03.877523
no massive missing
0:02:09.981131


20190702

SH finished
0:01:09.242369
0:00:38.939483
20190703 unzip finished
0:00:52.381113
0:01:29.062519
1
2
3
4
5
6
7
8
0:10:48.002191
0:00:40.274631
0:00:03.833359
no massive missing
0:02:07.649032


20190703

SH finished
0:01:05.327417
0:00:38.992159
20190704 unzip finished
0:00:54.993837
0:01:29.681243
1
2
3
4
5
6
7
8
0:10:31.232711
0:00:39.076676
0:00:03.803325
no massive missing
0:02:01.114551


20190704

SH finished
0:01:09.267873
0:00:35.464587
20190705 unzip finished
0:00:51.162454
0:01:26.415601
1
2
3
4
5
6
7
8
0:10:22.452924
0:00:40.063538
0:00:03.885997
no massive missing
0:02:03.037607


20190705

SH finished
0:01:08.562759
0:00:38.278079
20190708 unzip finished
0:00:52.312434
0:01:34.555632
1
2
3
4
5
6
7
8
0:11:18.506702
0:00:38.344236
0:00:03.887581
no massive missing
0:02:10.011194


20190708

SH finished
0:00:59.831347
0:00:37.550236
20190709 unzip finished
0:00:47.919964
0:01:25.814156
1
2
3
4
5
6
7
8
0:10:05.661974
0:00:36.907653
0:00:03.686066
no massive missing
0:02:02.721785


20190709

SH finished
0:00:57.603693
0:00:34.872591
20190710 unzip finished
0:00:50.456443
0:01:23.859140
1
2
3
4
5
6
7
8
0:10:03.758661
0:00:36.571576
0:00:03.632409
no massive missing
0:02:05.483168


20190710

SH finished
0:00:54.130319
0:00:35.322759
20190711 unzip finished
0:00:49.810398
0:01:24.237519
1
2
3
4
5
6
7
8
0:09:58.773927
0:00:35.732331
0:00:03.529617
no massive missing
0:02:02.679515


20190711

SH finished
0:00:57.115453
0:00:35.833458
20190712 unzip finished
0:00:48.247071
0:01:19.726393
1
2
3
4
5
6
7
8
0:09:48.484155
0:00:34.988268
0:00:03.566217
no massive missing
0:01:57.434125


20190712

SH finished
0:00:55.587885
0:00:36.925382
20190715 unzip finished
0:00:54.177291
0:01:30.212445
1
2
3
4
5
6
7
8
0:10:39.368209
0:00:38.410066
0:00:03.917172
no massive missing
0:02:06.608606


20190715

SH finished
0:00:57.460414
0:00:33.316893
20190716 unzip finished
0:00:48.477552
0:01:22.559734
1
2
3
4
5
6
7
8
0:10:00.507283
0:00:37.190148
0:00:03.682803
no massive missing
0:02:01.207664


20190716

SH finished
0:00:51.277029
0:00:32.237243
20190717 unzip finished
0:00:48.639909
0:01:21.485010
1
2
3
4
5
6
7
8
0:10:19.501495
0:00:35.702274
0:00:03.567719
no massive missing
0:01:57.794499


20190717

SH finished
0:00:52.943721
0:00:32.501964
20190718 unzip finished
0:00:46.283440
0:01:21.341234
1
2
3
4
5
6
7
8
0:09:40.503282
0:00:36.001704
0:00:03.680943
no massive missing
0:01:57.533647


20190718

SH finished
0:00:51.494403
0:00:33.183931
20190719 unzip finished
0:00:47.127647
0:01:19.648384
1
2
3
4
5
6
7
8
0:09:41.050863
0:00:35.705057
0:00:03.521506
no massive missing
0:01:56.346883


20190719

SH finished
0:00:58.315490
0:00:37.156409
20190722 unzip finished
0:00:49.451928
0:01:28.434711
1
2
3
4
5
6
7
8
0:10:35.058675
0:00:38.300624
0:00:03.852053
no massive missing
0:02:06.343464


20190722

SH finished
0:00:56.953973
0:00:39.627144
20190723 unzip finished
0:00:49.770131
0:01:21.860942
1
2
3
4
5
6
7
8
0:10:13.731515
0:00:34.826452
0:00:03.856261
no massive missing
0:02:04.653809


20190723

SH finished
0:00:51.163105
0:00:35.927437
20190724 unzip finished
0:00:48.840316
0:01:25.435322
1
2
3
4
5
6
7
8
0:10:14.813127
0:00:39.292170
0:00:05.216854
no massive missing
0:02:09.300132


20190724

SH finished
0:00:52.860302
0:00:34.969981
20190725 unzip finished
0:00:52.105497
0:01:23.763405
1
2
3
4
5
6
7
8
0:09:53.449283
0:00:37.183704
0:00:03.817817
no massive missing
0:02:02.282106


20190725

SH finished
0:00:58.682499
0:00:33.755065
20190726 unzip finished
0:00:49.571053
0:01:18.963565
1
2
3
4
5
6
7
8
0:10:04.300544
0:00:36.793684
0:00:04.156223
no massive missing
0:01:57.580175


20190726

SH finished
0:00:50.539319
0:00:39.332092
20190729 unzip finished
0:00:46.383277
0:01:18.038651
1
2
3
4
5
6
7
8
0:09:49.044700
0:00:35.681842
0:00:03.774857
no massive missing
0:01:55.688474


20190729

SH finished
0:01:07.549415
0:00:35.088717
20190730 unzip finished
0:00:52.389673
0:01:24.292813
1
2
3
4
5
6
7
8
0:10:39.178485
0:00:36.554946
0:00:03.696514
no massive missing
0:02:00.448542


20190730

SH finished
0:01:02.805868
0:00:34.892396
20190731 unzip finished
0:00:48.676031
0:01:23.083003
1
2
3
4
5
6
7
8
0:10:30.175671
0:00:39.192630
0:00:04.221825
no massive missing
0:02:06.550497


20190731

SH finished
0:00:54.782774
0:00:49.854370
20190801 unzip finished
0:00:47.979679
0:01:24.564510
1
2
3
4
5
6
7
8
0:10:42.598012
0:00:46.600399
0:00:06.643676
no massive missing
0:02:50.647009


20190801

SH finished
0:01:01.267215
0:00:47.691255
20190802 unzip finished
0:00:56.501778
0:01:33.487264
1
2
3
4
5
6
7
8
0:11:08.115249
0:01:00.572504
0:00:09.427684
no massive missing
0:02:13.729338


20190802

SH finished
0:01:09.695762
0:01:20.370335
20190805 unzip finished
0:00:51.187073
0:01:29.092456


In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()











import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = '/home/work516/day_stock/***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SH' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

year = "2019"
startDate = '20190805'
endDate = '20191231'
readPath = '/mnt/usb/data/' + year + '/***/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i).split('_')[0] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
date_list = pd.read_csv("/home/work516/KR_upload_code/trading_days.csv")
wr_ong = []
mi_ss = []
less = []

for data in dataPathLs:
    if len(np.array(glob.glob(data + '/SH/snapshot***'))) == 0:
        if int(os.path.basename(data)) not in date_list["Date"].values:
            continue
        else:
            print(os.path.basename(data) + " less data!!!!!!!!!!!!!!!!!")
            less.append(data)
            continue
    startTm = datetime.datetime.now()
    date = os.path.basename(data)
    rar_path = data + '/SH/snapshot.7z'
    path = '/mnt/e/unzip_data/2019/SH'
    path1 = path + '/' + date
    un_path = path1
    cmd = '7za x {} -o{}'.format(rar_path, un_path)
    os.system(cmd)
    print(datetime.datetime.now() - startTm)
    print(date + ' unzip finished')

    readPath = path1 + '/snapshot/***2/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[((dateLs >= 600000) & (dateLs <= 700000))]
    SH = []
    ll = []
    startTm = datetime.datetime.now()
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, usecols = [0,1,3,5,7,9,10,11,15,17,18,19,20,21,22,23,25,26,28,29,30,31,32,33,37,39,40,41,
                                          42,46,47,49,50])
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["StockID"] = int(os.path.basename(i).split('.')[0])
        SH += [df]
    del df
    SH = pd.concat(SH).reset_index(drop=True)
    print(datetime.datetime.now() - startTm)
    
    startTm = datetime.datetime.now()
    SH["skey"] = SH["StockID"] + 1000000
    SH.drop(["StockID"],axis=1,inplace=True)
    SH["date"] = int(SH["QuotTime"].iloc[0]//1000000000)
    SH["time"] = (SH['QuotTime'] - int(SH['QuotTime'].iloc[0]//1000000000*1000000000)).astype(np.int64) * 1000
    SH["clockAtArrival"] = SH["QuotTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH.drop(["QuotTime"],axis=1,inplace=True)
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    print(datetime.datetime.now() - startTm)

    startTm = datetime.datetime.now()
    SH["BidPrice"] = SH["BidPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["OfferPrice"] = SH["OfferPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["BidOrderQty"] = SH["BidOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrderQty"] = SH["OfferOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["BidNumOrders"] = SH["BidNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferNumOrders"] = SH["OfferNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 11):
        SH["bid" + str(i) + 'p'] = SH["BidPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["BidPrice"],axis=1,inplace=True)
    print("1")
    for i in range(1, 11):
        SH["ask" + str(i) + 'p'] = SH["OfferPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["OfferPrice"],axis=1,inplace=True)
    print("2")
    for i in range(1, 11):
        SH["bid" + str(i) + 'q'] = SH["BidOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["BidOrderQty"],axis=1,inplace=True)
    print("3")
    for i in range(1, 11):
        SH["ask" + str(i) + 'q'] = SH["OfferOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["OfferOrderQty"],axis=1,inplace=True)
    print("4")
    for i in range(1, 11):
        SH["bid" + str(i) + 'n'] = SH["BidNumOrders"].apply(lambda x: x[i-1])
        SH["bid" + str(i) + 'n'] = SH["bid" + str(i) + 'n'].astype('int32')
    SH.drop(["BidNumOrders"],axis=1,inplace=True)
    print("5")
    for i in range(1, 11):
        SH["ask" + str(i) + 'n'] = SH["OfferNumOrders"].apply(lambda x: x[i-1])
        SH["ask" + str(i) + 'n'] = SH["ask" + str(i) + 'n'].astype('int32') 
    SH.drop(["OfferNumOrders"],axis=1,inplace=True)
    print("6")
    
    SH["BidOrders"] = SH["BidOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrders"] = SH["OfferOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 51):
        SH["bid1Top" + str(i) + 'q'] = SH["BidOrders"].apply(lambda x: x[i-1])
        SH["bid1Top" + str(i) + 'q'] = SH["bid1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["BidOrders"],axis=1,inplace=True)
    print("7")
    
    for i in range(1, 51):
        SH["ask1Top" + str(i) + 'q'] = SH["OfferOrders"].apply(lambda x: x[i-1])
        SH["ask1Top" + str(i) + 'q'] = SH["ask1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["OfferOrders"],axis=1,inplace=True)
    print("8")
    print(datetime.datetime.now() - startTm)
    
    
    startTm = datetime.datetime.now()
    SH.columns = ['cum_trades_cnt', 'ask_trade_max_duration', 'total_bid_orders',
       'cum_canceled_sell_amount', 'total_ask_quantity', 'cum_canceled_buy_orders',
       'total_ask_vwap', 'cum_canceled_sell_volume', 'cum_volume', 'open',
       'high', 'prev_close', 'low', 'total_bid_vwap',
       'cum_canceled_sell_orders', 'total_ask_orders', 'total_ask_levels',
       'total_bid_quantity', 'cum_canceled_buy_volume', 'bid_trade_max_duration',
       'total_bid_levels', 'close', 'cum_amount', 'cum_canceled_buy_amount', 'skey', 'date', 'time', 'clockAtArrival',
       'datetime', 'bid1p', 'bid2p', 'bid3p', 'bid4p', 'bid5p', 'bid6p',
       'bid7p', 'bid8p', 'bid9p', 'bid10p', 'ask1p', 'ask2p', 'ask3p',
       'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p',
       'bid1q', 'bid2q', 'bid3q', 'bid4q', 'bid5q', 'bid6q', 'bid7q',
       'bid8q', 'bid9q', 'bid10q', 'ask1q', 'ask2q', 'ask3q', 'ask4q',
       'ask5q', 'ask6q', 'ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid1n',
       'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n',
       'bid9n', 'bid10n', 'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n',
       'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'bid1Top1q',
       'bid1Top2q', 'bid1Top3q', 'bid1Top4q', 'bid1Top5q', 'bid1Top6q',
       'bid1Top7q', 'bid1Top8q', 'bid1Top9q', 'bid1Top10q', 'bid1Top11q',
       'bid1Top12q', 'bid1Top13q', 'bid1Top14q', 'bid1Top15q',
       'bid1Top16q', 'bid1Top17q', 'bid1Top18q', 'bid1Top19q',
       'bid1Top20q', 'bid1Top21q', 'bid1Top22q', 'bid1Top23q',
       'bid1Top24q', 'bid1Top25q', 'bid1Top26q', 'bid1Top27q',
       'bid1Top28q', 'bid1Top29q', 'bid1Top30q', 'bid1Top31q',
       'bid1Top32q', 'bid1Top33q', 'bid1Top34q', 'bid1Top35q',
       'bid1Top36q', 'bid1Top37q', 'bid1Top38q', 'bid1Top39q',
       'bid1Top40q', 'bid1Top41q', 'bid1Top42q', 'bid1Top43q',
       'bid1Top44q', 'bid1Top45q', 'bid1Top46q', 'bid1Top47q',
       'bid1Top48q', 'bid1Top49q', 'bid1Top50q', 'ask1Top1q', 'ask1Top2q',
       'ask1Top3q', 'ask1Top4q', 'ask1Top5q', 'ask1Top6q', 'ask1Top7q',
       'ask1Top8q', 'ask1Top9q', 'ask1Top10q', 'ask1Top11q', 'ask1Top12q',
       'ask1Top13q', 'ask1Top14q', 'ask1Top15q', 'ask1Top16q',
       'ask1Top17q', 'ask1Top18q', 'ask1Top19q', 'ask1Top20q',
       'ask1Top21q', 'ask1Top22q', 'ask1Top23q', 'ask1Top24q',
       'ask1Top25q', 'ask1Top26q', 'ask1Top27q', 'ask1Top28q',
       'ask1Top29q', 'ask1Top30q', 'ask1Top31q', 'ask1Top32q',
       'ask1Top33q', 'ask1Top34q', 'ask1Top35q', 'ask1Top36q',
       'ask1Top37q', 'ask1Top38q', 'ask1Top39q', 'ask1Top40q',
       'ask1Top41q', 'ask1Top42q', 'ask1Top43q', 'ask1Top44q',
       'ask1Top45q', 'ask1Top46q', 'ask1Top47q', 'ask1Top48q',
       'ask1Top49q', 'ask1Top50q']
    SH = SH.fillna(0)
#     SH["p1"] = SH["bid1p"] + SH["ask1p"]
#     tt = SH[(SH["cum_volume"] > 0) & (SH["time"] < 145700000000)].groupby("skey")['p1'].min()
#     SH.drop("p1", axis=1, inplace=True)
#     try:
#         assert(tt[tt == 0].shape[0] == 0)
#     except:
#         display(tt[tt == 0])
#     SH = SH[~((SH["bid1p"] == 0) & (SH["ask1p"] == 0))]
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    
    SH["has_missing"] = 0
    
    for col in ["skey", "date", "cum_trades_cnt", "total_bid_orders",
        'total_ask_orders', 'total_bid_levels', 'total_ask_levels', 'cum_canceled_buy_orders','cum_canceled_sell_orders',
            "ordering", 'bid_trade_max_duration', 'ask_trade_max_duration','has_missing']:
        SH[col] = SH[col].astype('int32')
    
#     for cols in ["prev_close", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
#              'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p']:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
    
#     for cols in ['cum_amount', "cum_canceled_sell_amount", "cum_canceled_buy_amount"]:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
        
    for cols in ['total_bid_vwap', "total_ask_vwap"]:
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
        SH[cols] = SH[cols].apply(lambda x: round(x, 3))
        
   
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"]) 
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)
    print(datetime.datetime.now() - startTm)
    
    
    # check 1
    startTm = datetime.datetime.now()
    da_te = str(SH["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    db1["ID"] = db1["ID"].str[2:].astype(int) + 1000000
    db1["date"] = (db1["date"].str[:4] + db1["date"].str[5:7] + db1["date"].str[8:]).astype(int)
    SH["cum_max"] = SH.groupby("skey")["cum_volume"].transform(max)
    s2 = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey").first().reset_index()
    dd = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey")["time"].first().reset_index()
    SH.drop("cum_max", axis=1, inplace=True)
    s2 = s2.rename(columns={"skey": "ID", 'open':"d_open", "prev_close":"d_yclose","high":"d_high", "low":"d_low", "close":"d_close", "cum_volume":"d_volume", "cum_amount":"d_amount"})
    if SH["date"].iloc[0] < 20180820:
        s2["auction"] = 0
    else:
        dd["auction"] = np.where(dd["time"]<=145700000000, 0, 1)
        dd = dd.rename(columns={"skey": "ID"})
        s2 = pd.merge(s2, dd[["ID", "auction"]], on="ID")
    s2 = s2[["ID", "date", "d_open", "d_yclose", "d_high", "d_low", "d_close", "d_volume", "d_amount", "auction"]]
    re = pd.merge(db1, s2, on=["ID", "date", "d_open", "d_yclose","d_high", "d_low", "d_volume"], how="outer")
    try:
        assert(sum(re["d_amount_y"].isnull()) == 0)
    except:
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print(re[re["d_amount_y"].isnull()])
        wr_ong += [re[re["d_amount_y"].isnull()]]
    print(datetime.datetime.now() - startTm)
    
    # check 2
    # first part
    startTm = datetime.datetime.now()
    date = pd.DataFrame(pd.date_range(start='2019-06-10 08:30:00', end='2019-06-10 18:00:00', freq='s'), columns=["Orig"])
    date["time"] = date["Orig"].apply(lambda x: int(x.strftime("%H%M%S"))*1000)
    date["group"] = date["time"]//10000
    SH["group"] = SH["time"]//10000000
    gl = date[((date["time"] >= 93000000) & (date["time"] <= 113000000))|((date["time"] >= 130000000) & (date["time"] <= 150000000))]["group"].unique()
    l = set(gl) - set(SH["group"].unique())
    SH["has_missing1"] = 0 
    if len(l) != 0:
        print("massive missing")
        print(l)
        SH["order"] = SH.groupby(["skey", "time"]).cumcount()
        for i in l:
            SH["t"] = SH[SH["group"] > i].groupby("StockID")["time"].transform("min")
            SH["has_missing1"] = np.where((SH["time"] == SH["t"]) & (SH["order"] == 0), 1, 0)
        SH.drop(["order", "t", "group"], axis=1, inplace=True)   
    else:
        print("no massive missing")
        SH.drop(["group"], axis=1, inplace=True)
    



    # second part

    SH["time_interval"] = SH.groupby("skey")["datetime"].apply(lambda x: x - x.shift(1))
    SH["time_interval"] = SH["time_interval"].apply(lambda x: x.seconds)
    SH["tn_update"] = SH.groupby("skey")["cum_trades_cnt"].apply(lambda x: x-x.shift(1))

    f1 = SH[(SH["time"] >= 93000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f1 = f1.rename(columns={"time": "time1"})
    f2 = SH[(SH["time"] >= 130000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f2 = f2.rename(columns={"time": "time2"})
    f3 = SH[(SH["time"] >= 150000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f3 = f3.rename(columns={"time": "time3"})
    SH = pd.merge(SH, f1, on="skey", how="left")
    del f1
    SH = pd.merge(SH, f2, on="skey", how="left")
    del f2
    SH = pd.merge(SH, f3, on="skey", how="left")
    del f3
    p99 = SH[(SH["time"] > 93000000000) & (SH["time"] < 145700000000) & (SH["time"] != SH["time2"]) & (SH["tn_update"] != 0)]\
    .groupby("skey")["tn_update"].apply(lambda x: x.describe([0.99])["99%"]).reset_index()
    p99 = p99.rename(columns={"tn_update":"99%"})
    SH = pd.merge(SH, p99, on="skey", how="left")

    SH["has_missing2"] = 0
    SH["has_missing2"] = np.where((SH["time_interval"] > 60) & (SH["tn_update"] > SH["99%"]) & 
         (SH["time"] > SH["time1"]) & (SH["time"] != SH["time2"]) & (SH["time"] != SH["time3"]) & (SH["time"] != 100000000000), 1, 0)
    SH.drop(["time_interval", "tn_update", "time1", "time2", "time3", "99%"], axis=1, inplace=True) 

    SH["has_missing"] = np.where((SH["has_missing1"] == 1) | (SH["has_missing2"] == 1), 1, 0)
    SH.drop(["has_missing1", "has_missing2"], axis=1, inplace=True) 
    if SH[SH["has_missing"] == 1].shape[0] != 0:
        print("has missing!!!!!!!!!!!!!!!!!!!!!!!")
        print(SH[SH["has_missing"] == 1].shape[0])
        mi_ss += [SH[SH["has_missing"] == 1]]
    print(datetime.datetime.now() - startTm)
    
    
    
    startTm = datetime.datetime.now()
    SH["has_missing"] = SH["has_missing"].astype('int32')
    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "has_missing", "cum_trades_cnt", "cum_volume", "cum_amount", "prev_close",
                            "open", "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p','bid2p','bid1p',
                            'ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'bid10q','bid9q','bid8q',
                             'bid7q','bid6q','bid5q','bid4q','bid3q','bid2q','bid1q', 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q',
                             'ask7q','ask8q','ask9q','ask10q', 'bid10n', 'bid9n', 'bid8n', 'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 
                             'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n','ask7n', 'ask8n', 'ask9n', 'ask10n','bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q',"total_bid_quantity", "total_ask_quantity","total_bid_vwap", "total_ask_vwap",
        "total_bid_orders",'total_ask_orders','total_bid_levels', 'total_ask_levels', 'bid_trade_max_duration', 'ask_trade_max_duration', 'cum_canceled_buy_orders', 'cum_canceled_buy_volume',
        "cum_canceled_buy_amount", "cum_canceled_sell_orders", 'cum_canceled_sell_volume',"cum_canceled_sell_amount"]]
    
    display(SH["date"].iloc[0])
    print("SH finished")
    
    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.223", database_name, user, password)
    db1.write('md_snapshot_l2', SH)
    
    del SH
    print(datetime.datetime.now() - startTm)

wr_ong = pd.concat(wr_ong).reset_index(drop=True)
print(wr_ong)
mi_ss = pd.concat(mi_ss).reset_index(drop=True)
print(mi_ss)
print(less)



0:02:46.758274
0:01:14.296037
20190805 unzip finished
0:00:56.339416
0:01:20.262936
1
2
3
4
5
6
7
8
0:10:21.550306
0:00:35.507205


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0:00:03.686619
no massive missing
0:01:58.907230


20190805

SH finished
0:01:03.337919
0:00:51.820813
20190806 unzip finished
0:00:55.062016
0:01:25.750637
1
2
3
4
5
6
7
8
0:11:06.988244
0:00:38.078616
0:00:03.913382
no massive missing
0:02:08.116246


20190806

SH finished
0:01:01.357491
0:00:47.934524
20190807 unzip finished
0:00:50.299404
0:01:18.337294
1
2
3
4
5
6
7
8
0:09:49.839146
0:00:41.113879
0:00:03.716385
no massive missing
0:01:55.874718


20190807

SH finished
0:01:06.335457
0:00:45.100028
20190808 unzip finished
0:00:49.257248
0:01:24.985821
1
2
3
4
5
6
7
8
0:09:39.065421
0:00:35.442772
0:00:03.819243
no massive missing
0:01:55.693803


20190808

SH finished
0:01:00.307279
0:00:46.520203
20190809 unzip finished
0:00:53.146831
0:01:21.196005
1
2
3
4
5
6
7
8
0:10:30.256266
0:00:37.975810
0:00:03.787210
no massive missing
0:01:58.319972


20190809

SH finished
0:01:12.998462
0:00:41.337375
20190812 unzip finished
0:00:51.369871
0:01:19.050264
1
2
3
4
5
6
7
8
0:10:08.509631
0:00:36.160649
0:00:03.847450
no massive missing
0:01:59.326628


20190812

SH finished
0:00:57.577598
0:01:15.045665
20190813 unzip finished
0:00:49.286361
0:01:17.100247
1
2
3
4
5
6
7
8
0:10:03.416645
0:00:35.554235
0:00:03.558015
no massive missing
0:01:56.486508


20190813

SH finished
0:01:06.237230
0:00:48.565296
20190814 unzip finished
0:00:52.651360
0:01:22.078487
1
2
3
4
5
6
7
8
0:10:29.652518
0:00:35.640401
0:00:03.720213
no massive missing
0:01:59.923737


20190814

SH finished
0:00:56.955332
0:00:48.296249
20190815 unzip finished
0:00:50.853348
0:01:22.407037
1
2
3
4
5
6
7
8
0:10:05.885527
0:00:36.898774
0:00:03.703555
no massive missing
0:02:00.344635


20190815

SH finished
0:01:01.256223
0:00:47.978453
20190816 unzip finished
0:00:50.066057
0:01:25.561881
1
2
3
4
5
6
7
8
0:10:11.015906
0:00:35.677617
0:00:03.714006
no massive missing
0:01:57.525226


20190816

SH finished
0:01:13.971745
0:00:44.437747
20190819 unzip finished
0:00:53.958013
0:01:29.260708
1
2
3
4
5
6
7
8
0:11:02.799013
0:00:39.601618
0:00:04.081963
no massive missing
0:02:10.004192


20190819

SH finished
0:01:05.413927
0:00:41.152386
20190820 unzip finished
0:00:53.255355
0:01:25.799432
1
2
3
4
5
6
7
8
0:10:38.350910
0:00:37.448263
0:00:03.793706
no massive missing
0:02:04.810119


20190820

SH finished
0:01:16.489006
0:00:40.722131
20190821 unzip finished
0:00:50.624838
0:01:22.093699
1
2
3
4
5
6
7
8
0:10:24.625119
0:00:37.229207
0:00:03.718602
no massive missing
0:02:00.286341


20190821

SH finished
0:01:13.117334
0:00:51.383472
20190822 unzip finished
0:00:48.904752
0:01:22.062864
1
2
3
4
5
6
7
8
0:10:22.474535
0:00:41.751147
0:00:03.865709
no massive missing
0:02:03.942948


20190822

SH finished
0:01:14.909199
0:00:41.040758
20190823 unzip finished
0:00:51.249316
0:01:22.813088
1
2
3
4
5
6
7
8
0:10:15.595484
0:00:35.503981
0:00:03.568831
no massive missing
0:01:57.873319


20190823

SH finished
0:01:12.101372
0:00:39.092577
20190826 unzip finished
0:00:53.300178
0:01:22.941969
1
2
3
4
5
6
7
8
0:10:21.936151
0:00:37.470332
0:00:03.843514
no massive missing
0:02:03.841327


20190826

SH finished
0:01:04.348542
0:00:44.576512
20190827 unzip finished
0:00:51.718042
0:01:24.880284
1
2
3
4
5
6
7
8
0:11:00.699627
0:00:38.293818
0:00:03.863433
no massive missing
0:02:08.481447


20190827

SH finished
0:01:01.302806
0:00:56.356795
20190828 unzip finished
0:00:52.723658
0:01:24.417503
1
2
3
4
5
6
7
8
0:11:04.758366
0:00:37.575679
0:00:03.875258
no massive missing
0:02:03.817282


20190828

SH finished
0:00:56.244295
0:00:48.771167
20190829 unzip finished
0:00:50.505405
0:01:20.019752
1
2
3
4
5
6
7
8
0:10:43.419869
0:00:37.319269
0:00:03.841894
no massive missing
0:02:04.167687


20190829

SH finished
0:01:11.879918
0:00:42.915725
20190830 unzip finished
0:00:54.587486
0:01:24.354624
1
2
3
4
5
6
7
8
0:10:55.536056
0:00:39.020689
0:00:03.946374
no massive missing
0:02:09.685296


20190830

SH finished
0:00:59.613415
0:00:38.800814
20190902 unzip finished
0:00:54.142285
0:01:27.444018
1
2
3
4
5
6
7
8
0:10:57.751163
0:00:38.587544
0:00:03.836498
no massive missing
0:02:10.244209


20190902

SH finished
0:01:16.495277
0:00:39.395880
20190903 unzip finished
0:00:52.428505
0:01:26.430878
1
2
3
4
5
6
7
8
0:10:44.980914
0:00:38.719477
0:00:03.975979
no massive missing
0:02:05.779654


20190903

SH finished
0:01:04.355231
0:00:40.190425
20190904 unzip finished
0:00:53.402399
0:01:27.927130
1
2
3
4
5
6
7
8
0:11:19.188343
0:00:42.914554
0:00:04.101818
no massive missing
0:02:09.227507


20190904

SH finished
0:01:06.125254
0:00:43.456820
20190905 unzip finished
0:00:58.659784
0:01:31.299742
1
2
3
4
5
6
7
8
0:12:01.835410
0:00:44.378016
0:00:04.324667
no massive missing
0:02:20.282850


20190905

SH finished
0:01:28.567837
0:00:37.799442
20190906 unzip finished
0:00:53.452131
0:01:28.940842
1
2
3
4
5
6
7
8
0:11:08.094081
0:00:40.648003
0:00:04.079135
no massive missing
0:02:11.218811


20190906

SH finished
0:01:22.193195
0:00:40.966516
20190909 unzip finished
0:00:57.295999
0:01:31.870881
1
2
3
4
5
6
7
8
0:11:33.739850
0:00:44.298625
0:00:04.324772
no massive missing
0:02:18.709162


20190909

SH finished
0:01:05.957112
0:00:42.397283
20190910 unzip finished
0:00:54.894907
0:01:33.397750
1
2
3
4
5
6
7
8
0:11:10.164301
0:00:41.047381
0:00:05.666677
no massive missing
0:02:17.205591


20190910

SH finished
0:01:00.877785
0:00:39.962433
20190911 unzip finished
0:00:53.984967
0:01:28.326078
1
2
3
4
5
6
7
8
0:11:09.967960
0:00:39.926934
0:00:05.204962
no massive missing
0:02:15.647765


20190911

SH finished
0:01:03.215850
0:00:37.956087
20190912 unzip finished
0:00:55.256476
0:01:28.014334
1
2
3
4
5
6
7
8
0:10:57.909048
0:00:38.591691
0:00:04.053049
no massive missing
0:02:07.914651


20190912

SH finished
0:01:03.648285
0:00:39.713779
20190916 unzip finished
0:00:54.583469
0:01:29.353209
1
2
3
4
5
6
7
8
0:11:04.767801
0:00:41.046359
0:00:04.024162
no massive missing
0:02:11.999877


20190916

SH finished
0:01:02.287589
0:00:39.640316
20190917 unzip finished
0:00:55.902879
0:01:32.137480
1
2
3
4
5
6
7
8
0:11:23.642891
0:00:43.310987
0:00:04.199195
no massive missing
0:02:16.389143


20190917

SH finished
0:00:59.764523
0:00:39.455514
20190918 unzip finished
0:00:51.762757
0:01:25.302570
1
2
3
4
5
6
7
8
0:10:48.874249
0:00:38.812174
0:00:03.954155
no massive missing
0:02:07.363040


20190918

SH finished
0:01:00.024677
0:00:37.179728
20190919 unzip finished
0:00:53.334964
0:01:26.350975
1
2
3
4
5
6
7
8
0:10:33.823548
0:00:38.765875
0:00:03.973830
no massive missing
0:02:05.865037


20190919

SH finished
0:00:55.878992
0:00:38.921180
20190920 unzip finished
0:00:53.552855
0:01:26.791466
1
2
3
4
5
6
7
8
0:10:41.796256
0:00:40.691279
0:00:03.920336
no massive missing
0:02:08.983161


20190920

SH finished
0:01:02.600226
0:00:40.132592
20190923 unzip finished
0:00:52.556420
0:01:26.702747
1
2
3
4
5
6
7
8
0:10:39.028453
0:00:39.090430
0:00:03.902506
no massive missing
0:02:06.104961


20190923

SH finished
0:00:58.764674
0:00:38.686221
20190924 unzip finished
0:00:52.609401
0:01:27.850438
1
2
3
4
5
6
7
8
0:10:53.661011
0:00:38.680352
0:00:03.963418
no massive missing
0:02:13.740151


20190924

SH finished
0:00:59.955169
0:00:38.858236
20190925 unzip finished
0:00:51.635855
0:01:26.114708
1
2
3
4
5
6
7
8
0:10:44.178644
0:00:41.506959
0:00:04.493513
no massive missing
0:02:07.303484


20190925

SH finished
0:01:15.415019
0:00:38.738834
20190926 unzip finished
0:00:53.915020
0:01:28.888421
1
2
3
4
5
6
7
8
0:10:53.072041
0:00:39.353894
0:00:03.998290
no massive missing
0:02:14.008774


20190926

SH finished
0:01:01.511347
0:00:35.143792
20190927 unzip finished
0:00:51.190833
0:01:22.765846
1
2
3
4
5
6
7
8
0:10:12.473130
0:00:37.123226
0:00:03.748748
no massive missing
0:02:02.753814


20190927

SH finished
0:00:54.303246
0:00:34.889001
20190930 unzip finished
0:00:47.121157
0:01:18.762021
1
2
3
4
5
6
7
8
0:09:39.768924
0:00:35.176026
0:00:03.642637
no massive missing
0:01:54.105367


20190930

SH finished
0:00:51.565236
0:00:34.529061
20191008 unzip finished
0:00:49.272122
0:01:18.688185
1
2
3
4
5
6
7
8
0:09:44.774960
0:00:35.165115
0:00:03.675727
no massive missing
0:01:57.777789


20191008

SH finished
0:00:58.823827
0:00:34.293427
20191009 unzip finished
0:00:47.803818
0:01:20.171781
1
2
3
4
5
6
7
8
0:09:51.010396
0:00:34.884942
0:00:03.635818
no massive missing
0:01:54.328464


20191009

SH finished
0:00:50.847240
0:00:35.762872
20191010 unzip finished
0:00:49.325249
0:01:20.394574
1
2
3
4
5
6
7
8
0:10:04.069316
0:00:36.881598
0:00:03.720026
no massive missing
0:02:04.159048


20191010

SH finished
0:01:04.404297
0:00:36.704655
20191011 unzip finished
0:00:51.872459
0:01:23.856669
1
2
3
4
5
6
7
8
0:10:22.147716
0:00:36.840704
0:00:03.779417
no massive missing
0:02:01.290829


20191011

SH finished
0:00:54.102102
0:00:38.726818
20191014 unzip finished
0:00:50.999909
0:01:23.467181
1
2
3
4
5
6
7
8
0:10:54.883610
0:00:38.299719
0:00:03.913269
no massive missing
0:02:13.511418


20191014

SH finished
0:00:57.479529
0:00:38.020364
20191015 unzip finished
0:00:51.639656
0:01:23.063392
1
2
3
4
5
6
7
8
0:10:13.555309
0:00:37.509525
0:00:03.853809
no massive missing
0:02:02.368458


20191015

SH finished
0:00:58.842125
0:00:36.337371
20191016 unzip finished
0:00:50.813812
0:01:21.666749
1
2
3
4
5
6
7
8
0:10:10.507754
0:00:35.872465
0:00:03.598331
no massive missing
0:02:02.393686


20191016

SH finished
0:00:54.319352
0:00:34.737250
20191017 unzip finished
0:00:47.671653
0:01:19.226091
1
2
3
4
5
6
7
8
0:10:07.037203
0:00:37.796724
0:00:03.701497
no massive missing
0:01:59.483569


20191017

SH finished
0:00:57.718914
0:00:41.379911
20191018 unzip finished
0:01:18.971207
0:01:22.808483
1
2
3
4
5
6
7
8
0:10:30.182457
0:00:38.060044
0:00:03.927026
no massive missing
0:02:03.091602


20191018

SH finished
0:01:03.110551
0:00:34.376037
20191021 unzip finished
0:00:47.740634
0:01:20.679010
1
2
3
4
5
6
7
8
0:10:00.215175
0:00:35.841561
0:00:03.664666
no massive missing
0:01:56.948309


20191021

SH finished
0:01:07.209374
0:00:33.759430
20191022 unzip finished
0:00:48.345235
0:01:18.415162
1
2
3
4
5
6
7
8
0:09:56.916033
0:00:35.649406
0:00:03.661391
no massive missing
0:01:56.537418


20191022

SH finished
0:01:01.101511
0:00:34.481887
20191023 unzip finished
0:00:49.295574
0:01:16.903648
1
2
3
4
5
6
7
8
0:09:53.365825
0:00:36.152968
0:00:03.610364
no massive missing
0:01:54.064844


20191023

SH finished
0:00:52.459724
0:00:33.819851
20191024 unzip finished
0:00:50.162732
0:01:20.673707
1
2
3
4
5
6
7
8
0:09:59.412441
0:00:36.991371
0:00:03.797731
no massive missing
0:02:04.179902


20191024

SH finished
0:00:52.348645
0:00:35.800711
20191025 unzip finished
0:00:48.535981
0:01:18.958193
1
2
3
4
5
6
7
8
0:10:18.937125
0:00:37.045635
0:00:03.817282
no massive missing
0:02:08.341326


20191025

SH finished
0:00:56.902375
0:00:39.554581
20191028 unzip finished
0:00:53.788929
0:01:25.460392
1
2
3
4
5
6
7
8
0:10:55.595040
0:00:39.470901
0:00:04.038694
no massive missing
0:02:06.801626


20191028

SH finished
0:01:06.944420
0:00:40.443395
20191029 unzip finished
0:00:54.766673
0:01:25.018229
1
2
3
4
5
6
7
8
0:10:53.020654
0:00:39.371192
0:00:03.991807
no massive missing
0:02:09.408559


20191029

SH finished
0:00:55.160870
0:00:35.676310
20191030 unzip finished
0:00:50.012638
0:01:22.402914
1
2
3
4
5
6
7
8
0:10:39.229129
0:00:38.442004
0:00:03.897255
no massive missing
0:02:06.565458


20191030

SH finished
0:00:55.518727
0:00:37.721108
20191031 unzip finished
0:00:53.080571
0:01:22.950801
1
2
3
4
5
6
7
8
0:10:42.807934
0:00:38.705833
0:00:03.883379
no massive missing
0:02:06.383392


20191031

SH finished
0:01:00.921356
0:00:38.024523
20191101 unzip finished
0:00:50.384757
0:01:22.186701
1
2
3
4
5
6
7
8
0:10:36.611923
0:00:39.359511
0:00:03.932377
no massive missing
0:02:08.233767


20191101

SH finished
0:01:01.284526
0:00:35.255320
20191104 unzip finished
0:00:53.085948
0:01:20.111100
1
2
3
4
5
6
7
8
0:10:19.040739
0:00:36.785202
0:00:03.704835
no massive missing
0:02:00.465948


20191104

SH finished
0:00:54.612709
0:00:37.834455
20191105 unzip finished
0:00:52.269464
0:01:21.985394
1
2
3
4
5
6
7
8
0:10:37.985305
0:00:38.269553
0:00:03.865474
no massive missing
0:02:07.740772


20191105

SH finished
0:01:08.648940
0:00:37.304611
20191106 unzip finished
0:00:51.030922
0:01:20.556287
1
2
3
4
5
6
7
8
0:10:27.987935
0:00:39.051414
0:00:04.181219
no massive missing
0:02:19.458218


20191106

SH finished
0:00:55.162723
0:00:35.142425
20191107 unzip finished
0:00:48.880124
0:01:26.618395
1
2
3
4
5
6
7
8
0:10:23.249587
0:00:38.011570
0:00:04.147578
no massive missing
0:02:08.398069


20191107

SH finished
0:00:58.820240
0:00:35.176376
20191108 unzip finished
0:00:49.770634
0:01:27.980760
1
2
3
4
5
6
7
8
0:10:31.377794
0:00:38.039071
0:00:03.974718
no massive missing
0:02:07.889088


20191108

SH finished
0:01:03.762082
0:00:39.987239
20191111 unzip finished
0:00:53.848393
0:01:29.206514
1
2
3
4
5
6
7
8
0:10:26.834465
0:00:39.711276
0:00:04.283562
no massive missing
0:02:07.279362


20191111

SH finished
0:01:03.331804
0:00:34.530200
20191112 unzip finished
0:00:48.607548
0:01:17.469969
1
2
3
4
5
6
7
8
0:10:23.895621
0:00:41.040888
0:00:03.793321
no massive missing
0:02:04.772628


20191112

SH finished
0:00:59.119363
0:00:43.080094
20191113 unzip finished
0:00:48.437358
0:01:14.646319
1
2
3
4
5
6
7
8
0:10:02.786892
0:00:36.527875
0:00:03.979899
no massive missing
0:01:59.925121


20191113

SH finished
0:01:00.286328
0:00:42.057243
20191114 unzip finished
0:00:50.139956
0:01:14.107956
1
2
3
4
5
6
7
8
0:10:17.055269
0:00:35.887001
0:00:03.899938
no massive missing
0:01:59.490770


20191114

SH finished
0:01:05.807578
0:00:35.073944
20191115 unzip finished
0:00:49.624790
0:01:16.026545
1
2
3
4
5
6
7
8
0:10:18.806935
0:00:37.406469
0:00:03.945573
no massive missing
0:02:13.455838


20191115

SH finished
0:00:58.123226
0:00:37.034920
20191118 unzip finished
0:00:48.770058
0:01:17.136086
1
2
3
4
5
6
7
8
0:10:06.557217
0:00:38.037355
0:00:03.925538
no massive missing
0:02:19.277005


20191118

SH finished
0:01:08.966658
0:00:37.875306
20191119 unzip finished
0:00:51.966899
0:01:18.512156
1
2
3
4
5
6
7
8
0:10:57.629050
0:00:40.286188
0:00:04.037700
no massive missing
0:02:04.301680


20191119

SH finished
0:01:06.844777
0:00:37.365523
20191120 unzip finished
0:00:54.054275
0:01:20.211576
1
2
3
4
5
6
7
8
0:11:22.305222
0:00:43.293525
0:00:03.953741
no massive missing
0:02:08.920299


20191120

SH finished
0:01:09.682707
0:00:39.745994
20191121 unzip finished
0:00:53.991151
0:01:22.264635
1
2
3
4
5
6
7
8
0:11:23.526644
0:00:39.476045
0:00:04.071005
no massive missing
0:02:07.789334


20191121

SH finished
0:01:00.959833
0:00:45.073495
20191122 unzip finished
0:00:56.791622
0:01:32.845247
1
2
3
4
5
6
7
8
0:11:59.201195
0:00:43.202732
0:00:04.286893
no massive missing
0:02:24.525685


20191122

SH finished
0:01:05.206472
0:00:47.211611
20191125 unzip finished
0:00:59.073434
0:01:30.034847
1
2
3
4
5
6
7
8
0:11:39.620056
0:00:40.034378
0:00:04.254904
no massive missing
0:02:09.664333


20191125

SH finished
0:01:20.739633
0:00:41.166475
20191126 unzip finished
0:00:55.900452
0:01:25.762071
1
2
3
4
5
6
7
8
0:11:21.293378
0:00:40.183042
0:00:04.174751
no massive missing
0:02:07.983150


20191126

SH finished
0:01:00.972707
0:00:40.305433
20191127 unzip finished
0:00:58.958846
0:01:26.730931
1
2
3
4
5
6
7
8
0:11:28.043994
0:00:39.330019
0:00:05.198833
no massive missing
0:02:08.960240


20191127

SH finished
0:01:03.342078
0:00:39.943697
20191128 unzip finished
0:00:55.828898
0:01:26.267306
1
2
3
4
5
6
7
8
0:11:12.264906
0:00:38.492192
0:00:03.825416
no massive missing
0:02:03.663906


20191128

SH finished
0:00:56.588313
0:01:09.102856
20191129 unzip finished
0:00:55.880762
0:01:25.606688
1
2
3
4
5
6
7
8
0:11:19.614817
0:00:41.571006
0:00:04.066216
no massive missing
0:02:10.318840


20191129

SH finished
0:00:59.877879
0:00:59.926185
20191202 unzip finished
0:00:59.900525
0:01:30.197061
1
2
3
4
5
6
7
8
0:11:59.672915
0:00:41.439596
0:00:04.096592
no massive missing
0:02:13.433692


20191202

SH finished
0:01:01.731563
0:00:43.710694
20191203 unzip finished
0:01:02.469286
0:01:35.325594
1
2
3
4
5
6
7
8
0:12:28.801293
0:00:40.908491
0:00:04.255551
no massive missing
0:02:26.080866


20191203

SH finished
0:01:01.906371
0:01:19.663077
20191204 unzip finished
0:00:59.286046
0:01:36.082246
1
2
3
4
5
6
7
8
0:12:10.601794
0:00:42.780581
0:00:04.099028
no massive missing
0:02:16.318165


20191204

SH finished
0:01:20.468362
0:01:07.488510
20191205 unzip finished
0:00:59.986643
0:01:35.019696
1
2
3
4
5
6
7
8
0:13:38.228832
0:00:53.291720
0:00:04.094598
no massive missing
0:02:21.921988


20191205

SH finished
0:01:12.602496
0:00:44.740807
20191206 unzip finished
0:01:00.155868
0:01:35.257597
1
2
3
4
5
6
7
8
0:12:25.426390
0:00:46.644557
0:00:04.448186
no massive missing
0:02:22.009967


20191206

SH finished
0:01:28.663993
0:00:53.890629
20191209 unzip finished
0:01:02.805766
0:01:46.292655
1
2
3
4
5
6
7
8
0:12:54.348665
0:00:42.803966
0:00:04.370124
no massive missing
0:02:29.821624


20191209

SH finished
0:01:26.144773
0:01:07.349269
20191210 unzip finished
0:01:02.814865
0:01:33.802515
1
2
3
4
5
6
7
8
0:13:04.474517
0:00:48.570611
0:00:04.713623
no massive missing
0:02:25.701951


20191210

SH finished
0:01:21.079342
0:06:37.908069
20191211 unzip finished


NameError: name 'df' is not defined

In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()











import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = '/home/work516/day_stock/***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SH' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

year = "2019"
startDate = '20191211'
endDate = '20191231'
readPath = '/mnt/usb/data/' + year + '/***/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i).split('_')[0] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
date_list = pd.read_csv("/home/work516/KR_upload_code/trading_days.csv")
wr_ong = []
mi_ss = []
less = []

for data in dataPathLs:
    if len(np.array(glob.glob(data + '/SH/snapshot***'))) == 0:
        if int(os.path.basename(data)) not in date_list["Date"].values:
            continue
        else:
            print(os.path.basename(data) + " less data!!!!!!!!!!!!!!!!!")
            less.append(data)
            continue
    startTm = datetime.datetime.now()
    date = os.path.basename(data)
    rar_path = data + '/SH/snapshot.7z'
    path = '/mnt/e/unzip_data/2019/SH'
    path1 = path + '/' + date
    un_path = path1
    cmd = '7za x {} -o{}'.format(rar_path, un_path)
    os.system(cmd)
    print(datetime.datetime.now() - startTm)
    print(date + ' unzip finished')

    readPath = path1 + '/snapshot/***2/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[((dateLs >= 600000) & (dateLs <= 700000))]
    SH = []
    ll = []
    startTm = datetime.datetime.now()
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, usecols = [0,1,3,5,7,9,10,11,15,17,18,19,20,21,22,23,25,26,28,29,30,31,32,33,37,39,40,41,
                                          42,46,47,49,50])
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["StockID"] = int(os.path.basename(i).split('.')[0])
        SH += [df]
    del df
    SH = pd.concat(SH).reset_index(drop=True)
    print(datetime.datetime.now() - startTm)
    
    startTm = datetime.datetime.now()
    SH["skey"] = SH["StockID"] + 1000000
    SH.drop(["StockID"],axis=1,inplace=True)
    SH["date"] = int(SH["QuotTime"].iloc[0]//1000000000)
    SH["time"] = (SH['QuotTime'] - int(SH['QuotTime'].iloc[0]//1000000000*1000000000)).astype(np.int64) * 1000
    SH["clockAtArrival"] = SH["QuotTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH.drop(["QuotTime"],axis=1,inplace=True)
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    print(datetime.datetime.now() - startTm)

    startTm = datetime.datetime.now()
    SH["BidPrice"] = SH["BidPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["OfferPrice"] = SH["OfferPrice"].apply(lambda x: [float(i) for i in x[1:-1].split(',')])
    SH["BidOrderQty"] = SH["BidOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrderQty"] = SH["OfferOrderQty"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["BidNumOrders"] = SH["BidNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferNumOrders"] = SH["OfferNumOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 11):
        SH["bid" + str(i) + 'p'] = SH["BidPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["BidPrice"],axis=1,inplace=True)
    print("1")
    for i in range(1, 11):
        SH["ask" + str(i) + 'p'] = SH["OfferPrice"].apply(lambda x: x[i-1],2)
    SH.drop(["OfferPrice"],axis=1,inplace=True)
    print("2")
    for i in range(1, 11):
        SH["bid" + str(i) + 'q'] = SH["BidOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["BidOrderQty"],axis=1,inplace=True)
    print("3")
    for i in range(1, 11):
        SH["ask" + str(i) + 'q'] = SH["OfferOrderQty"].apply(lambda x: x[i-1])
    SH.drop(["OfferOrderQty"],axis=1,inplace=True)
    print("4")
    for i in range(1, 11):
        SH["bid" + str(i) + 'n'] = SH["BidNumOrders"].apply(lambda x: x[i-1])
        SH["bid" + str(i) + 'n'] = SH["bid" + str(i) + 'n'].astype('int32')
    SH.drop(["BidNumOrders"],axis=1,inplace=True)
    print("5")
    for i in range(1, 11):
        SH["ask" + str(i) + 'n'] = SH["OfferNumOrders"].apply(lambda x: x[i-1])
        SH["ask" + str(i) + 'n'] = SH["ask" + str(i) + 'n'].astype('int32') 
    SH.drop(["OfferNumOrders"],axis=1,inplace=True)
    print("6")
    
    SH["BidOrders"] = SH["BidOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])
    SH["OfferOrders"] = SH["OfferOrders"].apply(lambda x: [int(i) for i in x[1:-1].split(',')])

    for i in range(1, 51):
        SH["bid1Top" + str(i) + 'q'] = SH["BidOrders"].apply(lambda x: x[i-1])
        SH["bid1Top" + str(i) + 'q'] = SH["bid1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["BidOrders"],axis=1,inplace=True)
    print("7")
    
    for i in range(1, 51):
        SH["ask1Top" + str(i) + 'q'] = SH["OfferOrders"].apply(lambda x: x[i-1])
        SH["ask1Top" + str(i) + 'q'] = SH["ask1Top" + str(i) + 'q'].astype('int32') 
    SH.drop(["OfferOrders"],axis=1,inplace=True)
    print("8")
    print(datetime.datetime.now() - startTm)
    
    
    startTm = datetime.datetime.now()
    SH.columns = ['cum_trades_cnt', 'ask_trade_max_duration', 'total_bid_orders',
       'cum_canceled_sell_amount', 'total_ask_quantity', 'cum_canceled_buy_orders',
       'total_ask_vwap', 'cum_canceled_sell_volume', 'cum_volume', 'open',
       'high', 'prev_close', 'low', 'total_bid_vwap',
       'cum_canceled_sell_orders', 'total_ask_orders', 'total_ask_levels',
       'total_bid_quantity', 'cum_canceled_buy_volume', 'bid_trade_max_duration',
       'total_bid_levels', 'close', 'cum_amount', 'cum_canceled_buy_amount', 'skey', 'date', 'time', 'clockAtArrival',
       'datetime', 'bid1p', 'bid2p', 'bid3p', 'bid4p', 'bid5p', 'bid6p',
       'bid7p', 'bid8p', 'bid9p', 'bid10p', 'ask1p', 'ask2p', 'ask3p',
       'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p',
       'bid1q', 'bid2q', 'bid3q', 'bid4q', 'bid5q', 'bid6q', 'bid7q',
       'bid8q', 'bid9q', 'bid10q', 'ask1q', 'ask2q', 'ask3q', 'ask4q',
       'ask5q', 'ask6q', 'ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid1n',
       'bid2n', 'bid3n', 'bid4n', 'bid5n', 'bid6n', 'bid7n', 'bid8n',
       'bid9n', 'bid10n', 'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n',
       'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'bid1Top1q',
       'bid1Top2q', 'bid1Top3q', 'bid1Top4q', 'bid1Top5q', 'bid1Top6q',
       'bid1Top7q', 'bid1Top8q', 'bid1Top9q', 'bid1Top10q', 'bid1Top11q',
       'bid1Top12q', 'bid1Top13q', 'bid1Top14q', 'bid1Top15q',
       'bid1Top16q', 'bid1Top17q', 'bid1Top18q', 'bid1Top19q',
       'bid1Top20q', 'bid1Top21q', 'bid1Top22q', 'bid1Top23q',
       'bid1Top24q', 'bid1Top25q', 'bid1Top26q', 'bid1Top27q',
       'bid1Top28q', 'bid1Top29q', 'bid1Top30q', 'bid1Top31q',
       'bid1Top32q', 'bid1Top33q', 'bid1Top34q', 'bid1Top35q',
       'bid1Top36q', 'bid1Top37q', 'bid1Top38q', 'bid1Top39q',
       'bid1Top40q', 'bid1Top41q', 'bid1Top42q', 'bid1Top43q',
       'bid1Top44q', 'bid1Top45q', 'bid1Top46q', 'bid1Top47q',
       'bid1Top48q', 'bid1Top49q', 'bid1Top50q', 'ask1Top1q', 'ask1Top2q',
       'ask1Top3q', 'ask1Top4q', 'ask1Top5q', 'ask1Top6q', 'ask1Top7q',
       'ask1Top8q', 'ask1Top9q', 'ask1Top10q', 'ask1Top11q', 'ask1Top12q',
       'ask1Top13q', 'ask1Top14q', 'ask1Top15q', 'ask1Top16q',
       'ask1Top17q', 'ask1Top18q', 'ask1Top19q', 'ask1Top20q',
       'ask1Top21q', 'ask1Top22q', 'ask1Top23q', 'ask1Top24q',
       'ask1Top25q', 'ask1Top26q', 'ask1Top27q', 'ask1Top28q',
       'ask1Top29q', 'ask1Top30q', 'ask1Top31q', 'ask1Top32q',
       'ask1Top33q', 'ask1Top34q', 'ask1Top35q', 'ask1Top36q',
       'ask1Top37q', 'ask1Top38q', 'ask1Top39q', 'ask1Top40q',
       'ask1Top41q', 'ask1Top42q', 'ask1Top43q', 'ask1Top44q',
       'ask1Top45q', 'ask1Top46q', 'ask1Top47q', 'ask1Top48q',
       'ask1Top49q', 'ask1Top50q']
    SH = SH.fillna(0)
#     SH["p1"] = SH["bid1p"] + SH["ask1p"]
#     tt = SH[(SH["cum_volume"] > 0) & (SH["time"] < 145700000000)].groupby("skey")['p1'].min()
#     SH.drop("p1", axis=1, inplace=True)
#     try:
#         assert(tt[tt == 0].shape[0] == 0)
#     except:
#         display(tt[tt == 0])
#     SH = SH[~((SH["bid1p"] == 0) & (SH["ask1p"] == 0))]
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    
    SH["has_missing"] = 0
    
    for col in ["skey", "date", "cum_trades_cnt", "total_bid_orders",
        'total_ask_orders', 'total_bid_levels', 'total_ask_levels', 'cum_canceled_buy_orders','cum_canceled_sell_orders',
            "ordering", 'bid_trade_max_duration', 'ask_trade_max_duration','has_missing']:
        SH[col] = SH[col].astype('int32')
    
#     for cols in ["prev_close", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
#              'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p']:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
    
#     for cols in ['cum_amount', "cum_canceled_sell_amount", "cum_canceled_buy_amount"]:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
        
    for cols in ['total_bid_vwap', "total_ask_vwap"]:
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
        SH[cols] = SH[cols].apply(lambda x: round(x, 3))
        
   
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"]) 
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)
    print(datetime.datetime.now() - startTm)
    
    
    # check 1
    startTm = datetime.datetime.now()
    da_te = str(SH["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    db1["ID"] = db1["ID"].str[2:].astype(int) + 1000000
    db1["date"] = (db1["date"].str[:4] + db1["date"].str[5:7] + db1["date"].str[8:]).astype(int)
    SH["cum_max"] = SH.groupby("skey")["cum_volume"].transform(max)
    s2 = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey").first().reset_index()
    dd = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey")["time"].first().reset_index()
    SH.drop("cum_max", axis=1, inplace=True)
    s2 = s2.rename(columns={"skey": "ID", 'open':"d_open", "prev_close":"d_yclose","high":"d_high", "low":"d_low", "close":"d_close", "cum_volume":"d_volume", "cum_amount":"d_amount"})
    if SH["date"].iloc[0] < 20180820:
        s2["auction"] = 0
    else:
        dd["auction"] = np.where(dd["time"]<=145700000000, 0, 1)
        dd = dd.rename(columns={"skey": "ID"})
        s2 = pd.merge(s2, dd[["ID", "auction"]], on="ID")
    s2 = s2[["ID", "date", "d_open", "d_yclose", "d_high", "d_low", "d_close", "d_volume", "d_amount", "auction"]]
    re = pd.merge(db1, s2, on=["ID", "date", "d_open", "d_yclose","d_high", "d_low", "d_volume"], how="outer")
    try:
        assert(sum(re["d_amount_y"].isnull()) == 0)
    except:
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print(re[re["d_amount_y"].isnull()])
        wr_ong += [re[re["d_amount_y"].isnull()]]
    print(datetime.datetime.now() - startTm)
    
    # check 2
    # first part
    startTm = datetime.datetime.now()
    date = pd.DataFrame(pd.date_range(start='2019-06-10 08:30:00', end='2019-06-10 18:00:00', freq='s'), columns=["Orig"])
    date["time"] = date["Orig"].apply(lambda x: int(x.strftime("%H%M%S"))*1000)
    date["group"] = date["time"]//10000
    SH["group"] = SH["time"]//10000000
    gl = date[((date["time"] >= 93000000) & (date["time"] <= 113000000))|((date["time"] >= 130000000) & (date["time"] <= 150000000))]["group"].unique()
    l = set(gl) - set(SH["group"].unique())
    SH["has_missing1"] = 0 
    if len(l) != 0:
        print("massive missing")
        print(l)
        SH["order"] = SH.groupby(["skey", "time"]).cumcount()
        for i in l:
            SH["t"] = SH[SH["group"] > i].groupby("StockID")["time"].transform("min")
            SH["has_missing1"] = np.where((SH["time"] == SH["t"]) & (SH["order"] == 0), 1, 0)
        SH.drop(["order", "t", "group"], axis=1, inplace=True)   
    else:
        print("no massive missing")
        SH.drop(["group"], axis=1, inplace=True)
    



    # second part

    SH["time_interval"] = SH.groupby("skey")["datetime"].apply(lambda x: x - x.shift(1))
    SH["time_interval"] = SH["time_interval"].apply(lambda x: x.seconds)
    SH["tn_update"] = SH.groupby("skey")["cum_trades_cnt"].apply(lambda x: x-x.shift(1))

    f1 = SH[(SH["time"] >= 93000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f1 = f1.rename(columns={"time": "time1"})
    f2 = SH[(SH["time"] >= 130000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f2 = f2.rename(columns={"time": "time2"})
    f3 = SH[(SH["time"] >= 150000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f3 = f3.rename(columns={"time": "time3"})
    SH = pd.merge(SH, f1, on="skey", how="left")
    del f1
    SH = pd.merge(SH, f2, on="skey", how="left")
    del f2
    SH = pd.merge(SH, f3, on="skey", how="left")
    del f3
    p99 = SH[(SH["time"] > 93000000000) & (SH["time"] < 145700000000) & (SH["time"] != SH["time2"]) & (SH["tn_update"] != 0)]\
    .groupby("skey")["tn_update"].apply(lambda x: x.describe([0.99])["99%"]).reset_index()
    p99 = p99.rename(columns={"tn_update":"99%"})
    SH = pd.merge(SH, p99, on="skey", how="left")

    SH["has_missing2"] = 0
    SH["has_missing2"] = np.where((SH["time_interval"] > 60) & (SH["tn_update"] > SH["99%"]) & 
         (SH["time"] > SH["time1"]) & (SH["time"] != SH["time2"]) & (SH["time"] != SH["time3"]) & (SH["time"] != 100000000000), 1, 0)
    SH.drop(["time_interval", "tn_update", "time1", "time2", "time3", "99%"], axis=1, inplace=True) 

    SH["has_missing"] = np.where((SH["has_missing1"] == 1) | (SH["has_missing2"] == 1), 1, 0)
    SH.drop(["has_missing1", "has_missing2"], axis=1, inplace=True) 
    if SH[SH["has_missing"] == 1].shape[0] != 0:
        print("has missing!!!!!!!!!!!!!!!!!!!!!!!")
        print(SH[SH["has_missing"] == 1].shape[0])
        mi_ss += [SH[SH["has_missing"] == 1]]
    print(datetime.datetime.now() - startTm)
    
    
    
    startTm = datetime.datetime.now()
    SH["has_missing"] = SH["has_missing"].astype('int32')
    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "has_missing", "cum_trades_cnt", "cum_volume", "cum_amount", "prev_close",
                            "open", "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p','bid2p','bid1p',
                            'ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'bid10q','bid9q','bid8q',
                             'bid7q','bid6q','bid5q','bid4q','bid3q','bid2q','bid1q', 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q',
                             'ask7q','ask8q','ask9q','ask10q', 'bid10n', 'bid9n', 'bid8n', 'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 
                             'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n','ask7n', 'ask8n', 'ask9n', 'ask10n','bid1Top1q','bid1Top2q','bid1Top3q','bid1Top4q','bid1Top5q','bid1Top6q',
        'bid1Top7q','bid1Top8q','bid1Top9q','bid1Top10q','bid1Top11q','bid1Top12q','bid1Top13q','bid1Top14q','bid1Top15q','bid1Top16q','bid1Top17q','bid1Top18q',
        'bid1Top19q','bid1Top20q','bid1Top21q','bid1Top22q','bid1Top23q','bid1Top24q','bid1Top25q','bid1Top26q','bid1Top27q','bid1Top28q','bid1Top29q',
        'bid1Top30q','bid1Top31q','bid1Top32q','bid1Top33q','bid1Top34q','bid1Top35q','bid1Top36q','bid1Top37q','bid1Top38q','bid1Top39q','bid1Top40q',
        'bid1Top41q','bid1Top42q','bid1Top43q','bid1Top44q','bid1Top45q','bid1Top46q','bid1Top47q','bid1Top48q','bid1Top49q','bid1Top50q', 'ask1Top1q',
        'ask1Top2q','ask1Top3q','ask1Top4q','ask1Top5q','ask1Top6q','ask1Top7q','ask1Top8q','ask1Top9q','ask1Top10q','ask1Top11q','ask1Top12q','ask1Top13q',
        'ask1Top14q','ask1Top15q','ask1Top16q','ask1Top17q','ask1Top18q','ask1Top19q','ask1Top20q','ask1Top21q','ask1Top22q','ask1Top23q',
        'ask1Top24q','ask1Top25q','ask1Top26q','ask1Top27q','ask1Top28q','ask1Top29q','ask1Top30q','ask1Top31q','ask1Top32q','ask1Top33q',
        'ask1Top34q','ask1Top35q','ask1Top36q','ask1Top37q','ask1Top38q','ask1Top39q','ask1Top40q','ask1Top41q','ask1Top42q','ask1Top43q',
        'ask1Top44q','ask1Top45q','ask1Top46q','ask1Top47q','ask1Top48q','ask1Top49q','ask1Top50q',"total_bid_quantity", "total_ask_quantity","total_bid_vwap", "total_ask_vwap",
        "total_bid_orders",'total_ask_orders','total_bid_levels', 'total_ask_levels', 'bid_trade_max_duration', 'ask_trade_max_duration', 'cum_canceled_buy_orders', 'cum_canceled_buy_volume',
        "cum_canceled_buy_amount", "cum_canceled_sell_orders", 'cum_canceled_sell_volume',"cum_canceled_sell_amount"]]
    
    display(SH["date"].iloc[0])
    print("SH finished")
    
    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.223", database_name, user, password)
    db1.write('md_snapshot_l2', SH)
    
    del SH
    print(datetime.datetime.now() - startTm)

wr_ong = pd.concat(wr_ong).reset_index(drop=True)
print(wr_ong)
mi_ss = pd.concat(mi_ss).reset_index(drop=True)
print(mi_ss)
print(less)



0:03:02.894416
0:00:00.449359
20191211 unzip finished
0:02:46.652392
0:01:34.334777
1
2
3
4
5
6
7
8
0:12:12.577035
0:00:42.653749


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0:00:04.155281
no massive missing
0:02:22.345053


20191211

SH finished
0:01:08.401486
0:00:00.712164
20191212 unzip finished
0:02:03.652652
0:01:35.222243
1
2
3
4
5
6
7
8
0:12:20.945182
0:00:45.309722
0:00:04.022625
no massive missing
0:02:17.207886


20191212

SH finished
0:01:14.354238
0:00:00.629685
20191213 unzip finished
0:04:08.458863
0:01:38.801259
1
2
3
4
5
6
7
8
0:13:08.789862
0:00:44.417304
0:00:04.624116
no massive missing
0:02:33.946027


20191213

SH finished
0:01:21.845709
0:00:00.574479
20191216 unzip finished
0:02:49.655229
0:01:42.530140
1
2
3
4
5
6
7
8
0:13:04.490019
0:00:45.983097
0:00:04.928800
no massive missing
0:02:33.929906


20191216

SH finished
0:01:34.610243
0:00:01.767299
20191217 unzip finished
0:04:15.853336
0:01:49.843461
1
2
3
4
5
6
7
8
0:13:43.686261
0:00:49.479845
0:00:05.167028
no massive missing
0:02:47.408400


20191217

SH finished
0:01:12.334777
0:00:54.957307
20191218 unzip finished
0:01:01.211602
0:01:41.574053
1
2
3
4
5
6
7
8
0:13:25.200194
0:00:47.491886
0:00:04.722066
no massive missing
0:02:34.311300


20191218

SH finished
0:01:22.681440
0:00:48.360453
20191219 unzip finished
0:01:12.898225
0:01:44.176444
1
2
3
4
5
6
7
8
0:13:12.950755
0:00:46.996816
0:00:05.279675
no massive missing
0:02:41.190433


20191219

SH finished
0:01:11.521391
0:00:45.578808
20191220 unzip finished
0:01:01.770232
0:01:43.805872
1
2
3
4
5
6
7
8
0:13:29.754896
0:00:52.438203
0:00:04.427702
no massive missing
0:02:34.285372


20191220

SH finished
0:01:18.840354
0:00:46.546763
20191223 unzip finished
0:01:06.320526
0:01:54.038790
1
2
3
4
5
6
7
8
0:13:28.553618
0:00:47.814887
0:00:04.645779
no massive missing
0:02:37.829126


20191223

SH finished
0:01:14.535939
0:00:46.409716
20191224 unzip finished
0:01:04.388511
0:01:38.035442
1
2
3
4
5
6
7
8
0:12:41.893405
0:00:55.268608
0:00:04.588251
no massive missing
0:02:32.322649


20191224

SH finished
0:01:11.083234
0:00:43.537187
20191225 unzip finished
0:00:55.928387
0:01:34.599809
1
2
3
4
5
6
7
8
0:12:42.226689
0:00:45.306221
0:00:04.615397
no massive missing
0:02:32.938271


20191225

SH finished
0:01:11.203224
0:00:44.656102
20191226 unzip finished
0:01:05.695836
0:01:45.961467
1
2
3
4
5
6
7
8
0:12:57.829234
0:00:46.803221
0:00:04.354820
no massive missing
0:02:32.030972


20191226

SH finished
0:01:07.841860
0:00:48.235913
20191227 unzip finished
0:01:05.434638
0:01:43.615647
1
2
3
4
5
6
7
8
0:13:41.440981
0:00:45.884654
0:00:04.684739
no massive missing
0:02:35.304356


20191227

SH finished
0:01:23.179043
0:00:52.184113
20191230 unzip finished
0:01:07.842161
0:01:47.241564
1
2
3
4
5
6
7
8
0:14:04.198294
0:00:51.643452
0:00:04.503669
no massive missing
0:02:41.570663


20191230

SH finished
0:01:22.149651
0:00:46.975146
20191231 unzip finished
0:01:02.070020
0:01:43.842126
1
2
3
4
5
6
7
8
0:13:36.903536
0:00:45.901378
0:00:04.424820
no massive missing
0:02:41.579730


20191231

SH finished
0:01:13.927147


ValueError: No objects to concatenate