In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def read_daily(self, table_name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, index_name=None, col=None, return_sdi=True): 
        collection = self.db[table_name]
        # Build projection 
        prj = {'_id': 0} 
        if col is not None: 
            if return_sdi: 
                col = ['skey', 'date', 'index_id'] + col 
            for col_name in col: 
                prj[col_name] = 1 
        # Build query 
        query = {} 
        if skey is not None: 
            query['skey'] = {'$in': skey} 
        if interval is not None: 
            query['interval'] = {'$in': interval} 
        if index_id is not None: 
            query['index_id'] = {'$in': index_id}    
        if index_name is not None:
            n = '' 
            for name in index_name: 
                try: 
                    name = re.compile('[\u4e00-\u9fff]+').findall(name)[0] 
                    if len(n) == 0: 
                        n = n = "|".join(name) 
                    else: 
                        n = n + '|' + "|".join(name) 
                except: 
                    if len(n) == 0: 
                        n = name 
                    else: 
                        n = n + '|' + name 
            query['index_name'] = {'$regex': n}
        if start_date is not None: 
            if end_date is not None: 
                query['date'] = {'$gte': start_date, '$lte': end_date} 
            else: 
                query['date'] = {'$gte': start_date} 
        elif end_date is not None: 
            query['date'] = {'$lte': end_date} 
        # Load data 
        cur = collection.find(query, prj) 
        df = pd.DataFrame.from_records(cur) 
        if df.empty: 
            df = pd.DataFrame() 
        else:
            if 'index_id' in df.columns:
                df = df.sort_values(by=['date', 'index_id', 'skey']).reset_index(drop=True)
            else:
                df = df.sort_values(by=['date','skey']).reset_index(drop=True)
        return df 
 
    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()





import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = '/mnt/Kevin_zhenyu/day_stock/***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SH' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

startDate = '20140102'
endDate = '20150628'
readPath = '/mnt/SH/x64release/Tick/SH/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
wr_ong = []
mi_ss = []
less = []

for data in np.sort(dataPathLs)[::-1]:
    readPath = data + '/***'
    dataPathLs = np.array(glob.glob(readPath))
    dataPathLs = np.array([i for i in dataPathLs if os.path.basename(i)[0] != 'H'])
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[((dateLs >= 600000) & (dateLs <= 700000))]
    SH = []
    ll = []
    startTm = datetime.datetime.now()
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, encoding='GBK')
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        SH += [df]
    del df
    SH = pd.concat(SH).reset_index(drop=True)
    print(datetime.datetime.now() - startTm)
    
    startTm = datetime.datetime.now()
    SH["skey"] = SH["code"] + 1000000
    SH.drop(["code"],axis=1,inplace=True)
    SH['clockAtArrival'] = SH['date'] * 1000000000 + SH['time']
    SH["clockAtArrival"] = SH["clockAtArrival"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    SH['time'] = SH['time'] * 1000
    print(datetime.datetime.now() - startTm)    
    
    for i in range(1, 11):
        SH = SH.rename(columns={'ask'+str(i):'ask'+str(i)+'p', 'asize'+str(i):'ask'+str(i)+'q', \
                            'bid'+str(i):'bid'+str(i)+'p', 'bsize'+str(i):'bid'+str(i)+'q'})
    SH = SH.rename(columns={'accvolume':'cum_volume', 'accturover':'cum_amount', 'match_items':'cum_trades_cnt', 'price':'close',
                       'pre_close':'prev_close'})
    SH = SH.fillna(0)
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    
    SH["has_missing"] = 0
    
    for col in ["skey", "date", "cum_trades_cnt", 'ordering']:
        SH[col] = SH[col].astype('int32')
    
    for cols in ["prev_close", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
             'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p']:
        SH[cols] = SH[cols] / 10000
        
    try:
        assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    except:
        sl = SH[SH["open"] != 0].groupby("skey")["open"].nunique()[
            SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1
        ].index
        for i in sl:
            op = SH[(SH['skey'] == i) & (SH['open'] > 0)]['open'].unique()
            op = [x for x in op if len(str(x).split('.')[1])==2]
            try:
                assert(len(op) == 1)
                print(SH[(SH['skey'] == i) & (SH['open'] != op[0]) & (SH['open'] != 0)])
                SH.loc[(SH['skey'] == i) & (SH['open'] != 0), 'open'] = op[0]
            except:
                size = [SH[(SH['skey'] == i) & (SH['open'] == x)].shape[0] for x in op]
                print(SH[(SH['skey'] == i) & (SH['open'] != op[size.index(np.max(size))]) & (SH['open'] != 0)])
                SH.loc[(SH['skey'] == i) & (SH['open'] != 0), 'open'] = op[size.index(np.max(size))]
        assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"]) 
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)
    print(datetime.datetime.now() - startTm)
    
    
    # check 1
    startTm = datetime.datetime.now()
    da_te = str(SH["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    db1["ID"] = db1["ID"].str[2:].astype(int) + 1000000
    db1["date"] = (db1["date"].str[:4] + db1["date"].str[5:7] + db1["date"].str[8:]).astype(int)
    SH["cum_max"] = SH.groupby("skey")["cum_volume"].transform(max)
    s2 = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey").first().reset_index()
    SH.drop("cum_max", axis=1, inplace=True)
    s2 = s2.rename(columns={"skey": "ID", 'open':"d_open", "prev_close":"d_yclose","high":"d_high", "low":"d_low", "close":"d_close", "cum_volume":"d_volume", "cum_amount":"d_amount"})
    s2 = s2[["ID", "date", "d_open", "d_yclose", "d_high", "d_low", "d_close", "d_volume", "d_amount"]]
    re = pd.merge(db1, s2, on=["ID", "date", "d_open", "d_yclose","d_high", "d_low", "d_volume"], how="outer")
    try:
        assert(sum(re["d_amount_y"].isnull()) == 0)
    except:
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print(re[re["d_amount_y"].isnull()])
        wr_ong += [re[re["d_amount_y"].isnull()]]
    print(datetime.datetime.now() - startTm)
    
    # check 2
    # first part
    startTm = datetime.datetime.now()
    date = pd.DataFrame(pd.date_range(start='2019-06-10 08:30:00', end='2019-06-10 18:00:00', freq='s'), columns=["Orig"])
    date["time"] = date["Orig"].apply(lambda x: int(x.strftime("%H%M%S"))*1000)
    date["group"] = date["time"]//10000
    SH["group"] = SH["time"]//10000000
    gl = date[((date["time"] >= 93000000) & (date["time"] < 113000000))|((date["time"] >= 130000000) & (date["time"] <= 150000000))]["group"].unique()
    l = set(gl) - set(SH["group"].unique())
    SH["has_missing1"] = 0 
    if len(l) != 0:
        print("massive missing")
        print(l)
        SH["order"] = SH.groupby(["skey", "time"]).cumcount()
        for i in l:
            SH["t"] = SH[SH["group"] > i].groupby("skey")["time"].transform("min")
            SH["has_missing1"] = np.where((SH["time"] == SH["t"]) & (SH["order"] == 0), 1, 0)
        SH.drop(["order", "t", "group"], axis=1, inplace=True)   
    else:
        print("no massive missing")
        SH.drop(["group"], axis=1, inplace=True)
    



    # second part

    SH["time_interval"] = SH.groupby("skey")["datetime"].apply(lambda x: x - x.shift(1))
    SH["time_interval"] = SH["time_interval"].apply(lambda x: x.seconds)
    SH["tn_update"] = SH.groupby("skey")["cum_trades_cnt"].apply(lambda x: x-x.shift(1))

    f1 = SH[(SH["time"] >= 93000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f1 = f1.rename(columns={"time": "time1"})
    f2 = SH[(SH["time"] >= 130000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f2 = f2.rename(columns={"time": "time2"})
    f3 = SH[(SH["time"] >= 150000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f3 = f3.rename(columns={"time": "time3"})
    SH = pd.merge(SH, f1, on="skey", how="left")
    del f1
    SH = pd.merge(SH, f2, on="skey", how="left")
    del f2
    SH = pd.merge(SH, f3, on="skey", how="left")
    del f3
    p99 = SH[(SH["time"] > 93000000000) & (SH["time"] < 145700000000) & (SH["time"] != SH["time2"]) & (SH["tn_update"] != 0)]\
    .groupby("skey")["tn_update"].apply(lambda x: x.describe([0.99])["99%"]).round(0).reset_index()
    p99 = p99.rename(columns={"tn_update":"99%"})
    SH = pd.merge(SH, p99, on="skey", how="left")

    SH["has_missing2"] = 0
    SH["has_missing2"] = np.where((SH["time_interval"] > 60) & (SH["tn_update"] > SH["99%"]) & 
         (SH["time"] > SH["time1"]) & (SH["time"] != SH["time2"]) & (SH["time"] != SH["time3"]) & (SH["time"] != 100000000000), 1, 0)
    SH.drop(["time_interval", "tn_update", "time1", "time2", "time3", "99%"], axis=1, inplace=True) 

    SH["has_missing"] = np.where((SH["has_missing1"] == 1) | (SH["has_missing2"] == 1), 1, 0)
    SH.drop(["has_missing1", "has_missing2"], axis=1, inplace=True) 
    if SH[SH["has_missing"] == 1].shape[0] != 0:
        print("has missing!!!!!!!!!!!!!!!!!!!!!!!")
        print(SH[SH["has_missing"] == 1].shape[0])
        mi_ss += [SH[SH["has_missing"] == 1]]
    print(datetime.datetime.now() - startTm)
    
    
    
    startTm = datetime.datetime.now()
    SH["has_missing"] = SH["has_missing"].astype('int32')
    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "has_missing", "cum_trades_cnt", "cum_volume", "cum_amount", "prev_close",
                            "open", "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p','bid2p','bid1p',
                            'ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'bid10q','bid9q','bid8q',
                             'bid7q','bid6q','bid5q','bid4q','bid3q','bid2q','bid1q', 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q',
                             'ask7q','ask8q','ask9q','ask10q']]
    
    display(SH["date"].iloc[0])
    print("SH finished")
    
    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.178", database_name, user, password)
    db1.write('md_snapshot_l2', SH)
    
    del SH
    print(datetime.datetime.now() - startTm)

wr_ong = pd.concat(wr_ong).reset_index(drop=True)
print(wr_ong)
mi_ss = pd.concat(mi_ss).reset_index(drop=True)
print(mi_ss)
print(less)



0:03:02.326132
0:01:13.550512
0:00:56.735214
       wind_code  name      date          time        close            volume  \
95236  600023.SH  浙能电力  20150626  143049000000 -127788.6132  5805701627314027   
95237  600023.SH  浙能电力  20150626  143049000000 -127788.6132                 0   
95238  600023.SH  浙能电力  20150626  143942000000 -127788.6132                 0   
95239  600023.SH  浙能电力  20150626  144402000000 -127788.6132                 0   
95240  600023.SH  浙能电力  20150626  144406000000 -127788.6132                 0   

                turover  cum_trades_cnt  interest  trade_flag  bs_flag  \
95236  2318457176542587          109731         0           0       32   
95237                 0          109731         0           0       32   
95238                 0          109731         0           0       32   
95239                 0          109731         0           0       32   
95240                 0          109731         0           0       32   

             cum_volume

20150626

SH finished
0:00:19.629528
0:01:22.526318
0:00:59.920849
0:02:10.010866
0:00:02.977602
no massive missing
0:01:16.340337


20150625

SH finished
0:00:22.151882
0:01:49.708037
0:00:58.835582
         wind_code  name      date          time        close  \
3796061  601929.SH  吉视传媒  20150624  144444169000  159805.6042   
3796062  601929.SH  吉视传媒  20150624  144446169000  159805.6342   
3796063  601929.SH  吉视传媒  20150624  144450169000  159805.6342   
3796064  601929.SH  吉视传媒  20150624  144452169000  159805.6342   
3796065  601929.SH  吉视传媒  20150624  144456169000  159805.6042   
...            ...   ...       ...           ...          ...   
3796450  601929.SH  吉视传媒  20150624  150416169000  159805.5642   
3796451  601929.SH  吉视传媒  20150624  150419169000  159805.5642   
3796452  601929.SH  吉视传媒  20150624  150422169000  159805.5542   
3796453  601929.SH  吉视传媒  20150624  150422169000  159805.5142   
3796454  601929.SH  吉视传媒  20150624  150422169000  159805.5142   

                   volume            turover  cum_trades_cnt  interest  \
3796061  1336363063379208  24923432814370719          176695         0   
3796062       

0:02:07.468438
0:00:02.583124
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:12.257845


20150624

SH finished
0:00:22.128077
0:01:36.148679
0:01:01.694089
        wind_code  name      date          time  close       volume  \
136462  600036.SH  招商银行  20150623  144442000000  18.98  17283260160   

             turover  cum_trades_cnt  interest  trade_flag  bs_flag  \
136462  328094388172           89250         0           0       32   

         cum_volume    cum_amount  high    low   open  prev_close  settle  \
136462  17558203160  333150252239  19.0  17.72  18.22        18.0       0   

        position  curDelta  preSettle  prePosition  ask10p  ask9p  ask8p  \
136462         0         0          0            0   19.03  19.02  19.01   

        ask7p  ask6p  ask5p  ask4p  ask3p  ask2p  ask1p  bid1p  bid2p  bid3p  \
136462   19.0  18.99  18.98  18.97  18.96  18.95  18.94  18.93  18.92  18.91   

        bid4p  bid5p  bid6p  bid7p  bid8p  bid9p  bid10p  ask10q   ask9q  \
136462   18.9  18.89  18.88  18.87  18.86  18.85   18.84   40200  102564   

        ask8q    ask7q   ask6q   as

20150623

SH finished
0:00:21.480826
0:01:34.175496
0:00:57.477399
0:01:59.595457
0:00:02.568180
no massive missing
0:01:10.357751


20150619

SH finished
0:00:30.177258
0:01:50.789934
0:00:59.824465
0:02:09.245511
0:00:02.596124
no massive missing
0:01:11.896177


20150618

SH finished
0:00:21.512087
0:01:41.037294
0:01:01.438164
0:02:12.655940
0:00:02.739356
no massive missing
0:01:15.241013


20150617

SH finished
0:00:21.572378
0:01:49.901106
0:01:00.787579
0:02:10.166775
0:00:02.622121
no massive missing
0:01:10.230065


20150616

SH finished
0:00:21.472489
0:01:51.722473
0:00:58.725694
0:01:59.894575
0:00:02.669542
no massive missing
0:01:20.101135


20150615

SH finished
0:00:31.596594
0:01:47.189153
0:01:00.008763
0:02:23.841260
0:00:02.663045
massive missing
{15000}
0:01:23.074904


20150612

SH finished
0:00:27.728956
0:02:04.460473
0:01:02.281715
0:02:05.237566
0:00:02.643092
no massive missing
0:01:11.240927


20150611

SH finished
0:00:25.146722
0:01:37.353926
0:00:59.614707
0:02:01.777381
0:00:02.701924
no massive missing
0:01:12.057401


20150610

SH finished
0:00:20.658542
0:01:59.322984
0:00:59.165210
         wind_code  name      date          time       close      volume  \
3653332  601766.SH  中国中车  20150609  140821000000     32.1700  3830888800   
3653333  601766.SH  中国中车  20150609  141752000000     32.1700           0   
3653334  601766.SH  中国中车  20150609  230834000000  11099.2273       14032   
3653335  601766.SH  中国中车  20150609  232132000000  11119.3704     1922704   
3653336  601766.SH  中国中车  20150609  232138000000  11118.9710       23240   
3653337  601766.SH  中国中车  20150609  232635000000  11119.8120      736652   
3653338  601766.SH  中国中车  20150609  232640000000  11120.4544        8388   
3653339  601766.SH  中国中车  20150609  232645000000  11120.1615       14470   
3653340  601766.SH  中国中车  20150609  232650000000  11119.9998       11917   
3653341  601766.SH  中国中车  20150609  232655000000  11119.5323        9050   
3653342  601766.SH  中国中车  20150609  232700000000  11119.5237        5008   
3653343  601766.SH  中国中车  20150

0:02:04.613213
0:00:02.584496
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
4
0:01:11.658694


20150609

SH finished
0:00:27.632151
0:01:34.369514
0:00:58.271431
0:01:59.648977
0:00:02.575594
massive missing
{9411, 9383, 9353, 9355, 9391, 9334, 9431, 9340, 9342}
has missing!!!!!!!!!!!!!!!!!!!!!!!
941
0:01:32.217781


20150608

SH finished
0:00:21.933362
0:01:20.464221
0:00:49.213760
0:01:40.353808
0:00:02.146802
massive missing
{9320}
has missing!!!!!!!!!!!!!!!!!!!!!!!
768
0:01:02.817482


20150605

SH finished
0:00:25.508426
0:01:38.755745
0:01:02.189941
0:02:05.928393
0:00:02.748227
no massive missing
0:01:17.078307


20150604

SH finished
0:00:22.736352
0:01:47.899924
0:01:00.502123
0:02:04.091904
0:00:02.669227
no massive missing
0:01:11.709244


20150603

SH finished
0:00:30.036263
0:01:46.678258
0:01:01.040698
         wind_code  name      date         time        close  \
2393808  600695.SH  绿庭投资  20150602  71413659000  108491.7754   

                   volume             turover  cum_trades_cnt  interest  \
2393808  1353580624794179  424546921877470081         2545507         0   

         trade_flag  bs_flag        cum_volume          cum_amount       high  \
2393808           0       32  1353580644216064  424546922132154898  5667.1358   

           low     open  prev_close  settle  position  curDelta  preSettle  \
2393808  12.51  12.6495       12.73       0         0         0          0   

         prePosition       ask10p        ask9p        ask8p       ask7p  \
2393808            0  108491.7763  108491.7762  108491.7761  108491.776   

               ask6p        ask5p        ask4p        ask3p        ask2p  \
2393808  108491.7759  108491.7758  108491.7757  108491.7756  108491.7755   

               ask1p       bid1p       

20150602

SH finished
0:00:21.574726
0:01:38.801767
0:00:59.521382
0:02:01.186415
0:00:02.613049
massive missing
{13003}
has missing!!!!!!!!!!!!!!!!!!!!!!!
939
0:01:15.304234


20150601

SH finished
0:00:27.715605
0:02:12.784926
0:01:00.731380
0:02:04.523697
0:00:02.692316
no massive missing
0:01:13.571278


20150529

SH finished
0:00:29.148345
0:01:58.912771
0:00:59.717550


ValueError: zero-size array to reduction operation maximum which has no identity

In [7]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()





import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = '/mnt/Kevin_zhenyu/day_stock/***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SH' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

startDate = '20140102'
endDate = '20150528'
readPath = '/mnt/SH/x64release/Tick/SH/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
wr_ong = []
mi_ss = []
less = []

for data in np.sort(dataPathLs)[::-1]:
    readPath = data + '/***'
    dataPathLs = np.array(glob.glob(readPath))
    dataPathLs = np.array([i for i in dataPathLs if os.path.basename(i)[0] != 'H'])
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[((dateLs >= 600000) & (dateLs <= 700000))]
    SH = []
    ll = []
    startTm = datetime.datetime.now()
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, encoding='GBK')
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        SH += [df]
    del df
    SH = pd.concat(SH).reset_index(drop=True)
    print(datetime.datetime.now() - startTm)
    
    startTm = datetime.datetime.now()
    SH["skey"] = SH["code"] + 1000000
    SH.drop(["code"],axis=1,inplace=True)
    SH['clockAtArrival'] = SH['date'] * 1000000000 + SH['time']
    SH["clockAtArrival"] = SH["clockAtArrival"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    SH['time'] = SH['time'] * 1000
    print(datetime.datetime.now() - startTm)    
    
    for i in range(1, 11):
        SH = SH.rename(columns={'ask'+str(i):'ask'+str(i)+'p', 'asize'+str(i):'ask'+str(i)+'q', \
                            'bid'+str(i):'bid'+str(i)+'p', 'bsize'+str(i):'bid'+str(i)+'q'})
    SH = SH.rename(columns={'accvolume':'cum_volume', 'accturover':'cum_amount', 'match_items':'cum_trades_cnt', 'price':'close',
                       'pre_close':'prev_close'})
    SH = SH.fillna(0)
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    
    SH["has_missing"] = 0
    
    for col in ["skey", "date", "cum_trades_cnt", 'ordering']:
        SH[col] = SH[col].astype('int32')
    
    for cols in ["prev_close", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
             'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p']:
        SH[cols] = SH[cols] / 10000
        
    try:
        assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    except:
        sl = SH[SH["open"] != 0].groupby("skey")["open"].nunique()[
            SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1
        ].index
        for i in sl:
            op = SH[(SH['skey'] == i) & (SH['open'] > 0)]['open'].unique()
            print(i)
            print(op)
            op = [x for x in op if len(str(x).split('.')[1])<=2]
            try:
                assert(len(op) == 1)
                print(SH[(SH['skey'] == i) & (SH['open'] != op[0]) & (SH['open'] != 0)])
                SH.loc[(SH['skey'] == i) & (SH['open'] != 0), 'open'] = op[0]
            except:
                size = [SH[(SH['skey'] == i) & (SH['open'] == x)].shape[0] for x in op]
                print(SH[(SH['skey'] == i) & (SH['open'] != op[size.index(np.max(size))]) & (SH['open'] != 0)])
                SH.loc[(SH['skey'] == i) & (SH['open'] != 0), 'open'] = op[size.index(np.max(size))]
        assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"]) 
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)
    print(datetime.datetime.now() - startTm)
    
    
    # check 1
    startTm = datetime.datetime.now()
    da_te = str(SH["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    db1["ID"] = db1["ID"].str[2:].astype(int) + 1000000
    db1["date"] = (db1["date"].str[:4] + db1["date"].str[5:7] + db1["date"].str[8:]).astype(int)
    SH["cum_max"] = SH.groupby("skey")["cum_volume"].transform(max)
    s2 = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey").first().reset_index()
    SH.drop("cum_max", axis=1, inplace=True)
    s2 = s2.rename(columns={"skey": "ID", 'open':"d_open", "prev_close":"d_yclose","high":"d_high", "low":"d_low", "close":"d_close", "cum_volume":"d_volume", "cum_amount":"d_amount"})
    s2 = s2[["ID", "date", "d_open", "d_yclose", "d_high", "d_low", "d_close", "d_volume", "d_amount"]]
    re = pd.merge(db1, s2, on=["ID", "date", "d_open", "d_yclose","d_high", "d_low", "d_volume"], how="outer")
    try:
        assert(sum(re["d_amount_y"].isnull()) == 0)
    except:
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print(re[re["d_amount_y"].isnull()])
        wr_ong += [re[re["d_amount_y"].isnull()]]
    print(datetime.datetime.now() - startTm)
    
    # check 2
    # first part
    startTm = datetime.datetime.now()
    date = pd.DataFrame(pd.date_range(start='2019-06-10 08:30:00', end='2019-06-10 18:00:00', freq='s'), columns=["Orig"])
    date["time"] = date["Orig"].apply(lambda x: int(x.strftime("%H%M%S"))*1000)
    date["group"] = date["time"]//10000
    SH["group"] = SH["time"]//10000000
    gl = date[((date["time"] >= 93000000) & (date["time"] < 113000000))|((date["time"] >= 130000000) & (date["time"] <= 150000000))]["group"].unique()
    l = set(gl) - set(SH["group"].unique())
    SH["has_missing1"] = 0 
    if len(l) != 0:
        print("massive missing")
        print(l)
        SH["order"] = SH.groupby(["skey", "time"]).cumcount()
        for i in l:
            SH["t"] = SH[SH["group"] > i].groupby("skey")["time"].transform("min")
            SH["has_missing1"] = np.where((SH["time"] == SH["t"]) & (SH["order"] == 0), 1, 0)
        SH.drop(["order", "t", "group"], axis=1, inplace=True)   
    else:
        print("no massive missing")
        SH.drop(["group"], axis=1, inplace=True)
    



    # second part

    SH["time_interval"] = SH.groupby("skey")["datetime"].apply(lambda x: x - x.shift(1))
    SH["time_interval"] = SH["time_interval"].apply(lambda x: x.seconds)
    SH["tn_update"] = SH.groupby("skey")["cum_trades_cnt"].apply(lambda x: x-x.shift(1))

    f1 = SH[(SH["time"] >= 93000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f1 = f1.rename(columns={"time": "time1"})
    f2 = SH[(SH["time"] >= 130000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f2 = f2.rename(columns={"time": "time2"})
    f3 = SH[(SH["time"] >= 150000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    f3 = f3.rename(columns={"time": "time3"})
    SH = pd.merge(SH, f1, on="skey", how="left")
    del f1
    SH = pd.merge(SH, f2, on="skey", how="left")
    del f2
    SH = pd.merge(SH, f3, on="skey", how="left")
    del f3
    p99 = SH[(SH["time"] > 93000000000) & (SH["time"] < 145700000000) & (SH["time"] != SH["time2"]) & (SH["tn_update"] != 0)]\
    .groupby("skey")["tn_update"].apply(lambda x: x.describe([0.99])["99%"]).round(0).reset_index()
    p99 = p99.rename(columns={"tn_update":"99%"})
    SH = pd.merge(SH, p99, on="skey", how="left")

    SH["has_missing2"] = 0
    SH["has_missing2"] = np.where((SH["time_interval"] > 60) & (SH["tn_update"] > SH["99%"]) & 
         (SH["time"] > SH["time1"]) & (SH["time"] != SH["time2"]) & (SH["time"] != SH["time3"]) & (SH["time"] != 100000000000), 1, 0)
    SH.drop(["time_interval", "tn_update", "time1", "time2", "time3", "99%"], axis=1, inplace=True) 

    SH["has_missing"] = np.where((SH["has_missing1"] == 1) | (SH["has_missing2"] == 1), 1, 0)
    SH.drop(["has_missing1", "has_missing2"], axis=1, inplace=True) 
    if SH[SH["has_missing"] == 1].shape[0] != 0:
        print("has missing!!!!!!!!!!!!!!!!!!!!!!!")
        print(SH[SH["has_missing"] == 1].shape[0])
        mi_ss += [SH[SH["has_missing"] == 1]]
    print(datetime.datetime.now() - startTm)
    
    
    
    startTm = datetime.datetime.now()
    SH["has_missing"] = SH["has_missing"].astype('int32')
    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "has_missing", "cum_trades_cnt", "cum_volume", "cum_amount", "prev_close",
                            "open", "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p','bid2p','bid1p',
                            'ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'bid10q','bid9q','bid8q',
                             'bid7q','bid6q','bid5q','bid4q','bid3q','bid2q','bid1q', 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q',
                             'ask7q','ask8q','ask9q','ask10q']]
    
    display(SH["date"].iloc[0])
    print("SH finished")
    
    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.178", database_name, user, password)
    db1.write('md_snapshot_l2', SH)
    
    del SH
    print(datetime.datetime.now() - startTm)

wr_ong = pd.concat(wr_ong).reset_index(drop=True)
print(wr_ong)
mi_ss = pd.concat(mi_ss).reset_index(drop=True)
print(mi_ss)
print(less)



0:03:03.954062
0:01:45.158710
0:01:01.749214
        wind_code  name      date          time      close            volume  \
499020  600131.SH  岷江水电  20150528  101356019000  41628.449  3353446621741654   
499021  600131.SH  岷江水电  20150528  101359019000  41628.439              4000   
499022  600131.SH  岷江水电  20150528  101402019000  41628.449              3000   
499023  600131.SH  岷江水电  20150528  101405019000  41628.449              1800   
499024  600131.SH  岷江水电  20150528  101408019000  41628.449                 0   
...           ...   ...       ...           ...        ...               ...   
499996  600131.SH  岷江水电  20150528  110247019000  41628.099             30000   
499997  600131.SH  岷江水电  20150528  110250019000  41628.139              9100   
499998  600131.SH  岷江水电  20150528  110254019000  41628.129              4900   
499999  600131.SH  岷江水电  20150528  110257019000  41628.109             25800   
500000  600131.SH  岷江水电  20150528  110259019000  41628.129              200

0:02:09.312523
0:00:02.674845
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:13.423131


20150528

SH finished
0:00:28.193215
0:02:12.474871
0:01:01.287619
0:02:11.180320
0:00:03.050315
no massive missing
0:01:27.249275


20150527

SH finished
0:00:22.114669
0:02:48.787858
0:01:03.082824
0:02:21.653141
0:00:02.968825
no massive missing
0:01:14.859072


20150526

SH finished
0:00:23.930879
0:02:07.181594
0:01:11.577470
0:02:31.313634
0:00:03.752534
no massive missing
0:01:20.852420


20150525

SH finished
0:00:27.992679
0:03:28.506938
0:01:02.978590
0:03:00.441822
0:00:05.545890
no massive missing
0:01:39.844502


20150522

SH finished
0:00:25.466785
0:02:04.161422
0:00:59.641181
0:02:14.629243
0:00:03.028359
massive missing
{15000, 13003}
has missing!!!!!!!!!!!!!!!!!!!!!!!
947
0:01:27.178066


20150521

SH finished
0:00:27.461131
0:02:05.883180
0:00:59.047335
0:02:09.507439
0:00:03.017877
massive missing
{15000}
0:01:22.004781


20150520

SH finished
0:00:30.968981
0:02:05.140511
0:00:58.650073
0:02:07.908048
0:00:02.914640
massive missing
{15000}
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:21.217100


20150519

SH finished
0:00:20.407454
0:01:51.258526
0:00:57.591088
0:02:04.957238
0:00:02.829120
no massive missing
0:01:13.609706


20150518

SH finished
0:00:26.424536
0:01:09.360325
0:00:36.296597
0:01:17.509865
0:00:01.782785
no massive missing
0:00:44.336660


20150514

SH finished
0:00:11.897076
0:01:54.989202
0:00:59.067854
0:02:04.392152
0:00:02.765564
no massive missing
0:01:13.300652


20150512

SH finished
0:00:28.254014
0:02:41.186263
0:00:59.059017
0:02:04.663332
0:00:02.517489
no massive missing
0:01:09.520576


20150511

SH finished
0:00:28.453209
0:01:42.114480
0:00:57.231569
0:02:00.147776
0:00:02.807148
no massive missing
0:01:13.052966


20150508

SH finished
0:00:26.218280
0:01:49.150362
0:00:57.607223
0:02:04.993282
0:00:02.819707
no massive missing
0:01:13.411363


20150507

SH finished
0:00:26.633445
0:01:54.652194
0:00:57.047931
0:02:04.099149
0:00:02.841632
no massive missing
0:01:12.968375


20150506

SH finished
0:00:23.298692
0:01:48.373508
0:00:58.351226
0:02:06.177461
0:00:02.824810
no massive missing
0:01:12.964754


20150505

SH finished
0:00:21.996084
0:01:49.178594
0:00:56.462766
0:02:02.744994
0:00:02.768938
no massive missing
0:01:12.417487


20150504

SH finished
0:00:20.060136
0:01:53.475707
0:00:58.344235
0:02:07.348316
0:00:02.881727
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:14.941525


20150430

SH finished
0:00:25.931568
0:01:46.595957
0:00:59.500078
0:02:09.032383
0:00:02.917183
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:15.463640


20150429

SH finished
0:00:20.435628
0:01:50.287868
0:01:00.034112
0:02:09.683449
0:00:02.917768
no massive missing
0:01:15.401368


20150428

SH finished
0:00:21.576756
0:01:47.056571
0:01:00.448181
0:02:12.185416
0:00:03.045740
no massive missing
0:01:16.787596


20150427

SH finished
0:00:20.936360
0:01:53.651956
0:00:59.749722
0:02:09.843824
0:00:02.939694
massive missing
{15000}
0:01:20.773414


20150424

SH finished
0:00:27.027136
0:01:42.257208
0:00:58.897103
0:02:10.216568
0:00:03.051851
no massive missing
0:01:16.895779


20150423

SH finished
0:00:20.871391
0:01:37.263747
0:00:59.866723
0:02:10.331234
0:00:02.957616
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:16.871280


20150422

SH finished
0:00:21.047459
0:01:32.988366
0:00:58.448881
0:02:08.188361
0:00:02.900815
no massive missing
0:01:15.158054


20150421

SH finished
0:00:27.412906
0:01:51.052025
0:00:59.101481
0:02:10.662782
0:00:02.928038
no massive missing
0:01:15.968581


20150420

SH finished
0:00:20.922121
0:01:58.803553
0:00:58.112675
0:02:06.922876
0:00:02.838818
no massive missing
0:01:14.244693


20150417

SH finished
0:00:20.116361
0:01:57.341744
0:00:58.460045
0:02:09.488212
0:00:02.893880
no massive missing
0:01:14.282550


20150416

SH finished
0:00:20.079082
0:01:42.751524
0:00:59.531952
0:02:10.201470
0:00:02.929160
no massive missing
0:01:16.232061


20150415

SH finished
0:00:25.505669
0:01:36.248685
0:00:58.600059
0:02:07.415638
0:00:02.831352
no massive missing
0:01:13.797674


20150414

SH finished
0:00:22.906834
0:02:02.841209
0:00:59.051731
0:02:09.484404
0:00:02.928368
no massive missing
0:01:15.434008


20150413

SH finished
0:00:20.031538
0:01:53.602914
0:00:59.155517
0:02:08.664061
0:00:02.905424
no massive missing
0:01:15.586511


20150410

SH finished
0:00:27.917614
0:01:33.909550
0:00:59.484060
0:02:09.464099
0:00:02.883659
no massive missing
0:01:14.909955


20150409

SH finished
0:00:20.579365
0:01:36.177993
0:01:00.691252
0:02:11.256143
0:00:02.941025
no massive missing
0:01:15.678140


20150408

SH finished
0:00:24.186094
0:01:33.330250
0:01:00.289660
0:02:11.749811
0:00:02.962019
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:16.992373


20150407

SH finished
0:00:20.066562
0:01:33.900047
0:01:00.811131
0:02:10.945535
0:00:02.923937
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:15.900580


20150403

SH finished
0:00:27.691320
0:01:33.975882
0:01:00.126056
0:02:10.787481
0:00:02.924144
no massive missing
0:01:16.199307


20150402

SH finished
0:00:20.013261
0:01:33.304956
0:00:57.363054
0:02:07.204587
0:00:02.863151
no massive missing
0:01:14.544764


20150401

SH finished
0:00:23.941991
0:01:34.703308
0:01:00.305305
0:02:10.180115
0:00:02.952036
no massive missing
0:01:14.836108


20150331

SH finished
0:00:24.351650
0:01:33.263586
0:00:58.184851
0:02:05.246058
0:00:02.748936
no massive missing
0:01:11.662941


20150330

SH finished
0:00:21.768799
0:01:31.545507
0:00:55.412112
0:02:00.940400
0:00:02.765058
no massive missing
0:01:12.092632


20150327

SH finished
0:00:24.886782
0:01:33.313495
0:00:58.240791
0:02:08.461687
0:00:02.899488
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:14.596961


20150326

SH finished
0:00:23.079861
0:01:35.097846
0:00:58.292596
0:02:07.502969
0:00:02.901847
no massive missing
0:01:15.119682


20150325

SH finished
0:00:22.925546
0:01:36.932461
0:00:58.913793
0:02:07.871772
0:00:02.844429
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:14.859028


20150324

SH finished
0:00:20.048877
0:01:35.480510
0:00:57.246832
0:02:03.818396
0:00:02.786346
no massive missing
0:01:12.348264


20150323

SH finished
0:00:19.044234
0:01:34.908217
0:00:56.719005
0:02:04.113941
0:00:02.815763
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:12.540026


20150320

SH finished
0:00:18.925358
0:01:34.987315
0:00:56.819026
0:02:02.984482
0:00:02.771300
no massive missing
0:01:11.580490


20150319

SH finished
0:00:23.205136
0:01:31.531788
0:00:56.716026
0:02:04.263285
0:00:02.866316
massive missing
{13002}
has missing!!!!!!!!!!!!!!!!!!!!!!!
942
0:01:18.286827


20150318

SH finished
0:00:19.042984
0:01:32.071068
0:00:57.079145
0:02:04.418548
0:00:02.805102
massive missing
{15000}
0:01:17.560161


20150317

SH finished
0:00:18.957305
0:01:29.427969
0:00:57.661987
0:02:05.188824
0:00:02.817030
no massive missing
0:01:12.362040


20150316

SH finished
0:00:18.554338
0:01:30.491627
0:00:53.182120
0:01:56.368828
0:00:02.618373
no massive missing
0:01:06.443384


20150313

SH finished
0:00:20.399444
0:01:32.683697
0:00:54.298718
0:01:59.152598
0:00:02.761551
no massive missing
0:01:10.253306


20150312

SH finished
0:00:17.623890
0:01:26.614022
0:00:53.508207
0:01:57.045525
0:00:02.684358
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
2
0:01:09.613241


20150311

SH finished
0:00:26.472361
0:01:28.804867
0:00:52.622928
0:01:55.300907
0:00:02.635981
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:07.881284


20150310

SH finished
0:00:17.125663
0:01:28.160901
0:00:52.628501
0:01:55.326136
0:00:02.674222
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:07.932480


20150309

SH finished
0:00:16.992954
0:01:31.157406
0:00:54.419565
0:01:58.317979
0:00:02.668655
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:07.764297


20150306

SH finished
0:00:25.351604
0:01:26.870538
0:00:53.376641
0:01:56.442619
0:00:02.609154
no massive missing
0:01:07.182145


20150305

SH finished
0:00:25.364234
0:01:30.567640
0:00:53.980744
0:01:56.478653
0:00:02.610025
no massive missing
0:01:08.005038


20150304

SH finished
0:00:17.405472
0:01:29.122517
0:00:55.964675
0:02:00.573585
0:00:02.703566
massive missing
{13001}
has missing!!!!!!!!!!!!!!!!!!!!!!!
951
0:01:16.149531


20150303

SH finished
0:00:26.125822
0:01:30.665290
0:00:55.032213
0:01:59.028016
0:00:02.697912
no massive missing
0:01:09.416359


20150302

SH finished
0:00:17.225037
0:01:27.284301
0:00:51.278120
0:01:50.521881
0:00:02.491366
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:03.503180


20150227

SH finished
0:00:20.423343
0:01:22.263825
0:00:49.059379
0:01:47.389798
0:00:02.427872
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
2
0:01:02.700315


20150226

SH finished
0:00:19.147800
0:01:17.909193
0:00:47.385592
0:01:43.080182
0:00:02.361627
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:00.969639


20150225

SH finished
0:00:18.991223
0:01:22.733360
0:00:47.458691
0:01:43.100752
0:00:02.362793
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:01.057796


20150217

SH finished
0:00:19.413425
0:01:20.972389
0:00:48.146792
0:01:44.270477
0:00:02.353715
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:00.813657


20150216

SH finished
0:00:22.503766
0:01:22.365529
0:00:49.945585
0:01:47.105232
0:00:02.461921
no massive missing
0:01:02.578988


20150213

SH finished
0:00:23.754433
0:01:16.366601
0:00:45.766371
0:01:38.556415
0:00:02.265991
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:00:58.013603


20150212

SH finished
0:00:14.666139
0:01:12.809671
0:00:44.303136
0:01:34.780188
0:00:02.159003
no massive missing
0:00:55.218792


20150211

SH finished
0:00:14.240698
0:01:17.874116
0:00:42.991459
0:01:32.095448
0:00:02.082985
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
3
0:00:53.780363


20150210

SH finished
0:00:20.301864
0:01:13.148318
0:00:43.413194
0:01:33.446524
0:00:02.145024
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
7
0:00:55.227261


20150209

SH finished
0:00:14.017805
0:01:16.713216
0:00:46.602911
0:01:41.013972
0:00:02.337114
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
3
0:01:00.054410


20150206

SH finished
0:00:23.195994
0:01:19.757869
0:00:48.612124
0:01:45.764461
0:00:02.422301
massive missing
{15000}
has missing!!!!!!!!!!!!!!!!!!!!!!!
2
0:01:05.913541


20150205

SH finished
0:00:16.036585
0:01:19.828807
0:00:48.540862
0:01:45.357280
0:00:02.404440
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:01.581493


20150204

SH finished
0:00:24.289372
0:01:17.768611
0:00:48.098525
0:01:45.221311
0:00:02.425768
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
2
0:01:01.890598


20150203

SH finished
0:00:22.782735
0:01:17.598570
0:00:46.852042
0:01:41.701876
0:00:02.326034
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:00:59.950558


20150202

SH finished
0:00:24.202566
0:01:21.757259
0:00:49.037560
0:01:47.125826
0:00:02.451970
no massive missing
0:01:01.958881


20150130

SH finished
0:00:20.448189
0:01:19.707526
0:00:49.729942
0:01:47.686823
0:00:02.470301
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:03.068351


20150129

SH finished
0:00:23.543786
0:01:20.065743
0:00:50.232642
0:01:48.909103
0:00:02.484977
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:03.657882


20150128

SH finished
0:00:24.887027
0:01:27.765918
0:00:54.577313
0:01:57.557388
0:00:02.607653
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:06.637355


20150127

SH finished
0:00:17.635211
0:01:25.208344
0:00:50.959970
0:01:50.650930
0:00:02.589428
massive missing
{14200}
has missing!!!!!!!!!!!!!!!!!!!!!!!
929
0:01:08.421100


20150126

SH finished
0:00:20.973875
0:01:26.935874
0:00:52.044465
0:01:52.674238
0:00:02.559694
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:05.067242


20150123

SH finished
0:00:16.867807
0:01:23.772063
0:00:51.359373
0:01:52.941855
0:00:02.635147
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:01:06.473177


20150122

SH finished
0:00:22.667517
0:01:23.300136
0:00:51.885211
0:01:52.875422
0:00:02.608862
no massive missing
0:01:05.623027


20150121

SH finished
0:00:16.894344
0:01:23.296143
0:00:49.284146
0:01:40.506195
0:00:02.242196
no massive missing
0:00:59.515549


20150120

SH finished
0:00:17.941825
0:01:28.398833
0:00:50.640087
0:01:48.462992
0:00:02.266151
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
1
0:00:59.585820


20150119

SH finished
0:00:16.581326
0:01:25.355284
0:00:45.663261
0:01:35.796979
0:00:02.127010
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
3
0:00:57.571486


20150116

SH finished
0:00:22.692811
0:01:15.782699
0:00:44.268198
0:01:41.726034
0:00:02.063326
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
9
0:00:53.759688


20150115

SH finished
0:00:21.249960
0:01:26.651845
0:00:52.193349
0:02:05.056132
0:00:02.661400
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
2
0:01:07.327598


20150114

SH finished
0:00:22.003395
0:01:14.974236
0:00:44.852071
0:01:31.424114
0:00:02.145481
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
3
0:00:53.207724


20150113

SH finished
0:00:20.147706
0:01:16.216582
0:00:46.409697
0:01:34.619028
0:00:02.095786
no massive missing
0:00:55.161491


20150112

SH finished
0:00:19.902800
0:01:22.336374
0:00:47.768904
0:01:40.072814
0:00:02.252163
no massive missing
0:00:58.852629


20150109

SH finished
0:00:24.661887
0:01:19.819004
0:00:46.951123
0:01:39.692572
0:00:02.243433
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
2
0:00:59.051297


20150108

SH finished
0:00:18.220964
0:01:20.549772
0:00:47.752137
0:01:38.112836
0:00:02.128221
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
2
0:00:57.763348


20150107

SH finished
0:00:22.581808
0:01:23.761132
0:00:49.796247
0:01:43.166258
0:00:02.274106
no massive missing
0:01:01.293011


20150106

SH finished
0:00:22.280248
0:01:21.292319
0:00:48.143683
0:01:39.346012
0:00:02.183291
no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!
2
0:00:59.112737


20150105

SH finished
0:00:25.273289


ValueError: No objects to concatenate

In [6]:
SH[SH['skey'] == i]['open'].unique()[1]

11.2

In [33]:
size = [SH[(SH['skey'] == i) & (SH['open'] == x)].shape[0] for x in op]
op[size.index(np.max(size))]

18.06

In [24]:
op

[18.06, 18.22]

In [27]:
SH[(SH['skey'] == 1600036) & (SH['open'] == 18.22)]

Unnamed: 0,wind_code,name,date,time,close,volume,turover,cum_trades_cnt,interest,trade_flag,bs_flag,cum_volume,cum_amount,high,low,open,prev_close,settle,position,curDelta,preSettle,prePosition,ask10p,ask9p,ask8p,ask7p,ask6p,ask5p,ask4p,ask3p,ask2p,ask1p,bid1p,bid2p,bid3p,bid4p,bid5p,bid6p,bid7p,bid8p,bid9p,bid10p,ask10q,ask9q,ask8q,ask7q,ask6q,ask5q,ask4q,ask3q,ask2q,ask1q,bid1q,bid2q,bid3q,bid4q,bid5q,bid6q,bid7q,bid8q,bid9q,bid10q,ask_av_price,bid_av_price,total_ask_volume,total_bid_volume,index,stocks,ups,downs,holdLines,nResv1,nResv2,nResv3,skey,clockAtArrival,datetime,ordering,has_missing
136462,600036.SH,招商银行,20150623,144442000000,18.98,17283260160,328094388172,89250,0,0,32,17558203160,333150252239,19.0,17.72,18.22,18.0,0,0,0,0,0,19.03,19.02,19.01,19.0,18.99,18.98,18.97,18.96,18.95,18.94,18.93,18.92,18.91,18.9,18.89,18.88,18.87,18.86,18.85,18.84,40200,102564,45100,2079505,613286,226893,250778,57600,291194,85151,3900,4700,145262,102615,18500,126100,1100,103400,11100,33800,193670,180000,13328844,14244974,0,0,0,0,0,0,0,0,1600036,1435041882000000,2015-06-23 14:44:42,4612,0


In [18]:
[x for x in SH[(SH['skey'] == 1600023) & (SH['open'] > 0)]['open'].unique() if len(str(x).split('.')[1])==2]

[10.35]