In [7]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Build a MongoDB URI from the given credentials and return a ``DBObj``.

    Args:
        host: Host (optionally ``host:port``) placed after the ``@`` in the URI.
        db_name: Database the returned ``DBObj`` operates on.
        user: User name. ``'admin'`` and ``'root'`` authenticate against the
            ``admin`` database; every other user authenticates against
            ``db_name``.
        passwd: Password for ``user``.

    Returns:
        A ``DBObj`` connected with the constructed URI.
    """
    # Local import so the function does not depend on a file-level import.
    from urllib.parse import quote_plus

    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Reserved characters (':', '/', '@', '%', ...) in the user name or
    # password must be percent-escaped, otherwise the URI is split in the
    # wrong place and the connection string is rejected or misparsed.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Helper around one MongoDB database used to store and load DataFrames.

    ``write``/``read``/``delete``/``list_dates`` work on collections whose
    documents are compressed, pickled DataFrame segments tagged with ``date``
    and ``symbol`` fields.  ``read_daily`` instead loads plain documents
    straight into a DataFrame.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Open a client for ``uri`` and select database ``db_name``.

        Args:
            uri: MongoDB connection URI.
            symbol_column: DataFrame column used as the symbol key in ``write``.
            db_name: Name of the database to operate on.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        # Maximum number of DataFrame rows stored in one document.
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split ``mongodb://user:password@example.com`` into its parts.

        Returns the list obtained by removing the scheme and surrounding
        slashes and splitting on ``:`` and ``@``.
        """
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Replace the stored segments for every (date, symbol) found in ``df``.

        For each distinct pair of ``self.date_column`` / ``self.symbol_column``
        values, existing documents with that ``date`` and ``symbol`` are
        deleted and the group's rows are re-inserted in chunks.  Empty frames
        are ignored.

        Raises:
            Exception: if ``df`` has no ``self.date_column`` column.
        """
        if len(df) == 0:
            return
        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        # One code path for single- and multi-date frames: the keys are always
        # normalised to ``str`` / ``int`` so that what is stored matches what
        # ``build_query`` later searches for (numpy scalars are not valid BSON
        # values, and grouping by a one-element list yields tuple keys on
        # recent pandas).
        for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
            date = str(date)
            symbol = int(symbol)
            collection.delete_many({'date': date, 'symbol': symbol})
            self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` as documents of ``chunk_size`` rows.

        Each document records the serialisation version, the compressed
        payload, ``date``, ``symbol`` and the row offset ``start`` of the chunk.
        """
        version = 1
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df.iloc[start:end]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter used by ``read``, ``delete`` and ``list_dates``.

        Args:
            start_date: Inclusive lower bound; ``YYYYMMDD`` string, integer, or
                ``datetime.date``/``datetime.datetime`` (subclasses included).
            end_date: Inclusive upper bound, same accepted types.
            symbol: A single symbol or a list/tuple of symbols; each value is
                converted with ``int``.  Falsy values add no symbol filter.

        Returns:
            A dict filter; empty when no argument was supplied.

        Raises:
            Exception: for a date string that is not 8 characters long or a
                date of an unsupported type.
        """
        import numbers

        query = {}

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            if isinstance(x, (datetime.datetime, datetime.date)):
                return x.strftime("%Y%m%d")
            # bool is an int subclass but is not a meaningful date.
            if isinstance(x, numbers.Integral) and not isinstance(x, bool):
                return parse_date(str(int(x)))
            raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the segments matching the filter; refuses an empty filter."""
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load and concatenate the segments matching the filter.

        Segments are ordered by ``(symbol, date, start)`` before concatenation.

        Returns:
            The concatenated DataFrame, or ``None`` when the filter is empty
            (a message is printed) or nothing matched.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def read_daily(self, table_name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, index_name=None, col=None, return_sdi=True):
        """Load plain documents from ``table_name`` into a sorted DataFrame.

        Args:
            start_date, end_date: Inclusive bounds on the ``date`` field,
                passed to MongoDB unchanged.
            skey, index_id, interval: Collections of accepted values for the
                field of the same name (``$in`` filters).
            index_name: Iterable of names combined into one ``$regex``
                alternation.  For a name containing CJK characters, each
                character of its first CJK run becomes its own alternative;
                other names are used verbatim.
            col: Columns to project.  ``None`` returns every field but ``_id``.
            return_sdi: When ``col`` is given, also project ``skey``, ``date``
                and ``index_id``.

        Returns:
            A DataFrame sorted by whichever of ``date``, ``index_id``, ``skey``
            are present, or an empty DataFrame when nothing matched.
        """
        # Imported locally: the file has no top-level ``import re``, and the
        # previous bare ``except`` silently hid the resulting NameError so the
        # CJK branch below never ran.
        import re as _re

        collection = self.db[table_name]
        # Build projection
        prj = {'_id': 0}
        if col is not None:
            if return_sdi:
                col = ['skey', 'date', 'index_id'] + col
            for col_name in col:
                prj[col_name] = 1
        # Build query
        query = {}
        if skey is not None:
            query['skey'] = {'$in': skey}
        if interval is not None:
            query['interval'] = {'$in': interval}
        if index_id is not None:
            query['index_id'] = {'$in': index_id}
        if index_name is not None:
            cjk_run = _re.compile('[\u4e00-\u9fff]+')
            alternatives = []
            for name in index_name:
                found = cjk_run.findall(name)
                alternatives.append("|".join(found[0]) if found else name)
            query['index_name'] = {'$regex': "|".join(alternatives)}
        if start_date is not None or end_date is not None:
            date_filter = {}
            if start_date is not None:
                date_filter['$gte'] = start_date
            if end_date is not None:
                date_filter['$lte'] = end_date
            query['date'] = date_filter
        # Load data
        cur = collection.find(query, prj)
        df = pd.DataFrame.from_records(cur)
        if df.empty:
            return pd.DataFrame()
        # Sort only by the key columns that survived the projection, so a
        # narrow ``col`` with ``return_sdi=False`` does not raise KeyError.
        sort_keys = [key for key in ('date', 'index_id', 'skey') if key in df.columns]
        if sort_keys:
            df = df.sort_values(by=sort_keys).reset_index(drop=True)
        return df

    def list_tables(self):
        """Return the names of the collections in the database."""
        # ``collection_names`` was removed in PyMongo 4.  The lookup goes
        # through the class because attribute access on a Database instance
        # returns a Collection for any unknown name.
        if hasattr(type(self.db), 'list_collection_names'):
            return self.db.list_collection_names()
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values matching the filter."""
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` (protocol 4) and compress it.

        Version 1 uses gzip (level 2), version 2 uses lzma (preset 1).

        Raises:
            Exception: for any other version.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of ``ser``: decompress ``s`` and unpickle it.

        Raises:
            Exception: for a version other than 1 or 2.
        """
        # NOTE(review): pickle.loads runs arbitrary code from the payload;
        # only read from databases whose contents are trusted.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    """Let pandas older than 0.24 unpickle frames written by newer pandas.

    On such versions, if ``pandas.core.internals.managers`` is not already a
    loaded module, a stub module of that name exposing ``BlockManager`` is
    registered in ``sys.modules`` so pickles that reference that path resolve.
    Does nothing on pandas 0.24 or later.
    """

    def leading_int(part):
        # Tolerates components such as '0rc1' or '0+g123' by reading only the
        # leading digits.
        digits = ''
        for ch in part:
            if not ch.isdigit():
                break
            digits += ch
        return int(digits) if digits else 0

    # Compare numerically: as strings, '0.9' < '0.24' is False.
    major_minor = tuple(leading_int(p) for p in pd.__version__.split('.')[:2])
    if major_minor < (0, 24):
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()





import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Use the fully qualified option name: the bare "max_columns" pattern matches
# more than one option on recent pandas and raises OptionError.
pd.set_option("display.max_columns", 200)


# Inclusive range of date directories (named YYYYMMDD) to process.
startDate = '20140102'
endDate = '20140102'
readPath = '/mnt/SH1/x64release/Tick/SH/***'
# dtype=str keeps the arrays string-typed even when glob matches nothing, so
# the comparisons against the date strings below stay valid.
dataPathLs = np.array(glob.glob(readPath), dtype=str)
dateLs = np.array([os.path.basename(i) for i in dataPathLs], dtype=str)
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
# Accumulators filled by the per-date loop.
wr_ong = []
mi_ss = []
less = []

# Process each selected date directory, newest first: load the per-symbol CSV
# files, normalise them into one snapshot frame, then cross-check the
# end-of-day values against the daily bar table in MongoDB.
for data in np.sort(dataPathLs)[::-1]:
    readPath = data + '/***'
    dataPathLs = np.array(glob.glob(readPath))
    # Skip files whose name starts with 'H' and keep codes 600000..700000.
    dataPathLs = np.array([i for i in dataPathLs if os.path.basename(i)[0] != 'H'])
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[((dateLs >= 600000) & (dateLs <= 700000))]
    SH = []
    ll = []
    startTm = datetime.datetime.now()
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, encoding='GBK')
        except Exception as exc:
            # `except Exception` (not a bare except) so Ctrl-C still stops the
            # run; the code of the unreadable file is remembered in `ll`.
            print("empty data")
            print(i)
            print(repr(exc))
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        SH.append(df)
        del df
    if not SH:
        # Nothing could be read for this date: concatenating would raise.
        print("no readable data in " + str(data))
        continue
    SH = pd.concat(SH).reset_index(drop=True)
    print(datetime.datetime.now() - startTm)
    
    startTm = datetime.datetime.now()
    SH["skey"] = SH["code"] + 1000000
    SH.drop(["code"],axis=1,inplace=True)
    # date and time are concatenated into one integer, parsed with
    # '%Y%m%d%H%M%S%f', and stored as microseconds since the epoch.
    SH['clockAtArrival'] = SH['date'] * 1000000000 + SH['time']
    SH["clockAtArrival"] = SH["clockAtArrival"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    SH['time'] = SH['time'] * 1000
    print(datetime.datetime.now() - startTm)    
    
    # Rename the ten book levels to the <side><level>p / <side><level>q scheme.
    for i in range(1, 11):
        SH = SH.rename(columns={'ask'+str(i):'ask'+str(i)+'p', 'asize'+str(i):'ask'+str(i)+'q', \
                            'bid'+str(i):'bid'+str(i)+'p', 'bsize'+str(i):'bid'+str(i)+'q'})
    SH = SH.rename(columns={'accvolume':'cum_volume', 'accturover':'cum_amount', 'match_items':'cum_trades_cnt', 'price':'close',
                       'pre_close':'prev_close'})
    SH = SH.fillna(0)
    # 1-based position of each row within its symbol.
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    
    SH["has_missing"] = 0
    
    for col in ["skey", "date", "cum_trades_cnt", 'ordering']:
        SH[col] = SH[col].astype('int32')
    
    # Price columns are divided by 10000.
    for cols in ["prev_close", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
             'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p']:
        SH[cols] = SH[cols] / 10000
        
    # Each symbol must have a single non-zero open and prev_close; those
    # values are then propagated and the invariants re-checked.
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"]) 
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)
    print(datetime.datetime.now() - startTm)
    
    
    # check 1
    # NOTE(review): credentials are hard-coded in the notebook; consider
    # loading them from the environment or a secrets store.
    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.178", database_name, user, password)
    tr = db1.read_daily('mdbar1d_tr', int(SH['date'].iloc[0]), int(SH['date'].iloc[0]))
    tr = tr.rename(columns={'closeL1':'prev_close', 'volume':'cum_volume', 'amount':'cum_amount'})
    startTm = datetime.datetime.now()
    # First row per symbol at which the cumulative volume reaches its maximum.
    SH["cum_max"] = SH.groupby("skey")["cum_volume"].transform("max")
    s2 = SH[SH["cum_volume"] == SH["cum_max"]].groupby("skey").first().reset_index()
    SH.drop("cum_max", axis=1, inplace=True)
    tr = tr[["skey", "date", "open", "prev_close", "high", "low", "close", "cum_volume", "cum_amount"]]
    s2 = s2[["skey", "date", "open", "prev_close", "high", "low", "close", "cum_volume", "cum_amount"]]
    tr = tr[(tr['skey'] >= 1600000) & (tr['skey'] < 1700000)]
    # NOTE(review): `re` shadows the standard-library module name; it is kept
    # because later notebook cells inspect this variable.
    re = pd.merge(tr, s2, on=["skey", "date", "open", "prev_close", "high", "low", "close", "cum_volume"], how="outer")
    # Rows without a snapshot-side amount are daily bars that found no exact
    # match in the snapshot data; collect them for inspection.  An explicit
    # check replaces the assert/except pair, which is disabled under -O.
    unmatched = re[re["cum_amount_y"].isnull()]
    if len(unmatched) != 0:
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print(unmatched)
        wr_ong += [unmatched]
    print(datetime.datetime.now() - startTm)
    
#     # check 2
#     # first part
#     startTm = datetime.datetime.now()
#     date = pd.DataFrame(pd.date_range(start='2019-06-10 08:30:00', end='2019-06-10 18:00:00', freq='s'), columns=["Orig"])
#     date["time"] = date["Orig"].apply(lambda x: int(x.strftime("%H%M%S"))*1000)
#     date["group"] = date["time"]//10000
#     SH["group"] = SH["time"]//10000000
#     gl = date[((date["time"] >= 93000000) & (date["time"] < 113000000))|((date["time"] >= 130000000) & (date["time"] <= 150000000))]["group"].unique()
#     l = set(gl) - set(SH["group"].unique())
#     SH["has_missing1"] = 0 
#     if len(l) != 0:
#         print("massive missing")
#         print(l)
#         SH["order"] = SH.groupby(["skey", "time"]).cumcount()
#         for i in l:
#             SH["t"] = SH[SH["group"] > i].groupby("skey")["time"].transform("min")
#             SH["has_missing1"] = np.where((SH["time"] == SH["t"]) & (SH["order"] == 0), 1, 0)
#         SH.drop(["order", "t", "group"], axis=1, inplace=True)   
#     else:
#         print("no massive missing")
#         SH.drop(["group"], axis=1, inplace=True)
    



#     # second part

#     SH["time_interval"] = SH.groupby("skey")["datetime"].apply(lambda x: x - x.shift(1))
#     SH["time_interval"] = SH["time_interval"].apply(lambda x: x.seconds)
#     SH["tn_update"] = SH.groupby("skey")["cum_trades_cnt"].apply(lambda x: x-x.shift(1))

#     f1 = SH[(SH["time"] >= 93000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
#     f1 = f1.rename(columns={"time": "time1"})
#     f2 = SH[(SH["time"] >= 130000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
#     f2 = f2.rename(columns={"time": "time2"})
#     f3 = SH[(SH["time"] >= 150000000000) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
#     f3 = f3.rename(columns={"time": "time3"})
#     SH = pd.merge(SH, f1, on="skey", how="left")
#     del f1
#     SH = pd.merge(SH, f2, on="skey", how="left")
#     del f2
#     SH = pd.merge(SH, f3, on="skey", how="left")
#     del f3
#     p99 = SH[(SH["time"] > 93000000000) & (SH["time"] < 145700000000) & (SH["time"] != SH["time2"]) & (SH["tn_update"] != 0)]\
#     .groupby("skey")["tn_update"].apply(lambda x: x.describe([0.99])["99%"]).round(0).reset_index()
#     p99 = p99.rename(columns={"tn_update":"99%"})
#     SH = pd.merge(SH, p99, on="skey", how="left")

#     SH["has_missing2"] = 0
#     SH["has_missing2"] = np.where((SH["time_interval"] > 60) & (SH["tn_update"] > SH["99%"]) & 
#          (SH["time"] > SH["time1"]) & (SH["time"] != SH["time2"]) & (SH["time"] != SH["time3"]) & (SH["time"] != 100000000000), 1, 0)
#     SH.drop(["time_interval", "tn_update", "time1", "time2", "time3", "99%"], axis=1, inplace=True) 

#     SH["has_missing"] = np.where((SH["has_missing1"] == 1) | (SH["has_missing2"] == 1), 1, 0)
#     SH.drop(["has_missing1", "has_missing2"], axis=1, inplace=True) 
#     if SH[SH["has_missing"] == 1].shape[0] != 0:
#         print("has missing!!!!!!!!!!!!!!!!!!!!!!!")
#         print(SH[SH["has_missing"] == 1].shape[0])
#         mi_ss += [SH[SH["has_missing"] == 1]]
#     print(datetime.datetime.now() - startTm)
    
    
    
#     startTm = datetime.datetime.now()
#     SH["has_missing"] = SH["has_missing"].astype('int32')
#     SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "has_missing", "cum_trades_cnt", "cum_volume", "cum_amount", "prev_close",
#                             "open", "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p','bid2p','bid1p',
#                             'ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p', 'bid10q','bid9q','bid8q',
#                              'bid7q','bid6q','bid5q','bid4q','bid3q','bid2q','bid1q', 'ask1q','ask2q','ask3q','ask4q','ask5q','ask6q',
#                              'ask7q','ask8q','ask9q','ask10q']]
    
#     display(SH["date"].iloc[0])
#     print("SH finished")
    
#     database_name = 'com_md_eq_cn'
#     user = "zhenyuy"
#     password = "bnONBrzSMGoE"

#     db1 = DB("192.168.10.178", database_name, user, password)
#     db1.write('md_snapshot_l2', SH)
    
#     del SH
#     print(datetime.datetime.now() - startTm)

# wr_ong = pd.concat(wr_ong).reset_index(drop=True)
# print(wr_ong)
# mi_ss = pd.concat(mi_ss).reset_index(drop=True)
# print(mi_ss)
# print(less)

0:01:23.801086
0:00:44.008296
0:01:22.106863
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


KeyError: 'd_cum_amount_y'

In [29]:

    # Re-run of the daily-bar cross-check for the date currently held in SH.
    # Unlike the loop version, the snapshot row is the last one at the maximum
    # cumulative volume, and the daily bars are limited to the symbols
    # present in the snapshot data.
    db1 = DB("192.168.10.178", database_name, user, password)
    tr = db1.read_daily(
        'mdbar1d_tr',
        int(SH['date'].iloc[0]),
        int(SH['date'].iloc[0]),
    ).rename(columns={
        'closeL1': 'prev_close',
        'volume': 'cum_volume',
        'amount': 'cum_amount',
    })
    startTm = datetime.datetime.now()
    SH["cum_max"] = SH.groupby("skey")["cum_volume"].transform(max)
    s2 = SH.loc[SH["cum_volume"].eq(SH["cum_max"])].groupby("skey").last().reset_index()
    SH.drop(columns=["cum_max"], inplace=True)
    s2 = s2.loc[:, ["skey", "date", "open", "prev_close", "high", "low", "close", "cum_volume", "cum_amount"]]
    tr = tr.loc[:, ["skey", "date", "open", "prev_close", "high", "low", "close", "cum_volume", "cum_amount"]]
    tr = tr.loc[tr['skey'].isin(s2['skey'].unique())]
    # Outer join on every compared field except the amount, so disagreeing
    # rows show up with one of the two amount columns empty.
    re = tr.merge(
        s2,
        how="outer",
        on=["skey", "date", "open", "prev_close", "high", "low", "close", "cum_volume"],
    )

In [18]:
# Reload every CSV listed in dataPathLs into one frame.  Files that cannot be
# read are reported and their numeric code is appended to `ll`.
SH = []
for i in dataPathLs:
    try:
        df = pd.read_csv(i, encoding='GBK')
    except Exception as exc:
        # `except Exception` (not a bare except) so an interrupt still stops
        # the loop instead of being reported as an empty file.
        print("empty data")
        print(i)
        print(repr(exc))
        ll.append(int(os.path.basename(i).split('.')[0]))
        continue
    SH.append(df)
    # Drop the loop reference here so no NameError occurs when no file
    # could be read at all.
    del df
SH = pd.concat(SH).reset_index(drop=True)

In [34]:

# Rows of the outer join that have no amount from the left (daily bar) side.
re.loc[re["cum_amount_x"].isna()]

Unnamed: 0,skey,date,open,prev_close,high,low,close,cum_volume,cum_amount_x,cum_amount_y
942,1600051,20140102,0.0,6.71,0.0,0.0,0.0,0,,0.0
943,1600053,20140102,0.0,6.19,0.0,0.0,0.0,0,,0.0
944,1600057,20140102,0.0,6.81,0.0,0.0,0.0,0,,0.0
945,1600058,20140102,0.0,13.56,0.0,0.0,0.0,0,,0.0
946,1600063,20140102,0.0,2.07,0.0,0.0,0.0,0,,0.0
947,1600074,20140102,0.0,4.14,0.0,0.0,0.0,0,,0.0
948,1600141,20140102,0.0,12.48,0.0,0.0,0.0,0,,0.0
949,1600146,20140102,0.0,9.45,0.0,0.0,0.0,0,,0.0
950,1600217,20140102,0.0,4.84,0.0,0.0,0.0,0,,0.0
951,1600234,20140102,0.0,5.77,0.0,0.0,0.0,0,,0.0


In [33]:
# Show the snapshot rows whose skey is listed under the 'SH' key of `set`.
# NOTE(review): `set` here is presumably a notebook variable (a mapping with
# an 'SH' entry) that shadows the builtin; it is not defined in the visible
# cells -- confirm and consider renaming it.
SH[SH['skey'].isin(list(set['SH']))]

Unnamed: 0,wind_code,name,date,time,close,volume,turover,cum_trades_cnt,interest,trade_flag,bs_flag,cum_volume,cum_amount,high,low,open,prev_close,settle,position,curDelta,preSettle,prePosition,ask10p,ask9p,ask8p,ask7p,ask6p,ask5p,ask4p,ask3p,ask2p,ask1p,bid1p,bid2p,bid3p,bid4p,bid5p,bid6p,bid7p,bid8p,bid9p,bid10p,ask10q,ask9q,ask8q,ask7q,ask6q,ask5q,ask4q,ask3q,ask2q,ask1q,bid1q,bid2q,bid3q,bid4q,bid5q,bid6q,bid7q,bid8q,bid9q,bid10q,ask_av_price,bid_av_price,total_ask_volume,total_bid_volume,index,stocks,ups,downs,holdLines,nResv1,nResv2,nResv3,skey,clockAtArrival,datetime,ordering,has_missing
110190,600051.SH,宁波联合,20140102,74003000000,0.0,0,0,0,0,0,32,0,0,0.0,0.0,0.0,6.71,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1600051,1388619603000000,2014-01-02 07:40:03,1,0
