In [7]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


import pymongo
import io
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np


def DB(host, db_name, user, passwd, version=3):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name, version=version)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db', version=3):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'
        self.version = version

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = self.version
            ser_data = self.ser(df_seg, version)
            seg = {'ver': version, 'data': ser_data, 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None
    
    def read_raw(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        return collection.find(query)

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        elif version == 3:
            # 32-bit number needs more space than 64-bit for parquet
            for col_name in s.columns:
                col = s[col_name]
                if col.dtype == np.int32:
                    s[col_name] = col.astype(np.int64)
                elif col.dtype == np.uint32:
                    s[col_name] = col.astype(np.uint64)
            tbl = pa.Table.from_pandas(s)
            f = io.BytesIO()
            pq.write_table(tbl, f, use_dictionary=False, compression='ZSTD', compression_level=0)
            f.seek(0)
            data = f.read()
            return data
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)
        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        elif version == 3:
            f = io.BytesIO()
            f.write(s)
            f.seek(0)
            return pq.read_table(f, use_threads=False).to_pandas()
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()

def dailyDB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    url = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    client = pymongo.MongoClient(url, maxPoolSize=None)
    db = client[db_name]
    return db

def read_stock_daily(db, name, start_date=None, end_date=None, skey=None, index_name=None, interval=None, col=None, return_sdi=True):
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date'] + col
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_name is not None:
        query['index_name'] = {'$in': index_name}
    if start_date is not None:
        if end_date is not None:
            query['date'] = {'$gte': start_date, '$lte': end_date}
        else:
            query['date'] = {'$gte': start_date}
    elif end_date is not None:
        query['date'] = {'$lte': end_date}

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        df = pd.DataFrame()
    else:
        df = df.sort_values(by=['date', 'skey'])
    return df   

def read_memb_daily(db, name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, col=None, return_sdi=True):
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date', 'index_id'] + col
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_id is not None:
        query['index_id'] = {'$in': index_id}
    if interval is not None:
        query['interval'] = {'$in': interval}
    if start_date is not None:
        if end_date is not None:
            query['date'] = {'$gte': start_date, '$lte': end_date}
        else:
            query['date'] = {'$gte': start_date}
    elif end_date is not None:
        query['date'] = {'$lte': end_date}

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        df = pd.DataFrame()
    else:
        df = df.sort_values(by=['date', 'index_id', 'skey'])
    return df 



import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
pd.set_option("max_columns", 200)

year = "2020"
startDate = '20201102'
endDate = '20201210'
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"

startTm = datetime.datetime.now()
db1 = DB("192.168.10.178", database_name, user, password)
db2 = dailyDB("192.168.10.178", database_name, user, password)
save = {}
save['date'] = []
save['secid'] = []
mdOrderLog = db1.read('md_order', start_date=startDate, end_date=endDate, symbol=[2000001])
datelist = mdOrderLog['date'].unique()
ss = pd.read_csv('/mnt/ShareWithServer/result/shangshi1.csv')
ss = ss[ss['证券代码↑'] != 'T00018.SH']
ss['skey'] = np.where(ss['证券代码↑'].str[-2:] == 'SZ', ss['证券代码↑'].str[:6].astype(int) + 2000000, ss['证券代码↑'].str[:6].astype(int) + 1000000)
ss['date'] = (ss['上市日期'].str[:4] + ss['上市日期'].str[5:7] + ss['上市日期'].str[8:10]).astype(int)
print(datetime.datetime.now() - startTm)

startTm = datetime.datetime.now()
for d in datelist:
    print(d)
    sl1 = db1.read('md_order', start_date=str(d), end_date=str(d))['skey'].unique()
    data1 = db1.read('md_snapshot_l2', start_date=str(d), end_date=str(d), symbol=list(sl1))
    assert(len(sl1) == data1['skey'].nunique())
    op = read_stock_daily(db2, 'mdbar1d_tr', start_date=int(d), end_date=int(d))

    if len(sl1) != 0:
        for s in sl1:
            mbd = db1.read('md_snapshot_mbd', start_date=str(d), end_date=str(d), symbol=s)
            if mbd is None:
                if ss[ss['skey'] == s]['date'].iloc[0] == d:
                    l2 = data1[data1['skey'] == s]
                    l2['ApplSeqNum'] = -1
                    l2['ApplSeqNum'] = l2['ApplSeqNum'].astype('int32') 
                    db1.write('md_snapshot_l2', l2)
                    continue
                else:
                    save['date'].append(d)
                    save['secid'].append(s)
                    print(s)
                    continue
            try:
                assert(mbd.shape[1] == 82)
            except:
                print('mdb data column unupdated')
                print(s)
            try:
                op1 = op[op['skey'] == s]['open'].iloc[0]
                assert(mbd[mbd['cum_volume'] > 0]['open'].iloc[0] == op1)
            except:
                print('%s have no information in mdbar1d_tr' % str(s))
            l2 = data1[data1['skey'] == s]
            cols = ['skey', 'date', 'cum_volume', 'prev_close', 'open', 'close', 'cum_trades_cnt', 'bid10p', 'bid9p',
                   'bid8p', 'bid7p', 'bid6p', 'bid5p', 'bid4p', 'bid3p', 'bid2p', 'bid1p', 'ask1p', 'ask2p',
                   'ask3p', 'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p', 'bid10q', 'bid9q', 
                   'bid8q', 'bid7q', 'bid6q', 'bid5q', 'bid4q', 'bid3q', 'bid2q', 'bid1q', 'ask1q', 'ask2q', 'ask3q', 
                   'ask4q', 'ask5q', 'ask6q','ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid10n', 'bid9n', 'bid8n',
                   'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 'ask1n', 'ask2n', 'ask3n', 
                   'ask4n', 'ask5n', 'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'total_bid_quantity', 'total_ask_quantity']
            mbd1 = mbd.drop_duplicates(cols, keep='first')
            mbd = mbd1[cols+['ApplSeqNum']]
            if 'ApplSeqNum' in l2.columns:
                l2 = l2[list(l2.columns[l2.columns != 'ApplSeqNum'])]
            rl2 = pd.merge(l2, mbd, on=cols, how='left')
            try:
                assert(rl2[(rl2['ApplSeqNum'].isnull()) & (rl2['cum_volume'] > 0) & (rl2['time'] <= 145655000000)].shape[0] == 0)
            except:
                print(rl2[(rl2['ApplSeqNum'].isnull()) & (rl2['cum_volume'] > 0) & (rl2['time'] <= 145655000000)][['skey', 'date', 'time', 'cum_volume', 'close', 'bid1p', 'bid2p','bid1q', 'bid2q', 'ask1p', 'ask2p', 'ask1q', 'ask2q']])
                print(mbd1.tail(1)[['skey', 'date', 'time', 'cum_volume', 'close', 'bid1p', 'bid2p','bid1q', 'bid2q', 'ask1p', 'ask2p', 'ask1q', 'ask2q']])
            rl2.loc[rl2['ApplSeqNum'].isnull(), 'ApplSeqNum'] = -1
            rl2['ApplSeqNum'] = rl2['ApplSeqNum'].astype('int32') 
            assert(rl2.shape[0] == l2.shape[0])
            db1.write('md_snapshot_l2', rl2)
        print(datetime.datetime.now() - startTm)
    else:
        continue

0:00:02.146038
20201102
         skey      date          time  cum_volume  close  bid1p  bid2p  bid1q  \
3482  2002995  20201102  145654000000     1042043  62.89  62.88  62.79    500   

      bid2q  ask1p  ask2p  ask1q  ask2q  
3482    700  62.89   62.9   6200    100  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
10021  2002995  20201102  145652870000     1041943  62.91  62.89  62.88   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
10021    100    500   62.9  62.91    100   5000  
       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
80  2300900  20201102  92503000000      182100  69.88  69.86   69.8    400   
81  2300900  20201102  92603000000      182100  69.88  69.86   69.8    400   
82  2300900  20201102  92703000000      182100  69.88  69.86   69.8    400   
83  2300900  20201102  92803000000      182100  69.88  69.86   69.8    400   
84  2300900  20201102  92903000000      182100  69.88  69.86   69.8    400   

    bid2q

       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
80  2300902  20201104  92503000000      241424  54.22  54.22  54.19   5276   
81  2300902  20201104  92603000000      241424  54.22  54.22  54.19   5276   
82  2300902  20201104  92703000000      241424  54.22  54.22  54.19   5276   
83  2300902  20201104  92803000000      241424  54.22  54.22  54.19   5276   
84  2300902  20201104  92903000000      241424  54.22  54.22  54.19   5276   

    bid2q  ask1p  ask2p  ask1q  ask2q  
80   1600   54.3  54.37   1200    300  
81   1600   54.3  54.37   1200    300  
82   1600   54.3  54.37   1200    300  
83   1600   54.3  54.37   1200    300  
84   1600   54.3  54.37   1200    300  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
82781  2300902  20201104  145654830000    14561362  56.96  56.96   56.9   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
82781   4100    100  56.97  56.98   2100    700  
1:00:14.404963
20201105
         skey   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


1:20:28.990578
20201106
2300903
       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
80  2300905  20201106  92503000000      167200  94.01   94.0  93.99   3400   
81  2300905  20201106  92603000000      167200  94.01   94.0  93.99   3400   
82  2300905  20201106  92703000000      167200  94.01   94.0  93.99   3400   
83  2300905  20201106  92803000000      167200  94.01   94.0  93.99   3400   
84  2300905  20201106  92903000000      167200  94.01   94.0  93.99   3400   

    bid2q  ask1p  ask2p  ask1q  ask2q  
80    200  94.01  94.04   1200    200  
81    200  94.01  94.04   1200    200  
82    200  94.01  94.04   1200    200  
83    200  94.01  94.04   1200    200  
84    200  94.01  94.04   1200    200  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
83999  2300905  20201106  145654960000     6502521  97.19  97.19  97.15   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
83999   2500    200  97.21  97.23   3400    100  
       s

       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
71  2300906  20201111  92503000000       96100  63.01  63.01   63.0   2800   
72  2300906  20201111  92603000000       96100  63.01  63.01   63.0   2800   
73  2300906  20201111  92703000000       96100  63.01  63.01   63.0   2800   
74  2300906  20201111  92803000000       96100  63.01  63.01   63.0   2800   
75  2300906  20201111  92903000000       96100  63.01  63.01   63.0   2800   

    bid2q  ask1p  ask2p  ask1q  ask2q  
71  17400   63.3   63.5    300    400  
72  17400   63.3   63.5    300    400  
73  17400   63.3   63.5    300    400  
74  17400   63.3   63.5    300    400  
75  17400   63.3   63.5    300    400  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
38322  2300906  20201111  145654490000     4826838  63.06  63.05  63.03   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
38322   3700    200  63.06  63.09   1200    373  
2:43:04.041392
20201112
2300489
       s

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
79  2300884  20201117  92503000000       71100   70.0  69.88  69.69    100   
80  2300884  20201117  92603000000       71100   70.0  69.88  69.69    100   
81  2300884  20201117  92703000000       71100   70.0  69.88  69.69    100   
82  2300884  20201117  92803000000       71100   70.0  69.88  69.69    100   
83  2300884  20201117  92903000000       71100   70.0  69.88  69.69    100   

    bid2q  ask1p  ask2p  ask1q  ask2q  
79    100   70.0  70.01   5408    300  
80    100   70.0  70.01   5408    300  
81    100   70.0  70.01   5408    300  
82    100   70.0  70.01   5408    300  
83    100   70.0  70.01   5408    300  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
94169  2300884  20201117  145654820000    13052102   74.8  74.71  74.59   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
94169    300    100   74.8  74.82   8900   7907  
3:59:42.516665
20201118
       skey     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
79  2300909  20201123  92503000000       63459  80.31  80.31   80.3   2341   
80  2300909  20201123  92603000000       63459  80.31  80.31   80.3   2341   
81  2300909  20201123  92703000000       63459  80.31  80.31   80.3   2341   
82  2300909  20201123  92803000000       63459  80.31  80.31   80.3   2341   
83  2300909  20201123  92903000000       63459  80.31  80.31   80.3   2341   

    bid2q  ask1p  ask2p  ask1q  ask2q  
79    500   80.5  80.69   1000    100  
80    500   80.5  80.69   1000    100  
81    500   80.5  80.69   1000    100  
82    500   80.5  80.69   1000    100  
83    500   80.5  80.69   1000    100  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
55845  2300909  20201123  145654890000     6715830  78.19  78.15  78.13   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
55845   2153    100  78.18  78.24   3700   8177  
5:16:54.054729
20201124
       skey     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


       skey      date         time  cum_volume   close   bid1p   bid2p  bid1q  \
76  2300908  20201126  92503000000       64000  115.69  115.69  115.68   6700   
77  2300908  20201126  92603000000       64000  115.69  115.69  115.68   6700   
78  2300908  20201126  92703000000       64000  115.69  115.69  115.68   6700   
79  2300908  20201126  92803000000       64000  115.69  115.69  115.68   6700   
80  2300908  20201126  92903000000       64000  115.69  115.69  115.68   6700   

    bid2q   ask1p   ask2p  ask1q  ask2q  
76   5100  115.86  115.87    300    100  
77   5100  115.86  115.87    300    100  
78   5100  115.86  115.87    300    100  
79   5100  115.86  115.87    300    100  
80   5100  115.86  115.87    300    100  
          skey      date          time  cum_volume  close  bid1p   bid2p  \
94647  2300908  20201126  145654800000    12393785  115.0  114.8  114.77   

       bid1q  bid2q   ask1p  ask2p  ask1q  ask2q  
94647   3400    100  114.93  115.0    200   4800  
6:14:4

       skey      date         time  cum_volume  close   bid1p  bid2p  bid1q  \
82  2300916  20201202  92503000000      318200  186.0  185.89  185.8   3100   
83  2300916  20201202  92603000000      318200  186.0  185.89  185.8   3100   
84  2300916  20201202  92703000000      318200  186.0  185.89  185.8   3100   
85  2300916  20201202  92803000000      318200  186.0  185.89  185.8   3100   
86  2300916  20201202  92903000000      318200  186.0  185.89  185.8   3100   

    bid2q  ask1p   ask2p  ask1q  ask2q  
82    100  186.0  186.33   8400    100  
83    100  186.0  186.33   8400    100  
84    100  186.0  186.33   8400    100  
85    100  186.0  186.33   8400    100  
86    100  186.0  186.33   8400    100  
          skey      date          time  cum_volume  close  bid1p   bid2p  \
99981  2300916  20201202  145654810000     6651645  165.0  164.9  164.89   

       bid1q  bid2q  ask1p   ask2p  ask1q  ask2q  
99981    100    100  165.0  165.01   9300    700  
7:32:22.013963
20201203


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


2300816
       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
81  2300911  20201204  92503000000      488400   57.5  57.44  57.43    200   
82  2300911  20201204  92603000000      488400   57.5  57.44  57.43    200   
83  2300911  20201204  92703000000      488400   57.5  57.44  57.43    200   
84  2300911  20201204  92803000000      488400   57.5  57.44  57.43    200   
85  2300911  20201204  92903000000      488400   57.5  57.44  57.43    200   

    bid2q  ask1p  ask2p  ask1q  ask2q  
81    300   57.5  57.56   5304   1700  
82    300   57.5  57.56   5304   1700  
83    300   57.5  57.56   5304   1700  
84    300   57.5  57.56   5304   1700  
85    300   57.5  57.56   5304   1700  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
94771  2300911  20201204  145654640000    13057782   56.0  55.97  55.96   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
94771   1000    400   56.0  56.02   5700    900  
       skey      date         ti

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


2300381
       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
77  2300911  20201208  92503000000       45700   52.8   52.7  52.68    400   
78  2300911  20201208  92603000000       45700   52.8   52.7  52.68    400   
79  2300911  20201208  92703000000       45700   52.8   52.7  52.68    400   
80  2300911  20201208  92803000000       45700   52.8   52.7  52.68    400   
81  2300911  20201208  92903000000       45700   52.8   52.7  52.68    400   

    bid2q  ask1p  ask2p  ask1q  ask2q  
77    100   52.8  52.83  46582   1000  
78    100   52.8  52.83  46582   1000  
79    100   52.8  52.83  46582   1000  
80    100   52.8  52.83  46582   1000  
81    100   52.8  52.83  46582   1000  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
48533  2300911  20201208  145654990000     7313095  53.46  53.46  53.45   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
48533   5000   3800  53.48  53.49   3000  11000  
       skey      date         ti

9:25:21.403188


In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


import pymongo
import io
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np


def DB(host, db_name, user, passwd, version=3):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name, version=version)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db', version=3):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'
        self.version = version

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = self.version
            ser_data = self.ser(df_seg, version)
            seg = {'ver': version, 'data': ser_data, 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None
    
    def read_raw(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        return collection.find(query)

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        elif version == 3:
            # 32-bit number needs more space than 64-bit for parquet
            for col_name in s.columns:
                col = s[col_name]
                if col.dtype == np.int32:
                    s[col_name] = col.astype(np.int64)
                elif col.dtype == np.uint32:
                    s[col_name] = col.astype(np.uint64)
            tbl = pa.Table.from_pandas(s)
            f = io.BytesIO()
            pq.write_table(tbl, f, use_dictionary=False, compression='ZSTD', compression_level=0)
            f.seek(0)
            data = f.read()
            return data
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)
        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        elif version == 3:
            f = io.BytesIO()
            f.write(s)
            f.seek(0)
            return pq.read_table(f, use_threads=False).to_pandas()
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()

def dailyDB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    url = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    client = pymongo.MongoClient(url, maxPoolSize=None)
    db = client[db_name]
    return db

def read_stock_daily(db, name, start_date=None, end_date=None, skey=None, index_name=None, interval=None, col=None, return_sdi=True):
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date'] + col
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_name is not None:
        query['index_name'] = {'$in': index_name}
    if start_date is not None:
        if end_date is not None:
            query['date'] = {'$gte': start_date, '$lte': end_date}
        else:
            query['date'] = {'$gte': start_date}
    elif end_date is not None:
        query['date'] = {'$lte': end_date}

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        df = pd.DataFrame()
    else:
        df = df.sort_values(by=['date', 'skey'])
    return df   

def read_memb_daily(db, name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, col=None, return_sdi=True):
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date', 'index_id'] + col
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_id is not None:
        query['index_id'] = {'$in': index_id}
    if interval is not None:
        query['interval'] = {'$in': interval}
    if start_date is not None:
        if end_date is not None:
            query['date'] = {'$gte': start_date, '$lte': end_date}
        else:
            query['date'] = {'$gte': start_date}
    elif end_date is not None:
        query['date'] = {'$lte': end_date}

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        df = pd.DataFrame()
    else:
        df = df.sort_values(by=['date', 'index_id', 'skey'])
    return df 



import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
pd.set_option("max_columns", 200)

year = "2020"
startDate = '20201211'
endDate = '20201231'
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"

startTm = datetime.datetime.now()
db1 = DB("192.168.10.178", database_name, user, password)
db2 = dailyDB("192.168.10.178", database_name, user, password)
save = {}
save['date'] = []
save['secid'] = []
mdOrderLog = db1.read('md_order', start_date=startDate, end_date=endDate, symbol=[2000001])
datelist = mdOrderLog['date'].unique()
ss = pd.read_csv('/mnt/ShareWithServer/result/shangshi1.csv')
ss = ss[ss['证券代码↑'] != 'T00018.SH']
ss['skey'] = np.where(ss['证券代码↑'].str[-2:] == 'SZ', ss['证券代码↑'].str[:6].astype(int) + 2000000, ss['证券代码↑'].str[:6].astype(int) + 1000000)
ss['date'] = (ss['上市日期'].str[:4] + ss['上市日期'].str[5:7] + ss['上市日期'].str[8:10]).astype(int)
print(datetime.datetime.now() - startTm)

startTm = datetime.datetime.now()
for d in datelist:
    print(d)
    sl1 = db1.read('md_order', start_date=str(d), end_date=str(d))['skey'].unique()
    data1 = db1.read('md_snapshot_l2', start_date=str(d), end_date=str(d), symbol=list(sl1))
    assert(len(sl1) == data1['skey'].nunique())
    op = read_stock_daily(db2, 'mdbar1d_tr', start_date=int(d), end_date=int(d))

    if len(sl1) != 0:
        for s in sl1:
            mbd = db1.read('md_snapshot_mbd', start_date=str(d), end_date=str(d), symbol=s)
            if mbd is None:
                if ss[ss['skey'] == s]['date'].iloc[0] == d:
                    l2 = data1[data1['skey'] == s]
                    l2['ApplSeqNum'] = -1
                    l2['ApplSeqNum'] = l2['ApplSeqNum'].astype('int32') 
                    db1.write('md_snapshot_l2', l2)
                    continue
                else:
                    save['date'].append(d)
                    save['secid'].append(s)
                    print(s)
                    continue
            try:
                assert(mbd.shape[1] == 82)
            except:
                print('mdb data column unupdated')
                print(s)
            try:
                op1 = op[op['skey'] == s]['open'].iloc[0]
                assert(mbd[mbd['cum_volume'] > 0]['open'].iloc[0] == op1)
            except:
                print('%s have no information in mdbar1d_tr' % str(s))
            l2 = data1[data1['skey'] == s]
            cols = ['skey', 'date', 'cum_volume', 'prev_close', 'open', 'close', 'cum_trades_cnt', 'bid10p', 'bid9p',
                   'bid8p', 'bid7p', 'bid6p', 'bid5p', 'bid4p', 'bid3p', 'bid2p', 'bid1p', 'ask1p', 'ask2p',
                   'ask3p', 'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p', 'bid10q', 'bid9q', 
                   'bid8q', 'bid7q', 'bid6q', 'bid5q', 'bid4q', 'bid3q', 'bid2q', 'bid1q', 'ask1q', 'ask2q', 'ask3q', 
                   'ask4q', 'ask5q', 'ask6q','ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid10n', 'bid9n', 'bid8n',
                   'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 'ask1n', 'ask2n', 'ask3n', 
                   'ask4n', 'ask5n', 'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'total_bid_quantity', 'total_ask_quantity']
            mbd1 = mbd.drop_duplicates(cols, keep='first')
            mbd = mbd1[cols+['ApplSeqNum']]
            if 'ApplSeqNum' in l2.columns:
                l2 = l2[list(l2.columns[l2.columns != 'ApplSeqNum'])]
            rl2 = pd.merge(l2, mbd, on=cols, how='left')
            try:
                assert(rl2[(rl2['ApplSeqNum'].isnull()) & (rl2['cum_volume'] > 0) & (rl2['time'] <= 145655000000)].shape[0] == 0)
            except:
                print(rl2[(rl2['ApplSeqNum'].isnull()) & (rl2['cum_volume'] > 0) & (rl2['time'] <= 145655000000)][['skey', 'date', 'time', 'cum_volume', 'close', 'bid1p', 'bid2p','bid1q', 'bid2q', 'ask1p', 'ask2p', 'ask1q', 'ask2q']])
                print(mbd1.tail(1)[['skey', 'date', 'time', 'cum_volume', 'close', 'bid1p', 'bid2p','bid1q', 'bid2q', 'ask1p', 'ask2p', 'ask1q', 'ask2q']])
            rl2.loc[rl2['ApplSeqNum'].isnull(), 'ApplSeqNum'] = -1
            rl2['ApplSeqNum'] = rl2['ApplSeqNum'].astype('int32') 
            assert(rl2.shape[0] == l2.shape[0])
            db1.write('md_snapshot_l2', rl2)
        print(datetime.datetime.now() - startTm)
    else:
        continue

0:00:02.219341
20201211


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


         skey      date          time  cum_volume  close  bid1p  bid2p  bid1q  \
4768  2300568  20201211  145230000000    19499891  25.48  25.48  25.47   2700   
4769  2300568  20201211  145233000000    19502691  25.48  25.48  25.47   4000   
4770  2300568  20201211  145236000000    19512691  25.47  25.47  25.46   7500   
4771  2300568  20201211  145239000000    19517191  25.50  25.48  25.47   2200   
4772  2300568  20201211  145242000000    19523191  25.48  25.48  25.47    200   
...       ...       ...           ...         ...    ...    ...    ...    ...   
4852  2300568  20201211  145642000000    19906665  25.47  25.47  25.46    100   
4853  2300568  20201211  145645000000    19917265  25.45  25.46  25.45   1000   
4854  2300568  20201211  145648000000    19917865  25.48  25.48  25.47   8400   
4855  2300568  20201211  145651000000    19925265  25.48  25.48  25.47   1000   
4856  2300568  20201211  145654000000    19926765  25.47  25.47  25.46   1600   

      bid2q  ask1p  ask2p  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0:59:24.841126
20201216
2000995
1:18:58.645511
20201217
         skey      date          time  cum_volume  close  bid1p  bid2p  bid1q  \
3436  2002096  20201217  145654000000     3334058  11.97  11.97  11.95   2100   

      bid2q  ask1p  ask2p  ask1q  ask2q  
3436  26000  11.98  11.99   4400  16600  
         skey      date          time  cum_volume  close  bid1p  bid2p  bid1q  \
9137  2002096  20201217  145651560000     3333358  11.96  11.95  11.94  26000   

      bid2q  ask1p  ask2p  ask1q  ask2q  
9137   4300  11.97  11.98    700   4400  
1:38:37.926814
20201218


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


1:58:33.233876
20201221
         skey      date          time  cum_volume  close  bid1p  bid2p  bid1q  \
2564  2002209  20201221  145654000000     1685100   7.06   7.06   7.05   1800   

      bid2q  ask1p  ask2p  ask1q  ask2q  
2564  25600   7.07   7.08  11400  25500  
         skey      date          time  cum_volume  close  bid1p  bid2p  bid1q  \
8346  2002209  20201221  145646360000     1683700   7.06   7.05   7.04  25600   

      bid2q  ask1p  ask2p  ask1q  ask2q  
8346  12800   7.06   7.07   1400  11400  
       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
82  2300917  20201221  92503000000      816666   50.0   50.0  49.99  12634   
83  2300917  20201221  92603000000      816666   50.0   50.0  49.99  12634   
84  2300917  20201221  92703000000      816666   50.0   50.0  49.99  12634   
85  2300917  20201221  92803000000      816666   50.0   50.0  49.99  12634   
86  2300917  20201221  92903000000      816666   50.0   50.0  49.99  12634   

    bid2q  as

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
83  2300921  20201222  92503000000      156800   70.0   69.0  68.99    500   
84  2300921  20201222  92603000000      156800   70.0   69.0  68.99    500   
85  2300921  20201222  92703000000      156800   70.0   69.0  68.99    500   
86  2300921  20201222  92803000000      156800   70.0   69.0  68.99    500   
87  2300921  20201222  92903000000      156800   70.0   69.0  68.99    500   

    bid2q  ask1p  ask2p  ask1q  ask2q  
83    200   70.0  70.01   3100    500  
84    200   70.0  70.01   3100    500  
85    200   70.0  70.01   3100    500  
86    200   70.0  70.01   3100    500  
87    200   70.0  70.01   3100    500  
           skey      date          time  cum_volume  close  bid1p  bid2p  \
113942  2300921  20201222  145654980000     8803336  61.51   61.5  61.45   

        bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
113942    600    200  61.51  61.53   3200    500  
2:39:42.163473
20201223
         ske

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
78  2300921  20201223  92503000000      106400   55.0  54.98  54.78    100   
79  2300921  20201223  92603000000      106400   55.0  54.98  54.78    100   
80  2300921  20201223  92703000000      106400   55.0  54.98  54.78    100   
81  2300921  20201223  92803000000      106400   55.0  54.98  54.78    100   
82  2300921  20201223  92903000000      106400   55.0  54.98  54.78    100   

    bid2q  ask1p  ask2p  ask1q  ask2q  
78    100   55.0  55.01   1500   4400  
79    100   55.0  55.01   1500   4400  
80    100   55.0  55.01   1500   4400  
81    100   55.0  55.01   1500   4400  
82    100   55.0  55.01   1500   4400  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
64224  2300921  20201223  145654690000     6878980  54.63  54.62  54.61   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
64224   2200   1300  54.63  54.66    100    400  
3:01:18.330737
20201224


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


2300917
       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
79  2300918  20201224  92503000000      549611   14.5   14.5  14.49  21289   
80  2300918  20201224  92603000000      549611   14.5   14.5  14.49  21289   
81  2300918  20201224  92703000000      549611   14.5   14.5  14.49  21289   
82  2300918  20201224  92803000000      549611   14.5   14.5  14.49  21289   
83  2300918  20201224  92903000000      549611   14.5   14.5  14.49  21289   

    bid2q  ask1p  ask2p  ask1q  ask2q  
79   6400  14.55  14.59    400    400  
80   6400  14.55  14.59    400    400  
81   6400  14.55  14.59    400    400  
82   6400  14.55  14.59    400    400  
83   6400  14.55  14.59    400    400  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
96554  2300918  20201224  145654980000    37854019  15.57  15.54  15.52   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
96554    800    200  15.56  15.57   1500  10200  
2300919
       skey      date   

       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
80  2300923  20201225  92503000000      172300   44.0  43.99  43.92    200   
81  2300923  20201225  92603000000      172300   44.0  43.99  43.92    200   
82  2300923  20201225  92703000000      172300   44.0  43.99  43.92    200   
83  2300923  20201225  92803000000      172300   44.0  43.99  43.92    200   
84  2300923  20201225  92903000000      172300   44.0  43.99  43.92    200   

    bid2q  ask1p  ask2p  ask1q  ask2q  
80    100   44.0  44.11  18900    300  
81    100   44.0  44.11  18900    300  
82    100   44.0  44.11  18900    300  
83    100   44.0  44.11  18900    300  
84    100   44.0  44.11  18900    300  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
69532  2300923  20201225  145654880000     6765848  44.88  44.88  44.87   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
69532   6500    100  44.91  44.92    100   2700  
3:42:20.111948
20201228


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
78  2300918  20201228  92503000000      319913  13.66  13.66  13.65   3087   
79  2300918  20201228  92603000000      319913  13.66  13.66  13.65   3087   
80  2300918  20201228  92703000000      319913  13.66  13.66  13.65   3087   
81  2300918  20201228  92803000000      319913  13.66  13.66  13.65   3087   
82  2300918  20201228  92903000000      319913  13.66  13.66  13.65   3087   

    bid2q  ask1p  ask2p  ask1q  ask2q  
78  18700  13.67  13.68    500    700  
79  18700  13.67  13.68    500    700  
80  18700  13.67  13.68    500    700  
81  18700  13.67  13.68    500    700  
82  18700  13.67  13.68    500    700  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
68781  2300918  20201228  145654940000    27767820  13.71  13.71   13.7   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
68781   4300  55100  13.72  13.73   2500   6503  
       skey      date         time  cum_

4:24:38.592925
20201230
       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
68  2300920  20201230  92503000000       51400  37.52  37.51   37.5   1800   
69  2300920  20201230  92603000000       51400  37.52  37.51   37.5   1800   
70  2300920  20201230  92703000000       51400  37.52  37.51   37.5   1800   
71  2300920  20201230  92803000000       51400  37.52  37.51   37.5   1800   
72  2300920  20201230  92903000000       51400  37.52  37.51   37.5   1800   

    bid2q  ask1p  ask2p  ask1q  ask2q  
68  11100  37.52   37.6   7700    100  
69  11100  37.52   37.6   7700    100  
70  11100  37.52   37.6   7700    100  
71  11100  37.52   37.6   7700    100  
72  11100  37.52   37.6   7700    100  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
41559  2300920  20201230  145654980000     6891011  38.81  38.81  38.78   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
41559  11800    924  38.83  38.84   3500   1100  
       skey     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


4:44:43.461983
20201231
         skey      date          time  cum_volume  close  bid1p  bid2p  bid1q  \
1170  2002207  20201231  145654000000      772200   6.09   6.09   6.08   2800   

      bid2q  ask1p  ask2p  ask1q  ask2q  
1170   8800   6.12   6.13   1400  50420  
         skey      date          time  cum_volume  close  bid1p  bid2p  bid1q  \
2058  2002207  20201231  145642730000      769200   6.08   6.08   6.07   8800   

      bid2q  ask1p  ask2p  ask1q  ask2q  
2058   6800   6.09   6.12   3000   1400  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
79  2300922  20201231  92503000000      125490   37.3   37.3  37.29   1410   
80  2300922  20201231  92603000000      125490   37.3   37.3  37.29   1410   
81  2300922  20201231  92703000000      125490   37.3   37.3  37.29   1410   
82  2300922  20201231  92803000000      125490   37.3   37.3  37.29   1410   
83  2300922  20201231  92903000000      125490   37.3   37.3  37.29   1410   

    bid2q  ask1p  ask2p  ask1q  ask2q  
79    100  37.44   37.5   1100   1600  
80    100  37.44   37.5   1100   1600  
81    100  37.44   37.5   1100   1600  
82    100  37.44   37.5   1100   1600  
83    100  37.44   37.5   1100   1600  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
58572  2300922  20201231  145654990000    11880727  38.74   38.7  38.67   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
58572   8200   1300  38.74  38.75    900    500  
       skey      date         time  cum_

In [2]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


import pymongo
import io
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np


def DB(host, db_name, user, passwd, version=3):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name, version=version)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db', version=3):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'
        self.version = version

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = self.version
            ser_data = self.ser(df_seg, version)
            seg = {'ver': version, 'data': ser_data, 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None
    
    def read_raw(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        return collection.find(query)

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        elif version == 3:
            # 32-bit number needs more space than 64-bit for parquet
            for col_name in s.columns:
                col = s[col_name]
                if col.dtype == np.int32:
                    s[col_name] = col.astype(np.int64)
                elif col.dtype == np.uint32:
                    s[col_name] = col.astype(np.uint64)
            tbl = pa.Table.from_pandas(s)
            f = io.BytesIO()
            pq.write_table(tbl, f, use_dictionary=False, compression='ZSTD', compression_level=0)
            f.seek(0)
            data = f.read()
            return data
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)
        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        elif version == 3:
            f = io.BytesIO()
            f.write(s)
            f.seek(0)
            return pq.read_table(f, use_threads=False).to_pandas()
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()

def dailyDB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    url = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    client = pymongo.MongoClient(url, maxPoolSize=None)
    db = client[db_name]
    return db

def read_stock_daily(db, name, start_date=None, end_date=None, skey=None, index_name=None, interval=None, col=None, return_sdi=True):
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date'] + col
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_name is not None:
        query['index_name'] = {'$in': index_name}
    if start_date is not None:
        if end_date is not None:
            query['date'] = {'$gte': start_date, '$lte': end_date}
        else:
            query['date'] = {'$gte': start_date}
    elif end_date is not None:
        query['date'] = {'$lte': end_date}

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        df = pd.DataFrame()
    else:
        df = df.sort_values(by=['date', 'skey'])
    return df   

def read_memb_daily(db, name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, col=None, return_sdi=True):
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date', 'index_id'] + col
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_id is not None:
        query['index_id'] = {'$in': index_id}
    if interval is not None:
        query['interval'] = {'$in': interval}
    if start_date is not None:
        if end_date is not None:
            query['date'] = {'$gte': start_date, '$lte': end_date}
        else:
            query['date'] = {'$gte': start_date}
    elif end_date is not None:
        query['date'] = {'$lte': end_date}

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        df = pd.DataFrame()
    else:
        df = df.sort_values(by=['date', 'index_id', 'skey'])
    return df 



import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
pd.set_option("max_columns", 200)

year = "2020"
startDate = '20200814'
endDate = '20200814'
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"

startTm = datetime.datetime.now()
db1 = DB("192.168.10.178", database_name, user, password)
db2 = dailyDB("192.168.10.178", database_name, user, password)
save = {}
save['date'] = []
save['secid'] = []
mdOrderLog = db1.read('md_order', start_date=startDate, end_date=endDate, symbol=[2000001])
datelist = mdOrderLog['date'].unique()
ss = pd.read_csv('/mnt/ShareWithServer/result/shangshi1.csv')
ss = ss[ss['证券代码↑'] != 'T00018.SH']
ss['skey'] = np.where(ss['证券代码↑'].str[-2:] == 'SZ', ss['证券代码↑'].str[:6].astype(int) + 2000000, ss['证券代码↑'].str[:6].astype(int) + 1000000)
ss['date'] = (ss['上市日期'].str[:4] + ss['上市日期'].str[5:7] + ss['上市日期'].str[8:10]).astype(int)
print(datetime.datetime.now() - startTm)

startTm = datetime.datetime.now()
for d in datelist:
    print(d)
    sl1 = db1.read('md_order', start_date=str(d), end_date=str(d))['skey'].unique()
    data1 = db1.read('md_snapshot_l2', start_date=str(d), end_date=str(d), symbol=list(sl1))
    assert(len(sl1) == data1['skey'].nunique())
    op = read_stock_daily(db2, 'mdbar1d_tr', start_date=int(d), end_date=int(d))

    if len(sl1) != 0:
        for s in sl1:
            mbd = db1.read('md_snapshot_mbd', start_date=str(d), end_date=str(d), symbol=s)
            if mbd is None:
                if ss[ss['skey'] == s]['date'].iloc[0] == d:
                    l2 = data1[data1['skey'] == s]
                    l2['ApplSeqNum'] = -1
                    l2['ApplSeqNum'] = l2['ApplSeqNum'].astype('int32') 
                    db1.write('md_snapshot_l2', l2)
                    continue
                else:
                    save['date'].append(d)
                    save['secid'].append(s)
                    print(s)
                    continue
            try:
                assert(mbd.shape[1] == 82)
            except:
                print('mdb data column unupdated')
                print(s)
            try:
                op1 = op[op['skey'] == s]['open'].iloc[0]
                assert(mbd[mbd['cum_volume'] > 0]['open'].iloc[0] == op1)
            except:
                print('%s have no information in mdbar1d_tr' % str(s))
            l2 = data1[data1['skey'] == s]
            cols = ['skey', 'date', 'cum_volume', 'prev_close', 'open', 'close', 'cum_trades_cnt', 'bid10p', 'bid9p',
                   'bid8p', 'bid7p', 'bid6p', 'bid5p', 'bid4p', 'bid3p', 'bid2p', 'bid1p', 'ask1p', 'ask2p',
                   'ask3p', 'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p', 'bid10q', 'bid9q', 
                   'bid8q', 'bid7q', 'bid6q', 'bid5q', 'bid4q', 'bid3q', 'bid2q', 'bid1q', 'ask1q', 'ask2q', 'ask3q', 
                   'ask4q', 'ask5q', 'ask6q','ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid10n', 'bid9n', 'bid8n',
                   'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 'ask1n', 'ask2n', 'ask3n', 
                   'ask4n', 'ask5n', 'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'total_bid_quantity', 'total_ask_quantity']
            mbd1 = mbd.drop_duplicates(cols, keep='first')
            mbd = mbd1[cols+['ApplSeqNum']]
            if 'ApplSeqNum' in l2.columns:
                l2 = l2[list(l2.columns[l2.columns != 'ApplSeqNum'])]
            rl2 = pd.merge(l2, mbd, on=cols, how='left')
            try:
                assert(rl2[(rl2['ApplSeqNum'].isnull()) & (rl2['cum_volume'] > 0) & (rl2['time'] <= 145655000000)].shape[0] == 0)
            except:
                print(rl2[(rl2['ApplSeqNum'].isnull()) & (rl2['cum_volume'] > 0) & (rl2['time'] <= 145655000000)][['skey', 'date', 'time', 'cum_volume', 'close', 'bid1p', 'bid2p','bid1q', 'bid2q', 'ask1p', 'ask2p', 'ask1q', 'ask2q']])
                print(mbd1.tail(1)[['skey', 'date', 'time', 'cum_volume', 'close', 'bid1p', 'bid2p','bid1q', 'bid2q', 'ask1p', 'ask2p', 'ask1q', 'ask2q']])
            rl2.loc[rl2['ApplSeqNum'].isnull(), 'ApplSeqNum'] = -1
            rl2['ApplSeqNum'] = rl2['ApplSeqNum'].astype('int32') 
            assert(rl2.shape[0] == l2.shape[0])
            db1.write('md_snapshot_l2', rl2)
        print(datetime.datetime.now() - startTm)
    else:
        continue

0:00:00.132225
20200814
         skey      date          time  cum_volume  close  bid1p  bid2p  bid1q  \
2249  2300668  20200814  145654000000      865100  17.16  17.15  17.14   8100   

      bid2q  ask1p  ask2p  ask1q  ask2q  
2249    700  17.16  17.19  26900    500  
         skey      date          time  cum_volume  close  bid1p  bid2p  bid1q  \
4318  2300668  20200814  145649310000      862800  17.19  17.16  17.15   2300   

      bid2q  ask1p  ask2p  ask1q  ask2q  
4318   8100  17.19   17.2    500   2300  
0:20:34.377283
