In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


import pymongo
import io
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np


def DB(host, db_name, user, passwd, version=3):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name, version=version)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db', version=3):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'
        self.version = version

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = self.version
            ser_data = self.ser(df_seg, version)
            seg = {'ver': version, 'data': ser_data, 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None
    
    def read_raw(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        return collection.find(query)

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        elif version == 3:
            # 32-bit number needs more space than 64-bit for parquet
            for col_name in s.columns:
                col = s[col_name]
                if col.dtype == np.int32:
                    s[col_name] = col.astype(np.int64)
                elif col.dtype == np.uint32:
                    s[col_name] = col.astype(np.uint64)
            tbl = pa.Table.from_pandas(s)
            f = io.BytesIO()
            pq.write_table(tbl, f, use_dictionary=False, compression='ZSTD', compression_level=0)
            f.seek(0)
            data = f.read()
            return data
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)
        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        elif version == 3:
            f = io.BytesIO()
            f.write(s)
            f.seek(0)
            return pq.read_table(f, use_threads=False).to_pandas()
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()

def dailyDB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    url = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    client = pymongo.MongoClient(url, maxPoolSize=None)
    db = client[db_name]
    return db

def read_stock_daily(db, name, start_date=None, end_date=None, skey=None, index_name=None, interval=None, col=None, return_sdi=True):
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date'] + col
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_name is not None:
        query['index_name'] = {'$in': index_name}
    if start_date is not None:
        if end_date is not None:
            query['date'] = {'$gte': start_date, '$lte': end_date}
        else:
            query['date'] = {'$gte': start_date}
    elif end_date is not None:
        query['date'] = {'$lte': end_date}

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        df = pd.DataFrame()
    else:
        df = df.sort_values(by=['date', 'skey'])
    return df   

def read_memb_daily(db, name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, col=None, return_sdi=True):
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date', 'index_id'] + col
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_id is not None:
        query['index_id'] = {'$in': index_id}
    if interval is not None:
        query['interval'] = {'$in': interval}
    if start_date is not None:
        if end_date is not None:
            query['date'] = {'$gte': start_date, '$lte': end_date}
        else:
            query['date'] = {'$gte': start_date}
    elif end_date is not None:
        query['date'] = {'$lte': end_date}

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        df = pd.DataFrame()
    else:
        df = df.sort_values(by=['date', 'index_id', 'skey'])
    return df 



import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
pd.set_option("max_columns", 200)

year = "2020"
startDate = '20200916'
endDate = '20201015'
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"

startTm = datetime.datetime.now()
db1 = DB("192.168.10.178", database_name, user, password)
db2 = dailyDB("192.168.10.178", database_name, user, password)
save = {}
save['date'] = []
save['secid'] = []
mdOrderLog = db1.read('md_order', start_date=startDate, end_date=endDate, symbol=[2000001])
datelist = mdOrderLog['date'].unique()
ss = pd.read_csv('/mnt/ShareWithServer/result/shangshi1.csv')
ss = ss[ss['证券代码↑'] != 'T00018.SH']
ss['skey'] = np.where(ss['证券代码↑'].str[-2:] == 'SZ', ss['证券代码↑'].str[:6].astype(int) + 2000000, ss['证券代码↑'].str[:6].astype(int) + 1000000)
ss['date'] = (ss['上市日期'].str[:4] + ss['上市日期'].str[5:7] + ss['上市日期'].str[8:10]).astype(int)
print(datetime.datetime.now() - startTm)

startTm = datetime.datetime.now()
for d in datelist:
    print(d)
    sl1 = db1.read('md_order', start_date=str(d), end_date=str(d))['skey'].unique()
    data1 = db1.read('md_snapshot_l2', start_date=str(d), end_date=str(d), symbol=list(sl1))
    assert(len(sl1) == data1['skey'].nunique())
    op = read_stock_daily(db2, 'mdbar1d_tr', start_date=int(d), end_date=int(d))

    if len(sl1) != 0:
        for s in sl1:
            mbd = db1.read('md_snapshot_mbd', start_date=str(d), end_date=str(d), symbol=s)
            if mbd is None:
                if ss[ss['skey'] == s]['date'].iloc[0] == d:
                    l2 = data1[data1['skey'] == s]
                    l2['ApplSeqNum'] = -1
                    l2['ApplSeqNum'] = l2['ApplSeqNum'].astype('int32') 
                    db1.write('md_snapshot_l2', l2)
                    continue
                else:
                    save['date'].append(d)
                    save['secid'].append(s)
                    print(s)
                    continue
            try:
                assert(mbd.shape[1] == 82)
            except:
                print('mdb data column unupdated')
                print(s)
            try:
                op1 = op[op['skey'] == s]['open'].iloc[0]
                assert(mbd[mbd['cum_volume'] > 0]['open'].iloc[0] == op1)
            except:
                print('%s have no information in mdbar1d_tr' % str(s))
            l2 = data1[data1['skey'] == s]
            cols = ['skey', 'date', 'cum_volume', 'prev_close', 'open', 'close', 'cum_trades_cnt', 'bid10p', 'bid9p',
                   'bid8p', 'bid7p', 'bid6p', 'bid5p', 'bid4p', 'bid3p', 'bid2p', 'bid1p', 'ask1p', 'ask2p',
                   'ask3p', 'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p', 'bid10q', 'bid9q', 
                   'bid8q', 'bid7q', 'bid6q', 'bid5q', 'bid4q', 'bid3q', 'bid2q', 'bid1q', 'ask1q', 'ask2q', 'ask3q', 
                   'ask4q', 'ask5q', 'ask6q','ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid10n', 'bid9n', 'bid8n',
                   'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 'ask1n', 'ask2n', 'ask3n', 
                   'ask4n', 'ask5n', 'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'total_bid_quantity', 'total_ask_quantity']
            mbd1 = mbd.drop_duplicates(cols, keep='first')
            mbd = mbd1[cols+['ApplSeqNum']]
            if 'ApplSeqNum' in l2.columns:
                l2 = l2[list(l2.columns[l2.columns != 'ApplSeqNum'])]
            rl2 = pd.merge(l2, mbd, on=cols, how='left')
            try:
                assert(rl2[(rl2['ApplSeqNum'].isnull()) & (rl2['cum_volume'] > 0) & (rl2['time'] <= 145655000000)].shape[0] == 0)
            except:
                print(rl2[(rl2['ApplSeqNum'].isnull()) & (rl2['cum_volume'] > 0) & (rl2['time'] <= 145655000000)][['skey', 'date', 'time', 'cum_volume', 'close', 'bid1p', 'bid2p','bid1q', 'bid2q', 'ask1p', 'ask2p', 'ask1q', 'ask2q']])
                print(mbd1.tail(1)[['skey', 'date', 'time', 'cum_volume', 'close', 'bid1p', 'bid2p','bid1q', 'bid2q', 'ask1p', 'ask2p', 'ask1q', 'ask2q']])
            rl2.loc[rl2['ApplSeqNum'].isnull(), 'ApplSeqNum'] = -1
            rl2['ApplSeqNum'] = rl2['ApplSeqNum'].astype('int32') 
            assert(rl2.shape[0] == l2.shape[0])
            db1.write('md_snapshot_l2', rl2)
        print(datetime.datetime.now() - startTm)
    else:
        continue

0:00:01.378231
20200916
       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
75  2300882  20200916  92503000000      161740  32.05  32.05  32.03    160   
76  2300882  20200916  92603000000      161740  32.05  32.05  32.03    160   
77  2300882  20200916  92703000000      161740  32.05  32.05  32.03    160   
78  2300882  20200916  92803000000      161740  32.05  32.05  32.03    160   
79  2300882  20200916  92903000000      161740  32.05  32.05  32.03    160   

    bid2q  ask1p  ask2p  ask1q  ask2q  
75   3500  32.06  32.08    700    500  
76   3500  32.06  32.08    700    500  
77   3500  32.06  32.08    700    500  
78   3500  32.06  32.08    700    500  
79   3500  32.06  32.08    700    500  
           skey      date          time  cum_volume  close  bid1p  bid2p  \
113952  2300882  20200916  145654890000    21044177   36.2  36.19  36.18   

        bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
113952   3800    300   36.2  36.22   5200   5700  
       skey 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
82  2300889  20200916  92503000000      857386   58.0   58.0  57.98  30914   
83  2300889  20200916  92603000000      857386   58.0   58.0  57.98  30914   
84  2300889  20200916  92703000000      857386   58.0   58.0  57.98  30914   
85  2300889  20200916  92803000000      857386   58.0   58.0  57.98  30914   
86  2300889  20200916  92903000000      857386   58.0   58.0  57.98  30914   

    bid2q  ask1p  ask2p  ask1q  ask2q  
82    300  58.01  58.02    700    500  
83    300  58.01  58.02    700    500  
84    300  58.01  58.02    700    500  
85    300  58.01  58.02    700    500  
86    300  58.01  58.02    700    500  
           skey      date          time  cum_volume  close  bid1p  bid2p  \
248815  2300889  20200916  145654990000    25862760  64.18  64.11   64.1   

        bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
248815   1300   3596  64.18  64.19    600   2100  
0:22:16.115464
20200917
       skey 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
83  2300889  20200917  92503000000      284100  58.11   58.1  58.08   1000   
84  2300889  20200917  92603000000      284100  58.11   58.1  58.08   1000   
85  2300889  20200917  92703000000      284100  58.11   58.1  58.08   1000   
86  2300889  20200917  92803000000      284100  58.11   58.1  58.08   1000   
87  2300889  20200917  92903000000      284100  58.11   58.1  58.08   1000   

    bid2q  ask1p  ask2p  ask1q  ask2q  
83    500  58.11  58.12     17    100  
84    500  58.11  58.12     17    100  
85    500  58.11  58.12     17    100  
86    500  58.11  58.12     17    100  
87    500  58.11  58.12     17    100  
           skey      date          time  cum_volume  close  bid1p  bid2p  \
198957  2300889  20200917  145654990000    20666809  54.89  54.88  54.85   

        bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
198957   8800   2400   54.9  54.91   7500  17621  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0:44:51.057336
20200918
       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
76  2300886  20200918  92503000000      118000  73.31  73.28  73.16    200   
77  2300886  20200918  92603000000      118000  73.31  73.28  73.16    200   
78  2300886  20200918  92703000000      118000  73.31  73.28  73.16    200   
79  2300886  20200918  92803000000      118000  73.31  73.28  73.16    200   
80  2300886  20200918  92903000000      118000  73.31  73.28  73.16    200   

    bid2q  ask1p  ask2p  ask1q  ask2q  
76    300  73.31  73.45    300   1500  
77    300  73.31  73.45    300   1500  
78    300  73.31  73.45    300   1500  
79    300  73.31  73.45    300   1500  
80    300  73.31  73.45    300   1500  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
50221  2300886  20200918  145654800000     6576429   69.9  69.89  69.88   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
50221    557    100  69.92  69.98    500    300  
       skey     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


        skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
105  2300886  20200921  92503000000       43300  68.76  68.71   68.7    700   
106  2300886  20200921  92603000000       43300  68.76  68.71   68.7    700   
107  2300886  20200921  92703000000       43300  68.76  68.71   68.7    700   
108  2300886  20200921  92803000000       43300  68.76  68.71   68.7    700   
109  2300886  20200921  92903000000       43300  68.76  68.71   68.7    700   

     bid2q  ask1p  ask2p  ask1q  ask2q  
105    100  68.76  68.79    251   1200  
106    100  68.76  68.79    251   1200  
107    100  68.76  68.79    251   1200  
108    100  68.76  68.79    251   1200  
109    100  68.76  68.79    251   1200  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
30580  2300886  20200921  145654470000     3894347  70.49  70.49  70.47   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
30580   2914    500   70.5  70.52   3400    300  
        skey      date      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
71  2300886  20200922  92503000000       32200  68.86  68.76  68.62    300   
72  2300886  20200922  92603000000       32200  68.86  68.76  68.62    300   
73  2300886  20200922  92703000000       32200  68.86  68.76  68.62    300   
74  2300886  20200922  92803000000       32200  68.86  68.76  68.62    300   
75  2300886  20200922  92903000000       32200  68.86  68.76  68.62    300   

    bid2q  ask1p  ask2p  ask1q  ask2q  
71    100  68.86  68.89   3100    100  
72    100  68.86  68.89   3100    100  
73    100  68.86  68.89   3100    100  
74    100  68.86  68.89   3100    100  
75    100  68.86  68.89   3100    100  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
29774  2300886  20200922  145654990000     4210096  66.63   66.6  66.59   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
29774    700    300  66.63  66.74   1473   1052  
       skey      date         time  cum_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


       skey      date         time  cum_volume   close   bid1p  bid2p  bid1q  \
71  2300888  20200923  92503000000      135000  121.67  121.66  121.5   2300   
72  2300888  20200923  92603000000      135000  121.67  121.66  121.5   2300   
73  2300888  20200923  92703000000      135000  121.67  121.66  121.5   2300   
74  2300888  20200923  92803000000      135000  121.67  121.66  121.5   2300   
75  2300888  20200923  92903000000      135000  121.67  121.66  121.5   2300   

    bid2q   ask1p   ask2p  ask1q  ask2q  
71    200  121.67  121.68   2482    100  
72    200  121.67  121.68   2482    100  
73    200  121.67  121.68   2482    100  
74    200  121.67  121.68   2482    100  
75    200  121.67  121.68   2482    100  
          skey      date          time  cum_volume  close   bid1p   bid2p  \
80091  2300888  20200923  145654990000     9565978  118.9  118.89  118.88   

       bid1q  bid2q  ask1p   ask2p  ask1q  ask2q  
80091    300   3900  118.9  118.95   3863    100  
       ske

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


2:36:20.867115
20200925
       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
79  2300892  20200925  92503000000      194300   73.1  73.09  73.03    500   
80  2300892  20200925  92603000000      194300   73.1  73.09  73.03    500   
81  2300892  20200925  92703000000      194300   73.1  73.09  73.03    500   
82  2300892  20200925  92803000000      194300   73.1  73.09  73.03    500   
83  2300892  20200925  92903000000      194300   73.1  73.09  73.03    500   

    bid2q  ask1p  ask2p  ask1q  ask2q  
79    100   73.1  73.27   1952   1600  
80    100   73.1  73.27   1952   1600  
81    100   73.1  73.27   1952   1600  
82    100   73.1  73.27   1952   1600  
83    100   73.1  73.27   1952   1600  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
71787  2300892  20200925  145654930000    12387124  65.25   65.2   65.1   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
71787    100    100  65.24   65.3    142   1697  
       skey     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


       skey      date         time  cum_volume  close  bid1p  bid2p   bid1q  \
83  2300897  20200928  92503000000      237100   65.0   65.0   64.8  150400   
84  2300897  20200928  92603000000      237100   65.0   65.0   64.8  150400   
85  2300897  20200928  92703000000      237100   65.0   65.0   64.8  150400   
86  2300897  20200928  92803000000      237100   65.0   65.0   64.8  150400   
87  2300897  20200928  92903000000      237100   65.0   65.0   64.8  150400   

    bid2q  ask1p  ask2p  ask1q  ask2q  
83    100   65.1  65.17   3700    200  
84    100   65.1  65.17   3700    200  
85    100   65.1  65.17   3700    200  
86    100   65.1  65.17   3700    200  
87    100   65.1  65.17   3700    200  
           skey      date          time  cum_volume  close  bid1p  bid2p  \
118027  2300897  20200928  145654930000     8391586  58.48  58.36   58.3   

        bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
118027    400   1100  58.48  58.49    300    200  
3:14:23.656332
20200929
      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
71  2300892  20200930  92503000000      166042   55.3   55.3  55.28   1758   
72  2300892  20200930  92603000000      166042   55.3   55.3  55.28   1758   
73  2300892  20200930  92703000000      166042   55.3   55.3  55.28   1758   
74  2300892  20200930  92803000000      166042   55.3   55.3  55.28   1758   
75  2300892  20200930  92903000000      166042   55.3   55.3  55.28   1758   

    bid2q  ask1p  ask2p  ask1q  ask2q  
71    200   55.4  55.55    600    500  
72    200   55.4  55.55    600    500  
73    200   55.4  55.55    600    500  
74    200   55.4  55.55    600    500  
75    200   55.4  55.55    600    500  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
35170  2300892  20200930  145654910000     6105740  54.14  54.14  54.12   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
35170    700    300  54.18  54.19   1700   2100  
       skey      date         time  cum_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


4:54:17.557099
20201014
5:16:01.064915
20201015
       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
83  2300999  20201015  92503000000     7392100  48.96  48.92  48.91    500   
84  2300999  20201015  92603000000     7392100  48.96  48.92  48.91    500   
85  2300999  20201015  92703000000     7392100  48.96  48.92  48.91    500   
86  2300999  20201015  92803000000     7392100  48.96  48.92  48.91    500   
87  2300999  20201015  92903000000     7392100  48.96  48.92  48.91    500   

    bid2q  ask1p  ask2p  ask1q  ask2q  
83    100  48.96  48.97  16559   9700  
84    100  48.96  48.97  16559   9700  
85    100  48.96  48.97  16559   9700  
86    100  48.96  48.97  16559   9700  
87    100  48.96  48.97  16559   9700  
            skey      date          time  cum_volume  close  bid1p  bid2p  \
1202777  2300999  20201015  145654990000   251852871  57.95   57.9  57.88   

         bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
1202777    300   1700  57.95  57.97 

In [2]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


import pymongo
import io
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np


def DB(host, db_name, user, passwd, version=3):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name, version=version)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db', version=3):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'
        self.version = version

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = self.version
            ser_data = self.ser(df_seg, version)
            seg = {'ver': version, 'data': ser_data, 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None
    
    def read_raw(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        return collection.find(query)

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        elif version == 3:
            # 32-bit number needs more space than 64-bit for parquet
            for col_name in s.columns:
                col = s[col_name]
                if col.dtype == np.int32:
                    s[col_name] = col.astype(np.int64)
                elif col.dtype == np.uint32:
                    s[col_name] = col.astype(np.uint64)
            tbl = pa.Table.from_pandas(s)
            f = io.BytesIO()
            pq.write_table(tbl, f, use_dictionary=False, compression='ZSTD', compression_level=0)
            f.seek(0)
            data = f.read()
            return data
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)
        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        elif version == 3:
            f = io.BytesIO()
            f.write(s)
            f.seek(0)
            return pq.read_table(f, use_threads=False).to_pandas()
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()

def dailyDB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    url = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    client = pymongo.MongoClient(url, maxPoolSize=None)
    db = client[db_name]
    return db

def read_stock_daily(db, name, start_date=None, end_date=None, skey=None, index_name=None, interval=None, col=None, return_sdi=True):
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date'] + col
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_name is not None:
        query['index_name'] = {'$in': index_name}
    if start_date is not None:
        if end_date is not None:
            query['date'] = {'$gte': start_date, '$lte': end_date}
        else:
            query['date'] = {'$gte': start_date}
    elif end_date is not None:
        query['date'] = {'$lte': end_date}

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        df = pd.DataFrame()
    else:
        df = df.sort_values(by=['date', 'skey'])
    return df   

def read_memb_daily(db, name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, col=None, return_sdi=True):
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date', 'index_id'] + col
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_id is not None:
        query['index_id'] = {'$in': index_id}
    if interval is not None:
        query['interval'] = {'$in': interval}
    if start_date is not None:
        if end_date is not None:
            query['date'] = {'$gte': start_date, '$lte': end_date}
        else:
            query['date'] = {'$gte': start_date}
    elif end_date is not None:
        query['date'] = {'$lte': end_date}

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        df = pd.DataFrame()
    else:
        df = df.sort_values(by=['date', 'index_id', 'skey'])
    return df 



import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
pd.set_option("max_columns", 200)

year = "2020"
startDate = '20201016'
endDate = '20201031'
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"

startTm = datetime.datetime.now()
db1 = DB("192.168.10.178", database_name, user, password)
db2 = dailyDB("192.168.10.178", database_name, user, password)
save = {}
save['date'] = []
save['secid'] = []
mdOrderLog = db1.read('md_order', start_date=startDate, end_date=endDate, symbol=[2000001])
datelist = mdOrderLog['date'].unique()
ss = pd.read_csv('/mnt/ShareWithServer/result/shangshi1.csv')
ss = ss[ss['证券代码↑'] != 'T00018.SH']
ss['skey'] = np.where(ss['证券代码↑'].str[-2:] == 'SZ', ss['证券代码↑'].str[:6].astype(int) + 2000000, ss['证券代码↑'].str[:6].astype(int) + 1000000)
ss['date'] = (ss['上市日期'].str[:4] + ss['上市日期'].str[5:7] + ss['上市日期'].str[8:10]).astype(int)
print(datetime.datetime.now() - startTm)

startTm = datetime.datetime.now()
for d in datelist:
    print(d)
    sl1 = db1.read('md_order', start_date=str(d), end_date=str(d))['skey'].unique()
    data1 = db1.read('md_snapshot_l2', start_date=str(d), end_date=str(d), symbol=list(sl1))
    assert(len(sl1) == data1['skey'].nunique())
    op = read_stock_daily(db2, 'mdbar1d_tr', start_date=int(d), end_date=int(d))

    if len(sl1) != 0:
        for s in sl1:
            mbd = db1.read('md_snapshot_mbd', start_date=str(d), end_date=str(d), symbol=s)
            if mbd is None:
                if ss[ss['skey'] == s]['date'].iloc[0] == d:
                    l2 = data1[data1['skey'] == s]
                    l2['ApplSeqNum'] = -1
                    l2['ApplSeqNum'] = l2['ApplSeqNum'].astype('int32') 
                    db1.write('md_snapshot_l2', l2)
                    continue
                else:
                    save['date'].append(d)
                    save['secid'].append(s)
                    print(s)
                    continue
            try:
                assert(mbd.shape[1] == 82)
            except:
                print('mdb data column unupdated')
                print(s)
            try:
                op1 = op[op['skey'] == s]['open'].iloc[0]
                assert(mbd[mbd['cum_volume'] > 0]['open'].iloc[0] == op1)
            except:
                print('%s have no information in mdbar1d_tr' % str(s))
            l2 = data1[data1['skey'] == s]
            cols = ['skey', 'date', 'cum_volume', 'prev_close', 'open', 'close', 'cum_trades_cnt', 'bid10p', 'bid9p',
                   'bid8p', 'bid7p', 'bid6p', 'bid5p', 'bid4p', 'bid3p', 'bid2p', 'bid1p', 'ask1p', 'ask2p',
                   'ask3p', 'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p', 'bid10q', 'bid9q', 
                   'bid8q', 'bid7q', 'bid6q', 'bid5q', 'bid4q', 'bid3q', 'bid2q', 'bid1q', 'ask1q', 'ask2q', 'ask3q', 
                   'ask4q', 'ask5q', 'ask6q','ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid10n', 'bid9n', 'bid8n',
                   'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 'ask1n', 'ask2n', 'ask3n', 
                   'ask4n', 'ask5n', 'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'total_bid_quantity', 'total_ask_quantity']
            mbd1 = mbd.drop_duplicates(cols, keep='first')
            mbd = mbd1[cols+['ApplSeqNum']]
            if 'ApplSeqNum' in l2.columns:
                l2 = l2[list(l2.columns[l2.columns != 'ApplSeqNum'])]
            rl2 = pd.merge(l2, mbd, on=cols, how='left')
            try:
                assert(rl2[(rl2['ApplSeqNum'].isnull()) & (rl2['cum_volume'] > 0) & (rl2['time'] <= 145655000000)].shape[0] == 0)
            except:
                print(rl2[(rl2['ApplSeqNum'].isnull()) & (rl2['cum_volume'] > 0) & (rl2['time'] <= 145655000000)][['skey', 'date', 'time', 'cum_volume', 'close', 'bid1p', 'bid2p','bid1q', 'bid2q', 'ask1p', 'ask2p', 'ask1q', 'ask2q']])
                print(mbd1.tail(1)[['skey', 'date', 'time', 'cum_volume', 'close', 'bid1p', 'bid2p','bid1q', 'bid2q', 'ask1p', 'ask2p', 'ask1q', 'ask2q']])
            rl2.loc[rl2['ApplSeqNum'].isnull(), 'ApplSeqNum'] = -1
            rl2['ApplSeqNum'] = rl2['ApplSeqNum'].astype('int32') 
            assert(rl2.shape[0] == l2.shape[0])
            db1.write('md_snapshot_l2', rl2)
        print(datetime.datetime.now() - startTm)
    else:
        continue

0:00:05.297581
20201016
         skey      date          time  cum_volume  close  bid1p  bid2p  bid1q  \
3150  2002921  20201016  145636000000     2156900  22.57  22.55  22.54    600   

      bid2q  ask1p  ask2p  ask1q  ask2q  
3150    200  22.57  22.58    200   1000  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
13680  2002921  20201016  145634250000     2156500  22.58  22.57  22.55   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
13680    400    600  22.58  22.59   1000    800  
         skey      date          time  cum_volume   close   bid1p   bid2p  \
4111  2300792  20201016  145036000000     1554217  165.66  169.93  169.92   
4112  2300792  20201016  145039000000     1554217  165.66  169.93  169.92   
4113  2300792  20201016  145042000000     1554317  169.93  169.92  169.91   

      bid1q  bid2q   ask1p   ask2p  ask1q  ask2q  
4111    100    100  170.72  170.74    500   1700  
4112    100    100  170.72  170.74    500   1800  
4113    100    1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
83  2300999  20201016  92503000000     5708200   49.0  48.99  48.98   1200   
84  2300999  20201016  92603000000     5708200   49.0  48.99  48.98   1200   
85  2300999  20201016  92703000000     5708200   49.0  48.99  48.98   1200   
86  2300999  20201016  92803000000     5708200   49.0  48.99  48.98   1200   
87  2300999  20201016  92903000000     5708200   49.0  48.99  48.98   1200   

    bid2q  ask1p  ask2p    ask1q  ask2q  
83    100   49.0  49.01  1436460   1800  
84    100   49.0  49.01  1436460   1800  
85    100   49.0  49.01  1436460   1800  
86    100   49.0  49.01  1436460   1800  
87    100   49.0  49.01  1436460   1800  
           skey      date          time  cum_volume  close  bid1p  bid2p  \
859109  2300999  20201016  145654990000   179813647  49.07  49.06  49.05   

        bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
859109   1500   5100  49.07  49.08   3096  40200  
0:20:22.852253
20201019


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
83  2300898  20201019  92503000000      692002   53.4   53.4  53.38  16298   
84  2300898  20201019  92603000000      692002   53.4   53.4  53.38  16298   
85  2300898  20201019  92703000000      692002   53.4   53.4  53.38  16298   
86  2300898  20201019  92803000000      692002   53.4   53.4  53.38  16298   
87  2300898  20201019  92903000000      692002   53.4   53.4  53.38  16298   

    bid2q  ask1p  ask2p  ask1q  ask2q  
83    800  53.46  53.68    900    400  
84    800  53.46  53.68    900    400  
85    800  53.46  53.68    900    400  
86    800  53.46  53.68    900    400  
87    800  53.46  53.68    900    400  
           skey      date          time  cum_volume  close  bid1p  bid2p  \
104954  2300898  20201019  145654980000    20037404  47.75  47.75  47.74   

        bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
104954  49900    300  47.78  47.79    300    500  
       skey      date         time  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


2300898
       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
72  2300899  20201021  92503000000      116900   74.0  73.94   73.9    100   
73  2300899  20201021  92603000000      116900   74.0  73.94   73.9    100   
74  2300899  20201021  92703000000      116900   74.0  73.94   73.9    100   
75  2300899  20201021  92803000000      116900   74.0  73.94   73.9    100   
76  2300899  20201021  92903000000      116900   74.0  73.94   73.9    100   

    bid2q  ask1p  ask2p  ask1q  ask2q  
72    400   74.0  74.02  24600    300  
73    400   74.0  74.02  24600    300  
74    400   74.0  74.02  24600    300  
75    400   74.0  74.02  24600    300  
76    400   74.0  74.02  24600    300  
          skey      date          time  cum_volume  close  bid1p  bid2p  \
83010  2300899  20201021  145654810000     8983340   75.3   75.3  75.29   

       bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
83010   5900   1100  75.31  75.35    400   1600  
2300999
1:17:03.884984
20201022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
81  2300898  20201022  92503000000      182500   67.2   67.1  67.01   1000   
82  2300898  20201022  92603000000      182500   67.2   67.1  67.01   1000   
83  2300898  20201022  92703000000      182500   67.2   67.1  67.01   1000   
84  2300898  20201022  92803000000      182500   67.2   67.1  67.01   1000   
85  2300898  20201022  92903000000      182500   67.2   67.1  67.01   1000   

    bid2q  ask1p  ask2p  ask1q  ask2q  
81    300   67.2  67.24   4653    100  
82    300   67.2  67.24   4653    100  
83    300   67.2  67.24   4653    100  
84    300   67.2  67.24   4653    100  
85    300   67.2  67.24   4653    100  
           skey      date          time  cum_volume  close  bid1p  bid2p  \
101669  2300898  20201022  145654990000    15578126  64.56  64.56  64.55   

        bid1q  bid2q  ask1p  ask2p  ask1q  ask2q  
101669   1400   1000   64.6  64.67   3300   4396  
       skey      date         time  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


2:13:19.129081
20201027
2:31:39.283104
20201028
2:51:04.685689
20201029
3:10:54.242356
20201030
       skey      date         time  cum_volume  close  bid1p  bid2p  bid1q  \
84  2300900  20201030  92503000000      476300   65.0  64.95  64.88    200   
85  2300900  20201030  92603000000      476300   65.0  64.95  64.88    200   
86  2300900  20201030  92703000000      476300   65.0  64.95  64.88    200   
87  2300900  20201030  92803000000      476300   65.0  64.95  64.88    200   
88  2300900  20201030  92903000000      476300   65.0  64.95  64.88    200   

    bid2q  ask1p  ask2p   ask1q  ask2q  
84    500   65.0  65.02  112364    100  
85    500   65.0  65.02  112364    100  
86    500   65.0  65.02  112364    100  
87    500   65.0  65.02  112364    100  
88    500   65.0  65.02  112364    100  
           skey      date          time  cum_volume  close  bid1p  bid2p  \
165248  2300900  20201030  145654940000    20471707  67.65  67.61   67.6   

        bid1q  bid2q  ask1p  ask2p  