In [19]:
import datetime
import gzip
import lzma
import numbers
import os
import pickle
import re
import time

import pandas as pd
import pymongo
import pytz

def DB(host, db_name, user, passwd):
    """Create a ``DBObj`` connected to ``db_name`` on ``host``.

    Args:
        host: MongoDB host (optionally ``host:port``) placed in the URI.
        db_name: database to operate on; also used as the authentication
            database unless ``user`` is ``'admin'`` or ``'root'``, in which
            case ``'admin'`` is used.
        user: user name.
        passwd: password.

    Returns:
        DBObj: wrapper bound to ``db_name``.
    """
    # Function-scope import keeps this definition self-contained.
    from urllib.parse import quote_plus

    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    print(auth_db)
    uri_template = 'mongodb://%s:%s@%s/?authSource=%s'
    # Reserved characters (':', '@', '/', '%', ...) in the credentials must be
    # percent-escaped or the URI is parsed incorrectly.
    safe_user = quote_plus(str(user))
    uri = uri_template % (safe_user, quote_plus(str(passwd)), host, auth_db)
    # Echo the target for debugging, but never the real password: notebook
    # outputs are saved and shared together with the code.
    print(uri_template % (safe_user, '***', host, auth_db))
    return DBObj(uri, db_name=db_name)

class DBObj(object):
    """Read/write helper bound to a single MongoDB database.

    Tick tables hold one document per chunk of rows. Each document, as written
    by ``write_single``, carries ``ver`` (compression scheme), ``data`` (the
    pickled, compressed DataFrame chunk), ``date`` (string), ``symbol`` (int)
    and ``start`` (row offset of the chunk within its date/symbol group).
    Daily tables are read as ordinary documents via ``read_daily``.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Open a client for ``uri`` and select ``db_name``.

        Args:
            uri: MongoDB connection string handed to ``pymongo.MongoClient``.
            symbol_column: DataFrame column that ``write`` groups by to obtain
                the stored ``symbol`` value.
            db_name: name of the database to operate on.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        # Maximum number of rows stored in one tick document.
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split a ``mongodb://user:password@host`` URI into its parts.

        The split is purely textual: every ``:`` and ``@`` becomes a
        separator, so credentials containing those characters (or a
        ``host:port`` pair) produce extra fields.

        Returns:
            list[str]: e.g. ``['user', 'password', 'example.com']``.
        """
        # mongodb://user:password@example.com
        stripped = uri.strip().replace('mongodb://', '').strip('/')
        return stripped.replace(':', ' ').replace('@', ' ').split(' ')

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter used against tick tables.

        Args:
            start_date: inclusive lower bound, or ``None`` for no bound.
            end_date: inclusive upper bound, or ``None`` for no bound.
                Each date may be an 8-character string, an integer (NumPy
                integers included) or a ``datetime.date`` /
                ``datetime.datetime`` (subclasses such as ``pandas.Timestamp``
                included).
            symbol: a single symbol or a list/tuple of symbols; every value is
                coerced with ``int``. Falsy values (``None``, ``0``, an empty
                list) add no symbol filter.

        Returns:
            dict: filter on ``date`` and/or ``symbol``; empty when nothing was
            requested.

        Raises:
            Exception: if a date string is not 8 characters long or a date has
                an unsupported type.
        """
        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("date must be YYYYMMDD format")
                return x
            # datetime.datetime is a subclass of datetime.date.
            if isinstance(x, datetime.date):
                return x.strftime("%Y%m%d")
            # bool is an int subclass but is never a meaningful date.
            if isinstance(x, numbers.Integral) and not isinstance(x, bool):
                return parse_date(str(x))
            raise Exception("invalid date type: " + str(type(x)))

        query = {}
        if start_date is not None or end_date is not None:
            date_range = {}
            if start_date is not None:
                date_range['$gte'] = parse_date(start_date)
            if end_date is not None:
                date_range['$lte'] = parse_date(end_date)
            query['date'] = date_range

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)
        return query

    def read_tick(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load tick data and return it as one DataFrame.

        Matching documents are decoded, ordered by ``(symbol, date, start)``
        and concatenated with a fresh index.

        Returns:
            pandas.DataFrame, or ``None`` when no filter was given (reading a
            whole table is refused) or nothing matched.
        """
        collection = self.db[table_name]
        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None
        segs = []
        for doc in collection.find(query):
            # NOTE: ``deser`` unpickles the payload; only read from databases
            # whose contents are trusted.
            doc['data'] = self.deser(doc['data'], doc['ver'])
            segs.append(doc)
        if not segs:
            return None
        segs.sort(key=lambda seg: (seg['symbol'], seg['date'], seg['start']))
        return pd.concat([seg['data'] for seg in segs], ignore_index=True)

    def read_daily(self, table_name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, index_name=None, col=None, return_sdi=True):
        """Query a daily table and return the rows as a DataFrame.

        Args:
            table_name: collection to read.
            start_date: inclusive lower bound on ``date``.
            end_date: inclusive upper bound on ``date``. Both bounds are
                passed to MongoDB unchanged (no format conversion).
            skey: list of values matched with ``$in`` on ``skey``.
            index_id: list of values matched with ``$in`` on ``index_id``.
            interval: list of values matched with ``$in`` on ``interval``.
            index_name: iterable of names combined into one ``$regex``
                alternation. For a name containing CJK characters, the first
                run of such characters is split so that each character is its
                own alternative; other names are used verbatim (and are
                therefore interpreted as regular expressions).
            col: columns to project; ``None`` returns every field but ``_id``.
            return_sdi: when ``col`` is given, also project ``skey``, ``date``
                and ``index_id``.

        Returns:
            pandas.DataFrame: rows sorted by whichever of ``date``,
            ``index_id``, ``skey`` are present; empty when nothing matched.
        """
        collection = self.db[table_name]

        # Build projection
        prj = {'_id': 0}
        if col is not None:
            wanted = list(col)
            if return_sdi:
                wanted = ['skey', 'date', 'index_id'] + wanted
            for col_name in wanted:
                prj[col_name] = 1

        # Build query
        query = {}
        if skey is not None:
            query['skey'] = {'$in': skey}
        if interval is not None:
            query['interval'] = {'$in': interval}
        if index_id is not None:
            query['index_id'] = {'$in': index_id}
        if index_name is not None:
            cjk_run = re.compile('[\u4e00-\u9fff]+')
            pattern = ''
            for name in index_name:
                runs = cjk_run.findall(name)
                # Names without CJK characters are used as given.
                piece = "|".join(runs[0]) if runs else name
                pattern = piece if len(pattern) == 0 else pattern + '|' + piece
            query['index_name'] = {'$regex': pattern}
        date_range = {}
        if start_date is not None:
            date_range['$gte'] = start_date
        if end_date is not None:
            date_range['$lte'] = end_date
        if date_range:
            query['date'] = date_range

        # Load data
        df = pd.DataFrame.from_records(collection.find(query, prj))
        if df.empty:
            return pd.DataFrame()
        # A projection may have dropped some of the usual sort keys.
        sort_keys = [key for key in ('date', 'index_id', 'skey') if key in df.columns]
        if sort_keys:
            df = df.sort_values(by=sort_keys)
        return df.reset_index(drop=True)

    def list_tables(self):
        """Return the names of the collections in the database."""
        # ``collection_names`` was removed in PyMongo 4. Look the method up on
        # the class: ``Database.__getattr__`` turns unknown attribute names
        # into collections, so ``hasattr`` on the instance is always true.
        if hasattr(type(self.db), 'list_collection_names'):
            return self.db.list_collection_names()
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values stored in a tick table.

        Missing bounds default to ``'00000000'`` / ``'99999999'`` so that the
        whole date range is covered.
        """
        collection = self.db[table_name]
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        query = self.build_query(start_date, end_date, symbol)
        dates = {doc['date'] for doc in collection.find(query, {"date": 1, '_id': 0})}
        return sorted(dates)

    def deser(self, s, version):
        """Decode a stored chunk: decompress (1 = gzip, 2 = lzma), then unpickle.

        Raises:
            Exception: for any other ``version``.
        """
        # SECURITY: pickle.loads executes code embedded in the payload; the
        # database contents must be trusted.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')

    def write(self, table_name, df):
        """Store ``df`` in a tick table, replacing existing data per group.

        Rows are grouped by date and ``self.symbol_column``. For every group
        the documents already stored for that ``(date, symbol)`` are deleted
        and the group is re-inserted in chunks. An empty ``df`` is a no-op.

        Raises:
            Exception: if ``df`` has no ``self.date_column`` column.
        """
        if len(df) == 0:
            return

        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')
        date = str(df[self.date_column].iloc[0])
        multi_date = len(df[self.date_column].unique()) > 1

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            # Group by the bare column name: a one-element list makes pandas
            # >= 2 yield 1-tuples as keys. Convert to a plain int as in the
            # multi-date branch, so the stored value is not a NumPy scalar.
            for symbol, sub_df in df.groupby(self.symbol_column):
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` as documents of at most ``self.chunk_size`` rows each."""
        version = 1
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            # iloc is positional for every index type; plain ``df[a:b]`` is
            # label-based on some (e.g. float) indexes.
            df_seg = df.iloc[start:end]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def ser(self, s, version):
        """Encode a chunk: pickle (protocol 4), then compress (1 = gzip, 2 = lzma).

        Raises:
            Exception: for any other ``version``.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

def patch_pandas_pickle():
    """Register a stand-in ``pandas.core.internals.managers`` on old pandas.

    On pandas older than 0.24, if that module is not already in
    ``sys.modules``, a stub module exposing ``BlockManager`` is installed
    under that name. Presumably this lets DataFrames pickled by newer pandas
    versions be unpickled -- confirm against the stored data.
    """
    import re

    # Compare numeric components: a plain string comparison orders versions
    # lexicographically (e.g. '0.9' < '0.24' is False).
    parsed = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if parsed is None or (int(parsed.group(1)), int(parsed.group(2))) >= (0, 24):
        return

    import sys
    from types import ModuleType
    from pandas.core.internals import BlockManager

    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        stub = ModuleType(pkg_name)
        stub.BlockManager = BlockManager
        sys.modules[pkg_name] = stub
patch_pandas_pickle()

In [25]:
database_name = 'com_md_eq_cn'

# Passwords are read from the environment so they are never saved in the
# notebook; export MONGO_ROOT_PASSWORD and MONGO_USER_PASSWORD before starting
# the kernel.
user = "root"
password = os.environ["MONGO_ROOT_PASSWORD"]
db = DB("192.168.10.142", database_name, user, password)

user = "zhenyuy"
password = os.environ["MONGO_USER_PASSWORD"]
db1 = DB("192.168.10.178", database_name, user, password)

# One trading day of trade and order ticks from the second server.
trade_data = db1.read_tick('md_trade', start_date=20201119, end_date=20201119)
order_data = db1.read_tick('md_order', start_date=20201119, end_date=20201119)

display(trade_data.head())
display(order_data.head())

admin
mongodb://root:***@192.168.10.142/?authSource=admin
com_md_eq_cn
mongodb://zhenyuy:***@192.168.10.178/?authSource=com_md_eq_cn


Unnamed: 0,skey,date,time,clockAtArrival,datetime,ApplSeqNum,trade_type,trade_flag,trade_price,trade_qty,BidApplSeqNum,OfferApplSeqNum
0,1600000,20201119,92500510000,1605749100510000,2020-11-19 09:25:00.510,14411,1,0,9.7,100,56132,56123
1,1600000,20201119,92500510000,1605749100510000,2020-11-19 09:25:00.510,14412,1,0,9.7,100,56132,56127
2,1600000,20201119,92500510000,1605749100510000,2020-11-19 09:25:00.510,14413,1,0,9.7,100,56132,56131
3,1600000,20201119,92500510000,1605749100510000,2020-11-19 09:25:00.510,14414,1,0,9.7,200,86978,56177
4,1600000,20201119,92500510000,1605749100510000,2020-11-19 09:25:00.510,14415,1,0,9.7,100,86978,56179


Unnamed: 0,skey,date,time,clockAtArrival,datetime,ApplSeqNum,order_side,order_type,order_price,order_qty
0,2000001,20201119,91500000000,1605748500000000,2020-11-19 09:15:00.000,110,1,2,18.46,100
1,2000001,20201119,91500000000,1605748500000000,2020-11-19 09:15:00.000,159,2,2,18.6,9870
2,2000001,20201119,91500020000,1605748500020000,2020-11-19 09:15:00.020,314,2,2,18.45,3000
3,2000001,20201119,91500020000,1605748500020000,2020-11-19 09:15:00.020,347,1,2,18.0,100
4,2000001,20201119,91500030000,1605748500030000,2020-11-19 09:15:00.030,368,1,2,17.54,100


In [32]:
# Column dtypes of the trade ticks for one symbol over a three-day window.
db.read_tick(
    'md_trade',
    start_date=20201117,
    end_date=20201119,
    symbol=[2000001],
).dtypes

skey                        int32
date                        int32
time                        int64
clockAtArrival              int64
datetime           datetime64[ns]
ApplSeqNum                  int32
trade_type                  int32
trade_flag                  int32
trade_price               float64
trade_qty                   int32
BidApplSeqNum               int32
OfferApplSeqNum             int32
dtype: object

In [20]:
database_name = 'com_md_eq_cn'
user = "root"
# Read the password from the environment so it is never saved in the notebook;
# export MONGO_ROOT_PASSWORD before starting the kernel.
password = os.environ["MONGO_ROOT_PASSWORD"]
db = DB("192.168.10.142", database_name, user, password)

admin
mongodb://root:***@192.168.10.142/?authSource=admin


In [16]:
database_name = 'com_md_eq_cn'
user = "zhenyuy"
# Read the password from the environment so it is never saved in the notebook;
# export MONGO_USER_PASSWORD before starting the kernel.
password = os.environ["MONGO_USER_PASSWORD"]

db = DB("192.168.10.178", database_name, user, password)
# Distinct index names matched by the name filter on one date.
db.read_daily('index_memb', start_date=20201112, end_date=20201112, index_name=['建筑','农业', 'IC', 'IF'])['index_name'].unique()

com_md_eq_cn
mongodb://zhenyuy:***@192.168.10.178/?authSource=com_md_eq_cn


array(['IF', 'IC', 'AMAC 建筑'], dtype=object)

In [8]:
# Index ticks for a single trading day; the bare expression renders the frame.
db.read_tick(
    'md_index',
    start_date=20201112,
    end_date=20201112,
)

Unnamed: 0,skey,date,time,clockAtArrival,datetime,ordering,cum_volume,cum_amount,open,close
0,1000016,20201112,92503000000,1605144303000000,2020-11-12 09:25:03,1,231770,4.566743e+08,3413.2074,3413.2074
1,1000016,20201112,92504000000,1605144304000000,2020-11-12 09:25:04,2,231770,4.566743e+08,3413.2074,3413.2074
2,1000016,20201112,92505000000,1605144305000000,2020-11-12 09:25:05,3,231770,4.566743e+08,3413.2074,3413.2074
3,1000016,20201112,92506000000,1605144306000000,2020-11-12 09:25:06,4,231770,4.566743e+08,3413.2074,3413.2074
4,1000016,20201112,92507000000,1605144307000000,2020-11-12 09:25:07,5,231770,4.566743e+08,3413.2074,3413.2074
...,...,...,...,...,...,...,...,...,...,...
720426,3030067,20201112,150008000000,1605164408000000,2020-11-12 15:00:08,15308,327999380,5.063986e+05,4030.1546,3977.7974
720427,3030067,20201112,150009000000,1605164409000000,2020-11-12 15:00:09,15309,327999380,5.063986e+05,4030.1546,3977.7974
720428,3030067,20201112,150010000000,1605164410000000,2020-11-12 15:00:10,15310,327999380,5.063986e+05,4030.1546,3977.7974
720429,3030067,20201112,150011000000,1605164411000000,2020-11-12 15:00:11,15311,327999380,5.063986e+05,4030.1546,3977.7974
