In [7]:
import timeit

def func():
    normal_list=range(10000)
    L = [i**2 for i in normal_list]
    
timeit.repeat("func()", setup="from __main__ import func",repeat=10,number=1)

[0.002512399999432091,
 0.0024830000002111774,
 0.0024029000005612033,
 0.0023946999999679974,
 0.0025488000001132605,
 0.0023909999999887077,
 0.0023871000003055087,
 0.002375699999902281,
 0.0023857999995016144,
 0.002460400000018126]

In [1]:
import pymongo
import io
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np

def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)

class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db', version=3): 
        self.db_name = db_name 
        self.uri = uri 
        self.client = pymongo.MongoClient(self.uri) 
        self.db = self.client[self.db_name] 
        self.chunk_size = 20000 
        self.symbol_column = symbol_column 
        self.date_column = 'date' 
        self.version = version

    def parse_uri(self, uri): 
        # mongodb://user:password@example.com 
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}
        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("date must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid date type: " + str(type(x)))
        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)
        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)
        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)
        return query

    def read_tick(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name] 
        query = self.build_query(start_date, end_date, symbol) 
        if not query: 
            print('cannot read the whole table') 
            return None  
        segs = [] 
        for x in collection.find(query): 
            x['data'] = self.deser(x['data'], x['ver']) 
            segs.append(x) 
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start'])) 
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def read_tick1(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name] 
        query = self.build_query(start_date, end_date, symbol) 
        if not query: 
            print('cannot read the whole table') 
            return None  
        segs = [] 
        for x in collection.find(query): 
            x['data'] = self.deser(x['data'], x['ver']) 
            segs.append(x) 
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start'])) 
        return pd.DataFrame(np.concatenate([x['data'].values for x in segs], axis=0), columns=x['data'].columns).astype(x['data'].dtypes.to_dict())
  
    def read_tick2(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name] 
        query = self.build_query(start_date, end_date, symbol) 
        if not query: 
            print('cannot read the whole table') 
            return None  
        segs = [] 
        for x in collection.find(query): 
            x['data'] = self.deser(x['data'], x['ver']) 
            segs.append(x) 
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start'])) 
        return pd.DataFrame(np.concatenate([x['data'].values for x in segs], axis=0), columns=x['data'].columns)
    
    def read_daily(self, table_name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, index_name=None, col=None, return_sdi=True): 
        collection = self.db[table_name]
        # Build projection 
        prj = {'_id': 0} 
        if col is not None: 
            if return_sdi: 
                col = ['skey', 'date', 'index_id'] + col 
            for col_name in col: 
                prj[col_name] = 1 
        # Build query 
        query = {} 
        if skey is not None: 
            query['skey'] = {'$in': skey} 
        if interval is not None: 
            query['interval'] = {'$in': interval} 
        if index_id is not None: 
            query['index_id'] = {'$in': index_id}    
        if index_name is not None:
            n = '' 
            for name in index_name: 
                try: 
                    name = re.compile('[\u4e00-\u9fff]+').findall(name)[0] 
                    if len(n) == 0: 
                        n = n = "|".join(name) 
                    else: 
                        n = n + '|' + "|".join(name) 
                except: 
                    if len(n) == 0: 
                        n = name 
                    else: 
                        n = n + '|' + name 
            query['index_name'] = {'$regex': n}
        if start_date is not None: 
            if end_date is not None: 
                query['date'] = {'$gte': start_date, '$lte': end_date} 
            else: 
                query['date'] = {'$gte': start_date} 
        elif end_date is not None: 
            query['date'] = {'$lte': end_date} 
        # Load data 
        cur = collection.find(query, prj) 
        df = pd.DataFrame.from_records(cur) 
        if df.empty: 
            df = pd.DataFrame() 
        else:
            if 'index_id' in df.columns:
                df = df.sort_values(by=['date', 'index_id', 'skey']).reset_index(drop=True)
            else:
                df = df.sort_values(by=['date','skey']).reset_index(drop=True)
        return df 
 
    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def deser(self, s, version): 
        def unpickle(s): 
            return pickle.loads(s) 
        if version == 1: 
            return unpickle(gzip.decompress(s)) 
        elif version == 2: 
            return unpickle(lzma.decompress(s)) 
        elif version == 3: 
            f = io.BytesIO() 
            f.write(s) 
            f.seek(0) 
            return pq.read_table(f, use_threads=False).to_pandas() 
        else: 
            raise Exception('unknown version')

def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()


In [38]:
import os
import pyarrow as pa
import pyarrow.parquet as pq
import timeit
import pickle
# os.environ['OMP_NUM_THREADS'] = '1'

database_name = 'com_md_eq_cn'
user = 'zhenyuy'
password = 'bnONBrzSMGoE'

import sys
pd.set_option('max_columns', 200)
db1 = DB("192.168.10.178", database_name, user, password)

test = db1.read_tick('md_snapshot_l2', start_date=20190102, end_date=20191231, symbol=1600000)
test.to_pickle('E:\\DBdata\\l2.pkl')
test1.to_pickle('E:\\DBdata\\l2_1.pkl')
table = pa.Table.from_pandas(test)
pq.write_table(table, 'E:\\DBdata\\l2.parquet', compression = 'zstd', compression_level = 0)
table = pa.Table.from_pandas(test1)
pq.write_table(table, 'E:\\DBdata\\l2_1.parquet', compression = 'zstd', compression_level = 0)
print(os.stat('E:\\DBdata\\l2.pkl').st_size/(1024 ** 2))
print(os.stat('E:\\DBdata\\l2_1.pkl').st_size/(1024 ** 2))
print(os.stat('E:\\DBdata\\l2.parquet').st_size/(1024 ** 2))
print(os.stat('E:\\DBdata\\l2_1.parquet').st_size/(1024 ** 2))

def func():
    test = pickle.load(open('E:\\DBdata\\l2.pkl', 'rb'))   
def dfunc():
    test = pickle.load(open('E:\\DBdata\\l2_1.pkl', 'rb'))
def func1():
    test = pq.read_table('E:\\DBdata\\l2.parquet').to_pandas()

print(np.mean(timeit.repeat("func()", setup="from __main__ import func",repeat=20,number=1)))
print(np.mean(timeit.repeat("dfunc()", setup="from __main__ import dfunc",repeat=20,number=1)))

1771.102445602417
796.6893196105957
90.59174728393555
36.15474510192871
1.5580506900000046
0.6757827999999109


In [75]:
lv2 = db1.read_tick('md_snapshot_l2', start_date=20201103, end_date=20201103, symbol=2300900)
mbd = db1.read_tick('md_snapshot_mbd', start_date=20201103, end_date=20201103, symbol=2300900)

In [76]:
lv2[(lv2['cum_volume'] > 0) & (lv2['time'] <= 145655000000) & (lv2['ApplSeqNum'] == -1)]

Unnamed: 0,skey,date,time,clockAtArrival,ordering,has_missing,cum_trades_cnt,cum_volume,cum_amount,prev_close,open,high,low,close,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,bid1Top1q,bid1Top2q,bid1Top3q,bid1Top4q,bid1Top5q,bid1Top6q,bid1Top7q,bid1Top8q,bid1Top9q,bid1Top10q,bid1Top11q,bid1Top12q,bid1Top13q,bid1Top14q,bid1Top15q,bid1Top16q,bid1Top17q,bid1Top18q,bid1Top19q,bid1Top20q,bid1Top21q,bid1Top22q,bid1Top23q,bid1Top24q,bid1Top25q,bid1Top26q,bid1Top27q,bid1Top28q,bid1Top29q,bid1Top30q,bid1Top31q,bid1Top32q,bid1Top33q,bid1Top34q,bid1Top35q,bid1Top36q,bid1Top37q,bid1Top38q,bid1Top39q,bid1Top40q,bid1Top41q,bid1Top42q,bid1Top43q,bid1Top44q,bid1Top45q,bid1Top46q,bid1Top47q,bid1Top48q,bid1Top49q,bid1Top50q,ask1Top1q,ask1Top2q,ask1Top3q,ask1Top4q,ask1Top5q,ask1Top6q,ask1Top7q,ask1Top8q,ask1Top9q,ask1Top10q,ask1Top11q,ask1Top12q,ask1Top13q,ask1Top14q,ask1Top15q,ask1Top16q,ask1Top17q,ask1Top18q,ask1Top19q,ask1Top20q,ask1Top21q,ask1Top22q,ask1Top23q,ask1Top24q,ask1Top25q,ask1Top26q,ask1Top27q,ask1Top28q,ask1Top29q,ask1Top30q,ask1Top31q,ask1Top32q,ask1Top33q,ask1Top34q,ask1Top35q,ask1Top36q,ask1Top37q,ask1Top38q,ask1Top39q,ask1Top40q,ask1Top41q,ask1Top42q,ask1Top43q,ask1Top44q,ask1Top45q,ask1Top46q,ask1Top47q,ask1Top48q,ask1Top49q,ask1Top50q,total_bid_quantity,total_ask_quantity,total_bid_vwap,total_ask_vwap,total_bid_orders,total_ask_orders,total_bid_levels,total_ask_levels,bid_trade_max_duration,ask_trade_max_duration,cum_canceled_buy_orders,cum_canceled_buy_volume,cum_canceled_buy_amount,cum_canceled_sell_orders,cum_canceled_sell_volume,cum_canceled_sell_amount,ApplSeqNum
81,2300900,20201103,92503000000,1604366703000000,82,0,408,117600,7761600.0,68.5,66.0,66.0,66.0,66.0,65.56,65.6,65.61,65.66,65.76,65.78,65.8,65.81,65.88,65.93,66.0,66.02,66.1,66.15,66.16,66.17,66.25,66.28,66.3,66.39,200,3500,500,100,300,100,400,100,200,200,17300,800,200,700,300,300,500,2900,1100,800,1,6,1,1,1,1,1,1,2,1,9,1,1,2,1,1,1,2,1,1,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6500,600,5200,300,300,800,1400,400,1800,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3905100,610632,2.98,81.2,0,0,0,0,0,0,0,0,0.0,0,0,0.0,-1
82,2300900,20201103,92603000000,1604366763000000,83,0,408,117600,7761600.0,68.5,66.0,66.0,66.0,66.0,65.56,65.6,65.61,65.66,65.76,65.78,65.8,65.81,65.88,65.93,66.0,66.02,66.1,66.15,66.16,66.17,66.25,66.28,66.3,66.39,200,3500,500,100,300,100,400,100,200,200,17300,800,200,700,300,300,500,2900,1100,800,1,6,1,1,1,1,1,1,2,1,9,1,1,2,1,1,1,2,1,1,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6500,600,5200,300,300,800,1400,400,1800,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3905100,610632,2.98,81.2,0,0,0,0,0,0,0,0,0.0,0,0,0.0,-1
83,2300900,20201103,92703000000,1604366823000000,84,0,408,117600,7761600.0,68.5,66.0,66.0,66.0,66.0,65.56,65.6,65.61,65.66,65.76,65.78,65.8,65.81,65.88,65.93,66.0,66.02,66.1,66.15,66.16,66.17,66.25,66.28,66.3,66.39,200,3500,500,100,300,100,400,100,200,200,17300,800,200,700,300,300,500,2900,1100,800,1,6,1,1,1,1,1,1,2,1,9,1,1,2,1,1,1,2,1,1,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6500,600,5200,300,300,800,1400,400,1800,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3905100,610632,2.98,81.2,0,0,0,0,0,0,0,0,0.0,0,0,0.0,-1
84,2300900,20201103,92803000000,1604366883000000,85,0,408,117600,7761600.0,68.5,66.0,66.0,66.0,66.0,65.56,65.6,65.61,65.66,65.76,65.78,65.8,65.81,65.88,65.93,66.0,66.02,66.1,66.15,66.16,66.17,66.25,66.28,66.3,66.39,200,3500,500,100,300,100,400,100,200,200,17300,800,200,700,300,300,500,2900,1100,800,1,6,1,1,1,1,1,1,2,1,9,1,1,2,1,1,1,2,1,1,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6500,600,5200,300,300,800,1400,400,1800,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3905100,610632,2.98,81.2,0,0,0,0,0,0,0,0,0.0,0,0,0.0,-1
85,2300900,20201103,92903000000,1604366943000000,86,0,408,117600,7761600.0,68.5,66.0,66.0,66.0,66.0,65.56,65.6,65.61,65.66,65.76,65.78,65.8,65.81,65.88,65.93,66.0,66.02,66.1,66.15,66.16,66.17,66.25,66.28,66.3,66.39,200,3500,500,100,300,100,400,100,200,200,17300,800,200,700,300,300,500,2900,1100,800,1,6,1,1,1,1,1,1,2,1,9,1,1,2,1,1,1,2,1,1,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6500,600,5200,300,300,800,1400,400,1800,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3905100,610632,2.98,81.2,0,0,0,0,0,0,0,0,0.0,0,0,0.0,-1


In [78]:
mbd[mbd['cum_volume'] == 117600][['ApplSeqNum', 'skey', 'date', 'cum_volume', 'prev_close', 'open', 'close', 'cum_trades_cnt', 'bid10p', 'bid9p',
                   'bid8p', 'bid7p', 'bid6p', 'bid5p', 'bid4p', 'bid3p', 'bid2p', 'bid1p', 'ask1p', 'ask2p',
                   'ask3p', 'ask4p', 'ask5p', 'ask6p', 'ask7p', 'ask8p', 'ask9p', 'ask10p', 'bid10q', 'bid9q', 
                   'bid8q', 'bid7q', 'bid6q', 'bid5q', 'bid4q', 'bid3q', 'bid2q', 'bid1q', 'ask1q', 'ask2q', 'ask3q', 
                   'ask4q', 'ask5q', 'ask6q','ask7q', 'ask8q', 'ask9q', 'ask10q', 'bid10n', 'bid9n', 'bid8n',
                   'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n', 'bid2n', 'bid1n', 'ask1n', 'ask2n', 'ask3n', 
                   'ask4n', 'ask5n', 'ask6n', 'ask7n', 'ask8n', 'ask9n', 'ask10n', 'total_bid_quantity', 'total_ask_quantity']]

Unnamed: 0,ApplSeqNum,skey,date,cum_volume,prev_close,open,close,cum_trades_cnt,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,total_bid_quantity,total_ask_quantity
0,283460,2300900,20201103,117600,68.5,66.0,66.0,408,65.56,65.6,65.61,65.66,65.76,65.78,65.8,65.81,65.88,65.93,66.0,66.02,66.1,66.15,66.16,66.17,66.25,66.28,66.3,66.39,200,3500,500,100,300,100,400,100,200,200,17300,800,200,700,300,300,500,2900,1100,800,1,6,1,1,1,1,1,1,2,1,9,1,1,2,1,1,1,2,1,1,3905100,615632


In [79]:
615632 - 610632

5000

In [37]:
print(sys.getsizeof(test) / (1024**3))
print(sys.getsizeof(test1) / (1024**3))

0.7779964432120323

In [39]:
test1 = test.copy()
for cols in ['skey', 'date', 'ordering', 'has_missing', 'cum_trades_cnt', 'bid10n', 'bid9n', 'bid8n', 'bid7n', 'bid6n', 'bid5n', 'bid4n', 'bid3n',
       'bid2n', 'bid1n', 'ask1n', 'ask2n', 'ask3n', 'ask4n', 'ask5n', 'ask6n',
       'ask7n', 'ask8n', 'ask9n', 'ask10n', 'bid1Top1q', 'bid1Top2q', 'bid1Top3q', 'bid1Top4q', 'bid1Top5q',
       'bid1Top6q', 'bid1Top7q', 'bid1Top8q', 'bid1Top9q', 'bid1Top10q',
       'bid1Top11q', 'bid1Top12q', 'bid1Top13q', 'bid1Top14q', 'bid1Top15q',
       'bid1Top16q', 'bid1Top17q', 'bid1Top18q', 'bid1Top19q', 'bid1Top20q','bid1Top21q', 'bid1Top22q', 'bid1Top23q', 'bid1Top24q', 'bid1Top25q',
       'bid1Top26q', 'bid1Top27q', 'bid1Top28q', 'bid1Top29q', 'bid1Top30q',
       'bid1Top31q', 'bid1Top32q', 'bid1Top33q', 'bid1Top34q', 'bid1Top35q',
       'bid1Top36q', 'bid1Top37q', 'bid1Top38q', 'bid1Top39q', 'bid1Top40q','bid1Top41q', 'bid1Top42q', 'bid1Top43q', 'bid1Top44q', 'bid1Top45q',
       'bid1Top46q', 'bid1Top47q', 'bid1Top48q', 'bid1Top49q', 'bid1Top50q',
       'ask1Top1q', 'ask1Top2q', 'ask1Top3q', 'ask1Top4q', 'ask1Top5q',
       'ask1Top6q', 'ask1Top7q', 'ask1Top8q', 'ask1Top9q', 'ask1Top10q', 'ask1Top11q', 'ask1Top12q', 'ask1Top13q', 'ask1Top14q', 'ask1Top15q',
       'ask1Top16q', 'ask1Top17q', 'ask1Top18q', 'ask1Top19q', 'ask1Top20q',
       'ask1Top21q', 'ask1Top22q', 'ask1Top23q', 'ask1Top24q', 'ask1Top25q',
       'ask1Top26q', 'ask1Top27q', 'ask1Top28q', 'ask1Top29q', 'ask1Top30q', 'ask1Top31q', 'ask1Top32q', 'ask1Top33q', 'ask1Top34q', 'ask1Top35q',
       'ask1Top36q', 'ask1Top37q', 'ask1Top38q', 'ask1Top39q', 'ask1Top40q',
       'ask1Top41q', 'ask1Top42q', 'ask1Top43q', 'ask1Top44q', 'ask1Top45q',
       'ask1Top46q', 'ask1Top47q', 'ask1Top48q', 'ask1Top49q', 'ask1Top50q', 'total_bid_orders', 'total_ask_orders',
       'total_bid_levels', 'total_ask_levels', 'bid_trade_max_duration',
       'ask_trade_max_duration', 'cum_canceled_buy_orders', 'cum_canceled_sell_orders', 'ApplSeqNum']:
    test1[cols] = test1[cols].astype('int32')

ValueError: Cannot convert non-finite values (NA or inf) to integer

In [2]:
database_name = 'com_md_eq_cn'
user = 'zhenyuy'
password = 'bnONBrzSMGoE'

import sys


pd.set_option('max_columns', 200)
db1 = DB("192.168.10.178", database_name, user, password)

import timeit

tag = []
speed = []
ram = []


def func():
    test = db1.read_tick('md_snapshot_l2', start_date=20190102, end_date=20191231, symbol=2000006)

def func1():
    test = db1.read_tick1('md_snapshot_l2', start_date=20190102, end_date=20191231, symbol=2000006)

def func2():
    test = db1.read_tick2('md_snapshot_l2', start_date=20190102, end_date=20191231, symbol=2000006)

tag.append('local PC, read database (pd.concat)')
speed.append(np.mean(timeit.repeat("func()", setup="from __main__ import func",repeat=20,number=1)))
test = db1.read_tick('md_snapshot_l2', start_date=20190102, end_date=20191231, symbol=2000006)
ram.append(sys.getsizeof(test) / 1024)

tag.append('local PC, read database (np.concatenate + change dtypes)')
speed.append(np.mean(timeit.repeat("func1()", setup="from __main__ import func1",repeat=20,number=1)))
test = db1.read_tick1('md_snapshot_l2', start_date=20190102, end_date=20191231, symbol=2000006)
ram.append(sys.getsizeof(test) / 1024)

tag.append('local PC, read database (np.concatenate)')
speed.append(np.mean(timeit.repeat("func2()", setup="from __main__ import func2",repeat=20,number=1)))
test = db1.read_tick2('md_snapshot_l2', start_date=20190102, end_date=20191231, symbol=2000006)
ram.append(sys.getsizeof(test) / 1024)

In [36]:
test = db1.read_tick('md_snapshot_l2', start_date=20190102, end_date=20190102, symbol=2000006)

In [40]:
sys.getsizeof(test) / (1024 ** 2)

4.094917297363281

In [41]:
test1 = test.copy()
for col in test1.columns[test1.dtypes == 'int64']:
    test1[col] = test1[col].astype('int32')
sys.getsizeof(test1) / (1024 ** 2)

2.3798294067382812

In [32]:
test1 = test.copy()
for col in test1.columns[test1.dtypes == 'int64']:
    test1[col] = test1[col].astype('int32')

In [34]:
import os
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.Table.from_pandas(test1)
pq.write_table(table, 'E:\\DBdata\\test2.parquet', compression = 'zstd', compression_level = 0)
size = os.stat('E:\\DBdata\\test2.parquet').st_size/(1024 ** 2)
size

38.080321311950684

In [35]:
pq.read_table('E:\\DBdata\\test2.parquet').to_pandas().dtypes

skey                          int32
date                          int32
time                          int32
clockAtArrival                int32
ordering                      int32
                             ...   
cum_canceled_buy_amount     float64
cum_canceled_sell_orders      int32
cum_canceled_sell_volume      int32
cum_canceled_sell_amount    float64
ApplSeqNum                  float64
Length: 191, dtype: object

In [3]:
def func():
    dataPathLs = np.array(glob.glob('E:\\DBdata\\per_day\\***'))
    re = []
    for d in dataPathLs:
        test = pd.read_pickle(d)
        re += [test]
    re = pd.concat(re).reset_index(drop=True)

def func1():
    dataPathLs = np.array(glob.glob('E:\\DBdata\\per_month\\***'))
    re = []
    for d in dataPathLs:
        test = pd.read_pickle(d)
        re += [test]
    re = pd.concat(re).reset_index(drop=True)



import os
import glob

tag.append('local PC, read from local drive (per day)')
speed.append(np.mean(timeit.repeat("func()", setup="from __main__ import func",repeat=20,number=1)))
dataPathLs = np.array(glob.glob('E:\\DBdata\\per_day\\***'))
size1 = 0
for i in dataPathLs:
    size1 += os.path.getsize(i)
ram.append(size1/1024)

tag.append('local PC, read from local drive (per month)')
speed.append(np.mean(timeit.repeat("func1()", setup="from __main__ import func1",repeat=20,number=1)))
dataPathLs = np.array(glob.glob('E:\\DBdata\\per_month\\***'))
size1 = 0
for i in dataPathLs:
    size1 += os.path.getsize(i)
ram.append(size1/1024)

In [19]:
test.dtypes

skey                        float64
date                        float64
time                        float64
clockAtArrival              float64
ordering                    float64
                             ...   
cum_canceled_buy_amount     float64
cum_canceled_sell_orders    float64
cum_canceled_sell_volume    float64
cum_canceled_sell_amount    float64
ApplSeqNum                  float64
Length: 191, dtype: object

In [11]:
tag += ['remote SH server, read database (pd.concat)',
 'remote SH server, read database (np.concatenate + change dtypes)',
 'remote SH server, read database (np.concatenate)',
 'remote SH server,, read from server (per day)',
 'remote SH server,, read from server (per month)']

In [16]:
re = pd.DataFrame()
re['tag'] = tag
re['time (s)'] = speed
re['ram (MB)'] = [i/1024 for i in ram]
re

Unnamed: 0,tag,time (s),ram (MB)
0,"local PC, read database (pd.concat)",5.642329,1227.126183
1,"local PC, read database (np.concatenate + chan...",6.344521,1227.126183
2,"local PC, read database (np.concatenate)",5.485455,1227.126183
3,"local PC, read from local drive (per day)",4.175218,1228.119286
4,"local PC, read from local drive (per month)",2.94528,1227.184128
5,"remote SH server, read database (pd.concat)",8.338575,1227.126175
6,"remote SH server, read database (np.concatenat...",9.316128,1227.126175
7,"remote SH server, read database (np.concatenate)",8.159712,1227.126175
8,"remote SH server,, read from server (per day)",3.956808,1228.116028
9,"remote SH server,, read from server (per month)",3.048589,1227.184368


In [17]:
re['time (s)'] = re['time (s)'].apply(lambda x: round(x, 2))
re['speed (MB/s)'] = re['ram (MB)'] / re['time (s)']
re['speed (MB/s)'] = re['speed (MB/s)'].astype(int)
re['ram (MB)'] = re['ram (MB)'].astype(int)
re

Unnamed: 0,tag,time (s),ram (MB),speed (MB/s)
0,"local PC, read database (pd.concat)",5.64,1227,217
1,"local PC, read database (np.concatenate + chan...",6.34,1227,193
2,"local PC, read database (np.concatenate)",5.49,1227,223
3,"local PC, read from local drive (per day)",4.18,1228,293
4,"local PC, read from local drive (per month)",2.95,1227,415
5,"remote SH server, read database (pd.concat)",8.34,1227,147
6,"remote SH server, read database (np.concatenat...",9.32,1227,131
7,"remote SH server, read database (np.concatenate)",8.16,1227,150
8,"remote SH server,, read from server (per day)",3.96,1228,310
9,"remote SH server,, read from server (per month)",3.05,1227,402


In [23]:
from IPython.display import display, HTML
HTML(re.groupby('tag').first().to_html())

Unnamed: 0_level_0,time (s),ram (MB),speed (MB/s)
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"local PC, read database (np.concatenate + change dtypes)",6.34,1227,193
"local PC, read database (np.concatenate)",5.49,1227,223
"local PC, read database (pd.concat)",5.64,1227,217
"local PC, read from local drive (per day)",4.18,1228,293
"local PC, read from local drive (per month)",2.95,1227,415
"remote SH server, read database (np.concatenate + change dtypes)",9.32,1227,131
"remote SH server, read database (np.concatenate)",8.16,1227,150
"remote SH server, read database (pd.concat)",8.34,1227,147
"remote SH server,, read from server (per day)",3.96,1228,310
"remote SH server,, read from server (per month)",3.05,1227,402


In [23]:
data1 = pd.DataFrame()
data2 = pd.DataFrame()
data1['c1'] = [1, 2]
data1['c2'] = [1.2, 3.45]
data2['c1'] = [3, 4]
data2['c2'] = [4.5, 7.8]
re = []
re += [data1]
re += [data2]
re = pd.concat(re, ignore_index = True)
display(re.dtypes)

re = []
re += [data1.values]
re += [data2.values]
re = pd.DataFrame(np.concatenate(re, axis=0), columns=data1.columns)
re
# display(re.dtypes)
# re = re.astype(data1.dtypes.to_dict())
# display(re.dtypes)

c1      int64
c2    float64
dtype: object

Unnamed: 0,c1,c2
0,1.0,1.2
1,2.0,3.45
2,3.0,4.5
3,4.0,7.8
