In [1]:
import pymongo
import io
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np

def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)

class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db', version=3): 
        self.db_name = db_name 
        self.uri = uri 
        self.client = pymongo.MongoClient(self.uri) 
        self.db = self.client[self.db_name] 
        self.chunk_size = 20000 
        self.symbol_column = symbol_column 
        self.date_column = 'date' 
        self.version = version

    def parse_uri(self, uri): 
        # mongodb://user:password@example.com 
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}
        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("date must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid date type: " + str(type(x)))
        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)
        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)
        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)
        return query

    def read_tick(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name] 
        query = self.build_query(start_date, end_date, symbol) 
        if not query: 
            print('cannot read the whole table') 
            return None  
        segs = [] 
        for x in collection.find(query): 
            x['data'] = self.deser(x['data'], x['ver']) 
            segs.append(x) 
        print(x['ver'])
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start'])) 
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def read_tick1(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name] 
        query = self.build_query(start_date, end_date, symbol) 
        if not query: 
            print('cannot read the whole table') 
            return None  
        segs = [] 
        start_time = time.time()
        for x in collection.find(query): 
            x['data'] = self.deser(x['data'], x['ver']) 
            segs.append(x) 
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start'])) 
        time1 = time.time() - start_time
        start_time = time.time()
        data = pd.DataFrame(np.concatenate([x['data'] for x in segs], axis=0), columns=x['data'].columns).astype(x['data'].dtypes.to_dict())
        time2 = time.time() - start_time
        print(str(time1) + ',' + str(time2))
    
    def read_daily(self, table_name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, index_name=None, col=None, return_sdi=True): 
        collection = self.db[table_name]
        # Build projection 
        prj = {'_id': 0} 
        if col is not None: 
            if return_sdi: 
                col = ['skey', 'date', 'index_id'] + col 
            for col_name in col: 
                prj[col_name] = 1 
        # Build query 
        query = {} 
        if skey is not None: 
            query['skey'] = {'$in': skey} 
        if interval is not None: 
            query['interval'] = {'$in': interval} 
        if index_id is not None: 
            query['index_id'] = {'$in': index_id}    
        if index_name is not None:
            n = '' 
            for name in index_name: 
                try: 
                    name = re.compile('[\u4e00-\u9fff]+').findall(name)[0] 
                    if len(n) == 0: 
                        n = n = "|".join(name) 
                    else: 
                        n = n + '|' + "|".join(name) 
                except: 
                    if len(n) == 0: 
                        n = name 
                    else: 
                        n = n + '|' + name 
            query['index_name'] = {'$regex': n}
        if start_date is not None: 
            if end_date is not None: 
                query['date'] = {'$gte': start_date, '$lte': end_date} 
            else: 
                query['date'] = {'$gte': start_date} 
        elif end_date is not None: 
            query['date'] = {'$lte': end_date} 
        # Load data 
        cur = collection.find(query, prj) 
        df = pd.DataFrame.from_records(cur) 
        if df.empty: 
            df = pd.DataFrame() 
        else:
            if 'index_id' in df.columns:
                df = df.sort_values(by=['date', 'index_id', 'skey']).reset_index(drop=True)
            else:
                df = df.sort_values(by=['date','skey']).reset_index(drop=True)
        return df 
 
    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def deser(self, s, version): 
        def unpickle(s): 
            return pickle.loads(s) 
        if version == 1: 
            return unpickle(gzip.decompress(s)) 
        elif version == 2: 
            return unpickle(lzma.decompress(s)) 
        elif version == 3: 
            f = io.BytesIO() 
            f.write(s) 
            f.seek(0) 
            return pq.read_table(f, use_threads=False).to_pandas() 
        else: 
            raise Exception('unknown version')

def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()

import random
random.seed(1)

database_name = 'com_md_eq_cn'
user = 'zhenyuy'
password = 'bnONBrzSMGoE'
pd.set_option('max_columns', 200)
db1 = DB("192.168.10.178", database_name, user, password)
sl = list(db1.read_tick('md_order', 20201211, 20201211)['skey'].unique())
sl = random.sample(sl, 500)
db1.read_tick1('md_trade', 20201211, 20201211, symbol=sl)

3
12.690072774887085,1.0182745456695557


In [63]:
import pandas as pd
import numpy as np
import glob
import os
path = np.array(glob.glob(r'E:\re\***\***\***\***'))
path1 = np.array(glob.glob(r'E:\re\***\load_home1\***\***'))
path2 = list(set(path) - set(path1))
data1 = []
data2 = []
for i in path1:
    df = pd.read_csv(i).iloc[:, 1:]
    data1 += [df]
data1 = pd.concat(data1)
for i in path2:
    df = pd.read_csv(i).iloc[:, 1:]
    data2 += [df]
data2 = pd.concat(data2)
data1['mode'] = 'cluster load home1'
data = pd.concat([data1, data2])

In [64]:
import pandas as pd
import numpy as np

data['time3'] = data['time1'] + data['time2']
data = data.groupby(['case', 'core', 'mode'])['time1', 'time2', 'time3'].min().reset_index()
data1 = data[['case', 'core', 'mode', 'time1']]
data1['measure'] = 't_load'
data1 = data1.rename(columns={'time1':'time'})
data2 = data[['case', 'core', 'mode', 'time2']]
data2['measure'] = 't_concat'
data2 = data2.rename(columns={'time2':'time'})
data3 = data[['case', 'core', 'mode', 'time3']]
data3['measure'] = 't_total'
data3 = data3.rename(columns={'time3':'time'})
data = pd.concat([data1, data2, data3])
data['case'] = np.where(data['case'] == ' case1', 'case1', np.where(
data['case'] == ' case2', 'case2', np.where(
data['case'] == ' case3', 'case3', data['case'])))
re = pd.pivot_table(data, values=['time'], columns=['mode', 'core'], index=['case', 'measure']).reset_index()
re = re.fillna(0)
for i in re.columns[2:]:
    re[i] = re[i].round(3)
pd.set_option('max_columns', 200)
re

  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


Unnamed: 0_level_0,case,measure,time,time,time,time,time,time,time,time,time,time,time,time,time,time,time,time,time,time,time,time
mode,Unnamed: 1_level_1,Unnamed: 2_level_1,HPC load db,HPC load db,HPC load home,HPC load home,cluster load db,cluster load db,cluster load db,cluster load home,cluster load home,cluster load home,cluster load home1,cluster load home1,cluster load home1,cluster load shared,cluster load shared,cluster load shared,local load db,local load db,local load home,local load home
core,Unnamed: 1_level_2,Unnamed: 2_level_2,1,96,1,96,1,96,300,1,96,300,1,96,300,1,96,300,1,72,1,72
0,case1,t_concat,0.576,0.807,0.577,1.593,0.805,0.806,0.796,0.59,0.895,1.048,0.675,0.914,1.424,0.694,0.951,1.026,1.062,1.495,0.576,1.837
1,case1,t_load,6.516,103.66,2.038,4.998,3.727,70.287,223.948,2.617,2.822,3.885,0.861,1.347,2.503,2.617,2.896,3.694,5.682,10.117,2.234,6.634
2,case1,t_total,7.344,104.982,2.667,6.819,4.579,71.31,225.03,3.214,3.717,4.933,1.551,2.298,4.357,3.399,3.883,4.72,6.895,11.611,2.814,8.639
3,case2,t_concat,3.472,4.065,2.946,10.693,4.151,3.428,3.187,4.779,3.851,6.313,4.616,4.016,7.75,4.397,3.606,6.242,3.575,15.856,2.859,9.924
4,case2,t_load,20.302,503.086,4.228,15.396,14.436,336.911,1162.101,6.885,8.566,10.57,2.254,2.714,4.862,7.266,8.573,10.234,21.16,37.253,5.502,17.792
5,case2,t_total,24.468,507.151,7.383,27.429,18.588,340.565,1166.21,11.777,14.01,17.276,6.875,7.271,14.469,12.36,14.184,17.041,24.885,54.285,8.37,27.717
6,case3,t_concat,0.006,0.011,0.002,0.007,0.006,0.005,0.004,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.004,0.008,0.002,0.01
7,case3,t_load,0.096,0.184,0.008,0.039,0.709,0.039,0.044,0.013,0.183,0.192,0.003,0.022,0.128,0.013,0.037,0.127,0.059,0.258,0.01,0.056
8,case3,t_total,0.102,0.2,0.009,0.047,0.715,0.043,0.048,0.015,0.185,0.194,0.005,0.024,0.13,0.015,0.039,0.129,0.064,0.267,0.012,0.072


In [28]:
3126128792/(1024**3)

2.9114343151450157

In [17]:
data[data['mode'] == 'cluster load db']['case'].unique()

array([' case1', ' case2', ' case3'], dtype=object)