In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Build a ``DBObj`` for ``db_name`` on ``host`` using password authentication.

    Args:
        host: Host (optionally ``host:port``) placed in the MongoDB URI.
        db_name: Database to open; also used as ``authSource`` unless the
            user is ``admin`` or ``root``, in which case ``admin`` is used.
        user: User name, given in plain (unescaped) form.
        passwd: Password, given in plain (unescaped) form.

    Returns:
        A ``DBObj`` connected with the assembled URI.
    """
    from urllib.parse import quote_plus

    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Reserved characters such as ':', '/', '@' or '%' inside the credentials
    # would otherwise corrupt the URI, so both parts are percent-escaped.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store and retrieve DataFrames in one MongoDB database as compressed pickles.

    A DataFrame is split per (date, symbol) and then into chunks of at most
    ``chunk_size`` rows.  Every chunk becomes one document with the fields
    ``ver``, ``data``, ``date``, ``symbol`` and ``start``.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Create a client for ``uri`` and select the database ``db_name``.

        Args:
            uri: MongoDB connection URI passed to ``pymongo.MongoClient``.
            symbol_column: DataFrame column whose value is stored as ``symbol``.
            db_name: Name of the database that holds the tables.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000  # maximum number of rows per stored document
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split a ``mongodb://user:password@example.com`` style URI into its parts.

        The scheme is removed, surrounding ``/`` are stripped and the rest is
        split on ``:`` and ``@``.

        Returns:
            list of str: the pieces in their original order.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename the collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Replace the stored data of every (date, symbol) present in ``df``.

        For each group the existing documents with the same ``date`` and
        ``symbol`` are deleted before the new chunks are inserted.  An empty
        ``df`` is a no-op.

        Args:
            table_name: Target collection.
            df: DataFrame containing the date column and the symbol column.

        Raises:
            Exception: if ``df`` has no date column.
        """
        if len(df) == 0:
            return

        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        date = str(df[self.date_column].iloc[0])
        multi_date = len(df[self.date_column].unique()) > 1

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            groups = (
                (str(day), sym, sub_df)
                for (day, sym), sub_df in df.groupby([self.date_column, self.symbol_column])
            )
        else:
            # Group by the bare column name: grouping by a one-element list
            # yields 1-tuple keys in pandas 2.
            groups = ((date, sym, sub_df) for sym, sub_df in df.groupby(self.symbol_column))

        for day, sym, sub_df in groups:
            # Always store a plain Python int so that both branches write the
            # same type that build_query() searches for.
            symbol = int(sym)
            collection.delete_many({'date': day, 'symbol': symbol})
            self.write_single(collection, day, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` as consecutive chunks of ``chunk_size`` rows.

        ``start`` in each document is the positional offset of the chunk's
        first row inside ``df``; ``read`` uses it to restore the row order.
        """
        version = 1
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df.iloc[start:end]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Translate the optional filters into a MongoDB query document.

        Args:
            start_date: Inclusive lower bound; an 8-character string, an
                integer (Python or numpy) or a ``datetime.date``/``datetime``.
            end_date: Inclusive upper bound, same accepted types.
            symbol: One symbol or a list/tuple of symbols; each is converted
                with ``int``.  Falsy values add no symbol filter.

        Returns:
            dict: the query; empty when no filter was given.

        Raises:
            Exception: for a date string that is not 8 characters long or a
                date of an unsupported type.
        """
        import numbers

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            if isinstance(x, datetime.date):  # also covers datetime.datetime
                return x.strftime("%Y%m%d")
            if isinstance(x, numbers.Integral):  # Python and numpy integers
                return parse_date(str(x))
            raise Exception("invalid `date` type: " + str(type(x)))

        query = {}

        if start_date is not None or end_date is not None:
            bounds = {}
            if start_date is not None:
                bounds['$gte'] = parse_date(start_date)
            if end_date is not None:
                bounds['$lte'] = parse_date(end_date)
            query['date'] = bounds

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the documents matching the filters.

        Without any filter nothing is deleted, a message is printed and
        ``None`` is returned.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load the matching chunks and concatenate them into one DataFrame.

        Chunks are ordered by ``(symbol, date, start)`` before concatenation.

        Returns:
            The combined DataFrame with a fresh index, or ``None`` when no
            filter was given or no document matched.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for doc in collection.find(query):
            doc['data'] = self.deser(doc['data'], doc['ver'])
            segs.append(doc)
        if not segs:
            return None
        segs.sort(key=lambda doc: (doc['symbol'], doc['date'], doc['start']))
        return pd.concat([doc['data'] for doc in segs], ignore_index=True)

    def list_tables(self):
        """Return the names of the collections in the database."""
        # Database.collection_names() was deprecated in PyMongo 3.7 and removed
        # in 4.0; list_collection_names() is its replacement.
        return self.db.list_collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values of the matching documents.

        Missing bounds default to ``'00000000'`` and ``'99999999'``.
        """
        collection = self.db[table_name]
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        query = self.build_query(start_date, end_date, symbol)
        dates = {doc['date'] for doc in collection.find(query, {"date": 1, '_id': 0})}
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` (protocol 4) and compress it.

        Version 1 uses gzip (level 2), version 2 uses lzma (preset 1).

        Raises:
            Exception: for any other version.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of ``ser``: decompress ``s`` and unpickle the result.

        Raises:
            Exception: for a version other than 1 or 2.
        """
        # NOTE(review): pickle.loads can execute arbitrary code contained in
        # the payload; only read from databases whose content is trusted.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    """Register a stand-in ``pandas.core.internals.managers`` module on pandas < 0.24.

    When the installed pandas is older than 0.24 and ``sys.modules`` has no
    entry named ``pandas.core.internals.managers``, a module of that name
    exposing ``BlockManager`` is inserted.  Presumably this lets pickles that
    reference that module path load on older pandas (TODO confirm).
    """
    import re

    # Compare numerically: a plain string comparison orders versions
    # lexicographically (e.g. '0.3' would sort after '0.24').
    parsed = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if parsed is None:
        return
    if (int(parsed.group(1)), int(parsed.group(2))) >= (0, 24):
        return

    import sys
    from types import ModuleType
    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m
patch_pandas_pickle()











import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Show up to 200 columns when a DataFrame is displayed.  The fully qualified
# option name is used because the bare "max_columns" pattern can match more
# than one option in newer pandas releases.
pd.set_option("display.max_columns", 200)


year = "2020"
startDate = '20200102'
endDate = '20200529'
# Candidate daily folders are two levels below the per-year directory.
readPath = '/mnt/usb/data/' + year + '/***/***'
dataPathLs = np.array(glob.glob(readPath))
# The folder name up to the first '_' is compared as a YYYYMMDD string.
dateLs = np.array([os.path.basename(i).split('_')[0] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
# Calendar file; its "Date" column is checked below.
date_list = pd.read_csv("/home/work516/KR_upload_code/trading_days.csv")
wr_ong = []  # not used in this cell
mi_ss = []   # not used in this cell
less = []    # folders that are in the calendar but contain nothing under SH/

# Only the first matching folder is processed ([:1]).  `readPath`,
# `dataPathLs` and `dateLs` are rebound inside the loop; the loop itself keeps
# iterating over the slice it was started with.
for data in dataPathLs[:1]:
    if len(np.array(glob.glob(data + '/SH/***'))) == 0:
        if int(os.path.basename(data)) not in date_list["Date"].values:
            continue
        else:
            print(os.path.basename(data) + " less data!!!!!!!!!!!!!!!!!")
            less.append(data)
            continue
    startTm = datetime.datetime.now()
    date = os.path.basename(data)
    rar_path = data + '/SH/snapshot.7z'
    path = '/mnt/e/unzip_data/2020/SH'
    path1 = path + '/' + date
    un_path = path1
    # Extract the day's snapshot archive with 7za into its own folder.
    cmd = '7za x {} -o{}'.format(rar_path, un_path)
    os.system(cmd)
    print(datetime.datetime.now() - startTm)
    print(date + ' unzip finished')

    # Keep only the files whose numeric name is one of the four wanted codes.
    readPath = path1 + '/snapshot/***2/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs == 16) | (dateLs == 300) | (dateLs == 852) | (dateLs == 905)]
    SH = []
    ll = []  # codes whose file could not be read
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, usecols = [17,19,20,21,22,34,41,42])
        except Exception:
            # Best effort: an unreadable file is reported and skipped.
            # `Exception` (not a bare except) so Ctrl-C still interrupts.
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["StockID"] = int(os.path.basename(i).split('.')[0])
        SH += [df]
    del df
    SH = pd.concat(SH).reset_index(drop=True)

    # skey = file code + 1000000.
    SH["skey"] = SH["StockID"] + 1000000
    SH.drop(["StockID"],axis=1,inplace=True)
    # SendingTime is handled as an integer YYYYMMDDHHMMSSmmm: the leading
    # eight digits give the date, the remainder (x1000) becomes `time`.
    SH["date"] = int(SH["SendingTime"].iloc[0]//1000000000)
    SH["time"] = (SH['SendingTime'] - int(SH['SendingTime'].iloc[0]//1000000000*1000000000)).astype(np.int64) * 1000
    # Epoch microseconds; the naive datetime is interpreted in the local timezone.
    SH["clockAtArrival"] = SH["SendingTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH.drop(["SendingTime"],axis=1,inplace=True)
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))

    # The seven remaining CSV columns are renamed by position.
    SH.columns = ['cum_volume', 'open','high', 'prev_close', 'low', 'close', 'cum_amount', 'skey',
                  'date', 'time', 'clockAtArrival', 'datetime']
    SH = SH.fillna(0)
    SH = SH.drop_duplicates(['cum_volume', 'open','high', 'prev_close', 'low', 'close', 'cum_amount', 'skey',
                  'date', 'time', 'clockAtArrival', 'datetime'])
    # Every snapshot must fall on a whole second.
    assert(sum(SH['time']%1000000) == 0)

    # Build, per skey, a one-second grid from its first to its last snapshot.
    k1 = SH.groupby('skey')['datetime'].min().reset_index()
    k1 = k1.rename(columns={'datetime':'min'})
    k2 = SH.groupby('skey')['datetime'].max().reset_index()
    k2 = k2.rename(columns={'datetime':'max'})
    k = pd.merge(k1, k2, on='skey')
    k['diff'] = (k['max']-k['min']).apply(lambda x: x.seconds)
    df = pd.DataFrame()
    for i in np.arange(k.shape[0]):
        df1 = pd.DataFrame()
        df1['datetime1'] = [k.loc[i, 'min'] + datetime.timedelta(seconds=int(x)) for x in np.arange(0, k.loc[i, 'diff'] + 1)]
        df1['skey'] = k.loc[i, 'skey']
        assert(df1['datetime1'].min() == k.loc[i, 'min'])
        assert(df1['datetime1'].max() == k.loc[i, 'max'])
        df = pd.concat([df, df1])

    # Put the snapshots on the grid and forward-fill the gaps within each skey.
    SH = pd.merge(SH, df, left_on=['skey', 'datetime'], right_on=['skey', 'datetime1'], how='outer').sort_values(by=['skey', 'datetime1']).reset_index(drop=True)
    assert(SH[SH['datetime1'].isnull()].shape[0] == 0)
    for cols in ['date', 'cum_volume', 'cum_amount', 'prev_close', 'open', 'high', 'low', 'close']:
        SH[cols] = SH.groupby('skey')[cols].ffill()
    SH.drop(["datetime"],axis=1,inplace=True)
    SH = SH.rename(columns={'datetime1':'datetime'})
    SH['date'] = SH['date'].iloc[0]
    SH['date'] = SH['date'].astype('int32')
    SH['skey'] = SH['skey'].astype('int32')
    # Recompute time (HHMMSS) and clockAtArrival from the grid timestamps.
    SH["time"] = SH['datetime'].astype(str).apply(lambda x: int(x.split(' ')[1].replace(':', ""))).astype(np.int64)
    # Widen to int64 before scaling: `date` is int32 here and
    # date * 1000000 does not fit into 32 bits.
    SH['SendingTime'] = SH['date'].astype('int64') * 1000000 + SH['time']
    SH["clockAtArrival"] = SH["SendingTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
    SH.drop(["SendingTime"],axis=1,inplace=True)
    SH['time'] = SH['time'] * 1000000

    # Non-zero open / prev_close must be unique per skey, before and after
    # the per-skey maximum is broadcast onto the rows selected below.
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"])
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)

    for cols in ['open', 'high', 'prev_close', 'low', 'close']:
        SH[cols] = SH[cols].apply(lambda x: round(x, 4)).astype('float64')

    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "cum_volume", "cum_amount",
             "prev_close", "open", "high", "low", "close"]]
    SH = SH.sort_values(by=['skey', 'time'])
    # 1-based row counter within each skey.
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    SH['ordering'] = SH['ordering'].astype('int32')
    SH['cum_volume'] = SH['cum_volume'].astype('int64')
    # close is reported as 0 while nothing has traded yet.
    SH['close'] = np.where(SH['cum_volume'] > 0, SH['close'], 0)

    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "cum_volume", "cum_amount",
             "open", "close"]]

    display(SH["date"].iloc[0])
    print("index finished")

    # NOTE(review): the credentials are hard-coded in the notebook; consider
    # loading them from the environment or a secrets store.
    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.178", database_name, user, password)
    db1.write('md_index', SH)

    del SH

print(less)



0:00:01.398817
20200102 unzip finished


In [6]:
# Last row of each skey among the rows stamped before 13:00:00
# (`time` holds HHMMSS followed by six sub-second digits).
before_afternoon = SH['time'] < 130000000000
SH.loc[before_afternoon].groupby('skey').last()

Unnamed: 0_level_0,cum_volume,open,high,prev_close,low,close,cum_amount,date,time,clockAtArrival,datetime
skey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000016,35206849,3073.9313,3107.5172,3063.219,3073.9313,3093.1425,65777390000.0,20200102,125923000000,1577941163000000,2020-01-02 12:59:23
1000300,123615895,4121.3487,4172.6555,4096.58209,4121.3487,4154.6494,181217000000.0,20200102,125914000000,1577941154000000,2020-01-02 12:59:14
1000852,109423223,5603.9331,5670.7695,5567.033,5586.1399,5662.5843,108463500000.0,20200102,125908000000,1577941148000000,2020-01-02 12:59:08
1000905,92149511,5306.6677,5374.9082,5267.6622,5288.1675,5358.2605,86526450000.0,20200102,125914000000,1577941154000000,2020-01-02 12:59:14


In [7]:
# First row of each skey among the rows stamped at or after 13:00:00
# (`time` holds HHMMSS followed by six sub-second digits).
from_afternoon = SH['time'] >= 130000000000
SH.loc[from_afternoon].groupby('skey').first()

Unnamed: 0_level_0,cum_volume,open,high,prev_close,low,close,cum_amount,date,time,clockAtArrival,datetime
skey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000016,35278892,3073.9313,3107.5172,3063.219,3073.9313,3093.3177,65894760000.0,20200102,130003000000,1577941203000000,2020-01-02 13:00:03
1000300,123859520,4121.3487,4172.6555,4096.58209,4121.3487,4154.7372,181498600000.0,20200102,130003000000,1577941203000000,2020-01-02 13:00:03
1000852,109540949,5603.9331,5670.7695,5567.033,5586.1399,5662.5486,108557600000.0,20200102,130003000000,1577941203000000,2020-01-02 13:00:03
1000905,92255690,5306.6677,5374.9082,5267.6622,5288.1675,5358.2839,86616690000.0,20200102,130003000000,1577941203000000,2020-01-02 13:00:03


In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Build a ``DBObj`` for ``db_name`` on ``host`` using password authentication.

    Args:
        host: Host (optionally ``host:port``) placed in the MongoDB URI.
        db_name: Database to open; also used as ``authSource`` unless the
            user is ``admin`` or ``root``, in which case ``admin`` is used.
        user: User name, given in plain (unescaped) form.
        passwd: Password, given in plain (unescaped) form.

    Returns:
        A ``DBObj`` connected with the assembled URI.
    """
    from urllib.parse import quote_plus

    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Reserved characters such as ':', '/', '@' or '%' inside the credentials
    # would otherwise corrupt the URI, so both parts are percent-escaped.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store and retrieve DataFrames in one MongoDB database as compressed pickles.

    A DataFrame is split per (date, symbol) and then into chunks of at most
    ``chunk_size`` rows.  Every chunk becomes one document with the fields
    ``ver``, ``data``, ``date``, ``symbol`` and ``start``.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Create a client for ``uri`` and select the database ``db_name``.

        Args:
            uri: MongoDB connection URI passed to ``pymongo.MongoClient``.
            symbol_column: DataFrame column whose value is stored as ``symbol``.
            db_name: Name of the database that holds the tables.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000  # maximum number of rows per stored document
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split a ``mongodb://user:password@example.com`` style URI into its parts.

        The scheme is removed, surrounding ``/`` are stripped and the rest is
        split on ``:`` and ``@``.

        Returns:
            list of str: the pieces in their original order.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename the collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Replace the stored data of every (date, symbol) present in ``df``.

        For each group the existing documents with the same ``date`` and
        ``symbol`` are deleted before the new chunks are inserted.  An empty
        ``df`` is a no-op.

        Args:
            table_name: Target collection.
            df: DataFrame containing the date column and the symbol column.

        Raises:
            Exception: if ``df`` has no date column.
        """
        if len(df) == 0:
            return

        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        date = str(df[self.date_column].iloc[0])
        multi_date = len(df[self.date_column].unique()) > 1

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            groups = (
                (str(day), sym, sub_df)
                for (day, sym), sub_df in df.groupby([self.date_column, self.symbol_column])
            )
        else:
            # Group by the bare column name: grouping by a one-element list
            # yields 1-tuple keys in pandas 2.
            groups = ((date, sym, sub_df) for sym, sub_df in df.groupby(self.symbol_column))

        for day, sym, sub_df in groups:
            # Always store a plain Python int so that both branches write the
            # same type that build_query() searches for.
            symbol = int(sym)
            collection.delete_many({'date': day, 'symbol': symbol})
            self.write_single(collection, day, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` as consecutive chunks of ``chunk_size`` rows.

        ``start`` in each document is the positional offset of the chunk's
        first row inside ``df``; ``read`` uses it to restore the row order.
        """
        version = 1
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df.iloc[start:end]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Translate the optional filters into a MongoDB query document.

        Args:
            start_date: Inclusive lower bound; an 8-character string, an
                integer (Python or numpy) or a ``datetime.date``/``datetime``.
            end_date: Inclusive upper bound, same accepted types.
            symbol: One symbol or a list/tuple of symbols; each is converted
                with ``int``.  Falsy values add no symbol filter.

        Returns:
            dict: the query; empty when no filter was given.

        Raises:
            Exception: for a date string that is not 8 characters long or a
                date of an unsupported type.
        """
        import numbers

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            if isinstance(x, datetime.date):  # also covers datetime.datetime
                return x.strftime("%Y%m%d")
            if isinstance(x, numbers.Integral):  # Python and numpy integers
                return parse_date(str(x))
            raise Exception("invalid `date` type: " + str(type(x)))

        query = {}

        if start_date is not None or end_date is not None:
            bounds = {}
            if start_date is not None:
                bounds['$gte'] = parse_date(start_date)
            if end_date is not None:
                bounds['$lte'] = parse_date(end_date)
            query['date'] = bounds

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the documents matching the filters.

        Without any filter nothing is deleted, a message is printed and
        ``None`` is returned.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load the matching chunks and concatenate them into one DataFrame.

        Chunks are ordered by ``(symbol, date, start)`` before concatenation.

        Returns:
            The combined DataFrame with a fresh index, or ``None`` when no
            filter was given or no document matched.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for doc in collection.find(query):
            doc['data'] = self.deser(doc['data'], doc['ver'])
            segs.append(doc)
        if not segs:
            return None
        segs.sort(key=lambda doc: (doc['symbol'], doc['date'], doc['start']))
        return pd.concat([doc['data'] for doc in segs], ignore_index=True)

    def list_tables(self):
        """Return the names of the collections in the database."""
        # Database.collection_names() was deprecated in PyMongo 3.7 and removed
        # in 4.0; list_collection_names() is its replacement.
        return self.db.list_collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values of the matching documents.

        Missing bounds default to ``'00000000'`` and ``'99999999'``.
        """
        collection = self.db[table_name]
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        query = self.build_query(start_date, end_date, symbol)
        dates = {doc['date'] for doc in collection.find(query, {"date": 1, '_id': 0})}
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` (protocol 4) and compress it.

        Version 1 uses gzip (level 2), version 2 uses lzma (preset 1).

        Raises:
            Exception: for any other version.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of ``ser``: decompress ``s`` and unpickle the result.

        Raises:
            Exception: for a version other than 1 or 2.
        """
        # NOTE(review): pickle.loads can execute arbitrary code contained in
        # the payload; only read from databases whose content is trusted.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    """Register a stand-in ``pandas.core.internals.managers`` module on pandas < 0.24.

    When the installed pandas is older than 0.24 and ``sys.modules`` has no
    entry named ``pandas.core.internals.managers``, a module of that name
    exposing ``BlockManager`` is inserted.  Presumably this lets pickles that
    reference that module path load on older pandas (TODO confirm).
    """
    import re

    # Compare numerically: a plain string comparison orders versions
    # lexicographically (e.g. '0.3' would sort after '0.24').
    parsed = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if parsed is None:
        return
    if (int(parsed.group(1)), int(parsed.group(2))) >= (0, 24):
        return

    import sys
    from types import ModuleType
    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m
patch_pandas_pickle()











import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Show up to 200 columns when a DataFrame is displayed.  The fully qualified
# option name is used because the bare "max_columns" pattern can match more
# than one option in newer pandas releases.
pd.set_option("display.max_columns", 200)


year = "2020"
startDate = '20200417'
endDate = '20200630'
# Daily folders that already hold the extracted snapshot files.
readPath = '/mnt/e/unzip_data/2020/SH/***'
dataPathLs = np.array(glob.glob(readPath))
# The folder name up to the first '_' is compared as a YYYYMMDD string.
dateLs = np.array([os.path.basename(i).split('_')[0] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
date_list = pd.read_csv("/home/work516/KR_upload_code/trading_days.csv")
wr_ong = []  # not used in this cell
mi_ss = []   # not used in this cell
less = []    # never filled in this cell; printed at the end

# `readPath`, `dataPathLs` and `dateLs` are rebound inside the loop; the loop
# itself keeps iterating over the sorted array it was started with.
for data in np.sort(dataPathLs):
    # Keep only the files whose numeric name is one of the four wanted codes.
    readPath = data + '/snapshot/***2/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs == 16) | (dateLs == 300) | (dateLs == 852) | (dateLs == 905)]
    SH = []
    ll = []  # codes whose file could not be read
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, usecols = [17,19,20,21,22,34,41,42])
        except Exception:
            # Best effort: an unreadable file is reported and skipped.
            # `Exception` (not a bare except) so Ctrl-C still interrupts.
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["StockID"] = int(os.path.basename(i).split('.')[0])
        SH += [df]
    del df
    SH = pd.concat(SH).reset_index(drop=True)

    # skey = file code + 1000000.
    SH["skey"] = SH["StockID"] + 1000000
    SH.drop(["StockID"],axis=1,inplace=True)
    # SendingTime is handled as an integer YYYYMMDDHHMMSSmmm: the leading
    # eight digits give the date, the remainder (x1000) becomes `time`.
    SH["date"] = int(SH["SendingTime"].iloc[0]//1000000000)
    SH["time"] = (SH['SendingTime'] - int(SH['SendingTime'].iloc[0]//1000000000*1000000000)).astype(np.int64) * 1000
    # Epoch microseconds; the naive datetime is interpreted in the local timezone.
    SH["clockAtArrival"] = SH["SendingTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH.drop(["SendingTime"],axis=1,inplace=True)
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))

    # The seven remaining CSV columns are renamed by position.
    SH.columns = ['cum_volume', 'open','high', 'prev_close', 'low', 'close', 'cum_amount', 'skey',
                  'date', 'time', 'clockAtArrival', 'datetime']
    SH = SH.fillna(0)
    SH = SH.drop_duplicates(['cum_volume', 'open','high', 'prev_close', 'low', 'close', 'cum_amount', 'skey',
                  'date', 'time', 'clockAtArrival', 'datetime'])
    # Every snapshot must fall on a whole second.
    assert(sum(SH['time']%1000000) == 0)
    # NOTE(review): this passes as soon as ONE skey has its last zero-volume
    # row before its first traded row; confirm whether all skeys were meant.
    assert(sum(SH[SH['cum_volume'] == 0].groupby('skey')['time'].max()
               < SH[SH['cum_volume'] > 0].groupby('skey')['time'].min()))
    # From m_ax onwards every row must have a duplicate in the value columns
    # and `close` must be constant per skey.
    m_ax = SH[SH['time'] <= 150500000000].groupby('skey').last()['time'].min()
    assert((SH[SH['time'] >= m_ax].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close',
                                               'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0) & \
               (sum(SH[SH['time'] >= m_ax].groupby('skey')['close'].nunique() != 1) == 0))
    # Keep traded rows up to 15:05:00 only.
    SH = SH[(SH['cum_volume'] > 0) & (SH['time'] <= 150500000000)]

    # Build, per skey, a one-second grid from its first to its last snapshot.
    k1 = SH.groupby('skey')['datetime'].min().reset_index()
    k1 = k1.rename(columns={'datetime':'min'})
    k2 = SH.groupby('skey')['datetime'].max().reset_index()
    k2 = k2.rename(columns={'datetime':'max'})
    k = pd.merge(k1, k2, on='skey')
    k['diff'] = (k['max']-k['min']).apply(lambda x: x.seconds)
    df = pd.DataFrame()
    for i in np.arange(k.shape[0]):
        df1 = pd.DataFrame()
        df1['datetime1'] = [k.loc[i, 'min'] + datetime.timedelta(seconds=int(x)) for x in np.arange(0, k.loc[i, 'diff'] + 1)]
        df1['skey'] = k.loc[i, 'skey']
        assert(df1['datetime1'].min() == k.loc[i, 'min'])
        assert(df1['datetime1'].max() == k.loc[i, 'max'])
        df = pd.concat([df, df1])

    # Put the snapshots on the grid and forward-fill the gaps within each skey.
    SH = pd.merge(SH, df, left_on=['skey', 'datetime'], right_on=['skey', 'datetime1'], how='outer').sort_values(by=['skey', 'datetime1']).reset_index(drop=True)
    assert(SH[SH['datetime1'].isnull()].shape[0] == 0)
    for cols in ['date', 'cum_volume', 'cum_amount', 'prev_close', 'open', 'high', 'low', 'close']:
        SH[cols] = SH.groupby('skey')[cols].ffill()
    SH.drop(["datetime"],axis=1,inplace=True)
    SH = SH.rename(columns={'datetime1':'datetime'})
    SH['date'] = SH['date'].iloc[0]
    SH['date'] = SH['date'].astype('int32')
    SH['skey'] = SH['skey'].astype('int32')
    # Recompute time (HHMMSS) and clockAtArrival from the grid timestamps.
    SH["time"] = SH['datetime'].astype(str).apply(lambda x: int(x.split(' ')[1].replace(':', ""))).astype(np.int64)
    # Widen to int64 before scaling: `date` is int32 here and
    # date * 1000000 does not fit into 32 bits.
    SH['SendingTime'] = SH['date'].astype('int64') * 1000000 + SH['time']
    SH["clockAtArrival"] = SH["SendingTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
    SH.drop(["SendingTime"],axis=1,inplace=True)
    SH['time'] = SH['time'] * 1000000

    # Non-zero open / prev_close must be unique per skey, before and after
    # the per-skey maximum is broadcast onto the rows selected below.
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"])
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)

    for cols in ['open', 'high', 'prev_close', 'low', 'close']:
        SH[cols] = SH[cols].apply(lambda x: round(x, 4)).astype('float64')

    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "cum_volume", "cum_amount",
             "prev_close", "open", "high", "low", "close"]]
    # Drop the forward-filled rows of the midday gap.  The window runs from
    # m_in (earliest per-skey last row at or before 11:35:00) to m_ax (latest
    # per-skey first row at or after 12:55:00) and must contain no change.
    m_in = SH[SH['time'] <= 113500000000].groupby('skey').last()['time'].min()
    m_ax = SH[SH['time'] >= 125500000000].groupby('skey').first()['time'].max()
    try:
        assert((SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close',
                                               'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0)
          & (sum(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].groupby('skey')['cum_volume'].nunique() != 1) == 0) &
           (sum(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].groupby('skey')['close'].nunique() != 1) == 0))
        SH = pd.concat([SH[SH['time'] <= 113500000000], SH[SH['time'] >= 125500000000]])
    except Exception:
        # The window is not static: print where the values change and move
        # one edge of the window to the latest change time `tt`.
        # `Exception` (not a bare except) so Ctrl-C still interrupts.
        print(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close',
                                               'close', 'cum_amount', 'skey', 'date'], keep='first').groupby('skey')['time'].unique())
        tt = SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close',
                                               'close', 'cum_amount', 'skey', 'date'], keep='first').groupby('skey')['time'].last().unique().max()
        if tt < 121500000000:
            # Change before 12:15:00: start the gap at tt instead.
            m_in = tt
            assert((SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close',
                                               'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0)
          & (sum(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].groupby('skey')['cum_volume'].nunique() != 1) == 0) &
           (sum(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].groupby('skey')['close'].nunique() != 1) == 0))
            SH = pd.concat([SH[SH['time'] <= m_in], SH[SH['time'] >= 125500000000]])
        else:
            # Otherwise end the gap at tt (exclusive).
            m_ax = tt
            assert((SH[(SH['time'] >= m_in) & (SH['time'] < m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close',
                                               'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0)
          & (sum(SH[(SH['time'] >= m_in) & (SH['time'] < m_ax)].groupby('skey')['cum_volume'].nunique() != 1) == 0) &
           (sum(SH[(SH['time'] >= m_in) & (SH['time'] < m_ax)].groupby('skey')['close'].nunique() != 1) == 0))
            SH = pd.concat([SH[SH['time'] <= 113500000000], SH[SH['time'] >= m_ax]])


    SH = SH.sort_values(by=['skey', 'time'])
    # 1-based row counter within each skey.
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    SH['ordering'] = SH['ordering'].astype('int32')
    SH['cum_volume'] = SH['cum_volume'].astype('int64')

    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "cum_volume", "cum_amount",
             "open", "close"]]

    display(SH["date"].iloc[0])
    print("index finished")

    # NOTE(review): the credentials are hard-coded in the notebook; consider
    # loading them from the environment or a secrets store.
    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.178", database_name, user, password)
    db1.write('md_index', SH)

    del SH

print(less)



20200417

index finished
skey
1000016                  [113500000000]
1000300    [113500000000, 124237000000]
1000852    [113500000000, 124237000000]
1000905    [113500000000, 124237000000]
Name: time, dtype: object


20200420

index finished


20200421

index finished


20200422

index finished


20200423

index finished


20200424

index finished


20200427

index finished


20200428

index finished


20200429

index finished


20200430

index finished


20200506

index finished


20200507

index finished


20200508

index finished


20200511

index finished


20200512

index finished


20200513

index finished


20200514

index finished


20200515

index finished


20200518

index finished


20200519

index finished


20200520

index finished


20200521

index finished


20200522

index finished


20200525

index finished


20200526

index finished


20200527

index finished


20200528

index finished


20200529

index finished


20200601

index finished


20200602

index finished


20200603

index finished


20200604

index finished


20200605

index finished


20200608

index finished


20200609

index finished


20200610

index finished


20200611

index finished


20200612

index finished


20200615

index finished


20200616

index finished


20200617

index finished


20200618

index finished


20200619

index finished


20200622

index finished


20200623

index finished


20200624

index finished


20200629

index finished


20200630

index finished
[]


In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Build a ``DBObj`` for ``db_name`` on ``host`` using password authentication.

    Args:
        host: Host (optionally ``host:port``) placed in the MongoDB URI.
        db_name: Database to open; also used as ``authSource`` unless the
            user is ``admin`` or ``root``, in which case ``admin`` is used.
        user: User name, given in plain (unescaped) form.
        passwd: Password, given in plain (unescaped) form.

    Returns:
        A ``DBObj`` connected with the assembled URI.
    """
    from urllib.parse import quote_plus

    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Reserved characters such as ':', '/', '@' or '%' inside the credentials
    # would otherwise corrupt the URI, so both parts are percent-escaped.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store and retrieve pandas DataFrames in a MongoDB database.

    A DataFrame is split per ``(date, symbol)`` group into chunks of at most
    ``chunk_size`` rows. Each chunk is pickled, compressed and stored as one
    document with the keys ``ver``, ``data``, ``date``, ``symbol`` and
    ``start``.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Connect to ``uri`` and select database ``db_name``.

        ``symbol_column`` names the DataFrame column whose value becomes the
        ``symbol`` key of the stored documents.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split a ``mongodb://user:password@example.com`` URI into its parts.

        Returns the list obtained by removing the scheme, trimming slashes and
        splitting on ``:`` and ``@``.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Write ``df`` to collection ``table_name``.

        Existing documents for every ``(date, symbol)`` present in ``df`` are
        deleted before the new chunks are inserted. An empty ``df`` is a no-op.

        Raises:
            Exception: if ``df`` has no ``date`` column.
        """
        if len(df) == 0:
            return

        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        date = str(df[self.date_column].iloc[0])
        multi_date = len(df[self.date_column].unique()) > 1

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            # Group by the column name itself (not a one-element list) so the
            # key is a scalar, and normalise it to a plain int exactly like
            # the multi-date branch does.
            for symbol, sub_df in df.groupby(self.symbol_column):
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` as chunks of ``chunk_size`` rows.

        ``start`` in each document is the row offset of the chunk inside
        ``df``; ``read`` uses it to restore the original row order.
        """
        version = 1
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            # Row offsets are positional, so slice positionally.
            df_seg = df.iloc[start:end]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter for a date range and/or symbol(s).

        Args:
            start_date, end_date: Inclusive bounds. Each may be an 8-character
                ``YYYYMMDD`` string, an integer (including NumPy integers), or
                a ``datetime.date`` / ``datetime.datetime``.
            symbol: A single symbol or a list/tuple of symbols; each value is
                converted with ``int``. Falsy values add no symbol filter.

        Returns:
            A dict usable as a PyMongo filter (empty when nothing was given).

        Raises:
            Exception: for a date string that is not 8 characters long or a
                date of an unsupported type.
        """
        import numbers

        query = {}

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            if isinstance(x, datetime.date):  # also covers datetime.datetime
                return x.strftime("%Y%m%d")
            if isinstance(x, numbers.Integral):
                # Accept NumPy integers too (e.g. values taken from an int32
                # `date` column), not only the builtin int.
                return parse_date(str(int(x)))
            raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the documents matching the given date range / symbol(s).

        Refuses to run (prints a message and returns ``None``) when no filter
        is given, so the whole table cannot be wiped by accident.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Read the matching chunks and return them as one DataFrame.

        Chunks are ordered by ``(symbol, date, start)`` before concatenation.
        Returns ``None`` when no filter is given or nothing matches.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        """Return the names of the collections in the database."""
        # `collection_names` was removed in PyMongo 4. Check the class, not
        # the instance: attribute access on a Database instance falls back to
        # returning a Collection for unknown names.
        if hasattr(type(self.db), 'list_collection_names'):
            return self.db.list_collection_names()
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values stored in ``table_name``.

        Missing bounds default to ``'00000000'`` / ``'99999999'``.
        """
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` and compress it (version 1: gzip, version 2: lzma).

        Raises:
            Exception: for any other ``version``.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of :meth:`ser`: decompress ``s`` and unpickle it.

        Raises:
            Exception: for an unknown ``version``.
        """
        # SECURITY: pickle.loads executes arbitrary code from its input; only
        # use this against databases whose contents are trusted.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    """Register a stand-in ``pandas.core.internals.managers`` on old pandas.

    For pandas older than 0.24, a module object exposing ``BlockManager`` is
    placed in ``sys.modules`` under ``pandas.core.internals.managers`` unless
    that name is already present. Presumably this lets pickles that reference
    the newer module path be loaded by the older pandas -- confirm with the
    data producers.
    """
    import re

    # Compare the version numerically; a plain string comparison misorders
    # versions such as '0.9' and '0.24'.
    found = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if found is None or (int(found.group(1)), int(found.group(2))) >= (0, 24):
        return

    import sys
    from types import ModuleType
    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m
patch_pandas_pickle()











import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Allow up to 200 columns when a DataFrame is displayed.
pd.set_option("max_columns", 200)

# Inclusive date window; compared as strings against the day-directory names.
year = "2020"
startDate = '20201202'
endDate = '20201202'
readPath = '/mnt/Kevin_zhenyu/KR_daily_data' + '/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
date_list = pd.read_csv("/home/work516/KR_upload_code/trading_days.csv")
# NOTE(review): nothing in this cell appends to these lists; `less` is only
# printed (empty) at the end.
wr_ong = []
mi_ss = []
less = []

# One iteration per day directory. `readPath`, `dataPathLs` and `dateLs` are
# rebound inside the loop; the iterable was already evaluated by np.sort.
for data in np.sort(dataPathLs):
    readPath = data + '/SH/snapshot/***2/***'
    dataPathLs = np.array(glob.glob(readPath))
    # File stem as an integer id; keep only ids 16, 300, 852 and 905.
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs == 16) | (dateLs == 300) | (dateLs == 852) | (dateLs == 905)]
    SH = []
    ll = []
    for i in dataPathLs:
        # NOTE(review): the bare `except` treats every failure (not only an
        # empty file) as "empty data" and skips the file.
        try:
            df = pd.read_csv(i, usecols = [17,19,20,21,22,34,41,42])
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["StockID"] = int(os.path.basename(i).split('.')[0])
        SH += [df]
    # NOTE(review): if no file could be read, `df` may be unbound here and
    # pd.concat below would receive an empty list.
    del df
    SH = pd.concat(SH).reset_index(drop=True)
    
    # skey = file id + 1000000 (e.g. 300 -> 1000300).
    SH["skey"] = SH["StockID"] + 1000000
    SH.drop(["StockID"],axis=1,inplace=True)
    # Presumably SendingTime is an integer of the form YYYYMMDDHHMMSSmmm
    # (it is parsed with '%Y%m%d%H%M%S%f' below) -- TODO confirm. The date is
    # taken from the first row only and applied to every row.
    SH["date"] = int(SH["SendingTime"].iloc[0]//1000000000)
    SH["time"] = (SH['SendingTime'] - int(SH['SendingTime'].iloc[0]//1000000000*1000000000)).astype(np.int64) * 1000
    # Local-time epoch timestamp in microseconds, and the matching datetime.
    SH["clockAtArrival"] = SH["SendingTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH.drop(["SendingTime"],axis=1,inplace=True)
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))

    # Positional rename: the first seven names go to the remaining CSV columns
    # (in file order), the last five to the columns derived above.
    SH.columns = ['cum_volume', 'open','high', 'prev_close', 'low', 'close', 'cum_amount', 'skey', 
                  'date', 'time', 'clockAtArrival', 'datetime']
    SH = SH.fillna(0)
    SH = SH.drop_duplicates(['cum_volume', 'open','high', 'prev_close', 'low', 'close', 'cum_amount', 'skey', 
                  'date', 'time', 'clockAtArrival', 'datetime'])
    # Every `time` must be a multiple of 1000000.
    assert(sum(SH['time']%1000000) == 0)
    # NOTE(review): this passes as soon as the comparison is True for at least
    # one skey (sum of booleans is non-zero); `.all()` may have been intended.
    assert(sum(SH[SH['cum_volume'] == 0].groupby('skey')['time'].max() 
               < SH[SH['cum_volume'] > 0].groupby('skey')['time'].min()))
    # m_ax: earliest, across skeys, of each skey's last row at or before
    # 150500000000. From there on, every row must have a duplicate on the
    # listed columns and `close` must be constant per skey.
    m_ax = SH[SH['time'] <= 150500000000].groupby('skey').last()['time'].min()
    assert((SH[SH['time'] >= m_ax].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close', 
                                               'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0) & \
               (sum(SH[SH['time'] >= m_ax].groupby('skey')['close'].nunique() != 1) == 0))
    SH = SH[(SH['cum_volume'] > 0) & (SH['time'] <= 150500000000)]
    
    # Per-skey first/last datetime and the span between them in seconds.
    k1 = SH.groupby('skey')['datetime'].min().reset_index()
    k1 = k1.rename(columns={'datetime':'min'})
    k2 = SH.groupby('skey')['datetime'].max().reset_index()
    k2 = k2.rename(columns={'datetime':'max'})
    k = pd.merge(k1, k2, on='skey')
    k['diff'] = (k['max']-k['min']).apply(lambda x: x.seconds)
    # Build a one-row-per-second grid (`datetime1`) from min to max per skey.
    df = pd.DataFrame()
    for i in np.arange(k.shape[0]):
        df1 = pd.DataFrame()
        df1['datetime1'] = [k.loc[i, 'min'] + datetime.timedelta(seconds=int(x)) for x in np.arange(0, k.loc[i, 'diff'] + 1)]
        df1['skey'] = k.loc[i, 'skey']
        assert(df1['datetime1'].min() == k.loc[i, 'min'])
        assert(df1['datetime1'].max() == k.loc[i, 'max'])
        df = pd.concat([df, df1])
    
    # Outer-join the data onto the grid; every data row must land on a grid
    # second. Grid seconds without data are forward-filled within each skey.
    SH = pd.merge(SH, df, left_on=['skey', 'datetime'], right_on=['skey', 'datetime1'], how='outer').sort_values(by=['skey', 'datetime1']).reset_index(drop=True)
    assert(SH[SH['datetime1'].isnull()].shape[0] == 0)
    for cols in ['date', 'cum_volume', 'cum_amount', 'prev_close', 'open', 'high', 'low', 'close']:
        SH[cols] = SH.groupby('skey')[cols].ffill()
    SH.drop(["datetime"],axis=1,inplace=True)
    SH = SH.rename(columns={'datetime1':'datetime'})
    SH['date'] = SH['date'].iloc[0]
    SH['date'] = SH['date'].astype('int32')
    SH['skey'] = SH['skey'].astype('int32')
    # Recompute `time` (HHMMSS as an integer) and `clockAtArrival` from the
    # grid datetime, then scale `time` by 1000000.
    SH["time"] = SH['datetime'].astype(str).apply(lambda x: int(x.split(' ')[1].replace(':', ""))).astype(np.int64)
    SH['SendingTime'] = SH['date'] * 1000000 + SH['time']
    SH["clockAtArrival"] = SH["SendingTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
    SH.drop(["SendingTime"],axis=1,inplace=True)
    SH['time'] = SH['time'] * 1000000
    
    # Each skey must have exactly one non-zero `open` and `prev_close`, both
    # before and after they are replaced by the per-skey maximum.
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"]) 
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)
    
    # Round the price columns to 4 decimals.
    for cols in ['open', 'high', 'prev_close', 'low', 'close']:
        SH[cols] = SH[cols].apply(lambda x: round(x, 4)).astype('float64')

    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "cum_volume", "cum_amount", 
             "prev_close", "open", "high", "low", "close"]]    
    # Between m_in and m_ax (around the 113500000000 - 125500000000 window)
    # every row must be duplicated and cum_volume / close constant per skey;
    # rows strictly inside that window are then dropped.
    m_in = SH[SH['time'] <= 113500000000].groupby('skey').last()['time'].min()
    m_ax = SH[SH['time'] >= 125500000000].groupby('skey').first()['time'].max()
    assert((SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close', 
                                               'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0)
          & (sum(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].groupby('skey')['cum_volume'].nunique() != 1) == 0) & 
           (sum(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].groupby('skey')['close'].nunique() != 1) == 0))
    SH = pd.concat([SH[SH['time'] <= 113500000000], SH[SH['time'] >= 125500000000]])
    SH = SH.sort_values(by=['skey', 'time'])
    # 1-based row counter per skey, in time order.
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    SH['ordering'] = SH['ordering'].astype('int32')
    SH['cum_volume'] = SH['cum_volume'].astype('int64')
    
    # Final column set that is written to the database.
    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "cum_volume", "cum_amount", 
             "open", "close"]]
            
    display(SH["date"].iloc[0])
    print("index finished")
    
    # NOTE(review): credentials are hard-coded in the notebook; consider
    # loading them from the environment or a config file instead.
    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    # A new connection is opened for every day; DB.write replaces any
    # existing (date, symbol) documents in 'md_index'.
    db1 = DB("192.168.10.178", database_name, user, password)
    db1.write('md_index', SH)
    
    del SH

print(less)



20201202

index finished
[]


In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Create a :class:`DBObj` for database ``db_name`` on ``host``.

    Args:
        host: MongoDB host (optionally ``host:port``).
        db_name: Database to open; also used as the authentication database
            unless ``user`` is ``'admin'`` or ``'root'``, in which case the
            ``admin`` database is used.
        user: User name, given as plain (unescaped) text.
        passwd: Password, given as plain (unescaped) text.

    Returns:
        A ``DBObj`` built from the resulting connection URI.
    """
    from urllib.parse import quote_plus

    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Credentials must be percent-escaped, otherwise characters such as
    # '@', ':', '/' or '%' break the URI.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(user), quote_plus(passwd), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store and retrieve pandas DataFrames in a MongoDB database.

    A DataFrame is split per ``(date, symbol)`` group into chunks of at most
    ``chunk_size`` rows. Each chunk is pickled, compressed and stored as one
    document with the keys ``ver``, ``data``, ``date``, ``symbol`` and
    ``start``.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Connect to ``uri`` and select database ``db_name``.

        ``symbol_column`` names the DataFrame column whose value becomes the
        ``symbol`` key of the stored documents.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split a ``mongodb://user:password@example.com`` URI into its parts.

        Returns the list obtained by removing the scheme, trimming slashes and
        splitting on ``:`` and ``@``.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Write ``df`` to collection ``table_name``.

        Existing documents for every ``(date, symbol)`` present in ``df`` are
        deleted before the new chunks are inserted. An empty ``df`` is a no-op.

        Raises:
            Exception: if ``df`` has no ``date`` column.
        """
        if len(df) == 0:
            return

        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        date = str(df[self.date_column].iloc[0])
        multi_date = len(df[self.date_column].unique()) > 1

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            # Group by the column name itself (not a one-element list) so the
            # key is a scalar, and normalise it to a plain int exactly like
            # the multi-date branch does.
            for symbol, sub_df in df.groupby(self.symbol_column):
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` as chunks of ``chunk_size`` rows.

        ``start`` in each document is the row offset of the chunk inside
        ``df``; ``read`` uses it to restore the original row order.
        """
        version = 1
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            # Row offsets are positional, so slice positionally.
            df_seg = df.iloc[start:end]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter for a date range and/or symbol(s).

        Args:
            start_date, end_date: Inclusive bounds. Each may be an 8-character
                ``YYYYMMDD`` string, an integer (including NumPy integers), or
                a ``datetime.date`` / ``datetime.datetime``.
            symbol: A single symbol or a list/tuple of symbols; each value is
                converted with ``int``. Falsy values add no symbol filter.

        Returns:
            A dict usable as a PyMongo filter (empty when nothing was given).

        Raises:
            Exception: for a date string that is not 8 characters long or a
                date of an unsupported type.
        """
        import numbers

        query = {}

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            if isinstance(x, datetime.date):  # also covers datetime.datetime
                return x.strftime("%Y%m%d")
            if isinstance(x, numbers.Integral):
                # Accept NumPy integers too (e.g. values taken from an int32
                # `date` column), not only the builtin int.
                return parse_date(str(int(x)))
            raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the documents matching the given date range / symbol(s).

        Refuses to run (prints a message and returns ``None``) when no filter
        is given, so the whole table cannot be wiped by accident.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Read the matching chunks and return them as one DataFrame.

        Chunks are ordered by ``(symbol, date, start)`` before concatenation.
        Returns ``None`` when no filter is given or nothing matches.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        """Return the names of the collections in the database."""
        # `collection_names` was removed in PyMongo 4. Check the class, not
        # the instance: attribute access on a Database instance falls back to
        # returning a Collection for unknown names.
        if hasattr(type(self.db), 'list_collection_names'):
            return self.db.list_collection_names()
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values stored in ``table_name``.

        Missing bounds default to ``'00000000'`` / ``'99999999'``.
        """
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` and compress it (version 1: gzip, version 2: lzma).

        Raises:
            Exception: for any other ``version``.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of :meth:`ser`: decompress ``s`` and unpickle it.

        Raises:
            Exception: for an unknown ``version``.
        """
        # SECURITY: pickle.loads executes arbitrary code from its input; only
        # use this against databases whose contents are trusted.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    """Register a stand-in ``pandas.core.internals.managers`` on old pandas.

    For pandas older than 0.24, a module object exposing ``BlockManager`` is
    placed in ``sys.modules`` under ``pandas.core.internals.managers`` unless
    that name is already present. Presumably this lets pickles that reference
    the newer module path be loaded by the older pandas -- confirm with the
    data producers.
    """
    import re

    # Compare the version numerically; a plain string comparison misorders
    # versions such as '0.9' and '0.24'.
    found = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if found is None or (int(found.group(1)), int(found.group(2))) >= (0, 24):
        return

    import sys
    from types import ModuleType
    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m
patch_pandas_pickle()











import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Allow up to 200 columns when a DataFrame is displayed.
pd.set_option("max_columns", 200)

# Inclusive date window; compared as strings against the day-directory names.
year = "2020"
startDate = '20201016'
endDate = '20201016'
readPath = '/mnt/ShareWithServer/data' + '/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
date_list = pd.read_csv("/home/work516/KR_upload_code/trading_days.csv")
# NOTE(review): nothing in this cell appends to these lists; `less` is only
# printed (empty) at the end.
wr_ong = []
mi_ss = []
less = []

# One iteration per day directory. `readPath`, `dataPathLs` and `dateLs` are
# rebound inside the loop; the iterable was already evaluated by np.sort.
for data in np.sort(dataPathLs):
    readPath = data + '/snapshot/***2/***'
    dataPathLs = np.array(glob.glob(readPath))
    # File stem as an integer id; keep only ids 16, 300, 852 and 905.
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs == 16) | (dateLs == 300) | (dateLs == 852) | (dateLs == 905)]
    SH = []
    ll = []
    for i in dataPathLs:
        # NOTE(review): the bare `except` treats every failure (not only an
        # empty file) as "empty data" and skips the file.
        try:
            df = pd.read_csv(i, usecols = [17,19,20,21,22,34,41,42])
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["StockID"] = int(os.path.basename(i).split('.')[0])
        SH += [df]
    # NOTE(review): if no file could be read, `df` may be unbound here and
    # pd.concat below would receive an empty list.
    del df
    SH = pd.concat(SH).reset_index(drop=True)
    
    # skey = file id + 1000000 (e.g. 300 -> 1000300).
    SH["skey"] = SH["StockID"] + 1000000
    SH.drop(["StockID"],axis=1,inplace=True)
    # Presumably SendingTime is an integer of the form YYYYMMDDHHMMSSmmm
    # (it is parsed with '%Y%m%d%H%M%S%f' below) -- TODO confirm. The date is
    # taken from the first row only and applied to every row.
    SH["date"] = int(SH["SendingTime"].iloc[0]//1000000000)
    SH["time"] = (SH['SendingTime'] - int(SH['SendingTime'].iloc[0]//1000000000*1000000000)).astype(np.int64) * 1000
    # Local-time epoch timestamp in microseconds, and the matching datetime.
    SH["clockAtArrival"] = SH["SendingTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH.drop(["SendingTime"],axis=1,inplace=True)
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))

    # Positional rename: the first seven names go to the remaining CSV columns
    # (in file order), the last five to the columns derived above.
    SH.columns = ['cum_volume', 'open','high', 'prev_close', 'low', 'close', 'cum_amount', 'skey', 
                  'date', 'time', 'clockAtArrival', 'datetime']
    SH = SH.fillna(0)
    SH = SH.drop_duplicates(['cum_volume', 'open','high', 'prev_close', 'low', 'close', 'cum_amount', 'skey', 
                  'date', 'time', 'clockAtArrival', 'datetime'])
    # Every `time` must be a multiple of 1000000.
    assert(sum(SH['time']%1000000) == 0)
    # NOTE(review): this passes as soon as the comparison is True for at least
    # one skey (sum of booleans is non-zero); `.all()` may have been intended.
    assert(sum(SH[SH['cum_volume'] == 0].groupby('skey')['time'].max() 
               < SH[SH['cum_volume'] > 0].groupby('skey')['time'].min()))
    # m_ax: earliest, across skeys, of each skey's last row at or before
    # 150500000000. From there on, every row must have a duplicate on the
    # listed columns and `close` must be constant per skey.
    m_ax = SH[SH['time'] <= 150500000000].groupby('skey').last()['time'].min()
    assert((SH[SH['time'] >= m_ax].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close', 
                                               'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0) & \
               (sum(SH[SH['time'] >= m_ax].groupby('skey')['close'].nunique() != 1) == 0))
    SH = SH[(SH['cum_volume'] > 0) & (SH['time'] <= 150500000000)]
    
    # Per-skey first/last datetime and the span between them in seconds.
    k1 = SH.groupby('skey')['datetime'].min().reset_index()
    k1 = k1.rename(columns={'datetime':'min'})
    k2 = SH.groupby('skey')['datetime'].max().reset_index()
    k2 = k2.rename(columns={'datetime':'max'})
    k = pd.merge(k1, k2, on='skey')
    k['diff'] = (k['max']-k['min']).apply(lambda x: x.seconds)
    # Build a one-row-per-second grid (`datetime1`) from min to max per skey.
    df = pd.DataFrame()
    for i in np.arange(k.shape[0]):
        df1 = pd.DataFrame()
        df1['datetime1'] = [k.loc[i, 'min'] + datetime.timedelta(seconds=int(x)) for x in np.arange(0, k.loc[i, 'diff'] + 1)]
        df1['skey'] = k.loc[i, 'skey']
        assert(df1['datetime1'].min() == k.loc[i, 'min'])
        assert(df1['datetime1'].max() == k.loc[i, 'max'])
        df = pd.concat([df, df1])
    
    # Outer-join the data onto the grid; every data row must land on a grid
    # second. Grid seconds without data are forward-filled within each skey.
    SH = pd.merge(SH, df, left_on=['skey', 'datetime'], right_on=['skey', 'datetime1'], how='outer').sort_values(by=['skey', 'datetime1']).reset_index(drop=True)
    assert(SH[SH['datetime1'].isnull()].shape[0] == 0)
    for cols in ['date', 'cum_volume', 'cum_amount', 'prev_close', 'open', 'high', 'low', 'close']:
        SH[cols] = SH.groupby('skey')[cols].ffill()
    SH.drop(["datetime"],axis=1,inplace=True)
    SH = SH.rename(columns={'datetime1':'datetime'})
    SH['date'] = SH['date'].iloc[0]
    SH['date'] = SH['date'].astype('int32')
    SH['skey'] = SH['skey'].astype('int32')
    # Recompute `time` (HHMMSS as an integer) and `clockAtArrival` from the
    # grid datetime, then scale `time` by 1000000.
    SH["time"] = SH['datetime'].astype(str).apply(lambda x: int(x.split(' ')[1].replace(':', ""))).astype(np.int64)
    SH['SendingTime'] = SH['date'] * 1000000 + SH['time']
    SH["clockAtArrival"] = SH["SendingTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
    SH.drop(["SendingTime"],axis=1,inplace=True)
    SH['time'] = SH['time'] * 1000000
    
    # Each skey must have exactly one non-zero `open` and `prev_close`, both
    # before and after they are replaced by the per-skey maximum.
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"]) 
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)
    
    # Round the price columns to 4 decimals.
    for cols in ['open', 'high', 'prev_close', 'low', 'close']:
        SH[cols] = SH[cols].apply(lambda x: round(x, 4)).astype('float64')

    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "cum_volume", "cum_amount", 
             "prev_close", "open", "high", "low", "close"]]    
    # Between m_in and m_ax (around the 113500000000 - 125500000000 window)
    # every row must be duplicated and cum_volume / close constant per skey;
    # rows strictly inside that window are then dropped.
    m_in = SH[SH['time'] <= 113500000000].groupby('skey').last()['time'].min()
    m_ax = SH[SH['time'] >= 125500000000].groupby('skey').first()['time'].max()
    assert((SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close', 
                                               'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0)
          & (sum(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].groupby('skey')['cum_volume'].nunique() != 1) == 0) & 
           (sum(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].groupby('skey')['close'].nunique() != 1) == 0))
    SH = pd.concat([SH[SH['time'] <= 113500000000], SH[SH['time'] >= 125500000000]])
    
    SH = SH.sort_values(by=['skey', 'time'])
    # 1-based row counter per skey, in time order.
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    SH['ordering'] = SH['ordering'].astype('int32')
    SH['cum_volume'] = SH['cum_volume'].astype('int64')
    
    # Final column set that is written to the database.
    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "cum_volume", "cum_amount", 
             "open", "close"]]
            
    display(SH["date"].iloc[0])
    print("index finished")
    
    # NOTE(review): credentials are hard-coded in the notebook; consider
    # loading them from the environment or a config file instead.
    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    # A new connection is opened for every day; DB.write replaces any
    # existing (date, symbol) documents in 'md_index'.
    db1 = DB("192.168.10.178", database_name, user, password)
    db1.write('md_index', SH)
    
    del SH

print(less)



[]


#### How to process the newly recorded data

In [38]:
import pickle
from matplotlib import pyplot as plt
from matplotlib.ticker import Formatter
import collections
import glob
import os
import datetime


# Trading day to process, as a YYYYMMDD string.
y = '20200818'
print('----------------------------------------------------------------')
print(y)

# Locate that day's SH index pcap dump. glob returns matches in arbitrary
# order, so sort them to make the choice of "first file" reproducible, and
# fail with a readable message instead of a bare IndexError when the day's
# directory is missing or empty.
readPath = '/mnt/dailyRawData/' + y + '/logs_' + y + '_zt_88_03_day_pcap/mdIndexPcap_SH_***'
dataPathLs = np.array(sorted(glob.glob(readPath)))
if len(dataPathLs) == 0:
    raise FileNotFoundError('no index pcap file matches ' + readPath)
startTm = datetime.datetime.now()
SH = pd.read_csv(dataPathLs[0])
print(datetime.datetime.now() - startTm)

startTm = datetime.datetime.now()
SH = SH.rename(columns={"ID":"skey"})
# Only these four index ids are kept.
in_dex = [1000016, 1000300, 1000852, 1000905]
# .copy() so the column assignments below write to an independent frame
# rather than to a filtered slice of the raw data (SettingWithCopyWarning).
SH = SH[SH['skey'].isin(in_dex)].copy()

# The raw values are scaled by 10000; bring them back and keep 4 decimals.
for cols in ["cum_amount", "close", "open"]:
    SH[cols] = (SH[cols]/10000).round(4)



----------------------------------------------------------------
20200818
0:00:01.205647


In [57]:
# Load one SH market-data log CSV (file dated 20200528) into `kk` for ad-hoc inspection.
kk = pd.read_csv('/mnt/e/result/logs_20200528_zs_92_01_day_data/mdLog_SH_20200528_0844.csv')


In [66]:
# Show every row of `kk` whose `time` value is shared with at least one other
# row (keep=False flags all members of a duplicate group, not just the repeats).
kk[kk.duplicated("time", keep=False)]

Unnamed: 0,ms,clock,threadId,clockAtArrival,sequenceNo,source,StockID,exchange,time,cum_volume,cum_amount,close,__origTickSeq,bid1p,bid2p,bid3p,bid4p,bid5p,bid1q,bid2q,bid3q,bid4q,bid5q,ask1p,ask2p,ask3p,ask4p,ask5p,ask1q,ask2q,ask3q,ask4q,ask5q,openPrice,numTrades
8688301,13:41:36.400935,1590644496402041,46198,1590644496402004,80965551,5,300,SH,13:41:16.000,69444537,103559100000.0,3838.2067,576023834002927177,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,3848.4892,0
8688340,13:41:36.476935,1590644496478270,46198,1590644496478244,80966661,5,300,SH,13:41:16.000,69470629,103589800000.0,3837.9549,576023834002972848,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,3848.4892,0
9024634,13:49:36.409957,1590644976410508,46198,1590644976410484,83299609,5,300,SH,13:49:16.000,70901297,105858600000.0,3832.792,579459807843723173,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,3848.4892,0
9024670,13:49:36.481957,1590644976482791,46198,1590644976482766,83300391,5,300,SH,13:49:16.000,70916453,105892700000.0,3832.7297,579459807843763559,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,3848.4892,0
9370097,13:57:36.415978,1590645456416764,46198,1590645456416727,85858818,5,300,SH,13:57:16.000,72681005,108595300000.0,3829.9145,582895781684763502,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,3848.4892,0
9370103,13:57:36.485978,1590645456486531,46198,1590645456486493,85859365,5,300,SH,13:57:16.000,72695657,108621300000.0,3829.5535,582895781684802736,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,3848.4892,0
9590939,14:02:36.409992,1590645756410745,46198,1590645756410710,87549870,5,300,SH,14:02:16.000,73739058,110200000000.0,3832.9734,602223134519542702,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,3848.4892,0
9590959,14:02:36.484992,1590645756485373,46198,1590645756485345,87550685,5,300,SH,14:02:16.000,73751791,110218500000.0,3833.11,602223134519585453,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,3848.4892,0
9816553,14:07:36.421006,1590646056421654,46198,1590646056421620,89285805,5,300,SH,14:07:16.000,74918935,112056400000.0,3839.4489,604370618170401478,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,3848.4892,0
9816575,14:07:36.510006,1590646056511213,46198,1590646056510937,89286932,5,300,SH,14:07:16.000,74933544,112086000000.0,3839.9419,604370618170445594,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,3848.4892,0


In [41]:
# First 200 rows for skey 1000300 with time > 93000000 whose (skey, time) pair
# occurs more than once in SH.
# NOTE(review): `time` looks like HHMMSSmmm here, which would make the cut-off
# 09:30:00.000 — confirm.
SH[SH.duplicated(['skey', 'time'], keep=False) & (SH['time'] > 93000000) & (SH['skey'] == 1000300)].head(200)

Unnamed: 0,clockAtArrival,sequenceNo,skey,time,cum_volume,cum_amount,close,open,prevClose
35973,1597714242806822,1871238,1000300,93013620,4482769,6408187000.0,4812.8793,4816.5703,48152261
36017,1597714245749746,1908321,1000300,93013620,4482769,6408187000.0,4812.8793,4816.5703,48152261
38559,1597714302891315,2528795,1000300,93113580,7930712,11720380000.0,4808.8962,4816.5703,48152261
38603,1597714305663322,2552590,1000300,93113580,7930712,11720380000.0,4808.8962,4816.5703,48152261
41146,1597714362872918,3004049,1000300,93213580,10704747,16360240000.0,4816.3131,4816.5703,48152261
41190,1597714365664399,3026554,1000300,93213580,10704747,16360240000.0,4816.3131,4816.5703,48152261
43733,1597714422843100,3456141,1000300,93313600,13265955,20750800000.0,4819.0937,4816.5703,48152261
43777,1597714425663179,3478277,1000300,93313600,13265955,20750800000.0,4819.0937,4816.5703,48152261
46318,1597714482834563,3912833,1000300,93413590,15922231,25376600000.0,4811.8439,4816.5703,48152261
46362,1597714485664567,3934929,1000300,93413590,15922231,25376600000.0,4811.8439,4816.5703,48152261


In [31]:
# Last 100 rows for skey 1000016 with time <= 150500000
# (presumably 15:05:00.000 in HHMMSSmmm — confirm).
SH[(SH['skey'] == 1000016) & (SH['time'] <= 150500000)].tail(100)

Unnamed: 0,clockAtArrival,sequenceNo,skey,time,cum_volume,cum_amount,close,open,prevClose
693481,1597733697974569,55624040,1000016,145428610,46208836,90606650000.0,3370.3933,3369.8674,33688488
693692,1597733702940159,55645454,1000016,145433750,46235971,90654030000.0,3370.1787,3369.8674,33688488
693912,1597733707911421,55666764,1000016,145437640,46254413,90711000000.0,3370.5913,3369.8674,33688488
694122,1597733712972676,55688144,1000016,145443610,46279073,90773300000.0,3370.3065,3369.8674,33688488
694338,1597733717941278,55708353,1000016,145448780,46297927,90818480000.0,3370.4379,3369.8674,33688488
694557,1597733722915717,55728560,1000016,145452600,46312160,90855440000.0,3370.408,3369.8674,33688488
694768,1597733727885802,55751621,1000016,145458590,46328682,90894650000.0,3370.4948,3369.8674,33688488
694984,1597733732974655,55779656,1000016,145503780,46349201,90942970000.0,3370.4328,3369.8674,33688488
695200,1597733737946259,55802876,1000016,145507570,46366054,90986780000.0,3370.9188,3369.8674,33688488
695418,1597733742916491,55827493,1000016,145513600,46397224,91063310000.0,3370.5196,3369.8674,33688488


In [28]:
# `time` is HHMMSSmmm. Ticks that carry a millisecond part are rounded UP to
# the next whole second. The carry has to be done on the clock fields, not on
# the raw integer: plain integer rounding turns 09:30:59.620 (93059620) into
# 93060000, i.e. the invalid time 09:30:60, instead of 09:31:00 (93100000).
# For every tick that does not cross a minute boundary the result is the same
# as the plain integer rounding.
frac_mask = SH['time'] % 1000 != 0
hhmmss = SH.loc[frac_mask, 'time'] // 1000
next_sec = (hhmmss // 10000) * 3600 + (hhmmss // 100 % 100) * 60 + hhmmss % 100 + 1
SH.loc[frac_mask, 'time'] = ((next_sec // 3600) * 10000
                             + (next_sec // 60 % 60) * 100
                             + next_sec % 60) * 1000
assert(sum(SH['time']%1000 != 0) == 0)
# Rows that now carry identical values for the same (skey, time) are repeats
# of one snapshot; keep a single copy.
SH = SH.drop_duplicates(['cum_volume', 'open', 'close', 'cum_amount', 'skey', 'time'])
# Keep only rows with positive cumulative volume and time up to 150500000.
SH = SH[(SH['cum_volume'] > 0) & (SH['time'] <= 150500000)]

In [29]:
# Show all rows whose (skey, time) pair appears more than once in SH.
SH[SH.duplicated(['skey', 'time'], keep=False)]

Unnamed: 0,clockAtArrival,sequenceNo,skey,time,cum_volume,cum_amount,close,open,prevClose
708125,1597734037949326,56790400,1000016,150003000,47379851,93359130000.0,3367.3872,3369.8674,33688488
708253,1597734037998629,56790661,1000300,150003000,188973464,314215800000.0,4812.7719,4816.5703,48152261
708264,1597734038008712,56790688,1000852,150003000,215438011,259860100000.0,7410.1998,7348.4193,73396933
708268,1597734038008712,56790692,1000905,150003000,189027781,208443300000.0,6788.9562,6747.6059,67445468
708405,1597734052956881,56929116,1000016,150003000,47444157,93429430000.0,3367.3313,3369.8674,33688488
708529,1597734053039485,56929369,1000300,150003000,189139854,314395900000.0,4812.7564,4816.5703,48152261
708540,1597734053049379,56929398,1000852,150003000,215555413,260025300000.0,7410.2411,7348.4193,73396933
708544,1597734053049379,56929402,1000905,150003000,189166175,208576400000.0,6789.3979,6747.6059,67445468


In [None]:
# Turn the filtered index ticks in SH into one row per skey per second with the
# columns skey, date, time, clockAtArrival, datetime, ordering, cum_volume,
# cum_amount, open, close.
# Expects `y` (YYYYMMDD string) and SH['time'] as HHMMSSmmm on whole seconds.


def _present(cols):
    """Return the entries of *cols* that are columns of the current SH.

    Some record layouts carry high / low / prev_close and some do not (the
    pcap records inspected above have `prevClose` but none of those three).
    Restricting every column list to what SH really has avoids a KeyError on
    the leaner layout while leaving the richer layout handled as before.
    """
    return [c for c in cols if c in SH.columns]


SH['date'] = int(y)
# YYYYMMDD followed by HHMMSSmmm, parsed below with '%Y%m%d%H%M%S%f'.
SH['time1'] = int(y) * 1000000000 + SH['time']
# HHMMSSmmm -> HHMMSSuuuuuu
SH['time'] = SH['time'].astype('int64') * 1000
# clockAtArrival is replaced by the tick time expressed as epoch microseconds
# (strptime gives a naive datetime, so .timestamp() uses the local timezone).
SH["clockAtArrival"] = SH["time1"].astype(str).apply(
    lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp() * 1e6))
SH.drop("time1", axis=1, inplace=True)
SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x / 1e6))

SH = SH.fillna(0)
SH = SH.drop_duplicates(['cum_volume', 'open', 'close', 'cum_amount', 'skey',
                         'date', 'time', 'clockAtArrival', 'datetime'])
# Every tick must sit on a whole second.
assert(sum(SH['time']%1000000) == 0)

# Zero-volume ticks are expected to precede the first traded tick of an index.
# Comparing the two groupby results directly raises ValueError whenever an
# index has no zero-volume rows (differently labelled Series), so align them
# on the skeys present on both sides and skip the check when there are none.
# NOTE(review): as in the original, the check passes when it holds for at
# least one skey; `.all()` is probably what was meant — confirm.
zero_last = SH[SH['cum_volume'] == 0].groupby('skey')['time'].max()
trade_first = SH[SH['cum_volume'] > 0].groupby('skey')['time'].min()
zero_last, trade_first = zero_last.align(trade_first, join='inner')
if len(zero_last) > 0:
    assert((zero_last < trade_first).any())

# From the earliest "last tick up to 15:05:00" across indices onwards, no row
# may be unique in its values (keep=False removes every member of a duplicate
# group, so nothing may remain).
m_ax = SH[SH['time'] <= 150500000000].groupby('skey').last()['time'].min()
assert(SH[SH['time'] >= m_ax].drop_duplicates(
    _present(['cum_volume', 'open', 'high', 'low', 'prev_close',
              'close', 'cum_amount', 'skey', 'date']), keep=False).shape[0] == 0)
SH = SH[(SH['cum_volume'] > 0) & (SH['time'] <= 150500000000)]


# Build, per skey, one timestamp per second between its first and last tick.
k1 = SH.groupby('skey')['datetime'].min().reset_index()
k1 = k1.rename(columns={'datetime':'min'})
k2 = SH.groupby('skey')['datetime'].max().reset_index()
k2 = k2.rename(columns={'datetime':'max'})
k = pd.merge(k1, k2, on='skey')
k['diff'] = (k['max']-k['min']).apply(lambda x: x.seconds)
# Collect the per-skey grids and concatenate once instead of growing a frame
# inside the loop.
grid_frames = []
for skey, first_dt, last_dt, span in zip(k['skey'], k['min'], k['max'], k['diff']):
    df1 = pd.DataFrame()
    df1['datetime1'] = [first_dt + datetime.timedelta(seconds=s) for s in range(span + 1)]
    df1['skey'] = skey
    assert(df1['datetime1'].min() == first_dt)
    assert(df1['datetime1'].max() == last_dt)
    grid_frames.append(df1)
df = pd.concat(grid_frames) if grid_frames else pd.DataFrame()

# Put the ticks on the grid; seconds without a tick inherit the previous values.
SH = pd.merge(SH, df, left_on=['skey', 'datetime'], right_on=['skey', 'datetime1'], how='outer').sort_values(by=['skey', 'datetime1']).reset_index(drop=True)
assert(SH[SH['datetime1'].isnull()].shape[0] == 0)
for cols in _present(['date', 'cum_volume', 'cum_amount', 'prev_close', 'open', 'high', 'low', 'close']):
    SH[cols] = SH.groupby('skey')[cols].ffill()
SH.drop(["datetime"],axis=1,inplace=True)
SH = SH.rename(columns={'datetime1':'datetime'})
SH['date'] = SH['date'].iloc[0]
SH['date'] = SH['date'].astype('int32')
SH['skey'] = SH['skey'].astype('int32')
# Recompute time (HHMMSS) and clockAtArrival from the grid timestamps.
SH["time"] = SH['datetime'].astype(str).apply(lambda x: int(x.split(' ')[1].replace(':', ""))).astype(np.int64)
# `date` is int32 at this point and YYYYMMDD * 1000000 does not fit in 32
# bits, so widen before multiplying.
SH['SendingTime'] = SH['date'].astype('int64') * 1000000 + SH['time']
SH["clockAtArrival"] = SH["SendingTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
SH.drop(["SendingTime"],axis=1,inplace=True)
SH['time'] = SH['time'] * 1000000

# open (and prev_close, when the layout has it) must be a single value per skey.
has_prev_close = 'prev_close' in SH.columns
assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
if has_prev_close:
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"])
SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
if has_prev_close:
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)

for cols in ['open', 'close', 'cum_amount']:
    SH[cols] = SH[cols].apply(lambda x: round(x, 4)).astype('float64')
# Between 11:35:00 and 12:55:00 the values must not change (every row in that
# window has to be a duplicate of another); the window is then cut out.
m_in = SH[SH['time'] <= 113500000000].groupby('skey').last()['time'].min()
m_ax = SH[SH['time'] >= 125500000000].groupby('skey').first()['time'].max()
assert(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].drop_duplicates(
    _present(['cum_volume', 'open', 'high', 'low', 'prev_close',
              'close', 'cum_amount', 'skey', 'date']), keep=False).shape[0] == 0)
SH = pd.concat([SH[SH['time'] <= 113500000000], SH[SH['time'] >= 125500000000]])

# 1-based position of each row within its skey, in time order.
SH = SH.sort_values(by=['skey', 'time'])
SH["ordering"] = SH.groupby("skey").cumcount()
SH["ordering"] = SH["ordering"] + 1
SH['ordering'] = SH['ordering'].astype('int32')
SH['cum_volume'] = SH['cum_volume'].astype('int64')
SH['close'] = np.where(SH['cum_volume'] > 0, SH['close'], 0)

SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "cum_volume", "cum_amount",
         "open", "close"]]

print(SH["date"].iloc[0])
print("index finished")