In [1]:
import os
import pandas as pd
os.environ['OMP_NUM_THREADS'] = '1'
import glob
import pymongo
import numpy as np
import pandas as pd
import pickle
import time
import gzip
import lzma
import pytz
import warnings
import glob
import datetime
from collections import defaultdict, OrderedDict
import pyarrow as pa
import pyarrow.parquet as pq
import io


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db', version=3):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'
        self.version = version

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = self.version
            ser_data = self.ser(df_seg, version)
            seg = {'ver': version, 'data': ser_data, 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None
    
    def read_raw(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        return collection.find(query)

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        elif version == 3:
            # 32-bit number needs more space than 64-bit for parquet
            for col_name in s.columns:
                col = s[col_name]
                if col.dtype == np.int32:
                    s[col_name] = s[col_name].astype(np.int64)
                elif col.dtype == np.uint32:
                    s[col_name] = s[col_name].astype(np.uint64)
            tbl = pa.Table.from_pandas(s)
            f = io.BytesIO()
            pq.write_table(tbl, f, use_dictionary=False, compression='ZSTD', compression_level=0)
            f.seek(0)
            data = f.read()
            return data
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)
        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        elif version == 3:
            f = io.BytesIO()
            f.write(s)
            f.seek(0)
            return pq.read_table(f, use_threads=False).to_pandas()
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()


            
            
            
            
            
            
            
            
import pandas as pd
import random
import numpy as np
import glob
import os
import pickle
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = '/mnt/ShareWithServer/day_stock_20200820/***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SZ' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

year = "2020"
startDate = '20200803'
endDate = '20200803'
readPath = '/mnt/Kevin_zhenyu/KR_daily_data/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

for data in dataPathLs:
    readPath = data + '/SZ/tick/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs < 4000) | ((dateLs > 300000) & (dateLs < 310000))]
    TradeLog = []
    ll = []
    
    for i in dataPathLs:
        try:
            df = pd.read_csv(i)
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["SecurityID"] = int(os.path.basename(i).split('.')[0])
        TradeLog += [df]
    TradeLog = pd.concat(TradeLog).reset_index(drop=True)
    TradeLog = TradeLog[TradeLog["ChannelNo"] != 4001]
    
    TradeLog["date"] = TradeLog["TransactTime"].iloc[0]//1000000000
    TradeLog = TradeLog.rename(columns={"Qty":"trade_qty", "Price":"trade_price", "ExecType":"trade_type"})
    TradeLog["trade_money"] = TradeLog["trade_price"] * TradeLog["trade_qty"]
    TradeLog["trade_flag"] = 0
    TradeLog["skey"] = TradeLog["SecurityID"] + 2000000
    TradeLog["clockAtArrival"] = TradeLog["TransactTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    TradeLog['datetime'] = TradeLog["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    TradeLog["time"] = (TradeLog['TransactTime'] - int(TradeLog['TransactTime'].iloc[0]//1000000000*1000000000)).astype(np.int64)*1000
    TradeLog["trade_type"] = np.where(TradeLog["trade_type"] == 'F', 1, TradeLog["trade_type"])
    for col in ["skey", "date", "ApplSeqNum", "BidApplSeqNum", "OfferApplSeqNum", "trade_qty", "trade_type", "trade_flag"]:
        TradeLog[col] = TradeLog[col].astype('int32')
#     for cols in ["trade_money", "trade_price"]:
#         display(cols)
#         display(TradeLog[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
#         TradeLog[cols] = TradeLog[cols].round(2)
#         display(TradeLog[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
    for cols in ["trade_money"]:
        TradeLog[cols] = TradeLog[cols].round(2)
    
    da_te = str(TradeLog["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    sl = (db1["ID"].str[2:].astype(int) + 2000000).unique()
    db1["max_volume"] = db1.groupby("ID")["d_volume"].transform("max")
    db1["max_amount"] = db1.groupby("ID")["d_amount"].transform("max")
    t1 = db1.groupby("ID")["max_volume", "max_amount"].first().reset_index()
    del db1
    t1["skey"] = t1["ID"].str[2:].astype(int) + 2000000
    trade1 = TradeLog[TradeLog["trade_type"] == 1].groupby("skey")["trade_qty"].sum().reset_index()
    trade1.columns=["skey", "cum_volume"]
    trade2 = TradeLog[TradeLog["trade_type"] == 1].groupby("skey")["trade_money"].sum().reset_index()
    trade2.columns=["skey", "cum_amount"]
    t2 = pd.merge(trade1, trade2, on="skey")
    re = pd.merge(t1, t2, on="skey", how="outer")
    try:
        assert(t1.shape[0] == t2.shape[0])
        assert(re[re["cum_volume"] != re["max_volume"]].shape[0] == 0)
        assert(re[re["cum_amount"].round(2) != re["max_amount"]].shape[0] == 0)
    except:
        display(set(t1["skey"]) - set(t2["skey"]))
        display(re[re["cum_volume"] != re["max_volume"]])
        display(re[re["cum_amount"].round(2) != re["max_amount"]])
    del t1
    del t2
    del re
 
    TradeLog = TradeLog[["skey", "date", "time", "clockAtArrival", "ApplSeqNum", "trade_type", "trade_flag",
                                                 "trade_price", "trade_qty", "BidApplSeqNum", "OfferApplSeqNum"]]
    print(da_te)
    print("trade finished")


    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.178", database_name, user, password)
    db1.write('md_trade', TradeLog)    
    
    del TradeLog

    print(datetime.datetime.now() - startTm)
    




0:08:39.336363


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


2020-08-03
trade finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0:25:18.247530


In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()


            
            
            
            
            
            
            
            
import pandas as pd
import random
import numpy as np
import glob
import os
import pickle
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = '/home/work516/day_stock/***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SZ' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

year = "2020"
startDate = '20200109'
endDate = '20200529'
readPath = '/mnt/usb/data/' + year + '/***/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

for data in dataPathLs:
    
    if len(np.array(glob.glob(data + '/SZ/***'))) == 0:
        if int(os.path.basename(data)) not in date_list["Date"].values:
            continue
        else:
            print(os.path.basename(data) + " less data!!!!!!!!!!!!!!!!!")
            less.append(data)
            continue
    startTm = datetime.datetime.now()
    date = os.path.basename(data)
    rar_path = data + '/SZ/tick.7z'
    path = '/mnt/e/unzip_data/2020/SZ'
    path1 = path + '/' + date
    un_path = path1
    cmd = '7za x {} -o{}'.format(rar_path, un_path)
    os.system(cmd)
    print(datetime.datetime.now() - startTm)
    print(date + ' unzip finished')
    
    
    startTm = datetime.datetime.now()
    
    readPath = path1 + '/tick/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs < 4000) | ((dateLs > 300000) & (dateLs < 310000))]
    TradeLog = []
    ll = []
    
    for i in dataPathLs:
        try:
            df = pd.read_csv(i)
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["SecurityID"] = int(os.path.basename(i).split('.')[0])
        TradeLog += [df]
    TradeLog = pd.concat(TradeLog).reset_index(drop=True)
    TradeLog = TradeLog[TradeLog["ChannelNo"] != 4001]
    
    TradeLog["date"] = TradeLog["TransactTime"].iloc[0]//1000000000
    TradeLog = TradeLog.rename(columns={"Qty":"trade_qty", "Price":"trade_price", "ExecType":"trade_type"})
    TradeLog["trade_money"] = TradeLog["trade_price"] * TradeLog["trade_qty"]
    TradeLog["trade_flag"] = 0
    TradeLog["skey"] = TradeLog["SecurityID"] + 2000000
    TradeLog["clockAtArrival"] = TradeLog["TransactTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    TradeLog['datetime'] = TradeLog["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    TradeLog["time"] = (TradeLog['TransactTime'] - int(TradeLog['TransactTime'].iloc[0]//1000000000*1000000000)).astype(np.int64)*1000
    TradeLog["trade_type"] = np.where(TradeLog["trade_type"] == 'F', 1, TradeLog["trade_type"])
    for col in ["skey", "date", "ApplSeqNum", "BidApplSeqNum", "OfferApplSeqNum", "trade_qty", "trade_type", "trade_flag"]:
        TradeLog[col] = TradeLog[col].astype('int32')
#     for cols in ["trade_money", "trade_price"]:
#         display(cols)
#         display(TradeLog[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
#         TradeLog[cols] = TradeLog[cols].round(2)
#         display(TradeLog[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
    for cols in ["trade_money"]:
        TradeLog[cols] = TradeLog[cols].round(2)
    
    da_te = str(TradeLog["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    sl = (db1["ID"].str[2:].astype(int) + 2000000).unique()
    db1["max_volume"] = db1.groupby("ID")["d_volume"].transform("max")
    db1["max_amount"] = db1.groupby("ID")["d_amount"].transform("max")
    t1 = db1.groupby("ID")["max_volume", "max_amount"].first().reset_index()
    del db1
    t1["skey"] = t1["ID"].str[2:].astype(int) + 2000000
    trade1 = TradeLog[TradeLog["trade_type"] == 1].groupby("skey")["trade_qty"].sum().reset_index()
    trade1.columns=["skey", "cum_volume"]
    trade2 = TradeLog[TradeLog["trade_type"] == 1].groupby("skey")["trade_money"].sum().reset_index()
    trade2.columns=["skey", "cum_amount"]
    t2 = pd.merge(trade1, trade2, on="skey")
    re = pd.merge(t1, t2, on="skey", how="outer")
    try:
        assert(t1.shape[0] == t2.shape[0])
        assert(re[re["cum_volume"] != re["max_volume"]].shape[0] == 0)
        assert(re[re["cum_amount"].round(2) != re["max_amount"]].shape[0] == 0)
    except:
        display(set(t1["skey"]) - set(t2["skey"]))
        display(re[re["cum_volume"] != re["max_volume"]])
        display(re[re["cum_amount"].round(2) != re["max_amount"]])
    del t1
    del t2
    del re
 
    TradeLog = TradeLog[["skey", "date", "time", "clockAtArrival", "datetime", "ApplSeqNum", "trade_type", "trade_flag",
                                                 "trade_price", "trade_qty", "BidApplSeqNum", "OfferApplSeqNum"]]
    print(da_te)
    print("trade finished")


    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.223", database_name, user, password)
    db1.write('md_trade', TradeLog)    
    
    del TradeLog

    print(datetime.datetime.now() - startTm)
    




0:05:44.481435
0:00:00.372039
20200109 unzip finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


2020-01-09
trade finished
0:12:04.344354
0:00:00.485499
20200110 unzip finished
2020-01-10
trade finished
0:10:47.494568
0:00:34.307719
20200113 unzip finished
2020-01-13
trade finished
0:10:37.627474
0:00:36.368465
20200114 unzip finished
2020-01-14
trade finished
0:10:58.233442
0:00:33.738996
20200115 unzip finished
2020-01-15
trade finished
0:11:01.353976
0:00:31.068934
20200116 unzip finished
2020-01-16
trade finished
0:09:55.706836
0:00:30.302293
20200117 unzip finished
2020-01-17
trade finished
0:09:33.058346
0:00:31.196891
20200120 unzip finished
2020-01-20
trade finished
0:09:57.885947
0:00:34.025310
20200121 unzip finished
2020-01-21
trade finished
0:10:25.002092
0:00:33.197496
20200122 unzip finished
2020-01-22
trade finished
0:10:45.403860
0:00:38.120436
20200123 unzip finished


set()

Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
19,SZ000026,14988714.0,157168400.0,2000026,14986014,157140300.0
604,SZ002151,30999166.0,840626500.0,2002151,30999066,840623900.0
696,SZ002243,8370992.0,122400700.0,2002243,8370592,122394900.0
830,SZ002379,43961587.0,170863600.0,2002379,43960587,170859800.0
874,SZ002423,4666510.0,42165260.0,2002423,4666410,42164360.0
906,SZ002456,218772484.0,4009594000.0,2002456,218772384,4009593000.0
910,SZ002460,59039851.0,2553695000.0,2002460,59038751,2553649000.0
1236,SZ002798,3673927.0,83839230.0,2002798,3673627,83832490.0
1364,SZ002937,5058576.0,78523480.0,2002937,5054576,78461810.0
1415,SZ300017,86463136.0,724810100.0,2300017,86462536,724805200.0


Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
19,SZ000026,14988714.0,157168400.0,2000026,14986014,157140300.0
604,SZ002151,30999166.0,840626500.0,2002151,30999066,840623900.0
696,SZ002243,8370992.0,122400700.0,2002243,8370592,122394900.0
830,SZ002379,43961587.0,170863600.0,2002379,43960587,170859800.0
874,SZ002423,4666510.0,42165260.0,2002423,4666410,42164360.0
906,SZ002456,218772484.0,4009594000.0,2002456,218772384,4009593000.0
910,SZ002460,59039851.0,2553695000.0,2002460,59038751,2553649000.0
1236,SZ002798,3673927.0,83839230.0,2002798,3673627,83832490.0
1364,SZ002937,5058576.0,78523480.0,2002937,5054576,78461810.0
1415,SZ300017,86463136.0,724810100.0,2300017,86462536,724805200.0


2020-01-23
trade finished
0:12:17.712658
0:00:17.862023
20200203 unzip finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


2020-02-03
trade finished
0:05:18.340310
0:00:43.304407
20200204 unzip finished
2020-02-04
trade finished
0:13:51.951551
0:00:43.585129
20200205 unzip finished
2020-02-05
trade finished
0:14:06.325212
0:00:52.138396
20200206 unzip finished
2020-02-06
trade finished
0:14:33.066820
0:00:45.052206
20200207 unzip finished
2020-02-07
trade finished
0:14:53.496839
0:00:42.644696
20200210 unzip finished
2020-02-10
trade finished
0:14:08.960072
0:00:41.812703
20200211 unzip finished
2020-02-11
trade finished
0:13:17.920456
0:00:39.630340
20200212 unzip finished
2020-02-12
trade finished
0:13:21.744411
0:00:41.793861
20200213 unzip finished
2020-02-13
trade finished
0:13:56.573032
0:00:40.231200
20200214 unzip finished
2020-02-14
trade finished
0:13:19.989219
0:00:42.834794
20200217 unzip finished
2020-02-17
trade finished
0:13:52.989676
0:00:47.195667
20200218 unzip finished
2020-02-18
trade finished
0:15:37.447598
0:00:54.307577
20200219 unzip finished
2020-02-19
trade finished
0:16:11.042454

{2000002,
 2000004,
 2000005,
 2000006,
 2000007,
 2000008,
 2000009,
 2000010,
 2000011,
 2000012,
 2000014,
 2000016,
 2000017,
 2000019,
 2000020,
 2000021,
 2000023,
 2000025,
 2000026,
 2000027,
 2000028,
 2000030,
 2000031,
 2000032,
 2000034,
 2000035,
 2000036,
 2000037,
 2000038,
 2000039,
 2000040,
 2000042,
 2000045,
 2000046,
 2000048,
 2000049,
 2000050,
 2000055,
 2000056,
 2000058,
 2000059,
 2000060,
 2000061,
 2000062,
 2000063,
 2000065,
 2000066,
 2000068,
 2000069,
 2000070,
 2000078,
 2000088,
 2000089,
 2000090,
 2000096,
 2000099,
 2000100,
 2000150,
 2000151,
 2000153,
 2000155,
 2000156,
 2000157,
 2000158,
 2000159,
 2000166,
 2000301,
 2000333,
 2000338,
 2000400,
 2000401,
 2000402,
 2000403,
 2000404,
 2000407,
 2000408,
 2000409,
 2000410,
 2000411,
 2000413,
 2000415,
 2000416,
 2000417,
 2000419,
 2000420,
 2000421,
 2000422,
 2000423,
 2000425,
 2000426,
 2000428,
 2000429,
 2000430,
 2000488,
 2000498,
 2000501,
 2000503,
 2000504,
 2000505,
 2000506,


Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
1,SZ000002,73253061.0,2.122143e+09,2000002,,
2,SZ000004,4948058.0,1.863005e+08,2000004,,
3,SZ000005,13668719.0,3.943993e+07,2000005,,
4,SZ000006,16732683.0,7.799912e+07,2000006,,
5,SZ000007,6923153.0,5.704177e+07,2000007,,
...,...,...,...,...,...,...
2191,SZ300818,1447706.0,5.069249e+07,2300818,,
2192,SZ300819,28535.0,6.811304e+05,2300819,,
2193,SZ300820,1939456.0,2.378142e+08,2300820,,
2194,SZ300821,533212.0,5.828007e+06,2300821,,


Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
1,SZ000002,73253061.0,2.122143e+09,2000002,,
2,SZ000004,4948058.0,1.863005e+08,2000004,,
3,SZ000005,13668719.0,3.943993e+07,2000005,,
4,SZ000006,16732683.0,7.799912e+07,2000006,,
5,SZ000007,6923153.0,5.704177e+07,2000007,,
...,...,...,...,...,...,...
2191,SZ300818,1447706.0,5.069249e+07,2300818,,
2192,SZ300819,28535.0,6.811304e+05,2300819,,
2193,SZ300820,1939456.0,2.378142e+08,2300820,,
2194,SZ300821,533212.0,5.828007e+06,2300821,,


2020-03-13
trade finished
0:00:02.594564
0:00:00.272622
20200316 unzip finished


ValueError: No objects to concatenate

In [None]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()


            
            
            
            
            
            
            
            
import pandas as pd
import random
import numpy as np
import glob
import os
import pickle
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = '/home/work516/day_stock/***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SZ' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

year = "2020"
startDate = '20200313'
endDate = '20200529'
readPath = '/mnt/usb/data/' + year + '/***/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

for data in dataPathLs:
    
    if len(np.array(glob.glob(data + '/SZ/***'))) == 0:
        if int(os.path.basename(data)) not in date_list["Date"].values:
            continue
        else:
            print(os.path.basename(data) + " less data!!!!!!!!!!!!!!!!!")
            less.append(data)
            continue
    startTm = datetime.datetime.now()
    date = os.path.basename(data)
    rar_path = data + '/SZ/tick.7z'
    path = '/mnt/e/unzip_data/2020/SZ'
    path1 = path + '/' + date
    un_path = path1
    cmd = '7za x {} -o{}'.format(rar_path, un_path)
    os.system(cmd)
    print(datetime.datetime.now() - startTm)
    print(date + ' unzip finished')
    
    
    startTm = datetime.datetime.now()
    
    readPath = path1 + '/tick/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs < 4000) | ((dateLs > 300000) & (dateLs < 310000))]
    TradeLog = []
    ll = []
    
    for i in dataPathLs:
        try:
            df = pd.read_csv(i)
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["SecurityID"] = int(os.path.basename(i).split('.')[0])
        TradeLog += [df]
    TradeLog = pd.concat(TradeLog).reset_index(drop=True)
    TradeLog = TradeLog[TradeLog["ChannelNo"] != 4001]
    
    TradeLog["date"] = TradeLog["TransactTime"].iloc[0]//1000000000
    TradeLog = TradeLog.rename(columns={"Qty":"trade_qty", "Price":"trade_price", "ExecType":"trade_type"})
    TradeLog["trade_money"] = TradeLog["trade_price"] * TradeLog["trade_qty"]
    TradeLog["trade_flag"] = 0
    TradeLog["skey"] = TradeLog["SecurityID"] + 2000000
    TradeLog["clockAtArrival"] = TradeLog["TransactTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    TradeLog['datetime'] = TradeLog["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    TradeLog["time"] = (TradeLog['TransactTime'] - int(TradeLog['TransactTime'].iloc[0]//1000000000*1000000000)).astype(np.int64)*1000
    TradeLog["trade_type"] = np.where(TradeLog["trade_type"] == 'F', 1, TradeLog["trade_type"])
    for col in ["skey", "date", "ApplSeqNum", "BidApplSeqNum", "OfferApplSeqNum", "trade_qty", "trade_type", "trade_flag"]:
        TradeLog[col] = TradeLog[col].astype('int32')
#     for cols in ["trade_money", "trade_price"]:
#         display(cols)
#         display(TradeLog[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
#         TradeLog[cols] = TradeLog[cols].round(2)
#         display(TradeLog[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
    for cols in ["trade_money"]:
        TradeLog[cols] = TradeLog[cols].round(2)
    
    da_te = str(TradeLog["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    sl = (db1["ID"].str[2:].astype(int) + 2000000).unique()
    db1["max_volume"] = db1.groupby("ID")["d_volume"].transform("max")
    db1["max_amount"] = db1.groupby("ID")["d_amount"].transform("max")
    t1 = db1.groupby("ID")["max_volume", "max_amount"].first().reset_index()
    del db1
    t1["skey"] = t1["ID"].str[2:].astype(int) + 2000000
    trade1 = TradeLog[TradeLog["trade_type"] == 1].groupby("skey")["trade_qty"].sum().reset_index()
    trade1.columns=["skey", "cum_volume"]
    trade2 = TradeLog[TradeLog["trade_type"] == 1].groupby("skey")["trade_money"].sum().reset_index()
    trade2.columns=["skey", "cum_amount"]
    t2 = pd.merge(trade1, trade2, on="skey")
    re = pd.merge(t1, t2, on="skey", how="outer")
    try:
        assert(t1.shape[0] == t2.shape[0])
        assert(re[re["cum_volume"] != re["max_volume"]].shape[0] == 0)
        assert(re[re["cum_amount"].round(2) != re["max_amount"]].shape[0] == 0)
    except:
        display(set(t1["skey"]) - set(t2["skey"]))
        display(re[re["cum_volume"] != re["max_volume"]])
        display(re[re["cum_amount"].round(2) != re["max_amount"]])
    del t1
    del t2
    del re
 
    TradeLog = TradeLog[["skey", "date", "time", "clockAtArrival", "datetime", "ApplSeqNum", "trade_type", "trade_flag",
                                                 "trade_price", "trade_qty", "BidApplSeqNum", "OfferApplSeqNum"]]
    print(da_te)
    print("trade finished")


    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.223", database_name, user, password)
    db1.write('md_trade', TradeLog)    
    
    del TradeLog

    print(datetime.datetime.now() - startTm)
    




0:05:32.778531
0:00:39.875453
20200313 unzip finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


2020-03-13
trade finished
0:14:46.161557
0:00:52.752605
20200316 unzip finished
2020-03-16
trade finished
0:16:19.287593
0:00:47.076649
20200317 unzip finished
2020-03-17
trade finished
0:13:47.876184
0:00:45.702508
20200318 unzip finished
2020-03-18
trade finished
0:15:28.674127
0:00:50.527394
20200319 unzip finished
2020-03-19
trade finished
0:15:59.824166
0:00:43.677323
20200320 unzip finished
2020-03-20
trade finished
0:12:29.637345
0:00:42.988152
20200323 unzip finished


In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()


            
            
            
            
            
            
            
            
import pandas as pd
import random
import numpy as np
import glob
import os
import pickle
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = '/home/work516/day_stock/***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SZ' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

year = "2020"
startDate = '20200323'
endDate = '20200529'
readPath = '/mnt/usb/data/' + year + '/***/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]

for data in dataPathLs:
    
    if len(np.array(glob.glob(data + '/SZ/***'))) == 0:
        if int(os.path.basename(data)) not in date_list["Date"].values:
            continue
        else:
            print(os.path.basename(data) + " less data!!!!!!!!!!!!!!!!!")
            less.append(data)
            continue
    startTm = datetime.datetime.now()
    date = os.path.basename(data)
    rar_path = data + '/SZ/tick.7z'
    path = '/mnt/e/unzip_data/2020/SZ'
    path1 = path + '/' + date
    un_path = path1
    cmd = '7za x {} -o{}'.format(rar_path, un_path)
    os.system(cmd)
    print(datetime.datetime.now() - startTm)
    print(date + ' unzip finished')
    
    
    startTm = datetime.datetime.now()
    
    readPath = path1 + '/tick/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs < 4000) | ((dateLs > 300000) & (dateLs < 310000))]
    TradeLog = []
    ll = []
    
    for i in dataPathLs:
        try:
            df = pd.read_csv(i)
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["SecurityID"] = int(os.path.basename(i).split('.')[0])
        TradeLog += [df]
    TradeLog = pd.concat(TradeLog).reset_index(drop=True)
    TradeLog = TradeLog[TradeLog["ChannelNo"] != 4001]
    
    TradeLog["date"] = TradeLog["TransactTime"].iloc[0]//1000000000
    TradeLog = TradeLog.rename(columns={"Qty":"trade_qty", "Price":"trade_price", "ExecType":"trade_type"})
    TradeLog["trade_money"] = TradeLog["trade_price"] * TradeLog["trade_qty"]
    TradeLog["trade_flag"] = 0
    TradeLog["skey"] = TradeLog["SecurityID"] + 2000000
    TradeLog["clockAtArrival"] = TradeLog["TransactTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    TradeLog['datetime'] = TradeLog["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    TradeLog["time"] = (TradeLog['TransactTime'] - int(TradeLog['TransactTime'].iloc[0]//1000000000*1000000000)).astype(np.int64)*1000
    TradeLog["trade_type"] = np.where(TradeLog["trade_type"] == 'F', 1, TradeLog["trade_type"])
    for col in ["skey", "date", "ApplSeqNum", "BidApplSeqNum", "OfferApplSeqNum", "trade_qty", "trade_type", "trade_flag"]:
        TradeLog[col] = TradeLog[col].astype('int32')
#     for cols in ["trade_money", "trade_price"]:
#         display(cols)
#         display(TradeLog[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
#         TradeLog[cols] = TradeLog[cols].round(2)
#         display(TradeLog[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
    for cols in ["trade_money"]:
        TradeLog[cols] = TradeLog[cols].round(2)
    
    da_te = str(TradeLog["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    sl = (db1["ID"].str[2:].astype(int) + 2000000).unique()
    db1["max_volume"] = db1.groupby("ID")["d_volume"].transform("max")
    db1["max_amount"] = db1.groupby("ID")["d_amount"].transform("max")
    t1 = db1.groupby("ID")["max_volume", "max_amount"].first().reset_index()
    del db1
    t1["skey"] = t1["ID"].str[2:].astype(int) + 2000000
    trade1 = TradeLog[TradeLog["trade_type"] == 1].groupby("skey")["trade_qty"].sum().reset_index()
    trade1.columns=["skey", "cum_volume"]
    trade2 = TradeLog[TradeLog["trade_type"] == 1].groupby("skey")["trade_money"].sum().reset_index()
    trade2.columns=["skey", "cum_amount"]
    t2 = pd.merge(trade1, trade2, on="skey")
    re = pd.merge(t1, t2, on="skey", how="outer")
    try:
        assert(t1.shape[0] == t2.shape[0])
        assert(re[re["cum_volume"] != re["max_volume"]].shape[0] == 0)
        assert(re[re["cum_amount"].round(2) != re["max_amount"]].shape[0] == 0)
    except:
        display(set(t1["skey"]) - set(t2["skey"]))
        display(re[re["cum_volume"] != re["max_volume"]])
        display(re[re["cum_amount"].round(2) != re["max_amount"]])
    del t1
    del t2
    del re
 
    TradeLog = TradeLog[["skey", "date", "time", "clockAtArrival", "datetime", "ApplSeqNum", "trade_type", "trade_flag",
                                                 "trade_price", "trade_qty", "BidApplSeqNum", "OfferApplSeqNum"]]
    print(da_te)
    print("trade finished")


    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.223", database_name, user, password)
    db1.write('md_trade', TradeLog)    
    
    del TradeLog

    print(datetime.datetime.now() - startTm)
    




0:05:33.504464
0:00:00.429773
20200323 unzip finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


2020-03-23
trade finished
0:13:43.439359
0:00:00.553192
20200324 unzip finished
2020-03-24
trade finished
0:13:35.726998
0:00:41.898963
20200325 unzip finished
2020-03-25
trade finished
0:11:35.263058
0:00:38.002841
20200326 unzip finished
2020-03-26
trade finished
0:10:47.952231
0:00:35.833855
20200327 unzip finished


set()

Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
1646,SZ300252,13634647.0,150500400.0,2300252,13633647,150489600.0
2194,SZ300821,43161516.0,550985200.0,2300821,43156916,550927600.0


Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
1646,SZ300252,13634647.0,150500400.0,2300252,13633647,150489600.0
2194,SZ300821,43161516.0,550985200.0,2300821,43156916,550927600.0


2020-03-27
trade finished
0:10:54.038944
0:00:39.453107
20200330 unzip finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


2020-03-30
trade finished
0:11:17.995843
0:00:35.544870
20200331 unzip finished
2020-03-31
trade finished
0:10:31.901580
0:00:35.956553
20200401 unzip finished
2020-04-01
trade finished
0:10:41.891199
0:00:34.669028
20200402 unzip finished
2020-04-02
trade finished
0:10:39.626276
0:00:35.339899
20200403 unzip finished
2020-04-03
trade finished
0:10:09.908098
0:00:38.348307
20200407 unzip finished
2020-04-07
trade finished
0:11:34.182876
0:00:37.698418
20200408 unzip finished
2020-04-08
trade finished
0:12:17.173702
0:00:36.225965
20200409 unzip finished
2020-04-09
trade finished
0:11:17.368517
0:00:40.482277
20200410 unzip finished
2020-04-10
trade finished
0:12:24.102932
0:00:31.229746
20200413 unzip finished
2020-04-13
trade finished
0:09:31.809788
0:00:36.717334
20200414 unzip finished
2020-04-14
trade finished
0:09:59.374833
0:00:40.758808
20200415 unzip finished
2020-04-15
trade finished
0:12:18.886606
0:00:40.947589
20200416 unzip finished
2020-04-16
trade finished
0:11:22.153524

empty data
/mnt/e/unzip_data/2020/SZ/20200525/tick/000910.csv
empty data
/mnt/e/unzip_data/2020/SZ/20200525/tick/000911.csv
empty data
/mnt/e/unzip_data/2020/SZ/20200525/tick/000912.csv
empty data
/mnt/e/unzip_data/2020/SZ/20200525/tick/000913.csv
empty data
/mnt/e/unzip_data/2020/SZ/20200525/tick/000915.csv
empty data
/mnt/e/unzip_data/2020/SZ/20200525/tick/000917.csv
empty data
/mnt/e/unzip_data/2020/SZ/20200525/tick/000918.csv
empty data
/mnt/e/unzip_data/2020/SZ/20200525/tick/000919.csv
empty data
/mnt/e/unzip_data/2020/SZ/20200525/tick/000920.csv
empty data
/mnt/e/unzip_data/2020/SZ/20200525/tick/000921.csv
empty data
/mnt/e/unzip_data/2020/SZ/20200525/tick/000922.csv
empty data
/mnt/e/unzip_data/2020/SZ/20200525/tick/000923.csv
empty data
/mnt/e/unzip_data/2020/SZ/20200525/tick/000925.csv
empty data
/mnt/e/unzip_data/2020/SZ/20200525/tick/000926.csv
empty data
/mnt/e/unzip_data/2020/SZ/20200525/tick/000927.csv
empty data
/mnt/e/unzip_data/2020/SZ/20200525/tick/000928.csv
empty da

{2000001,
 2000002,
 2000004,
 2000005,
 2000006,
 2000007,
 2000008,
 2000009,
 2000010,
 2000011,
 2000012,
 2000014,
 2000016,
 2000017,
 2000019,
 2000020,
 2000021,
 2000023,
 2000025,
 2000026,
 2000027,
 2000028,
 2000030,
 2000031,
 2000032,
 2000034,
 2000035,
 2000036,
 2000037,
 2000038,
 2000039,
 2000040,
 2000042,
 2000045,
 2000046,
 2000048,
 2000049,
 2000050,
 2000055,
 2000056,
 2000058,
 2000059,
 2000060,
 2000061,
 2000062,
 2000063,
 2000065,
 2000066,
 2000068,
 2000069,
 2000070,
 2000078,
 2000088,
 2000089,
 2000090,
 2000096,
 2000099,
 2000100,
 2000150,
 2000151,
 2000153,
 2000156,
 2000157,
 2000158,
 2000159,
 2000166,
 2000301,
 2000333,
 2000338,
 2000400,
 2000401,
 2000402,
 2000403,
 2000404,
 2000407,
 2000408,
 2000409,
 2000410,
 2000411,
 2000413,
 2000415,
 2000416,
 2000417,
 2000419,
 2000420,
 2000421,
 2000422,
 2000423,
 2000425,
 2000426,
 2000428,
 2000429,
 2000430,
 2000488,
 2000498,
 2000501,
 2000502,
 2000503,
 2000504,
 2000505,


Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
0,SZ000001,41017078.0,5.287694e+08,2000001,,
1,SZ000002,51276102.0,1.300390e+09,2000002,,
2,SZ000004,1482843.0,4.127411e+07,2000004,,
3,SZ000005,2738504.0,6.885460e+06,2000005,,
4,SZ000006,4367771.0,2.055259e+07,2000006,,
...,...,...,...,...,...,...
727,SZ002281,13579673.0,3.740188e+08,2002281,,
731,SZ002285,13127150.0,3.400329e+07,2002285,,
738,SZ002292,11147092.0,6.453698e+07,2002292,,
1402,SZ002990,96968.0,5.101541e+06,2002990,,


Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
0,SZ000001,41017078.0,5.287694e+08,2000001,,
1,SZ000002,51276102.0,1.300390e+09,2000002,,
2,SZ000004,1482843.0,4.127411e+07,2000004,,
3,SZ000005,2738504.0,6.885460e+06,2000005,,
4,SZ000006,4367771.0,2.055259e+07,2000006,,
...,...,...,...,...,...,...
727,SZ002281,13579673.0,3.740188e+08,2002281,,
731,SZ002285,13127150.0,3.400329e+07,2002285,,
738,SZ002292,11147092.0,6.453698e+07,2002292,,
1402,SZ002990,96968.0,5.101541e+06,2002990,,


2020-05-25
trade finished
0:06:08.717779
0:00:32.584514
20200526 unzip finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


2020-05-26
trade finished
0:09:27.912688
0:00:32.421389
20200527 unzip finished
2020-05-27
trade finished
0:10:55.450550
0:00:35.781700
20200528 unzip finished
2020-05-28
trade finished
0:10:09.473995
0:00:34.528610
20200529 unzip finished


set()

Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
1865,SZ300475,1796100.0,14268967.1,2300475,12425863,42682178.96


Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
1865,SZ300475,1796100.0,14268967.1,2300475,12425863,42682178.96


2020-05-29
trade finished
0:09:56.956703
