In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()
    
    


import pandas as pd
import random
import numpy as np
import glob
import os
import pickle
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = '/home/work516/day_stock/***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SZ' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

year = "2017"
startDate = '20170901'
endDate = '20171231'
readPath = '/mnt/usb/data/' + year + '/***/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
less = []

for data in dataPathLs:
    
    
    if len(np.array(glob.glob(data + '/SZ/***'))) == 0:
        if int(os.path.basename(data)) not in date_list["Date"].values:
            continue
        else:
            print(os.path.basename(data) + " less data!!!!!!!!!!!!!!!!!")
            less.append(data)
            continue
    startTm = datetime.datetime.now()
    date = os.path.basename(data)
    rar_path = data + '/SZ/order.7z'
    path = '/mnt/e/unzip_data/2017/SZ'
    path1 = path + '/' + date
    un_path = path1
    cmd = '7za x {} -o{}'.format(rar_path, un_path)
    os.system(cmd)
    print(datetime.datetime.now() - startTm)
    print(date + ' unzip finished')
    
    
    startTm = datetime.datetime.now()
    
    readPath = path1 + '/order/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs < 4000) | ((dateLs > 300000) & (dateLs < 310000))]
    OrderLog = []
    ll = []
    
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, encoding='GBK')
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["SecurityID"] = int(os.path.basename(i).split('.')[0])
        OrderLog += [df]
    OrderLog = pd.concat(OrderLog).reset_index(drop=True)
    OrderLog = OrderLog[OrderLog["ChannelNo"] != 4001]
    
    OrderLog = OrderLog.rename(columns={"OrdType": "OrderType"})
    OrderLog["date"] = OrderLog["TransactTime"].iloc[0]//1000000000
    OrderLog["OrderType"] = np.where(OrderLog["OrderType"] == 'U', 3, OrderLog["OrderType"])
    OrderLog["skey"] = OrderLog["SecurityID"] + 2000000
    OrderLog["clockAtArrival"] = OrderLog["TransactTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    OrderLog['datetime'] = OrderLog["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    OrderLog["time"] = (OrderLog['TransactTime'] - int(OrderLog['TransactTime'].iloc[0]//1000000000*1000000000)).astype(np.int64)*1000
    
    for col in ["skey", "date", "ApplSeqNum", "OrderQty", "Side", "OrderType"]:
        OrderLog[col] = OrderLog[col].astype('int32')
#     for cols in ["Price"]:
#         print(cols)
#         print(OrderLog[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
    
    assert(OrderLog[((OrderLog["Side"] != 1) & (OrderLog["Side"] != 2)) | (OrderLog["OrderType"].isnull())].shape[0] == 0)
    da_te = str(OrderLog["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    sl = (db1["ID"].str[2:].astype(int) + 2000000).unique()
    del db1
    try:
        assert(len(set(sl) - set(OrderLog["skey"].unique())) == 0)
    except:
        print("less stocks")
        display(set(sl) - set(OrderLog["skey"].unique()))
    if len(set(OrderLog["skey"].unique()) - set(sl)) != 0:
        print("more stocks")
        print(set(OrderLog["skey"].unique()) - set(sl))
    
    OrderLog = OrderLog.rename(columns={"Side":"order_side", "OrderType":"order_type", "Price":"order_price", "OrderQty":"order_qty"})
    OrderLog = OrderLog[["skey", "date", "time", "clockAtArrival", "datetime", "ApplSeqNum", "order_side", "order_type", "order_price",
                                                 "order_qty"]]
    
    print(OrderLog["date"].iloc[0])
    print("order finished")

    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.223", database_name, user, password)
    db1.write('md_order', OrderLog)
    
    del OrderLog
    
    print(datetime.datetime.now() - startTm)
    
#     pd.set_option("max_rows", 200)
#     display(OrderLog.dtypes)
    



0:05:53.910072
0:00:20.833907
20170901 unzip finished


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


less stocks


{2001872, 2001914}

20170901
order finished
0:07:51.565118
0:00:22.086957
20170904 unzip finished
less stocks


{2001872, 2001914}

20170904
order finished
0:08:12.524122
0:00:19.427176
20170905 unzip finished
less stocks


{2001872, 2001914}

20170905
order finished
0:07:30.170558
0:00:26.989535
20170906 unzip finished
less stocks


{2001872, 2001914}

20170906
order finished
0:07:56.591715
0:00:20.998357
20170907 unzip finished


  interactivity=interactivity, compiler=compiler, result=result)


less stocks


{2001872, 2001914}

20170907
order finished
0:08:22.849819
0:00:21.049312
20170908 unzip finished
less stocks


{2001872, 2001914}

20170908
order finished
0:07:28.216675
0:00:23.710070
20170911 unzip finished
less stocks


{2001872, 2001914}

20170911
order finished
0:07:34.187523
0:00:23.721532
20170912 unzip finished
less stocks


{2001872, 2001914}

20170912
order finished
0:09:28.063395
0:00:18.604889
20170913 unzip finished
less stocks


{2001872, 2001914}

20170913
order finished
0:07:15.304521
0:00:20.941566
20170914 unzip finished
less stocks


{2001872, 2001914}

20170914
order finished
0:08:00.919550
0:00:21.197931
20170915 unzip finished
less stocks


{2001872, 2001914}

20170915
order finished
0:07:54.420771
0:00:18.767983
20170918 unzip finished
less stocks


{2001872, 2001914}

20170918
order finished
0:07:29.646360
0:00:23.006759
20170919 unzip finished
less stocks


{2001872, 2001914}

20170919
order finished
0:07:43.727748
0:00:21.299894
20170920 unzip finished
less stocks


{2001872, 2001914}

20170920
order finished
0:08:02.956854
0:00:21.525432
20170921 unzip finished
less stocks


{2001872, 2001914}

20170921
order finished
0:08:15.625868
0:00:20.269686
20170922 unzip finished
less stocks


{2001872, 2001914}

20170922
order finished
0:07:37.612626
0:00:19.273197
20170925 unzip finished
less stocks


{2001872, 2001914}

20170925
order finished
0:06:57.024632
0:00:23.681472
20170926 unzip finished
less stocks


{2001872, 2001914}

20170926
order finished
0:06:41.239092
0:00:18.631381
20170927 unzip finished
less stocks


{2001872, 2001914}

20170927
order finished
0:06:51.292952
0:00:19.400692
20170928 unzip finished
less stocks


{2001872, 2001914}

20170928
order finished
0:07:07.017135
0:00:19.557239
20170929 unzip finished
less stocks


{2001872, 2001914}

20170929
order finished
0:06:52.378877
0:00:25.546880
20171009 unzip finished
less stocks


{2001872, 2001914}

20171009
order finished
0:07:18.623551
0:00:19.855161
20171010 unzip finished
less stocks


{2001872, 2001914}

20171010
order finished
0:07:42.000977
0:00:23.048472
20171011 unzip finished
less stocks


{2001872, 2001914}

20171011
order finished
0:08:31.564779
0:01:15.681454
20171012 unzip finished
less stocks


{2001872, 2001914}

20171012
order finished
0:07:29.961601
0:00:28.903904
20171013 unzip finished
less stocks


{2001872, 2001914}

20171013
order finished
0:07:07.461257
0:00:21.697468
20171016 unzip finished
less stocks


{2001872, 2001914}

20171016
order finished
0:07:59.961901
0:00:20.460452
20171017 unzip finished
less stocks


{2001872, 2001914}

20171017
order finished
0:06:24.354545
0:00:18.442983
20171018 unzip finished
less stocks


{2001872, 2001914}

20171018
order finished
0:06:33.765806
0:00:18.592418
20171019 unzip finished
less stocks


{2001872, 2001914}

20171019
order finished
0:06:49.486709
0:00:21.094902
20171020 unzip finished
less stocks


{2001872, 2001914}

20171020
order finished
0:05:52.281180
0:00:19.020431
20171023 unzip finished
less stocks


{2001872, 2001914}

20171023
order finished
0:06:10.175763
0:00:17.167505
20171024 unzip finished
less stocks


{2001872, 2001914}

20171024
order finished
0:06:14.177331
0:00:17.791934
20171025 unzip finished
less stocks


{2001872, 2001914}

20171025
order finished
0:06:09.242513
0:00:21.365720
20171026 unzip finished
less stocks


{2001872, 2001914}

20171026
order finished
0:07:01.620479
0:00:19.830178
20171027 unzip finished
less stocks


{2001872, 2001914}

20171027
order finished
0:06:36.909655
0:00:29.826126
20171030 unzip finished
less stocks


{2001872, 2001914}

20171030
order finished
0:07:34.322956
0:00:18.345820
20171031 unzip finished
less stocks


{2001872, 2001914}

20171031
order finished
0:06:14.734698
0:00:17.546455
20171101 unzip finished
less stocks


{2001872, 2001914}

20171101
order finished
0:06:44.310083
0:00:42.997191
20171102 unzip finished
less stocks


{2001872, 2001914}

20171102
order finished
0:06:36.998889
0:00:22.524164
20171103 unzip finished
less stocks


{2001872, 2001914}

20171103
order finished
0:06:43.889356
0:00:24.168287
20171106 unzip finished
less stocks


{2001872, 2001914}

20171106
order finished
0:06:51.058662
0:00:31.714699
20171107 unzip finished
less stocks


{2001872, 2001914}

20171107
order finished
0:07:00.244546
0:00:29.526175
20171108 unzip finished
less stocks


{2001872, 2001914}

20171108
order finished
0:08:17.874677
0:00:19.725843
20171109 unzip finished
less stocks


{2001872, 2001914}

20171109
order finished
0:07:22.810944
0:00:37.289769
20171110 unzip finished
less stocks


{2001872, 2001914}

20171110
order finished
0:07:50.039367
0:00:22.803041
20171113 unzip finished
less stocks


{2001872, 2001914}

20171113
order finished
0:08:05.752387
0:01:54.770460
20171114 unzip finished
less stocks


{2001872, 2001914}

20171114
order finished
0:08:03.827949
0:01:36.129371
20171115 unzip finished
less stocks


{2001872, 2001914}

20171115
order finished
0:07:35.959858
0:01:40.705551
20171116 unzip finished
less stocks


{2001872, 2001914}

20171116
order finished
0:07:31.786451
0:00:19.630365
20171117 unzip finished


ValueError: No objects to concatenate

In [None]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()
    
    


import pandas as pd
import random
import numpy as np
import glob
import os
import pickle
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = '/home/work516/day_stock/***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SZ' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

year = "2017"
startDate = '20171117'
endDate = '20171231'
readPath = '/mnt/usb/data/' + year + '/***/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
less = []

for data in dataPathLs:
    
    
    if len(np.array(glob.glob(data + '/SZ/***'))) == 0:
        if int(os.path.basename(data)) not in date_list["Date"].values:
            continue
        else:
            print(os.path.basename(data) + " less data!!!!!!!!!!!!!!!!!")
            less.append(data)
            continue
    startTm = datetime.datetime.now()
    date = os.path.basename(data)
    rar_path = data + '/SZ/order.7z'
    path = '/mnt/e/unzip_data/2017/SZ'
    path1 = path + '/' + date
    un_path = path1
    cmd = '7za x {} -o{}'.format(rar_path, un_path)
    os.system(cmd)
    print(datetime.datetime.now() - startTm)
    print(date + ' unzip finished')
    
    
    startTm = datetime.datetime.now()
    
    readPath = path1 + '/order/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs < 4000) | ((dateLs > 300000) & (dateLs < 310000))]
    OrderLog = []
    ll = []
    
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, encoding='GBK')
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["SecurityID"] = int(os.path.basename(i).split('.')[0])
        OrderLog += [df]
    OrderLog = pd.concat(OrderLog).reset_index(drop=True)
    OrderLog = OrderLog[OrderLog["ChannelNo"] != 4001]
    
    OrderLog = OrderLog.rename(columns={"OrdType": "OrderType"})
    OrderLog["date"] = OrderLog["TransactTime"].iloc[0]//1000000000
    OrderLog["OrderType"] = np.where(OrderLog["OrderType"] == 'U', 3, OrderLog["OrderType"])
    OrderLog["skey"] = OrderLog["SecurityID"] + 2000000
    OrderLog["clockAtArrival"] = OrderLog["TransactTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    OrderLog['datetime'] = OrderLog["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    OrderLog["time"] = (OrderLog['TransactTime'] - int(OrderLog['TransactTime'].iloc[0]//1000000000*1000000000)).astype(np.int64)*1000
    
    for col in ["skey", "date", "ApplSeqNum", "OrderQty", "Side", "OrderType"]:
        OrderLog[col] = OrderLog[col].astype('int32')
#     for cols in ["Price"]:
#         print(cols)
#         print(OrderLog[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
    
    assert(OrderLog[((OrderLog["Side"] != 1) & (OrderLog["Side"] != 2)) | (OrderLog["OrderType"].isnull())].shape[0] == 0)
    da_te = str(OrderLog["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    sl = (db1["ID"].str[2:].astype(int) + 2000000).unique()
    del db1
    try:
        assert(len(set(sl) - set(OrderLog["skey"].unique())) == 0)
    except:
        print("less stocks")
        display(set(sl) - set(OrderLog["skey"].unique()))
    if len(set(OrderLog["skey"].unique()) - set(sl)) != 0:
        print("more stocks")
        print(set(OrderLog["skey"].unique()) - set(sl))
    
    OrderLog = OrderLog.rename(columns={"Side":"order_side", "OrderType":"order_type", "Price":"order_price", "OrderQty":"order_qty"})
    OrderLog = OrderLog[["skey", "date", "time", "clockAtArrival", "datetime", "ApplSeqNum", "order_side", "order_type", "order_price",
                                                 "order_qty"]]
    
    print(OrderLog["date"].iloc[0])
    print("order finished")

    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.223", database_name, user, password)
    db1.write('md_order', OrderLog)
    
    del OrderLog
    
    print(datetime.datetime.now() - startTm)
    
#     pd.set_option("max_rows", 200)
#     display(OrderLog.dtypes)
    



0:04:44.635741
0:00:19.382055
20171117 unzip finished
less stocks


{2001872, 2001914}

20171117
order finished
0:08:17.026919
0:00:16.400086
20171120 unzip finished


  interactivity=interactivity, compiler=compiler, result=result)


less stocks


{2001914}

20171120
order finished
0:06:46.243236
0:00:19.146173
20171121 unzip finished
less stocks


{2001914}

20171121
order finished
0:07:41.460746
0:00:18.682503
20171122 unzip finished
less stocks


{2001914}

20171122
order finished
0:07:55.932529
0:00:19.052622
20171123 unzip finished
less stocks


{2001914}

20171123
order finished
0:07:40.804100
0:00:17.234268
20171124 unzip finished
less stocks


{2001914}

20171124
order finished
0:06:25.847623
0:00:19.348507
20171127 unzip finished
less stocks


{2001914}

20171127
order finished
0:06:00.215878
0:00:15.424651
20171128 unzip finished
less stocks


{2001914}

20171128
order finished
0:06:40.725272
0:00:16.346202
20171129 unzip finished
less stocks


{2001914}

20171129
order finished
0:06:29.476386
0:00:17.020726
20171130 unzip finished
less stocks


{2001914}

20171130
order finished
0:06:16.511944
0:00:15.940252
20171201 unzip finished
less stocks


{2001914}

20171201
order finished
0:06:12.626145
0:00:16.630455
20171204 unzip finished
less stocks


{2001914}

20171204
order finished
0:07:28.472688
0:00:19.339368
20171205 unzip finished
less stocks


{2001914}

20171205
order finished
0:07:05.536547
0:00:20.841208
20171206 unzip finished
less stocks


{2001914}

20171206
order finished
0:06:27.043340
0:00:20.667314
20171207 unzip finished
less stocks


{2001914}

20171207
order finished
0:06:00.631903
0:00:18.339043
20171208 unzip finished


In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()
    
    


import pandas as pd
import random
import numpy as np
import glob
import os
import pickle
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = '/home/work516/day_stock/***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SZ' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

year = "2017"
startDate = '20171207'
endDate = '20171231'
readPath = '/mnt/usb/data/' + year + '/***/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
less = []

for data in dataPathLs:
    
    
    if len(np.array(glob.glob(data + '/SZ/***'))) == 0:
        if int(os.path.basename(data)) not in date_list["Date"].values:
            continue
        else:
            print(os.path.basename(data) + " less data!!!!!!!!!!!!!!!!!")
            less.append(data)
            continue
    startTm = datetime.datetime.now()
    date = os.path.basename(data)
    rar_path = data + '/SZ/order.7z'
    path = '/mnt/e/unzip_data/2017/SZ'
    path1 = path + '/' + date
    un_path = path1
    cmd = '7za x {} -o{}'.format(rar_path, un_path)
    os.system(cmd)
    print(datetime.datetime.now() - startTm)
    print(date + ' unzip finished')
    
    
    startTm = datetime.datetime.now()
    
    readPath = path1 + '/order/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs < 4000) | ((dateLs > 300000) & (dateLs < 310000))]
    OrderLog = []
    ll = []
    
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, encoding='GBK')
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["SecurityID"] = int(os.path.basename(i).split('.')[0])
        OrderLog += [df]
    OrderLog = pd.concat(OrderLog).reset_index(drop=True)
    OrderLog = OrderLog[OrderLog["ChannelNo"] != 4001]
    
    OrderLog = OrderLog.rename(columns={"OrdType": "OrderType"})
    OrderLog["date"] = OrderLog["TransactTime"].iloc[0]//1000000000
    OrderLog["OrderType"] = np.where(OrderLog["OrderType"] == 'U', 3, OrderLog["OrderType"])
    OrderLog["skey"] = OrderLog["SecurityID"] + 2000000
    OrderLog["clockAtArrival"] = OrderLog["TransactTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    OrderLog['datetime'] = OrderLog["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    OrderLog["time"] = (OrderLog['TransactTime'] - int(OrderLog['TransactTime'].iloc[0]//1000000000*1000000000)).astype(np.int64)*1000
    
    for col in ["skey", "date", "ApplSeqNum", "OrderQty", "Side", "OrderType"]:
        OrderLog[col] = OrderLog[col].astype('int32')
#     for cols in ["Price"]:
#         print(cols)
#         print(OrderLog[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
    
    assert(OrderLog[((OrderLog["Side"] != 1) & (OrderLog["Side"] != 2)) | (OrderLog["OrderType"].isnull())].shape[0] == 0)
    da_te = str(OrderLog["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    sl = (db1["ID"].str[2:].astype(int) + 2000000).unique()
    del db1
    try:
        assert(len(set(sl) - set(OrderLog["skey"].unique())) == 0)
    except:
        print("less stocks")
        display(set(sl) - set(OrderLog["skey"].unique()))
    if len(set(OrderLog["skey"].unique()) - set(sl)) != 0:
        print("more stocks")
        print(set(OrderLog["skey"].unique()) - set(sl))
    
    OrderLog = OrderLog.rename(columns={"Side":"order_side", "OrderType":"order_type", "Price":"order_price", "OrderQty":"order_qty"})
    OrderLog = OrderLog[["skey", "date", "time", "clockAtArrival", "datetime", "ApplSeqNum", "order_side", "order_type", "order_price",
                                                 "order_qty"]]
    
    print(OrderLog["date"].iloc[0])
    print("order finished")

    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.223", database_name, user, password)
    db1.write('md_order', OrderLog)
    
    del OrderLog
    
    print(datetime.datetime.now() - startTm)
    
#     pd.set_option("max_rows", 200)
#     display(OrderLog.dtypes)
    



0:06:21.279861
0:00:00.449144
20171207 unzip finished


  interactivity=interactivity, compiler=compiler, result=result)


less stocks


{2001914}

20171207
order finished
0:06:23.511890
0:00:00.843641
20171208 unzip finished


  interactivity=interactivity, compiler=compiler, result=result)


less stocks


{2001914}

20171208
order finished
0:07:01.909869
0:00:00.432961
20171211 unzip finished


  interactivity=interactivity, compiler=compiler, result=result)


less stocks


{2001914}

20171211
order finished
0:06:37.205144
0:00:00.438255
20171212 unzip finished
less stocks


{2001914}

20171212
order finished
0:06:48.356236
0:00:00.379938
20171213 unzip finished
less stocks


{2001914}

20171213
order finished
0:05:35.669396
0:00:15.445587
20171214 unzip finished
less stocks


{2001914}

20171214
order finished
0:05:53.395458
0:00:18.267237
20171215 unzip finished
less stocks


{2001914}

20171215
order finished
0:06:12.183349
0:00:16.789061
20171218 unzip finished
less stocks


{2001914}

20171218
order finished
0:05:41.642334
0:00:14.336990
20171219 unzip finished
less stocks


{2001914}

20171219
order finished
0:05:23.808078
0:00:15.481196
20171220 unzip finished
less stocks


{2001914}

20171220
order finished
0:05:37.826269
0:00:15.700688
20171221 unzip finished
less stocks


{2001914}

20171221
order finished
0:05:53.249613
0:00:18.953655
20171222 unzip finished
less stocks


{2001914}

20171222
order finished
0:05:22.868228
0:00:15.742478
20171225 unzip finished
less stocks


{2001914}

20171225
order finished
0:05:48.593555
0:00:15.526567
20171226 unzip finished
less stocks


{2001914}

20171226
order finished
0:05:42.580268
0:00:16.428841
20171227 unzip finished
less stocks


{2001914}

20171227
order finished
0:06:11.443631
0:00:16.376430
20171228 unzip finished
less stocks


{2001914}

20171228
order finished
0:06:28.620900
0:00:15.981398
20171229 unzip finished
less stocks


{2001914}

20171229
order finished
0:05:46.854041


In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz

class DB(object):
    def __init__(self, uri, symbol_column='skey'):
        self.db_name = 'white_db'
        user, passwd, host = self.parse_uri(uri)
        auth_db = 'admin' if user in ('admin', 'root') else self.db_name
        self.uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)

        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        if version == 1:
            return gzip.compress(pickle.dumps(s), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')
    
    


import pandas as pd
import random
import numpy as np
import glob
import os
import pickle
import datetime
import time
pd.set_option("max_columns", 200)

startTm = datetime.datetime.now()
readPath = '/home/work516/day_stock/***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SZ' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

year = "2018"
startDate = '20180102'
endDate = '20180731'
readPath = '/mnt/usb/' + year + '/***/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
less = []

for data in dataPathLs:
    
    
    if len(np.array(glob.glob(data + '/SZ/***'))) == 0:
        if int(os.path.basename(data)) not in date_list["Date"].values:
            continue
        else:
            print(os.path.basename(data) + " less data!!!!!!!!!!!!!!!!!")
            less.append(data)
            continue
    startTm = datetime.datetime.now()
    date = os.path.basename(data)
    rar_path = data + '/SZ/order.7z'
    path = '/mnt/e/unzip_data/2018/SZ'
    path1 = path + '/' + date
    un_path = path1
    cmd = '7za x {} -o{}'.format(rar_path, un_path)
    os.system(cmd)
    print(datetime.datetime.now() - startTm)
    print(date + ' unzip finished')
    
    
    startTm = datetime.datetime.now()
    
    readPath = path1 + '/order/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs < 4000) | ((dateLs > 300000) & (dateLs < 310000))]
    OrderLog = []
    ll = []
    
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, encoding='GBK')
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["SecurityID"] = int(os.path.basename(i).split('.')[0])
        OrderLog += [df]
    OrderLog = pd.concat(OrderLog).reset_index(drop=True)
    OrderLog = OrderLog[OrderLog["ChannelNo"] != 4001]
    
    OrderLog = OrderLog.rename(columns={"OrdType": "OrderType"})
    OrderLog["date"] = OrderLog["TransactTime"].iloc[0]//1000000000
    OrderLog["OrderType"] = np.where(OrderLog["OrderType"] == 'U', 3, OrderLog["OrderType"])
    OrderLog["skey"] = OrderLog["SecurityID"] + 2000000
    OrderLog["clockAtArrival"] = OrderLog["TransactTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    OrderLog['datetime'] = OrderLog["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    OrderLog["time"] = (OrderLog['TransactTime'] - int(OrderLog['TransactTime'].iloc[0]//1000000000*1000000000)).astype(np.int64)*1000
    
    for col in ["skey", "date", "ApplSeqNum", "OrderQty", "Side", "OrderType"]:
        OrderLog[col] = OrderLog[col].astype('int32')
#     for cols in ["Price"]:
#         print(cols)
#         print(OrderLog[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
    
    assert(OrderLog[((OrderLog["Side"] != 1) & (OrderLog["Side"] != 2)) | (OrderLog["OrderType"].isnull())].shape[0] == 0)
    da_te = str(OrderLog["date"].iloc[0]) 
    da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
    db1 = db[db["date"] == da_te]
    sl = (db1["ID"].str[2:].astype(int) + 2000000).unique()
    del db1
    try:
        assert(len(set(sl) - set(OrderLog["skey"].unique())) == 0)
    except:
        print("less stocks")
        display(set(sl) - set(OrderLog["skey"].unique()))
    if len(set(OrderLog["skey"].unique()) - set(sl)) != 0:
        print("more stocks")
        print(set(OrderLog["skey"].unique()) - set(sl))
    
    OrderLog = OrderLog.rename(columns={"Side":"order_side", "OrderType":"order_type", "Price":"order_price", "OrderQty":"order_qty"})
    OrderLog = OrderLog[["skey", "date", "time", "clockAtArrival", "datetime", "ApplSeqNum", "order_side", "order_type", "order_price",
                                                 "order_qty"]]
    
    print(OrderLog["date"].iloc[0])
    print("order finished")

    
    db1 = DB("mongodb://user_rw:faa96dfc@192.168.10.223")
    db1.write('order', OrderLog)
    
    del OrderLog
    
    print(datetime.datetime.now() - startTm)
    
#     pd.set_option("max_rows", 200)
#     display(OrderLog.dtypes)



0:05:27.019539
0:00:15.261566
20180102 unzip finished


  interactivity=interactivity, compiler=compiler, result=result)


less stocks


{2001914}

20180102
order finished
0:05:20.148924
0:00:18.443840
20180103 unzip finished


  interactivity=interactivity, compiler=compiler, result=result)


less stocks


{2001914}

20180103
order finished
0:06:46.281167
0:00:16.408613
20180104 unzip finished
less stocks


{2001914}

20180104
order finished
0:06:13.494110
0:00:16.065804
20180105 unzip finished
less stocks


{2001914}

20180105
order finished
0:06:08.911434
0:00:16.527113
20180108 unzip finished
less stocks


{2001914}

20180108
order finished
0:06:25.142429
0:00:15.676600
20180109 unzip finished
less stocks


{2001914}

20180109
order finished
0:06:21.597419
0:00:19.048002
20180110 unzip finished
less stocks


{2001914}

20180110
order finished
0:06:38.901671
0:00:17.238399
20180111 unzip finished
less stocks


{2001914}

20180111
order finished
0:06:40.423588
0:00:22.325239
20180112 unzip finished
less stocks


{2001914}

20180112
order finished
0:06:15.221123
0:00:18.444350
20180115 unzip finished
less stocks


{2001914}

20180115
order finished
0:07:05.444892
0:00:16.510426
20180116 unzip finished
less stocks


{2001914}

20180116
order finished
0:06:26.723757
0:00:19.135810
20180117 unzip finished


  interactivity=interactivity, compiler=compiler, result=result)


less stocks


{2001914}

20180117
order finished
0:07:30.839039
0:00:15.958015
20180118 unzip finished
less stocks


{2001914}

20180118
order finished
0:05:49.210835
0:00:16.633041
20180119 unzip finished
less stocks


{2001914}

20180119
order finished
0:06:03.494841
0:00:17.245802
20180122 unzip finished
less stocks


{2001914}

20180122
order finished
0:06:14.780511
0:00:16.119549
20180123 unzip finished
less stocks


{2001914}

20180123
order finished
0:06:06.367385
0:00:19.997868
20180124 unzip finished
less stocks


{2001914}

20180124
order finished
0:07:01.167360
0:00:19.813490
20180125 unzip finished


  interactivity=interactivity, compiler=compiler, result=result)


less stocks


{2001914}

20180125
order finished
0:07:22.952835
0:00:16.548694
20180126 unzip finished
less stocks


{2001914}

20180126
order finished
0:06:10.249547
0:00:17.690573
20180129 unzip finished
less stocks


{2001914}

20180129
order finished
0:06:31.385331
0:00:16.091478
20180130 unzip finished
less stocks


{2001914}

20180130
order finished
0:05:34.177123
0:00:15.955127
20180131 unzip finished
less stocks


{2001914}

20180131
order finished
0:06:20.480684
0:00:17.565088
20180201 unzip finished
less stocks


{2001914}

20180201
order finished
0:06:43.062987
0:00:15.877069
20180202 unzip finished
less stocks


{2001914}

20180202
order finished
0:05:39.833827
0:00:15.050018
20180205 unzip finished
less stocks


{2001914}

20180205
order finished
0:05:04.167137
0:00:17.003395
20180206 unzip finished
less stocks


{2001914}

20180206
order finished
0:05:59.321689
0:00:15.786458
20180207 unzip finished
less stocks


{2001914}

20180207
order finished
0:05:53.222993
0:00:14.151549
20180208 unzip finished
less stocks


{2001914}

20180208
order finished
0:05:16.285365
0:00:15.736174
20180209 unzip finished
less stocks


{2001914}

20180209
order finished
0:05:58.165742
0:00:13.103288
20180212 unzip finished
less stocks


{2001914}

20180212
order finished
0:04:40.026146
0:00:13.055207
20180213 unzip finished
less stocks


{2001914}

20180213
order finished
0:04:37.459528
0:00:10.937269
20180214 unzip finished
less stocks


{2001914}

20180214
order finished
0:03:37.664909
0:00:12.534550
20180222 unzip finished
less stocks


{2001914}

20180222
order finished
0:04:24.582347
0:00:12.750247
20180223 unzip finished
less stocks


{2001914}

20180223
order finished
0:04:51.587619
0:00:15.779228
20180226 unzip finished
less stocks


{2001914}

20180226
order finished
0:06:06.479273
0:00:17.757917
20180227 unzip finished
less stocks


{2001914}

20180227
order finished
0:06:33.179746
0:00:17.345650
20180228 unzip finished
less stocks


{2001914}

20180228
order finished
0:06:22.642316
0:00:17.676001
20180301 unzip finished
less stocks


{2001914}

20180301
order finished
0:06:33.434194
0:00:16.198069
20180302 unzip finished
less stocks


{2001914}

20180302
order finished
0:06:28.432628
0:00:15.926401
20180305 unzip finished
less stocks


{2001914}

20180305
order finished
0:05:46.409953
0:00:19.398650
20180306 unzip finished
less stocks


{2001914}

20180306
order finished
0:06:57.411389
0:00:16.809244
20180307 unzip finished
less stocks


{2001914}

20180307
order finished
0:06:28.516157
0:00:16.024746
20180308 unzip finished
less stocks


{2001914}

20180308
order finished
0:06:15.604881
0:00:19.204001
20180309 unzip finished
less stocks


{2001914}

20180309
order finished
0:07:28.994862
0:00:21.068572
20180312 unzip finished
less stocks


{2001914}

20180312
order finished
0:08:05.748117
0:00:19.816021
20180313 unzip finished
less stocks


{2001914}

20180313
order finished
0:07:59.237378
0:00:18.312332
20180314 unzip finished
less stocks


{2001914}

20180314
order finished
0:07:01.847033
0:00:18.328504
20180315 unzip finished
less stocks


{2001914}

20180315
order finished
0:07:11.379725
0:00:16.395514
20180316 unzip finished
less stocks


{2001914}

20180316
order finished
0:06:17.441992
0:00:15.928405
20180319 unzip finished
less stocks


{2001914}

20180319
order finished
0:06:02.673082
0:00:16.532107
20180320 unzip finished
less stocks


{2001914}

20180320
order finished
0:06:26.531484
0:00:18.493716
20180321 unzip finished


  interactivity=interactivity, compiler=compiler, result=result)


less stocks


{2001914}

20180321
order finished
0:07:18.875221
0:00:17.054693
20180322 unzip finished
less stocks


{2001914}

20180322
order finished
0:06:25.286991
0:00:20.989584
20180323 unzip finished
less stocks


{2001914}

20180323
order finished
0:08:22.147048
0:00:17.639570
20180326 unzip finished
less stocks


{2001914}

20180326
order finished
0:06:33.937945
0:00:19.286927
20180327 unzip finished
less stocks


{2001914}

20180327
order finished
0:07:39.787753
0:00:18.999646
20180328 unzip finished
less stocks


{2001914}

20180328
order finished
0:07:38.498647
0:00:18.442396
20180329 unzip finished
less stocks


{2001914}

20180329
order finished
0:07:15.415938
0:00:19.842264
20180330 unzip finished
less stocks


{2001914}

20180330
order finished
0:07:17.809251
0:00:21.407315
20180402 unzip finished
less stocks


{2001914}

20180402
order finished
0:08:23.933893
0:00:19.415656
20180403 unzip finished
less stocks


{2001914}

20180403
order finished
0:07:50.418525
0:00:19.140316
20180404 unzip finished
less stocks


{2001914}

20180404
order finished
0:07:23.704858
0:00:17.608209
20180409 unzip finished
less stocks


{2001914}

20180409
order finished
0:06:48.984520
0:00:18.803542
20180410 unzip finished
less stocks


{2001914}

20180410
order finished
0:07:19.915465
0:00:18.863933
20180411 unzip finished
less stocks


{2001914}

20180411
order finished
0:07:04.017052
0:00:18.268056
20180412 unzip finished
less stocks


{2001914}

20180412
order finished
0:07:05.007878
0:00:17.311854
20180413 unzip finished
less stocks


{2001914}

20180413
order finished
0:06:29.447978
0:00:17.619171
20180416 unzip finished
less stocks


{2001914}

20180416
order finished
0:06:57.831247
0:00:18.530960
20180417 unzip finished
less stocks


{2001914}

20180417
order finished
0:07:17.534560
0:00:19.380596
20180418 unzip finished
less stocks


{2001914}

20180418
order finished
0:07:33.473407
0:00:19.495587
20180419 unzip finished
less stocks


{2001914}

20180419
order finished
0:07:11.932020
0:00:18.312387
20180420 unzip finished
less stocks


{2001914}

20180420
order finished
0:07:03.050035
0:00:17.843980
20180423 unzip finished
less stocks


{2001914}

20180423
order finished
0:06:34.339163
0:00:19.094354
20180424 unzip finished
less stocks


{2001914}

20180424
order finished
0:06:59.174439
0:00:18.349650
20180425 unzip finished
less stocks


{2001914}

20180425
order finished
0:07:03.201285
0:00:20.187256
20180426 unzip finished
less stocks


{2001914}

20180426
order finished
0:06:52.571916
0:00:16.542483
20180427 unzip finished
less stocks


{2001914}

20180427
order finished
0:06:19.740968
0:00:15.664752
20180502 unzip finished
less stocks


{2001914}

20180502
order finished
0:06:04.928007
0:00:17.472913
20180503 unzip finished
less stocks


{2001914}

20180503
order finished
0:06:37.835006
0:00:16.077413
20180504 unzip finished
less stocks


{2001914}

20180504
order finished
0:06:21.866788
0:00:16.453385
20180507 unzip finished
less stocks


{2001914}

20180507
order finished
0:06:30.859304
0:00:18.084797
20180508 unzip finished
less stocks


{2001914}

20180508
order finished
0:07:14.780327
0:00:16.922069
20180509 unzip finished
less stocks


{2001914}

20180509
order finished
0:06:38.625565
0:00:18.190436
20180510 unzip finished
less stocks


{2001914}

20180510
order finished
0:06:51.992188
0:00:17.960428
20180511 unzip finished
less stocks


{2001914}

20180511
order finished
0:06:54.328733
0:00:15.996475
20180514 unzip finished
less stocks


{2001914}

20180514
order finished
0:06:25.874431
0:00:15.932719
20180515 unzip finished
less stocks


{2001914}

20180515
order finished
0:06:07.679163
0:00:18.276837
20180516 unzip finished
less stocks


{2001914}

20180516
order finished
0:06:19.942097
0:00:15.753956
20180517 unzip finished
less stocks


{2001914}

20180517
order finished
0:06:05.344541
0:00:16.815732
20180518 unzip finished
less stocks


{2001914}

20180518
order finished
0:06:07.974394
0:00:17.661990
20180521 unzip finished
less stocks


{2001914}

20180521
order finished
0:06:57.528936
0:00:18.040902
20180522 unzip finished
less stocks


{2001914}

20180522
order finished
0:06:58.899012
0:00:18.784033
20180523 unzip finished
less stocks


{2001914}

20180523
order finished
0:07:28.093350
0:00:16.704679
20180524 unzip finished
less stocks


{2001914}

20180524
order finished
0:06:25.777311
0:00:17.355552
20180525 unzip finished
less stocks


{2001914}

20180525
order finished
0:06:44.429939
0:00:16.928164
20180528 unzip finished
less stocks


{2001914}

20180528
order finished
0:06:52.453413
0:00:18.310608
20180529 unzip finished
less stocks


{2001914}

20180529
order finished
0:06:41.237046
0:00:17.193978
20180530 unzip finished
less stocks


{2001914}

20180530
order finished
0:07:06.415700
0:00:16.239665
20180531 unzip finished
less stocks


{2001914}

20180531
order finished
0:06:24.432930
0:00:16.870324
20180601 unzip finished
less stocks


{2001914}

20180601
order finished
0:06:20.536785
0:00:14.875665
20180604 unzip finished
less stocks


{2001914}

20180604
order finished
0:05:43.265163
0:00:15.541596
20180605 unzip finished
less stocks


{2001914}

20180605
order finished
0:06:08.006074
0:00:15.547180
20180606 unzip finished
less stocks


{2001914}

20180606
order finished
0:06:06.398417
0:00:15.358212
20180607 unzip finished
less stocks


{2001914}

20180607
order finished
0:06:02.611598
0:00:15.749122
20180608 unzip finished
less stocks


{2001914}

20180608
order finished
0:06:12.867474
0:00:13.404895
20180611 unzip finished
less stocks


{2001914}

20180611
order finished
0:05:41.136380
0:00:14.842294
20180612 unzip finished
less stocks


{2001914}

20180612
order finished
0:06:13.220974
0:00:14.957685
20180613 unzip finished


  interactivity=interactivity, compiler=compiler, result=result)


less stocks


{2001914}

20180613
order finished
0:05:55.368262
0:00:14.424780
20180614 unzip finished
less stocks


{2001914}

20180614
order finished
0:05:25.343731
0:00:15.604853
20180615 unzip finished
less stocks


{2001914}

20180615
order finished
0:06:04.562583
0:00:16.960267
20180619 unzip finished
less stocks


{2001914}

20180619
order finished
0:06:36.961886
0:00:16.458245
20180620 unzip finished
less stocks


{2001914}

20180620
order finished
0:06:05.777486
0:00:15.744724
20180621 unzip finished
less stocks


{2001914}

20180621
order finished
0:06:17.036410
0:00:14.522742
20180622 unzip finished
less stocks


{2001914}

20180622
order finished
0:05:47.975341
0:00:13.908047
20180625 unzip finished
less stocks


{2001914}

20180625
order finished
0:05:38.426917
0:00:16.076314
20180626 unzip finished
less stocks


{2001914}

20180626
order finished
0:06:07.860806
0:00:15.193149
20180627 unzip finished
less stocks


{2001914}

20180627
order finished
0:06:06.798470
0:00:14.993349
20180628 unzip finished
less stocks


{2001914}

20180628
order finished
0:05:51.721587
0:00:16.372911
20180629 unzip finished
less stocks


{2001914}

20180629
order finished
0:06:26.937653
0:00:15.654776
20180702 unzip finished
less stocks


{2001914}

20180702
order finished
0:06:25.834728
0:00:17.615437
20180703 unzip finished
less stocks


{2001914}

20180703
order finished
0:06:58.338099
0:00:15.569742
20180704 unzip finished
less stocks


{2001914}

20180704
order finished
0:06:07.356209
0:00:14.743653
20180705 unzip finished
less stocks


{2001914}

20180705
order finished
0:06:10.911651
0:00:16.123562
20180706 unzip finished
less stocks


{2001914}

20180706
order finished
0:06:17.787481
0:00:14.904952
20180709 unzip finished
less stocks


{2001914}

20180709
order finished
0:05:37.836228
0:00:15.350420
20180710 unzip finished
less stocks


{2001872, 2001914}

20180710
order finished
0:05:50.363044
0:00:16.892843
20180711 unzip finished
less stocks


{2001872, 2001914}

20180711
order finished
0:06:01.914301
0:00:17.562680
20180712 unzip finished
less stocks


{2001872, 2001914}

20180712
order finished
0:06:48.496495
0:00:15.724632
20180713 unzip finished
less stocks


{2001872, 2001914}

20180713
order finished
0:06:16.809287
0:00:14.537138
20180716 unzip finished
less stocks


{2001872, 2001914}

20180716
order finished
0:05:47.812167
0:00:14.616277
20180717 unzip finished
less stocks


{2001872, 2001914}

20180717
order finished
0:05:44.168600
0:00:15.129265
20180718 unzip finished
less stocks


{2001872, 2001914}

20180718
order finished
0:05:59.104202
0:00:14.984794
20180719 unzip finished
less stocks


{2001872, 2001914}

20180719
order finished
0:05:33.157577
0:00:15.954371
20180720 unzip finished
less stocks


{2001872, 2001914}

20180720
order finished
0:05:46.957150
0:00:15.797503
20180723 unzip finished
less stocks


{2001872, 2001914}

20180723
order finished
0:06:22.224145
0:00:17.735208
20180724 unzip finished
less stocks


{2001872, 2001914}

20180724
order finished
0:07:09.315471
0:00:16.386070
20180725 unzip finished
less stocks


{2001872, 2001914}

20180725
order finished
0:06:40.481096
0:00:18.766378
20180726 unzip finished
less stocks


{2001872, 2001914}

20180726
order finished
0:06:44.215940
0:00:14.407674
20180727 unzip finished
less stocks


{2001872, 2001914}

20180727
order finished
0:05:40.812098
0:00:14.548657
20180730 unzip finished
less stocks


{2001872, 2001914}

20180730
order finished
0:05:46.041285
0:00:12.925391
20180731 unzip finished
less stocks


{2001872, 2001914}

20180731
order finished
0:05:05.725817


In [18]:
pd.set_option("max_columns", 200)
d = pd.read_csv('/mnt/e/unzip_data/2017/SZ/20170901/order/000001.csv')
d

Unnamed: 0,OrderQty,OrdType,TransactTime,ExpirationDays,Side,ApplSeqNum,Contactor,SendingTime,Price,ChannelNo,ExpirationType,ContactInfo,ConfirmID
0,1000,2,20170901091500000,0,1,223,,20170901091500000,11.21,2011,0,,
1,200,2,20170901091500000,0,2,324,,20170901091500000,11.80,2011,0,,
2,1000,2,20170901091500020,0,1,616,,20170901091500000,11.02,2011,0,,
3,1000,2,20170901091500030,0,2,667,,20170901091500000,11.80,2011,0,,
4,1000,2,20170901091500030,0,2,669,,20170901091500000,12.00,2011,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
42402,1300,2,20170901145956230,0,2,11563946,,20170901145956000,11.21,2011,0,,
42403,100,2,20170901145956290,0,2,11563965,,20170901145956000,11.18,2011,0,,
42404,500,2,20170901145959030,0,1,11564604,,20170901145958000,11.21,2011,0,,
42405,13700,2,20170901145959140,0,1,11564627,,20170901145959000,11.33,2011,0,,
