In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        if len(df) == 0: return

        multi_date = False

        if self.date_column in df.columns:
            date = str(df.head(1)[self.date_column].iloc[0])
            multi_date = len(df[self.date_column].unique()) > 1
        else:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            for symbol, sub_df in df.groupby([self.symbol_column]):
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        query = {}

        def parse_date(x):
            if type(x) == str:
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            elif type(x) == datetime.datetime or type(x) == datetime.date:
                return x.strftime("%Y%m%d")
            elif type(x) == int:
                return parse_date(str(x))
            else:
                raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        def parse_symbol(x):
            if type(x) == int:
                return x
            else:
                return int(x)

        if symbol:
            if type(symbol) == list or type(symbol) == tuple:
                query['symbol'] = {'$in': [parse_symbol(x) for x in symbol]}
            else:
                query['symbol'] = parse_symbol(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        def unpickle(s):
            return pickle.loads(s)

        if version == 1:
            return unpickle(gzip.decompress(s))
        elif version == 2:
            return unpickle(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    if pd.__version__ < '0.24':
        import sys
        from types import ModuleType
        from pandas.core.internals import BlockManager
        pkg_name = 'pandas.core.internals.managers'
        if pkg_name not in sys.modules:
            m = ModuleType(pkg_name)
            m.BlockManager = BlockManager
            sys.modules[pkg_name] = m
patch_pandas_pickle()

import pandas as pd
import random
import numpy as np
import glob
import os
import pickle
import datetime


startTm = datetime.datetime.now()
readPath = '/home/work516/day_stock/***'
dataPathLs = np.array(glob.glob(readPath))
dataPathLs = dataPathLs[[np.array([os.path.basename(i).split('.')[0][:2] == 'SZ' for i in dataPathLs])]]
db = pd.DataFrame()
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db = pd.concat([db, dayData])
print(datetime.datetime.now() - startTm)

startTm = datetime.datetime.now()
for dd in ['20200102', '20200123', '20200327', '20200525', '20200529']:
    startDate = dd
    endDate = dd
    df = []
    bad = []
    readPath = '/mnt/Kevin_zhenyu/rawData/logs_***_zs_92_01_day_data'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([os.path.basename(i).split('_')[1] for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
    for data in dataPathLs:
        readPath = data + '/mdTradeLog***'
        dataPathLs = np.array(glob.glob(readPath))
        TradeLogSZ1 = pd.read_csv(dataPathLs[0], encoding="utf-8").loc[:, ["TransactTime",
                                                     "ApplSeqNum", "SecurityID", "ExecType", "TradeBSFlag",
                                                     "TradePrice", "TradeQty", "TradeMoney", "BidApplSeqNum",
                                                     "OfferApplSeqNum"]]
        TradeLogSZ1 = TradeLogSZ1[(TradeLogSZ1["SecurityID"] < 4000) | ((TradeLogSZ1["SecurityID"] > 300000) 
                                                                        & (TradeLogSZ1["SecurityID"] < 310000))]
        TradeLogSZ1 = TradeLogSZ1.rename(columns={"TradeBSFlag":"trade_flag", "TradeMoney":"trade_money", "TradePrice":"trade_price",
                                                 "TradeQty":'trade_qty', "ExecType":"trade_type"})
        TradeLogSZ1['date'] = int(os.path.basename(dataPathLs[0]).split('_')[1])
        TradeLogSZ1["skey"] = TradeLogSZ1["SecurityID"] + 2000000
        TradeLogSZ1["time"] = TradeLogSZ1['TransactTime'].astype(np.int64)*1000
        TradeLogSZ1['TransactTime'] = TradeLogSZ1['TransactTime'] + TradeLogSZ1['date'] * 1000000000
        TradeLogSZ1["clockAtArrival"] = TradeLogSZ1["TransactTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
        TradeLogSZ1['datetime'] = TradeLogSZ1["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
        TradeLogSZ1["trade_type"] = np.where(TradeLogSZ1["trade_type"] == 'F', 1, TradeLogSZ1["trade_type"])
        TradeLogSZ1['trade_flag'] = np.where(TradeLogSZ1["trade_flag"] == 'B', 1, np.where(
            TradeLogSZ1["trade_flag"] == 'S', 2, 0))

        for col in ["skey", "date", "ApplSeqNum", "BidApplSeqNum", "OfferApplSeqNum", "trade_qty", "trade_type", "trade_flag"]:
            TradeLogSZ1[col] = TradeLogSZ1[col].astype('int32')
        for cols in ["trade_money", 'trade_price']:
            TradeLogSZ1[cols] = TradeLogSZ1[cols]/10000
        display(TradeLogSZ1["trade_price"].astype(str).apply(lambda x: len(x.split('.')[1])).unique())
        display(TradeLogSZ1["trade_money"].astype(str).apply(lambda x: len(x.split('.')[1])).unique())

        da_te = str(TradeLogSZ1["date"].iloc[0]) 
        da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
        db1 = db[db["date"] == da_te]
        sl = (db1["ID"].str[2:].astype(int) + 2000000).unique()
        db1["max_volume"] = db1.groupby("ID")["d_volume"].transform("max")
        db1["max_amount"] = db1.groupby("ID")["d_amount"].transform("max")
        t1 = db1.groupby("ID")["max_volume", "max_amount"].first().reset_index()
        del db1
        t1["skey"] = t1["ID"].str[2:].astype(int) + 2000000
        trade1 = TradeLogSZ1[TradeLogSZ1["trade_type"] == 1].groupby("skey")["trade_qty"].sum().reset_index()
        trade1.columns=["skey", "cum_volume"]
        trade2 = TradeLogSZ1[TradeLogSZ1["trade_type"] == 1].groupby("skey")["trade_money"].sum().reset_index()
        trade2.columns=["skey", "cum_amount"]
        t2 = pd.merge(trade1, trade2, on="skey")
        re = pd.merge(t1, t2, on="skey", how="outer")
        try:
            assert(t1.shape[0] == t2.shape[0])
            assert(re[re["cum_volume"] != re["max_volume"]].shape[0] == 0)
            assert(re[re["cum_amount"].round(2) != re["max_amount"]].shape[0] == 0)
        except:
            display(set(t1["skey"]) - set(t2["skey"]))
            display(re[re["cum_volume"] != re["max_volume"]])
            display(re[re["cum_amount"].round(2) != re["max_amount"]])
        del t1
        del t2
        del re

        TradeLogSZ1 = TradeLogSZ1[["skey", "date", "time", "clockAtArrival", "datetime", "ApplSeqNum", "trade_type", "trade_flag",
                                                     "trade_price", "trade_qty", "BidApplSeqNum", "OfferApplSeqNum"]]
        print(da_te)
        print("trade finished")

        database_name = 'com_md_eq_cn'
        user = "zhenyuy"
        password = "bnONBrzSMGoE"

        db1 = DB("192.168.10.223", database_name, user, password)
        db1.write('md_trade', TradeLogSZ1)

        del TradeLogSZ1

        print(datetime.datetime.now() - startTm)



0:05:23.526796


array([1, 2])

array([1, 2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


{2002972, 2300811}

Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
1394,SZ002972,73990.0,1629999.7,2002972,,
2181,SZ300811,15442.0,705544.98,2300811,,


Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
1394,SZ002972,73990.0,1629999.7,2002972,,
2181,SZ300811,15442.0,705544.98,2300811,,


2020-01-02
trade finished
0:13:40.802776


array([1, 2])

array([1, 2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


{2002971, 2002973, 2300812, 2300813, 2300815}

Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
1395,SZ002971,17254023.0,434336800.0,2002971,,
1397,SZ002973,10590727.0,186276700.0,2002973,,
2185,SZ300812,11831168.0,864250400.0,2300812,,
2186,SZ300813,131420.0,6768130.0,2300813,,
2187,SZ300815,304197.0,12929940.0,2300815,,


Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
1395,SZ002971,17254023.0,434336800.0,2002971,,
1397,SZ002973,10590727.0,186276700.0,2002973,,
2185,SZ300812,11831168.0,864250400.0,2300812,,
2186,SZ300813,131420.0,6768130.0,2300813,,
2187,SZ300815,304197.0,12929940.0,2300815,,


2020-01-23
trade finished
0:28:10.217371


  interactivity=interactivity, compiler=compiler, result=result)


array([1, 2])

array([1, 2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


{2002977, 2300819, 2300821, 2300822, 2300823, 2300825}

Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
1398,SZ002977,7703166.0,702166600.0,2002977,,
2192,SZ300819,7927130.0,239775100.0,2300819,,
2194,SZ300821,43161516.0,550985200.0,2300821,,
2195,SZ300822,5640143.0,264237500.0,2300822,,
2196,SZ300823,11102828.0,395817600.0,2300823,,
2197,SZ300825,228389.0,2014396.0,2300825,,


Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
1398,SZ002977,7703166.0,702166600.0,2002977,,
2192,SZ300819,7927130.0,239775100.0,2300819,,
2194,SZ300821,43161516.0,550985200.0,2300821,,
2195,SZ300822,5640143.0,264237500.0,2300822,,
2196,SZ300823,11102828.0,395817600.0,2300823,,
2197,SZ300825,228389.0,2014396.0,2300825,,


2020-03-27
trade finished
0:42:05.833599


array([1, 2])

array([1, 2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


{2002988, 2002990, 2300832, 2300833, 2300835}

Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
1401,SZ002988,34811351.0,784896700.0,2002988,,
1402,SZ002990,96968.0,5101541.0,2002990,,
2208,SZ300832,351314.0,37446560.0,2300832,,
2209,SZ300833,2184514.0,185103500.0,2300833,,
2210,SZ300835,32200.0,925230.0,2300835,,


Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
1401,SZ002988,34811351.0,784896700.0,2002988,,
1402,SZ002990,96968.0,5101541.0,2002990,,
2208,SZ300832,351314.0,37446560.0,2300832,,
2209,SZ300833,2184514.0,185103500.0,2300833,,
2210,SZ300835,32200.0,925230.0,2300835,,


2020-05-25
trade finished
0:53:23.812144


  interactivity=interactivity, compiler=compiler, result=result)


array([1, 2])

array([1, 2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


{2002988, 2002990, 2300832, 2300833, 2300835, 2300836}

Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
1400,SZ002988,23789396.0,559250100.0,2002988,,
1401,SZ002990,170656.0,13244610.0,2002990,,
2203,SZ300832,21296985.0,3244855000.0,2300832,,
2204,SZ300833,3651426.0,255272200.0,2300833,,
2205,SZ300835,101200.0,4267604.0,2300835,,
2206,SZ300836,10063.0,449212.3,2300836,,


Unnamed: 0,ID,max_volume,max_amount,skey,cum_volume,cum_amount
1400,SZ002988,23789396.0,559250100.0,2002988,,
1401,SZ002990,170656.0,13244610.0,2002990,,
2203,SZ300832,21296985.0,3244855000.0,2300832,,
2204,SZ300833,3651426.0,255272200.0,2300833,,
2205,SZ300835,101200.0,4267604.0,2300835,,
2206,SZ300836,10063.0,449212.3,2300836,,


2020-05-29
trade finished
1:05:20.914375
