In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Return a ``DBObj`` connected to ``db_name`` on ``host``.

    The ``admin`` and ``root`` users authenticate against the ``admin``
    database; any other user authenticates against ``db_name`` itself.

    NOTE(review): ``user`` and ``passwd`` are placed into the URI verbatim;
    presumably callers percent-escape reserved characters themselves — confirm.
    """
    if user in ('admin', 'root'):
        auth_db = 'admin'
    else:
        auth_db = db_name
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store and load pandas DataFrames in a single MongoDB database.

    Every collection ("table") holds documents shaped like
    ``{'ver', 'data', 'date', 'symbol', 'start'}``; ``data`` is a compressed
    pickle of at most ``chunk_size`` consecutive rows of one (date, symbol)
    group and ``start`` is the row offset of that chunk inside the group.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Open a client for ``uri`` and select database ``db_name``.

        ``symbol_column`` names the DataFrame column used as the per-document
        ``symbol`` key when writing.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split a ``mongodb://user:password@host`` URI into a list of parts."""
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Replace the stored rows of every (date, symbol) group found in ``df``.

        Existing documents for a group are deleted before the group is
        re-inserted in chunks. An empty ``df`` is a no-op.

        Raises:
            Exception: if ``df`` has no ``date`` column.
        """
        if len(df) == 0:
            return

        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        multi_date = len(df[self.date_column].unique()) > 1

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            date = str(df[self.date_column].iloc[0])
            # Group by the bare column name: grouping by a one-element list
            # yields 1-tuples as keys on recent pandas. The key is cast to a
            # plain int exactly as in the multi-date branch so both branches
            # store the same type that build_query() filters on.
            for symbol, sub_df in df.groupby(self.symbol_column):
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` as chunks of ``chunk_size`` rows."""
        version = 1
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            # iloc makes the slice positional regardless of the index type.
            df_seg = df.iloc[start:end]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter for a date range and/or symbol selection.

        Dates may be ``YYYYMMDD`` strings, integers (including numpy
        integers), or ``datetime.date`` / ``datetime.datetime`` instances
        (including subclasses such as ``pandas.Timestamp``). ``symbol`` may be
        a single value or a list/tuple of values; each is converted with
        ``int``. Returns an empty dict when no criterion is given.

        Raises:
            Exception: for a date string that is not 8 characters long or a
                date of an unsupported type.
        """
        import numbers

        query = {}

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            if isinstance(x, datetime.date):
                # datetime.datetime is a subclass of datetime.date.
                return x.strftime("%Y%m%d")
            if isinstance(x, numbers.Integral):
                return parse_date(str(int(x)))
            raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the documents matching the given date range / symbol.

        Refuses (prints a message and returns ``None``) when no criterion is
        given, so that a whole table is never deleted through this method.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load the matching segments and return them as one DataFrame.

        Segments are ordered by (symbol, date, start) before concatenation.
        Returns ``None`` when nothing matches or when no criterion is given.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        """Return the names of the collections in the database."""
        # collection_names() no longer exists in PyMongo 4; fall back to it
        # only when the driver is too old to offer list_collection_names().
        lister = getattr(self.db, 'list_collection_names', None)
        if lister is None:
            lister = self.db.collection_names
        return lister()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values stored for the selection."""
        collection = self.db[table_name]
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        query = self.build_query(start_date, end_date, symbol)
        dates = {x['date'] for x in collection.find(query, {"date": 1, '_id': 0})}
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` and compress it (version 1: gzip, version 2: lzma).

        Raises:
            Exception: for any other ``version``.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of :meth:`ser`: decompress ``s`` and unpickle it.

        NOTE: ``pickle.loads`` executes arbitrary code embedded in the payload;
        only read from databases whose contents are trusted.

        Raises:
            Exception: for a ``version`` other than 1 or 2.
        """
        if version == 1:
            raw = gzip.decompress(s)
        elif version == 2:
            raw = lzma.decompress(s)
        else:
            raise Exception('unknown version')
        return pickle.loads(raw)


def patch_pandas_pickle():
    """Register a ``pandas.core.internals.managers`` alias on pandas < 0.24.

    On such versions a stand-in module exposing ``BlockManager`` is inserted
    into ``sys.modules`` unless a module of that name is already present.
    NOTE(review): presumably this lets pickles that reference
    ``pandas.core.internals.managers.BlockManager`` be loaded by an older
    pandas — confirm.
    """
    import re
    import sys
    from types import ModuleType

    # Compare (major, minor) numerically; as plain strings '0.9.0' < '0.24'
    # is False, which would skip the patch on those older releases.
    parsed = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if parsed is None or (int(parsed.group(1)), int(parsed.group(2))) >= (0, 24):
        return

    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m
patch_pandas_pickle()

In [37]:
import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Use the full option key: the bare "max_columns" pattern also matches
# "styler.render.max_columns" on recent pandas and is rejected as ambiguous.
pd.set_option("display.max_columns", 200)

startTm = datetime.datetime.now()
readPath = '/home/work516/day_stock/***'
# Keep only the files whose base name (before the first '.') starts with "SH".
# Filtering in a comprehension avoids indexing the array with a list-wrapped
# boolean mask, which current numpy no longer accepts, and also copes with an
# empty directory.
dataPathLs = np.array([i for i in glob.glob(readPath)
                       if os.path.basename(i).split('.')[0][:2] == 'SH'])
# Read every file first and concatenate once; growing `db` with pd.concat
# inside the loop re-copies all previously loaded rows on each iteration.
dayFrames = []
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    dayFrames.append(dayData)
db = pd.concat(dayFrames) if dayFrames else pd.DataFrame()
print(datetime.datetime.now() - startTm)

# Column names applied to the raw snapshot CSV, in file order: ten leading
# fields, then for the bid side and the ask side ten "p", ten "q" and ten "n"
# columns followed by fifty numbered columns, then the summary fields.
colnames1 = (
    ['StockID', 'DateTime', 'PreClosePx', 'openPrice', 'HighPx', 'LowPx',
     'close', 'cum_volume', 'cum_amount', 'InstrumentStatus']
    + ['bid%dp' % i for i in range(1, 11)]
    + ['bid%dq' % i for i in range(1, 11)]
    + ['b%dn' % i for i in range(1, 11)]
    + ['b%d' % i for i in range(1, 51)]
    + ['ask%dp' % i for i in range(1, 11)]
    + ['ask%dq' % i for i in range(1, 11)]
    + ['a%dn' % i for i in range(1, 11)]
    + ['a%d' % i for i in range(1, 51)]
    + ['NumTrades', 'IOPV', 'TotalBidQty', 'TotalOfferQty',
       'WeightedAvgBidPx', 'WeightedAvgOfferPx',
       'TotalBidNumber', 'TotalOfferNumber',
       'BidTradeMaxDuration', 'OfferTradeMaxDuration',
       'NumBidOrders', 'NumOfferOrders',
       'WithdrawBuyNumber', 'WithdrawBuyAmount', 'WithdrawBuyMoney',
       'WithdrawSellNumber', 'WithdrawSellAmount', 'WithdrawSellMoney',
       'ETFBuyNumber', 'ETFBuyAmount', 'ETFBuyMoney',
       'ETFSellNumber', 'ETFSellAmount', 'ETFSellMoney']
)

year = "2017"
startDate = '20170630'
endDate = '20170630'
readPath = '/home/work516/KR_upload_code/20170630/SH/snapshot/***'
dataPathLs = np.array(glob.glob(readPath))
date_list = pd.read_csv("/home/work516/KR_upload_code/trading_days.csv")
# Accumulators: the validation cell appends mismatching daily rows to wr_ong
# and rows flagged as has_missing to mi_ss.
wr_ong = []
mi_ss = []
less = []

snapshotPath = '/home/work516/KR_upload_code/20170628/Snapshot.csv'
try1 = pd.read_csv(snapshotPath, nrows=1)
if try1.shape[1] != 2:
    # Only the "two-column first row" layout is handled below; fail here with
    # a clear message instead of a NameError on the undefined SH further down.
    raise ValueError("unsupported Snapshot.csv layout: first row has %d columns, expected 2"
                     % try1.shape[1])

print("first row two columns")
# Re-read with explicit names, skip the first row and drop the last six
# (ETF*) columns.
SH = pd.read_csv(snapshotPath, header=None, names=colnames1).iloc[1:, :-6]
SH["DateTime"] = SH["DateTime"].apply(lambda x: int(x))
# This provisional 'time' is overwritten below, but it must be created here:
# it places the column ahead of skey/date/clockAtArrival/datetime, which is
# the order the positional `SH.columns = [...]` rename expects.
SH['time'] = ((SH["DateTime"] - int(SH["DateTime"].iloc[0]//1000000 * 1000000))*1000).astype(int)
SH['StockID'] = SH['StockID'].astype(int)
SH = SH[(SH['StockID'] >= 600000) & (SH['StockID'] < 700000)]

SH["skey"] = SH["StockID"] + 1000000
SH.drop(["StockID"],axis=1,inplace=True)
# DateTime is YYYYMMDDHHMMSS: the integer part above 10**6 is the date and the
# remainder is HHMMSS, stored in 'time' multiplied by 10**6.
SH["date"] = int(SH["DateTime"].iloc[0]//1000000)
SH["time"] = (SH['DateTime'] - int(SH['DateTime'].iloc[0]//1000000*1000000)).astype(np.int64) * 1000000
# Epoch microseconds; strptime yields a naive datetime, so timestamp() uses
# the machine's local timezone.
SH["clockAtArrival"] = SH["DateTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
SH.drop(["DateTime"],axis=1,inplace=True)
SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))

  


0:01:52.799638
first row two columns


  interactivity=interactivity, compiler=compiler, result=result)


In [29]:
# Derive the storage key: skey is StockID offset by 1000000.
SH["skey"] = SH["StockID"] + 1000000
SH.drop("StockID", axis=1, inplace=True)

# DateTime is YYYYMMDDHHMMSS; split it into the day (taken from the first row)
# and the HHMMSS remainder, the latter stored multiplied by 10**6.
firstStamp = SH["DateTime"].iloc[0]
SH["date"] = int(firstStamp // 1000000)
dayBase = int(firstStamp // 1000000 * 1000000)
SH["time"] = (SH["DateTime"] - dayBase).astype(np.int64) * 1000000


def _epoch_micros(text):
    """Parse a YYYYMMDDHHMMSS string (naive, local time) into epoch microseconds."""
    parsed = datetime.datetime.strptime(text, '%Y%m%d%H%M%S')
    return np.int64(parsed.timestamp() * 1e6)


SH["clockAtArrival"] = SH["DateTime"].astype(str).apply(_epoch_micros)
SH.drop("DateTime", axis=1, inplace=True)
SH["datetime"] = SH["clockAtArrival"].apply(lambda us: datetime.datetime.fromtimestamp(us / 1e6))

# Zero out the rows that carry the column-wide maximum duration.
for durationCol in ('OfferTradeMaxDuration', 'BidTradeMaxDuration'):
    atMax = SH[durationCol] == SH[durationCol].max()
    SH.loc[atMax, durationCol] = 0

In [65]:
# Positional rename of all 191 columns. For each side (bid, then ask) the
# order is: ten "<side>Np" columns, ten "<side>Nq", ten "<side>Nn" and fifty
# "<side>1TopNq"; the derived columns come last.
SH.columns = (
    ['prev_close', 'open', 'high', 'low', 'close', 'cum_volume', 'cum_amount',
     'InstrumentStatus']
    + [
        name
        for side in ('bid', 'ask')
        for name in (
            ['%s%dp' % (side, i) for i in range(1, 11)]
            + ['%s%dq' % (side, i) for i in range(1, 11)]
            + ['%s%dn' % (side, i) for i in range(1, 11)]
            + ['%s1Top%dq' % (side, i) for i in range(1, 51)]
        )
    ]
    + ['cum_trades_cnt', 'IOPV',
       'total_bid_quantity', 'total_ask_quantity',
       'total_bid_vwap', 'total_ask_vwap',
       'total_bid_orders', 'total_ask_orders',
       'bid_trade_max_duration', 'ask_trade_max_duration',
       'total_bid_levels', 'total_ask_levels',
       'cum_canceled_buy_orders', 'cum_canceled_buy_volume', "cum_canceled_buy_amount",
       "cum_canceled_sell_orders", 'cum_canceled_sell_volume', "cum_canceled_sell_amount",
       'time', 'skey', 'date', 'clockAtArrival', 'datetime']
)

In [66]:
# Preview the first five rows of SH.
SH.head(5)

Unnamed: 0,prev_close,open,high,low,close,cum_volume,cum_amount,InstrumentStatus,bid1p,bid2p,bid3p,bid4p,bid5p,bid6p,bid7p,bid8p,bid9p,bid10p,bid1q,bid2q,bid3q,bid4q,bid5q,bid6q,bid7q,bid8q,bid9q,bid10q,bid1n,bid2n,bid3n,bid4n,bid5n,bid6n,bid7n,bid8n,bid9n,bid10n,bid1Top1q,bid1Top2q,bid1Top3q,bid1Top4q,bid1Top5q,bid1Top6q,bid1Top7q,bid1Top8q,bid1Top9q,bid1Top10q,bid1Top11q,bid1Top12q,bid1Top13q,bid1Top14q,bid1Top15q,bid1Top16q,bid1Top17q,bid1Top18q,bid1Top19q,bid1Top20q,bid1Top21q,bid1Top22q,bid1Top23q,bid1Top24q,bid1Top25q,bid1Top26q,bid1Top27q,bid1Top28q,bid1Top29q,bid1Top30q,bid1Top31q,bid1Top32q,bid1Top33q,bid1Top34q,bid1Top35q,bid1Top36q,bid1Top37q,bid1Top38q,bid1Top39q,bid1Top40q,bid1Top41q,bid1Top42q,bid1Top43q,bid1Top44q,bid1Top45q,bid1Top46q,bid1Top47q,bid1Top48q,bid1Top49q,bid1Top50q,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,ask1Top1q,ask1Top2q,ask1Top3q,ask1Top4q,ask1Top5q,ask1Top6q,ask1Top7q,ask1Top8q,ask1Top9q,ask1Top10q,ask1Top11q,ask1Top12q,ask1Top13q,ask1Top14q,ask1Top15q,ask1Top16q,ask1Top17q,ask1Top18q,ask1Top19q,ask1Top20q,ask1Top21q,ask1Top22q,ask1Top23q,ask1Top24q,ask1Top25q,ask1Top26q,ask1Top27q,ask1Top28q,ask1Top29q,ask1Top30q,ask1Top31q,ask1Top32q,ask1Top33q,ask1Top34q,ask1Top35q,ask1Top36q,ask1Top37q,ask1Top38q,ask1Top39q,ask1Top40q,ask1Top41q,ask1Top42q,ask1Top43q,ask1Top44q,ask1Top45q,ask1Top46q,ask1Top47q,ask1Top48q,ask1Top49q,ask1Top50q,cum_trades_cnt,IOPV,total_bid_quantity,total_ask_quantity,total_bid_vwap,total_ask_vwap,total_bid_orders,total_ask_orders,bid_trade_max_duration,ask_trade_max_duration,total_bid_levels,total_ask_levels,cum_canceled_buy_orders,cum_canceled_buy_volume,cum_canceled_buy_amount,cum_canceled_sell_orders,cum_canceled_sell_volume,cum_canceled_sell_amount,time,skey,date,clockAtArrival,datetime
50,6.23,0.0,6.2,6.2,6.2,410900.0,2547580.0,BETW,6.19,6.18,6.17,6.16,6.15,6.14,6.13,6.12,6.11,6.1,33900.0,43300.0,3300.0,45800.0,116800.0,50800.0,71500.0,254400.0,101200.0,187200.0,3.0,7.0,2.0,6.0,24.0,13.0,11.0,23.0,16.0,50.0,3000.0,8900.0,22000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.2,6.21,6.22,6.23,6.24,6.25,6.26,6.27,6.28,6.29,34200.0,45100.0,184000.0,219700.0,286200.0,168600.0,24500.0,15400.0,87800.0,64800.0,3.0,16.0,21.0,38.0,38.0,22.0,9.0,7.0,33.0,33.0,10100.0,2000.0,22100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,105.0,0.0,1799700.0,8047367.0,6.023,6.608,363.0,980.0,0.0,0.0,49.0,66.0,3.0,2000.0,12305.0,16.0,71000.0,448558.0,92501000000,1600497,20170628,1498613101000000,2017-06-28 09:25:01
51,9.74,0.0,0.0,0.0,0.0,0.0,0.0,BETW,9.7,9.66,9.65,9.63,9.6,9.58,9.56,9.55,9.54,9.53,300.0,1000.0,600.0,1000.0,1000.0,4500.0,3900.0,3600.0,600.0,1000.0,1.0,1.0,1.0,1.0,1.0,6.0,4.0,3.0,1.0,1.0,300.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.74,9.78,9.79,9.8,9.81,9.82,9.83,9.84,9.85,9.86,301.0,22400.0,3000.0,6500.0,1000.0,1000.0,1000.0,19100.0,6900.0,3200.0,2.0,11.0,1.0,5.0,1.0,1.0,1.0,5.0,3.0,2.0,1.0,300.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,101500.0,409201.0,9.305,10.156,55.0,186.0,0.0,0.0,35.0,61.0,0.0,0.0,0.0,1.0,200.0,2132.0,92501000000,1600178,20170628,1498613101000000,2017-06-28 09:25:01
52,5.14,0.0,5.12,5.12,5.12,10000.0,51200.0,BETW,5.12,5.11,5.1,5.09,5.08,5.07,5.06,5.05,5.04,5.03,1400.0,32000.0,24000.0,13100.0,20600.0,5700.0,9600.0,125800.0,3000.0,10000.0,1.0,3.0,8.0,6.0,4.0,3.0,2.0,4.0,1.0,1.0,1400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.13,5.14,5.16,5.17,5.18,5.19,5.2,5.21,5.22,5.23,6101.0,185700.0,6000.0,18800.0,11200.0,12100.0,10400.0,19100.0,8600.0,800.0,4.0,4.0,1.0,3.0,3.0,5.0,3.0,1.0,2.0,1.0,1.0,100.0,4000.0,2000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,410300.0,742885.0,4.977,5.386,75.0,156.0,0.0,0.0,29.0,42.0,1.0,1000.0,5090.0,3.0,2700.0,14326.0,92501000000,1600311,20170628,1498613101000000,2017-06-28 09:25:01
53,5.43,0.0,5.4,5.4,5.4,79900.0,431460.0,BETW,5.39,5.38,5.37,5.36,5.35,5.34,5.33,5.32,5.31,5.3,13000.0,57800.0,2900.0,22600.0,49200.0,700.0,24900.0,2000.0,17600.0,72400.0,3.0,6.0,2.0,3.0,7.0,1.0,7.0,2.0,4.0,11.0,7100.0,1600.0,4300.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.4,5.41,5.42,5.43,5.44,5.45,5.46,5.47,5.48,5.49,15400.0,54900.0,4000.0,93400.0,100.0,8900.0,3100.0,18000.0,11500.0,44900.0,2.0,2.0,3.0,4.0,1.0,5.0,2.0,5.0,8.0,12.0,11400.0,4000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,0.0,345300.0,1473439.0,5.302,5.701,75.0,292.0,0.0,0.0,28.0,55.0,1.0,3600.0,19440.0,0.0,0.0,0.0,92501000000,1600331,20170628,1498613101000000,2017-06-28 09:25:01
54,8.91,0.0,8.91,8.91,8.91,1500.0,13365.0,BETW,8.9,8.88,8.87,8.84,8.83,8.81,8.8,8.78,8.76,8.72,500.0,600.0,1000.0,7300.0,1500.0,6700.0,8000.0,3000.0,1000.0,500.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.91,8.93,8.97,8.98,9.0,9.02,9.03,9.04,9.05,9.07,900.0,2200.0,500.0,3100.0,5700.0,500.0,4900.0,5600.0,200.0,1500.0,1.0,2.0,1.0,2.0,4.0,1.0,4.0,2.0,1.0,1.0,900.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,46400.0,637801.0,8.714,9.603,34.0,154.0,0.0,0.0,26.0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,92501000000,1600355,20170628,1498613101000000,2017-06-28 09:25:01


In [67]:
# For every price, amount and VWAP column, print the column name followed by
# the distinct numbers of characters after the '.' in the values' string form.
priceCols = ["prev_close", 'open', "high", "low", "close"] \
    + ['bid%dp' % i for i in range(10, 0, -1)] \
    + ['ask%dp' % i for i in range(1, 11)]
amountCols = ['cum_amount', "cum_canceled_sell_amount", "cum_canceled_buy_amount"]
vwapCols = ['total_bid_vwap', "total_ask_vwap"]

for cols in priceCols + amountCols + vwapCols:
    print(cols)
    fractionLens = SH[cols].astype(str).apply(lambda text: len(text.split('.')[1]))
    print(fractionLens.unique())



prev_close
[2 1]
open
[1 2]
high
[1 2]
low
[1 2]
close
[1 2]
bid10p
[1 2]
bid9p
[2 1]
bid8p
[2 1]
bid7p
[2 1]
bid6p
[2 1]
bid5p
[2 1]
bid4p
[2 1]
bid3p
[2 1]
bid2p
[2 1]
bid1p
[2 1]
ask1p
[1 2]
ask2p
[2 1]
ask3p
[2 1]
ask4p
[2 1]
ask5p
[2 1]
ask6p
[2 1]
ask7p
[2 1]
ask8p
[2 1]
ask9p
[2 1]
ask10p
[2 1]
cum_amount
[1 2]
cum_canceled_sell_amount
[1 2]
cum_canceled_buy_amount
[1 2]
total_bid_vwap
[ 3 16  1  2 15 14]
total_ask_vwap
[ 3 16 15  2  1 14]


In [70]:
import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Use the full option key: the bare "max_columns" pattern also matches
# "styler.render.max_columns" on recent pandas and is rejected as ambiguous.
pd.set_option("display.max_columns", 200)

SH = SH.fillna(0)
# 1-based position of each row within its skey.
SH["ordering"] = SH.groupby("skey").cumcount() + 1

SH["has_missing"] = 0

# Identifier, count and queue columns are stored as int32.
int32Cols = (
    ["skey", "date", "cum_trades_cnt", "total_bid_orders", 'total_ask_orders',
     'total_bid_levels', 'total_ask_levels',
     'cum_canceled_buy_orders', 'cum_canceled_sell_orders', "ordering",
     'bid_trade_max_duration', 'ask_trade_max_duration', 'has_missing']
    + ['bid%dn' % i for i in range(1, 11)]
    + ['bid1Top%dq' % i for i in range(1, 51)]
    + ['ask%dn' % i for i in range(1, 11)]
    + ['ask1Top%dq' % i for i in range(1, 51)]
)
for col in int32Cols:
    SH[col] = SH[col].astype('int32')

# Volume / quantity columns are stored as int64.
int64Cols = (
    ["cum_volume"]
    + ['bid%dq' % i for i in range(1, 11)]
    + ['ask%dq' % i for i in range(1, 11)]
    + ['total_bid_quantity', 'total_ask_quantity',
       'cum_canceled_buy_volume', 'cum_canceled_sell_volume']
)
for col in int64Cols:
    SH[col] = SH[col].astype('int64')

# Round the VWAP columns to three decimals.
for cols in ['total_bid_vwap', "total_ask_vwap"]:
    SH[cols] = SH[cols].apply(lambda x: round(x, 3))

In [72]:
# Before the fill: ignoring zeros, every skey must carry exactly one distinct
# open and exactly one distinct prev_close.
for pxCol in ("open", "prev_close"):
    distinctPerKey = SH[SH[pxCol] != 0].groupby("skey")[pxCol].nunique()
    assert not (distinctPerKey != 1).any()

# Spread the per-skey maximum over the rows selected by each condition.
keyPrevClose = SH.groupby("skey")["prev_close"].transform("max")
SH["prev_close"] = np.where(SH["time"] >= 91500000000, keyPrevClose, SH["prev_close"])
keyOpen = SH.groupby("skey")["open"].transform("max")
SH["open"] = np.where(SH["cum_volume"] > 0, keyOpen, SH["open"])

# The same uniqueness must still hold after the fill ...
for pxCol in ("open", "prev_close"):
    distinctPerKey = SH[SH[pxCol] != 0].groupby("skey")[pxCol].nunique()
    assert not (distinctPerKey != 1).any()
# ... and every row with traded volume must have a positive open.
assert SH.loc[SH["cum_volume"] > 0, "open"].min() > 0


# check 1
# Compare the daily values derived from the snapshots with the rows of `db`
# for the same date.
da_te = str(SH["date"].iloc[0])
da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
# Copy the selection: db1 is modified below, and assigning into a filtered
# view of `db` triggers pandas' SettingWithCopyWarning.
db1 = db[db["date"] == da_te].copy()
# Drop the two-character prefix of the ID and apply the same +1000000 offset
# as skey; turn 'YYYY-MM-DD' into the integer YYYYMMDD.
db1["ID"] = db1["ID"].str[2:].astype(int) + 1000000
db1["date"] = (db1["date"].str[:4] + db1["date"].str[5:7] + db1["date"].str[8:]).astype(int)

# For each skey take the first snapshot that reaches the day's maximum
# cum_volume.
SH["cum_max"] = SH.groupby("skey")["cum_volume"].transform("max")
atDayMax = SH[SH["cum_volume"] == SH["cum_max"]]
s2 = atDayMax.groupby("skey").first().reset_index()
dd = atDayMax.groupby("skey")["time"].first().reset_index()
SH.drop("cum_max", axis=1, inplace=True)
s2 = s2.rename(columns={"skey": "ID", 'open':"d_open", "prev_close":"d_yclose","high":"d_high", "low":"d_low", "close":"d_close", "cum_volume":"d_volume", "cum_amount":"d_amount"})
if SH["date"].iloc[0] < 20180820:
    s2["auction"] = 0
else:
    # From 20180820 on, flag skeys whose maximum volume is first reached
    # after 14:57:00.
    dd["auction"] = np.where(dd["time"]<=145700000000, 0, 1)
    dd = dd.rename(columns={"skey": "ID"})
    s2 = pd.merge(s2, dd[["ID", "auction"]], on="ID")
s2 = s2[["ID", "date", "d_open", "d_yclose", "d_high", "d_low", "d_close", "d_volume", "d_amount", "auction"]]
re = pd.merge(db1, s2, on=["ID", "date", "d_open", "d_yclose","d_high", "d_low", "d_volume"], how="outer")

# Rows without a snapshot-side d_amount found no match on the join keys.
# An explicit check replaces the former assert inside a bare `except:`.
unmatched = re[re["d_amount_y"].isnull()]
if len(unmatched) != 0:
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print(unmatched)
    wr_ong.append(unmatched)

# check 2
# first part
# Build a reference grid with one row per second between 08:30:00 and
# 18:00:00; "time" is HHMMSS * 1000 and "group" is HHMMSS // 10.
date = pd.DataFrame({"Orig": pd.date_range(start='2019-06-10 08:30:00', end='2019-06-10 18:00:00', freq='s')})
date["time"] = date["Orig"].dt.strftime("%H%M%S").astype(int) * 1000
date["group"] = date["time"] // 10000
# The same ten-second bucket for the snapshots.
SH["group"] = SH["time"] // 10000000

# Buckets expected in 09:30-11:30 and 13:00-15:00 that no snapshot falls into.
inMorning = (date["time"] >= 93000000) & (date["time"] < 113000000)
inAfternoon = (date["time"] >= 130000000) & (date["time"] < 150000000)
gl = date.loc[inMorning | inAfternoon, "group"].unique()
l = set(gl) - set(SH["group"].unique())

SH["has_missing1"] = 0
if len(l) == 0:
    print("no massive missing")
    SH.drop("group", axis=1, inplace=True)
else:
    print("massive missing")
    print(l)
    SH["order"] = SH.groupby(["skey", "time"]).cumcount()
    for i in l:
        # Flag, per skey, the first snapshot after each missing bucket.
        laterRows = SH[SH["group"] > i]
        SH["t"] = laterRows.groupby("skey")["time"].transform("min")
        isFirstAfterGap = (SH["time"] == SH["t"]) & (SH["order"] == 0)
        SH["has_missing1"] = np.where(isFirstAfterGap, 1, SH["has_missing1"])
    SH.drop(["order", "t", "group"], axis=1, inplace=True)




# second part
# Per-skey change from the previous snapshot. groupby().diff() keeps the
# original row index; a transform-like groupby().apply() returns a
# group-key-prefixed index on pandas >= 2 and can no longer be assigned back.
SH["time_interval"] = SH.groupby("skey")["datetime"].diff().dt.seconds
SH["tn_update"] = SH.groupby("skey")["cum_trades_cnt"].diff()

# First time at or after 09:30:00, 13:00:00 and 15:00:00 at which the trade
# count changed, per skey (time1 / time2 / time3).
for sessionCol, sessionStart in (("time1", 93000000000),
                                 ("time2", 130000000000),
                                 ("time3", 150000000000)):
    firstUpdate = SH[(SH["time"] >= sessionStart) & (SH["tn_update"] != 0)]\
        .groupby("skey")["time"].min().reset_index()
    firstUpdate = firstUpdate.rename(columns={"time": sessionCol})
    SH = pd.merge(SH, firstUpdate, on="skey", how="left")
del firstUpdate

# Per-skey 99th percentile of the non-zero trade-count changes between
# 09:30:00 and 14:57:00, excluding the time2 snapshot.
p99 = SH[(SH["time"] > 93000000000) & (SH["time"] < 145700000000) & (SH["time"] != SH["time2"]) & (SH["tn_update"] != 0)]\
.groupby("skey")["tn_update"].apply(lambda x: x.describe([0.99])["99%"]).reset_index()
p99 = p99.rename(columns={"tn_update":"99%"})
SH = pd.merge(SH, p99, on="skey", how="left")

# Flag a snapshot when more than 60 seconds passed since the previous one and
# its trade-count jump exceeds the skey's 99th percentile, except at the
# time2 / time3 snapshots and at exactly 10:00:00.
SH["has_missing2"] = np.where((SH["time_interval"] > 60) & (SH["tn_update"] > SH["99%"]) &
     (SH["time"] > SH["time1"]) & (SH["time"] != SH["time2"]) & (SH["time"] != SH["time3"]) & (SH["time"] != 100000000000), 1, 0)
SH.drop(["time_interval", "tn_update", "time1", "time2", "time3", "99%"], axis=1, inplace=True)

SH["has_missing"] = np.where((SH["has_missing1"] == 1) | (SH["has_missing2"] == 1), 1, 0)
SH.drop(["has_missing1", "has_missing2"], axis=1, inplace=True)
flaggedRows = SH[SH["has_missing"] == 1]
if flaggedRows.shape[0] != 0:
    print("has missing!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print(flaggedRows.shape[0])
    mi_ss.append(flaggedRows)


SH["has_missing"] = SH["has_missing"].astype('int32')

# Final column layout: keys and timestamps, cumulative fields and prices,
# then the order book (bid levels 10 -> 1 followed by ask levels 1 -> 10 for
# prices, quantities and counts), the two 50-column queues, and the totals.
bidDesc = range(10, 0, -1)
askAsc = range(1, 11)
finalCols = (
    ["skey", "date", "time", "clockAtArrival", "datetime", "ordering",
     "has_missing", "cum_trades_cnt", "cum_volume", "cum_amount",
     "prev_close", "open", "high", "low", "close"]
    + ['bid%dp' % i for i in bidDesc] + ['ask%dp' % i for i in askAsc]
    + ['bid%dq' % i for i in bidDesc] + ['ask%dq' % i for i in askAsc]
    + ['bid%dn' % i for i in bidDesc] + ['ask%dn' % i for i in askAsc]
    + ['bid1Top%dq' % i for i in range(1, 51)]
    + ['ask1Top%dq' % i for i in range(1, 51)]
    + ["total_bid_quantity", "total_ask_quantity",
       "total_bid_vwap", "total_ask_vwap",
       "total_bid_orders", 'total_ask_orders',
       'total_bid_levels', 'total_ask_levels',
       'bid_trade_max_duration', 'ask_trade_max_duration',
       'cum_canceled_buy_orders', 'cum_canceled_buy_volume', "cum_canceled_buy_amount",
       "cum_canceled_sell_orders", 'cum_canceled_sell_volume', "cum_canceled_sell_amount"]
)
SH = SH[finalCols]

# Show the processed date and the resulting dtypes.
display(SH["date"].iloc[0])
print("SH finished")
display(SH.dtypes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


no massive missing
has missing!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
3


20170628

SH finished


skey                                 int32
date                                 int32
time                                 int64
clockAtArrival                       int64
datetime                    datetime64[ns]
ordering                             int32
has_missing                          int32
cum_trades_cnt                       int32
cum_volume                           int64
cum_amount                         float64
prev_close                         float64
open                               float64
high                               float64
low                                float64
close                              float64
bid10p                             float64
bid9p                              float64
bid8p                              float64
bid7p                              float64
bid6p                              float64
bid5p                              float64
bid4p                              float64
bid3p                              float64
bid2p      

In [76]:
# tn_update: change in cum_trades_cnt relative to the previous row of the same
# skey (NaN on each skey's first row). GroupBy.diff() is the vectorised,
# index-aligned equivalent of apply(lambda x: x - x.shift(1)).
SH["tn_update"] = SH.groupby("skey")["cum_trades_cnt"].diff()

# time2: per skey, the earliest `time` at or after 130000000000 whose
# tn_update is not zero.
f2 = (
    SH[(SH["time"] >= 130000000000) & (SH["tn_update"] != 0)]
    .groupby("skey")["time"]
    .min()
    .reset_index()
    .rename(columns={"time": "time2"})
)
SH = SH.merge(f2, on="skey", how="left")
del f2

# 99%: per-skey 0.99 quantile of the non-zero tn_update values observed with
# 93000000000 < time < 145700000000, leaving out the row at time2.
# quantile(0.99) is the value describe([0.99])["99%"] reports, without
# computing the rest of the summary for every group.
in_window = (SH["time"] > 93000000000) & (SH["time"] < 145700000000)
p99 = (
    SH[in_window & (SH["time"] != SH["time2"]) & (SH["tn_update"] != 0)]
    .groupby("skey")["tn_update"]
    .quantile(0.99)
    .reset_index()
    .rename(columns={"tn_update": "99%"})
)
SH = SH.merge(p99, on="skey", how="left")

In [112]:
# Show every snapshot row whose has_missing flag equals 1.
SH[SH['has_missing'] == 1]

Unnamed: 0,skey,date,time,clockAtArrival,datetime,ordering,has_missing,cum_trades_cnt,cum_volume,cum_amount,prev_close,open,high,low,close,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,bid1Top1q,bid1Top2q,bid1Top3q,bid1Top4q,bid1Top5q,bid1Top6q,bid1Top7q,bid1Top8q,bid1Top9q,bid1Top10q,bid1Top11q,bid1Top12q,bid1Top13q,bid1Top14q,bid1Top15q,bid1Top16q,bid1Top17q,bid1Top18q,bid1Top19q,bid1Top20q,bid1Top21q,bid1Top22q,bid1Top23q,bid1Top24q,bid1Top25q,bid1Top26q,bid1Top27q,bid1Top28q,bid1Top29q,bid1Top30q,bid1Top31q,bid1Top32q,bid1Top33q,bid1Top34q,bid1Top35q,bid1Top36q,bid1Top37q,bid1Top38q,bid1Top39q,bid1Top40q,bid1Top41q,bid1Top42q,bid1Top43q,bid1Top44q,bid1Top45q,bid1Top46q,bid1Top47q,bid1Top48q,bid1Top49q,bid1Top50q,ask1Top1q,ask1Top2q,ask1Top3q,ask1Top4q,ask1Top5q,ask1Top6q,ask1Top7q,ask1Top8q,ask1Top9q,ask1Top10q,ask1Top11q,ask1Top12q,ask1Top13q,ask1Top14q,ask1Top15q,ask1Top16q,ask1Top17q,ask1Top18q,ask1Top19q,ask1Top20q,ask1Top21q,ask1Top22q,ask1Top23q,ask1Top24q,ask1Top25q,ask1Top26q,ask1Top27q,ask1Top28q,ask1Top29q,ask1Top30q,ask1Top31q,ask1Top32q,ask1Top33q,ask1Top34q,ask1Top35q,ask1Top36q,ask1Top37q,ask1Top38q,ask1Top39q,ask1Top40q,ask1Top41q,ask1Top42q,ask1Top43q,ask1Top44q,ask1Top45q,ask1Top46q,ask1Top47q,ask1Top48q,ask1Top49q,ask1Top50q,total_bid_quantity,total_ask_quantity,total_bid_vwap,total_ask_vwap,total_bid_orders,total_ask_orders,total_bid_levels,total_ask_levels,bid_trade_max_duration,ask_trade_max_duration,cum_canceled_buy_orders,cum_canceled_buy_volume,cum_canceled_buy_amount,cum_canceled_sell_orders,cum_canceled_sell_volume,cum_canceled_sell_amount,tn_update,time2,99%,time_diff
1735578,1603520,20170628,110639000000,1498619199000000,2017-06-28 11:06:39,755,1,471,260000,8042606.0,31.24,31.02,31.24,30.8,30.8,30.67,30.68,30.69,30.7,30.71,30.72,30.75,30.76,30.77,30.78,30.91,30.92,30.94,30.96,30.97,30.98,31.0,31.02,31.1,31.11,200,600,1000,2200,1500,900,500,3600,1100,300,100,100,3500,1800,1200,1100,1600,6,500,1000,1,1,1,5,2,2,1,2,3,1,1,1,2,1,1,2,2,1,1,1,300,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100000,205619,29.962,32.723,151,218,77,120,4812,997,99,72300,2229339.0,91,148000,4635900.0,22.0,130003000000.0,11.33,103000000.0
1983219,1600747,20170628,112336000000,1498620216000000,2017-06-28 11:23:36,1155,1,888,5079824,12752603.46,2.54,2.52,2.53,2.49,2.52,2.42,2.43,2.44,2.45,2.46,2.47,2.48,2.49,2.5,2.51,2.52,2.53,2.54,2.55,2.56,2.57,2.58,2.59,2.6,2.61,100200,52600,62100,55900,35800,197700,411100,269845,332600,141599,47900,339600,127000,292900,241800,187300,29600,72500,325600,67100,17,11,10,19,12,26,44,24,34,13,5,18,22,34,16,15,12,17,24,9,47099,10200,5000,200,400,23600,2100,4100,10000,20000,10500,500,7900,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1900,1000,5000,30000,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1880544,3397506,2.47,2.608,237,346,11,16,2445,2877,182,2361432,5937551.0,198,2697968,6836692.36,23.0,130003000000.0,16.0,103000000.0
2165476,1600228,20170628,130648000000,1498626408000000,2017-06-28 13:06:48,1052,1,652,1232201,13832890.96,11.14,11.14,11.33,11.07,11.2,11.1,11.12,11.13,11.14,11.15,11.16,11.17,11.18,11.19,11.2,11.23,11.25,11.26,11.27,11.28,11.29,11.3,11.31,11.32,11.33,1200,2400,500,5300,15200,9500,2200,1100,7000,4200,800,3555,1900,9700,20201,4500,43100,123100,12600,15500,2,2,1,5,4,2,2,2,2,3,1,6,3,4,2,5,12,3,4,6,1400,800,2000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,800,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,223300,587757,10.978,11.46,92,221,38,34,3750,3154,184,496899,5534321.86,152,431760,4869693.36,12.0,130003000000.0,8.37,124000000.0


In [149]:
# First 10 rows of skey 1600228 with ordering >= 1050.
SH[(SH['skey'] == 1600228) & (SH['ordering'] >= 1050)].head(10)

Unnamed: 0,skey,date,time,clockAtArrival,datetime,ordering,has_missing,cum_trades_cnt,cum_volume,cum_amount,prev_close,open,high,low,close,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,bid1Top1q,bid1Top2q,bid1Top3q,bid1Top4q,bid1Top5q,bid1Top6q,bid1Top7q,bid1Top8q,bid1Top9q,bid1Top10q,bid1Top11q,bid1Top12q,bid1Top13q,bid1Top14q,bid1Top15q,bid1Top16q,bid1Top17q,bid1Top18q,bid1Top19q,bid1Top20q,bid1Top21q,bid1Top22q,bid1Top23q,bid1Top24q,bid1Top25q,bid1Top26q,bid1Top27q,bid1Top28q,bid1Top29q,bid1Top30q,bid1Top31q,bid1Top32q,bid1Top33q,bid1Top34q,bid1Top35q,bid1Top36q,bid1Top37q,bid1Top38q,bid1Top39q,bid1Top40q,bid1Top41q,bid1Top42q,bid1Top43q,bid1Top44q,bid1Top45q,bid1Top46q,bid1Top47q,bid1Top48q,bid1Top49q,bid1Top50q,ask1Top1q,ask1Top2q,ask1Top3q,ask1Top4q,ask1Top5q,ask1Top6q,ask1Top7q,ask1Top8q,ask1Top9q,ask1Top10q,ask1Top11q,ask1Top12q,ask1Top13q,ask1Top14q,ask1Top15q,ask1Top16q,ask1Top17q,ask1Top18q,ask1Top19q,ask1Top20q,ask1Top21q,ask1Top22q,ask1Top23q,ask1Top24q,ask1Top25q,ask1Top26q,ask1Top27q,ask1Top28q,ask1Top29q,ask1Top30q,ask1Top31q,ask1Top32q,ask1Top33q,ask1Top34q,ask1Top35q,ask1Top36q,ask1Top37q,ask1Top38q,ask1Top39q,ask1Top40q,ask1Top41q,ask1Top42q,ask1Top43q,ask1Top44q,ask1Top45q,ask1Top46q,ask1Top47q,ask1Top48q,ask1Top49q,ask1Top50q,total_bid_quantity,total_ask_quantity,total_bid_vwap,total_ask_vwap,total_bid_orders,total_ask_orders,total_bid_levels,total_ask_levels,bid_trade_max_duration,ask_trade_max_duration,cum_canceled_buy_orders,cum_canceled_buy_volume,cum_canceled_buy_amount,cum_canceled_sell_orders,cum_canceled_sell_volume,cum_canceled_sell_amount,tn_update,time2,99%,time_diff
2141188,1600228,20170628,130500000000,1498626300000000,2017-06-28 13:05:00,1050,0,640,1201701,13491143.96,11.14,11.14,11.33,11.07,11.22,11.13,11.14,11.15,11.16,11.17,11.18,11.19,11.2,11.21,11.22,11.23,11.25,11.26,11.27,11.28,11.29,11.3,11.31,11.32,11.33,500,5300,15200,9500,2200,1100,7000,21900,10900,1900,800,3455,1900,9700,20201,4500,43100,123100,12600,15500,1,5,4,2,2,2,2,5,5,3,1,5,3,4,2,5,12,3,4,6,200,1500,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,800,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,253800,587657,11.005,11.46,102,220,40,34,3290,3154,184,496899,5534321.86,152,431760,4869693.36,1.0,130003000000.0,8.37,58000000.0
2146552,1600228,20170628,130524000000,1498626324000000,2017-06-28 13:05:24,1051,0,640,1201701,13491143.96,11.14,11.14,11.33,11.07,11.22,11.13,11.14,11.15,11.16,11.17,11.18,11.19,11.2,11.21,11.22,11.23,11.25,11.26,11.27,11.28,11.29,11.3,11.31,11.32,11.33,500,5300,15200,9500,2200,1100,7000,21900,10900,1900,800,3555,1900,9700,20201,4500,43100,123100,12600,15500,1,5,4,2,2,2,2,5,5,3,1,6,3,4,2,5,12,3,4,6,200,1500,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,800,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,253800,587757,11.005,11.46,102,221,40,34,3290,3154,184,496899,5534321.86,152,431760,4869693.36,0.0,130003000000.0,8.37,24000000.0
2165476,1600228,20170628,130648000000,1498626408000000,2017-06-28 13:06:48,1052,1,652,1232201,13832890.96,11.14,11.14,11.33,11.07,11.2,11.1,11.12,11.13,11.14,11.15,11.16,11.17,11.18,11.19,11.2,11.23,11.25,11.26,11.27,11.28,11.29,11.3,11.31,11.32,11.33,1200,2400,500,5300,15200,9500,2200,1100,7000,4200,800,3555,1900,9700,20201,4500,43100,123100,12600,15500,2,2,1,5,4,2,2,2,2,3,1,6,3,4,2,5,12,3,4,6,1400,800,2000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,800,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,223300,587757,10.978,11.46,92,221,38,34,3750,3154,184,496899,5534321.86,152,431760,4869693.36,12.0,130003000000.0,8.37,124000000.0
2183569,1600228,20170628,130806000000,1498626486000000,2017-06-28 13:08:06,1053,0,652,1232201,13832890.96,11.14,11.14,11.33,11.07,11.2,11.1,11.12,11.13,11.14,11.15,11.16,11.17,11.18,11.19,11.2,11.23,11.25,11.26,11.27,11.28,11.29,11.3,11.31,11.32,11.33,1200,2400,500,5300,15200,9500,2200,1100,7000,4300,800,3555,1900,9700,20201,4500,43100,123100,12600,15500,2,2,1,5,4,2,2,2,2,4,1,6,3,4,2,5,12,3,4,6,1400,800,2000,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,800,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,223400,587757,10.978,11.46,93,221,38,34,3750,3154,184,496899,5534321.86,152,431760,4869693.36,0.0,130003000000.0,8.37,158000000.0
2184011,1600228,20170628,130809000000,1498626489000000,2017-06-28 13:08:09,1054,0,653,1232301,13834013.96,11.14,11.14,11.33,11.07,11.23,11.1,11.12,11.13,11.14,11.15,11.16,11.17,11.18,11.19,11.2,11.23,11.25,11.26,11.27,11.28,11.29,11.3,11.31,11.32,11.33,1200,2400,500,5300,15200,9500,2200,1100,7000,4300,700,3555,1900,9700,20201,4500,43100,123100,12600,15500,2,2,1,5,4,2,2,2,2,4,1,6,3,4,2,5,12,3,4,6,1400,800,2000,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,223400,587657,10.978,11.46,93,221,38,34,3750,3154,184,496899,5534321.86,152,431760,4869693.36,1.0,130003000000.0,8.37,3000000.0
2189985,1600228,20170628,130836000000,1498626516000000,2017-06-28 13:08:36,1055,0,653,1232301,13834013.96,11.14,11.14,11.33,11.07,11.23,11.1,11.12,11.13,11.14,11.15,11.16,11.17,11.18,11.19,11.2,11.23,11.25,11.26,11.27,11.28,11.29,11.3,11.31,11.32,11.33,1200,2400,500,5300,15200,9500,2200,1100,7000,4300,700,3555,1900,9700,20201,4500,43100,123100,12600,15500,2,2,1,5,4,2,2,2,2,4,1,6,3,4,2,5,12,3,4,6,1400,800,2000,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,224400,587657,10.978,11.46,94,221,38,34,3750,3154,184,496899,5534321.86,152,431760,4869693.36,0.0,130003000000.0,8.37,27000000.0
2191623,1600228,20170628,130842000000,1498626522000000,2017-06-28 13:08:42,1056,0,653,1232301,13834013.96,11.14,11.14,11.33,11.07,11.23,11.1,11.12,11.13,11.14,11.15,11.16,11.17,11.18,11.19,11.2,11.23,11.25,11.26,11.27,11.28,11.29,11.3,11.31,11.32,11.33,1200,2400,500,5300,15200,9500,2200,6100,7000,4300,700,3555,1900,9700,20201,4500,43100,123100,12600,15500,2,2,1,5,4,2,2,3,2,4,1,6,3,4,2,5,12,3,4,6,1400,800,2000,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,229400,587657,10.983,11.46,95,221,38,34,3750,3154,184,496899,5534321.86,152,431760,4869693.36,0.0,130003000000.0,8.37,6000000.0
2202091,1600228,20170628,130927000000,1498626567000000,2017-06-28 13:09:27,1057,0,654,1233001,13841874.96,11.14,11.14,11.33,11.07,11.23,11.12,11.13,11.14,11.15,11.16,11.17,11.18,11.19,11.2,11.23,11.25,11.26,11.27,11.28,11.29,11.3,11.31,11.32,11.33,11.34,2400,500,5300,15200,9500,2200,6100,7000,4300,16300,3555,1900,9700,20201,4500,43100,123100,12600,15500,10000,2,1,5,4,2,2,3,2,4,1,6,3,4,2,5,12,3,4,6,3,16300,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1054,1,400,1600,400,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,245700,586957,10.999,11.46,96,220,39,33,3750,3154,184,496899,5534321.86,152,431760,4869693.36,1.0,130003000000.0,8.37,85000000.0
2202922,1600228,20170628,130930000000,1498626570000000,2017-06-28 13:09:30,1058,0,654,1233001,13841874.96,11.14,11.14,11.33,11.07,11.23,11.12,11.13,11.14,11.15,11.16,11.17,11.18,11.19,11.2,11.23,11.25,11.26,11.27,11.28,11.29,11.3,11.31,11.32,11.33,11.34,2400,500,5300,15200,9500,2200,6100,7000,4300,16300,3555,1900,9700,21801,4500,43100,123100,12600,15500,10000,2,1,5,4,2,2,3,2,4,1,6,3,4,3,5,12,3,4,6,3,16300,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1054,1,400,1600,400,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,245700,588557,10.999,11.46,96,221,39,33,3750,3154,184,496899,5534321.86,152,431760,4869693.36,0.0,130003000000.0,8.37,3000000.0
2203354,1600228,20170628,130933000000,1498626573000000,2017-06-28 13:09:33,1059,0,654,1233001,13841874.96,11.14,11.14,11.33,11.07,11.23,11.12,11.13,11.14,11.15,11.16,11.17,11.18,11.19,11.2,11.23,11.25,11.26,11.27,11.28,11.29,11.3,11.31,11.32,11.33,11.34,2400,500,5300,15200,9500,2200,6100,7000,5400,16300,3555,1900,9700,21801,4500,43100,123100,12600,15500,10000,2,1,5,4,2,2,3,2,5,1,6,3,4,3,5,12,3,4,6,3,16300,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1054,1,400,1600,400,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,246800,588557,11.0,11.46,97,221,39,33,3750,3154,184,496899,5534321.86,152,431760,4869693.36,0.0,130003000000.0,8.37,3000000.0


In [133]:
# Largest cum_volume value present in SH for skey 1600747.
SH[SH['skey'] == 1600747]['cum_volume'].max()

7580052

In [122]:
# Load the 20170628 tick file with explicit column labels. Because `names` is
# given without `header`, the file's first line is parsed as a data row; that
# row is then discarded with iloc[1:].
# NOTE(review): presumably the discarded row is the file's own header line -
# if so, passing header=0 would avoid parsing it as data; confirm.
trade = pd.read_csv(
    '/home/work516/KR_upload_code/20170628/Tick.csv',
    names=['StockID', 'TransactTime', 'TradePrice', 'TradeQty', 'TradeAmount', 'BuyNo'],
).iloc[1:]
# Running total of TradeQty within each StockID, in file order.
trade['cum_volume'] = trade.groupby('StockID')['TradeQty'].cumsum()

  interactivity=interactivity, compiler=compiler, result=result)


In [165]:
# First 100 trades of StockID 600228 with TransactTime >= 2017062813040000.
trade[(trade['StockID'] == 600228) & (trade['TransactTime'] >= 2017062813040000)].head(100)

Unnamed: 0,StockID,TransactTime,TradePrice,TradeQty,TradeAmount,BuyNo,cum_volume
4834220,600228,2017062813042365,11.22,500.0,5610.0,1505116.0,1191601.0
4839714,600228,2017062813044102,11.22,1000.0,11220.0,1505116.0,1192601.0
4847166,600228,2017062813045980,11.22,500.0,5610.0,1505116.0,1193101.0
4894461,600228,2017062813064752,11.22,200.0,2244.0,1505116.0,1193301.0
4894462,600228,2017062813064752,11.22,300.0,3366.0,1518183.0,1193601.0
4894484,600228,2017062813064807,11.22,1200.0,13464.0,1518183.0,1194801.0
4894485,600228,2017062813064807,11.22,200.0,2244.0,1563933.0,1195001.0
4894486,600228,2017062813064807,11.21,2000.0,22420.0,1125092.0,1197001.0
4894487,600228,2017062813064807,11.21,400.0,4484.0,1128103.0,1197401.0
4894488,600228,2017062813064807,11.21,500.0,5605.0,1334567.0,1197901.0


In [163]:
# First 20 trades of StockID 600228 with TransactTime >= 2017062813064752.
trade[(trade['StockID'] == 600228) & (trade['TransactTime'] >= 2017062813064752)].head(20)

Unnamed: 0,StockID,TransactTime,TradePrice,TradeQty,TradeAmount,BuyNo,cum_volume
4894461,600228,2017062813064752,11.22,200.0,2244.0,1505116.0,1193301.0
4894462,600228,2017062813064752,11.22,300.0,3366.0,1518183.0,1193601.0
4894484,600228,2017062813064807,11.22,1200.0,13464.0,1518183.0,1194801.0
4894485,600228,2017062813064807,11.22,200.0,2244.0,1563933.0,1195001.0
4894486,600228,2017062813064807,11.21,2000.0,22420.0,1125092.0,1197001.0
4894487,600228,2017062813064807,11.21,400.0,4484.0,1128103.0,1197401.0
4894488,600228,2017062813064807,11.21,500.0,5605.0,1334567.0,1197901.0
4894489,600228,2017062813064807,11.21,2000.0,22420.0,1493481.0,1199901.0
4894490,600228,2017062813064807,11.21,6000.0,67260.0,1526941.0,1205901.0
4894491,600228,2017062813064807,11.2,5000.0,56000.0,1225610.0,1210901.0


In [109]:
# Summary statistics of time_diff for skey 1600747, limited to rows with
# 93000000000 < time < 150000000000 and excluding the rows at time
# 130003000000 and 125901000000; the summary is cast to int64 for display.
SH[(SH['skey'] == 1600747) & (SH['time'] > 93000000000) & (SH['time'] < 150000000000) & (SH['time'] != 130003000000) & (SH['time'] != 125901000000)]['time_diff'].describe([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]).astype('int64')

count          2046
mean       17570381
std       155741132
min         3000000
10%         3000000
20%         3000000
30%         3000000
40%         3000000
50%         6000000
60%         6000000
70%         9000000
80%        12000000
90%        43000000
95%        52000000
99%        71649999
max      4070000000
Name: time_diff, dtype: int64

In [108]:
# Row(s) of skey 1600747 whose time_diff equals 4070000000.
SH[(SH['skey'] == 1600747) & (SH['time_diff'] == 4070000000)]

Unnamed: 0,skey,date,time,clockAtArrival,datetime,ordering,has_missing,cum_trades_cnt,cum_volume,cum_amount,prev_close,open,high,low,close,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,bid1Top1q,bid1Top2q,bid1Top3q,bid1Top4q,bid1Top5q,bid1Top6q,bid1Top7q,bid1Top8q,bid1Top9q,bid1Top10q,bid1Top11q,bid1Top12q,bid1Top13q,bid1Top14q,bid1Top15q,bid1Top16q,bid1Top17q,bid1Top18q,bid1Top19q,bid1Top20q,bid1Top21q,bid1Top22q,bid1Top23q,bid1Top24q,bid1Top25q,bid1Top26q,bid1Top27q,bid1Top28q,bid1Top29q,bid1Top30q,bid1Top31q,bid1Top32q,bid1Top33q,bid1Top34q,bid1Top35q,bid1Top36q,bid1Top37q,bid1Top38q,bid1Top39q,bid1Top40q,bid1Top41q,bid1Top42q,bid1Top43q,bid1Top44q,bid1Top45q,bid1Top46q,bid1Top47q,bid1Top48q,bid1Top49q,bid1Top50q,ask1Top1q,ask1Top2q,ask1Top3q,ask1Top4q,ask1Top5q,ask1Top6q,ask1Top7q,ask1Top8q,ask1Top9q,ask1Top10q,ask1Top11q,ask1Top12q,ask1Top13q,ask1Top14q,ask1Top15q,ask1Top16q,ask1Top17q,ask1Top18q,ask1Top19q,ask1Top20q,ask1Top21q,ask1Top22q,ask1Top23q,ask1Top24q,ask1Top25q,ask1Top26q,ask1Top27q,ask1Top28q,ask1Top29q,ask1Top30q,ask1Top31q,ask1Top32q,ask1Top33q,ask1Top34q,ask1Top35q,ask1Top36q,ask1Top37q,ask1Top38q,ask1Top39q,ask1Top40q,ask1Top41q,ask1Top42q,ask1Top43q,ask1Top44q,ask1Top45q,ask1Top46q,ask1Top47q,ask1Top48q,ask1Top49q,ask1Top50q,total_bid_quantity,total_ask_quantity,total_bid_vwap,total_ask_vwap,total_bid_orders,total_ask_orders,total_bid_levels,total_ask_levels,bid_trade_max_duration,ask_trade_max_duration,cum_canceled_buy_orders,cum_canceled_buy_volume,cum_canceled_buy_amount,cum_canceled_sell_orders,cum_canceled_sell_volume,cum_canceled_sell_amount,tn_update,time2,99%,time_diff
1639751,1600747,20170628,110003000000,1498618803000000,2017-06-28 11:00:03,1035,0,810,4575724,11484964.46,2.54,2.52,2.53,2.49,2.51,2.42,2.43,2.44,2.45,2.46,2.47,2.48,2.49,2.5,2.51,2.52,2.53,2.54,2.55,2.56,2.57,2.58,2.59,2.6,2.61,100100,52600,62100,55900,35800,153000,375200,208445,208400,203999,181600,320700,164500,328800,261800,187300,28600,72500,325600,65500,16,11,10,19,12,23,43,21,29,16,24,17,24,34,17,15,11,17,24,8,5499,1000,200,800,100000,200,5700,200,50000,23600,8500,1000,1000,300,3000,3000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,18500,10000,1000,5900,12000,5000,1000,30400,10000,1000,2000,1000,1000,38400,400,11000,6500,200,5000,700,2100,10000,1700,6800,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1676644,3580106,2.469,2.603,227,366,11,16,2445,2700,164,2221632,5587578.0,179,2441568,6187560.36,1.0,130003000000.0,16.0,4070000000.0


In [81]:
# Row(s) of skey 1603520 whose tn_update equals 24.
SH[(SH['skey'] == 1603520) & (SH['tn_update'] == 24)]

Unnamed: 0,skey,date,time,clockAtArrival,datetime,ordering,has_missing,cum_trades_cnt,cum_volume,cum_amount,prev_close,open,high,low,close,bid10p,bid9p,bid8p,bid7p,bid6p,bid5p,bid4p,bid3p,bid2p,bid1p,ask1p,ask2p,ask3p,ask4p,ask5p,ask6p,ask7p,ask8p,ask9p,ask10p,bid10q,bid9q,bid8q,bid7q,bid6q,bid5q,bid4q,bid3q,bid2q,bid1q,ask1q,ask2q,ask3q,ask4q,ask5q,ask6q,ask7q,ask8q,ask9q,ask10q,bid10n,bid9n,bid8n,bid7n,bid6n,bid5n,bid4n,bid3n,bid2n,bid1n,ask1n,ask2n,ask3n,ask4n,ask5n,ask6n,ask7n,ask8n,ask9n,ask10n,bid1Top1q,bid1Top2q,bid1Top3q,bid1Top4q,bid1Top5q,bid1Top6q,bid1Top7q,bid1Top8q,bid1Top9q,bid1Top10q,bid1Top11q,bid1Top12q,bid1Top13q,bid1Top14q,bid1Top15q,bid1Top16q,bid1Top17q,bid1Top18q,bid1Top19q,bid1Top20q,bid1Top21q,bid1Top22q,bid1Top23q,bid1Top24q,bid1Top25q,bid1Top26q,bid1Top27q,bid1Top28q,bid1Top29q,bid1Top30q,bid1Top31q,bid1Top32q,bid1Top33q,bid1Top34q,bid1Top35q,bid1Top36q,bid1Top37q,bid1Top38q,bid1Top39q,bid1Top40q,bid1Top41q,bid1Top42q,bid1Top43q,bid1Top44q,bid1Top45q,bid1Top46q,bid1Top47q,bid1Top48q,bid1Top49q,bid1Top50q,ask1Top1q,ask1Top2q,ask1Top3q,ask1Top4q,ask1Top5q,ask1Top6q,ask1Top7q,ask1Top8q,ask1Top9q,ask1Top10q,ask1Top11q,ask1Top12q,ask1Top13q,ask1Top14q,ask1Top15q,ask1Top16q,ask1Top17q,ask1Top18q,ask1Top19q,ask1Top20q,ask1Top21q,ask1Top22q,ask1Top23q,ask1Top24q,ask1Top25q,ask1Top26q,ask1Top27q,ask1Top28q,ask1Top29q,ask1Top30q,ask1Top31q,ask1Top32q,ask1Top33q,ask1Top34q,ask1Top35q,ask1Top36q,ask1Top37q,ask1Top38q,ask1Top39q,ask1Top40q,ask1Top41q,ask1Top42q,ask1Top43q,ask1Top44q,ask1Top45q,ask1Top46q,ask1Top47q,ask1Top48q,ask1Top49q,ask1Top50q,total_bid_quantity,total_ask_quantity,total_bid_vwap,total_ask_vwap,total_bid_orders,total_ask_orders,total_bid_levels,total_ask_levels,bid_trade_max_duration,ask_trade_max_duration,cum_canceled_buy_orders,cum_canceled_buy_volume,cum_canceled_buy_amount,cum_canceled_sell_orders,cum_canceled_sell_volume,cum_canceled_sell_amount,tn_update,time2,99%
1478782,1603520,20170628,104924000000,1498618164000000,2017-06-28 10:49:24,657,0,406,218800,6772116.0,31.24,31.02,31.24,30.81,30.81,30.66,30.67,30.68,30.69,30.7,30.71,30.75,30.76,30.8,30.81,30.95,30.96,30.97,30.98,31.0,31.02,31.09,31.1,31.11,31.14,2000,200,600,1000,1500,300,500,100,9000,1900,400,200,500,1500,1000,6,1300,500,1000,500,1,1,1,1,4,1,1,1,12,1,1,1,1,3,1,1,2,1,1,1,1900,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,400,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96100,191119,29.987,32.736,145,210,75,118,2609,997,85,59200,1826003.0,83,142800,4474370.0,24.0,130003000000.0,11.33


In [84]:
# Summary statistics of the non-zero tn_update values of skey 1600747,
# including the upper percentiles up to 99.9%.
SH[(SH['skey'] == 1600747) & (SH['tn_update'] != 0)]['tn_update'].describe([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 0.999])

count    674.000000
mean       2.063798
std        3.069830
min        1.000000
10%        1.000000
20%        1.000000
30%        1.000000
40%        1.000000
50%        1.000000
60%        1.000000
70%        2.000000
80%        2.000000
90%        4.000000
95%        7.000000
99%       16.000000
99.9%     30.848000
max       47.000000
Name: tn_update, dtype: float64

In [None]:
# Write the SH frame into the 'md_snapshot_l2' table of the 'com_md_eq_cn'
# database through the DB() helper, then drop the local reference to SH.
database_name = 'com_md_eq_cn'
# NOTE(review): the user name and password are hard-coded in the notebook;
# consider loading them from the environment or a secrets store and rotating
# this password.
user = "zhenyuy"
password = "bnONBrzSMGoE"

db1 = DB("192.168.10.178", database_name, user, password)
db1.write('md_snapshot_l2', SH)

# Remove the name SH from the namespace once the upload has been issued.
del SH

In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Create a :class:`DBObj` for database ``db_name`` on ``host``.

    Args:
        host: MongoDB host (optionally ``host:port``) placed verbatim in the URI.
        db_name: Database to open; also used as the ``authSource`` unless the
            user is ``'admin'`` or ``'root'``, in which case ``'admin'`` is used.
        user: Plain (unescaped) user name.
        passwd: Plain (unescaped) password.

    Returns:
        A ``DBObj`` built from the resulting ``mongodb://`` URI.
    """
    from urllib.parse import quote_plus

    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Reserved characters such as '@', ':', '/' or '%' in the credentials would
    # otherwise corrupt the URI, so both parts are percent-escaped.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store and fetch pandas DataFrames in MongoDB as compressed pickles.

    Each DataFrame is split per (date, symbol) and then into row chunks of
    ``chunk_size``. Every chunk becomes one document with the keys ``ver``
    (serialisation version), ``data`` (compressed pickle bytes), ``date``
    (string), ``symbol`` (int) and ``start`` (row offset of the chunk).
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Connect to ``uri`` and select database ``db_name``.

        Args:
            uri: MongoDB connection URI handed to ``pymongo.MongoClient``.
            symbol_column: DataFrame column used as the per-document symbol.
            db_name: Name of the database to operate on.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000  # maximum number of rows per stored document
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split a ``mongodb://user:password@host`` URI into its parts.

        Returns:
            The list produced by splitting on ``:`` and ``@`` after removing
            the scheme and surrounding slashes, e.g. ``[user, password, host]``.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection named ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Replace the stored data for every (date, symbol) present in ``df``.

        Existing documents for each (date, symbol) pair are deleted before the
        new chunks are inserted. An empty ``df`` is a no-op.

        Args:
            table_name: Target collection.
            df: DataFrame containing the date column and the symbol column.

        Raises:
            ValueError: If ``df`` has no date column.
        """
        if len(df) == 0:
            return

        if self.date_column not in df.columns:
            raise ValueError('DataFrame should contain date column')

        dates = df[self.date_column]
        date = str(dates.iloc[0])
        multi_date = len(dates.unique()) > 1

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            # Group by the bare column name (a one-element list makes newer
            # pandas yield 1-tuples as keys) and cast the key to a plain int,
            # matching the multi-date branch and what build_query() looks up.
            for symbol, sub_df in df.groupby(self.symbol_column):
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` as chunks of ``chunk_size`` rows.

        Each chunk is serialised with version 1 and tagged with ``date``,
        ``symbol`` and the row offset ``start`` at which the chunk begins.
        """
        version = 1
        for start in range(0, len(df), self.chunk_size):
            # iloc makes the slice positional regardless of the index type.
            df_seg = df.iloc[start:start + self.chunk_size]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter for a date range and/or symbol(s).

        Args:
            start_date: Inclusive lower bound; a ``YYYYMMDD`` string, an
                integer (including numpy integers), or a ``datetime.date`` /
                ``datetime.datetime`` (subclasses such as ``pd.Timestamp``
                included). ``None`` leaves the bound open.
            end_date: Inclusive upper bound, same accepted forms.
            symbol: A single symbol or a list/tuple of symbols; each is
                converted with ``int``. Falsy values add no symbol filter.

        Returns:
            A dict usable as a ``find`` / ``delete_many`` filter; empty when no
            criterion was given.

        Raises:
            ValueError: If a date string is not 8 characters long.
            TypeError: If a date has an unsupported type.
        """
        import numbers

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise ValueError("`date` must be YYYYMMDD format")
                return x
            if isinstance(x, datetime.date):
                return x.strftime("%Y%m%d")
            if isinstance(x, numbers.Integral):
                return parse_date(str(int(x)))
            raise TypeError("invalid `date` type: " + str(type(x)))

        query = {}

        if start_date is not None or end_date is not None:
            date_range = {}
            if start_date is not None:
                date_range['$gte'] = parse_date(start_date)
            if end_date is not None:
                date_range['$lte'] = parse_date(end_date)
            query['date'] = date_range

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the documents matching the given date range / symbol(s).

        Refuses (prints a message and returns ``None``) when no criterion is
        given, so the whole table cannot be deleted through this method.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load the matching chunks and return them as one DataFrame.

        Chunks are ordered by (symbol, date, start) before concatenation.

        Returns:
            The concatenated DataFrame, or ``None`` when nothing matches or
            when no criterion was given (a message is printed in that case).
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            # NOTE(review): deser() unpickles the stored bytes; only read from
            # databases whose contents are trusted.
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        """Return the names of the collections in the database."""
        # Database.collection_names() was removed in PyMongo 4. The class is
        # probed instead of the instance because attribute access on a
        # Database instance resolves unknown names to a Collection.
        if hasattr(type(self.db), 'list_collection_names'):
            return self.db.list_collection_names()
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values stored for the filter.

        Missing bounds default to ``'00000000'`` and ``'99999999'``.
        """
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` (protocol 4) and compress it.

        Version 1 uses gzip (level 2); version 2 uses lzma (preset 1).

        Raises:
            ValueError: For any other ``version``.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise ValueError('unknown version')

    def deser(self, s, version):
        """Inverse of :meth:`ser`: decompress ``s`` and unpickle the result.

        Raises:
            ValueError: For a ``version`` other than 1 or 2.
        """
        if version == 1:
            raw = gzip.decompress(s)
        elif version == 2:
            raw = lzma.decompress(s)
        else:
            raise ValueError('unknown version')
        return pickle.loads(raw)


def patch_pandas_pickle():
    """Register a stand-in ``pandas.core.internals.managers`` on old pandas.

    On pandas older than 0.24, a module object named
    ``pandas.core.internals.managers`` exposing ``BlockManager`` is inserted
    into ``sys.modules`` unless a module of that name is already loaded -
    presumably so that pickles referencing that module path can be loaded.
    Newer pandas versions, and version strings that do not start with
    ``<major>.<minor>``, are left untouched.
    """
    import re
    import sys
    from types import ModuleType

    # Compare numerically: comparing the raw strings is lexicographic, which
    # mis-orders versions such as '0.9' against '0.24'.
    match = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if match is None or tuple(int(part) for part in match.groups()) >= (0, 24):
        return

    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m
patch_pandas_pickle()











import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Show up to 200 columns when frames are printed.  The fully-qualified key is
# used because the bare "max_columns" pattern is ambiguous on pandas versions
# that also define "styler.render.max_columns" and raises OptionError there.
pd.set_option("display.max_columns", 200)

# Load every gzip-compressed CSV under day_stock whose base name starts with
# "SH" into a single frame ``db`` and report how long that took.
startTm = datetime.datetime.now()
readPath = '/home/work516/day_stock/***'
dataPathLs = np.array(glob.glob(readPath))
# The mask gets an explicit bool dtype so an empty glob still indexes cleanly,
# and it is applied directly: wrapping it in a list (``arr[[mask]]``) relied on
# a NumPy indexing behaviour that was deprecated and later removed.
isSH = np.array([os.path.basename(i).split('.')[0][:2] == 'SH' for i in dataPathLs], dtype=bool)
dataPathLs = dataPathLs[isSH]
# Read all files first and concatenate once; growing ``db`` with pd.concat
# inside the loop re-copies every previously loaded row on each iteration.
dayFrames = []
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    dayFrames.append(dayData)
db = pd.concat(dayFrames) if dayFrames else pd.DataFrame()
del dayFrames
print(datetime.datetime.now() - startTm)


# Trading-day table, plus lists that later steps append flagged rows to
# (wr_ong and mi_ss are filled by the checks further down this cell).
date_list = pd.read_csv("/home/work516/KR_upload_code/trading_days.csv")
wr_ong, mi_ss, less = [], [], []

# Unpack the day's SH snapshot archive with 7za into a directory named after
# the date (the last component of ``data``).
data = '/home/work516/KR_upload_code/20170628'
date = os.path.basename(data)
rar_path = '{}/SH/snapshot.7z'.format(data)
path = '/mnt/e/unzip_data/2017/SH'
path1 = '{}/{}'.format(path, date)
un_path = path1
cmd = '7za x {0} -o{1}'.format(rar_path, un_path)
# The exit status of the command is not inspected.
os.system(cmd)

# Read the extracted per-stock snapshot CSVs into one frame.  The numeric part
# of each file name (before the first '.') is used as the stock code, and only
# codes between 600000 and 700000 (inclusive) are kept.
readPath = path1 + '/snapshot/***2/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
dataPathLs = dataPathLs[((dateLs >= 600000) & (dateLs <= 700000))]
SH = []
ll = []  # codes of the files that could not be read

for i in dataPathLs:
    stockCode = int(os.path.basename(i).split('.')[0])
    try:
        df = pd.read_csv(i, usecols = [0,1,3,5,7,9,10,11,15,17,18,19,20,21,22,23,25,26,28,29,30,31,32,33,37,39,40,41,
                                      42,46,47,49,50])
    except Exception:
        # Best-effort skip of unreadable files, as before.  ``Exception``
        # instead of a bare ``except`` so Ctrl-C / SystemExit still stop the run.
        print("empty data")
        print(i)
        ll.append(stockCode)
        continue
    df["StockID"] = stockCode
    SH.append(df)

if not SH:
    # Previously this case surfaced as a NameError from ``del df`` (no file
    # was ever read), which hid the real cause.
    raise ValueError("no readable snapshot files matched " + readPath)
del df
SH = pd.concat(SH).reset_index(drop=True)


# skey is the stock code offset by 1000000.
SH["skey"] = SH["StockID"] + 1000000
SH.drop(columns=["StockID"], inplace=True)

# QuotTime is handled as an integer shaped like YYYYMMDDHHMMSSmmm: the
# strptime format below and the 10**9 divisor both rely on that layout.
# The day is taken from the FIRST row only and applied to every row.
firstQuot = SH["QuotTime"].iloc[0]
SH["date"] = int(firstQuot // 1000000000)
dayBase = int(firstQuot // 1000000000 * 1000000000)
# Intraday part of QuotTime (HHMMSSmmm) scaled by 1000.
SH["time"] = (SH['QuotTime'] - dayBase).astype(np.int64) * 1000
# Epoch microseconds.  The parsed datetime is naive, so .timestamp()
# interprets it in the machine's local time zone.
SH["clockAtArrival"] = SH["QuotTime"].astype(str).apply(
    lambda text: np.int64(datetime.datetime.strptime(text, '%Y%m%d%H%M%S%f').timestamp()*1e6))
SH.drop(columns=["QuotTime"], inplace=True)
SH['datetime'] = SH["clockAtArrival"].apply(lambda us: datetime.datetime.fromtimestamp(us/1e6))


# Each book column holds a bracketed, comma-separated string (the first and
# last characters are stripped before splitting on ',').  Every source column
# is parsed into a list, fanned out into one column per position, and dropped.
#
# The specs are processed in the original order, so the new columns are
# appended in the same sequence as before; the positional rename that follows
# in this cell depends on that order.
#
# Fix: the two price loops used to pass a stray positional ``2`` to
# Series.apply, which lands in its ``convert_dtype`` / ``args`` parameter
# depending on the pandas version.  The argument is dropped.
#
# (source column, new-column prefix, suffix, depth, element type, dtype cast)
bookSpecs = [
    ("BidPrice", "bid", "p", 10, float, None),
    ("OfferPrice", "ask", "p", 10, float, None),
    ("BidOrderQty", "bid", "q", 10, int, None),
    ("OfferOrderQty", "ask", "q", 10, int, None),
    ("BidNumOrders", "bid", "n", 10, int, 'int32'),
    ("OfferNumOrders", "ask", "n", 10, int, 'int32'),
    ("BidOrders", "bid1Top", "q", 50, int, 'int32'),
    ("OfferOrders", "ask1Top", "q", 50, int, 'int32'),
]

for srcCol, prefix, suffix, depth, elemType, outDtype in bookSpecs:
    # Defaults bind the current spec values into the lambdas.
    parsed = SH[srcCol].apply(lambda x, cast=elemType: [cast(v) for v in x[1:-1].split(',')])
    for i in range(1, depth + 1):
        level = parsed.apply(lambda x, k=i - 1: x[k])
        if outDtype is not None:
            level = level.astype(outDtype)
        SH[prefix + str(i) + suffix] = level
    SH.drop([srcCol], axis=1, inplace=True)
del parsed, level



# Positional rename: the names below are assigned to SH's columns in their
# current order, so this list must line up exactly with the column order
# produced by the steps above (24 remaining raw columns, the 5 key/time
# columns, then the expanded book columns).
rawNames = [
    'cum_trades_cnt', 'ask_trade_max_duration', 'total_bid_orders',
    'cum_canceled_sell_amount', 'total_ask_quantity', 'cum_canceled_buy_orders',
    'total_ask_vwap', 'cum_canceled_sell_volume', 'cum_volume', 'open',
    'high', 'prev_close', 'low', 'total_bid_vwap',
    'cum_canceled_sell_orders', 'total_ask_orders', 'total_ask_levels',
    'total_bid_quantity', 'cum_canceled_buy_volume', 'bid_trade_max_duration',
    'total_bid_levels', 'close', 'cum_amount', 'cum_canceled_buy_amount',
]
keyNames = ['skey', 'date', 'time', 'clockAtArrival', 'datetime']
# bid1p..bid10p, ask1p..ask10p, then the same for 'q' and for 'n'.
levelNames = [side + str(k) + kind
              for kind in 'pqn'
              for side in ('bid', 'ask')
              for k in range(1, 11)]
# bid1Top1q..bid1Top50q followed by ask1Top1q..ask1Top50q.
topNames = [side + '1Top' + str(k) + 'q'
            for side in ('bid', 'ask')
            for k in range(1, 51)]
SH.columns = rawNames + keyNames + levelNames + topNames
SH = SH.fillna(0)
#     SH["p1"] = SH["bid1p"] + SH["ask1p"]
#     tt = SH[(SH["cum_volume"] > 0) & (SH["time"] < 145700000000)].groupby("skey")['p1'].min()
#     SH.drop("p1", axis=1, inplace=True)
#     try:
#         assert(tt[tt == 0].shape[0] == 0)
#     except:
#         display(tt[tt == 0])
#     SH = SH[~((SH["bid1p"] == 0) & (SH["ask1p"] == 0))]
# 1-based running row number within each skey.
SH["ordering"] = SH.groupby("skey").cumcount() + 1

# Placeholder; recomputed by the missing-data checks later in this cell.
SH["has_missing"] = 0

# Columns stored as 32-bit integers.
int32Cols = (
    "skey", "date", "cum_trades_cnt", "total_bid_orders",
    'total_ask_orders', 'total_bid_levels', 'total_ask_levels',
    'cum_canceled_buy_orders', 'cum_canceled_sell_orders',
    "ordering", 'bid_trade_max_duration', 'ask_trade_max_duration', 'has_missing',
)
for col in int32Cols:
    SH[col] = SH[col].astype('int32')

#     for cols in ["prev_close", 'open', "high", "low", "close", 'bid10p','bid9p','bid8p','bid7p','bid6p','bid5p','bid4p','bid3p',
#              'bid2p','bid1p','ask1p','ask2p','ask3p','ask4p','ask5p','ask6p','ask7p','ask8p','ask9p','ask10p']:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())

#     for cols in ['cum_amount', "cum_canceled_sell_amount", "cum_canceled_buy_amount"]:
# #         SH[cols] = SH[cols].apply(lambda x: round(x, 2)).astype('float64')
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())

# Round both vwap columns to 3 decimals, element by element.
for cols in ('total_bid_vwap', "total_ask_vwap"):
#         print(cols)
#         print(SH[cols].astype(str).apply(lambda x: len(str(x.split('.')[1]))).unique())
    SH[cols] = SH[cols].apply(lambda v: round(v, 3))


# Ignoring zeros, every skey must carry exactly one distinct open and one
# distinct prev_close value.
for priceCol in ("open", "prev_close"):
    nonZero = SH[SH[priceCol] != 0]
    assert (nonZero.groupby("skey")[priceCol].nunique() == 1).all()

# Fill prev_close with the per-skey maximum on rows at/after time 91500000000,
# and open with the per-skey maximum on rows that already have volume.
prevCloseMax = SH.groupby("skey")["prev_close"].transform("max")
SH["prev_close"] = np.where(SH["time"] >= 91500000000, prevCloseMax, SH["prev_close"])
openMax = SH.groupby("skey")["open"].transform("max")
SH["open"] = np.where(SH["cum_volume"] > 0, openMax, SH["open"])

# The same uniqueness invariant must still hold after the fill ...
for priceCol in ("open", "prev_close"):
    nonZero = SH[SH[priceCol] != 0]
    assert (nonZero.groupby("skey")[priceCol].nunique() == 1).all()
# ... and no row with traded volume may be left with a non-positive open.
assert SH[SH["cum_volume"] > 0]["open"].min() > 0


# check 1
# Compare, per stock, the snapshot row at which cum_volume reaches its daily
# maximum against the daily rows in ``db`` for the same date.  Daily rows
# that find no snapshot match are printed and appended to ``wr_ong``.
da_te = str(SH["date"].iloc[0])
da_te = da_te[:4] + '-' + da_te[4:6] + '-' + da_te[6:8]
# .copy(): db1 is modified below, and assigning into a filtered slice of
# ``db`` is what produced the SettingWithCopyWarning in this cell's output.
db1 = db[db["date"] == da_te].copy()
# Drop the two-character prefix of ID and apply the same +1000000 offset as skey.
db1["ID"] = db1["ID"].str[2:].astype(int) + 1000000
# 'YYYY-MM-DD' -> integer YYYYMMDD.
db1["date"] = (db1["date"].str[:4] + db1["date"].str[5:7] + db1["date"].str[8:]).astype(int)
# "max" as a string (as elsewhere in this cell) instead of the builtin ``max``.
SH["cum_max"] = SH.groupby("skey")["cum_volume"].transform("max")
# Rows at the daily volume maximum; computed once and reused for s2 and dd.
atMax = SH[SH["cum_volume"] == SH["cum_max"]]
s2 = atMax.groupby("skey").first().reset_index()
dd = atMax.groupby("skey")["time"].first().reset_index()
del atMax
SH.drop("cum_max", axis=1, inplace=True)
s2 = s2.rename(columns={"skey": "ID", 'open':"d_open", "prev_close":"d_yclose","high":"d_high", "low":"d_low", "close":"d_close", "cum_volume":"d_volume", "cum_amount":"d_amount"})
if SH["date"].iloc[0] < 20180820:
    s2["auction"] = 0
else:
    # auction = 1 when the maximum volume is first reached after 14:57:00.
    dd["auction"] = np.where(dd["time"]<=145700000000, 0, 1)
    dd = dd.rename(columns={"skey": "ID"})
    s2 = pd.merge(s2, dd[["ID", "auction"]], on="ID")
s2 = s2[["ID", "date", "d_open", "d_yclose", "d_high", "d_low", "d_close", "d_volume", "d_amount", "auction"]]
re = pd.merge(db1, s2, on=["ID", "date", "d_open", "d_yclose","d_high", "d_low", "d_volume"], how="outer")
# Explicit check instead of ``try: assert ... except:``: an assert is removed
# under ``python -O`` and the bare except also caught unrelated errors.
unmatched = re[re["d_amount_y"].isnull()]
if len(unmatched) != 0:
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print(unmatched)
    wr_ong += [unmatched]

# check 2
# first part
# Build a one-second grid for an arbitrary day and bucket it: ``time`` is
# HHMMSS * 1000 and ``group`` drops the last seconds digit, i.e. ten-second
# buckets.  SH["time"] // 10000000 lands on the same bucket scale.
secondGrid = pd.date_range(start='2019-06-10 08:30:00', end='2019-06-10 18:00:00', freq='s')
date = pd.DataFrame(secondGrid, columns=["Orig"])
date["time"] = date["Orig"].apply(lambda ts: int(ts.strftime("%H%M%S")) * 1000)
date["group"] = date["time"] // 10000
SH["group"] = SH["time"] // 10000000
# Buckets expected during 09:30-11:30 and 13:00-15:00.
inMorning = (date["time"] >= 93000000) & (date["time"] < 113000000)
inAfternoon = (date["time"] >= 130000000) & (date["time"] < 150000000)
gl = date[inMorning | inAfternoon]["group"].unique()
# Expected buckets in which no snapshot of any stock falls.
l = set(gl) - set(SH["group"].unique())
SH["has_missing1"] = 0
if not l:
    print("no massive missing")
    SH.drop(["group"], axis=1, inplace=True)
else:
    print("massive missing")
    print(l)
    SH["order"] = SH.groupby(["skey", "time"]).cumcount()
    for i in l:
        # For each skey, flag the first row (earliest time, order 0) that
        # comes after the empty bucket.
        afterGap = SH[SH["group"] > i]
        SH["t"] = afterGap.groupby("skey")["time"].transform("min")
        isFirstAfter = (SH["time"] == SH["t"]) & (SH["order"] == 0)
        SH["has_missing1"] = np.where(isFirstAfter, 1, SH["has_missing1"])
    SH.drop(["order", "t", "group"], axis=1, inplace=True)




# second part
# Flag rows that follow a gap of more than 60 seconds within a stock AND
# whose jump in cum_trades_cnt exceeds that stock's 99th-percentile jump.

# Per-skey difference to the previous row.  groupby.diff() keeps SH's index;
# the former ``groupby(...).apply(lambda x: x - x.shift(1))`` comes back keyed
# by group on pandas >= 2 and then no longer aligns when assigned.
SH["time_interval"] = SH.groupby("skey")["datetime"].diff().dt.seconds
SH["tn_update"] = SH.groupby("skey")["cum_trades_cnt"].diff()

# time1/time2/time3: per skey, the earliest time at or after 09:30:00,
# 13:00:00 and 15:00:00 at which tn_update is non-zero.  The left merges
# keep SH's rows and only add the new column.
for timeCol, threshold in (("time1", 93000000000), ("time2", 130000000000), ("time3", 150000000000)):
    firstUpdate = SH[(SH["time"] >= threshold) & (SH["tn_update"] != 0)].groupby("skey")["time"].min().reset_index()
    firstUpdate = firstUpdate.rename(columns={"time": timeCol})
    SH = pd.merge(SH, firstUpdate, on="skey", how="left")
    del firstUpdate

# Per-skey 99th percentile of the non-zero trade-count jumps between 09:30
# and 14:57, excluding the row at time2.
inSession = (SH["time"] > 93000000000) & (SH["time"] < 145700000000) & (SH["time"] != SH["time2"]) & (SH["tn_update"] != 0)
p99 = SH[inSession].groupby("skey")["tn_update"].apply(lambda x: x.quantile(0.99)).reset_index()
p99 = p99.rename(columns={"tn_update":"99%"})
SH = pd.merge(SH, p99, on="skey", how="left")

SH["has_missing2"] = np.where((SH["time_interval"] > 60) & (SH["tn_update"] > SH["99%"]) &
     (SH["time"] > SH["time1"]) & (SH["time"] != SH["time2"]) & (SH["time"] != SH["time3"]) & (SH["time"] != 100000000000), 1, 0)
SH.drop(["time_interval", "tn_update", "time1", "time2", "time3", "99%"], axis=1, inplace=True)

# A row is "missing" if either part of check 2 flagged it.
SH["has_missing"] = np.where((SH["has_missing1"] == 1) | (SH["has_missing2"] == 1), 1, 0)
SH.drop(["has_missing1", "has_missing2"], axis=1, inplace=True)
flagged = SH[SH["has_missing"] == 1]
if flagged.shape[0] != 0:
    print("has missing!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print(flagged.shape[0])
    mi_ss += [flagged]
del flagged



SH["has_missing"] = SH["has_missing"].astype('int32')

# Final column order of the frame.
headCols = ["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "has_missing",
            "cum_trades_cnt", "cum_volume", "cum_amount", "prev_close",
            "open", "high", "low", "close"]
# For prices, quantities and order counts: bid levels 10..1, then ask levels 1..10.
bookCols = []
for kind in "pqn":
    bookCols += ['bid%d%s' % (k, kind) for k in range(10, 0, -1)]
    bookCols += ['ask%d%s' % (k, kind) for k in range(1, 11)]
# bid1Top1q..bid1Top50q followed by ask1Top1q..ask1Top50q.
topCols = ['%s1Top%dq' % (side, k) for side in ('bid', 'ask') for k in range(1, 51)]
tailCols = ["total_bid_quantity", "total_ask_quantity", "total_bid_vwap", "total_ask_vwap",
            "total_bid_orders", 'total_ask_orders', 'total_bid_levels', 'total_ask_levels',
            'bid_trade_max_duration', 'ask_trade_max_duration',
            'cum_canceled_buy_orders', 'cum_canceled_buy_volume', "cum_canceled_buy_amount",
            "cum_canceled_sell_orders", 'cum_canceled_sell_volume', "cum_canceled_sell_amount"]
SH = SH[headCols + bookCols + topCols + tailCols]

# ``display`` is provided by the notebook environment.
display(SH["date"].iloc[0])
print("SH finished")

# database_name = 'com_md_eq_cn'
# user = "zhenyuy"
# password = os.environ["MD_DB_PASSWORD"]  # credential removed from source; supply it via the environment

# db1 = DB("192.168.10.178", database_name, user, password)
# db1.write('md_snapshot_l2', SH)

# del SH




0:03:04.425980


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


no massive missing


20170628

SH finished


In [2]:
# Write the processed snapshot frame SH to the 'md_snapshot_l2' table.
#
# NOTE(review): the database password used to be hard-coded on this line.
# It is now read from the MD_DB_PASSWORD environment variable, with an
# interactive prompt as fallback, so it no longer lives in the notebook.
# The previously committed password should be treated as leaked and rotated.
import getpass
import os

database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = os.environ.get("MD_DB_PASSWORD") or getpass.getpass("MongoDB password for %s: " % user)

db1 = DB("192.168.10.178", database_name, user, password)
db1.write('md_snapshot_l2', SH)