In [1]:
import os
import glob
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Show up to 100 rows / 100 columns when a DataFrame is rendered.
# The fully qualified option keys are used on purpose: the short patterns
# 'max_rows' / 'max_columns' also match 'styler.render.max_rows' /
# 'styler.render.max_columns' on recent pandas and make set_option raise
# OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# Percentile levels (1% .. 99%); presumably meant for describe(percentiles=perc)
# -- not referenced in the cells visible here.
perc = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]

In [2]:
# Inclusive date window (YYYYMMDD strings) of speedCompare logs to load.
startDate = '20201027'
endDate = '20201028'


readPath = r'\\192.168.10.30\Kevin_zhenyu\orderLog\equityTradeLogs'
dataPathLs = np.array(glob.glob(os.path.join(readPath, 'speedCompare***.csv')))
if len(dataPathLs) == 0:
    # Fail early with a readable message instead of an opaque error further down.
    raise FileNotFoundError('no speedCompare csv files found under %s' % readPath)
# The date is the token between the first '_' and the following '.' of the file name.
dateLs = np.array([os.path.basename(i).split('_')[1].split('.')[0] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
if len(dataPathLs) == 0:
    raise FileNotFoundError('no speedCompare csv files dated %s..%s under %s' % (startDate, endDate, readPath))
rawOrderLog = []
# Iterate over the filtered paths only: dateLs is NOT filtered, so zipping the
# two arrays would pair each file with the wrong date.
for thisPath in dataPathLs:
    data = pd.read_csv(thisPath)
    data = data.rename(columns={'mdClockAtArrival': 'caamd'})
    rawOrderLog += [data]
rawOrderLog = pd.concat(rawOrderLog, sort=False)

# Force the key / quantity columns to int64 so grouping and sorting are exact.
for col in ['clockAtArrival', 'caamd', 'secid', 'updateType', 'vai', 'absFilledThisUpdate', 'orderDirection', 'absOrderSize',
            'absOrderSizeCumFilled', 'date', 'accCode', 'mse']:
    rawOrderLog[col] = rawOrderLog[col].astype('int64')
rawOrderLog = rawOrderLog.sort_values(by=['date', 'secid', 'vai', 'accCode', 'clockAtArrival']).reset_index(drop=True)

# .copy() so the column assignments below write to an independent frame rather
# than to a filtered slice (avoids SettingWithCopyWarning). The index keeps the
# gaps left by the filter, exactly as before.
rawOrderLog = rawOrderLog[rawOrderLog["secid"] >= 1000000].copy()

# clockAtArrival is divided by 1e6 before fromtimestamp, i.e. it is treated as
# microseconds since the epoch; fromtimestamp yields naive local-time datetimes.
rawOrderLog['clock'] = rawOrderLog['clockAtArrival'].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
# broker = accCode // 100, except for 6-digit account codes where it is accCode // 10000.
rawOrderLog['broker'] = rawOrderLog['accCode'] // 100
rawOrderLog["broker"] = np.where(rawOrderLog["accCode"].astype(str).str.len() == 6, rawOrderLog['accCode'] // 10000, rawOrderLog["broker"])
rawOrderLog['colo_broker'] = rawOrderLog['colo'].str[:2] + '_' + rawOrderLog['broker'].astype('str')
# Integer ids: 'order' numbers each (date, accCode, secid, vai) combination,
# 'group' each (date, secid, vai) combination. ngroup() is the public
# equivalent of the private grouper.group_info[0] codes used previously.
rawOrderLog['order'] = rawOrderLog.groupby(['date', 'accCode', 'secid', 'vai']).ngroup()
rawOrderLog['group'] = rawOrderLog.groupby(['date', 'secid', 'vai']).ngroup()
# duration = time elapsed since the first row of the same order (rows are time-sorted above).
rawOrderLog['startClock'] = rawOrderLog.groupby(['order'])['clockAtArrival'].transform('first')
rawOrderLog['duration'] = rawOrderLog['clockAtArrival'] - rawOrderLog['startClock']
rawOrderLog['orderPrice'] = rawOrderLog['orderPrice'].apply(lambda x: round(x, 2))
rawOrderLog['tradePrice'] = rawOrderLog['tradePrice'].apply(lambda x: round(x, 2))
# orderDirection1 folds the +/-2 codes onto +/-1; other values pass through unchanged.
rawOrderLog['orderDirection1'] = np.where(rawOrderLog["orderDirection"] == -2, -1, np.where(
    rawOrderLog["orderDirection"] == 2, 1, rawOrderLog["orderDirection"]))
# Work on a copy so rawOrderLog stays available for re-running the checks below.
orderLog = rawOrderLog.copy()

### Assertion 1: every (date, secid, vai) group must carry a single order direction
print('=======================================================================================')
print('1. same date, secid, vai: same direction')
# directNum = number of distinct normalised directions seen in the row's group.
orderLog['directNum'] = orderLog.groupby(['date', 'secid', 'vai'])['orderDirection1'].transform('nunique')
mixedDirection = orderLog['directNum'] != 1
if mixedDirection.any():
    print('opposite direction for same date, same secid, same vai')
    # Show only the insertion rows (updateType 0) of the offending groups.
    offenders = orderLog[mixedDirection & (orderLog['updateType'] == 0)]
    display(offenders[['date', 'accCode', 'secid', 'vai', 'orderDirection', 'order']])
    # Drop every row that belongs to a mixed-direction group.
    orderLog = orderLog[orderLog['directNum'] == 1]

directionsPerGroup = orderLog.groupby(['date', 'secid', 'vai'])['orderDirection1'].nunique()
assert (directionsPerGroup == 1).all()

## Assertion 2:  make sure each account, secid, vai only has one insertion
print('=======================================================================================')
print('2. same date, secid, vai, accCode: one insertion')
# Number of insertion rows (updateType 0) per order.
a = orderLog[orderLog['updateType'] == 0].groupby(['date', 'accCode', 'secid', 'vai', 'order'])['clockAtArrival'].count()
if len(a[a > 1]) > 0:
    print('more than one insertion at same time')
    a = a[a>1].reset_index()
    display(a)
    # Remove every order that was inserted more than once.
    orderLog = orderLog[~(orderLog['order'].isin(a['order'].unique()))]

# orderLog may be a filtered slice at this point; copy before adding columns
# so the assignments below do not raise SettingWithCopyWarning.
orderLog = orderLog.copy()
# isMsg: on the insertion row 1 when mse == 100 else 0, NaN on every other row,
# then forward-filled within each order.
orderLog['isMsg'] = np.where(orderLog['updateType'] == 0,
                             np.where(orderLog['mse'] == 100, 1, 0), np.nan)
orderLog['isMsg'] = orderLog.groupby(['order'])['isMsg'].ffill()

# Insertions with secid >= 2000000 (labelled 'SZE' by the print below).
placeSZE = orderLog[(orderLog['secid'] >= 2000000) & (orderLog['updateType'] == 0)]
if placeSZE.shape[0] > 0:
    print('%.2f%% SZE orders triggered by msg data'%(placeSZE[placeSZE['isMsg'] == 1].shape[0]/placeSZE.shape[0]*100))
else:
    # Previously this case died with ZeroDivisionError.
    print('no SZE order insertions found, msg-triggered share not computed')


### Assertion 3:  check IPO stocks selling status
print('=======================================================================================')
print('3. IPO stocks selling (ars = 301, 302)')
# Rows tagged ars 301 / 302 (labelled "IPO stocks" by the heading above).
kk = orderLog[orderLog['ars'].isin([301, 302])]
if kk.shape[0] != 0:
    print(kk)

    # (a) No buy-side rows. Explicit conditions replace the former
    # try/assert/bare-except, which vanished under `python -O` and swallowed
    # unrelated errors.
    ipoBuys = kk[kk['orderDirection1'] == 1]
    if ipoBuys.shape[0] == 0:
        print('we only sell, never buy')
    else:
        print('There are IPO buy side orders!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        print(ipoBuys)

    # (b) Consecutive insertions of the same (accCode, secid) must be at least
    # 10e6 clock units (10 s per the messages below) apart. The first insertion
    # of each pair has no predecessor: its diff stays NaN, which compares False
    # against the threshold. (Filling it with 0 flagged every first insertion.)
    kk1 = kk[kk['updateType'] == 0]
    kk1 = kk1.sort_values(by=['accCode', 'secid','clockAtArrival']).copy()
    kk1['diff'] = kk1.groupby(['accCode', 'secid'])['clockAtArrival'].diff()
    quickReinsert = kk1[kk1['diff'] < 10e6]
    if quickReinsert.shape[0] == 0:
        print('for each stock in the same account, there is no insertion within 10 seconds of the previous insertion')
    else:
        print('There are insertion within 10 seconds for orders under same account same stock!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        print(quickReinsert)

    # (c) Rows with updateType 1 (cancellations per the messages below) must
    # come at least 3e6 clock units after the order's first row.
    kk2 = kk[(kk['updateType'] == 1)]
    quickCancel = kk2[kk2['duration'] < 3e6]
    if quickCancel.shape[0] == 0:
        print('for each stock in the same account, the cancellation of an order happens more than 3 seconds after the insertion')
    else:
        print('There are cancellation within 3 seconds for orders under same account same stock!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        print(quickCancel)


### Assertion 4: updateType == 7 orders must stay below 20 per account and below 100 overall
print('=======================================================================================')
print('4. updateType 7 orders')
type7Rows = orderLog[orderLog['updateType'] == 7]
if type7Rows.shape[0] != 0:
    # Distinct orders per account that ever showed updateType 7.
    type7PerAccount = type7Rows.groupby('accCode')['order'].nunique()
    assert type7PerAccount.max() < 20
    assert type7PerAccount.sum() < 100

### Assertion 5: check updateType == 6 orders, make sure updateType == 6 orders < 5% per account
print('=======================================================================================')
print('5. updateType 6 orders')
# k1: distinct orders per account that contain an updateType 6 row;
# k2: distinct orders per account overall.
k1 = orderLog[orderLog['updateType'] == 6].groupby('accCode')['order'].nunique().reset_index()
k2 = orderLog.groupby('accCode')['order'].nunique().reset_index()
k = pd.merge(k1, k2, on='accCode', how='left')
# After the merge order_x is the updateType 6 count, order_y the total count.
k['prob'] = k['order_x']/k['order_y']
# Explicit condition instead of assert + bare except: the check no longer
# disappears under `python -O` and no longer hides unrelated exceptions.
overLimit = k[k['prob'] >= 0.05]
if overLimit.shape[0] > 0:
    print('There are accounts with more than 5% updateType 6 orders!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    print(overLimit)

### Assertion 6: check CYB orders, make sure CYB stocks total absOrderSize < 30w
print('=======================================================================================')
print('6. CYB stocks total order size < 30w')
# Sizes of the insertion rows (updateType 0) with secid >= 2300000.
# NOTE(review): the heading says "total", but the check has always been applied
# to the largest single insertion (max), not to a sum -- kept as coded; confirm.
cybSizes = orderLog[(orderLog['secid'] >= 2300000) & (orderLog['updateType'] == 0)]['absOrderSize']
# `.any()` is False for an empty selection. The former `max() <= 300000` assert
# evaluated NaN <= 300000 -> False there and printed a false alarm; it also
# relied on a bare except.
if (cybSizes > 300000).any():
    print('CYB stocks total absOrderSize >= 30w!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    
     
### Assertion 7: flag orders whose updateType sequence is not an expected one
print('=======================================================================================')
print('7. unexpected updateType')
def getTuple(x):
    """Return the items of iterable ``x``, in iteration order, as a tuple."""
    return tuple(x)

# A type-4 row is ignored when the next row of the same order is also type 4,
# so a run of consecutive 4s collapses to its last row before sequences are compared.
nextUpdateType = orderLog.groupby(['order'])['updateType'].shift(-1)
repeatedType4 = (orderLog['updateType'] == 4) & (nextUpdateType == 4)
checkLog = orderLog[~repeatedType4]
# One row per order holding its updateType sequence as a tuple.
checkLog = checkLog.groupby(['order'])['updateType'].apply(getTuple).reset_index()

# Recognised sequences per status code. Later checks treat status 0 as
# "all traded" and status 1 as "partial traded"; status 2 is presumably
# "nothing traded" -- TODO confirm. Anything else becomes status 3.
statusZeroSeqs = [(0, 2, 4), (0, 2, 1, 4), (0, 2, 1, 2, 4), (0, 2, 4, 1, 4), (0, 4), (0, 1, 4), (0, 4, 1, 4),
                  (0, 2, 2, 4), (0, 4, 2, 4), (0, 2, 2, 1, 4), (0, 2, 2, 4, 1, 4)]
statusOneSeqs = [(0, 2, 4, 1, 3), (0, 2, 4, 1, 4, 3), (0, 2, 1, 4, 3), (0, 4, 1, 3), (0, 1, 4, 3),
                 (0, 2, 2, 4, 1, 3), (0, 2, 2, 4, 1, 4, 3), (0, 2, 2, 1, 4, 3), (0, 4, 2, 4, 1, 3),
                 (0, 4, 2, 1, 3), (0, 4, 1, 4, 3), (0, 4, 1)]
statusTwoSeqs = [(0, 2, 1, 3), (0, 2, 2, 1, 3), (0, 2, 3), (0, 3), (0, 1, 3), (0, ), (0, 2), (0, 2, 1), (0, 2, 2)]

updateSeq = checkLog['updateType']
checkLog['status'] = np.select(
    [updateSeq.isin(statusZeroSeqs), updateSeq.isin(statusOneSeqs), updateSeq.isin(statusTwoSeqs)],
    [0, 1, 2],
    default=3)

# Attach the status to every row and keep only orders with a recognised sequence.
orderLog = pd.merge(orderLog, checkLog[['order', 'status']], how='left', on=['order'], validate='many_to_one')
orderLog = orderLog[orderLog['status'].isin([0, 1, 2])].reset_index(drop=True)

### Assertion 8: status == 0 orders must be completely filled
print('=======================================================================================')
print('8. status == 0: all traded')
# Per order: largest cumulative fill and largest order size seen on any row.
a = (orderLog[orderLog['status'] == 0]
     .groupby(['order'])[['absOrderSizeCumFilled', 'absOrderSize']]
     .max()
     .reset_index())
a.columns = ['order', 'filled', 'total']
print('in total trade, any fill != total cases')
notFullyFilled = a[a['filled'] != a['total']]
display(notFullyFilled)
if notFullyFilled.shape[0] > 0:
    # Drop the inconsistent orders from the log.
    removeOrderLs = notFullyFilled['order'].unique()
    orderLog = orderLog[~orderLog['order'].isin(removeOrderLs)]
    
### Assertion 9: status == 1 orders must be filled strictly between 0 and their full size
print('=======================================================================================')
print('9. status == 1: partial traded')
# Per order: largest cumulative fill and largest order size seen on any row.
a = (orderLog[orderLog['status'] == 1]
     .groupby(['order'])[['absOrderSizeCumFilled', 'absOrderSize']]
     .max()
     .reset_index())
a.columns = ['order', 'filled', 'total']
print('in partial trade, any fill >= total or fill is 0 cases for updateType 4')
notPartial = a[(a['filled'] >= a['total']) | (a['filled'] == 0)]
display(notPartial)
if notPartial.shape[0] > 0:
    # Drop the inconsistent orders from the log.
    removeOrderLs = notPartial['order'].unique()
    orderLog = orderLog[~orderLog['order'].isin(removeOrderLs)]
    
### Assertion 10: no updateType 1 (cancellation) row within 1e6 clock units of the order start
print('=======================================================================================')
print('10. no cancellation within 1 sec')
isCancelRow = orderLog['updateType'] == 1
withinOneSec = orderLog['duration'] < 1e6
a = orderLog[isCancelRow & withinOneSec]
print('any cancellation within 1 sec')
display(a)
if len(a) > 0:
    # Drop every order that has such an early cancellation row.
    removeOrderLs = a['order'].unique()
    orderLog = orderLog[~orderLog['order'].isin(removeOrderLs)]


### Assertion 11: report and drop orders with more than 800000 shares or more than 8000000 notional
print('=======================================================================================')
print('11. Orders with size > 80w or notional > 800w')
orderLog['orderNtl'] = orderLog['absOrderSize'] * orderLog['orderPrice']

# Rows whose order size exceeds the share limit.
oversizedRows = orderLog[orderLog['absOrderSize'] > 800000]
if oversizedRows.shape[0] > 0:
    print('some order quantity are > 80w')
    print(oversizedRows.groupby(['colo', 'accCode'])['order'].nunique())
    display(oversizedRows[['date', 'accCode', 'secid', 'vai', 'absOrderSize', 'orderPrice',
                           'orderNtl', 'orderDirection', 'clock', 'order']])

# Rows whose order notional exceeds the notional limit.
overNotionalRows = orderLog[orderLog['orderNtl'] > 8000000]
if overNotionalRows.shape[0] > 0:
    print('some order ntl are > 800w')
    print(overNotionalRows.groupby(['colo', 'accCode'])['order'].nunique())
    display(overNotionalRows[['date', 'accCode', 'secid', 'vai', 'absOrderSize', 'orderPrice',
                              'orderNtl', 'orderDirection', 'clock', 'order', "updateType",
                              "tradePrice", "absOrderSizeCumFilled", "absFilledThisUpdate"]])

# Remove every order that breached either limit.
removeOrderLs = list(set(oversizedRows['order'].unique()) | set(overNotionalRows['order'].unique()))
orderLog = orderLog[~orderLog['order'].isin(removeOrderLs)]


# Restore the canonical row order and a clean 0..n-1 index after all the filtering.
sortKeys = ['date', 'secid', 'vai', 'accCode', 'clockAtArrival']
orderLog = orderLog.sort_values(by=sortKeys).reset_index(drop=True)

# secid >= 2000000 is labelled 'SZE', everything else 'SSE'.
orderLog['exchange'] = np.where(orderLog['secid'] >= 2000000, 'SZE', 'SSE')
orderLog['orderNtl'] = orderLog['orderPrice'] * orderLog['absOrderSize']
# Traded notional is only non-zero on updateType 4 rows.
isType4 = orderLog['updateType'] == 4
orderLog['tradeNtl'] = np.where(isType4, orderLog['tradePrice']*orderLog['absFilledThisUpdate'], 0)
# Broadcast each order's first non-null value of these fields to all of its rows.
for orderField in ['mrstaat', 'ars', 'mrstauc']:
    orderLog[orderField] = orderLog.groupby(['order'])[orderField].transform('first')
# Label derived from mrstaat; any value other than the three listed maps to 'stathree23'.
orderLog['sta'] = np.select(
    [orderLog['mrstaat'] == 1000, orderLog['mrstaat'] == 3000, orderLog['mrstaat'] == 11000],
    ['staone', 'statwo', 'stathree13'],
    default='stathree23')
orderLog

  interactivity=interactivity, compiler=compiler, result=result)


1. same date, secid, vai: same direction
opposite direction for same date, same secid, same vai


Unnamed: 0,date,accCode,secid,vai,orderDirection,order
14105,20201027,6282,1600132,1464227,-1,67699
14110,20201027,9441,1600132,1464227,1,111850
204653,20201027,8854,1603618,6158800,1,90722
204657,20201027,528401,1603618,6158800,-1,177724
204660,20201027,968501,1603618,6158800,-1,194512
547951,20201027,5456,2300132,498700,-1,61470
547955,20201027,5456,2300132,498700,1,61470
551406,20201027,9208,2300143,3233402,-1,110325
551408,20201027,9758,2300143,3233402,1,148323
607326,20201027,5289,2300357,1615190,-1,49731


2. same date, secid, vai, accCode: one insertion
more than one insertion at same time


Unnamed: 0,date,accCode,secid,vai,order,clockAtArrival
0,20201027,5222,2300551,2420000,3776,2
1,20201027,5456,2002833,22300,61255,2
2,20201027,5456,2002912,21180,61311,3
3,20201027,5456,2002947,282802,61359,2
4,20201027,5456,2300570,1042740,61810,2
5,20201027,5456,2300709,102800,61885,2
6,20201027,5456,2300788,19643,61932,2
7,20201027,6282,1600132,980927,67687,2
8,20201027,6282,1600132,981927,67691,2
9,20201027,6282,1601799,681919,67955,2


99.75% SZE orders triggered by msg data
3. IPO stocks selling (ars = 301, 302)
4. updateType 7 orders
5. updateType 6 orders
There are accounts with more than 5% updateType 6 orders!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   accCode  order_x  order_y      prob
1     6282      268     3057  0.087668
6. CYB stocks total order size < 30w
7. unexpected updateType
8. status == 0: all traded
in total trade, any fill != total cases


Unnamed: 0,order,filled,total
12828,18530,5800,6600
12829,18531,7000,12400
19425,27781,700,1300
23280,33126,2200,3300
23439,33366,200,300
36113,49487,300,4900
37401,51280,300,1300
40174,55256,200,2400
41125,56764,8000,8700
69491,96507,100,300


9. status == 1: partial traded
in partial trade, any fill >= total or fill is 0 cases for updateType 4


Unnamed: 0,order,filled,total


10. no cancellation within 1 sec
any cancellation within 1 sec


Unnamed: 0.1,Unnamed: 0,date,clockAtArrival,caamd,colo,accCode,secid,vai,sdd,orderDirection,absOrderSize,updateType,tradePrice,absFilledThisUpdate,absOrderSizeCumFilled,tradeId,orderSysId,aaa,internalId,ars,mrstaat,mrstauc,mse,mta,session,cfe,mrss90,mra100,orderId,underlyingIndex,mt,mrsb90,mrm,mrsb300,orderPrice,l4tr,hee,mrb100,cancellationPending,mrss300,mfe,threadId,ms,finalState,sequenceNo,ApplSeqNum,clock,mrstaum,broker,colo_broker,order,group,startClock,duration,orderDirection1,directNum,isMsg,status


11. Orders with size > 80w or notional > 800w


Unnamed: 0.1,Unnamed: 0,date,clockAtArrival,caamd,colo,accCode,secid,vai,sdd,orderDirection,absOrderSize,updateType,tradePrice,absFilledThisUpdate,absOrderSizeCumFilled,tradeId,orderSysId,aaa,internalId,ars,mrstaat,mrstauc,mse,mta,session,cfe,mrss90,mra100,orderId,underlyingIndex,mt,mrsb90,mrm,mrsb300,orderPrice,l4tr,hee,mrb100,cancellationPending,mrss300,mfe,threadId,ms,finalState,sequenceNo,ApplSeqNum,clock,mrstaum,broker,colo_broker,order,group,startClock,duration,orderDirection1,directNum,isMsg,status,orderNtl,exchange,tradeNtl,sta
0,0,20201027,1603763823831461,1603763823831354,zt_88_03,8970,1600000,9654762,95657000.0,-2,400,0,-1.00,0,0,,,0.000145,827.0,1.0,3000.0,1000.0,100,-0.002638,0,1.335624e+07,0.000145,955.0,-1.000000e+00,300,2.0,-0.001082,-0.002638,-1.0,9.54,0.0,-0.000100,954.0,0.0,-1.0,35485692.03,19497.0,09:57:03.830838,0.0,46817603.0,613184.0,2020-10-27 09:57:03.831461,0.0,89,zt_89,103374,0,1603763823831461,0,-1,1,1.0,0,3816.0,SSE,0.0,statwo
1,1,20201027,1603763823952457,1603763823831354,zt_88_03,8970,1600000,9654762,-1.0,-2,400,2,-1.00,0,0,,5.10019e+07,-1.000000,827.0,1.0,3000.0,1000.0,0,-1.000000,0,1.335624e+07,0.000145,955.0,8.270000e+02,300,-1.0,-0.001082,-0.002638,-1.0,9.54,0.0,-1.000000,954.0,0.0,-1.0,35485692.03,19526.0,09:57:03.951838,0.0,46817603.0,0.0,2020-10-27 09:57:03.952457,0.0,89,zt_89,103374,0,1603763823831461,120996,-1,1,1.0,0,3816.0,SSE,0.0,statwo
2,2,20201027,1603763824157686,1603763824088890,zt_88_03,8970,1600000,9654762,-1.0,-2,400,4,9.54,400,400,3419189.0,,-1.000000,827.0,1.0,3000.0,1000.0,0,-1.000000,0,1.335624e+07,-1.000000,955.0,8.270000e+02,300,-1.0,-1.000000,-0.002638,-1.0,9.54,0.0,-1.000000,954.0,0.0,-1.0,35485692.03,19526.0,09:57:04.156838,1.0,46822587.0,0.0,2020-10-27 09:57:04.157686,0.0,89,zt_89,103374,0,1603763823831461,326225,-1,1,1.0,0,3816.0,SSE,3816.0,statwo
3,3,20201027,1603763992522858,1603763992522655,zs_88_04,8967,1600000,10412800,95950000.0,-2,700,0,-1.00,0,0,,,0.000097,1335.0,1.0,3000.0,1000.0,100,-0.002860,0,2.298950e+07,0.000097,955.0,-1.000000e+00,300,2.0,-0.001146,-0.002860,-1.0,9.54,0.0,-0.000090,954.0,0.0,-1.0,12013883.87,94783.0,09:59:52.522168,0.0,70375446.0,657169.0,2020-10-27 09:59:52.522858,0.0,89,zs_89,100080,1,1603763992522858,0,-1,1,1.0,0,6678.0,SSE,0.0,statwo
4,4,20201027,1603763992748831,1603763992522655,zs_88_04,8967,1600000,10412800,-1.0,-2,700,2,-1.00,0,0,,5.10125e+07,-1.000000,1335.0,1.0,3000.0,1000.0,0,-1.000000,0,2.298950e+07,0.000097,955.0,1.335000e+03,300,-1.0,-0.001146,-0.002860,-1.0,9.54,0.0,-1.000000,954.0,0.0,-1.0,12013883.87,94810.0,09:59:52.748168,0.0,70375446.0,0.0,2020-10-27 09:59:52.748831,0.0,89,zs_89,100080,1,1603763992522858,225973,-1,1,1.0,0,6678.0,SSE,0.0,statwo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1766232,985095,20201028,1603868046411116,1603868046407392,zs_66_01,6683,2300815,624417,-1.0,-1,100,4,107.16,100,100,1.02e+14,,-1.000000,3905.0,1.0,3000.0,1000.0,0,-1.000000,0,8.295196e+06,0.000910,10719.0,1.812810e+10,852,-1.0,-0.001072,0.000286,-1.0,107.16,0.0,-1.000000,10716.0,0.0,-1.0,-1.00,32007.0,14:54:06.409881,1.0,232547943.0,0.0,2020-10-28 14:54:06.411116,0.0,66,zs_66,307185,226549,1603868046407445,3671,-1,1,1.0,0,10716.0,SZE,10716.0,statwo
1766233,985096,20201028,1603868044507329,1603868044507276,zs_88_04,8865,2300815,624417,145404660.0,-1,100,0,-1.00,0,0,,,0.000910,2033.0,1.0,3000.0,1000.0,100,0.000286,0,1.790473e+06,0.000910,10719.0,-1.000000e+00,852,0.0,-0.001072,0.000286,-1.0,107.16,0.0,0.000303,10716.0,0.0,-1.0,-1.00,162740.0,14:54:04.507130,0.0,292190276.0,21690787.0,2020-10-28 14:54:04.507329,0.0,88,zs_88,311908,226549,1603868044507329,0,-1,1,1.0,2,10716.0,SZE,0.0,statwo
1766234,985097,20201028,1603868044513501,1603868044511203,zs_88_04,8865,2300815,624417,-1.0,-1,100,2,-1.00,0,0,,AE22007117,-1.000000,2033.0,1.0,3000.0,1000.0,0,-1.000000,0,1.790473e+06,-0.000411,10716.0,2.200712e+07,852,-1.0,-0.001147,0.000641,-1.0,107.16,0.0,-1.000000,10693.0,0.0,-1.0,-1.00,162760.0,14:54:04.513130,0.0,292190376.0,0.0,2020-10-28 14:54:04.513501,0.0,88,zs_88,311908,226549,1603868044507329,6172,-1,1,1.0,2,10716.0,SZE,0.0,statwo
1766235,985098,20201028,1603868046260165,1603868046260114,zs_88_04,8865,2300815,624417,-1.0,-1,100,1,-1.00,0,0,,AE22007117,-1.000000,2033.0,1.0,3000.0,1000.0,100,-1.000000,0,1.790473e+06,-0.000363,10699.0,2.200712e+07,852,-1.0,0.000077,0.000719,-1.0,107.16,0.0,-1.000000,10693.0,1.0,-1.0,-1.00,162740.0,14:54:06.260130,0.0,292226522.0,0.0,2020-10-28 14:54:06.260165,0.0,88,zs_88,311908,226549,1603868044507329,1752836,-1,1,1.0,2,10716.0,SZE,0.0,statwo


In [3]:
# Distinct mrstauc values per (accCode, sta) among the SZE insertion rows of three accounts.
szeInsertions = ((orderLog['exchange'] == 'SZE')
                 & orderLog['accCode'].isin([5328, 5386, 5377])
                 & (orderLog['updateType'] == 0))
orderLog[szeInsertions].groupby(['accCode', 'sta'])['mrstauc'].unique()

accCode  sta   
5328     staone                       [0.0]
         statwo            [1000.0, 2000.0]
5377     staone                       [0.0]
         statwo    [1000.0, 2000.0, 3000.0]
5386     staone                       [0.0]
         statwo    [1000.0, 2000.0, 3000.0]
Name: mrstauc, dtype: object

In [4]:
# Keep rows with a known mrstauc. .copy() makes the result an independent frame
# so the column assignments below do not raise SettingWithCopyWarning.
orderLog = orderLog[~orderLog['mrstauc'].isnull()].copy()
# sta_uc = '<sta>_<mrstauc as integer>'.
orderLog['sta_uc'] = orderLog['sta'] + '_' + orderLog['mrstauc'].astype(int).astype(str)
# server = first three '_'-separated parts of colo glued together (zs_52_06 -> zs5206).
orderLog["server"] = orderLog["colo"].apply(lambda x: x.split("_")[0] + x.split("_")[1] + x.split("_")[2])
orderLog["server_account"] = orderLog["server"] + '_' + orderLog['accCode'].astype('str')
# Only updateType 0 / 3 / 4 rows that arrived within 1e6 clock units of the order start.
checkLog = orderLog[orderLog['updateType'].isin([0, 3, 4])].reset_index(drop=True)
checkLog = checkLog[checkLog['duration'] <= 1e6].copy()

# Wall-clock time of the row as an HHMMSS integer.
checkLog["time"] = checkLog["clock"].apply(lambda x: x.strftime("%H%M%S")).astype(int)
# ol = checkLog[(checkLog["updateType"] == 0) & (~checkLog["ars"].isin([121, 221, 321, 131, 231, 331])) & (checkLog["time"]>= 93000)
#              & (checkLog["time"] < 93300)]["order"].unique()

sta_list = orderLog['sta_uc'].unique()

# Column-wise accumulator for the pairwise comparison rows.
result = {}
for col in ['date', 'broker1', 'broker2', 'isFast']:
    result[col] = []
            
# For every sta_uc value, compare all pairs of accounts that traded the same
# (date, secid, vai) group and record which side got the better fill.
for i in sta_list:
    # .copy() after each filter: columns are added right afterwards, and
    # assigning onto a filtered slice raises SettingWithCopyWarning.
    checkLog1 = checkLog[checkLog['sta_uc'] == i].copy()
    checkLog1['accountNum'] = checkLog1.groupby(['date', 'secid', 'vai'])['server_account'].transform('nunique')
    # Only groups in which at least two different accounts took part.
    checkLog1 = checkLog1[checkLog1['accountNum'] >= 2].copy()
    checkLog1['cumFillSize'] = checkLog1.groupby(['order'])['absOrderSizeCumFilled'].transform('max')
    # status: 2 = nothing filled, 1 = partially filled, 0 = completely filled.
    checkLog1['status'] = np.where(checkLog1['cumFillSize'] == 0, 2,
                                    np.where(checkLog1['cumFillSize'] < checkLog1['absOrderSize'], 1, 0))
    # Collapse to one row per (group, order).
    checkLog1 = checkLog1.groupby(['group', 'order'])[['date', 'server_account', 'secid', 'vai', 'status']].first().reset_index()

    for exchg in ['SZE']:

        print(exchg)

        if exchg == 'SSE':
            exchangeLog = checkLog1[checkLog1['secid'] < 2000000]
        else:
            exchangeLog = checkLog1[checkLog1['secid'] >= 2000000]

        # Group by the plain column name: a one-element list key makes newer
        # pandas hand back tuple keys while iterating.
        for group, groupData in exchangeLog.groupby('group'):
            date = groupData['date'].values[0]
            brokerLs = groupData['server_account'].values
            if len(brokerLs) == 0:
                continue
            statusLs = groupData['status'].values
            # Visit every unordered pair (k, later) once and emit both directions.
            for k in range(len(brokerLs) - 1):
                broker1 = brokerLs[k]
                status1 = statusLs[k]
                for broker2, status2 in zip(brokerLs[k+1:], statusLs[k+1:]):
                    if broker1 != broker2:
                        result['date'] += [date, date]
                        result['broker1'] += [broker1, broker2]
                        result['broker2'] += [broker2, broker1]

                        # Lower status = better fill = counted as the faster side;
                        # ties are marked -1 on both rows.
                        if status1 < status2:
                            result['isFast'] += [1, 0]
                        elif status1 > status2:
                            result['isFast'] += [0, 1]
                        else:
                            result['isFast'] += [-1, -1]

# Turn the accumulated columns into a frame and drop the ties (isFast == -1).
# .copy() so the columns added below are written to an independent frame
# instead of a filtered slice (avoids SettingWithCopyWarning).
result = pd.DataFrame(result)
result = result[result['isFast'] != -1].copy()
# Per ordered (broker1, broker2) pair: number of decided comparisons, number
# won by broker1, and the resulting win ratio.
pairOutcome = result.groupby(['broker1', 'broker2'])['isFast']
result['count'] = pairOutcome.transform('count')
result['fasterSum'] = pairOutcome.transform('sum')
result['faster'] = result['fasterSum']/result['count']


# Wide summary: one row per ordered (broker1, broker2) pair, with a
# '<date> count' / '<date> faster' column pair per date plus the totals.
summary = pd.DataFrame()
summaryCols, countCols, fasterCols = [], [], []
pairKeys = ['broker1', 'broker2']
for date in result['date'].unique():
    countName = '%s count' % date
    fasterName = '%s faster' % date

    # Recompute count and win ratio on this date's rows only.
    a = result[result['date'] == date].reset_index(drop=True)
    dayOutcome = a.groupby(pairKeys)['isFast']
    a['count'] = dayOutcome.transform('count')
    a['fasterSum'] = dayOutcome.transform('sum')
    a['faster'] = a['fasterSum'] / a['count']
    a = a.groupby(pairKeys)[['count', 'faster']].mean().reset_index()
    a.columns = pairKeys + [countName, fasterName]

    summaryCols.extend([countName, fasterName])
    countCols.append(countName)
    fasterCols.append(fasterName)

    # The first date seeds the summary, later dates are outer-joined onto it.
    if summary.empty:
        summary = a.copy()
    else:
        summary = pd.merge(summary, a, how='outer', on=pairKeys, validate='one_to_one')

# Totals over the whole window come from the columns already present on `result`.
a = result.groupby(pairKeys)[['count', 'faster']].mean().reset_index()
a.columns = pairKeys + ['total count', 'total faster']
summary = pd.merge(summary, a, how='outer', on=pairKeys, validate='one_to_one')
summaryCols = ['total count', 'total faster'] + summaryCols


# Write the per-pair totals to a CSV named after the date window and the
# exchange value left in `exchg` by the comparison loop.
savePath = r'L:\orderLog\result\relativeSpeed'
a = summary.groupby(['broker1', 'broker2'])[['total count', 'total faster']].first().reset_index()
outFileName = 'relativeSpeedAccount_%s_%s_%s.csv' % (startDate, endDate, exchg)
a.to_csv(os.path.join(savePath, outFileName), index=False)




# The total columns get the same formatting as the per-date ones.
countCols.append('total count')
fasterCols.append('total faster')
# b1 / b2: characters 7..10 of the server_account string, as an integer, floor-divided by 100.
for shortName, accountCol in (('b1', 'broker1'), ('b2', 'broker2')):
    summary[shortName] = summary[accountCol].str[7:11].astype(int) // 100

# Missing counts become 0 and are stored as integers.
for col in countCols:
    summary[col] = summary[col].fillna(0).astype('int64')

# Ratios are rendered as whole percentages; a missing ratio is filled with -1,
# rendered as '-100%', and then blanked out.
for col in fasterCols:
    asPercent = summary[col].fillna(-1).apply(lambda x: '%.0f%%' % (x * 100))
    summary[col] = np.where(asPercent == '-100%', ' ', asPercent)

s1 = summary
display(s1[(s1["broker1"] == 'zs5209_5289') & (s1['b2'] == 52)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

SZE
SZE
SZE
SZE
SZE
SZE
SZE
SZE
SZE
SZE
SZE
SZE
SZE
SZE
SZE
SZE
SZE


Unnamed: 0,broker1,broker2,20201027 count,20201027 faster,20201028 count,20201028 faster,total count,total faster,b1,b2
191,zs5209_5289,zs5208_5229,256,73%,391,73%,647,73%,52,52
192,zs5209_5289,zs5208_5276,181,93%,505,91%,686,92%,52,52
195,zs5209_5289,zs5209_5291,112,62%,115,70%,227,66%,52,52
203,zs5209_5289,zt5207_523201,118,100%,293,100%,411,100%,52,52


In [10]:
# Totals of zs5206_5328 against the two zs5208 accounts.
pairRows = (s1['broker1'] == 'zs5206_5328') & s1['broker2'].isin(['zs5208_5377', 'zs5208_5386'])
s1.loc[pairRows, ['broker1', 'broker2', 'total count', 'total faster']]

Unnamed: 0,broker1,broker2,total count,total faster
68,zs5206_5328,zs5208_5377,31,97%
69,zs5206_5328,zs5208_5386,75,100%


In [33]:
# Totals of zs5206_5328 against every counterpart.
s1.loc[s1['broker1'] == 'zs5206_5328', ['broker1', 'broker2', 'total count', 'total faster']]

Unnamed: 0,broker1,broker2,total count,total faster
64,zs5206_5328,zs5208_5229,15,7%
65,zs5206_5328,zs5208_5276,14,0%
66,zs5206_5328,zs5208_5377,6,83%
67,zs5206_5328,zs5208_5386,11,100%
68,zs5206_5328,zs5209_5289,19,0%
69,zs5206_5328,zs5209_5291,13,0%
70,zs5206_5328,zs6601_6678,6,0%
71,zs5206_5328,zs8804_8924,6,100%
72,zs5206_5328,zs8804_896702,7,100%
73,zs5206_5328,zs9403_9448,1,0%


In [37]:
# Totals of zs5208_5386 against every counterpart.
s1.loc[s1['broker1'] == 'zs5208_5386', ['broker1', 'broker2', 'total count', 'total faster']]

Unnamed: 0,broker1,broker2,total count,total faster
132,zs5208_5386,zs5206_5328,11,0%
133,zs5208_5386,zs5208_5229,170,0%
134,zs5208_5386,zs5208_5276,76,0%
135,zs5208_5386,zs5208_5377,30,47%
136,zs5208_5386,zs5209_5289,123,0%
137,zs5208_5386,zs5209_5291,57,2%
138,zs5208_5386,zs6601_6678,28,0%
139,zs5208_5386,zs8804_8924,63,35%
140,zs5208_5386,zs8804_896702,64,33%
141,zs5208_5386,zs9403_9448,4,0%


In [6]:
# Load the SZspeed_<date>.pkl files whose date falls inside the inclusive window.
startDate, endDate = '20201027', '20201028'
readPath = r'L:\orderLog\result\marketPos'
dataPathLs = np.array(glob.glob(os.path.join(readPath, 'SZspeed_***.pkl')))
# The date is the token after the first '_' of the file name without its extension.
dateLs = np.array([os.path.basename(i).split('.')[0].split('_')[1] for i in dataPathLs])
inWindow = (dateLs >= startDate) & (dateLs <= endDate)
dataPathLs = dataPathLs[inWindow]
checkData = []
for path in dataPathLs:
    data = pd.read_pickle(path)
    checkData.append(data)
checkLog = pd.concat(checkData).reset_index(drop=True)
# Show which dates were actually loaded.
checkLog['date'].unique()

array([20201027, 20201028], dtype=int64)

In [7]:
# Rows that agree on all of these columns are ambiguous; keep=False removes
# every member of such a set rather than keeping one of them.
dedupKeys = ['date', 'secid', 'Price', 'OrderQty', 'Side', 'statusLs', 'TradePriceLs', 'TradeQtyLs', 'ApplSeqNum']
checkLog = checkLog.drop_duplicates(subset=dedupKeys, keep=False)
# Keep only rows that carry an account code.
checkLog = checkLog[checkLog['accCode'].notnull()]

In [8]:
# NOTE(review): `cc1` is first assigned in the following cell (In [9]), so on a
# fresh kernel run top-to-bottom this line raises NameError -- which is exactly
# what the recorded output of this cell shows. Run it after In [9] or drop it.
cc1['sta'].unique()

NameError: name 'cc1' is not defined

In [9]:
from IPython.display import display, HTML


def _timeDiffPercentiles(log):
    """Return count and 10/25/50/75/90% of ``time_diff`` per (sta, colo_broker, colo).

    All statistics come from a single ``describe`` call; missing values are
    replaced by 0 and everything is truncated to int. The result is indexed by
    the three grouping columns, ready for ``to_html``.
    """
    keys = ['sta', 'colo_broker', 'colo']
    stats = (log.groupby(keys)['time_diff']
                .describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9])
                .fillna(0).astype(int).reset_index())
    return stats[keys + ['count', '10%', '25%', '50%', '75%', '90%']].groupby(keys).first()


# Restrict to the three accounts under study.
cc1 = checkLog[checkLog['accCode'].isin([5328, 5386, 5377])]
# Same keep=False de-duplication that is applied to checkLog as a whole
# (likely a no-op if that cell already ran; harmless either way).
cc1 = cc1.drop_duplicates(['date', 'secid', 'Price', 'OrderQty', 'Side', 'statusLs', 'TradePriceLs', 'TradeQtyLs', 'ApplSeqNum'], keep=False)
cc1 = cc1.reset_index(drop=True)
# Row position after the reset, used as a stable row id.
cc1['ordering'] = cc1.index
cc1['time_diff'] = cc1['clockAtArrival'] - cc1['start_time']
# colo1 = characters 0-1, 3-4 and 6-7 of colo glued together (zs_52_06 -> zs5206).
cc1['colo1'] = cc1['colo'].str[:2] + cc1['colo'].str[3:5] + cc1['colo'].str[6:8]
# Cast accCode to int BEFORE it is used to build colo_broker, so the fillna(0)
# guard takes effect ahead of the first integer conversion.
cc1['accCode'] = cc1['accCode'].fillna(0).astype(int)
cc1['colo_broker'] = cc1['colo1'] + '_' + cc1["accCode"].astype(str)
# Label derived from mrstaat; any value other than the three listed maps to 'stathree23'.
cc1['sta'] = np.where(cc1['mrstaat'] == 1000, 'staone', np.where(
cc1['mrstaat'] == 3000, 'statwo', np.where(
cc1['mrstaat'] == 11000, 'stathree13', 'stathree23')))

checkLog1 = cc1[cc1['sta'] == 'statwo']
# Row ids of 'staone' rows whose (colo, accCode) also has 'statwo' rows.
# Only needed by the disabled filter below.
ol = pd.merge(cc1[cc1['sta'] == 'staone'], cc1[cc1['sta'] == 'statwo'][['colo', 'accCode']].drop_duplicates(),
         on=['colo', 'accCode'], how='inner')['ordering'].unique()
# checkLog2 = cc1[(cc1['sta'] == 'staone') & (~cc1['ordering'].isin(ol))]
checkLog2 = cc1[(cc1['sta'] == 'staone')]

# One table per sta label: 'statwo' first, then 'staone'.
re1 = _timeDiffPercentiles(checkLog1)
display(HTML(re1.to_html()))

re1 = _timeDiffPercentiles(checkLog2)
display(HTML(re1.to_html()))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,10%,25%,50%,75%,90%
sta,colo_broker,colo,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
statwo,zs5206_5328,zs_52_06,458,3068,3246,3568,3836,4113
statwo,zs5208_5377,zs_52_08,1862,4408,4583,4935,7217,8396
statwo,zs5208_5386,zs_52_08,2750,4477,4745,7172,10241,11425


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,10%,25%,50%,75%,90%
sta,colo_broker,colo,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
staone,zs5206_5328,zs_52_06,57,3261,3513,3791,3977,4062
staone,zs5208_5377,zs_52_08,68,4292,4500,5209,6997,7946
staone,zs5208_5386,zs_52_08,259,4631,5631,8317,10602,15642


In [58]:
# Number of SZE insertion rows per (accCode, sta) for the three accounts under study.
szeInsertRows = ((orderLog['exchange'] == 'SZE')
                 & orderLog['accCode'].isin([5328, 5386, 5377])
                 & (orderLog['updateType'] == 0))
orderLog[szeInsertRows].groupby(['accCode','sta'])['accCode'].size()

accCode  sta   
5328     staone      7
         statwo     70
5377     staone     59
         statwo    690
5386     staone     59
         statwo    794
Name: accCode, dtype: int64