In [1]:
import os
import glob
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Show up to 100 rows / columns when a frame is displayed.  The fully-qualified
# 'display.*' keys are used because the bare 'max_rows' / 'max_columns' patterns
# are ambiguous on pandas versions that also define 'styler.render.max_*'
# options, where set_option raises OptionError for an ambiguous pattern.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# Percentile grid from 1% to 99% (not referenced by the cells visible here;
# presumably meant for describe(percentiles=perc) -- confirm before removing).
perc = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]

In [2]:
# ---------------------------------------------------------------------------
# Load every speedCompare csv whose file-name date lies in [startDate, endDate],
# concatenate them into rawOrderLog and derive the per-order helper columns
# used by the checks below.  orderLog is the working copy that the checks filter.
# ---------------------------------------------------------------------------
startDate = '20200914'
endDate = '20200930'


readPath = r'\\192.168.10.28\equityTradeLogs'
dataPathLs = np.array(glob.glob(os.path.join(readPath, 'speedCompare***.csv')))
# The date is taken from the text between the first '_' and the first '.' of the
# file name; the comparison below is a plain string comparison of that token.
# dtype=str keeps the comparison valid even when glob found nothing.
dateLs = np.array([os.path.basename(i).split('_')[1].split('.')[0] for i in dataPathLs], dtype=str)
inRange = (dateLs >= startDate) & (dateLs <= endDate)
# Filter paths AND dates with the same mask so that zip() below pairs each path
# with its own date (previously only the paths were filtered).
dataPathLs = dataPathLs[inRange]
dateLs = dateLs[inRange]
if len(dataPathLs) == 0:
    raise FileNotFoundError('no speedCompare csv between %s and %s under %s' % (startDate, endDate, readPath))
rawOrderLog = []
for thisDate, thisPath in zip(dateLs, dataPathLs):
    data = pd.read_csv(thisPath)
    data = data.rename(columns={'mdClockAtArrival': 'caamd'})
    rawOrderLog.append(data)
rawOrderLog = pd.concat(rawOrderLog, sort=False)

# Missing values in these columns become 0 before the cast to int64.
for col in ['clockAtArrival', 'caamd', 'secid', 'updateType', 'vai', 'absFilledThisUpdate', 'orderDirection', 'absOrderSize',
            'absOrderSizeCumFilled', 'date', 'accCode', 'mse']:
    rawOrderLog[col] = rawOrderLog[col].fillna(0).astype('int64')
# Rows of one (date, secid, vai, accCode) end up adjacent and ordered by arrival
# clock; 'startClock' below relies on this ordering.
rawOrderLog = rawOrderLog.sort_values(by=['date', 'secid', 'vai', 'accCode', 'clockAtArrival']).reset_index(drop=True)

rawOrderLog = rawOrderLog[rawOrderLog["secid"] >= 1000000]

# clockAtArrival is divided by 1e6 before fromtimestamp, i.e. it is treated as
# microseconds since the epoch; fromtimestamp converts to the machine's local time.
rawOrderLog['clock'] = rawOrderLog['clockAtArrival'].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
# broker = accCode without its last two digits, or without its last four digits
# when accCode has six characters.
rawOrderLog['broker'] = rawOrderLog['accCode'] // 100
rawOrderLog["broker"] = np.where(rawOrderLog["accCode"].astype(str).apply(lambda x: len(x) == 6), rawOrderLog['accCode'] // 10000, rawOrderLog["broker"])
rawOrderLog['colo_broker'] = rawOrderLog['colo'].str[:2] + '_' + rawOrderLog['broker'].astype('str')
# Integer ids: 'order' numbers each (date, accCode, secid, vai) combination and
# 'group' each (date, secid, vai) combination.  ngroup() is the public API for
# the numbering previously read from the internal grouper.group_info[0].
rawOrderLog['order'] = rawOrderLog.groupby(['date', 'accCode', 'secid', 'vai']).ngroup()
rawOrderLog['group'] = rawOrderLog.groupby(['date', 'secid', 'vai']).ngroup()
rawOrderLog['startClock'] = rawOrderLog.groupby(['order'])['clockAtArrival'].transform('first')
# Time elapsed since the first row of the order, in clockAtArrival units.
rawOrderLog['duration'] = rawOrderLog['clockAtArrival'] - rawOrderLog['startClock']
rawOrderLog['orderPrice'] = rawOrderLog['orderPrice'].apply(lambda x: round(x, 2))
rawOrderLog['tradePrice'] = rawOrderLog['tradePrice'].apply(lambda x: round(x, 2))
# orderDirection1 folds the codes -2 / 2 onto -1 / 1 and keeps every other value.
rawOrderLog['orderDirection1'] = np.where(rawOrderLog["orderDirection"] == -2, -1, np.where(
    rawOrderLog["orderDirection"] == 2, 1, rawOrderLog["orderDirection"]))
orderLog = rawOrderLog.copy()

### Assertion 1:  every (date, secid, vai) group must contain a single order direction.
### Groups with mixed directions are shown (insertion rows only) and dropped from orderLog.
print('=======================================================================================')
print('1. same date, secid, vai: same direction')
orderLog['directNum'] = orderLog.groupby(['date', 'secid', 'vai'])['orderDirection1'].transform('nunique')
mixedDirection = orderLog['directNum'] != 1
if mixedDirection.any():
    print('opposite direction for same date, same secid, same vai')
    shownCols = ['date', 'accCode', 'secid', 'vai', 'orderDirection', 'order']
    display(orderLog.loc[mixedDirection & (orderLog['updateType'] == 0), shownCols])
    orderLog = orderLog[orderLog['directNum'] == 1]

# After the filter every remaining group must have exactly one direction.
directionsPerGroup = orderLog.groupby(['date', 'secid', 'vai'])['orderDirection1'].nunique()
assert (directionsPerGroup == 1).all()

## Assertion 2:  make sure each account, secid, vai only has one insertion
## Orders with more than one updateType == 0 row are shown and removed from orderLog.
print('=======================================================================================')
print('2. same date, secid, vai, accCode: one insertion')
a = orderLog[orderLog['updateType'] == 0].groupby(['date', 'accCode', 'secid', 'vai', 'order'])['clockAtArrival'].count()
if len(a[a > 1]) > 0:
    print('more than one insertion at same time')
    a = a[a>1].reset_index()
    display(a)
    orderLog = orderLog[~(orderLog['order'].isin(a['order'].unique()))]

# isMsg is decided on the insertion row (1 when mse == 100, else 0) and then
# forward-filled onto the later rows of the same order.
orderLog['isMsg'] = np.where(orderLog['updateType'] == 0, 
                             np.where(orderLog['mse'] == 100, 1, 0), np.nan)
orderLog['isMsg'] = orderLog.groupby(['order'])['isMsg'].ffill()

placeSZE = orderLog[(orderLog['secid'] >= 2000000) & (orderLog['updateType'] == 0)]
# Guard the ratio: with no SZE insertion the division would raise ZeroDivisionError.
if placeSZE.shape[0] > 0:
    print('%.2f%% SZE orders triggered by msg data'%(placeSZE[placeSZE['isMsg'] == 1].shape[0]/placeSZE.shape[0]*100))
else:
    print('no SZE insertions in the selected period')


### Assertion 3:  check IPO stocks selling status
### Rows with ars in (301, 302) are checked for three things:
###   - no buy-side rows (orderDirection1 == 1),
###   - per (accCode, secid), consecutive insertions are at least 10e6 clock units apart,
###   - no cancellation row (updateType == 1) with duration below 3e6 clock units.
### Violations are printed; nothing is removed from orderLog here.
print('=======================================================================================')
print('3. IPO stocks selling (ars = 301, 302)')
ipoMask = orderLog['ars'].isin([301, 302])
if ipoMask.any():
    kk = orderLog[ipoMask]
    print(kk)
    ipoBuys = kk[kk['orderDirection1'] == 1]
    if ipoBuys.shape[0] == 0:
        print('we only sell, never buy')
    else:
        print('There are IPO buy side orders!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        print(ipoBuys)
    kk1 = kk[kk['updateType'] == 0]
    kk1 = kk1.sort_values(by=['accCode', 'secid','clockAtArrival'])
    # Gap to the previous insertion of the same account and stock.  The first
    # insertion of each pair has no predecessor and stays NaN; NaN < 10e6 is
    # False, so it is not reported (filling it with 0 flagged every first order).
    kk1['diff'] = kk1.groupby(['accCode', 'secid'])['clockAtArrival'].diff()
    tooClose = kk1[kk1['diff'] < 10e6]
    if tooClose.shape[0] == 0:
        print('for each stock in the same account, there is no insertion within 10 seconds of the previous insertion')
    else:
        print('There are insertion within 10 seconds for orders under same account same stock!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        print(tooClose)
    kk2 = kk[(kk['updateType'] == 1)]
    quickCancel = kk2[kk2['duration'] < 3e6]
    if quickCancel.shape[0] == 0:
        print('for each stock in the same account, the cancellation of an order happens more than 3 seconds after the insertion')
    else:
        print('There are cancellation within 3 seconds for orders under same account same stock!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        print(quickCancel)


### Assertion 4: updateType == 7 orders must stay below 20 distinct orders for any
### single account and below 100 when the per-account counts are added up.
### Unlike the other checks this one raises AssertionError on violation.
print('=======================================================================================')
print('4. updateType 7 orders')
type7Log = orderLog[orderLog['updateType'] == 7]
if len(type7Log) != 0:
    type7PerAccount = type7Log.groupby('accCode')['order'].nunique()
    assert type7PerAccount.max() < 20
    assert type7PerAccount.sum() < 100

### Assertion 5: check updateType == 6 orders, make sure updateType == 6 orders < 5% per account
### prob = distinct orders with an updateType 6 row / distinct orders of the account.
### Accounts with prob >= 0.05 are printed; nothing is removed from orderLog.
print('=======================================================================================')
print('5. updateType 6 orders')
k1 = orderLog[orderLog['updateType'] == 6].groupby('accCode')['order'].nunique().reset_index()
k2 = orderLog.groupby('accCode')['order'].nunique().reset_index()
# After the merge 'order_x' is the updateType 6 count and 'order_y' the total count.
k = pd.merge(k1, k2, on='accCode', how='left')
k['prob'] = k['order_x']/k['order_y']
# Explicit condition instead of assert inside a bare except: the check no longer
# disappears under "python -O" and unrelated errors are no longer swallowed.
overLimit = k[k['prob'] >= 0.05]
if overLimit.shape[0] > 0:
    print('There are accounts with more than 5% updateType 6 orders!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    print(overLimit)

### Assertion 6: check CYB orders, make sure CYB stocks total absOrderSize < 30w
### NOTE(review): the heading says "total", but the check compares the largest
### single insertion (max of absOrderSize) with 300000 -- confirm which is intended.
print('=======================================================================================')
print('6. CYB stocks total order size < 30w')
cybInsertions = orderLog[(orderLog['secid'] >= 2300000) & (orderLog['updateType'] == 0)]
# With no CYB insertion max() is NaN, which used to fail the comparison and print
# a false alarm; an empty selection is now treated as passing.
if cybInsertions.shape[0] > 0 and cybInsertions['absOrderSize'].max() > 300000:
    print('CYB stocks total absOrderSize >= 30w!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    
     
### Assertion 7:  make sure there is no unexpected updateType
### Each order's updateType sequence is matched against three lists of accepted
### sequences (status 0, 1, 2); any other sequence gets status 3 and the order is dropped.
print('=======================================================================================')
print('7. unexpected updateType')
def getTuple(x):
    """Return the items of iterable ``x`` as a tuple."""
    return tuple(x)

# Collapse runs of consecutive updateType 4 rows: a 4 is dropped when the next
# row of the same order is also a 4, so only the last one of a run is kept.
nextUpdateType = orderLog.groupby(['order'])['updateType'].shift(-1)
repeatedFill = (orderLog['updateType'] == 4) & (nextUpdateType == 4)
checkLog = orderLog[~repeatedFill]
checkLog = checkLog.groupby(['order'])['updateType'].apply(getTuple).reset_index()

# Sequences labelled status 0 (all end with updateType 4).
status0Seqs = [(0, 2, 4), (0, 2, 1, 4), (0, 2, 1, 2, 4), (0, 2, 4, 1, 4), (0, 4), (0, 1, 4), (0, 4, 1, 4),
               (0, 2, 2, 4), (0, 4, 2, 4), (0, 2, 2, 1, 4), (0, 2, 2, 4, 1, 4)]
# Sequences labelled status 1 (contain a 4 and a 1).
status1Seqs = [(0, 2, 4, 1, 3), (0, 2, 4, 1, 4, 3), (0, 2, 1, 4, 3), (0, 4, 1, 3), (0, 1, 4, 3),
               (0, 2, 2, 4, 1, 3), (0, 2, 2, 4, 1, 4, 3), (0, 2, 2, 1, 4, 3), (0, 4, 2, 4, 1, 3),
               (0, 4, 2, 1, 3), (0, 4, 1, 4, 3), (0, 4, 1)]
# Sequences labelled status 2 (none of them contains a 4).
status2Seqs = [(0, 2, 1, 3), (0, 2, 2, 1, 3), (0, 2, 3), (0, 3), (0, 1, 3), (0, ), (0, 2), (0, 2, 1), (0, 2, 2)]

seqPerOrder = checkLog['updateType']
isStatus0 = seqPerOrder.isin(status0Seqs)
isStatus1 = seqPerOrder.isin(status1Seqs)
isStatus2 = seqPerOrder.isin(status2Seqs)
checkLog['status'] = np.where(isStatus0, 0,
                     np.where(isStatus1, 1,
                     np.where(isStatus2, 2, 3)))

orderLog = pd.merge(orderLog, checkLog[['order', 'status']], how='left', on=['order'], validate='many_to_one')
orderLog = orderLog[orderLog['status'].isin([0, 1, 2])].reset_index(drop=True)

### Assertion 8:  status == 0 orders must be completely filled, i.e. the largest
### cumulative fill of the order equals its order size.  Offenders are shown and removed.
print('=======================================================================================')
print('8. status == 0: all traded')
a = (orderLog[orderLog['status'] == 0]
     .groupby(['order'])[['absOrderSizeCumFilled', 'absOrderSize']].max()
     .reset_index()
     .rename(columns={'absOrderSizeCumFilled': 'filled', 'absOrderSize': 'total'}))
print('in total trade, any fill != total cases')
notFullyFilled = a[a['filled'] != a['total']]
display(notFullyFilled)
if len(notFullyFilled) > 0:
    removeOrderLs = notFullyFilled['order'].unique()
    orderLog = orderLog[~orderLog['order'].isin(removeOrderLs)]
    
### Assertion 9:  status == 1 orders must be partially filled, i.e. the largest
### cumulative fill is above 0 and below the order size.  Offenders are shown and removed.
print('=======================================================================================')
print('9. status == 1: partial traded')
a = (orderLog[orderLog['status'] == 1]
     .groupby(['order'])[['absOrderSizeCumFilled', 'absOrderSize']].max()
     .reset_index()
     .rename(columns={'absOrderSizeCumFilled': 'filled', 'absOrderSize': 'total'}))
print('in partial trade, any fill >= total or fill is 0 cases for updateType 4')
notPartial = a[(a['filled'] >= a['total']) | (a['filled'] == 0)]
display(notPartial)
if len(notPartial) > 0:
    removeOrderLs = notPartial['order'].unique()
    orderLog = orderLog[~orderLog['order'].isin(removeOrderLs)]
    
### Assertion 10: no cancellation row (updateType == 1) may have a duration below
### 1e6 clock units.  Orders with such a row are shown and removed from orderLog.
print('=======================================================================================')
print('10. no cancellation within 1 sec')
isCancel = orderLog['updateType'] == 1
isQuick = orderLog['duration'] < 1e6
a = orderLog[isCancel & isQuick]
print('any cancellation within 1 sec')
display(a)
if len(a) > 0:
    removeOrderLs = a['order'].unique()
    orderLog = orderLog[~orderLog['order'].isin(removeOrderLs)]


### Assertion 11: no order may have absOrderSize above 800000 or order notional
### (absOrderSize * orderPrice) above 8000000.  Offending rows are reported per
### (colo, accCode) and every order that has such a row is removed from orderLog.
print('=======================================================================================')
print('11. Orders with size > 80w or notional > 800w')
orderLog['orderNtl'] = orderLog['absOrderSize'] * orderLog['orderPrice']

tooManyShares = orderLog['absOrderSize'] > 800000
if tooManyShares.any():
    print('some order quantity are > 80w')
    bigQtyLog = orderLog[tooManyShares]
    print(bigQtyLog.groupby(['colo', 'accCode'])['order'].nunique())
    display(bigQtyLog[['date', 'accCode', 'secid', 'vai', 'absOrderSize', 'orderPrice',
                       'orderNtl', 'orderDirection', 'clock', 'order']])

tooMuchNtl = orderLog['orderNtl'] > 8000000
if tooMuchNtl.any():
    print('some order ntl are > 800w')
    bigNtlLog = orderLog[tooMuchNtl]
    print(bigNtlLog.groupby(['colo', 'accCode'])['order'].nunique())
    display(bigNtlLog[['date', 'accCode', 'secid', 'vai', 'absOrderSize', 'orderPrice',
                       'orderNtl', 'orderDirection', 'clock', 'order', "updateType", 
                       "tradePrice", "absOrderSizeCumFilled", "absFilledThisUpdate"]])

# The removal runs unconditionally; with no offender the id list is simply empty.
bigQtyOrders = set(orderLog[tooManyShares]['order'].unique())
bigNtlOrders = set(orderLog[tooMuchNtl]['order'].unique())
removeOrderLs = list(bigQtyOrders | bigNtlOrders)
orderLog = orderLog[~(orderLog['order'].isin(removeOrderLs))]


# Restore the canonical row order after the filters above and add the columns
# used by the summary cells.
sortKeys = ['date', 'secid', 'vai', 'accCode', 'clockAtArrival']
orderLog = orderLog.sort_values(by=sortKeys).reset_index(drop=True)

onSZE = orderLog['secid'] >= 2000000
isFill = orderLog['updateType'] == 4
filledNtl = orderLog['tradePrice']*orderLog['absFilledThisUpdate']

# secid >= 2000000 is labelled 'SZE', everything else 'SSE'.
orderLog['exchange'] = np.where(onSZE, 'SZE', 'SSE')
orderLog['orderNtl'] = orderLog['orderPrice'] * orderLog['absOrderSize']
# Traded notional is only counted on updateType 4 rows; all other rows get 0.
orderLog['tradeNtl'] = np.where(isFill, filledNtl, 0)
# Give every row of an order the 'ars' value returned by first() for that order.
orderLog["ars"] = orderLog.groupby(['order'])['ars'].transform('first')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1. same date, secid, vai: same direction
opposite direction for same date, same secid, same vai


Unnamed: 0,date,accCode,secid,vai,orderDirection,order
61847,20200914,8854,1600459,5894128,-1,99591
61851,20200914,9551,1600459,5894128,1,133982
61855,20200914,527103,1600459,5894128,1,187338
61858,20200914,968501,1600459,5894128,1,224573
183188,20200914,8854,1603008,8905434,-1,99818
...,...,...,...,...,...,...
12416598,20200930,9471,2002677,1616000,-1,3043833
12660565,20200930,5386,2300496,3465240,1,2982368
12660568,20200930,8924,2300496,3465240,1,3021081
12660571,20200930,9754,2300496,3465240,1,3077234


2. same date, secid, vai, accCode: one insertion
more than one insertion at same time


Unnamed: 0,date,accCode,secid,vai,order,clockAtArrival
0,20200914,5273,2300550,3100,32133,2
1,20200914,6683,2000688,1852523,91567,2
2,20200914,6683,2002214,5750210,92262,2
3,20200914,6683,2300723,1107300,95088,2
4,20200914,6683,2300723,1201900,95093,2
...,...,...,...,...,...,...
2418,20200930,9741,2300631,1315560,3066137,2
2419,20200930,9741,2300668,477800,3066619,3
2420,20200930,9741,2300681,41900,3066744,2
2421,20200930,9741,2300681,259900,3066757,2


99.62% SZE orders triggered by msg data
3. IPO stocks selling (ars = 301, 302)
            clockAtArrival  caamd    secid  updateType       vai    ars  \
346121    1600062069865523      0  1605006           0  63938781  302.0   
346125    1600062024623244      0  1605006           0  63938781  302.0   
346129    1600062108850411      0  1605006           0  63938781  302.0   
346132    1600062108850408      0  1605006           0  63938781  302.0   
346135    1600062070651381      0  1605006           0  63938781  302.0   
...                    ...    ...      ...         ...       ...    ...   
12497872  1601444570678049      0  2003003           0  14614808  302.0   
12497875  1601444569841030      0  2003003           0  14614808  302.0   
12497878  1601444569841030      0  2003003           0  14614808  302.0   
12497880  1601444567244549      0  2003003           0  14614808  302.0   
12497885  1601444567225801      0  2003003           0  14614808  302.0   

          absFilledT

6. CYB stocks total order size < 30w
7. unexpected updateType
8. status == 0: all traded
in total trade, any fill != total cases


Unnamed: 0,order,filled,total
31967,41591,300,400
34592,44833,6200,14600
37716,48331,90000,143700
40536,51492,2200,2300
52849,66504,3400,32200
...,...,...,...
2183983,3108641,500,800
2184274,3108993,19050,49000
2185320,3110356,19500,23300
2191226,3119531,500,700


9. status == 1: partial traded
in partial trade, any fill >= total or fill is 0 cases for updateType 4


Unnamed: 0,order,filled,total


10. no cancellation within 1 sec
any cancellation within 1 sec


Unnamed: 0,clockAtArrival,caamd,secid,updateType,vai,ars,absFilledThisUpdate,orderDirection,absOrderSize,absOrderSizeCumFilled,orderPrice,tradePrice,date,accCode,mse,colo,orderSysId,internalId,tradeId,sdd,aaa,ApplSeqNum,mrm,mta,mrsb,mrss,mrv,mrb100,mra100,l4tr,clock,broker,colo_broker,order,group,startClock,duration,orderDirection1,directNum,isMsg,status


11. Orders with size > 80w or notional > 800w


In [71]:
# Ad-hoc inspection: rawOrderLog rows of colo 'zs_52_08' on 20200917.
onDate = rawOrderLog['date'] == 20200917
onColo = rawOrderLog['colo'] == 'zs_52_08'
rawOrderLog.loc[onDate & onColo]

Unnamed: 0.1,Unnamed: 0,clockAtArrival,caamd,secid,updateType,vai,ars,absFilledThisUpdate,orderDirection,absOrderSize,absOrderSizeCumFilled,orderPrice,tradePrice,date,accCode,mse,colo,orderSysId,internalId,tradeId,sdd,aaa,ApplSeqNum,mrm,mta,mrsb,mrss,mrv,mrb100,mra100,l4tr,clock,broker,colo_broker,order,group,startClock,duration,orderDirection1
909474,117622,1600306622770545,1600306622770494,2000001,0,8751900,131.0,0,1,9900,0,15.48,-1.00,20200917,5281,100,zs_52_08,,552.0,,93702770.0,0.000549,2763712.0,0.005437,0.005437,0.000549,-0.001165,8751900.0,1547.0,1548.0,0.0,2020-09-17 09:37:02.770545,52,zs_52,210495,131230,1600306622770545,0,1
909475,117623,1600306622773268,1600306622773070,2000001,2,8751900,-1.0,0,1,9900,0,15.48,-1.00,20200917,5281,0,zs_52_08,166999.0,552.0,,-1.0,-1.000000,0.0,0.006017,-1.000000,-0.000474,-0.001308,8761800.0,1547.0,1550.0,0.0,2020-09-17 09:37:02.773268,52,zs_52,210495,131230,1600306622770545,2723,1
909476,117624,1600306622773365,1600306622773070,2000001,4,8751900,-1.0,9900,1,9900,9900,15.48,15.48,20200917,5281,0,zs_52_08,,552.0,103000003728504.0,-1.0,-1.000000,0.0,0.006017,-1.000000,-0.000474,-0.001308,8761800.0,1547.0,1550.0,0.0,2020-09-17 09:37:02.773365,52,zs_52,210495,131230,1600306622770545,2820,1
909477,117625,1600306623559172,1600306623559126,2000001,0,8763400,131.0,0,1,9900,0,15.48,-1.00,20200917,5281,100,zs_52_08,,554.0,,93703560.0,0.000671,2766354.0,0.005447,0.005447,0.000671,-0.001288,8763400.0,1547.0,1548.0,0.0,2020-09-17 09:37:03.559172,52,zs_52,210496,131231,1600306623559172,0,1
909478,117626,1600306623562129,1600306623562023,2000001,2,8763400,-1.0,0,1,9900,0,15.48,-1.00,20200917,5281,0,zs_52_08,167168.0,554.0,,-1.0,-1.000000,0.0,0.006026,-1.000000,-0.000352,-0.001428,8773300.0,1547.0,1550.0,0.0,2020-09-17 09:37:03.562129,52,zs_52,210496,131231,1600306623559172,2957,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091348,299496,1600325560389366,1600325560386548,2300630,3,2390587,-1.0,0,-1,1600,0,44.32,-1.00,20200917,5377,0,zs_52_08,,2861.0,,-1.0,-1.000000,0.0,-0.000412,-1.000000,-1,-1,2396187.0,4430.0,4431.0,0.0,2020-09-17 14:52:40.389366,53,zs_53,220188,157448,1600325556734733,3654633,-1
1091361,299509,1600325560382946,1600325560382888,2300630,0,2395587,131.0,0,-1,200,0,44.30,-1.00,20200917,5276,100,zs_52_08,,4439.0,,145240400.0,0.000598,22082873.0,-0.000270,-0.000270,-0.001005,0.000598,2395587.0,4430.0,4432.0,0.0,2020-09-17 14:52:40.382946,52,zs_52,210494,157449,1600325560382946,0,-1
1091362,299510,1600325560386690,1600325560386548,2300630,2,2395587,-1.0,0,-1,200,0,44.30,-1.00,20200917,5276,0,zs_52_08,1574358.0,4439.0,,-1.0,-1.000000,0.0,-0.000412,-1.000000,-1,-1,2396187.0,4430.0,4431.0,0.0,2020-09-17 14:52:40.386690,52,zs_52,210494,157449,1600325560382946,3744,-1
1091363,299511,1600325560386789,1600325560386548,2300630,4,2395587,-1.0,100,-1,200,100,44.30,44.30,20200917,5276,0,zs_52_08,1574358.0,4439.0,101000032065290.0,-1.0,-1.000000,0.0,-0.000412,-1.000000,-1,-1,2396187.0,4430.0,4431.0,0.0,2020-09-17 14:52:40.386789,52,zs_52,210494,157449,1600325560382946,3843,-1


In [72]:
# Ad-hoc inspection: rawOrderLog rows of colo 'zs_52_08' on 20200921.
onDate = rawOrderLog['date'] == 20200921
onColo = rawOrderLog['colo'] == 'zs_52_08'
rawOrderLog.loc[onDate & onColo]

Unnamed: 0.1,Unnamed: 0,clockAtArrival,caamd,secid,updateType,vai,ars,absFilledThisUpdate,orderDirection,absOrderSize,absOrderSizeCumFilled,orderPrice,tradePrice,date,accCode,mse,colo,orderSysId,internalId,tradeId,sdd,aaa,ApplSeqNum,mrm,mta,mrsb,mrss,mrv,mrb100,mra100,l4tr,clock,broker,colo_broker,order,group,startClock,duration,orderDirection1
1711154,752564,1600652173949073,1600652173949023,2000001,0,9670418,131.0,0,-1,500,0,15.84,-1.00,20200921,5281,100,zs_52_08,,538.0,,93613940.0,0.001236,3050327.0,-0.000706,-0.000706,-0.001816,0.001236,,1584.0,1585.0,0.0,2020-09-21 09:36:13.949073,52,zs_52,383955,240214,1600652173949073,0,-1
1711155,752565,1600652173952640,1600652173949023,2000001,2,9670418,-1.0,0,-1,500,0,15.84,-1.00,20200921,5281,0,zs_52_08,232564,538.0,,-1.0,-1.000000,0.0,-0.000706,-1.000000,-0.001816,0.001236,,1584.0,1585.0,0.0,2020-09-21 09:36:13.952640,52,zs_52,383955,240214,1600652173949073,3567,-1
1711156,752569,1600652174058107,1600652174058070,2000001,4,9670418,-1.0,500,-1,500,500,15.84,15.84,20200921,5281,0,zs_52_08,,538.0,1.04e+14,-1.0,-1.000000,0.0,-0.000223,-1.000000,-1,-1,,1583.0,1584.0,0.0,2020-09-21 09:36:14.058107,52,zs_52,383955,240214,1600652173949073,109034,-1
1711163,752573,1600652175250378,1600652175250323,2000001,0,9708018,131.0,0,-1,800,0,15.84,-1.00,20200921,5281,100,zs_52_08,,541.0,,93615240.0,0.001209,3056948.0,-0.000706,-0.000706,-0.00179,0.001209,,1584.0,1585.0,0.0,2020-09-21 09:36:15.250378,52,zs_52,383956,240215,1600652175250378,0,-1
1711164,752574,1600652175254021,1600652175253960,2000001,2,9708018,-1.0,0,-1,800,0,15.84,-1.00,20200921,5281,0,zs_52_08,233566,541.0,,-1.0,-1.000000,0.0,-0.000223,-1.000000,-0.00139,0.000832,,1583.0,1584.0,0.0,2020-09-21 09:36:15.254021,52,zs_52,383956,240215,1600652175250378,3643,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2217023,902645,1600669377089333,1600669377089281,2300630,4,4064208,-1.0,600,1,600,600,42.23,42.23,20200921,5286,0,zs_52_08,,2016.0,1.01e+14,-1.0,-1.000000,0.0,-0.000121,-1.000000,-1,-1,,4221.0,4223.0,0.0,2020-09-21 14:22:57.089333,52,zs_52,389084,306984,1600669377085988,3345,1
2217030,406615,1600669377085989,1600669377085935,2300630,0,4064208,131.0,0,1,600,0,42.23,-1.00,20200921,5377,100,zs_52_08,,1670.0,,142257090.0,0.003768,19552039.0,-0.000121,-0.000121,0.003768,-0.003994,,4221.0,4223.0,0.0,2020-09-21 14:22:57.085989,53,zs_53,404671,306984,1600669377085989,0,1
2217031,406616,1600669377098413,1600669377091475,2300630,2,4064208,-1.0,0,1,600,0,42.23,-1.00,20200921,5377,0,zs_52_08,211003,1670.0,,-1.0,-1.000000,0.0,-0.000493,-1.000000,-0.002055,-0.004574,,4223.0,4251.0,0.0,2020-09-21 14:22:57.098413,53,zs_53,404671,306984,1600669377085989,12424,1
2217032,406617,1600669380248925,1600669380248866,2300630,1,4064208,2.0,0,1,600,0,42.23,-1.00,20200921,5377,100,zs_52_08,211003,1670.0,,51780.0,-1.000000,0.0,-0.000694,-1.000000,-0.001416,-0.000315,,4243.0,4251.0,0.0,2020-09-21 14:23:00.248925,53,zs_53,404671,306984,1600669377085989,3162936,1


In [3]:
# fill rate by exchange and daily Turnover
#
# Part 1 prints the fill rate = traded notional (updateType 4 rows) divided by
# ordered notional (updateType 0 rows), for five row selections.
# Part 2 builds a table with, per exchange and colo_broker, the average daily
# traded notional and its share of the exchange's average daily traded notional.
from IPython.display import display, HTML

isInsert = orderLog['updateType'] == 0
isTrade = orderLog['updateType'] == 4
onSSE = orderLog['exchange'] == 'SSE'
onSZE = orderLog['exchange'] == 'SZE'
everyRow = pd.Series(True, index=orderLog.index)

# (format string, row selection) pairs, printed in this order.
fillRateScopes = [
    ('total %.0f%%', everyRow),
    ('SSE %.0f%%', onSSE),
    ('SZE %.0f%%', onSZE),
    ('zt_52 %.0f%%', onSSE & (orderLog['colo_broker'] == 'zt_52')),
    ('zs_52 %.0f%%', onSZE & (orderLog['colo_broker'] == 'zs_52')),
]
for rateFormat, scope in fillRateScopes:
    a = orderLog[scope & isInsert]['orderNtl'].sum()
    b = orderLog[scope & isTrade]['tradeNtl'].sum()
    print(rateFormat%(b/a*100))

brokerKeys = ['exchange', 'exchange turnover', 'colo_broker']

# Traded notional per day, exchange and colo_broker.
a = orderLog.groupby(['date', 'exchange', 'colo_broker'])['tradeNtl'].sum().reset_index()
l = len(orderLog["date"].unique())

# Average daily traded notional per exchange, divided by 10000 and truncated to int.
exchangeDaily = a.groupby(['date', 'exchange'])['tradeNtl'].sum().reset_index()
exchangeAvg = exchangeDaily.groupby(['exchange'])['tradeNtl'].mean().reset_index()
exchangeAvg['tradeNtl'] = (exchangeAvg['tradeNtl']/10000).astype('int64')
exchangeAvg = exchangeAvg.rename(columns={'tradeNtl': 'exchange turnover'})

withExchange = pd.merge(a, exchangeAvg, how='left', on=['exchange'], validate='many_to_one')
perBrokerNtl = withExchange.groupby(brokerKeys)["tradeNtl"]
# pp['tradeNtl'] = number of days on which the colo_broker appears.
pp = perBrokerNtl.count().reset_index()
exchangeTurnover = (perBrokerNtl.mean()/10000).astype('int64').reset_index()
exchangeTurnover = exchangeTurnover.rename(columns={'tradeNtl': 'turnover'})
exchangeTurnover = pd.merge(exchangeTurnover, pp, on=brokerKeys)
# turnover * active days / (exchange turnover * all days): the mean over active
# days is rescaled to all l days before being compared with the exchange average.
exchangeTurnover['turnover %'] = (exchangeTurnover['turnover'] *  exchangeTurnover['tradeNtl'])/ (exchangeTurnover['exchange turnover']*l)
exchangeTurnover = exchangeTurnover.rename(columns={'colo_broker': 'broker'})
exchangeTurnover = exchangeTurnover.groupby(['exchange', 'exchange turnover', 'broker'])[['turnover', 'turnover %']].first()
exchangeTurnover['turnover %'] = exchangeTurnover['turnover %'].map(lambda x: '%.1f%%'%(100*x))

display(HTML(exchangeTurnover.to_html()))

total 71%
SSE 68%
SZE 73%
zt_52 69%
zs_52 79%


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,turnover,turnover %
exchange,exchange turnover,broker,Unnamed: 3_level_1,Unnamed: 4_level_1
SSE,308992,zs_53,719,0.0%
SSE,308992,zs_62,1668,0.5%
SSE,308992,zs_89,1713,0.4%
SSE,308992,zs_94,10,0.0%
SSE,308992,zs_96,0,0.0%
SSE,308992,zs_97,28880,9.3%
SSE,308992,zt_52,181775,58.8%
SSE,308992,zt_53,6799,2.0%
SSE,308992,zt_70,110,0.0%
SSE,308992,zt_88,6637,2.1%


In [4]:
# order speed comparison between brokers (under same sta and same mrstauc)
#
# For every (date, secid, vai) group that was sent through at least two
# colo_brokers, each pair of orders from different colo_brokers is compared by
# fill outcome within the first 1e6 clock units: 0 = fully filled, 1 = partially
# filled, 2 = not filled.  The order with the lower status counts as the faster
# one; equal statuses are ignored.  Per exchange the pair counts and win rates
# are shown and the totals are written to savePath as csv.
from IPython.display import display, HTML

orderLog['sta'] = np.where(orderLog['ars'].isin([121, 221, 321, 131, 231, 331]), 'statwo', 'staone')
checkLog = orderLog[orderLog['updateType'].isin([0, 3, 4])].reset_index(drop=True)
checkLog = checkLog[checkLog['duration'] <= 1e6]

sta_list = orderLog['sta'].unique()

for exchg in ['SH', 'SZ']: 
    print(exchg)
    result = {}
    for col in ['date', 'broker1', 'broker2', 'isFast']:
        result[col] = []
    for i in sta_list:
        # .copy() so that the column assignments below write to an independent
        # frame instead of a slice of checkLog (source of SettingWithCopyWarning).
        checkLog1 = checkLog[checkLog['sta'] == i].copy()
        checkLog1['brokerNum'] = checkLog1.groupby(['date', 'secid', 'vai'])['colo_broker'].transform('nunique')
        checkLog1 = checkLog1[checkLog1['brokerNum'] >= 2].copy()
        checkLog1['cumFillSize'] = checkLog1.groupby(['order'])['absOrderSizeCumFilled'].transform('max')
        # Fill outcome inside the duration window: 2 = nothing, 1 = partial, 0 = complete.
        checkLog1['status'] = np.where(checkLog1['cumFillSize'] == 0, 2, 
                                        np.where(checkLog1['cumFillSize'] < checkLog1['absOrderSize'], 1, 0))
        # One row per order.
        checkLog1 = checkLog1.groupby(['group', 'order'])[['date', 'accCode', 'secid', 'vai', 'colo_broker', 'status']].first().reset_index()

        if exchg == 'SH':
            exchangeLog1 = checkLog1[(checkLog1['secid'] < 2000000) & (checkLog1['colo_broker'].str[:2] == 'zt')]
        else:
            exchangeLog1 = checkLog1[(checkLog1['secid'] >= 2000000)& (checkLog1['colo_broker'].str[:2] == 'zs')]

        for group, groupData in exchangeLog1.groupby(['group']):
            date = groupData['date'].values[0]
            brokerLs = groupData['colo_broker'].values
            statusLs = groupData['status'].values
            ixLs = list(range(len(brokerLs)))
            # Every unordered pair of orders in the group; each pair is recorded
            # twice, once from each broker's point of view.
            for k, broker1, status1 in zip(ixLs[:-1], brokerLs[:-1], statusLs[:-1]):
                for broker2, status2 in zip(brokerLs[k+1:], statusLs[k+1:]):
                    if broker1 != broker2:
                        result['date'] += [date, date]
                        result['broker1'] += [broker1, broker2]
                        result['broker2'] += [broker2, broker1]

                        if status1 < status2:
                            result['isFast'] += [1, 0]
                        elif status1 > status2:
                            result['isFast'] += [0, 1]
                        else:
                            # Same outcome: marked -1 and dropped below.
                            result['isFast'] += [-1, -1]   

    result = pd.DataFrame(result)
    result = result[result['isFast'] != -1]
    if result.empty:
        # Without any decisive pair the merges below would fail on the missing
        # broker columns, so this exchange is skipped.
        print('no comparable broker pairs for %s' % exchg)
        continue
    result['count'] = result.groupby(['broker1', 'broker2'])['isFast'].transform('count')
    result['fasterSum'] = result.groupby(['broker1', 'broker2'])['isFast'].transform('sum')
    result['faster'] = result['fasterSum']/result['count']

    # Per-date count / win-rate columns, merged side by side.
    summary = pd.DataFrame()
    summaryCols = []
    countCols = []
    fasterCols = []
    for date in result['date'].unique():
        a = result[result['date'] == date].reset_index(drop=True)
        a['count'] = a.groupby(['broker1', 'broker2'])['isFast'].transform('count')
        a['fasterSum'] = a.groupby(['broker1', 'broker2'])['isFast'].transform('sum')
        a['faster'] = a['fasterSum']/a['count']
        a = a.groupby(['broker1', 'broker2'])[['count', 'faster']].mean().reset_index()
        a.columns = ['broker1', 'broker2', '%s count'%date, '%s faster'%date]
        summaryCols += ['%s count'%date, '%s faster'%date]
        countCols += ['%s count'%date]
        fasterCols += ['%s faster'%date]

        if summary.empty:
            summary = a.copy()
        else:
            summary = pd.merge(summary, a, how='outer', on=['broker1', 'broker2'], validate='one_to_one')

    # Totals over the whole period go in front of the per-date columns.
    a = result.groupby(['broker1', 'broker2'])[['count', 'faster']].mean().reset_index()
    a.columns = ['broker1', 'broker2', 'total count', 'total faster']
    summary = pd.merge(summary, a, how='outer', on=['broker1', 'broker2'], validate='one_to_one')
    summaryCols = ['total count', 'total faster'] + summaryCols

    savePath = r'L:\orderLog\result\relativeSpeed'
    a = summary.groupby(['broker1', 'broker2'])[['total count', 'total faster']].first().reset_index()
    a.to_csv(os.path.join(savePath, 'relativeSpeedBroker_%s_%s_%s.csv'%(startDate, endDate, exchg)), index=False)

    countCols += ['total count']
    fasterCols += ['total faster']
    summary = summary.groupby(['broker1', 'broker2'])[summaryCols].first().sort_values(by=['broker1', 'broker2'])
    for col in countCols:
        summary[col] = summary[col].fillna(0)
        summary[col] = summary[col].astype('int64')

    for col in fasterCols:
        # Missing win rates become -1, are formatted as '-100%' and then blanked.
        summary[col] = summary[col].fillna(-1)
        summary[col] = summary[col].apply(lambda x: '%.0f%%'%(x*100))
        summary[col] = np.where(summary[col] == '-100%', ' ', summary[col])
    # Only the first two columns ('total count', 'total faster') are displayed.
    display(HTML(summary.iloc[:, :2].to_html()))

SH


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


Unnamed: 0_level_0,Unnamed: 1_level_0,total count,total faster
broker1,broker2,Unnamed: 2_level_1,Unnamed: 3_level_1
zt_52,zt_53,8645,86%
zt_52,zt_70,2402,90%
zt_52,zt_88,1245,67%
zt_52,zt_89,34704,70%
zt_52,zt_92,286,66%
zt_52,zt_94,1969,78%
zt_52,zt_95,841,84%
zt_52,zt_96,41890,32%
zt_53,zt_52,8645,14%
zt_53,zt_70,88,69%


SZ


Unnamed: 0_level_0,Unnamed: 1_level_0,total count,total faster
broker1,broker2,Unnamed: 2_level_1,Unnamed: 3_level_1
zs_52,zs_53,28913,100%
zs_52,zs_54,1453,59%
zs_52,zs_62,21057,100%
zs_52,zs_64,576,99%
zs_52,zs_66,42026,8%
zs_52,zs_88,22431,100%
zs_52,zs_89,27001,100%
zs_52,zs_92,56,95%
zs_52,zs_94,121760,6%
zs_52,zs_96,6750,16%


In [11]:
# Read back the SH broker-pair totals for the current date range.
savePath = r'L:\orderLog\result\relativeSpeed'
shResultFile = 'relativeSpeedBroker_%s_%s_%s.csv'%(startDate, endDate, 'SH')
data1 = pd.read_csv(os.path.join(savePath, shResultFile))
def bubble_sort(cur, arr, index):
    """Split the labels in ``index`` by whether their score is below or above 0.5.

    Parameters
    ----------
    cur : object
        Passed through unchanged as the first element of the result.
    arr : array-like of float
        One score per label.
    index : array-like
        Labels aligned element-wise with ``arr``.

    Returns
    -------
    tuple
        ``(cur, below, above)``: ``below`` lists the labels whose score is
        ``< 0.5`` and ``above`` those whose score is ``> 0.5``, each ordered by
        ascending score. Scores equal to 0.5 (and NaN) satisfy neither
        comparison and are left out of both lists.
    """
    arr = np.asarray(arr)
    index = np.asarray(index)

    def _labels_by_score(mask):
        # A stable sort keeps tied scores in their input order, which is the
        # order an adjacent-swap-on-strictly-less bubble sort produces.
        order = np.argsort(arr[mask], kind='stable')
        return list(index[mask][order])

    return cur, _labels_by_score(arr < 0.5), _labels_by_score(arr > 0.5)
# Per-broker ranking table: one row per broker1, with its counterparties (broker2)
# split by whether broker1's 'total faster' share against them is below or above 0.5.
re = pd.DataFrame()
main = []
bigger = []
smaller = []
# Group on the bare column name rather than a one-element list so the group key
# is the scalar broker label on every pandas version (newer releases yield a
# 1-tuple key when grouping by a list).
for broker, brokerData in data1.groupby('broker1'):
    kk = bubble_sort(broker, brokerData['total faster'].values, brokerData['broker2'].values)
    main.append(kk[0])
    bigger.append(kk[1])    # counterparties with 'total faster' < 0.5
    smaller.append(kk[2])   # counterparties with 'total faster' > 0.5
re['main'] = main
re['bigger'] = bigger
re['smaller'] = smaller
re['bigger_c'] = re['bigger'].apply(len)
re['smaller_c'] = re['smaller'].apply(len)
re['total'] = re['bigger_c'] + re['smaller_c']
# Brokers with at most two counterparties are shown separately from the rest.
display(re[re['total'] <= 2].sort_values(by=['bigger_c']))
display(re[re['total'] > 2].sort_values(by=['bigger_c']))

Unnamed: 0,main,bigger,smaller,bigger_c,smaller_c,total
5,zt_92,[zt_52],[],1,0,1


Unnamed: 0,main,bigger,smaller,bigger_c,smaller_c,total
8,zt_96,[],"[zt_52, zt_88, zt_89, zt_95, zt_94, zt_53, zt_70]",0,7,7
0,zt_52,[zt_96],"[zt_92, zt_88, zt_89, zt_94, zt_95, zt_53, zt_70]",1,7,8
3,zt_88,"[zt_96, zt_52]","[zt_94, zt_95, zt_53, zt_70, zt_89]",2,5,7
6,zt_94,"[zt_96, zt_52, zt_88]","[zt_89, zt_53, zt_70]",3,3,6
4,zt_89,"[zt_96, zt_88, zt_52, zt_94]","[zt_53, zt_95, zt_70]",4,3,7
7,zt_95,"[zt_96, zt_52, zt_88, zt_89]",[zt_53],4,1,5
1,zt_53,"[zt_96, zt_52, zt_88, zt_94, zt_95, zt_89]",[zt_70],6,1,7
2,zt_70,"[zt_96, zt_52, zt_88, zt_89, zt_53, zt_94]",[],6,0,6


In [12]:
# Load the SZ pairwise table. `savePath` is not assigned in this cell, so it must
# already exist in the kernel from an earlier cell.
data1 = pd.read_csv(os.path.join(savePath, 'relativeSpeedBroker_%s_%s_%s.csv'%(startDate, endDate, 'SZ')))
def bubble_sort(cur, arr, index):
    """Split the labels in ``index`` by whether their score is below or above 0.5.

    Parameters
    ----------
    cur : object
        Passed through unchanged as the first element of the result.
    arr : array-like of float
        One score per label.
    index : array-like
        Labels aligned element-wise with ``arr``.

    Returns
    -------
    tuple
        ``(cur, below, above)``: ``below`` lists the labels whose score is
        ``< 0.5`` and ``above`` those whose score is ``> 0.5``, each ordered by
        ascending score. Scores equal to 0.5 (and NaN) satisfy neither
        comparison and are left out of both lists.
    """
    arr = np.asarray(arr)
    index = np.asarray(index)

    def _labels_by_score(mask):
        # A stable sort keeps tied scores in their input order, which is the
        # order an adjacent-swap-on-strictly-less bubble sort produces.
        order = np.argsort(arr[mask], kind='stable')
        return list(index[mask][order])

    return cur, _labels_by_score(arr < 0.5), _labels_by_score(arr > 0.5)
# Per-broker ranking table: one row per broker1, with its counterparties (broker2)
# split by whether broker1's 'total faster' share against them is below or above 0.5.
re = pd.DataFrame()
main = []
bigger = []
smaller = []
# Group on the bare column name rather than a one-element list so the group key
# is the scalar broker label on every pandas version (newer releases yield a
# 1-tuple key when grouping by a list).
for broker, brokerData in data1.groupby('broker1'):
    kk = bubble_sort(broker, brokerData['total faster'].values, brokerData['broker2'].values)
    main.append(kk[0])
    bigger.append(kk[1])    # counterparties with 'total faster' < 0.5
    smaller.append(kk[2])   # counterparties with 'total faster' > 0.5
re['main'] = main
re['bigger'] = bigger
re['smaller'] = smaller
re['bigger_c'] = re['bigger'].apply(len)
re['smaller_c'] = re['smaller'].apply(len)
re['total'] = re['bigger_c'] + re['smaller_c']
# Brokers with at most two counterparties are shown separately from the rest.
display(re[re['total'] <= 2].sort_values(by=['bigger_c']))
display(re[re['total'] > 2].sort_values(by=['bigger_c']))

Unnamed: 0,main,bigger,smaller,bigger_c,smaller_c,total


Unnamed: 0,main,bigger,smaller,bigger_c,smaller_c,total
9,zs_94,[],"[zs_66, zs_96, zs_54, zs_53, zs_52, zs_89, zs_...",0,11,11
10,zs_96,"[zs_94, zs_66]","[zs_54, zs_97, zs_52, zs_88, zs_89, zs_92]",2,6,8
0,zs_52,"[zs_94, zs_66, zs_96]","[zs_54, zs_97, zs_92, zs_64, zs_88, zs_53, zs_...",3,8,11
2,zs_54,"[zs_94, zs_52, zs_96]","[zs_66, zs_88, zs_92, zs_97, zs_64]",3,5,8
5,zs_66,"[zs_94, zs_92, zs_54]","[zs_96, zs_97, zs_52, zs_88, zs_53, zs_62, zs_...",3,8,11
4,zs_64,"[zs_66, zs_54, zs_94, zs_52]",[zs_97],4,1,5
8,zs_92,"[zs_94, zs_96, zs_54, zs_52, zs_97]","[zs_66, zs_88]",5,2,7
1,zs_53,"[zs_66, zs_52, zs_97, zs_94, zs_88, zs_89]",[zs_62],6,1,7
7,zs_89,"[zs_66, zs_96, zs_52, zs_97, zs_94, zs_88]","[zs_53, zs_62]",6,2,8
11,zs_97,"[zs_54, zs_94, zs_64, zs_52, zs_66, zs_96]","[zs_92, zs_89, zs_53, zs_88, zs_62]",6,5,11


In [3]:
# Order speed comparison between brokers (under same sta and same mrstauc).
# Tag every row with its `sta` bucket, derived from the `ars` code.
orderLog['sta'] = np.where(
    orderLog['ars'].isin([121, 221, 321, 131, 231, 331]),
    'statwo',
    'staone',
)
# server = first three '_'-separated pieces of `colo` glued together;
# server_account = that server label plus the account code.
orderLog["server"] = orderLog["colo"].map(lambda x: x.split("_")[0] + x.split("_")[1] + x.split("_")[2])
orderLog["server_account"] = orderLog["server"] + '_' + orderLog['accCode'].astype('str')

# Rows with updateType 0, 3 or 4, re-indexed, then restricted to duration <= 1e6.
checkLog = (
    orderLog
    .loc[orderLog['updateType'].isin([0, 3, 4])]
    .reset_index(drop=True)
    .loc[lambda frame: frame['duration'] <= 1e6]
)

sta_list = orderLog['sta'].unique()

# For each venue, compare the fill outcome of orders from different server accounts that
# share the same `group`, one `sta` bucket at a time, then tabulate per-pair
# counts and "faster" shares (per date and in total).
for exchg in ['SZ']:
    print(exchg)
    result = {}
    for col in ['date', 'broker1', 'broker2', 'isFast']:
        result[col] = []
    for i in sta_list:
        # .copy(): new columns are assigned below, and assigning into a bare
        # filtered slice of checkLog triggers pandas' SettingWithCopyWarning.
        checkLog1 = checkLog[checkLog['sta'] == i].copy()
        checkLog1['brokerNum'] = checkLog1.groupby(['date', 'secid', 'vai'])['server_account'].transform('nunique')
        # Keep only (date, secid, vai) groups seen on at least two server accounts.
        checkLog1 = checkLog1[checkLog1['brokerNum'] >= 2].copy()
        checkLog1['cumFillSize'] = checkLog1.groupby(['order'])['absOrderSizeCumFilled'].transform('max')
        # status: 2 = nothing filled, 1 = filled less than absOrderSize, 0 = otherwise.
        checkLog1['status'] = np.where(checkLog1['cumFillSize'] == 0, 2,
                                        np.where(checkLog1['cumFillSize'] < checkLog1['absOrderSize'], 1, 0))
        # Collapse to one row per (group, order).
        checkLog1 = checkLog1.groupby(['group', 'order'])[['date', 'accCode', 'secid', 'vai', 'server_account', 'status']].first().reset_index()

        if exchg == 'SH':
            exchangeLog1 = checkLog1[checkLog1['secid'] < 2000000]
        else:
            exchangeLog1 = checkLog1[checkLog1['secid'] >= 2000000]

        for group, groupData in exchangeLog1.groupby(['group']):
            date = groupData['date'].values[0]
            brokerLs = groupData['server_account'].values
            statusLs = groupData['status'].values
            # Every unordered pair of orders in the group is recorded twice, once
            # from each side, so the table can be read from either broker.
            for k, (broker1, status1) in enumerate(zip(brokerLs[:-1], statusLs[:-1])):
                for broker2, status2 in zip(brokerLs[k+1:], statusLs[k+1:]):
                    if broker1 != broker2:
                        result['date'] += [date, date]
                        result['broker1'] += [broker1, broker2]
                        result['broker2'] += [broker2, broker1]

                        # The lower status wins; equal statuses are marked -1 and dropped later.
                        if status1 < status2:
                            result['isFast'] += [1, 0]
                        elif status1 > status2:
                            result['isFast'] += [0, 1]
                        else:
                            result['isFast'] += [-1, -1]

    result = pd.DataFrame(result)
    # .copy() for the same reason as above: columns are added to the filtered frame.
    result = result[result['isFast'] != -1].copy()
    if result.empty:
        # Without any decided pair the per-date summary stays empty and the merge
        # on ['broker1', 'broker2'] below would raise a KeyError.
        print('no comparable order pairs for %s' % exchg)
        continue
    result['count'] = result.groupby(['broker1', 'broker2'])['isFast'].transform('count')
    result['fasterSum'] = result.groupby(['broker1', 'broker2'])['isFast'].transform('sum')
    result['faster'] = result['fasterSum']/result['count']

    summary = pd.DataFrame()
    summaryCols = []
    countCols = []
    fasterCols = []
    # One '<date> count' / '<date> faster' column pair per date, outer-merged on the broker pair.
    for date in result['date'].unique():
        a = result[result['date'] == date].reset_index(drop=True)
        a['count'] = a.groupby(['broker1', 'broker2'])['isFast'].transform('count')
        a['fasterSum'] = a.groupby(['broker1', 'broker2'])['isFast'].transform('sum')
        a['faster'] = a['fasterSum']/a['count']
        a = a.groupby(['broker1', 'broker2'])[['count', 'faster']].mean().reset_index()
        a.columns = ['broker1', 'broker2', '%s count'%date, '%s faster'%date]
        summaryCols += ['%s count'%date, '%s faster'%date]
        countCols += ['%s count'%date]
        fasterCols += ['%s faster'%date]

        if summary.empty:
            summary = a.copy()
        else:
            summary = pd.merge(summary, a, how='outer', on=['broker1', 'broker2'], validate='one_to_one')

    # Totals over the whole window ('count' and 'faster' are constant within a pair, so mean() just collapses them).
    a = result.groupby(['broker1', 'broker2'])[['count', 'faster']].mean().reset_index()
    a.columns = ['broker1', 'broker2', 'total count', 'total faster']
    summary = pd.merge(summary, a, how='outer', on=['broker1', 'broker2'], validate='one_to_one')
    summaryCols = ['total count', 'total faster'] + summaryCols

    # Persist the numeric totals before the columns are turned into display strings.
    savePath = r'L:\orderLog\result\relativeSpeed'
    a = summary.groupby(['broker1', 'broker2'])[['total count', 'total faster']].first().reset_index()
    a.to_csv(os.path.join(savePath, 'relativeSpeedBroker1_%s_%s_%s.csv'%(startDate, endDate, exchg)), index=False)

    countCols += ['total count']
    fasterCols += ['total faster']
    summary = summary.groupby(['broker1', 'broker2'])[summaryCols].first().sort_values(by=['broker1', 'broker2'])
    for col in countCols:
        summary[col] = summary[col].fillna(0)
        summary[col] = summary[col].astype('int64')

    # Missing shares are filled with -1, rendered as '-100%', then blanked out.
    for col in fasterCols:
        summary[col] = summary[col].fillna(-1)
        summary[col] = summary[col].apply(lambda x: '%.0f%%'%(x*100))
        summary[col] = np.where(summary[col] == '-100%', ' ', summary[col])
    from IPython.display import display, HTML
    display(HTML(summary.iloc[:, :2].to_html()))

SZ


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,Unnamed: 1_level_0,total count,total faster
broker1,broker2,Unnamed: 2_level_1,Unnamed: 3_level_1
zs5206_5222,zs5206_5269,3630,47%
zs5206_5222,zs5206_5273,9989,48%
zs5206_5222,zs5209_5225,6097,5%
zs5206_5222,zs5209_5226,27656,3%
zs5206_5222,zs5209_5230,2384,9%
zs5206_5222,zs5209_5290,7688,8%
zs5206_5222,zs5401_5470,45,58%
zs5206_5222,zs5401_5474,71,45%
zs5206_5222,zs6401_6480,50,98%
zs5206_5222,zs6601_6631,1,0%


In [30]:
# Tag each side of the pair with a short label (first two characters of the
# server part + first two characters of the account part), then show the
# zs88-vs-zs52 rows, largest 'total count' first.
kk = summary.reset_index()
kk['b1'] = kk['broker1'].map(lambda x: x.split('_')[0][:2] + x.split('_')[1][:2])
kk['b2'] = kk['broker2'].map(lambda x: x.split('_')[0][:2] + x.split('_')[1][:2])
display(
    kk.loc[(kk['b1'] == 'zs88') & (kk['b2'].isin(['zs52']))]
    .sort_values(by='total count', ascending=False)
)

Unnamed: 0,broker1,broker2,total count,total faster,20200914 count,20200914 faster,20200915 count,20200915 faster,20200916 count,20200916 faster,20200917 count,20200917 faster,20200918 count,20200918 faster,20200921 count,20200921 faster,20200922 count,20200922 faster,20200923 count,20200923 faster,20200924 count,20200924 faster,20200925 count,20200925 faster,20200928 count,20200928 faster,20200929 count,20200929 faster,20200930 count,20200930 faster,b1,b2
467,zs8804_8865,zs5206_5287,6859,1%,543,1%,526,1%,605,0%,580,0%,436,1%,566,1%,620,0%,569,1%,664,2%,546,1%,445,2%,429,1%,330,1%,zs88,zs52
451,zs8804_8824,zs5208_5286,5479,0%,1060,0%,971,0%,907,0%,903,0%,981,1%,657,1%,0,,0,,0,,0,,0,,0,,0,,zs88,zs52
466,zs8804_8865,zs5206_5275,3325,0%,252,1%,128,1%,302,0%,212,0%,317,0%,264,0%,353,0%,262,0%,255,0%,385,1%,228,1%,168,0%,199,1%,zs88,zs52
453,zs8804_8824,zs5209_5289,3258,0%,490,0%,552,0%,537,0%,668,0%,611,0%,400,0%,0,,0,,0,,0,,0,,0,,0,,zs88,zs52
450,zs8804_8824,zs5208_5276,2669,0%,407,0%,486,0%,480,0%,503,0%,518,1%,275,1%,0,,0,,0,,0,,0,,0,,0,,zs88,zs52
454,zs8804_8824,zs5209_5291,827,0%,176,0%,104,0%,153,1%,133,1%,124,0%,137,1%,0,,0,,0,,0,,0,,0,,0,,zs88,zs52
468,zs8804_8865,zs5208_5276,4,0%,0,,0,,0,,2,0%,0,,0,,0,,0,,0,,1,0%,1,0%,0,,0,,zs88,zs52
470,zs8804_8865,zs5208_5284,4,0%,0,,0,,0,,2,0%,0,,0,,0,,0,,0,,1,0%,1,0%,0,,0,,zs88,zs52
465,zs8804_8865,zs5206_5273,3,0%,0,,0,,0,,1,0%,0,,0,,0,,0,,0,,1,0%,1,0%,0,,0,,zs88,zs52
469,zs8804_8865,zs5208_5281,2,0%,0,,0,,0,,0,,0,,0,,0,,0,,0,,1,0%,1,0%,0,,0,,zs88,zs52


In [16]:
# Number of updateType == 0 rows per account on colo 'zs_88_04'.
(
    orderLog
    .loc[(orderLog['colo'] == 'zs_88_04') & (orderLog['updateType'] == 0)]
    .groupby(['accCode'])['date']
    .size()
)

accCode
8824      21693
8865      31901
8924      22661
8967      30744
896702    14716
Name: date, dtype: int64

In [12]:
# Distinct account codes seen on colo 'zs_88_04'.
orderLog.loc[orderLog['colo'] == 'zs_88_04', 'accCode'].unique()

array([  8967,   8824, 896702,   8865,   8924], dtype=int64)

In [10]:
# updateType == 0 row counts on SZE for accounts with accCode // 100 == 89,
# broken down by colo, account and sta.
(
    orderLog
    .loc[
        (orderLog['updateType'] == 0)
        & (orderLog['exchange'] == 'SZE')
        & (orderLog['accCode'] // 100 == 89)
    ]
    .groupby(['colo', 'accCode', 'sta'])['date']
    .size()
)

colo      accCode  sta   
zs_88_04  8924     staone      720
                   statwo    21941
          8967     staone     1363
                   statwo    22346
zt_88_03  8970     staone     1962
                   statwo    26447
          8971     staone     1700
                   statwo    12842
Name: date, dtype: int64

In [27]:
# Distinct accounts (accCode // 100 == 89) with updateType == 0 rows on SSE, per colo.
(
    orderLog
    .loc[
        (orderLog['accCode'] // 100 == 89)
        & (orderLog['exchange'] == 'SSE')
        & (orderLog['updateType'] == 0)
    ]
    .groupby('colo')['accCode']
    .unique()
)

colo
zs_88_04                [8967]
zt_88_03    [8970, 8971, 8943]
Name: accCode, dtype: object

In [43]:
# Daily updateType == 0 row counts on SSE for colos starting with 'zt_94', by sta and colo.
(
    orderLog
    .loc[
        (orderLog['colo'].str[:5] == 'zt_94')
        & (orderLog['updateType'] == 0)
        & (orderLog['exchange'] == 'SSE')
    ]
    .groupby(['date', 'sta', 'colo'])['date']
    .size()
)

date      sta     colo    
20200914  staone  zt_94_02    1623
20200915  staone  zt_94_02    1259
20200916  staone  zt_94_02    1547
20200917  staone  zt_94_02    1558
20200918  staone  zt_94_02    1450
20200921  staone  zt_94_02    5629
20200922  staone  zt_94_02    6813
20200923  staone  zt_94_02    6887
20200924  staone  zt_94_02    7644
20200925  staone  zt_94_02    7101
20200928  staone  zt_94_02    6133
20200929  staone  zt_94_02    5643
20200930  staone  zt_94_02    5419
Name: date, dtype: int64

In [24]:
import pymongo
import io
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
def DB(host, db_name, user, passwd):
    """Connect to MongoDB and return the handle for database ``db_name``.

    Parameters
    ----------
    host : str
        Host (optionally ``host:port``) placed verbatim in the connection URI.
    db_name : str
        Database to return; also used as ``authSource`` unless ``user`` is
        ``'admin'`` or ``'root'``, in which case ``authSource`` is ``'admin'``.
    user, passwd : str
        Plain (un-escaped) credentials.

    Returns
    -------
    The ``client[db_name]`` database object.
    """
    from urllib.parse import quote_plus  # local import keeps the cell self-contained

    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    # Credentials must be percent-escaped inside a MongoDB URI; otherwise a
    # password containing '@', ':', '/' or '%' corrupts the URI.
    url = 'mongodb://%s:%s@%s/?authSource=%s' % (quote_plus(user), quote_plus(passwd), host, auth_db)
    client = pymongo.MongoClient(url, maxPoolSize=None)
    db = client[db_name]
    return db

def read_memb_daily(db, name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, col=None, return_sdi=True):
    """Query collection ``db[name]`` and return the matching documents as a DataFrame.

    Parameters
    ----------
    db : mapping of collection name -> collection
        Object supporting ``db[name]``; the collection must offer ``find(query, projection)``.
    name : str
        Collection name.
    start_date, end_date : optional
        Inclusive bounds applied to the ``date`` field (``$gte`` / ``$lte``).
    skey, index_id, interval : list, optional
        When given, restrict the matching field to these values (``$in``).
    col : list of str, optional
        Fields to project. ``None`` returns every field except ``_id``.
    return_sdi : bool
        When ``col`` is given, also project ``skey``, ``date`` and ``interval``.

    Returns
    -------
    pandas.DataFrame
        Empty frame when nothing matches; otherwise the rows sorted by whichever
        of ``date``, ``index_id``, ``skey`` are present in the result.
    """
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date', 'interval'] + list(col)
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_id is not None:
        query['index_id'] = {'$in': index_id}
    if interval is not None:
        query['interval'] = {'$in': interval}
    date_cond = {}
    if start_date is not None:
        date_cond['$gte'] = start_date
    if end_date is not None:
        date_cond['$lte'] = end_date
    if date_cond:
        query['date'] = date_cond

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        return pd.DataFrame()

    # A projection can leave out some of the sort keys (the default `return_sdi`
    # set never contains 'index_id'); sort only by the keys actually returned
    # instead of raising KeyError.
    sort_keys = [key for key in ('date', 'index_id', 'skey') if key in df.columns]
    if sort_keys:
        df = df.sort_values(by=sort_keys)
    return df

import getpass
import os

database_name = 'com_md_eq_cn'
# Credentials are taken from the environment (with an interactive prompt as a
# fallback for the password) instead of being stored in the notebook.
# NOTE(review): the password that used to be hard-coded here has been exposed
# in the notebook history and should be rotated.
user = os.environ.get('MONGO_USER', 'zhenyuy')
password = os.environ.get('MONGO_PASSWORD') or getpass.getpass('MongoDB password for %s: ' % user)

pd.set_option('max_columns', 200)
db1 = DB("192.168.10.178", database_name, user, password)
# Rows of 'index_memb' whose date is exactly 20200914 (start and end bound are the same).
memb = read_memb_daily(db1, 'index_memb', 20200914, 20200914)

In [36]:
# secids that have updateType == 0 / SSE / 'statwo' rows on both
# colo 'zt_52_04' and colo 'zt_88_03'.
(
    set(
        orderLog.loc[
            (orderLog['colo'] == 'zt_52_04')
            & (orderLog['updateType'] == 0)
            & (orderLog['exchange'] == 'SSE')
            & (orderLog['sta'] == 'statwo'),
            'secid',
        ].unique()
    )
    & set(
        orderLog.loc[
            (orderLog['colo'] == 'zt_88_03')
            & (orderLog['updateType'] == 0)
            & (orderLog['exchange'] == 'SSE')
            & (orderLog['sta'] == 'statwo'),
            'secid',
        ].unique()
    )
)

{1600006,
 1600017,
 1600021,
 1600026,
 1600037,
 1600039,
 1600053,
 1600054,
 1600055,
 1600056,
 1600057,
 1600058,
 1600059,
 1600060,
 1600062,
 1600063,
 1600064,
 1600072,
 1600073,
 1600075,
 1600076,
 1600079,
 1600093,
 1600094,
 1600096,
 1600105,
 1600110,
 1600114,
 1600116,
 1600120,
 1600123,
 1600126,
 1600129,
 1600131,
 1600132,
 1600133,
 1600136,
 1600138,
 1600141,
 1600143,
 1600155,
 1600158,
 1600160,
 1600161,
 1600166,
 1600167,
 1600171,
 1600184,
 1600185,
 1600195,
 1600197,
 1600201,
 1600206,
 1600207,
 1600211,
 1600216,
 1600217,
 1600223,
 1600229,
 1600230,
 1600239,
 1600246,
 1600256,
 1600258,
 1600259,
 1600260,
 1600261,
 1600266,
 1600267,
 1600269,
 1600273,
 1600278,
 1600283,
 1600285,
 1600291,
 1600295,
 1600298,
 1600300,
 1600305,
 1600308,
 1600310,
 1600315,
 1600316,
 1600318,
 1600323,
 1600325,
 1600326,
 1600329,
 1600330,
 1600335,
 1600337,
 1600338,
 1600339,
 1600343,
 1600348,
 1600350,
 1600351,
 1600360,
 1600363,
 1600366,


In [23]:
# Allow up to 200 rows in the rendered output, then show daily updateType == 0
# row counts on SSE for colos starting with 'zt_52', by sta and colo.
pd.set_option('max_rows', 200)
(
    orderLog
    .loc[
        (orderLog['colo'].str[:5] == 'zt_52')
        & (orderLog['updateType'] == 0)
        & (orderLog['exchange'] == 'SSE')
    ]
    .groupby(['date', 'sta', 'colo'])['date']
    .size()
)

date      sta     colo    
20200914  staone  zt_52_01       85
                  zt_52_04      372
                  zt_52_05      447
                  zt_52_07      192
                  zt_52_10      292
          statwo  zt_52_04     3069
                  zt_52_05     2571
                  zt_52_07     2100
                  zt_52_10     3769
20200915  staone  zt_52_01      102
                  zt_52_04      394
                  zt_52_05      437
                  zt_52_07      130
                  zt_52_10      329
          statwo  zt_52_04     2225
                  zt_52_05     2191
                  zt_52_07     1422
                  zt_52_10     3047
20200916  staone  zt_52_01      125
                  zt_52_04      298
                  zt_52_05      406
                  zt_52_07       96
                  zt_52_10      192
          statwo  zt_52_04     2866
                  zt_52_05     2635
                  zt_52_07     1667
                  zt_52_10     3564
2

In [13]:
# Daily updateType == 0 row counts on SSE for colos starting with 'zt_88', by sta and colo.
(
    orderLog
    .loc[
        (orderLog['colo'].str[:5] == 'zt_88')
        & (orderLog['updateType'] == 0)
        & (orderLog['exchange'] == 'SSE')
    ]
    .groupby(['date', 'sta', 'colo'])['date']
    .size()
)

date      sta     colo    
20200914  staone  zt_88_02    2147
                  zt_88_03     321
          statwo  zt_88_03    5774
20200915  staone  zt_88_02    1936
                  zt_88_03     262
          statwo  zt_88_03    5685
20200916  staone  zt_88_02    1871
                  zt_88_03     340
          statwo  zt_88_03    5914
20200917  staone  zt_88_02    2265
                  zt_88_03     283
          statwo  zt_88_03    7709
20200918  staone  zt_88_02    2634
                  zt_88_03     306
          statwo  zt_88_03    7270
20200921  staone  zt_88_02    1980
                  zt_88_03     312
          statwo  zt_88_03    6013
20200922  staone  zt_88_02    2041
                  zt_88_03     479
          statwo  zt_88_03    7229
20200923  staone  zt_88_02    1908
                  zt_88_03     434
          statwo  zt_88_03    6433
20200924  staone  zt_88_02    2351
                  zt_88_03     379
          statwo  zt_88_03    8269
20200925  staone  zt_88_02  

In [5]:
# fill rate
# Builds a per-broker fill-rate table for SZE rows with isMsg == 1 whose `ars`
# is not in the listed set, writes it to CSV and displays it as percentages.
# NOTE: this cell adds columns to the shared `orderLog` frame in place.

# Price = orderPrice * 100, rounded to 0 decimals.
orderLog['Price'] = orderLog['orderPrice'].apply(lambda x: round(x*100, 0))
# Elapsed clock since the first row of the same order.
orderLog['firstClock'] = orderLog.groupby(['order'])['clockAtArrival'].transform('first')
orderLog['clockDif'] = orderLog['clockAtArrival'] - orderLog['firstClock']
# isImmediate = 1 when clockDif is within a threshold that depends on venue/broker:
#   secid >= 2000000: 'zs_62' -> 1000000; 'zt_88'/'zt_89'/'zt_96' -> 50000; others -> 20000
#   secid <  2000000: 1000000
# (clock units are presumably microseconds -- TODO confirm)
orderLog['isImmediate'] = np.where(orderLog['secid'] >= 2000000,
                          np.where(orderLog['colo_broker'] == 'zs_62', 
                                   np.where(orderLog['clockDif'] <= 1000000, 1, 0),
                                   np.where(orderLog['colo_broker'].isin(['zt_88', 'zt_89', 'zt_96']),
                                           np.where(orderLog['clockDif'] <= 50000, 1, 0),
                                           np.where(orderLog['clockDif'] <= 20000, 1, 0))),
                          np.where(orderLog['clockDif'] <= 1000000, 1, 0))
# Order notional from the rounded Price (scaled back by 100).
orderLog['orderNtl'] = orderLog['Price'] * orderLog['absOrderSize'] / 100

# SZE triggered by mbd using staone
# Displays the share of updateType == 0 rows with secid >= 2000000 that have isMsg == 1.
display(orderLog[(orderLog["updateType"] == 0) & (orderLog["isMsg"] == 1) & (orderLog["secid"] >= 2000000)].shape[0] 
/ orderLog[(orderLog["updateType"] == 0) & (orderLog["secid"] >= 2000000)].shape[0])

# --- Immediate fill: only rows flagged isImmediate are considered here ---
checkData = orderLog[orderLog['isImmediate'] == 1].reset_index(drop=True)
checkData['exchange'] = np.where(checkData['secid'] >= 2000000, 'SZE', 'SSE')
# Largest cumulative fill reached per order within the immediate rows.
checkData['maxFilled'] = checkData.groupby(['order'])['absOrderSizeCumFilled'].transform('max')
checkData['immedFillNtl'] = checkData['maxFilled'] * checkData['Price'] / 100
# Keep only the updateType == 0 rows (the per-order maxFilled is already attached to them).
checkData = checkData[checkData['updateType'] == 0]

# Restrict to SZE, `ars` outside the listed codes, isMsg == 1.
checkData = checkData[(checkData["exchange"] == "SZE") & (~checkData["ars"].isin([121, 221, 321, 131, 231, 331]))& (checkData["isMsg"] == 1)]

# immedFillPerc = sum(immedFillNtl) / sum(orderNtl) per (exchange, colo_broker).
immedFillData = checkData.groupby(['exchange', 'colo_broker'])[['orderNtl', 'immedFillNtl']].sum().reset_index()
immedFillData = immedFillData.rename(columns={'colo_broker': 'broker'})
immedFillData['immedFillPerc'] = immedFillData['immedFillNtl'] / immedFillData['orderNtl']
immedFillData = immedFillData[['exchange', 'broker', 'immedFillPerc']]


# fillStatus: 0 = nothing filled, 1 = filled less than absOrderSize, 2 = otherwise.
checkData['fillStatus'] = np.where(checkData['maxFilled'] == 0, 0,
                          np.where(checkData['maxFilled'] < checkData['absOrderSize'], 1, 2))
statsData = checkData.groupby(['exchange', 'colo_broker', 'fillStatus'])['secid'].count().to_frame().reset_index()
statsData.columns = ['exchange', 'broker', 'fillStatus', 'count']
statsData = statsData.reset_index()
# '# of orders': total row count per exchange.
statsData['# of orders'] = statsData.groupby(['exchange'])['count'].transform('sum')
# '% of orders' first holds the per-(broker, exchange) count, which is the
# denominator for 'percent'; it is then converted into the broker's share of the exchange total.
statsData['% of orders'] = statsData.groupby(['broker', 'exchange'])['count'].transform('sum')
statsData['percent'] = statsData['count'] / statsData['% of orders']
statsData['% of orders'] = statsData['% of orders'] / statsData['# of orders']
# Split 'percent' into one table per fillStatus so they can be merged back as columns.
saveCols = ['broker', 'exchange', 'percent']
noFillData = statsData[statsData['fillStatus'] == 0][saveCols].reset_index(drop=True)
noFillData = noFillData.rename(columns={'percent': 'no fill'})
partialFillData = statsData[statsData['fillStatus'] == 1][saveCols].reset_index(drop=True)
partialFillData = partialFillData.rename(columns={'percent': 'partial fill'})
fullFillData = statsData[statsData['fillStatus'] == 2][saveCols].reset_index(drop=True)
fullFillData = fullFillData.rename(columns={'percent': 'full fill'})


# --- Overall fill rate: uses all rows of the selected orders, not just the immediate ones ---
# (this also adds/overwrites the 'exchange' column on the shared orderLog frame)
orderLog['exchange'] = np.where(orderLog['secid'] >= 2000000, 'SZE', 'SSE')

# orderLog = orderLog[((orderLog["ars"] != 21) & (orderLog["isMsg"] != 1) & (orderLog["exchange"] == "SSE"))|
#                      (orderLog["ars"] != 21) & (orderLog["exchange"] == "SZE")]
o1 = orderLog[(orderLog["exchange"] == "SZE") & (~orderLog["ars"].isin([121, 221, 321, 131, 231, 331]))& (orderLog["isMsg"] == 1)]
# o1 = orderLog[(orderLog["isMsg"] != 1) & (orderLog["exchange"] == "SSE") & (orderLog["ars"] != 21)]



# One row per order with the max of cumulative fill, order size and order price.
fillRateData = o1.groupby(['exchange', 'colo_broker', 'accCode', 'secid', 'order'])[['absOrderSizeCumFilled', 'absOrderSize', 'orderPrice']].max().reset_index()
fillRateData = fillRateData.rename(columns={'colo_broker': 'broker'})
fillRateData['orderNotional'] = fillRateData['absOrderSize']*fillRateData['orderPrice']
fillRateData['fillNotional'] = fillRateData['absOrderSizeCumFilled']*fillRateData['orderPrice']
# Group sums are broadcast onto every row ('fillNotional' is overwritten with its group sum),
# so fillPerc is constant within a (exchange, broker) group and mean() just collapses it.
fillRateData['totalNotional'] = fillRateData.groupby(['exchange', 'broker'])['orderNotional'].transform('sum')
fillRateData['fillNotional'] = fillRateData.groupby(['exchange', 'broker'])['fillNotional'].transform('sum')
fillRateData['fillPerc'] = fillRateData['fillNotional'] / fillRateData['totalNotional']
fillRateData = fillRateData.groupby(['exchange', 'broker'])['fillPerc'].mean().reset_index()

# Reduce statsData to one row per (exchange, broker) and attach every metric as a column.
statsData = statsData.groupby(['exchange', 'broker', '# of orders'])['% of orders'].first().reset_index()
statsData = pd.merge(statsData, noFillData, how='outer', on=['broker', 'exchange'], validate='one_to_one')
statsData = pd.merge(statsData, partialFillData, how='outer', on=['broker', 'exchange'], validate='one_to_one')
statsData = pd.merge(statsData, fullFillData, how='outer', on=['broker', 'exchange'], validate='one_to_one')
statsData = pd.merge(statsData, fillRateData, how='outer', on=['broker', 'exchange'], validate='one_to_one')
statsData = pd.merge(statsData, immedFillData, how='outer', on=['broker', 'exchange'], validate='one_to_one')


statsData = statsData.groupby(['exchange', '# of orders', 'broker'])[['% of orders', 'full fill', 'partial fill', 'no fill', 'fillPerc', 'immedFillPerc']].first()

# Save the numeric table before the columns are turned into display strings.
savePath = r'L:\orderLog\result\fillRate'
statsData.reset_index().to_csv(os.path.join(savePath, 'fillRate2_%s_%s.csv'%(startDate, endDate)), index=False)


from IPython.display import display, HTML
# Missing values become 0 and every metric is rendered as a whole-number percentage string.
for col in ['% of orders', 'no fill', 'partial fill', 'full fill', 'fillPerc', 'immedFillPerc']:
    statsData[col] = statsData[col].fillna(0)
    statsData[col] = statsData[col].apply(lambda x: '%.0f%%'%(x*100))
display(HTML(statsData.to_html()))

0.9962165503323894

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,% of orders,full fill,partial fill,no fill,fillPerc,immedFillPerc
exchange,# of orders,broker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SZE,432959,zs_52,20%,56%,6%,38%,81%,64%
SZE,432959,zs_53,0%,23%,5%,73%,39%,23%
SZE,432959,zs_54,15%,67%,5%,28%,81%,73%
SZE,432959,zs_62,5%,43%,3%,54%,50%,41%
SZE,432959,zs_64,10%,53%,3%,43%,66%,56%
SZE,432959,zs_66,6%,15%,2%,82%,53%,13%
SZE,432959,zs_88,0%,23%,4%,73%,44%,27%
SZE,432959,zs_89,1%,28%,4%,67%,53%,26%
SZE,432959,zs_92,4%,69%,1%,30%,76%,67%
SZE,432959,zs_94,4%,78%,2%,20%,90%,84%


In [6]:
# groupby (exchange, broker, isMsg):
# Same fill-rate table as the per-broker version, but for SSE rows whose `ars`
# is not in the listed set, with isMsg kept as an extra grouping key.
# Relies on 'isImmediate', 'Price' and 'orderNtl' already being present on orderLog.

# --- Immediate fill: only rows flagged isImmediate are considered here ---
checkData = orderLog[orderLog['isImmediate'] == 1].reset_index(drop=True)
checkData['exchange'] = np.where(checkData['secid'] >= 2000000, 'SZE', 'SSE')
# Largest cumulative fill reached per order within the immediate rows.
checkData['maxFilled'] = checkData.groupby(['order'])['absOrderSizeCumFilled'].transform('max')
checkData['immedFillNtl'] = checkData['maxFilled'] * checkData['Price'] / 100
# Keep only the updateType == 0 rows (the per-order maxFilled is already attached to them).
checkData = checkData[checkData['updateType'] == 0]


# Restrict to SSE and `ars` outside the listed codes.
checkData = checkData[(checkData["exchange"] == "SSE") & (~checkData["ars"].isin([121, 221, 321, 131, 231, 331]))]
# checkData = checkData[checkData["ars"] != 21]
# checkData = checkData[checkData["ars"] == 21]
# checkData = checkData[(checkData["isMsg"] == 1) & 
#                       (checkData["exchange"] == "SSE")]



# immedFillPerc = sum(immedFillNtl) / sum(orderNtl) per (exchange, colo_broker, isMsg).
immedFillData = checkData.groupby(['exchange', 'colo_broker', 'isMsg'])[['orderNtl', 'immedFillNtl']].sum().reset_index()
immedFillData = immedFillData.rename(columns={'colo_broker': 'broker'})
immedFillData['immedFillPerc'] = immedFillData['immedFillNtl'] / immedFillData['orderNtl']
immedFillData = immedFillData[['exchange', 'broker', 'immedFillPerc', 'isMsg']]


# fillStatus: 0 = nothing filled, 1 = filled less than absOrderSize, 2 = otherwise.
checkData['fillStatus'] = np.where(checkData['maxFilled'] == 0, 0,
                          np.where(checkData['maxFilled'] < checkData['absOrderSize'], 1, 2))
statsData = checkData.groupby(['exchange', 'colo_broker', 'isMsg', 'fillStatus'])['secid'].count().to_frame().reset_index()
statsData.columns = ['exchange', 'broker', 'isMsg', 'fillStatus', 'count']
statsData = statsData.reset_index()
# '# of orders': total row count per exchange.
statsData['# of orders'] = statsData.groupby(['exchange'])['count'].transform('sum')
# '% of orders' first holds the per-(broker, exchange, isMsg) count, which is the
# denominator for 'percent'; it is then converted into that group's share of the exchange total.
statsData['% of orders'] = statsData.groupby(['broker', 'exchange', "isMsg"])['count'].transform('sum')
statsData['percent'] = statsData['count'] / statsData['% of orders']
statsData['% of orders'] = statsData['% of orders'] / statsData['# of orders']
# Split 'percent' into one table per fillStatus so they can be merged back as columns.
saveCols = ['broker', 'exchange', 'isMsg', 'percent']
noFillData = statsData[statsData['fillStatus'] == 0][saveCols].reset_index(drop=True)
noFillData = noFillData.rename(columns={'percent': 'no fill'})
partialFillData = statsData[statsData['fillStatus'] == 1][saveCols].reset_index(drop=True)
partialFillData = partialFillData.rename(columns={'percent': 'partial fill'})
fullFillData = statsData[statsData['fillStatus'] == 2][saveCols].reset_index(drop=True)
fullFillData = fullFillData.rename(columns={'percent': 'full fill'})


# --- Overall fill rate: uses all rows of the selected orders, not just the immediate ones ---
# (this also adds/overwrites the 'exchange' column on the shared orderLog frame)
orderLog['exchange'] = np.where(orderLog['secid'] >= 2000000, 'SZE', 'SSE')

o1 = orderLog[(orderLog["exchange"] == "SSE") & (~orderLog['ars'].isin([121, 221, 321, 131, 231, 331]))]



# One row per (order, isMsg) with the max of cumulative fill, order size and order price.
fillRateData = o1.groupby(['exchange', 'colo_broker', 'accCode', 'secid', 'order', 'isMsg'])[['absOrderSizeCumFilled', 'absOrderSize', 'orderPrice']].max().reset_index()
fillRateData = fillRateData.rename(columns={'colo_broker': 'broker'})
fillRateData['orderNotional'] = fillRateData['absOrderSize']*fillRateData['orderPrice']
fillRateData['fillNotional'] = fillRateData['absOrderSizeCumFilled']*fillRateData['orderPrice']
# Group sums are broadcast onto every row ('fillNotional' is overwritten with its group sum),
# so fillPerc is constant within a (exchange, broker, isMsg) group and mean() just collapses it.
fillRateData['totalNotional'] = fillRateData.groupby(['exchange', 'broker', "isMsg"])['orderNotional'].transform('sum')
fillRateData['fillNotional'] = fillRateData.groupby(['exchange', 'broker', "isMsg"])['fillNotional'].transform('sum')
fillRateData['fillPerc'] = fillRateData['fillNotional'] / fillRateData['totalNotional']
fillRateData = fillRateData.groupby(['exchange', 'broker', "isMsg"])['fillPerc'].mean().reset_index()

# Reduce statsData to one row per (exchange, broker, isMsg) and attach every metric as a column.
statsData = statsData.groupby(['exchange', 'broker', 'isMsg', '# of orders'])['% of orders'].first().reset_index()
statsData = pd.merge(statsData, noFillData, how='outer', on=['broker', 'exchange', 'isMsg'], validate='one_to_one')
statsData = pd.merge(statsData, partialFillData, how='outer', on=['broker', 'exchange', 'isMsg'], validate='one_to_one')
statsData = pd.merge(statsData, fullFillData, how='outer', on=['broker', 'exchange', 'isMsg'], validate='one_to_one')
statsData = pd.merge(statsData, fillRateData, how='outer', on=['broker', 'exchange', 'isMsg'], validate='one_to_one')
statsData = pd.merge(statsData, immedFillData, how='outer', on=['broker', 'exchange', 'isMsg'], validate='one_to_one')


# statsData = statsData.groupby(['exchange', '# of orders', 'broker'])[['% of orders', 'full fill', 'partial fill', 'no fill', 'fillPerc', 'immedFillPerc']].first()
# Cast isMsg to int before it becomes part of the index.
statsData['isMsg'] = statsData['isMsg'].astype("int")
statsData = statsData.groupby(['exchange', '# of orders','broker', 'isMsg' ])[['% of orders', 'full fill', 'partial fill', 'no fill', 'fillPerc', 'immedFillPerc']].first()

# Save the numeric table before the columns are turned into display strings.
savePath = r'L:\orderLog\result\fillRate'
statsData.reset_index().to_csv(os.path.join(savePath, 'fillRate1_%s_%s.csv'%(startDate, endDate)), index=False)


from IPython.display import display, HTML
# Missing values become 0 and every metric is rendered as a whole-number percentage string.
for col in ['% of orders', 'no fill', 'partial fill', 'full fill', 'fillPerc', 'immedFillPerc']:
    statsData[col] = statsData[col].fillna(0)
    statsData[col] = statsData[col].apply(lambda x: '%.0f%%'%(x*100))
display(HTML(statsData.to_html()))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,% of orders,full fill,partial fill,no fill,fillPerc,immedFillPerc
exchange,# of orders,broker,isMsg,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
SSE,255197,zs_53,0,0%,44%,9%,47%,46%,39%
SSE,255197,zs_53,1,0%,60%,10%,30%,85%,66%
SSE,255197,zs_62,0,4%,19%,3%,77%,33%,17%
SSE,255197,zs_62,1,1%,45%,4%,51%,54%,44%
SSE,255197,zs_89,0,0%,34%,6%,60%,56%,30%
SSE,255197,zs_89,1,0%,45%,6%,49%,67%,45%
SSE,255197,zs_94,0,0%,34%,4%,62%,30%,20%
SSE,255197,zs_94,1,0%,67%,11%,22%,57%,46%
SSE,255197,zs_96,0,0%,0%,0%,100%,0%,0%
SSE,255197,zs_97,0,22%,6%,3%,91%,46%,12%


In [7]:
# Build the immediate-order sample that is later summarised by
# (exchange, broker, isMsg).
immediateMask = orderLog['isImmediate'] == 1
checkData = orderLog[immediateMask].reset_index(drop=True)

# secid >= 2000000 is labelled 'SZE'; every other secid is labelled 'SSE'.
checkData['exchange'] = np.where(checkData['secid'] >= 2000000, 'SZE', 'SSE')
# Largest cumulative filled size seen on any row of the same order.
checkData['maxFilled'] = checkData.groupby(['order'])['absOrderSizeCumFilled'].transform('max')
# NOTE(review): the /100 presumably rescales 'Price' to match 'orderNtl' - confirm.
checkData['immedFillNtl'] = checkData['maxFilled'] * checkData['Price'] / 100
# Keep only updateType == 0 rows (presumably one placement record per order - confirm).
checkData = checkData[checkData['updateType'] == 0]

# Restrict to the selected ars codes: all such SSE rows, but SZE rows only when
# isMsg == 1. SSE rows come first in the concatenated result.
arsInScope = checkData["ars"].isin([121, 221, 321, 131, 231, 331])
sseRows = checkData[arsInScope & (checkData["exchange"] == 'SSE')]
szeMsgRows = checkData[arsInScope & (checkData["exchange"] == 'SZE') & (checkData["isMsg"] == 1)]
checkData = pd.concat([sseRows, szeMsgRows])



# Immediate fill rate per (exchange, broker, isMsg): summed immediate-fill
# notional divided by summed order notional.
immedFillData = (
    checkData
    .groupby(['exchange', 'colo_broker', 'isMsg'])[['orderNtl', 'immedFillNtl']]
    .sum()
    .reset_index()
    .rename(columns={'colo_broker': 'broker'})
    .assign(immedFillPerc=lambda frame: frame['immedFillNtl'] / frame['orderNtl'])
    .loc[:, ['exchange', 'broker', 'immedFillPerc', 'isMsg']]
)


# Classify every row by how much of the order filled:
#   0 = nothing filled, 1 = filled less than the order size, 2 = everything else.
nothingFilled = checkData['maxFilled'] == 0
belowOrderSize = checkData['maxFilled'] < checkData['absOrderSize']
checkData['fillStatus'] = np.where(nothingFilled, 0, np.where(belowOrderSize, 1, 2))

# Row counts per (exchange, broker, isMsg, fillStatus).
statsData = (
    checkData
    .groupby(['exchange', 'colo_broker', 'isMsg', 'fillStatus'])['secid']
    .count()
    .to_frame()
    .reset_index()
)
statsData.columns = ['exchange', 'broker', 'isMsg', 'fillStatus', 'count']
# This second reset_index adds the old RangeIndex as an 'index' column.
statsData = statsData.reset_index()

# '# of orders'  : total count on the exchange
# '% of orders'  : the (broker, exchange, isMsg) bucket's share of that total
# 'percent'      : this fill status' share within its bucket
perExchange = statsData.groupby(['exchange'])['count'].transform('sum')
perBucket = statsData.groupby(['broker', 'exchange', "isMsg"])['count'].transform('sum')
statsData['# of orders'] = perExchange
statsData['% of orders'] = perBucket / perExchange
statsData['percent'] = statsData['count'] / perBucket

# One frame per fill status, with 'percent' renamed to the metric's display name.
saveCols = ['broker', 'exchange', 'isMsg', 'percent']
noFillData = (
    statsData.loc[statsData['fillStatus'] == 0, saveCols]
    .reset_index(drop=True)
    .rename(columns={'percent': 'no fill'})
)
partialFillData = (
    statsData.loc[statsData['fillStatus'] == 1, saveCols]
    .reset_index(drop=True)
    .rename(columns={'percent': 'partial fill'})
)
fullFillData = (
    statsData.loc[statsData['fillStatus'] == 2, saveCols]
    .reset_index(drop=True)
    .rename(columns={'percent': 'full fill'})
)


# Label the full order log by exchange (adds/overwrites the column on orderLog itself):
# secid >= 2000000 is 'SZE', everything else 'SSE'.
orderLog['exchange'] = np.where(orderLog['secid'] >= 2000000, 'SZE', 'SSE')

# Same ars-code filter as used for the immediate-order sample: all matching SSE
# rows, plus matching SZE rows only when isMsg == 1 (SSE rows first).
logArsInScope = orderLog["ars"].isin([121, 221, 321, 131, 231, 331])
o1 = pd.concat([
    orderLog[logArsInScope & (orderLog["exchange"] == 'SSE')],
    orderLog[logArsInScope & (orderLog["exchange"] == 'SZE') & (orderLog["isMsg"] == 1)],
])



# Notional-weighted fill rate per (exchange, broker, isMsg).
#
# Step 1: collapse the update-level rows of `o1` to one row per order, keeping
# the maximum cumulative filled size, order size and order price seen for it.
orderLevel = (
    o1.groupby(['exchange', 'colo_broker', 'accCode', 'secid', 'order', 'isMsg'])[
        ['absOrderSizeCumFilled', 'absOrderSize', 'orderPrice']
    ]
    .max()
    .reset_index()
    .rename(columns={'colo_broker': 'broker'})
)
orderLevel['orderNotional'] = orderLevel['absOrderSize'] * orderLevel['orderPrice']
orderLevel['fillNotional'] = orderLevel['absOrderSizeCumFilled'] * orderLevel['orderPrice']

# Step 2: sum both notionals once per bucket and divide. The previous version
# broadcast the group sums back onto every order row with transform('sum'),
# overwrote the per-order 'fillNotional' with the group total, and then took the
# mean of identical ratios; aggregating directly yields the same ratio without
# the redundant pass or the extra floating-point round trip.
fillRateData = (
    orderLevel.groupby(['exchange', 'broker', 'isMsg'])[['orderNotional', 'fillNotional']]
    .sum()
    .reset_index()
)
fillRateData['fillPerc'] = fillRateData['fillNotional'] / fillRateData['orderNotional']
# Downstream merges expect exactly these four columns.
fillRateData = fillRateData[['exchange', 'broker', 'isMsg', 'fillPerc']]

# Collapse the per-fill-status rows to one row per (exchange, broker, isMsg),
# keeping the '# of orders' total and the bucket's '% of orders' share.
statsData = (
    statsData
    .groupby(['exchange', 'broker', 'isMsg', '# of orders'])['% of orders']
    .first()
    .reset_index()
)

# Attach each metric table on the shared key. The outer join keeps buckets that
# are absent from some table; validate='one_to_one' makes pandas raise if either
# side carries duplicate keys.
mergeKeys = ['broker', 'exchange', 'isMsg']
for metricFrame in (noFillData, partialFillData, fullFillData, fillRateData, immedFillData):
    statsData = statsData.merge(metricFrame, how='outer', on=mergeKeys, validate='one_to_one')

# Cast the isMsg flag to int, then move the identifying columns into the index
# so only the metric columns remain as data columns.
statsData['isMsg'] = statsData['isMsg'].astype("int")
metricCols = ['% of orders', 'full fill', 'partial fill', 'no fill', 'fillPerc', 'immedFillPerc']
statsData = statsData.groupby(['exchange', '# of orders', 'broker', 'isMsg'])[metricCols].first()

# Write the numeric table to CSV, with the index levels flattened back into columns.
savePath = r'L:\orderLog\result\fillRate'
csvName = 'fillRate3_%s_%s.csv'%(startDate, endDate)
statsData.reset_index().to_csv(os.path.join(savePath, csvName), index=False)


from IPython.display import display, HTML
# Render as whole-percent strings, treating missing values as 0.
# Note: this overwrites the numeric metric columns of statsData with strings.
for col in ['% of orders', 'no fill', 'partial fill', 'full fill', 'fillPerc', 'immedFillPerc']:
    statsData[col] = statsData[col].fillna(0).apply(lambda x: '%.0f%%'%(x*100))
display(HTML(statsData.to_html()))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,% of orders,full fill,partial fill,no fill,fillPerc,immedFillPerc
exchange,# of orders,broker,isMsg,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
SSE,836735,zs_53,0,0%,33%,3%,64%,47%,31%
SSE,836735,zs_53,1,0%,65%,4%,31%,71%,56%
SSE,836735,zs_62,0,1%,13%,2%,85%,24%,13%
SSE,836735,zs_62,1,0%,34%,3%,62%,48%,36%
SSE,836735,zs_89,0,1%,37%,6%,57%,54%,36%
SSE,836735,zs_89,1,0%,57%,5%,38%,69%,52%
SSE,836735,zs_97,0,10%,37%,7%,55%,55%,40%
SSE,836735,zs_97,1,5%,46%,9%,44%,62%,49%
SSE,836735,zt_52,0,48%,53%,8%,39%,67%,60%
SSE,836735,zt_52,1,14%,57%,13%,30%,73%,67%
