In [1]:
import os
import glob
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('max_rows', 100)
pd.set_option('max_columns', 100)

perc = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]

In [2]:
startDate = '20200914'
endDate = '20200925'


readPath = r'\\192.168.10.30\Kevin_zhenyu\orderLog\equityTradeLogs'
dataPathLs = np.array(glob.glob(os.path.join(readPath, 'speedCompare***.csv')))
dateLs = np.array([os.path.basename(i).split('_')[1].split('.')[0] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
rawOrderLog = []
for thisDate, thisPath in zip(dateLs, dataPathLs):
    data = pd.read_csv(thisPath)
    data = data.rename(columns={'mdClockAtArrival': 'caamd'})
    rawOrderLog += [data]
rawOrderLog = pd.concat(rawOrderLog, sort=False)

for col in ['clockAtArrival', 'caamd', 'secid', 'updateType', 'vai', 'absFilledThisUpdate', 'orderDirection', 'absOrderSize',
            'absOrderSizeCumFilled', 'date', 'accCode', 'mse']:
    rawOrderLog[col] = rawOrderLog[col].astype('int64')   
rawOrderLog = rawOrderLog.sort_values(by=['date', 'secid', 'vai', 'accCode', 'clockAtArrival']).reset_index(drop=True)

rawOrderLog = rawOrderLog[rawOrderLog["secid"] >= 1000000]

rawOrderLog['clock'] = rawOrderLog['clockAtArrival'].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
rawOrderLog['broker'] = rawOrderLog['accCode'] // 100
rawOrderLog["broker"] = np.where(rawOrderLog["accCode"].astype(str).apply(lambda x: len(x) == 6), rawOrderLog['accCode'] // 10000, rawOrderLog["broker"])
rawOrderLog['colo_broker'] = rawOrderLog['colo'].str[:2] + '_' + rawOrderLog['broker'].astype('str')
rawOrderLog['order'] = rawOrderLog.groupby(['date', 'accCode', 'secid', 'vai']).grouper.group_info[0]
rawOrderLog['group'] = rawOrderLog.groupby(['date', 'secid', 'vai']).grouper.group_info[0]
rawOrderLog['startClock'] = rawOrderLog.groupby(['order'])['clockAtArrival'].transform('first')
rawOrderLog['duration'] = rawOrderLog['clockAtArrival'] - rawOrderLog['startClock']
rawOrderLog['orderPrice'] = rawOrderLog['orderPrice'].apply(lambda x: round(x, 2))
rawOrderLog['tradePrice'] = rawOrderLog['tradePrice'].apply(lambda x: round(x, 2))
rawOrderLog['orderDirection1'] = np.where(rawOrderLog["orderDirection"] == -2, -1, np.where(
    rawOrderLog["orderDirection"] == 2, 1, rawOrderLog["orderDirection"]))
orderLog = rawOrderLog.copy()

### Assertion 1:  make sure same direction in same date, secid, vai
print('=======================================================================================')
print('1. same date, secid, vai: same direction')
orderLog['directNum'] = orderLog.groupby(['date', 'secid', 'vai'])['orderDirection1'].transform('nunique')
if len(orderLog[orderLog['directNum'] != 1]) > 0:
    print('opposite direction for same date, same secid, same vai')
    display(orderLog[(orderLog['directNum'] != 1) & (orderLog['updateType'] == 0)][['date', 'accCode', 'secid', 'vai', 'orderDirection', 'order']])
    orderLog = orderLog[orderLog['directNum'] == 1]

assert((orderLog.groupby(['date', 'secid', 'vai'])['orderDirection1'].nunique() == 1).all() == True)

## Assertion 2:  make sure each account, secid, vai only has one insertion
print('=======================================================================================')
print('2. same date, secid, vai, accCode: one insertion')
a = orderLog[orderLog['updateType'] == 0].groupby(['date', 'accCode', 'secid', 'vai', 'order'])['clockAtArrival'].count()
if len(a[a > 1]) > 0:
    print('more than one insertion at same time')
    a = a[a>1].reset_index()
    display(a)
    orderLog = orderLog[~(orderLog['order'].isin(a['order'].unique()))]

orderLog['isMsg'] = np.where(orderLog['updateType'] == 0, 
                             np.where(orderLog['mse'] == 100, 1, 0), np.nan)
orderLog['isMsg'] = orderLog.groupby(['order'])['isMsg'].ffill()

placeSZE = orderLog[(orderLog['secid'] >= 2000000) & (orderLog['updateType'] == 0)]
print('%.2f%% SZE orders triggered by msg data'%(placeSZE[placeSZE['isMsg'] == 1].shape[0]/placeSZE.shape[0]*100))


### Assertion 3:  check IPO stocks selling status
print('=======================================================================================')
print('3. IPO stocks selling (ars = 301, 302)')
if orderLog[orderLog['ars'].isin([301, 302])].shape[0] != 0:
    kk = orderLog[orderLog['ars'].isin([301, 302])]
    print(kk)
    try:
        assert(kk[kk['orderDirection1'] == 1].shape[0] == 0)
        print('we only sell, never buy')
    except:
        print('There are IPO buy side orders!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        print(kk[kk['orderDirection1'] == 1])
    kk1 = kk[kk['updateType'] == 0]
    kk1 = kk1.sort_values(by=['accCode', 'secid','clockAtArrival'])
    kk1['diff'] = kk1.groupby(['accCode', 'secid'])['clockAtArrival'].apply(lambda x: x-x.shift(1))
    kk1['diff'] = kk1['diff'].fillna(0)
    try:
        assert(kk1[kk1['diff'] < 10e6].shape[0] == 0)
        print('for each stock in the same account, there is no insertion within 10 seconds of the previous insertion')
    except:
        print('There are insertion within 10 seconds for orders under same account same stock!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        print(kk1[kk1['diff'] < 10e6])
    kk2 = kk[(kk['updateType'] == 1)]
    try:
        assert(kk2[kk2['duration'] < 3e6].shape[0] == 0)
        print('for each stock in the same account, the cancellation of an order happens more than 3 seconds after the insertion')
    except:
        print('There are cancellation within 3 seconds for orders under same account same stock!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        print(kk2[kk2['duration'] < 3e6])


### Assertion 4: check updateType == 7 orders, make sure updateType == 7 orders < 20 per account, < 100 in total
print('=======================================================================================')
print('4. updateType 7 orders')
if orderLog[orderLog['updateType'] == 7].shape[0] != 0:
    assert(orderLog[orderLog['updateType'] == 7].groupby('accCode')['order'].nunique().max() < 20)
    assert(orderLog[orderLog['updateType'] == 7].groupby('accCode')['order'].nunique().sum() < 100)

### Assertion 5: check updateType == 6 orders, make sure updateType == 6 orders < 5% per account
print('=======================================================================================')
print('5. updateType 6 orders')
k1 = orderLog[orderLog['updateType'] == 6].groupby('accCode')['order'].nunique().reset_index()
k2 = orderLog.groupby('accCode')['order'].nunique().reset_index()
k = pd.merge(k1, k2, on='accCode', how='left')
k['prob'] = k['order_x']/k['order_y']
try:
    assert(sum(k['prob'] >= 0.05) == 0)
except:
    print('There are accounts with more than 5% updateType 6 orders!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    print(k[k['prob'] >= 0.05])

### Assertion 6: check CYB orders, make sure CYB stocks total absOrderSize < 30w
print('=======================================================================================')
print('6. CYB stocks total order size < 30w')
try:
    assert(orderLog[(orderLog['secid'] >= 2300000) & (orderLog['updateType'] == 0)]['absOrderSize'].max() <= 300000)
except:
    print('CYB stocks total absOrderSize >= 30w!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    
     
### Assertion 7:  make sure there is no unexpected updateType 
print('=======================================================================================')
print('7. unexpected updateType')
def getTuple(x):
    return tuple(i for i in x)

checkLog = orderLog[~((orderLog['updateType'] == 4) & (orderLog.groupby(['order'])['updateType'].shift(-1) == 4))]
checkLog = checkLog.groupby(['order'])['updateType'].apply(lambda x: getTuple(x)).reset_index()
checkLog['status'] = np.where(checkLog['updateType'].isin([(0, 2, 4), (0, 2, 1, 4), (0, 2, 1, 2, 4), (0, 2, 4, 1, 4), (0, 4), (0, 1, 4), (0, 4, 1, 4), (0, 2, 2, 4), (0, 4, 2, 4), (0, 2, 2, 1, 4), (0, 2, 2, 4, 1, 4)]),0,
                     np.where(checkLog['updateType'].isin([(0, 2, 4, 1, 3), (0, 2, 4, 1, 4, 3), (0, 2, 1, 4, 3), (0, 4, 1, 3), (0, 1, 4, 3),
                                                               (0, 2, 2, 4, 1, 3), (0, 2, 2, 4, 1, 4, 3), (0, 2, 2, 1, 4, 3), (0, 4, 2, 4, 1, 3),
                                                               (0, 4, 2, 1, 3), (0, 4, 1, 4, 3), (0, 4, 1)]), 1,
                     np.where(checkLog['updateType'].isin([(0, 2, 1, 3), (0, 2, 2, 1, 3), (0, 2, 3), (0, 3), (0, 1, 3), (0, ), (0, 2), (0, 2, 1), (0, 2, 2)]), 2, 3)))

orderLog = pd.merge(orderLog, checkLog[['order', 'status']], how='left', on=['order'], validate='many_to_one')
orderLog = orderLog[orderLog['status'].isin([0, 1, 2])].reset_index(drop=True)

### Assertion 8:  make sure status==0 got all traded
print('=======================================================================================')
print('8. status == 0: all traded')
a = orderLog[orderLog['status'] == 0]
a = a.groupby(['order'])[['absOrderSizeCumFilled', 'absOrderSize']].max().reset_index()
a.columns = ['order', 'filled', 'total']
print('in total trade, any fill != total cases')
display(a[a['filled'] != a['total']])
if a[a['filled'] != a['total']].shape[0] > 0:
    removeOrderLs = a[a['filled'] != a['total']]['order'].unique()
    orderLog = orderLog[~(orderLog['order'].isin(removeOrderLs))]
    
### Assertion 9:  make sure status==1 got partial traded
print('=======================================================================================')
print('9. status == 1: partial traded')
a = orderLog[orderLog['status'] == 1]
a = a.groupby(['order'])[['absOrderSizeCumFilled', 'absOrderSize']].max().reset_index()
a.columns = ['order', 'filled', 'total']
print('in partial trade, any fill >= total or fill is 0 cases for updateType 4')
display(a[(a['filled'] >= a['total']) | (a['filled'] == 0)])
if a[(a['filled'] >= a['total']) | (a['filled'] == 0)].shape[0] > 0:
    removeOrderLs = a[(a['filled'] >= a['total']) | (a['filled'] == 0)]['order'].unique()
    orderLog = orderLog[~(orderLog['order'].isin(removeOrderLs))]
    
### Assertion 10: make sure no cancellation within 1 sec
print('=======================================================================================')
print('10. no cancellation within 1 sec')
a = orderLog[(orderLog['updateType'] == 1) & (orderLog['duration'] < 1e6)]
print('any cancellation within 1 sec')
display(a)
if a.shape[0] > 0:
    removeOrderLs = a['order'].unique()
    orderLog = orderLog[~(orderLog['order'].isin(removeOrderLs))]


### Assertion 11: make sure no order has shares > 80w or notional > 800w
print('=======================================================================================')
print('11. Orders with size > 80w or notional > 800w')
orderLog['orderNtl'] = orderLog['absOrderSize'] * orderLog['orderPrice']
if orderLog[orderLog['absOrderSize'] > 800000].shape[0] > 0:
    print('some order quantity are > 80w')
    print(orderLog[orderLog['absOrderSize'] > 800000].groupby(['colo', 'accCode'])['order'].nunique())
    display(orderLog[orderLog['absOrderSize'] > 800000][['date', 'accCode', 'secid', 'vai', 'absOrderSize', 'orderPrice',
                                                         'orderNtl', 'orderDirection', 'clock', 'order']])
            
if orderLog[orderLog['orderNtl'] > 8000000].shape[0] > 0:
    print('some order ntl are > 800w')
    print(orderLog[orderLog['orderNtl'] > 8000000].groupby(['colo', 'accCode'])['order'].nunique())
    display(orderLog[orderLog['orderNtl'] > 8000000][['date', 'accCode', 'secid', 'vai', 'absOrderSize', 'orderPrice',
                                                      'orderNtl', 'orderDirection', 'clock', 'order', "updateType", 
                                                      "tradePrice", "absOrderSizeCumFilled", "absFilledThisUpdate"]])

removeOrderLs = list(set(orderLog[orderLog['absOrderSize'] > 800000]['order'].unique()) | set(orderLog[orderLog['orderNtl'] > 8000000]['order'].unique()))
orderLog = orderLog[~(orderLog['order'].isin(removeOrderLs))]


orderLog = orderLog.sort_values(by=['date', 'secid', 'vai', 'accCode', 'clockAtArrival']).reset_index(drop=True)

orderLog['exchange'] = np.where(orderLog['secid'] >= 2000000, 'SZE', 'SSE')
orderLog['orderNtl'] = orderLog['orderPrice'] * orderLog['absOrderSize']
orderLog['tradeNtl'] = np.where(orderLog['updateType'] == 4, orderLog['tradePrice']*orderLog['absFilledThisUpdate'], 0)
orderLog["ars"] = orderLog.groupby(['order'])['ars'].transform('first')
orderLog['sta'] = np.where(orderLog['ars'].isin([121, 221, 321, 131, 231, 331]), 'statwo', 'staone')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


1. same date, secid, vai: same direction
opposite direction for same date, same secid, same vai


Unnamed: 0,date,accCode,secid,vai,orderDirection,order
56323,20200914,8854,1603008,8905434,-1,30680
56326,20200914,528701,1603008,8905434,1,62141
70068,20200914,8854,1603298,2761528,-1,30710
70071,20200914,9551,1603298,2761528,1,41692
194557,20200914,5456,2002880,774305,1,19871
194561,20200914,6623,2002880,774305,1,26424
194565,20200914,6627,2002880,774305,1,27361
194568,20200914,6683,2002880,774305,1,28851
194572,20200914,8854,2002880,774305,-1,31101
400994,20200915,6683,2002298,7161301,1,94504


2. same date, secid, vai, accCode: one insertion
more than one insertion at same time


Unnamed: 0,date,accCode,secid,vai,order,clockAtArrival
0,20200914,5273,2300550,3100,7203,2
1,20200914,6683,2000688,1852523,27948,2
2,20200914,6683,2300723,1201900,29521,2
3,20200914,8854,2000688,2980019,30834,2
4,20200914,8854,2002961,556897,31125,2
...,...,...,...,...,...,...
479,20200925,975602,1603192,445733,711392,2
480,20200925,975602,1603297,462866,711541,2
481,20200925,975602,1603322,517500,711587,2
482,20200925,975602,1603722,1671500,711965,2


99.65% SZE orders triggered by msg data
3. IPO stocks selling (ars = 301, 302)
4. updateType 7 orders
5. updateType 6 orders
6. CYB stocks total order size < 30w
7. unexpected updateType
8. status == 0: all traded
in total trade, any fill != total cases


Unnamed: 0,order,filled,total
7429,9735,300,400
8203,10691,6200,14600
9187,11799,90000,143700
10058,12769,2200,2300
15319,18984,3400,32200
...,...,...,...
508182,704463,86,4700
508423,704862,15200,17100
510405,707939,9800,12500
513026,711885,100,400


9. status == 1: partial traded
in partial trade, any fill >= total or fill is 0 cases for updateType 4


Unnamed: 0,order,filled,total


10. no cancellation within 1 sec
any cancellation within 1 sec


Unnamed: 0.1,Unnamed: 0,clockAtArrival,caamd,secid,updateType,vai,ars,absFilledThisUpdate,orderDirection,absOrderSize,absOrderSizeCumFilled,orderPrice,tradePrice,date,accCode,mse,colo,orderSysId,internalId,tradeId,sdd,aaa,ApplSeqNum,mrm,mta,mrsb,mrss,mrv,mrb100,mra100,l4tr,clock,broker,colo_broker,order,group,startClock,duration,orderDirection1,directNum,isMsg,status


11. Orders with size > 80w or notional > 800w


In [7]:
checkLog = orderLog[(orderLog["updateType"] == 0)]
checkLog = checkLog[checkLog['caamd'] != 0]
checkLog['internal_latency'] = checkLog["clockAtArrival"] - checkLog["caamd"]
checkLog = checkLog[checkLog['sta'] == 'statwo']
SZE = checkLog[checkLog['secid'] >= 2000000]
SSE = checkLog[checkLog['secid'] < 2000000]
SZE["exchange"] = "SZ"
SSE["exchange"] = "SH"

c1 = SSE.groupby(['colo', 'exchange', 'accCode', "date"])["internal_latency"].quantile(.95).reset_index().groupby(['colo', 'accCode', 'exchange'])["internal_latency"].mean().reset_index()
c2 = SSE.groupby(['colo', 'exchange', 'accCode', "date"])["internal_latency"].median().reset_index().groupby(['colo', 'accCode', 'exchange'])["internal_latency"].mean().reset_index()
c3 = SSE.groupby(['colo', 'accCode', 'exchange'])["internal_latency"].count().reset_index()
c4 = SSE.groupby(['colo', 'accCode', 'exchange'])["date"].unique().str.len().reset_index()
c5 = SSE.groupby(['colo', 'exchange', 'accCode',  "date"])["internal_latency"].quantile(.95).reset_index().groupby(['colo', 'accCode', 'exchange'])["internal_latency"].std().reset_index()

re2 = pd.merge(c3, c1, on=['colo', 'accCode', 'exchange'])
re2 = re2.rename(columns = {'internal_latency_x': 'count', 'internal_latency_y': '95 percentile'})
re2 = pd.merge(re2, c2, on=['colo', 'accCode', 'exchange'])
re2 = re2.rename(columns = {'internal_latency': 'median'})
re2 = pd.merge(re2, c4, on=['colo', 'accCode',  'exchange'])
re2 = pd.merge(re2, c5, on=['colo', 'accCode', 'exchange'])
re2 = re2.rename(columns = {'internal_latency': 'std'})
re2

c1 = SZE.groupby(['colo', 'exchange', 'accCode', "date"])["internal_latency"].quantile(.95).reset_index().groupby(['colo', 'accCode', 'exchange'])["internal_latency"].mean().reset_index()
c2 = SZE.groupby(['colo', 'exchange', 'accCode', "date"])["internal_latency"].median().reset_index().groupby(['colo', 'accCode', 'exchange'])["internal_latency"].mean().reset_index()
c3 = SZE.groupby(['colo', 'accCode', 'exchange'])["internal_latency"].count().reset_index()
c4 = SZE.groupby(['colo', 'accCode', 'exchange'])["date"].unique().str.len().reset_index()
c5 = SZE.groupby(['colo', 'exchange',  'accCode', "date"])["internal_latency"].quantile(.95).reset_index().groupby(['colo', 'accCode', 'exchange'])["internal_latency"].std().reset_index()

re1 = pd.merge(c3, c1, on=['colo', 'accCode', 'exchange'])
re1 = re1.rename(columns = {'internal_latency_x': 'count', 'internal_latency_y': '95 percentile'})
re1 = pd.merge(re1, c2, on=['colo', 'accCode', 'exchange'])
re1 = re1.rename(columns = {'internal_latency': 'median'})
re1 = pd.merge(re1, c4, on=['colo', 'accCode', 'exchange'])
re1 = pd.merge(re1, c5, on=['colo', 'accCode', 'exchange'])
re1 = re1.rename(columns = {'internal_latency': 'std'})
re1


# re1 = pd.merge(re1[re1["isMsg"] == 1], re1[re1["isMsg"] == 0], on=["exchange", "colo_account"], how="outer").sort_values(by="median_x").reset_index(drop=True)
# re2 = pd.merge(re2[re2["isMsg"] == 1], re2[re2["isMsg"] == 0], on=["exchange", "colo_account"], how="outer").sort_values(by="median_y").reset_index(drop=True)
re = pd.concat([re1, re2]).reset_index(drop=True)


for col in ['median', '95 percentile']:
    re[col] = re[col].astype(int)
for col in ['std']:
    re[col] = re[col].apply(lambda x: '%.2f'%(x))
    
    
from IPython.display import display, HTML
display(HTML(re.groupby(['exchange', "colo", 'accCode'])["count", "median", "95 percentile", "std"].first().to_html()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,median,95 percentile,std
exchange,colo,accCode,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SH,zs_88_04,8967,1467,62,126,8.41
SH,zs_96_06,9765,2386,122,695,1028.26
SH,zs_96_06,975602,7055,128,920,1555.78
SH,zs_96_08,6282,3477,108,194,14.8
SH,zs_96_08,9741,4808,102,184,5.79
SH,zs_96_08,9754,7921,112,210,11.26
SH,zt_52_04,522201,2324,83,165,12.07
SH,zt_52_04,522601,5474,83,175,13.91
SH,zt_52_04,526901,9083,79,162,10.75
SH,zt_52_04,527301,5403,82,177,22.37


In [35]:
checkLog = orderLog[(orderLog["updateType"] == 0)]
checkLog = checkLog[checkLog['caamd'] != 0]
checkLog['internal_latency'] = checkLog["clockAtArrival"] - checkLog["caamd"]
checkLog = checkLog[checkLog['sta'] == 'statwo']
SZE = checkLog[checkLog['secid'] >= 2000000]
SSE = checkLog[checkLog['secid'] < 2000000]
SZE["exchange"] = "SZ"
SSE["exchange"] = "SH"

c1 = SSE.groupby(['colo', 'exchange', 'colo_broker', "date"])["internal_latency"].quantile(.95).reset_index().groupby(['colo', 'colo_broker', 'exchange'])["internal_latency"].mean().reset_index()
c2 = SSE.groupby(['colo', 'exchange', 'colo_broker', "date"])["internal_latency"].median().reset_index().groupby(['colo', 'colo_broker', 'exchange'])["internal_latency"].mean().reset_index()
c3 = SSE.groupby(['colo', 'colo_broker', 'exchange'])["internal_latency"].count().reset_index()
c4 = SSE.groupby(['colo', 'colo_broker', 'exchange'])["date"].unique().str.len().reset_index()
c5 = SSE.groupby(['colo', 'exchange', 'colo_broker',  "date"])["internal_latency"].quantile(.95).reset_index().groupby(['colo', 'colo_broker', 'exchange'])["internal_latency"].std().reset_index()

re2 = pd.merge(c3, c1, on=['colo', 'colo_broker', 'exchange'])
re2 = re2.rename(columns = {'internal_latency_x': 'count', 'internal_latency_y': '95 percentile'})
re2 = pd.merge(re2, c2, on=['colo', 'colo_broker', 'exchange'])
re2 = re2.rename(columns = {'internal_latency': 'median'})
re2 = pd.merge(re2, c4, on=['colo', 'colo_broker',  'exchange'])
re2 = pd.merge(re2, c5, on=['colo', 'colo_broker', 'exchange'])
re2 = re2.rename(columns = {'internal_latency': 'std'})
re2

c1 = SZE.groupby(['colo', 'exchange', 'colo_broker', "date"])["internal_latency"].quantile(.95).reset_index().groupby(['colo', 'colo_broker', 'exchange'])["internal_latency"].mean().reset_index()
c2 = SZE.groupby(['colo', 'exchange', 'colo_broker', "date"])["internal_latency"].median().reset_index().groupby(['colo', 'colo_broker', 'exchange'])["internal_latency"].mean().reset_index()
c3 = SZE.groupby(['colo', 'colo_broker', 'exchange'])["internal_latency"].count().reset_index()
c4 = SZE.groupby(['colo', 'colo_broker', 'exchange'])["date"].unique().str.len().reset_index()
c5 = SZE.groupby(['colo', 'exchange',  'colo_broker', "date"])["internal_latency"].quantile(.95).reset_index().groupby(['colo', 'colo_broker', 'exchange'])["internal_latency"].std().reset_index()

re1 = pd.merge(c3, c1, on=['colo', 'colo_broker', 'exchange'])
re1 = re1.rename(columns = {'internal_latency_x': 'count', 'internal_latency_y': '95 percentile'})
re1 = pd.merge(re1, c2, on=['colo', 'colo_broker', 'exchange'])
re1 = re1.rename(columns = {'internal_latency': 'median'})
re1 = pd.merge(re1, c4, on=['colo', 'colo_broker', 'exchange'])
re1 = pd.merge(re1, c5, on=['colo', 'colo_broker', 'exchange'])
re1 = re1.rename(columns = {'internal_latency': 'std'})
re1


# re1 = pd.merge(re1[re1["isMsg"] == 1], re1[re1["isMsg"] == 0], on=["exchange", "colo_account"], how="outer").sort_values(by="median_x").reset_index(drop=True)
# re2 = pd.merge(re2[re2["isMsg"] == 1], re2[re2["isMsg"] == 0], on=["exchange", "colo_account"], how="outer").sort_values(by="median_y").reset_index(drop=True)
re = pd.concat([re1, re2]).reset_index(drop=True)


for col in ['median', '95 percentile']:
    re[col] = re[col].astype(int)
for col in ['std']:
    re[col] = re[col].apply(lambda x: '%.2f'%(x))
    
    
from IPython.display import display, HTML
add = checkLog.groupby(['exchange', 'colo_broker', 'colo', 'date'])['secid'].nunique().reset_index().groupby(['exchange', 'colo_broker', 'colo'])['secid'].mean().astype(int).reset_index()
add = add.rename(columns={'secid':'# of stocks'})
add['exchange'] = np.where(add['exchange'] == 'SSE', 'SH', 'SZ')
re = pd.merge(re, add, on=['exchange', 'colo_broker', 'colo'])
display(HTML(re.groupby(['exchange', 'colo_broker', "colo"])["# of stocks", "count", "median", "95 percentile", "std"].first().to_html()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,# of stocks,count,median,95 percentile,std
exchange,colo_broker,colo,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
SH,zs_62,zs_96_08,85,3477,108,194,14.8
SH,zs_89,zs_88_04,60,1467,62,126,8.41
SH,zs_97,zs_96_06,330,9441,127,876,1443.66
SH,zs_97,zs_96_08,330,12729,109,200,9.77
SH,zt_52,zt_52_04,346,29619,80,168,10.53
SH,zt_52,zt_52_05,573,26296,96,166,6.75
SH,zt_52,zt_52_07,374,17423,86,154,5.0
SH,zt_52,zt_52_10,487,38710,77,148,4.59
SH,zt_53,zt_52_05,103,4473,101,171,7.77
SH,zt_53,zt_52_07,30,763,85,149,19.85


In [20]:
checkLog[(checkLog['colo'] == 'zs_96_06') & (checkLog['date'] == 20200918) & (checkLog['internal_latency'] > 200)]['sdd'].describe([01., 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]).astype('int64')

count          732
mean     121650427
std       20626124
min       93701890
20%       95937732
30%      105023000
40%      110003322
50%      130437000
60%      135824200
70%      140703000
80%      143206400
90%      145116900
100%     145648000
max      145648000
Name: sdd, dtype: int64

In [31]:
orderLog[orderLog['accCode'] == 523001]['colo'].unique()

array(['zt_52_07'], dtype=object)

In [22]:
print('SZE')
totalAccount = orderLog[orderLog['secid'] >= 2000000]['accCode'].unique()
for date, dateLog in orderLog[orderLog['secid'] >= 2000000].groupby(['date']):
    dailyAccount = dateLog['accCode'].unique()
    print(date, sorted(list(set(totalAccount) - set(dailyAccount))))
print(totalAccount)
print('SSE')
totalAccount = orderLog[orderLog['secid'] < 2000000]['accCode'].unique()
for date, dateLog in orderLog[orderLog['secid'] < 2000000].groupby(['date']):
    dailyAccount = dateLog['accCode'].unique()
    print(date, sorted(list(set(totalAccount) - set(dailyAccount))))
print(totalAccount)

SZE
20200914 [5229, 5230, 8924, 537403]
20200915 [5229, 5230, 8924, 537403]
20200916 [5229, 5230, 8924, 537403]
20200917 [5229, 5230, 8924, 537403]
20200918 [5229, 5230, 8924]
20200921 [5229, 5230, 8924]
20200922 [5229, 5230, 8824, 8924]
20200923 [5229, 5230, 8824]
20200924 [5229, 5230, 8824]
20200925 [8824]
[  5284   8967   8970   9765   9655   5269   5222   5273   9454   9741
   9756   6480   5289   6282   5377 966301   6237   8824   8971   9754
   5276   5286   5225   5226   5470   5474   5290   8854   9451   9448
 896702   5291   6678   5275   5287   6683   9685   6627   6623   9758
   9471   5456   9243   5281   9461   8865   9208 537403   8924   5230
   5229]
SSE
20200914 [8967, 522901, 523001]
20200915 [8967, 522901, 523001]
20200916 [8967, 522901, 523001]
20200917 [522901, 523001, 537401, 537403]
20200918 [522901, 523001]
20200921 [522901, 523001]
20200922 [522901, 523001]
20200923 [522901, 523001]
20200924 [522901, 523001]
20200925 []
[966701   8971 527301 522201   9441 524201

In [31]:
checkLog = orderLog[(orderLog["updateType"] == 0) & (((orderLog['date'] >= 20200918) & (orderLog['colo'].isin(['zt_52_05']))) | 
                                                    ((orderLog['date'] >= 20200923) & (orderLog['colo'].isin(['zs_88_04']))) |
                                                    ((orderLog['date'] >= 20200925) & (orderLog['colo'].isin(['zs_52_08']))) |
                                                    ((orderLog['date'] >= 20200925) & (orderLog['colo'].isin(['zs_52_09']))) |
                                                    ((orderLog['date'] >= 20200917) & (orderLog['colo'].isin(['zs_88_04']))) |
                                                    ((orderLog['date'] >= 20200925) & (orderLog['colo'].isin(['zt_52_05']))) |
                                                    ((orderLog['date'] >= 20200925) & (orderLog['colo'].isin(['zt_52_07']))) )]
# checkLog = orderLog[(orderLog["updateType"] == 0) & (((orderLog['date'] < 20200918) & (orderLog['colo'].isin(['zt_52_05']))) | 
#                                                     ((orderLog['date'] < 20200923) & (orderLog['colo'].isin(['zs_88_04']))) |
#                                                     ((orderLog['date'] < 20200925) & (orderLog['colo'].isin(['zs_52_08']))) |
#                                                     ((orderLog['date'] < 20200925) & (orderLog['colo'].isin(['zs_52_09']))) |
#                                                     ((orderLog['date'] < 20200917) & (orderLog['colo'].isin(['zs_88_04']))) |
#                                                     ((orderLog['date'] < 20200925) & (orderLog['colo'].isin(['zt_52_05']))) |
#                                                     ((orderLog['date'] < 20200925) & (orderLog['colo'].isin(['zt_52_07']))) )]
# checkLog = orderLog[(orderLog["updateType"] == 0)]
checkLog = checkLog[checkLog['caamd'] != 0]
checkLog['internal_latency'] = checkLog["clockAtArrival"] - checkLog["caamd"]
checkLog = checkLog[checkLog['sta'] == 'statwo']
SZE = checkLog[checkLog['secid'] >= 2000000]
SSE = checkLog[checkLog['secid'] < 2000000]
SZE["exchange"] = "SZ"
SSE["exchange"] = "SH"

c1 = SSE.groupby(['colo', 'exchange', "date"])["internal_latency"].quantile(.95).reset_index().groupby(['colo', 'exchange'])["internal_latency"].mean().reset_index()
c2 = SSE.groupby(['colo', 'exchange', "date"])["internal_latency"].median().reset_index().groupby(['colo', 'exchange'])["internal_latency"].mean().reset_index()
c3 = SSE.groupby(['colo', 'exchange'])["internal_latency"].count().reset_index()
c4 = SSE.groupby(['colo', 'exchange'])["date"].unique().str.len().reset_index()
c5 = SSE.groupby(['colo', 'exchange', "date"])["internal_latency"].quantile(.95).reset_index().groupby(['colo', 'exchange'])["internal_latency"].std().reset_index()

re2 = pd.merge(c3, c1, on=['colo', 'exchange'])
re2 = re2.rename(columns = {'internal_latency_x': 'count', 'internal_latency_y': '95 percentile'})
re2 = pd.merge(re2, c2, on=['colo', 'exchange'])
re2 = re2.rename(columns = {'internal_latency': 'median'})
re2 = pd.merge(re2, c4, on=['colo', 'exchange'])
re2 = pd.merge(re2, c5, on=['colo', 'exchange'])
re2 = re2.rename(columns = {'internal_latency': 'std'})
re2

c1 = SZE.groupby(['colo', 'exchange', "date"])["internal_latency"].quantile(.95).reset_index().groupby(['colo', 'exchange'])["internal_latency"].mean().reset_index()
c2 = SZE.groupby(['colo', 'exchange', "date"])["internal_latency"].median().reset_index().groupby(['colo', 'exchange'])["internal_latency"].mean().reset_index()
c3 = SZE.groupby(['colo', 'exchange'])["internal_latency"].count().reset_index()
c4 = SZE.groupby(['colo', 'exchange'])["date"].unique().str.len().reset_index()
c5 = SZE.groupby(['colo', 'exchange',  "date"])["internal_latency"].quantile(.95).reset_index().groupby(['colo', 'exchange'])["internal_latency"].std().reset_index()

re1 = pd.merge(c3, c1, on=['colo', 'exchange'])
re1 = re1.rename(columns = {'internal_latency_x': 'count', 'internal_latency_y': '95 percentile'})
re1 = pd.merge(re1, c2, on=['colo', 'exchange'])
re1 = re1.rename(columns = {'internal_latency': 'median'})
re1 = pd.merge(re1, c4, on=['colo', 'exchange'])
re1 = pd.merge(re1, c5, on=['colo', 'exchange'])
re1 = re1.rename(columns = {'internal_latency': 'std'})
re1


# re1 = pd.merge(re1[re1["isMsg"] == 1], re1[re1["isMsg"] == 0], on=["exchange", "colo_account"], how="outer").sort_values(by="median_x").reset_index(drop=True)
# re2 = pd.merge(re2[re2["isMsg"] == 1], re2[re2["isMsg"] == 0], on=["exchange", "colo_account"], how="outer").sort_values(by="median_y").reset_index(drop=True)
re = pd.concat([re1, re2]).reset_index(drop=True)


for col in ['median', '95 percentile']:
    re[col] = re[col].astype(int)
for col in ['std']:
    re[col] = re[col].apply(lambda x: '%.2f'%(x))
    
    
from IPython.display import display, HTML
add = checkLog.groupby(['exchange', 'colo', 'date'])['secid'].nunique().reset_index().groupby(['exchange', 'colo'])['secid'].mean().astype(int).reset_index()
add = add.rename(columns={'secid':'# of stocks'})
add['exchange'] = np.where(add['exchange'] == 'SSE', 'SH', 'SZ')
re = pd.merge(re, add, on=['exchange', 'colo'])
display(HTML(re.groupby(['exchange', "colo"])["# of stocks", "count", "median", "95 percentile", "std"].first().to_html()))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,Unnamed: 1_level_0,# of stocks,count,median,95 percentile,std
exchange,colo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SH,zs_88_04,60,1467,62,126,8.41
SH,zt_52_05,701,20472,97,168,7.81
SH,zt_52_07,478,1867,92,159,
SZ,zs_52_08,250,3856,60,105,
SZ,zs_52_09,856,7199,50,95,
SZ,zs_88_04,368,17052,53,74,3.1
SZ,zt_52_05,65,1188,53,96,4.54


In [30]:
re = re[['colo', 'exchange', '# of stocks', '95 percentile', 'median']]
re = re.rename(columns={"95 percentile":"prev_95p", "median":"prev_med", "# of stocks":"prev_numOfStocks"})
result1 = re
result1

Unnamed: 0,colo,exchange,prev_numOfStocks,prev_95p,prev_med
0,zs_52_08,SZ,244,92,54
1,zs_52_09,SZ,822,87,49
2,zs_88_04,SZ,352,71,51
3,zt_52_05,SZ,61,96,53
4,zs_88_04,SH,56,122,63
5,zt_52_05,SH,656,165,96
6,zt_52_07,SH,392,153,86


In [32]:
re = re[['colo', 'exchange', '# of stocks', '95 percentile', 'median']]
re = re.rename(columns={"95 percentile":"cur_95p", "median":"cur_med", "# of stocks":"cur_numOfStocks"})
result1 = pd.merge(re, result1, on=['colo', 'exchange'])
result1 = result1[['colo', 'exchange', 'prev_numOfStocks', 'cur_numOfStocks', 'prev_med', 'cur_med', 'prev_95p', 'cur_95p']]
result1

Unnamed: 0,colo,exchange,prev_numOfStocks,cur_numOfStocks,prev_med,cur_med,prev_95p,cur_95p
0,zs_52_08,SZ,244,250,54,60,92,105
1,zs_52_09,SZ,822,856,49,50,87,95
2,zs_88_04,SZ,352,368,51,53,71,74
3,zt_52_05,SZ,61,65,53,53,96,96
4,zs_88_04,SH,56,60,63,62,122,126
5,zt_52_05,SH,656,701,96,97,165,168
6,zt_52_07,SH,392,478,86,92,153,159
