### 1. load data

In [1]:
import os
import glob
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Show up to 100 rows / columns when a DataFrame is rendered.
# The fully qualified option names are used on purpose: the short patterns
# 'max_rows' / 'max_columns' also match the 'styler.render.*' options that
# newer pandas versions define, and set_option rejects ambiguous patterns.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# Percentile levels kept at hand for ad-hoc summaries
# (not referenced by the cells visible in this notebook export).
perc = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]

In [2]:
# Inclusive date window (YYYYMMDD strings) of the order logs to load.
startDate = '20200401'
endDate = '20200403'

readPath = r'\\192.168.10.30\Kevin_zhenyu\orderLog\equityTradeLogs'
dataPathLs = np.array(glob.glob(os.path.join(readPath, 'speedCompare***.csv')))
# The date is the token between the first '_' and the first '.' of the file name.
# dtype=str keeps the comparison below well defined even when no file matched.
dateLs = np.array([os.path.basename(i).split('_')[1].split('.')[0] for i in dataPathLs], dtype=str)
# Apply ONE mask to both arrays so that dates and paths stay paired; filtering
# only the paths would make zip() below combine a path with the wrong date.
inRange = (dateLs >= startDate) & (dateLs <= endDate)
dateLs = dateLs[inRange]
dataPathLs = dataPathLs[inRange]
if len(dataPathLs) == 0:
    raise FileNotFoundError('no speedCompare*.csv between %s and %s under %s' % (startDate, endDate, readPath))

rawOrderLog = []
for thisDate, thisPath in zip(dateLs, dataPathLs):
    data = pd.read_csv(thisPath)
    data = data.rename(columns={'mdClockAtArrival': 'caamd'})
    rawOrderLog.append(data)
rawOrderLog = pd.concat(rawOrderLog, sort=False)
# Force the key / quantity columns to int64 so grouping and comparisons are exact.
for col in ['clockAtArrival', 'caamd', 'secid', 'updateType', 'vai', 'absFilledThisUpdate', 'orderDirection', 'absOrderSize',
            'absOrderSizeCumFilled', 'date', 'accCode', 'mse']:
    rawOrderLog[col] = rawOrderLog[col].astype('int64')
rawOrderLog = rawOrderLog.sort_values(by=['date', 'secid', 'vai', 'accCode', 'clockAtArrival']).reset_index(drop=True)

# Strip the leading digit of secid and split the remaining codes at 600000.
# NOTE(review): the leading digit presumably encodes the exchange - confirm.
targetStock = rawOrderLog['secid'].unique()
targetStock = np.array([int(str(i)[1:]) for i in targetStock])
targetStockSZ = sorted(targetStock[targetStock < 600000])
targetStockSH = sorted(targetStock[targetStock >= 600000])

# clockAtArrival is divided by 1e6 before conversion, i.e. it is treated as microseconds.
rawOrderLog['clock'] = rawOrderLog['clockAtArrival'].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
rawOrderLog['broker'] = rawOrderLog['accCode'] // 100
rawOrderLog['colo_broker'] = rawOrderLog['colo'].str[:2] + '_' + rawOrderLog['broker'].astype('str')
# Integer ids: 'order' is unique per (date, accCode, secid, vai), 'group' per (date, secid, vai).
# ngroup() is the public API for the group number; it numbers groups in sorted key order.
rawOrderLog['order'] = rawOrderLog.groupby(['date', 'accCode', 'secid', 'vai']).ngroup()
rawOrderLog['group'] = rawOrderLog.groupby(['date', 'secid', 'vai']).ngroup()
# Time elapsed since the first logged update of the same order (rows are sorted by clockAtArrival).
rawOrderLog['startClock'] = rawOrderLog.groupby(['order'])['clockAtArrival'].transform('first')
rawOrderLog['duration'] = rawOrderLog['clockAtArrival'] - rawOrderLog['startClock']
rawOrderLog['orderPrice'] = rawOrderLog['orderPrice'].apply(lambda x: round(x, 2))
rawOrderLog['tradePrice'] = rawOrderLog['tradePrice'].apply(lambda x: round(x, 2))
# Work on a copy so the raw log stays available unfiltered.
orderLog = rawOrderLog.copy()

### make sure no order has shares > 80w or notional > 800w
# (the thresholds used below are 800000 shares and 8000000 notional);
# violations are reported rather than asserted, and the rows are kept.
orderLog['orderNtl'] = orderLog['absOrderSize'] * orderLog['orderPrice']
reportCols = ['date', 'accCode', 'secid', 'vai', 'absOrderSize', 'orderPrice',
              'orderNtl', 'orderDirection', 'clock', 'order']
oversizedQty = orderLog[orderLog['absOrderSize'] > 800000]
if oversizedQty.shape[0] > 0:
    print('some order quantity are > 80w')
    display(oversizedQty[reportCols])

oversizedNtl = orderLog[orderLog['orderNtl'] > 8000000]
if oversizedNtl.shape[0] > 0:
    print('some order ntl are > 800w')
    display(oversizedNtl[reportCols + ["updateType", "tradePrice", "absOrderSizeCumFilled", "absFilledThisUpdate"]])

### make sure every (date, secid, vai) group trades in one direction only
orderLog['directNum'] = orderLog.groupby(['date', 'secid', 'vai'])['orderDirection'].transform('nunique')
mixedDirection = orderLog[orderLog['directNum'] != 1]
if len(mixedDirection) > 0:
    print('opposite direction for same date, same secid, same vai')
    display(mixedDirection[['date', 'accCode', 'secid', 'vai', 'orderDirection']])
    # Groups with both directions are dropped entirely.
    orderLog = orderLog[orderLog['directNum'] == 1]

assert (orderLog.groupby(['date', 'secid', 'vai'])['orderDirection'].nunique() == 1).all()

## make sure each account, secid, vai only has one insertion
# updateType == 0 rows are counted per order; orders with more than one are removed.
a = orderLog[orderLog['updateType'] == 0].groupby(['date', 'accCode', 'secid', 'vai', 'order'])['clockAtArrival'].count()
if len(a[a > 1]) > 0:
    print('more than one insertion at same time')
    a = a[a>1].reset_index()
    display(a)
    orderLog = orderLog[~(orderLog['order'].isin(a['order'].unique()))]

# isMsg is decided on the insertion row (1 when mse == 100, else 0) and then
# forward-filled to the later updates of the same order.
orderLog['isMsg'] = np.where(orderLog['updateType'] == 0,
                             np.where(orderLog['mse'] == 100, 1, 0), np.nan)
orderLog['isMsg'] = orderLog.groupby(['order'])['isMsg'].ffill()

# Share of SZE (secid >= 2000000) insertions flagged isMsg == 1.
placeSZE = orderLog[(orderLog['secid'] >= 2000000) & (orderLog['updateType'] == 0)]
if placeSZE.shape[0] > 0:
    print('%.2f%% SZE orders triggered by msg data'%(placeSZE[placeSZE['isMsg'] == 1].shape[0]/placeSZE.shape[0]*100))
else:
    # Avoid a ZeroDivisionError when the window contains no SZE insertions.
    print('no SZE order insertions found')

### make sure there is no unexpected updateType 
def getTuple(x):
    """Return the elements of the iterable ``x`` as a tuple, in iteration order."""
    return tuple(x)

# Drop an updateType-4 row whenever the next update of the same order is also
# a 4, so a run of consecutive 4s is represented by its last row only.
followingUpdate = orderLog.groupby(['order'])['updateType'].shift(-1)
isRepeatedFill = (orderLog['updateType'] == 4) & (followingUpdate == 4)
checkLog = orderLog[~isRepeatedFill]
# One row per order holding its sequence of update types as a tuple.
checkLog = checkLog.groupby(['order'])['updateType'].apply(getTuple).reset_index()

# Map each sequence to a status code; the first matching rule wins and
# anything not listed gets 5 (unexpected).
updateSeq = checkLog['updateType']
checkLog['status'] = np.select(
    [updateSeq.isin([(0, 2, 4), (0, 2, 1, 4), (0, 2, 1, 2, 4), (0, 2, 4, 1, 4), (0, 4), (0, 4, 1, 4)]),
     updateSeq.isin([(0, 2, 4, 1, 3), (0, 2, 4, 1, 4, 3), (0, 2, 1, 4, 3), (0, 4, 1, 3)]),
     updateSeq == (0, 2, 1, 3),
     updateSeq.isin([(0, 3)]),
     updateSeq.isin([(0, ), (0, 2), (0, 2, 1)])],
    [0, 1, 2, 3, 4],
    default=5)

display(checkLog[checkLog['status'] == 5])
orderLog = pd.merge(orderLog, checkLog[['order', 'status']], how='left', on=['order'], validate='many_to_one')
# Only statuses 0, 1 and 2 are carried forward.
orderLog = orderLog[orderLog['status'].isin([0, 1, 2])].reset_index(drop=True)

### check status==0 got all traded
a = (orderLog[orderLog['status'] == 0]
     .groupby(['order'])[['absOrderSizeCumFilled', 'absOrderSize']].max()
     .reset_index())
a.columns = ['order', 'filled', 'total']
notFullyFilled = a[a['filled'] != a['total']]
print('in total trade, any fill != total cases')
display(notFullyFilled)
if len(notFullyFilled) > 0:
    removeOrderLs = notFullyFilled['order'].unique()
    orderLog = orderLog[~orderLog['order'].isin(removeOrderLs)]

### check status==1 got partial traded
a = (orderLog[orderLog['status'] == 1]
     .groupby(['order'])[['absOrderSizeCumFilled', 'absOrderSize']].max()
     .reset_index())
a.columns = ['order', 'filled', 'total']
notPartial = a[(a['filled'] >= a['total']) | (a['filled'] == 0)]
print('in partial trade, any fill >= total or fill is 0 cases for updateType 4')
display(notPartial)
if len(notPartial) > 0:
    removeOrderLs = notPartial['order'].unique()
    orderLog = orderLog[~orderLog['order'].isin(removeOrderLs)]

### check if any cancellation within 1 sec
# duration is in the same unit as clockAtArrival; 1e6 is the one-second cutoff.
a = orderLog[(orderLog['updateType'] == 1) & (orderLog['duration'] < 1e6)]
print('any cancellation within 1 sec')
display(a)
if len(a) > 0:
    removeOrderLs = a['order'].unique()
    orderLog = orderLog[~orderLog['order'].isin(removeOrderLs)]

orderLog = orderLog.sort_values(by=['date', 'secid', 'vai', 'accCode', 'clockAtArrival']).reset_index(drop=True)

# secid >= 2000000 is labelled SZE, everything else SSE.
orderLog['exchange'] = np.where(orderLog['secid'] >= 2000000, 'SZE', 'SSE')
orderLog['orderNtl'] = orderLog['orderPrice'] * orderLog['absOrderSize']
# Traded notional is only attributed to updateType-4 rows; all other rows get 0.
filledNtl = orderLog['tradePrice'] * orderLog['absFilledThisUpdate']
orderLog['tradeNtl'] = np.where(orderLog['updateType'] == 4, filledNtl, 0)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


more than one insertion at same time


Unnamed: 0,date,accCode,secid,vai,order,clockAtArrival
0,20200401,9454,2300042,11500,12557,2
1,20200401,9758,2300572,4700,20343,2
2,20200402,8865,2002918,1200,29663,2
3,20200403,5456,2300785,2800,44670,2
4,20200403,8865,2300785,2800,49143,2
5,20200403,9248,1603559,3389200,50932,2
6,20200403,9451,2300566,14000,52098,2


98.07% SZE orders triggered by msg data


Unnamed: 0,order,updateType,status
6746,6746,"(0, 4, 2, 4)",5
6812,6812,"(0, 4, 2, 1, 3)",5
6856,6856,"(0, 4, 2, 4)",5
6873,6873,"(0, 4, 2, 4)",5
6946,6946,"(0, 4, 2, 1, 3)",5
21804,21806,"(0, 4, 2, 4)",5
23472,23474,"(0, 4, 2, 4)",5
27666,27668,"(0, 4, 2, 4)",5
27767,27769,"(0, 4, 2, 4)",5
28336,28338,"(0, 4, 2, 4)",5


in total trade, any fill != total cases


Unnamed: 0,order,filled,total
18591,23609,17,100
28365,36143,8700,42300
29481,37603,57000,74000
33430,42444,19300,24000
37636,48060,99,9400
37783,48267,11800,12000
39005,49832,400,2300


in partial trade, any fill >= total or fill is 0 cases for updateType 4


Unnamed: 0,order,filled,total


any cancellation within 1 sec


Unnamed: 0,clockAtArrival,caamd,secid,updateType,vai,ars,absFilledThisUpdate,orderDirection,absOrderSize,absOrderSizeCumFilled,orderPrice,tradePrice,date,accCode,mse,colo,orderSysId,tradeId,sdd,aaa,ApplSeqNum,clock,broker,colo_broker,order,group,startClock,duration,orderNtl,directNum,isMsg,status


### 2. fill rate

In [22]:
# Hand-picked accounts that form group gp1.
acc1 = [6272, 5269, 8871, 9551, 9667]

sseMsgRows = (orderLog["exchange"] == 'SSE') & (orderLog["isMsg"] == 1)
sseNonMsgRows = (orderLog["exchange"] == "SSE") & (orderLog["isMsg"] == 0)

# acc2: every other account with SSE isMsg == 1 orders.
acc2 = list(set(orderLog[sseMsgRows]["accCode"].unique()) - set(acc1))
# colo_list: colos on which acc1 placed SSE isMsg == 1 orders.
colo_list = orderLog[sseMsgRows & orderLog["accCode"].isin(acc1)]["colo"].unique()
# acc3: accounts with SSE isMsg == 0 orders on those same colos.
acc3 = orderLog[sseNonMsgRows & orderLog["colo"].isin(colo_list)]["accCode"].unique()

# Keep only SSE rows: isMsg == 1 rows of acc1/acc2 plus isMsg == 0 rows of acc3.
msgAccounts = list(set(acc1) | set(acc2))
orderLog = pd.concat([orderLog[sseMsgRows & orderLog["accCode"].isin(msgAccounts)],
                      orderLog[sseNonMsgRows & orderLog["accCode"].isin(acc3)]])
orderLog = orderLog.sort_values(by=['date', 'secid', 'vai', 'accCode', 'clockAtArrival']).reset_index(drop=True)

# Label each remaining row: gp1 / gp2 for isMsg == 1 rows of acc1 / acc2, gp3 otherwise.
keptMsgRows = (orderLog["exchange"] == 'SSE') & (orderLog["isMsg"] == 1)
inAcc1 = keptMsgRows & orderLog["accCode"].isin(acc1)
inAcc2 = keptMsgRows & orderLog["accCode"].isin(acc2)
orderLog["gp"] = np.where(inAcc1, "gp1", np.where(inAcc2, "gp2", "gp3"))

In [31]:
# Price is orderPrice * 100 rounded to a whole number; the notional is
# recomputed from it and scaled back by 100.
orderLog['Price'] = orderLog['orderPrice'].map(lambda px: round(px*100, 0))
orderLog['orderNtl'] = orderLog['Price'] * orderLog['absOrderSize'] / 100
# Only update types 0, 3 and 4 are needed from here on.
orderLog = orderLog[orderLog['updateType'].isin([0, 3, 4])].reset_index(drop=True)

# Per-order "first" values, taken in the current row order.
perOrder = orderLog.groupby(['order'])
firstType = perOrder['updateType'].transform('first')
firstSeen = perOrder['clockAtArrival'].transform('first')
firstArs = perOrder['ars'].transform('first')
orderLog['firstUpdateType'] = firstType
orderLog['firstClock'] = firstSeen
orderLog['clockDif'] = orderLog['clockAtArrival'] - orderLog['firstClock']
orderLog["ars"] = firstArs

# An update counts as "immediate" when it arrives within a cutoff of the
# order's first update.  The first matching rule decides the cutoff:
#   secid < 2000000            -> 1000000
#   colo_broker == 'zs_62'     -> 1000000
#   colo_broker == 'zt_88'     -> 50000
#   any other row              -> 20000
immediateCutoff = np.select(
    [~(orderLog['secid'] >= 2000000),
     orderLog['colo_broker'] == 'zs_62',
     orderLog['colo_broker'] == 'zt_88'],
    [1000000, 1000000, 50000],
    default=20000)
orderLog['isImmediate'] = np.where(orderLog['clockDif'] <= immediateCutoff, 1, 0)

In [44]:
# Fill statistics per (exchange, accCode, isMsg, gp, colo).
# groupby (exchange, account, colo, isMsg, gp):
# Restrict to rows flagged isImmediate == 1 and take, per order, the largest
# cumulative filled size seen among those rows.
checkData = orderLog[orderLog['isImmediate'] == 1].reset_index(drop=True)
checkData['maxFilled'] = checkData.groupby(['order'])['absOrderSizeCumFilled'].transform('max')
# Price was set to orderPrice * 100 in an earlier cell, hence the division by 100.
checkData['immedFillNtl'] = checkData['maxFilled'] * checkData['Price'] / 100
# Keep only the updateType == 0 rows.
checkData = checkData[checkData['updateType'] == 0]

# SSE rows only, excluding ars == 21.
checkData = checkData[(checkData["exchange"] == "SSE") & (checkData["ars"] != 21)]

# immedFillPerc = immediate filled notional / ordered notional per group.
immedFillData = checkData.groupby(['exchange', 'accCode', 'isMsg', 'gp', 'colo'])[['orderNtl', 'immedFillNtl']].sum().reset_index()
immedFillData['immedFillPerc'] = immedFillData['immedFillNtl'] / immedFillData['orderNtl']
immedFillData = immedFillData[['exchange', 'accCode', 'immedFillPerc', 'isMsg', 'gp', 'colo']]


# fillStatus: 0 = nothing filled, 1 = filled less than the order size, 2 = otherwise.
checkData['fillStatus'] = np.where(checkData['maxFilled'] == 0, 0,
                          np.where(checkData['maxFilled'] < checkData['absOrderSize'], 1, 2))
# Row count per group and fillStatus.
statsData = checkData.groupby(['exchange', 'accCode', 'isMsg', 'gp', 'colo', 'fillStatus'])['secid'].count().to_frame().reset_index()
statsData.columns = ['exchange', 'accCode', 'isMsg', 'gp', 'colo', 'fillStatus', 'count']
statsData = statsData.reset_index()
# '# of orders' is the total count per exchange.
statsData['# of orders'] = statsData.groupby(['exchange'])['count'].transform('sum')
# '% of orders' temporarily holds the group's total count; it is used as the
# denominator of 'percent' and only then turned into the group's share of the
# exchange total.  The order of the next three statements matters.
statsData['% of orders'] = statsData.groupby(['accCode', 'exchange', "isMsg", 'gp', 'colo'])['count'].transform('sum')
statsData['percent'] = statsData['count'] / statsData['% of orders']
statsData['% of orders'] = statsData['% of orders'] / statsData['# of orders']
# Split 'percent' into one frame per fillStatus so they can be merged back as columns.
saveCols = ['accCode', 'exchange', 'isMsg', 'gp', 'colo', 'percent']
noFillData = statsData[statsData['fillStatus'] == 0][saveCols].reset_index(drop=True)
noFillData = noFillData.rename(columns={'percent': 'no fill'})
partialFillData = statsData[statsData['fillStatus'] == 1][saveCols].reset_index(drop=True)
partialFillData = partialFillData.rename(columns={'percent': 'partial fill'})
fullFillData = statsData[statsData['fillStatus'] == 2][saveCols].reset_index(drop=True)
fullFillData = fullFillData.rename(columns={'percent': 'full fill'})


# Overall fill rate uses every remaining row of orderLog (not just the
# immediate ones), again limited to SSE and ars != 21.
o1 = orderLog[(orderLog["exchange"] == "SSE") & (orderLog["ars"] != 21)]



# Per order: max cumulative fill, order size and order price.
fillRateData = o1.groupby(['exchange', 'accCode', 'secid', 'order', 'isMsg', "gp", 'colo'])[['absOrderSizeCumFilled', 'absOrderSize', 'orderPrice']].max().reset_index()
fillRateData['orderNotional'] = fillRateData['absOrderSize']*fillRateData['orderPrice']
fillRateData['fillNotional'] = fillRateData['absOrderSizeCumFilled']*fillRateData['orderPrice']
# Group totals are broadcast to every order row, so fillPerc is constant
# within a group and the mean() below simply picks that value.
fillRateData['totalNotional'] = fillRateData.groupby(['exchange', 'accCode', "isMsg", "gp", 'colo'])['orderNotional'].transform('sum')
fillRateData['fillNotional'] = fillRateData.groupby(['exchange',  'accCode', "isMsg", "gp", 'colo'])['fillNotional'].transform('sum')
fillRateData['fillPerc'] = fillRateData['fillNotional'] / fillRateData['totalNotional']
fillRateData = fillRateData.groupby(['exchange', 'accCode', "isMsg", "gp", 'colo'])['fillPerc'].mean().reset_index()

# Collapse statsData to one row per group, then attach the per-status
# percentages, the overall fill rate and the immediate fill rate.
statsData = statsData.groupby(['exchange', 'accCode', 'isMsg', "gp", 'colo', '# of orders'])['% of orders'].first().reset_index()
statsData = pd.merge(statsData, noFillData, how='outer', on=['exchange', 'accCode', 'isMsg', "gp", 'colo'], validate='one_to_one')
statsData = pd.merge(statsData, partialFillData, how='outer', on=['exchange', 'accCode', 'isMsg', "gp", 'colo'], validate='one_to_one')
statsData = pd.merge(statsData, fullFillData, how='outer', on=['exchange', 'accCode', 'isMsg', "gp", 'colo'], validate='one_to_one')
statsData = pd.merge(statsData, fillRateData, how='outer', on=['exchange', 'accCode', 'isMsg', "gp", 'colo'], validate='one_to_one')
statsData = pd.merge(statsData, immedFillData, how='outer', on=['exchange', 'accCode', 'isMsg', "gp", 'colo'], validate='one_to_one')


# statsData = statsData.groupby(['exchange', '# of orders', 'broker'])[['% of orders', 'full fill', 'partial fill', 'no fill', 'fillPerc', 'immedFillPerc']].first()
# Only the isMsg == 1 groups are reported; the groupby keys become the row index.
statsData['isMsg'] = statsData['isMsg'].astype("int")
statsData = statsData[statsData["isMsg"] == 1].groupby(['exchange', '# of orders','isMsg','colo', 'accCode',"gp" ])[['% of orders', 'full fill', 'partial fill', 'no fill', 'fillPerc', 'immedFillPerc']].first()

from IPython.display import display, HTML
# Missing ratios (e.g. a fillStatus that never occurred for a group) are shown
# as 0%, and every ratio is rendered as a whole-number percentage string.
for col in ['% of orders', 'no fill', 'partial fill', 'full fill', 'fillPerc', 'immedFillPerc']:
    statsData[col] = statsData[col].fillna(0)
    statsData[col] = statsData[col].apply(lambda x: '%.0f%%'%(x*100))
display(HTML(statsData.to_html()))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,% of orders,full fill,partial fill,no fill,fillPerc,immedFillPerc
exchange,# of orders,isMsg,colo,accCode,gp,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
SSE,15204,1,zs_94_04,6237,gp2,1%,68%,3%,28%,51%,30%
SSE,15204,1,zs_96_02,9741,gp2,4%,79%,12%,9%,92%,87%
SSE,15204,1,zs_96_05,6272,gp1,1%,60%,16%,24%,67%,59%
SSE,15204,1,zt_52_01,5269,gp1,4%,69%,17%,14%,91%,81%
SSE,15204,1,zt_52_01,5271,gp2,1%,65%,20%,15%,88%,83%
SSE,15204,1,zt_52_01,5274,gp2,0%,62%,16%,22%,86%,83%
SSE,15204,1,zt_58_01,5868,gp2,1%,44%,2%,54%,65%,34%
SSE,15204,1,zt_88_02,8843,gp2,1%,50%,8%,42%,69%,59%
SSE,15204,1,zt_88_02,8871,gp1,1%,56%,22%,22%,69%,58%
SSE,15204,1,zt_88_03,8854,gp2,0%,0%,0%,100%,100%,0%


In [64]:
from IPython.display import display, HTML

# Daily traded / ordered notional for the zt_88 accounts (isMsg == 0, ars != 21).
orderLog["ars"] = orderLog.groupby(["date", "order"])["ars"].transform("first")
# NOTE: despite its name, SZE holds SSE rows here (the filter is exchange == 'SSE');
# the name is kept because later cells reassign the same variable.
SZE = orderLog[(orderLog['exchange'] == 'SSE') & (orderLog["ars"] != 21) & (orderLog["isMsg"] == 0)]
test = SZE[(SZE["colo_broker"] == "zt_88")]
# Ordered notional comes from the insertion rows (updateType == 0) ...
test1 = test[test["updateType"] == 0]
t1 = test1.groupby(["date", "accCode"])["orderNtl"].sum().reset_index()
# ... and traded notional from the updateType == 4 rows.
test2 = test[test["updateType"] == 4]
t2 = test2.groupby(["date", "accCode"])["tradeNtl"].sum().reset_index()
# Inner merge: an account/date without any updateType == 4 row is not reported.
h = pd.merge(t1, t2, on=["date", "accCode"])
h["prob"] = h["tradeNtl"] / h["orderNtl"] * 100
# Second merge adds the number of insertion rows; it arrives as 'orderNtl_y'
# because 'orderNtl' already exists in h.
h = pd.merge(h, test1.groupby(["date", "accCode"])["orderNtl"].count().reset_index(), on=["date", "accCode"])
h = h.rename(columns={"orderNtl_y": "size"})
# Truncate to a whole percentage.
h["prob"] = h["prob"].astype("int")
# Index by (date, size) for display.  set_index keeps every account, whereas
# groupby(["date", "size"]).first() would silently drop one of two accounts
# that happen to have the same number of orders on the same date.
display(HTML(h[["date", "accCode", "size", "prob"]].set_index(["date", "size"]).sort_index().to_html()))

Unnamed: 0_level_0,Unnamed: 1_level_0,accCode,prob
date,size,Unnamed: 2_level_1,Unnamed: 3_level_1
20200401,226,8870,76
20200401,254,8871,56
20200401,390,8843,77
20200402,284,8843,72
20200402,295,8871,70
20200402,351,8870,73
20200403,175,8871,48
20200403,242,8870,64
20200403,290,8843,75


### 3. Internal Latency

In [76]:
from IPython.display import display, HTML

# Internal latency = order insertion clock minus the 'caamd' clock, measured on
# insertion rows (updateType == 0).
# NOTE(review): clockAtArrival is treated as microseconds elsewhere in this
# notebook; caamd is presumably in the same unit - confirm.
# .copy() makes the filtered frames independent, so the column assignments
# below no longer trigger SettingWithCopyWarning.
checkLog = orderLog[orderLog["updateType"] == 0].copy()
checkLog['internal_latency'] = checkLog["clockAtArrival"] - checkLog["caamd"]
checkLog["strategy"] = np.where(checkLog["ars"] == 21, "statwo", "staone")
SZE = checkLog[checkLog['secid'] >= 2000000]
SSE = checkLog[checkLog['secid'] < 2000000].copy()
SSE["exchange"] = "SH"
SSE = SSE[SSE["strategy"] == "staone"]

latencyKeys = ["exchange", "colo", "accCode", "strategy", "isMsg", "gp"]
dailyLatency = SSE.groupby(latencyKeys + ["date"])["internal_latency"]
daily95 = dailyLatency.quantile(.95).reset_index()
dailyMedian = dailyLatency.median().reset_index()

# c1 / c2: daily 95th percentile / median, averaged over days.
c1 = daily95.groupby(latencyKeys)["internal_latency"].mean().reset_index()
c2 = dailyMedian.groupby(latencyKeys)["internal_latency"].mean().reset_index()
# c3: number of insertion rows; c4: number of distinct dates (column 'date').
c3 = SSE.groupby(latencyKeys)["internal_latency"].count().reset_index()
c4 = SSE.groupby(latencyKeys)["date"].nunique().reset_index()
# c5: day-to-day standard deviation of the daily 95th percentile.
c5 = daily95.groupby(latencyKeys)["internal_latency"].std().reset_index()

# Each merge brings in another 'internal_latency' column, which is renamed
# right away so the next merge does not collide with it.
re2 = pd.merge(c3, c1, on=latencyKeys)
re2 = re2.rename(columns = {'internal_latency_x': 'count', 'internal_latency_y': '95 percentile'})
re2 = pd.merge(re2, c2, on=latencyKeys)
re2 = re2.rename(columns = {'internal_latency': 'median'})
re2 = pd.merge(re2, c4, on=latencyKeys)
re2 = pd.merge(re2, c5, on=latencyKeys)
re2 = re2.rename(columns = {'internal_latency': 'std'})

# Whole numbers for the latency figures, two decimals for the std.
for col in ['isMsg','median', '95 percentile']:
    re2[col] = re2[col].astype(int)
re2['std'] = re2['std'].apply(lambda x: '%.2f'%(x))

re2 = re2.rename(columns={"colo": "server", "accCode": "account"})

# Report gp1 and gp3 only.  Columns are selected with a list: selecting
# several columns from a groupby with a bare tuple is not supported by
# current pandas.
display(HTML(re2[re2["gp"].isin(["gp1", "gp3"])].groupby(["exchange", "server", "gp", "account", "isMsg"])[["count", "median", "95 percentile", "std"]].first().to_html()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count,median,95 percentile,std
exchange,server,gp,account,isMsg,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SH,zs_96_05,gp1,6272,1,159,10,31,6.77
SH,zs_96_05,gp3,6272,0,772,23,35,3.93
SH,zt_52_01,gp1,5269,1,595,16,57,4.86
SH,zt_52_01,gp3,5269,0,2041,40,79,2.9
SH,zt_52_01,gp3,5271,0,820,39,102,10.77
SH,zt_52_01,gp3,5273,0,2737,38,86,8.4
SH,zt_52_01,gp3,5274,0,372,39,86,11.12
SH,zt_88_02,gp1,8871,1,143,21,91,13.88
SH,zt_88_02,gp3,8843,0,964,34,85,20.83
SH,zt_88_02,gp3,8870,0,819,41,101,20.16


### 4. Return and alpha

In [92]:
import os
import glob
import datetime
import numpy as np
import pandas as pd
import pickle
from IPython.display import display, HTML
from IPython.display import display, HTML

# Inclusive date window (YYYYMMDD strings) of the result files to load.
startDate = '20200401'
endDate = '20200403'

readPath = "F:\\orderLog\\result\\90s return\\"
dataPathLs = np.array(glob.glob(readPath + 'OrderLog1_***.csv'))
# dtype=str keeps the comparison below well defined even when no file matched.
dateLs = np.array([os.path.basename(i).split('_')[1].split('.')[0] for i in dataPathLs], dtype=str)
# Filter dates and paths with the same mask so zip() pairs them correctly.
inRange = (dateLs >= startDate) & (dateLs <= endDate)
dateLs = dateLs[inRange]
dataPathLs = dataPathLs[inRange]
if len(dataPathLs) == 0:
    raise FileNotFoundError('no OrderLog1_*.csv between %s and %s under %s' % (startDate, endDate, readPath))
df = []

# acc1, acc2 and acc3 are the account groups built in the fill-rate section.
for thisDate, thisPath in zip(dateLs, dataPathLs):
    data = pd.read_csv(thisPath)
    # Per-order attributes are taken from the order's first row.
    data["ars"] = data.groupby(['order'])['ars'].transform('first')
    data["sdd"] = data.groupby(['order'])['sdd'].transform('first')
    # isMsg is decided on the insertion row and forward-filled within the order.
    data['isMsg'] = np.where(data['updateType'] == 0, np.where(data['mse'] == 100, 1, 0), np.nan)
    data['isMsg'] = data.groupby(['order'])['isMsg'].ffill()
    data["sta"] = np.where(data["ars"] == 21, "statwo", "staone")
    # Keep SSE rows with updateType == 4 only.
    data = data[(data["updateType"] == 4) & (data["exchange"] == "SSE")]
    # NOTE(review): 93300000 looks like an HHMMSSmmm time (09:33:00.000) - confirm.
    # .copy() so the column assignments below act on an independent frame.
    data = data[data["sdd"] >= 93300000].copy()
    data["indexRet"] = data["close_F90s"]/data["close"] - 1
    # Alpha = return adjusted by beta_60 times the index return over the same horizon.
    data["buyAlpha"] = np.where(data["orderDirection"] == 1, data["buyRet"] - data["indexRet"] * data["beta_60"], np.nan)
    data["sellAlpha"] = np.where(data["orderDirection"] == -1, data["sellRet"] + data["indexRet"] * data["beta_60"], np.nan)
    # pp is 1 on the first remaining row of each order and NaN elsewhere, so
    # buyNum / sellNum count each order once.
    data["pp"] = (data.groupby("order").cumcount()+1)[lambda x: x <= 1]
    data["buyNum"] = np.where((data["orderDirection"] == 1) & (data["pp"]==1), 1, 0)
    data["sellNum"] = np.where((data["orderDirection"] == -1) & (data["pp"]==1), 1, 0)
    # NOTE(review): mixing strings with np.nan in np.where is expected to yield
    # the string 'nan' for unmatched rows; they are excluded by the gp filters below.
    data["gp"] = np.where((data["exchange"] == 'SSE') & (data["isMsg"] == 1) & (data["accCode"].isin(acc1)), "gp1",
                         np.where((data["exchange"] == 'SSE') & (data["isMsg"] == 1) & (data["accCode"].isin(acc2)), "gp2",
                                 np.where((data["exchange"] == 'SSE') & (data['isMsg'] == 0) & (data["accCode"].isin(acc3)), "gp3", np.nan)))
    df.append(data)
df = pd.concat(df, sort=False)
df = df[df["sta"] == "staone"].copy()

groupKeys = ["exchange", "colo", "isMsg", "accCode", "gp"]

# Traded notional counted on the buy / sell side only where that side's return exists.
df["buyNtl"] = np.where(~df["buyRet"].isnull(), df["tradeNtl"], np.nan)
df["sellNtl"] = np.where(~df["sellRet"].isnull(), df["tradeNtl"], np.nan)
# The string "sum" selects the groupby sum directly; passing the builtin
# `sum` relies on a deprecated alias.
df["sumbuyNtl"] = df.groupby(groupKeys)["buyNtl"].transform("sum")
df["sumsellNtl"] = df.groupby(groupKeys)["sellNtl"].transform("sum")

# Notional-weighted sums of each metric per group.
df["sumsellRet"] = df["tradeNtl"] * df["sellRet"]
df["sumsellRet"] = df.groupby(groupKeys)["sumsellRet"].transform("sum")

df["sumbuyAlpha"] = df["tradeNtl"] * df["buyAlpha"]
df["sumbuyAlpha"] = df.groupby(groupKeys)["sumbuyAlpha"].transform("sum")

df["sumsellAlpha"] = df["tradeNtl"] * df["sellAlpha"]
df["sumsellAlpha"] = df.groupby(groupKeys)["sumsellAlpha"].transform("sum")

df["sumbuyRet"] = df["tradeNtl"] * df["buyRet"]
df["sumbuyRet"] = df.groupby(groupKeys)["sumbuyRet"].transform("sum")

# Notional-weighted averages; these overwrite the per-row metrics.
df["buyRet"] = df["sumbuyRet"] / df["sumbuyNtl"]
df["sellRet"] = df["sumsellRet"] / df["sumsellNtl"]
df["buyAlpha"] = df["sumbuyAlpha"] / df["sumbuyNtl"]
df["sellAlpha"] = df["sumsellAlpha"] / df["sumsellNtl"]
df["buyOrderNum"] = df.groupby(groupKeys)["buyNum"].transform("sum")
df["sellOrderNum"] = df.groupby(groupKeys)["sellNum"].transform("sum")

# Render the ratios multiplied by 10000 with two decimals.
for col in ["buyRet", "sellRet", "buyAlpha", "sellAlpha"]:
    df[col] = df[col].apply(lambda x: '%.2f'%(x*10000))
df["accCode"] = df["accCode"].astype("int")

reportCols = ["buyOrderNum", "buyRet", "buyAlpha", "sellOrderNum", "sellRet", "sellAlpha"]
# Both tables are passed to display(): a bare HTML(...) expression is only
# rendered when it is the last statement of a cell, so the first table was
# previously never shown.
display(HTML(df[df["gp"].isin(["gp1", "gp2"])].groupby(["exchange", "isMsg", "colo","accCode","gp"])[reportCols].first().to_html()))
display(HTML(df[df["gp"].isin(["gp1", "gp3"])].groupby(["exchange", "colo","gp", "accCode","isMsg"])[reportCols].first().to_html()))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,buyOrderNum,buyRet,buyAlpha,sellOrderNum,sellRet,sellAlpha
exchange,colo,gp,accCode,isMsg,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SSE,zs_96_05,gp1,6272,1.0,54,4.97,3.03,86,-0.69,-0.6
SSE,zs_96_05,gp3,6272,0.0,141,16.12,8.9,323,20.85,19.41
SSE,zt_52_01,gp1,5269,1.0,153,19.18,13.23,405,11.59,7.83
SSE,zt_52_01,gp3,5269,0.0,372,15.56,9.66,1302,9.63,6.34
SSE,zt_52_01,gp3,5271,0.0,162,20.15,17.61,288,10.25,9.93
SSE,zt_52_01,gp3,5273,0.0,413,14.84,12.4,2011,12.22,9.82
SSE,zt_52_01,gp3,5274,0.0,76,5.04,2.99,142,2.88,1.85
SSE,zt_88_02,gp1,8871,1.0,64,11.04,6.33,59,12.54,8.34
SSE,zt_88_02,gp3,8843,0.0,163,13.55,6.56,184,12.32,9.51
SSE,zt_88_02,gp3,8870,0.0,217,7.11,4.34,312,5.59,2.46
