In order to detect anomalies in the transactions, we looked at the time between the request for a new order or a cancelation and the acknowledgement coming from the exchange to bank. We built and trained autoencoder models to detect outliers and anomalies using the PyOD library. 

#reading data

In [5]:
import pandas as pd

aq_data = pd.read_json("/content/drive/MyDrive/ML/Hackathon (1)/Hackathon/AequitasData.json")
alpha_data = pd.read_json("/content/drive/MyDrive/ML/Hackathon (1)/Hackathon/AlphaData.json")
tsx_data = pd.read_json("/content/drive/MyDrive/ML/Hackathon (1)/Hackathon/TSXData.json")

total_df = pd.concat([aq_data, alpha_data, tsx_data])
total_df

Unnamed: 0,TimeStamp,TimeStampEpoch,Direction,OrderID,MessageType,Symbol,OrderPrice,Exchange
0,2023-01-06 09:28:00.013814157,2023-01-06 14:28:00.013814157,NBFToExchange,b8c529be-9283-11ed-ad3c-047c16291a22,NewOrderRequest,0455B,13.02,Aequitas
1,2023-01-06 09:28:00.013890960,2023-01-06 14:28:00.013890960,NBFToExchange,b8c529bf-9283-11ed-a725-047c16291a22,NewOrderRequest,0455B,13.14,Aequitas
2,2023-01-06 09:28:00.014031152,2023-01-06 14:28:00.014031152,NBFToExchange,b8c529c0-9283-11ed-bf41-047c16291a22,CancelRequest,0455B,,Aequitas
3,2023-01-06 09:28:00.014086398,2023-01-06 14:28:00.014086398,NBFToExchange,b8c529c1-9283-11ed-8407-047c16291a22,CancelRequest,0455B,,Aequitas
4,2023-01-06 09:28:00.014213645,2023-01-06 14:28:00.014213645,ExchangeToNBF,b8c529be-9283-11ed-ad3c-047c16291a22,NewOrderAcknowledged,0455B,13.02,Aequitas
...,...,...,...,...,...,...,...,...
212217,2023-01-06 09:31:59.995169747,2023-01-06 14:31:59.995169747,NBFToExchange,b9921f99-9283-11ed-a400-047c16291a22,NewOrderRequest,DC0OC,45.67,TSX
212218,2023-01-06 09:31:59.995210138,2023-01-06 14:31:59.995210138,NBFToExchange,b9921f83-9283-11ed-bcf1-047c16291a22,CancelRequest,DC0OC,,TSX
212219,2023-01-06 09:31:59.995219075,2023-01-06 14:31:59.995219075,ExchangeToNBF,b9921f99-9283-11ed-a400-047c16291a22,NewOrderAcknowledged,DC0OC,45.67,TSX
212220,2023-01-06 09:31:59.995257550,2023-01-06 14:31:59.995257550,ExchangeToNBF,b9921f83-9283-11ed-bcf1-047c16291a22,CancelAcknowledged,DC0OC,,TSX


In [6]:
#sample to see different messages recieved 
sample = total_df[total_df["OrderID"]=="b8c529be-9283-11ed-ad3c-047c16291a22"]
sample

Unnamed: 0,TimeStamp,TimeStampEpoch,Direction,OrderID,MessageType,Symbol,OrderPrice,Exchange
0,2023-01-06 09:28:00.013814157,2023-01-06 14:28:00.013814157,NBFToExchange,b8c529be-9283-11ed-ad3c-047c16291a22,NewOrderRequest,0455B,13.02,Aequitas
4,2023-01-06 09:28:00.014213645,2023-01-06 14:28:00.014213645,ExchangeToNBF,b8c529be-9283-11ed-ad3c-047c16291a22,NewOrderAcknowledged,0455B,13.02,Aequitas
172,2023-01-06 09:28:00.993798829,2023-01-06 14:28:00.993798829,NBFToExchange,b8c529be-9283-11ed-ad3c-047c16291a22,CancelRequest,0455B,,Aequitas
174,2023-01-06 09:28:00.994174634,2023-01-06 14:28:00.994174634,ExchangeToNBF,b8c529be-9283-11ed-ad3c-047c16291a22,Cancelled,0455B,,Aequitas


In [7]:
print(type(sample.iloc[0]["TimeStamp"]))
sample.iloc[0]["TimeStamp"].timestamp()-sample.iloc[1]["TimeStamp"].timestamp()

<class 'pandas._libs.tslibs.timestamps.Timestamp'>


-0.0004000663757324219

#preparing new data

In [8]:
import numpy as np
import datetime

In [9]:
unique_ids = total_df["OrderID"].drop_duplicates()

req_time = []
cancel_time = []
r_exchanges = []
r_symbols = []
c_exchanges = []
c_symbols = []
o_anomaly_time = []
c_anomaly_time = []


for i in range(len(unique_ids)):
  id = unique_ids.iloc[i]

  transactions = total_df[total_df["OrderID"]==id]

  o_req = transactions[transactions["MessageType"]=="NewOrderRequest"]
  o_ack = transactions[transactions["MessageType"]=="NewOrderAcknowledged"]
  
  c_req = transactions[transactions["MessageType"]=="CancelRequest"]
  c_ack = transactions[transactions["MessageType"]=="CancelAcknowledged"]
  cancelled = transactions[transactions["MessageType"]=="Cancelled"]

  if (len(o_req)!=0 and len(o_ack)!=0):
  
    req_time.append((o_ack.iloc[0]["TimeStamp"].timestamp()-o_req.iloc[0]["TimeStamp"].timestamp())*1000)
    o_anomaly_time.append(np.squeeze(o_req["TimeStamp"].values))
    r_exchanges.append(str(np.squeeze(o_req["Exchange"].values)))
    r_symbols.append(str(np.squeeze(o_req["Symbol"].values)))

  else:

    req_time.append(np.nan)
    o_anomaly_time.append(np.nan)
    r_exchanges.append(np.nan)
    r_symbols.append(np.nan)


  if len(c_req)!=0:

    if len(c_ack)!=0:

      cancel_time.append((c_ack.iloc[0]["TimeStamp"].timestamp()-c_req.iloc[0]["TimeStamp"].timestamp())*1000)

    elif len(cancelled)!=0:

      cancel_time.append((cancelled.iloc[0]["TimeStamp"].timestamp()-c_req.iloc[0]["TimeStamp"].timestamp())*1000)

    else:

      cancel_time.append(np.nan)
    c_anomaly_time.append(np.squeeze(c_req["TimeStamp"].values))
    c_exchanges.append(str(np.squeeze(c_req["Exchange"].values)))
    c_symbols.append(str(np.squeeze(c_req["Symbol"].values)))

  else:

    cancel_time.append(np.nan)
    c_anomaly_time.append(np.nan)
    c_exchanges.append(np.nan)
    c_symbols.append(np.nan)

In [10]:
#making the new datasets

order = pd.DataFrame()
order["id"] = unique_ids
order["time"] = o_anomaly_time
order["exchange"] = r_exchanges
order["symbol"] = r_symbols
order["time_difference"] = req_time

cancel = pd.DataFrame()
cancel["id"] = unique_ids
cancel["time"] = c_anomaly_time
cancel["exchange"] = c_exchanges
cancel["symbol"] = c_symbols
cancel["time_difference"] = cancel_time


In [40]:
order = order.dropna()
order.to_csv("/content/drive/MyDrive/ML/cleaned_order")

In [41]:
cancel = cancel.dropna()
cancel.to_csv("/content/drive/MyDrive/ML/cleaned_cancel")

In [13]:
ord_data = order.drop(columns=["time", "id"])
can_data = cancel.drop(columns=["time", "id"])
type(ord_data["symbol"])

pandas.core.series.Series

In [14]:
ord_data = pd.get_dummies(ord_data, columns=["symbol", "exchange"])
can_data = pd.get_dummies(can_data, columns=["symbol", "exchange"])
ord_data

Unnamed: 0,time_difference,symbol_0455B,symbol_05VGI,symbol_07FM5,symbol_07PKN,symbol_0M3Q9,symbol_1H2XW,symbol_1K95A,symbol_1NTNT,symbol_215JE,...,symbol_Y18QU,symbol_Y3SCQ,symbol_Z5HII,symbol_Z9T7U,symbol_ZASDH,symbol_ZHOXR,symbol_ZVLDO,exchange_Aequitas,exchange_Alpha,exchange_TSX
0,0.400066,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.390053,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
9,0.394106,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
11,0.393867,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16,0.408888,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212197,0.063181,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
212203,0.052214,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
212208,0.054121,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
212212,0.051975,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [15]:
ord_data

Unnamed: 0,time_difference,symbol_0455B,symbol_05VGI,symbol_07FM5,symbol_07PKN,symbol_0M3Q9,symbol_1H2XW,symbol_1K95A,symbol_1NTNT,symbol_215JE,...,symbol_Y18QU,symbol_Y3SCQ,symbol_Z5HII,symbol_Z9T7U,symbol_ZASDH,symbol_ZHOXR,symbol_ZVLDO,exchange_Aequitas,exchange_Alpha,exchange_TSX
0,0.400066,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.390053,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
9,0.394106,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
11,0.393867,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16,0.408888,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212197,0.063181,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
212203,0.052214,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
212208,0.054121,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
212212,0.051975,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


#Making the model

In [16]:
import tensorflow as tf
from tensorflow.keras import layers, losses
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt

##preparing train and test sets

In [17]:
data_r = ord_data.drop(columns=["time_difference"])
labels_r = ord_data["time_difference"]

data_c = can_data.drop(columns=["time_difference"])
labels_c = can_data["time_difference"]

In [18]:
train_data_r, test_data_r, train_labels_r, test_labels_r = train_test_split(data_r, labels_r, test_size = 0.2, random_state = 42)
train_data_c, test_data_c, train_labels_c, test_labels_c = train_test_split(data_c, labels_c, test_size = 0.2, random_state = 42)
train_data_r.shape

(36719, 149)

In [2]:
!pip install pyod

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyod
  Downloading pyod-1.0.7.tar.gz (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.7/147.7 KB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyod
  Building wheel for pyod (setup.py) ... [?25l[?25hdone
  Created wheel for pyod: filename=pyod-1.0.7-py3-none-any.whl size=181101 sha256=778a14d746c65e01537e514060ef6f80fe1f9f5abc6003f50266e94ed775c523
  Stored in directory: /root/.cache/pip/wheels/f7/e2/c1/1c7fd8b261e72411f6509afb429c84532e40ddcd96074473f4
Successfully built pyod
Installing collected packages: pyod
Successfully installed pyod-1.0.7


In [3]:
from pyod.utils.data import generate_data

In [33]:
from pyod.models.auto_encoder import AutoEncoder
atcdr_r = AutoEncoder(contamination=0.02, hidden_neurons =[2, 2])
atcdr_r.fit(train_data_r)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 149)               22350     
                                                                 
 dropout_12 (Dropout)        (None, 149)               0         
                                                                 
 dense_16 (Dense)            (None, 149)               22350     
                                                                 
 dropout_13 (Dropout)        (None, 149)               0         
                                                                 
 dense_17 (Dense)            (None, 2)                 300       
                                                                 
 dropout_14 (Dropout)        (None, 2)                 0         
                                                                 
 dense_18 (Dense)            (None, 2)                

AutoEncoder(batch_size=32, contamination=0.02, dropout_rate=0.2, epochs=100,
      hidden_activation='relu', hidden_neurons=[2, 2], l2_regularizer=0.1,
      loss=<function mean_squared_error at 0x7f2257241ee0>,
      optimizer='adam', output_activation='sigmoid', preprocessing=True,
      random_state=None, validation_size=0.1, verbose=1)

In [28]:
# Training data
y_train_scores_r = atcdr_r.decision_function(train_data_r)
y_train_pred_r = atcdr_r.predict(train_data_r)

# Test data
y_test_scores_r = atcdr_r.decision_function(test_data_r)
y_test_pred = atcdr_r.predict(test_data_r) # outlier labels (0 or 1)
y_test_pred #for req

NotFittedError: ignored

In [37]:
atcdr_c = AutoEncoder(contamination=0.02, hidden_neurons =[2, 2])
atcdr_c.fit(train_data_c)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_20 (Dense)            (None, 144)               20880     
                                                                 
 dropout_16 (Dropout)        (None, 144)               0         
                                                                 
 dense_21 (Dense)            (None, 144)               20880     
                                                                 
 dropout_17 (Dropout)        (None, 144)               0         
                                                                 
 dense_22 (Dense)            (None, 2)                 290       
                                                                 
 dropout_18 (Dropout)        (None, 2)                 0         
                                                                 
 dense_23 (Dense)            (None, 2)                

AutoEncoder(batch_size=32, contamination=0.02, dropout_rate=0.2, epochs=100,
      hidden_activation='relu', hidden_neurons=[2, 2], l2_regularizer=0.1,
      loss=<function mean_squared_error at 0x7f2257241ee0>,
      optimizer='adam', output_activation='sigmoid', preprocessing=True,
      random_state=None, validation_size=0.1, verbose=1)

In [25]:
# Training data
y_train_scores_c = atcdr_c.decision_function(train_data_c)
y_train_pred_c = atcdr_c.predict(train_data_c)

# Test data
y_test_scores_c = atcdr_c.decision_function(test_data_c)
y_test_pred_c = atcdr_c.predict(test_data_c) # outlier labels (0 or 1)
y_test_pred_c



array([0, 0, 0, ..., 0, 0, 0])

In [26]:
len(y_test_pred), len(y_test_pred_c)

(9180, 9165)

In [38]:
#final preds for cancel
pred_c_scores = atcdr_c.decision_function(data_c)
pred_c = atcdr_c.predict(data_c)



In [39]:
#total cancelation anomalies 

pred_c.sum()

902

In [43]:
pd.DataFrame(pred_r).to_csv("/content/drive/MyDrive/ML/final_r_preds")

In [44]:
pd.DataFrame(pred_c).to_csv("/content/drive/MyDrive/ML/final_c_preds")

In [34]:
pred_r_scores = atcdr_r.decision_function(data_r)
pred_r = atcdr_r.predict(data_r)
len(pred_r)



45899

In [35]:
#total order placement anomalies 
pred_r.sum()

885

#getting the final dataset

In [86]:
anomaly_1_list = []
for i in range(len(pred_r)):
  if pred_r[i]:
    anomaly_1_list.append("Request Anomaly")
  else:
    anomaly_1_list.append("Normal")

anomaly_2_list = []
for i in range(len(pred_c)):
  if pred_c[i]:
    anomaly_2_list.append("Cancelation Anomaly")
  else:
    anomaly_2_list.append("Normal")


In [87]:
order["anomaly"] = anomaly_1_list
cancel["anomaly"] = anomaly_2_list

In [91]:
with open("/content/drive/MyDrive/Hackathon Squad/a_1.txt", 'a') as f:
    order_string = order.to_string(header=False, index=False)
    f.write(order_string)

In [93]:
with open("/content/drive/MyDrive/Hackathon Squad/a_2.txt", 'a') as f:
    cancel_string = cancel.to_string(header=False, index=False)
    f.write(cancel_string)

In [94]:
#thresholds to classify a transaction as anomaly (miliseconds)
threshold_r = atcdr_r.threshold_
threshold_c = atcdr_c.threshold_
threshold_r, threshold_c

(28.568705957419404, 27.638146923711812)