In [1]:
from dapnn.imports import *
from dapnn.data_processing import *
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import random
import torch

import warnings
warnings.filterwarnings(action='once')

In [2]:
# Set the random seed for reproducible results
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

seed_everything()

keeping only the traces in the log that will create the normal model

In [7]:
log_path='data/csv/PDC2020_training/'
log = import_log(log_path)

num_traces = len(log['trace_id'].unique())
ratio = 0.8
traces_for_normal_model = int(num_traces * ratio)

log = log[log['trace_id'].str.split(' ').str[1].astype(int) < traces_for_normal_model]
log.head(5)

['data/csv/PDC2020_training/pdc_2020_0000000.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000001.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000010.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000011.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000100.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000101.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000110.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000111.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001000.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001001.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001010.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001011.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001100.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001101.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001110.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001111.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0010000.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0010001.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0010010.c

Unnamed: 0_level_0,activity,trace_id,event_id
trace_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
trace 1,###start###,trace 1,0
trace 1,t11,trace 1,1
trace 1,t26,trace 1,2
trace 1,t21,trace 1,3
trace 1,t32,trace 1,4


In [8]:
def training_dl(log,cat_names='activity',seed=45,ws=5,bs=32):
    categorify=Categorify()
    o=PPObj(log,procs=categorify,cat_names=cat_names,y_names=cat_names,splits=split_traces(log,test_seed=seed,validation_seed=seed))
    dls=o.get_dls(windows=partial(windows_fast,ws=ws),bs=bs)
    return o,dls,categorify

In [34]:
class ControlFlowModel(torch.nn.Module) :
    def __init__(self, o) :
        super().__init__()
        hidden=25
        vocab_act=len(o.procs.categorify['activity'])
        emb_dim_act = int(np.sqrt(vocab_act))+1

        self.emb_act = nn.Embedding(vocab_act,emb_dim_act)
        
        self.lstm_act = nn.LSTM(emb_dim_act, hidden, batch_first=True, num_layers=2)

        self.linear_act = nn.Linear(hidden, vocab_act)

    def forward(self, xcat):
        xcat=xcat[:,0]
        x_act=xcat
        x_act = self.emb_act(x_act)
        x_act,_ = self.lstm_act(x_act)
        x_act = x_act[:,-1]
        x_act = self.linear_act(x_act)
        return x_act

In [10]:
class HideOutput:
    'A utility function that hides all outputs in a context'
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout


def training_loop(learn,epoch,print_output,lr_find,fixed_learning_rate=0.01):
    '''
    Basic training loop that uses learning rate finder and one cycle training. 
    See fastai docs for more information
    '''
    if lr_find:
        lr=np.median([learn.lr_find(show_plot=print_output)[0] for i in range(5)])
        learn.fit_one_cycle(epoch,float(lr))
    else: learn.fit(epoch,fixed_learning_rate)


def train_validate(dls,m,metrics=accuracy,loss=F.cross_entropy,epoch=20,print_output=True,model_dir=".",lr_find=True,
                   patience=5,min_delta=0.005,show_plot=True,store_path='tmp',model_name='.model'):
    '''
    Trains a model on the training set with early stopping based on the validation loss.
    Afterwards, applies it to the test set.
    '''
    cbs = [
      EarlyStoppingCallback(monitor='valid_loss',min_delta=min_delta, patience=patience),
      SaveModelCallback(fname=model_name),
      ]
    learn=Learner(dls, m, path=store_path, model_dir=model_dir, loss_func=loss ,metrics=metrics,cbs=cbs)

    if print_output:
        training_loop(learn,epoch,show_plot,lr_find=lr_find)
        return learn.validate(dl=dls[2])
    else:
        with HideOutput(),learn.no_bar(),learn.no_logging():
            training_loop(learn,epoch,show_plot,lr_find=lr_find)
            return learn.validate(dl=dls[2])

In [11]:
squeeze_cross_entropy = lambda x,y:F.cross_entropy(x,y[0])
squeeze_accuracy =lambda x,y:accuracy(x,y[0])

In [12]:
o,dls,categorify = training_dl(log)
m = ControlFlowModel(o)
train_val = train_validate(dls,m,epoch=20,metrics=squeeze_accuracy,loss=squeeze_cross_entropy, show_plot=False)

  if not is_categorical_dtype(c):


epoch,train_loss,valid_loss,Unnamed: 3,time
0,1.16731,1.16357,0.550733,05:48
1,1.090757,1.146879,0.537908,05:47
2,1.114977,1.140593,0.552122,05:48
3,1.172542,1.149181,0.546106,05:47
4,1.138215,1.148465,0.547308,05:48
5,1.113332,1.135598,0.550795,05:47
6,1.133046,1.128004,0.551089,05:51
7,1.105586,1.128067,0.54537,05:45
8,1.10991,1.122441,0.55168,05:45
9,1.097285,1.12409,0.549291,05:45


Better model found at epoch 0 with valid_loss value: 1.1635704040527344.
Better model found at epoch 1 with valid_loss value: 1.146878719329834.
Better model found at epoch 2 with valid_loss value: 1.140593409538269.
Better model found at epoch 5 with valid_loss value: 1.1355983018875122.
Better model found at epoch 6 with valid_loss value: 1.1280035972595215.
Better model found at epoch 8 with valid_loss value: 1.1224406957626343.
Better model found at epoch 10 with valid_loss value: 1.1215351819992065.
Better model found at epoch 11 with valid_loss value: 1.1153411865234375.
Better model found at epoch 12 with valid_loss value: 1.1092145442962646.
Better model found at epoch 13 with valid_loss value: 1.1067320108413696.
Better model found at epoch 14 with valid_loss value: 1.0969079732894897.
Better model found at epoch 15 with valid_loss value: 1.0940167903900146.
Better model found at epoch 16 with valid_loss value: 1.090057373046875.
Better model found at epoch 17 with valid_loss 

Better model found at epoch 0 with valid_loss value: 0.5627988576889038.


In [39]:
# save the model m
torch.save(m.state_dict(), 'save_model/reg_model.pth')

In [None]:
# load the model m
m = ControlFlowModel(o)
m.load_state_dict(torch.load('save_model/reg_model.pth'))

In [36]:
test_path = 'data/csv/PDC2020_ground_truth/'
test_log = import_log(test_path)
test_log[test_log['normal'] == False].head(5)

['data/csv/PDC2020_ground_truth/pdc_2020_0000000.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0000001.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0000010.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0000011.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0000100.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0000101.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0000110.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0000111.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0001000.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0001001.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0001010.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0001011.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0001100.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0001101.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0001110.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0001111.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0010000.csv.gz', 'data/csv/PDC2020_ground_truth

Unnamed: 0_level_0,activity,trace_id,case:pdc:costs,normal,event_id
trace_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
trace 1,###start###,trace 1,2.0,False,0
trace 1,t11,trace 1,2.0,False,1
trace 1,t21,trace 1,2.0,False,2
trace 1,t26,trace 1,2.0,False,3
trace 1,t35,trace 1,2.0,False,4


In [37]:
def process_test(test_log,categorify,cat_names='activity'):
    o=PPObj(test_log,procs=categorify,cat_names=cat_names,y_names=cat_names,do_setup=False)
    o.process() # map to the same categories as in the training sett
    return o


def predict_next_step(o,m,ws=5):
    wds,idx=partial(windows_fast,ws=ws)(o.xs, o.event_ids)
    res=(m(LongTensor(wds).cuda())) #changed to cpu for memory reasons
    return res,idx


def calc_anomaly_score(res,o,idx):
    sm = nn.Softmax(dim=1)
    y = o.items['activity'].iloc[idx].values
    p = sm(res)
    pred = p.max(1)[0]
    truth = p[list(range(0, len(y))),y]
    a_score = ((pred - truth) / pred).cuda().detach().numpy()
    return a_score


def get_anomalies(a_score,o,idx,threshhold=0.98):
    df=pd.DataFrame(columns=['a_score'])
    df['a_score'] = a_score
    df['trace_id'] = o.items.iloc[idx]['trace_id'].values
    df['normal'] = o.items.iloc[idx]['normal'].values
    # print(df.head(50)) # THIS DF CONTAINS ALL THE ANOMALY SCORES

    y_true = (df.loc[df.trace_id.drop_duplicates().index].normal==False).tolist()
    cases = df.loc[df.trace_id.drop_duplicates().index].trace_id.tolist()
    anomalies = set(list(df.loc[df['a_score'] > threshhold]['trace_id']))
    y_pred=[case in anomalies for case in cases]
    return y_pred, y_true

In [None]:
# split test_log into 20 because of memory issues
y_pred, y_true = [], []
num_traces = len(test_log.trace_id.unique())
for i in range(50):
    print("round ", i)

    lower_bound = int(i * num_traces / 50)
    upper_bound = int((i + 1) * num_traces / 50)

    condition = (test_log['trace_id'].str.split(' ').str[1].astype(int) > lower_bound) & \
                (test_log['trace_id'].str.split(' ').str[1].astype(int) < upper_bound)

    test_log_part = test_log[condition]
    o_test = process_test(test_log_part,categorify)
    res,idx = predict_next_step(o_test,m)
    a_score = calc_anomaly_score(res,o_test,idx)
    y_pred_part, y_true_part = get_anomalies(a_score,o_test,idx)
    y_pred += y_pred_part
    y_true += y_true_part

In [41]:
o = process_test(test_log,categorify)
nsp, idx = predict_next_step(o, m)

anomaly_score = calc_anomaly_score(nsp, o, idx)
y_pred, y_true = get_anomalies(anomaly_score,o,idx)

  if not is_categorical_dtype(c):


RuntimeError: CUDA out of memory. Tried to allocate 206.00 MiB (GPU 0; 10.91 GiB total capacity; 6.58 MiB already allocated; 16.38 MiB free; 14.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
print(f'F1 score: {f1_score(y_true, y_pred)}')
print(f'Accuracy score: {accuracy_score(y_true, y_pred)}')
print(f'Precision score: {precision_score(y_true, y_pred)}')
print(f'Recall score: {recall_score(y_true, y_pred)}')