In [1]:
# Project wildcard imports stay first so the explicit imports below take
# precedence on any name clash.
from dapnn.imports import *
from dapnn.data_processing import *

import os
import random

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch

import warnings
# 'once' prints only the first occurrence of each matching warning, which keeps
# repeated library warnings from flooding the cell outputs.
warnings.filterwarnings(action='once')

In [2]:
def seed_everything(seed=42):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Also stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Parameters
    ----------
    seed : int, default 42
        Value handed to every random number generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The three seeders are independent of each other, so one loop covers them.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)


# Fix the seeds once, right after the imports, for reproducible results.
seed_everything()

In [3]:
def mapping(log):
    """Encode the ``activity`` column of an event log as integer ids.

    The distinct activity labels are sorted and numbered ``0 .. n-1``, so the
    same set of labels always produces the same encoding.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log with an ``activity`` column.

    Returns
    -------
    tuple
        ``(encoded_log, activity_to_id)`` where ``encoded_log`` is a new
        DataFrame whose ``activity`` column holds the integer ids and
        ``activity_to_id`` is the ``label -> id`` dictionary.  The frame passed
        in is left unmodified, so re-running a cell that calls this function
        does not re-encode already encoded data behind the caller's back.

    TODO: maybe remove start and end?
    """
    # Sorted distinct labels give a deterministic label -> id assignment.
    labels = sorted(log['activity'].unique())
    activity_to_id = {label: idx for idx, label in enumerate(labels)}

    # `assign` returns a new frame instead of overwriting the caller's column.
    encoded_log = log.assign(activity=log['activity'].map(activity_to_id))

    return encoded_log, activity_to_id
    
def groupTraces(log):
    """Collapse an event log into one list of activities per trace.

    Events keep their row order inside each trace.  The traces themselves come
    back in sorted ``trace_id`` order (pandas' default ``sort=True``); for
    string ids that order is lexicographic, e.g. ``'trace 10'`` sorts before
    ``'trace 2'``.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log with ``trace_id`` and ``activity`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per trace; the ``activity`` column holds that trace's events
        as a Python list.
    """
    # The ``trace_id`` column is promoted to the index before grouping on it,
    # exactly as before.  NOTE(review): presumably needed because the imported
    # log carries ``trace_id`` both as index name and as column - confirm.
    events_by_trace = log.set_index('trace_id').groupby('trace_id', as_index=False)['activity']

    # `apply(list)` already yields plain lists, so no second copying pass is needed.
    return events_by_trace.apply(list)


def df_to_file(df, file_name):
    """Write every trace in ``df`` to ``file_name``, one trace per line.

    Each line holds the items of one row's ``activity`` sequence, converted
    with ``str`` and separated by single spaces.  An existing file is
    overwritten.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``activity`` column of sequences.
    file_name : str
        Path of the text file to write.
    """
    with open(file_name, 'w') as f:
        # Iterate the column directly: `iterrows()` would build a Series object
        # for every row, which is needlessly slow for large logs.
        f.writelines(
            ' '.join(map(str, events)) + '\n'
            for events in df['activity']
        )

In [4]:
# Directory (with trailing slash) that receives every generated data file.
save_path = "romBigData/"
# Create it up front: `open(..., 'w')` does not create missing directories, so
# on a fresh checkout the first write would fail with FileNotFoundError.
os.makedirs(save_path, exist_ok=True)

# Train Set

In [5]:
# Folder holding the PDC2020 training logs.
log_path = 'data/csv/PDC2020_training/'
log = import_log(log_path)

# Peek at the first rows (`head()` shows 5 rows by default).
log.head()

['data/csv/PDC2020_training/pdc_2020_0000000.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000001.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000010.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000011.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000100.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000101.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000110.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000111.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001000.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001001.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001010.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001011.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001100.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001101.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001110.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001111.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0010000.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0010001.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0010010.c

Unnamed: 0_level_0,activity,trace_id,event_id
trace_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
trace 1,###start###,trace 1,0
trace 1,t11,trace 1,1
trace 1,t26,trace 1,2
trace 1,t21,trace 1,3
trace 1,t32,trace 1,4


In [6]:
# Share of the traces assigned to the normal model; the rest goes to the process model.
ratio = 0.8

num_traces = len(log['trace_id'].unique())
traces_for_normal_model = int(ratio * num_traces)
traces_for_process_model = num_traces - traces_for_normal_model

print("Number of traces: ", num_traces)
print("Traces for normal model: ", traces_for_normal_model)
print("Traces for process model: ", traces_for_process_model)

Number of traces:  192000
Traces for normal model:  153600
Traces for process model:  38400


In [7]:
# TODO: maybe take random traces for normal model and the rest for process model
# TODO: maybe the train mapper should be saved for later use and should be done only for the normal model
log, train_mapper = mapping(log)
grouped = groupTraces(log)

# Positional split: the leading rows feed the normal model, the remaining rows
# the process model.
grouped_model = grouped.iloc[:traces_for_normal_model]
grouped_process = grouped.iloc[traces_for_normal_model:]

# One text file per split, one trace per line.
df_to_file(grouped_model, save_path + 'train')
df_to_file(grouped_process, save_path + 'process_train')

  grouped = log.set_index('trace_id').groupby('trace_id', as_index=False)['activity'].apply(list)


## Train Set Statistics

In [8]:
# Number of events in every trace of each split.
model_lengths = grouped_model['activity'].apply(len)
process_lengths = grouped_process['activity'].apply(len)

# Print the same summary for both splits.
for section_title, section_traces, section_lengths in (
    ('normal model', grouped_model, model_lengths),
    ('process model', grouped_process, process_lengths),
):
    print('='*10 + section_title + '='*10)
    print('num of traces: ', len(section_traces))
    print('avg sequence length: ', section_lengths.mean())
    print('max sequence length: ', section_lengths.max())
    print('min sequence length: ', section_lengths.min())
    print('median sequence length: ', section_lengths.median())
    # The mapper was built on the full training log, so this count is shared.
    print('num of activities: ', len(train_mapper))

num of traces:  153600
avg sequence length:  26.458541666666665
max sequence length:  346
min sequence length:  5
median sequence length:  18.0
num of activities:  38
num of traces:  38400
avg sequence length:  38.846927083333334
max sequence length:  312
min sequence length:  5
median sequence length:  29.0
num of activities:  38


# Test Set

In [9]:
# Folder holding the PDC2020 ground-truth logs used as the test set.
test_path = 'data/csv/PDC2020_ground_truth/'
test_log = import_log(test_path)

# Peek at the first rows (`head()` shows 5 rows by default).
test_log.head()

['data/csv/PDC2020_ground_truth/pdc_2020_0000000.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0000001.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0000010.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0000011.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0000100.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0000101.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0000110.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0000111.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0001000.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0001001.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0001010.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0001011.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0001100.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0001101.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0001110.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0001111.csv.gz', 'data/csv/PDC2020_ground_truth/pdc_2020_0010000.csv.gz', 'data/csv/PDC2020_ground_truth

Unnamed: 0_level_0,activity,trace_id,case:pdc:costs,normal,event_id
trace_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
trace 1,###start###,trace 1,2.0,False,0
trace 1,t11,trace 1,2.0,False,1
trace 1,t21,trace 1,2.0,False,2
trace 1,t26,trace 1,2.0,False,3
trace 1,t35,trace 1,2.0,False,4


In [10]:
# Distinct trace ids present in the test log.
test_log['trace_id'].unique()

array(['trace 1', 'trace 2', 'trace 3', ..., 'trace 191998',
       'trace 191999', 'trace 192000'], dtype=object)

In [11]:
# Split the ground-truth events by their `normal` flag.  The explicit
# `== False` / `== True` comparisons are kept on purpose: a row whose flag is
# missing matches neither mask.
abnormal = test_log[test_log['normal'] == False].copy()
normal = test_log[test_log['normal'] == True].copy()

# `Series.map` silently yields NaN for labels that are not keys of
# `train_mapper`; that turns the column into floats and writes 'nan' / '3.0'
# tokens to the test files.  Fail loudly instead of producing corrupt files
# (this also catches a `train_mapper` rebuilt from an already encoded log).
known_activities = list(train_mapper)
for subset_name, subset in (('abnormal', abnormal), ('normal', normal)):
    unseen = subset.loc[~subset['activity'].isin(known_activities), 'activity'].unique()
    if len(unseen):
        raise ValueError(
            "{} activity label(s) in the {} test traces are missing from "
            "train_mapper, e.g. {}".format(len(unseen), subset_name, list(unseen)[:10])
        )

# Encode the activities with the mapping fitted on the training log.
abnormal['activity'] = abnormal['activity'].map(train_mapper)
normal['activity'] = normal['activity'].map(train_mapper)

abnormal_grouped = groupTraces(abnormal)
normal_grouped = groupTraces(normal)

df_to_file(abnormal_grouped, save_path + 'test_abnormal')
df_to_file(normal_grouped, save_path + 'test_normal')

  grouped = log.set_index('trace_id').groupby('trace_id', as_index=False)['activity'].apply(list)
  grouped = log.set_index('trace_id').groupby('trace_id', as_index=False)['activity'].apply(list)


## Test Set Statistics

In [12]:
# Number of events in every test trace, per class.
abnormal_lengths = abnormal_grouped['activity'].apply(len)
normal_lengths = normal_grouped['activity'].apply(len)

# Overall figures across both classes.
total_traces = len(abnormal_grouped) + len(normal_grouped)
total_events = abnormal_lengths.sum() + normal_lengths.sum()
print('num of traces: ', total_traces)
print('avg sequence length: ', total_events / total_traces)

# Same summary for each class.
for class_label, class_traces, class_lengths in (
    ('abnormal', abnormal_grouped, abnormal_lengths),
    ('normal', normal_grouped, normal_lengths),
):
    print('='*10 + ' ' + class_label + ' ' + '='*10)
    print('num of ' + class_label + ' traces: ', len(class_traces))
    print('avg sequence length: ', class_lengths.mean())
    print('max sequence length: ', class_lengths.max())
    print('min sequence length: ', class_lengths.min())
    print('median sequence length: ', class_lengths.median())



num of traces:  192000
avg sequence length:  28.876885416666667
num of abnormal traces:  91566
avg sequence length:  29.113011379769784
max sequence length:  292
min sequence length:  5
median sequence length:  19.0
num of normal traces:  100434
avg sequence length:  28.661608618595295
max sequence length:  302
min sequence length:  6
median sequence length:  18.0
