In [1]:
from dapnn.imports import *
from dapnn.data_processing import *
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import random
import torch

import warnings
warnings.filterwarnings(action='once')

In [2]:
# Seed every random number generator used in this notebook so runs are repeatable
def seed_everything(seed=42):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    The seed is also exported as the PYTHONHASHSEED environment variable.
    NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
    presumably only matters for processes launched afterwards — confirm.

    Args:
        seed: integer seed handed to every generator (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

seed_everything()

In [17]:
def mapping(log):
    """Encode the activity labels of an event log as consecutive integers.

    The distinct values of ``log['activity']`` are sorted and numbered from 0
    in that order. The frame passed in is left untouched; a new frame with the
    encoded column is returned instead.

    TODO: maybe remove start and end?

    Args:
        log: DataFrame with an ``activity`` column.

    Returns:
        A ``(encoded_log, activity_to_id)`` tuple: a new DataFrame equal to
        ``log`` except that ``activity`` holds the integer codes, and the dict
        from original label to integer code.
    """
    # Sorted so the numbering does not depend on the order of the rows.
    labels = sorted(log['activity'].unique())
    activity_to_id = {label: code for code, label in enumerate(labels)}

    # assign() builds a new frame, so the caller's `log` keeps its labels and
    # calling this function twice on the same input gives the same result.
    encoded_log = log.assign(activity=log['activity'].map(activity_to_id))

    return encoded_log, activity_to_id
    
def groupTraces(log):
    """Collect the activities of every trace into one list per trace.

    Args:
        log: DataFrame with ``trace_id`` and ``activity`` columns, one row per
            event.

    Returns:
        DataFrame with one row per trace whose ``activity`` column holds the
        list of that trace's activities in their original row order.
    """
    # Moving the trace_id column into the index replaces whatever index the
    # frame had before grouping.
    # NOTE(review): presumably needed because the imported log carries
    # trace_id both as index name and as column — confirm.
    by_trace = log.set_index('trace_id').groupby('trace_id', as_index=False)
    # apply(list) already yields a fresh list per trace, so no second pass
    # over the result is needed.
    return by_trace['activity'].apply(list)


def df_to_file(df, file_name):
    """Write the traces in ``df`` to a text file, one trace per line.

    Each line is the space-separated ``str()`` of the events in one row's
    ``activity`` value, followed by a newline.

    Args:
        df: DataFrame whose ``activity`` column holds an iterable of events
            per row.
        file_name: path of the text file to create or overwrite.
    """
    with open(file_name, 'w') as f:
        # Iterate the column directly: iterrows() would build a Series for
        # every row just to read this one field.
        f.writelines(
            ' '.join(map(str, events)) + '\n' for events in df['activity']
        )

# Train Set

In [9]:
# Location of the PDC 2020 training logs, relative to the notebook's working directory.
log_path='data/csv/PDC2020_training/'
# import_log is a project helper (star-imported above).
# NOTE(review): the list of file names it prints suggests it reads every
# .csv.gz file under the directory into one frame — confirm.
log = import_log(log_path)
# Preview the first five events.
log.head(5)

['data/csv/PDC2020_training/pdc_2020_0000000.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000001.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000010.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000011.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000100.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000101.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000110.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0000111.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001000.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001001.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001010.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001011.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001100.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001101.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001110.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0001111.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0010000.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0010001.csv.gz', 'data/csv/PDC2020_training/pdc_2020_0010010.c

Unnamed: 0_level_0,activity,trace_id,event_id
trace_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
trace 1,###start###,trace 1,0
trace 1,t11,trace 1,1
trace 1,t26,trace 1,2
trace 1,t21,trace 1,3
trace 1,t32,trace 1,4


In [5]:
# Distinct trace identifiers present in the training log.
log['trace_id'].unique()

array(['trace 1', 'trace 2', 'trace 3', ..., 'trace 191998',
       'trace 191999', 'trace 192000'], dtype=object)

In [18]:
# Encode a copy and keep `log` bound to the raw data so this cell can be
# re-run: feeding an already-encoded log back into mapping() would produce an
# int -> int mapper that no longer matches the string labels the test cells
# look up in `train_mapper`.
train_log, train_mapper = mapping(log.copy())
grouped = groupTraces(train_log)
# NOTE(review): the test cells write to 'romdata/' while this cell writes to
# 'romBigData/' — confirm the different directory is intended.
df_to_file(grouped, 'romBigData/train')

  grouped = log.set_index('trace_id').groupby('trace_id', as_index=False)['activity'].apply(list)


KeyboardInterrupt: 

# Test Set

In [None]:
# Load one ground-truth log of the PDC 2020 data set.
# NOTE(review): `log_name` is not assigned in any cell visible in this
# notebook; on a fresh kernel this line raises NameError unless it is defined
# elsewhere — TODO confirm and define it explicitly (e.g. in a config cell).
test_log = import_log(f'data/csv/PDC2020_ground_truth/{log_name}.csv.gz')
# Preview the first five events.
test_log.head(5)

In [None]:
# Split the ground-truth log on its `normal` column. The explicit comparisons
# are kept on purpose: rows whose flag is neither True nor False end up in
# neither subset, exactly as before.
normal_flag = test_log['normal']
abnormal = test_log[normal_flag == False].copy()
normal = test_log[normal_flag == True].copy()

# Re-use the mapping fitted on the training log so both sets share one encoding.
abnormal['activity'] = abnormal['activity'].map(train_mapper)
normal['activity'] = normal['activity'].map(train_mapper)

# Labels absent from train_mapper come out of .map() as NaN, which also turns
# the whole column into floats; the exported files would then contain tokens
# such as "3.0" or "nan". Report it instead of letting it pass silently.
n_unmapped = int(abnormal['activity'].isna().sum() + normal['activity'].isna().sum())
if n_unmapped:
    warnings.warn(
        f'{n_unmapped} test events have an activity that is missing from '
        'train_mapper; they are exported as NaN'
    )

# Show a preview only; printing the whole frame floods the cell output.
print(normal.head())

abnormal_grouped = groupTraces(abnormal)
normal_grouped = groupTraces(normal)

df_to_file(abnormal_grouped, 'romdata/test_abnormal')
df_to_file(normal_grouped, 'romdata/test_normal')