In [2]:
import re
import os
import sys
sys.dont_write_bytecode = True
import json
import time
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
# from tqdm.notebook import tqdm
from utils import *
from datetime import datetime
from argparse import Namespace
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from collections import defaultdict, Counter, OrderedDict
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch import Tensor
import torch.nn.functional as F
from torch_scatter import scatter
from datasets import (
    Dataset,
    load_dataset, 
    load_metric, 
    load_from_disk, 
    concatenate_datasets
)
# from transformers import (
#     AutoConfig,
#     AutoTokenizer,
#     AutoModel,
#     AutoModelForSeq2SeqLM,
# )

seed = 42

### Inference & Build graph

In [3]:
def gen_structure_data(server):
    """Get labels for the structured data."""
    dev_to_path = {
        k: [os.path.join(AIT_DATA_ROOT, 'data', v, server), os.path.join(AIT_DATA_ROOT, 'labels', v, server)]
        for k,v in AIT_NAME_DICT.items()
    }
    label_df = {'log': [], 'label': [], 'device': []}
    for dev, path_pair in dev_to_path.items():
        label_df['log'].extend(pd.read_csv(path_pair[0], names=['log'])['log'].to_list())
        label_col = pd.read_csv(path_pair[1], header=None)
        label_df['label'].extend([(str(l1), str(l2)) for l1, l2 in zip(label_col[0], label_col[1])])
        label_df['device'].extend([dev for _ in range(len(label_col))])

    label_df = pd.DataFrame(label_df)
    temp_dir = 'dataset/slogert/' + server + '/2-logpai/'
    struct_files = [os.path.join(temp_dir, f) for f in os.listdir(temp_dir) if f.endswith('structured.csv')]
    struct_df = pd.read_csv(struct_files[0], delimiter=",")
    if len(struct_files) > 1:
        for file_path in struct_files[1:]:
            struct_df = struct_df.append(pd.read_csv(file_path, delimiter=","), ignore_index=True)
    struct_df['Label'] = label_df['label']
    print('label set:', set(tuple(x) for x in struct_df['Label']))
    struct_dataset = Dataset.from_dict(struct_df)
    print('structure', struct_dataset)
    print('example[0]:', struct_dataset[0])
    return struct_df, struct_dataset

# Predict and store results on templates
def gen_entities_from_patterns(struct_df, struct_dataset):
    preds_pattern = {}
    for eventID, insIDs in struct_df.groupby(['EventId']).groups.items():
        instance = struct_dataset[int(insIDs[0])] # pick the first instance of each group
        print(instance['Content'])
        preds = prediction(instance['Content'])  # token classification
        print('Pred:', preds)
        entities = list(get_entities_bio(preds)) # merge tokens within the same entity
        entities.sort(key=lambda x: x[1])
        # print('Extracted entities:', entities)
        input_tokens = list(filter(None, re.split(TOKENIZE_PATTERN, instance['Content'])))  
        ent_list = [(tag, ' '.join(input_tokens[start:end+1])) for (tag, start, end) in entities]
        print('Extracted entities:', ent_list)
        preds_pattern[eventID] = entities

    preds = []
    for i, instance in enumerate(struct_dataset):
        ent_ids = preds_pattern[instance['EventId']]
        log = instance['Content']
        input_tokens = list(filter(None, re.split(TOKENIZE_PATTERN, log))) 
        ent_list = [(tag, ' '.join(input_tokens[start:end+1])) for (tag, start, end) in ent_ids]
        preds.append(ent_list)
    
    struct_dataset = struct_dataset.add_column('Preds', preds)
    return preds_pattern, struct_dataset

#### Auth.log

In [4]:
PROCESSED_DIR = 'dataset/processed'
server0 = 'auth.log' # Handle auth.log 
out_file0 = 'auth-log-dataset-processed.json'

if not os.path.isdir(PROCESSED_DIR):
    os.makedirs(PROCESSED_DIR)

out_path0 = os.path.join(PROCESSED_DIR, out_file0)
if os.path.exists(out_path0):
    struct_dataset0 = load_from_disk(out_path0)
else:
    struct_df0, struct_dataset0 = gen_structure_data(server0)
    preds_pattern0, struct_dataset0 = gen_entities_from_patterns(struct_df0, struct_dataset0)
    struct_dataset0.save_to_disk(out_path0)

print(struct_dataset0)
print("example[0]:", struct_dataset0[0])

Dataset({
    features: ['LineId', 'Device', 'Month', 'Date', 'Time', 'Type', 'Component', 'Content', 'EventId', 'EventTemplate', 'ParameterList', 'Label', 'Preds'],
    num_rows: 4610
})
example[0]: {'LineId': 1, 'Device': 'mail.cup.com', 'Month': 'Feb', 'Date': 29, 'Time': '00:09:01', 'Type': 'mail-0', 'Component': 'CRON[32002]', 'Content': 'pam_unix(cron:session): session opened for user root by (uid=0)', 'EventId': 'e962e6c3', 'EventTemplate': 'pam_unix(cron:<*>): session opened for user <*> by (uid=<*>)', 'ParameterList': "['session', 'root', '0']", 'Label': ['0', '0'], 'Preds': [['path', 'pam_unix'], ['user', 'root']]}


#### Daemon.log

In [5]:
server1 = 'daemon.log' # Handle auth.log
out_file1 = 'daemon-log-dataset-processed.json'
out_path1 = os.path.join(PROCESSED_DIR, out_file1)

if os.path.exists(out_path1):
    struct_dataset1 = load_from_disk(out_path1)
else:
    struct_df1, struct_dataset1 = gen_structure_data(server1)
    preds_pattern1, struct_dataset1 = gen_entities_from_patterns(struct_df1, struct_dataset1)
    struct_dataset1.save_to_disk(out_path1)

print(struct_dataset1)
print("example[0]:", struct_dataset1[0])

Dataset({
    features: ['LineId', 'Device', 'Month', 'Date', 'Time', 'Type', 'Component', 'Content', 'EventId', 'EventTemplate', 'ParameterList', 'Label', 'Preds'],
    num_rows: 6121
})
example[0]: {'LineId': 1, 'Device': 'mail.cup.com', 'Month': 'Feb', 'Date': 29, 'Time': '00:09:19', 'Type': 'mail-0', 'Component': 'systemd[1]', 'Content': 'Starting Clean php session files...', 'EventId': '73f564e0', 'EventTemplate': 'Starting Clean php session files...', 'ParameterList': '[]', 'Label': ['0', '0'], 'Preds': []}


#### Mail.log

In [6]:
server2 = 'mail.log' # Handle auth.log 
out_file2 = 'mail-log-dataset-processed.json'
out_path2 = os.path.join(PROCESSED_DIR, out_file2)

if os.path.exists(out_path2):
    struct_dataset2 = load_from_disk(out_path2)
else:
    struct_df2, struct_dataset2 = gen_structure_data(server2)
    preds_pattern2, struct_dataset2 = gen_entities_from_patterns(struct_df2, struct_dataset2)
    struct_dataset2.save_to_disk(out_path2)

print(struct_dataset2)
print("example[0]:", struct_dataset2[0])

Dataset({
    features: ['LineId', 'Device', 'Month', 'Date', 'Time', 'Type', 'Component', 'Content', 'EventId', 'EventTemplate', 'ParameterList', 'Label', 'Preds'],
    num_rows: 360468
})
example[0]: {'LineId': 1, 'Device': 'mail.onion.com', 'Month': 'Mar', 'Date': 3, 'Time': '08:06:22', 'Type': 'mail', 'Component': 'dovecot', 'Content': 'imap(idella): Logged out in=211 out=5132', 'EventId': '7edba00f', 'EventTemplate': 'imap(<*>): Logged out in=<*> out=<*>', 'ParameterList': "['idella', '211', '5132']", 'Label': ['0', '0'], 'Preds': [['user', 'idella'], ['port', '5132']]}


#### Messages

In [7]:
server3 = 'messages' # Handle auth.log 
out_file3 = 'messages-dataset-processed.json'
out_path3 = os.path.join(PROCESSED_DIR, out_file3)

if os.path.exists(out_path3):
    struct_dataset3 = load_from_disk(out_path3)
else:
    struct_df3, struct_dataset3 = gen_structure_data(server3)
    preds_pattern3, struct_dataset3 = gen_entities_from_patterns(struct_df3, struct_dataset3)
    struct_dataset3.save_to_disk(out_path3)

print(struct_dataset3)
print("example[0]:", struct_dataset3[0])

Dataset({
    features: ['LineId', 'Device', 'Month', 'Date', 'Time', 'Type', 'Component', 'Content', 'EventId', 'EventTemplate', 'ParameterList', 'Label', 'Preds'],
    num_rows: 111577
})
example[0]: {'LineId': 1, 'Device': 'mail.cup.com', 'Month': 'Feb', 'Date': 29, 'Time': '00:00:12', 'Type': 'mail-0', 'Component': 'HORDE', 'Content': '[horde] Login success for karri to horde (192.168.10.190) [pid 31779 on line 163 of "/var/www/mail.cup.com/login.php"]', 'EventId': 'c9f3df73', 'EventTemplate': '[<*>] Login success for <*> <*> <*> <*> [pid <*> on line <*> of "/<*>"]', 'ParameterList': "['horde', 'karri', 'to horde (192.168.10.190)', '31779', '163', 'var/www/mail.cup.com/login.php']", 'Label': ['0', '0'], 'Preds': [['server', 'horde'], ['user', 'karri'], ['server', 'horde'], ['ip', '192.168.10.190'], ['pid', '31779']]}


#### Syslog

In [8]:
server4 = 'syslog' # Handle auth.log 
out_file4 = 'syslog-dataset-processed.json'
out_path4 = os.path.join(PROCESSED_DIR, out_file4)

if os.path.exists(out_path4):
    struct_dataset4 = load_from_disk(out_path4)
else:
    struct_df4, struct_dataset4 = gen_structure_data(server4)
    preds_pattern4, struct_dataset4 = gen_entities_from_patterns(struct_df4, struct_dataset4)
    struct_dataset4.save_to_disk(out_path4)

print(struct_dataset4)
print("example[0]:", struct_dataset4[0])

Dataset({
    features: ['LineId', 'Device', 'Month', 'Date', 'Time', 'Type', 'Component', 'Content', 'EventId', 'EventTemplate', 'ParameterList', 'Label', 'Preds'],
    num_rows: 480547
})
example[0]: {'LineId': 1, 'Device': 'mail.spiral.com', 'Month': 'Mar', 'Date': 5, 'Time': '22:01:32', 'Type': 'mail', 'Component': 'dovecot', 'Content': 'imap-login: Login: user=<gilberte>, method=PLAIN, rip=127.0.0.1, lip=127.0.0.1, mpid=30824, secured, session=<3oP8rSKgHoZ/AAAB>', 'EventId': '19d70957', 'EventTemplate': 'imap-login: Login: user=<<*>>, method=<*>, rip=<*>, lip=<*>, mpid=<*>, secured, <*>', 'ParameterList': "['<gilberte>', 'PLAIN', '127.0.0.1', '127.0.0.1', '30824', 'session=<3oP8rSKgHoZ/AAAB>']", 'Label': ['0', '0'], 'Preds': [['user', 'gilberte'], ['ip', '127.0.0.1'], ['ip', '127.0.0.1'], ['pid', '30824'], ['session', '3oP8rSKgHoZ/AAAB']]}


#### User.log

In [9]:
server5 = 'user.log' # Handle user.log 
out_file5 = 'user-dataset-processed.json'
out_path5 = os.path.join(PROCESSED_DIR, out_file5)

if os.path.exists(out_path5):
    struct_dataset5 = load_from_disk(out_path5)
else:
    struct_df5, struct_dataset5 = gen_structure_data(server5)
    preds_pattern5, struct_dataset5 = gen_entities_from_patterns(struct_df5, struct_dataset5)
    struct_dataset5.save_to_disk(out_path5)

print(struct_dataset5)
print("example[0]:", struct_dataset5[0])

Dataset({
    features: ['LineId', 'Device', 'Month', 'Date', 'Time', 'Type', 'Component', 'Content', 'EventId', 'EventTemplate', 'ParameterList', 'Label', 'Preds'],
    num_rows: 111579
})
example[0]: {'LineId': 1, 'Device': 'mail.insect.com', 'Month': 'Mar', 'Date': 5, 'Time': '17:13:31', 'Type': 'mail', 'Component': 'HORDE', 'Content': '[nag] PHP ERROR: Declaration of Horde_Form_Type_country::init($prompt = NULL) should be compatible with Horde_Form_Type_enum::init($values, $prompt = NULL) [pid 10486 on line 0 of "/usr/share/php/Horde/Form/Type.php"]', 'EventId': 'c7a9a5e4', 'EventTemplate': '[<*>] PHP ERROR: Declaration of Horde_Form_Type_country::<*>($prompt = NULL) should be compatible with Horde_Form_Type_enum::<*>($values, $prompt = NULL) [pid <*> on line <*> of "/<*>"]', 'ParameterList': "['nag', 'init', 'init', '10486', '0', 'usr/share/php/Horde/Form/Type.php']", 'Label': ['0', '0'], 'Preds': [['pid', '10486'], ['pid', '0']]}


#### Generate Grouped Datasets (by time interval)

In [10]:
# Combine Day-Month-Time into numerical values
def time2value(examples):
    stamplist, datelist = [], []
    for i, hms_time in enumerate(examples['Time']):
        mon = examples['Month'][i]
        day = examples['Date'][i]
        mon_number = datetime.strptime(mon, '%b').month
        date_string = f'2020-{mon_number}-{day} {hms_time}' # standard datetime string
        datelist.append(date_string)
        dt = datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S").timetuple()
        allsecs = time.mktime(dt)
        stamplist.append(allsecs) # timestamp (numerical) values

    examples['Timestamp'] = stamplist
    examples['Datetime'] = datelist
    return examples


# Merge all hosts and sort logs by Timestamp
PROCESSED_DIR = 'dataset/AIT-LDS-v1_1/processed'
out_file = 'whole-dataset-processed.json'
out_path = os.path.join(PROCESSED_DIR, out_file)
if os.path.exists(out_path):
    whole_dataset = load_from_disk(out_path)
else:
    whole_dataset = concatenate_datasets(
        [struct_dataset0, struct_dataset1, struct_dataset2, struct_dataset3, struct_dataset4, struct_dataset5]
    )
    whole_dataset = whole_dataset.map(time2value, batched=True, load_from_cache_file=None)
    whole_dataset = whole_dataset.remove_columns(['Month', 'Date', 'Time'])
    whole_dataset = whole_dataset.sort('Timestamp') # sort by timestamp
    whole_dataset.save_to_disk(out_path)

print('whole', whole_dataset)
# print("example[0]:", whole_dataset[0])
whole_df = whole_dataset.to_pandas()
whole_df.head()

# Get statistics
df_labels = whole_df.Label.apply(lambda x: 0 if set(x) == set('0') else 1)
print('# anomalies', sum(df_labels))
print('# events', len(whole_df.EventId.unique()))
print('average length', np.mean([len(row.Content) for idx, row in whole_df.iterrows()]))

whole Dataset({
    features: ['LineId', 'Device', 'Type', 'Component', 'Content', 'EventId', 'EventTemplate', 'ParameterList', 'Label', 'Preds', 'Timestamp', 'Datetime'],
    num_rows: 1074902
})
# anomalies 45651
# events 287
average length 159.32038176503534


In [11]:
whole_dataset.to_json('dataset/AIT/AIT.json')

Creating json from Arrow format: 100%|██████████| 1075/1075 [00:18<00:00, 59.68ba/s]


740837983

#### Generate Graphs (by time interval)

In [None]:
def splitbyinterval(df, interval):
    new_df = df.copy(deep=True)
    new_df['Datetime'] = pd.to_datetime(new_df['Datetime'], errors='coerce')
    period = new_df.groupby(pd.Grouper(key='Datetime', freq=interval)).ngroup()
    new_df['Period'] = np.char.add('period_', (pd.factorize(period)[0]).astype(str))
    return new_df

def gen_period_graph(dataframe, root_dir):
    num_unique_intervals = len(dataframe.Period.unique())
    graph_stats = []
    stats_file = os.path.join(root_dir, 'graph_stats.json')
    invalid_entities = set([('ip', '127.0.0.1'), ('pid', '0')]) # filtering invalid entities
    # num_unique_intervals
    for period_id in tqdm(range(num_unique_intervals)):
        df_sub = dataframe.loc[dataframe.Period == f'period_{period_id}'] # get subdf for current group
        num_logs = len(df_sub) # count number of logs in current group
        num_anomaly = len(df_sub.loc[df_sub.Label.apply(lambda x: '0' not in x)]) # count anomaly logs in this group
        anomaly_rate = num_anomaly/num_logs if num_logs else 0 # calculate anomaly rate
        # Store graph
        out_normal_file = os.path.join(root_dir, f'period_{period_id}.txt')
        gen_file = os.path.join(root_dir, f'period_{period_id}.html')
        # Generate entity-tag pairs
        with open(out_normal_file, 'w') as f:
            # for entities in df_sub['Preds']: # for each log 
            #     for i in range(len(entities)-1):
            #         for j in range(i+1, len(entities)):
            #             # tag0, entity0, tag1, entity1
            #             f.write(entities[i][0]+'\t'+entities[i][1]+'\t'+entities[j][0]+'\t'+entities[j][1])
            #             f.write('\n')
            for idx in range(num_logs):
                instance = df_sub.iloc[idx]
                # Connect eventID to Component
                component = re.split('([\[\]])', instance.Component)[0] # remove [XXXX] part
                f.write('event' + '\t' + instance.EventId + '\t' + 'component' + '\t' + component)
                f.write('\n')
                # Connect Component to Device
                f.write('component' + '\t' + component + '\t' + 'device' + '\t' + instance.Device)
                f.write('\n')
                # Connect eventID to each entity
                for entity in instance.Preds:
                    if tuple(entity) not in invalid_entities: # valid entity pair
                        f.write('event' + '\t' + instance.EventId + '\t' + entity[0] + '\t' + entity[1])
                        f.write('\n')
                
        f.close()
        # Generate graph visualization file
        visualize(out_normal_file, gen_file)
        ent_count, edge_count = read_hyper(out_normal_file)
        num_edges = len(edge_count)
        num_nodes = len(ent_count)
        avg_degree = num_edges/num_nodes if num_nodes else 0
        avg_degree = math.ceil(avg_degree*10000)/10000
        stats = {'graph_ID': period_id, '#nodes': num_nodes, '#edges': num_edges, 'degree': avg_degree, '#log': num_logs, '#anomaly': num_anomaly, 'anomaly_rate': anomaly_rate}
        graph_stats.append(stats)

    with open(stats_file, 'w') as f:
        for stats in graph_stats: # for each log 
            f.write(json.dumps(stats))
            f.write('\n')

    f.close()

In [None]:
# Split by time interval
# interval = '10min'
# interval = '5min'
interval = '2min'
grouped_df = splitbyinterval(whole_df, interval)
demo_dir = os.path.join('dataset/new_graph', interval)

if not os.path.exists(demo_dir):
    os.makedirs(demo_dir)

# Generate subgraphs based on time interval
# gen_period_graph(grouped_df, demo_dir)

In [None]:
common_df_path = '/nfs/intern_data/yufli/dataset/AIT_0.5min_df.csv'
data_df = pd.read_csv(common_df_path, engine='c', na_filter=False, memory_map=True)
data_df

#### HDFS dataset demo

In [None]:
from loglizer.models import *
from loglizer import dataloader, preprocessing

In [None]:
struct_log = 'dataset/HDFS-demo/HDFS_100k.log_structured.csv' # The structured log file
label_file = 'dataset/HDFS-demo/anomaly_label.csv' # The anomaly label file

(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(
    struct_log,
    label_file=label_file,
    window='session', 
    train_ratio=0.5,
    split_type='uniform'
)

feature_extractor = preprocessing.FeatureExtractor()
x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf')
x_test = feature_extractor.transform(x_test)

In [None]:
model = SVM()
model.fit(x_train, y_train)
print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)

print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)

#### Test on HDFS benchmark dataset

In [None]:
log_file = 'dataset/HDFS-demo/HDFS_100k.log_structured.csv' # the structured log file
label_file = 'dataset/HDFS-demo/anomaly_label.csv' # the anomaly label file
struct_df = pd.read_csv(log_file, na_filter=False, memory_map=True)
struct_df

In [None]:
from graph_generation import add_preds_to_df

tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large')
bart_model = AutoModel.from_pretrained(f"results/BART_seq2seq/10-shot-0")
add_preds_to_df(struct_df, 'regex', bart_model, tokenizer, 0)

ptk = AutoTokenizer.from_pretrained('bert-large-uncased')
plm = AutoModel.from_pretrained('bert-large-uncased')

In [None]:
# Get blockId and corresponding logs
print("Getting BlockIDs and Logs!!! Total number of logs: {}".format(struct_df.shape[0]))
data_dict = OrderedDict()
for idx, row in tqdm(struct_df.iterrows()):
    blkId_list = re.findall(r'(blk_-?\d+)', row['Content'])
    blkId_set = set(blkId_list)
    for blk_Id in blkId_set:
        if not blk_Id in data_dict:
            data_dict[blk_Id] = defaultdict(list)
            data_dict[blk_Id]['BlockId'] = blk_Id
        for col in struct_df.columns:
            data_dict[blk_Id][col].append(row[col])

data_df = pd.DataFrame(data_dict.values())

# Add labels to each block 
label_data = pd.read_csv(label_file, engine='c', na_filter=False, memory_map=True)
label_data = label_data.set_index('BlockId')
label_dict = label_data['Label'].to_dict()
data_df['Label'] = data_df['BlockId'].apply(lambda x: 1 if label_dict[x] == 'Anomaly' else 0)

In [None]:
data_df.head()

In [None]:
# Split train and test data
print("Splitting graph datasets!!!")
num_total = data_df.shape[0]
normal_samples = data_df[data_df.Label == 0]
anomaly_samples = data_df[data_df.Label == 1]
num_normal = normal_samples.shape[0]
num_anomaly = anomaly_samples.shape[0]
anomaly_rate = num_anomaly/num_total if num_total else 0

train_df, test_normal_df = train_test_split(normal_samples, test_size=0.2, random_state=seed)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=seed)
test_df = pd.concat([anomaly_samples, test_normal_df], ignore_index=True)
test_anomaly_rate = num_anomaly/test_df.shape[0] if test_df.shape[0] else 0

print("Total number of graphs: {}, normal graphs: {}, anomaly graphs: {}, anomaly ratio: {:.4f}".format(
    num_total, num_normal, num_anomaly, anomaly_rate))
print("Train data size: {}, validation data size: {}, test data size: {}, test anomaly ratio: {:.4f}".format(
    train_df.shape[0], val_df.shape[0], test_df.shape[0], test_anomaly_rate))

#### Torch geometric dataset

In [None]:
from graph_dataset import HDFSDataset

# Test graph dataset
root = 'dataset/HDFS/regex'
plm = AutoModel.from_pretrained('bert-large-uncased')
ptk = AutoTokenizer.from_pretrained('bert-large-uncased')
# Define tag_to_id dict
tag2id = {ent:i for i, ent in enumerate(LABEL2TEMPLATE.keys())}
tag2id['event'] = len(tag2id)
tag2id['component'] = len(tag2id)
tag2id['date'] = len(tag2id) # for hdfs dataset
# df = pd.concat([train_df, val_df, test_df], ignore_index=True)
df = pd.DataFrame([])

# Define hyperparameters
hparams = Namespace(
    df=df,
    plm=plm,
    ptk=ptk,
    tag2id=tag2id,
)
hdfs_data = HDFSDataset(root, hparams=hparams)

In [None]:
# Get graph statistics (using huggingface dataset)
hdfs_stats_data = hdfs_data.graph_stats
print(hdfs_stats_data)
# Retrieve normal and anomaly indices
anomaly_ids = [i for i, y in enumerate(hdfs_stats_data['label']) if y == 1]
normal_ids = [i for i, y in enumerate(hdfs_stats_data['label']) if y == 0]
print("#anomly {}, #normal {}".format(len(anomaly_ids), len(normal_ids)))

In [None]:
# Visualization
k = 10
# randomly chose k normal and anomaly graphs
a_sub_ids = random.sample(anomaly_ids, k=k)
for idx in a_sub_ids:
    hdfs_data._visualize(idx)
n_sub_ids = random.sample(normal_ids, k=k)
for idx in n_sub_ids:
    hdfs_data._visualize(idx)

### BGL dataset few-shot learning

In [None]:
log_file = 'dataset/BGL/BGL.log_structured.csv' # the structured log file
bgl_df = pd.read_csv(log_file, na_filter=False, memory_map=True)
bgl_df['Tag'] = bgl_df['Label'].apply(lambda x: 0 if x == '-' else 1)
bgl_df.head()

In [None]:
from graph_generation import splitbyinterval

bgl_demo_df = bgl_df.sample(frac=0.1) # 471349
grouped_df = splitbyinterval(bgl_demo_df, interval='2min')
data_dict = OrderedDict()
for idx, row in tqdm(grouped_df.iterrows()):
    group_id = row['Period']
    if group_id not in data_dict:
        data_dict[group_id] = defaultdict(list)

    for col in grouped_df.columns:
            data_dict[group_id][col].append(row[col])
    data_dict[group_id]

bgl_data_df = pd.DataFrame(data_dict.values())
# Add labels to each group
bgl_data_df['EventLabels'] = bgl_data_df['Label'].apply(lambda x: [0 if item=='-' else 1 for item in x])
bgl_data_df['Label'] = bgl_data_df['Label'].apply(lambda x: 0 if set(x) == set('-') else 1)

In [None]:
bgl_data_df

#### BGL Dataset

In [4]:
from graph_dataset import BGLDataset
from torch_geometric.loader import DataLoader

# Test graph dataset
root = 'dataset/BGL/seq2seq-node-0.5min-template-bertembed/'
# Define tag_to_id dict
tag2id = {ent:i for i, ent in enumerate(LABEL2TEMPLATE.keys())}
tag2id['event'] = len(tag2id)
tag2id['component'] = len(tag2id)
# df = pd.concat([train_df, val_df, test_df], ignore_index=True)
df = pd.DataFrame([])

# Define hyperparameters
hparams = Namespace(
    df=df,
    # plm=plm,
    # ptk=ptk,
    tag2id=tag2id,
)
bgl_data = BGLDataset(root, hparams=hparams)
bgl_loader = DataLoader(bgl_data, batch_size=8, shuffle=True)

Using default argument for "using_event_template"
Using default argument for "batch_size"


Using custom data configuration default-be38ee7521d6d37a
Found cached dataset json (/home/dsi/yufli/.cache/huggingface/datasets/json/default-be38ee7521d6d37a/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)
100%|██████████| 1/1 [00:00<00:00, 64.18it/s]


In [None]:
# Get graph statistics (using huggingface dataset)
bgl_stats_data = bgl_data.graph_stats
print(bgl_stats_data) # |G| = 36169
print(bgl_stats_data[0])
# # Retrieve normal and anomaly indices
# anomaly_ids = [i for i, y in enumerate(bgl_stats_data['label']) if y == 1]
# normal_ids = [i for i, y in enumerate(bgl_stats_data['label']) if y == 0]
# print("#anomly {}, #normal {}".format(len(anomaly_ids), len(normal_ids)))


In [None]:
# Using template as node attribute input
model_path = 'bert-large-uncased' # 'bert-large-uncased', 'facebook/bart-large', 'results/BART_seq2seq/bgl/10-shot-0-10negrate/'
plm = AutoModel.from_pretrained(model_path)
ptk = AutoTokenizer.from_pretrained(model_path)

In [None]:
import math
strategy=0
batch_size=46

graph_dict = bgl_stats_data[11]
ent_tag_dict = {x[1]:x[0] for x in graph_dict['nodes']} # {entity: tag}
graph_prompts = []

for ent, tag in ent_tag_dict.items():
    if tag == 'event':
        prompt = ent + ' is an event ID .'
    elif tag == 'component':
        prompt = ent + ' is a log message component .'
    else:
        prompt = ent + LABEL2TEMPLATE[tag][strategy]
    # print(prompt)
    graph_prompts.append(prompt)

# print(len(graph_prompts))
# Batch handling
if len(graph_prompts) > batch_size:
    feature = []
    num_batch = math.ceil(len(graph_prompts)/batch_size)
    for i in range(num_batch):
        batch_prompts = graph_prompts[i*batch_size: min(len(graph_prompts), (i+1)*batch_size)]

        # Tokenize
        tokenized_inputs = ptk(
            batch_prompts, 
            max_length=1024,
            padding=False,
            truncation=True,
        )

        # Pad dynamically 
        batch = ptk.pad(
            tokenized_inputs,
            padding=True,
            max_length=1024,
            pad_to_multiple_of=8,
            return_tensors="pt",
        ) # ['input_ids', 'token_type_ids', 'attention_mask']

        # Encode
        encode_outputs = plm(**batch) # ['last_hidden_state', 'pooler_output']
        feature.append(encode_outputs.pooler_output.detach().cpu()) # B X H

    feature = torch.cat(feature, dim=0)
    print(feature.shape)

else:
    # Tokenize
    tokenized_inputs = ptk(
        graph_prompts, 
        max_length=1024,
        padding=False,
        truncation=True,
    )

    # Pad dynamically 
    batch = ptk.pad(
        tokenized_inputs,
        padding=True,
        max_length=1024,
        pad_to_multiple_of=8,
        return_tensors="pt",
    ) # ['input_ids', 'token_type_ids', 'attention_mask']

    # Encode
    encode_outputs = plm(**batch) # ['last_hidden_state', 'pooler_output']
    feature = encode_outputs.pooler_output.detach().cpu() # B X H
    print(feature.shape)

In [7]:
# Visualization
k = 10
# randomly chose k normal and anomaly graphs
a_sub_ids = random.sample(range(len(bgl_data)), k=k)
for idx in a_sub_ids:
    bgl_data._visualize(idx)
# n_sub_ids = random.sample(normal_ids, k=k)
# for idx in n_sub_ids:
#     bgl_data._visualize(idx)

In [None]:
from graph_model import GCNGraphEmbedding, GCNNodeEmbedding

m = GCNGraphEmbedding(0.5, 1041, 128, 3)
data0 = bgl_data[0]
for i, batch in enumerate(bgl_loader):
    if i == 0:
        batch0 = batch
        break
outputs = m(x=data0['x'], 
    edge_index=data0['edge_index'], 
    batch = torch.LongTensor([0]*data0['x'].shape[0])
)
print(outputs.size())

### BGL Node Study

In [None]:
from graph_dataset import BGLNodeDataset
from torch_geometric.loader import DataLoader, NeighborLoader
from graph_model import NodeConv, GCN, AENodeConv

root = 'dataset/BGL/regex-node-2min'
# Test graph dataset
plm = AutoModel.from_pretrained('bert-large-uncased')
ptk = AutoTokenizer.from_pretrained('bert-large-uncased')
seq2seq = AutoModelForSeq2SeqLM.from_pretrained('t5-large')
# seq2seq = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-large')
# Define tag_to_id dict
tag2id = {ent:i for i, ent in enumerate(LABEL2TEMPLATE.keys())}
tag2id['event'] = len(tag2id)
tag2id['component'] = len(tag2id)
df = pd.DataFrame([])
model_kwargs = {'model_type': 'ae-gcnae'}
# Define model args
in_channels = EMBED_SIZE + len(tag2id)

# Define hyperparameters
hparams = Namespace(
    df=df,
    plm=plm,
    ptk=ptk,
    tag2id=tag2id,
    feature_dim=in_channels,
    model_kwargs=model_kwargs,
)
bgl_node_data = BGLNodeDataset(root, hparams=hparams)
bgl_node_loader = DataLoader(bgl_node_data, batch_size=64, shuffle=False)
node_model = AENodeConv(hparams)
node_batch = next(iter(bgl_node_loader))

In [None]:
# Get graph statistics (using huggingface dataset)
bgl_node_stats_data = bgl_node_data.graph_stats
print(bgl_node_stats_data)
# Retrieve normal and anomaly indices
anomaly_ids = [i for i, y in enumerate(bgl_node_stats_data['label']) if sum(y) > 0]
normal_ids = [i for i, y in enumerate(bgl_node_stats_data['label']) if sum(y) == 0]
print("#anomly {}, #normal {}".format(len(anomaly_ids), len(normal_ids)))

In [None]:
# Visualization
k = 10
# randomly chose k normal and anomaly graphs
a_sub_ids = random.sample(anomaly_ids, k=k)
for idx in a_sub_ids:
    bgl_node_data._visualize(idx)
n_sub_ids = random.sample(normal_ids, k=k)
for idx in n_sub_ids:
    bgl_node_data._visualize(idx)

#### Anomaly case study (BGL)

In [18]:
log_file = 'dataset/BGL/BGL.log_structured.csv' # the structured log file
bgl_df = pd.read_csv(log_file, na_filter=False, memory_map=True)
bgl_df['Tag'] = bgl_df['Label'].apply(lambda x: 0 if x == '-' else 1)
bgl_df['Datetime'] = bgl_df['Timestamp'].apply(lambda x: datetime.fromtimestamp(x))
print(bgl_df.shape)
bgl_df.head()

(4713493, 16)


Unnamed: 0,LineId,Label,Timestamp,Date,Node,Time,NodeRepeat,Type,Component,Level,Content,EventId,EventTemplate,ParameterList,Tag,Datetime
0,1,-,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.363779,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,3aa50e45,instruction cache parity error corrected,[],0,2005-06-03 18:42:50
1,2,-,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.527847,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,3aa50e45,instruction cache parity error corrected,[],0,2005-06-03 18:42:50
2,3,-,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.675872,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,3aa50e45,instruction cache parity error corrected,[],0,2005-06-03 18:42:50
3,4,-,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.823719,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,3aa50e45,instruction cache parity error corrected,[],0,2005-06-03 18:42:50
4,5,-,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.982731,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,3aa50e45,instruction cache parity error corrected,[],0,2005-06-03 18:42:50


In [21]:
from graph_generation import splitbyinterval, get_train_test_data

# sub_bgl_df = bgl_df.loc[bgl_df.LineId <= 100000]
interval = '0.5min'
grouped_df = splitbyinterval(bgl_df, interval)
data_dict = OrderedDict()
for idx, row in tqdm(grouped_df.iterrows()):
    group_id = row['Period']
    if group_id not in data_dict:
        data_dict[group_id] = defaultdict(list)

    for col in grouped_df.columns:
        data_dict[group_id][col].append(row[col])
    data_dict[group_id]

data_df = pd.DataFrame(data_dict.values())
# Add labels to each group
data_df['EventLabels'] = data_df['Label'].apply(lambda x: [0 if item=='-' else 1 for item in x])
data_df['Label'] = data_df['Label'].apply(lambda x: 0 if set(x) == set('-') else 1)
print(data_df.shape)
data_df.head()

In [24]:
bgl_event_df = bgl_df.groupby('EventId').head(1)[['Content', 'EventTemplate', 'ParameterList']]
bgl_event_df['ner_tags'] = [[] for _ in range(len(bgl_event_df))]
print(bgl_event_df.shape)
bgl_event_df.to_csv('dataset/BGL/BGL.log_structured_event_1.csv')
train_event_df = bgl_event_df.sample(n=500, random_state=seed)
train_event_df.to_csv('dataset/BGL/BGL.log_structured_event_train.csv', index=False)
remain_event_df = bgl_event_df[~bgl_event_df.index.isin(train_event_df.index)]
test_event_df = remain_event_df.sample(frac=0.8, random_state=seed)
val_event_df = remain_event_df[~remain_event_df.index.isin(test_event_df.index)]
val_event_df.to_csv('dataset/BGL/BGL.log_structured_event_val.csv', index=False)
test_event_df.to_csv('dataset/BGL/BGL.log_structured_event_test.csv', index=False)
print(bgl_event_df.shape, train_event_df.shape, val_event_df.shape, test_event_df.shape)

(1848, 4)
(1848, 4) (500, 4) (270, 4) (1078, 4)


In [25]:
from ast import literal_eval

t_file = 'dataset/BGL/BGL.log_structured_event_train.csv'
v_file = 'dataset/BGL/BGL.log_structured_event_val.csv'

def process_csv(file):
    t_df = pd.read_csv(file)
    # Parse string of list
    t_df.ParameterList = t_df.ParameterList.apply(literal_eval)
    t_df.ner_tags = t_df.ner_tags.apply(literal_eval)
    # Get rid of unnamed columns
    if 'Unnamed: 0' in t_df.columns:
        t_df.drop(columns=['Unnamed: 0'], inplace=True)
    # Test correspondence
    for i, row in t_df.iterrows():
        if len(row.ParameterList) != len(row.ner_tags):
            raise ValueError('number of named entities does not match tags!')
    return t_df

df1 = process_csv(t_file)
df2 = process_csv(v_file)
df = pd.concat([df1, df2], ignore_index=True)
df = process_csv(t_file)
df.rename(columns={
    "Content": "logex:example", 
    "EventTemplate": "logex:pattern", 
    "ParameterList": "logex:hasParameterList", 
    "ner_tags": "logex:hasNERtag",
}, inplace=True)
df

ValueError: number of named entities does not match tags!

In [None]:
from utils import ENTITY_COLUMN_NAME, TAG_COLUMN_NAME, LOG_COLUMN_NAME
from datasets import load_dataset, Dataset
from collections import defaultdict

data = Dataset.from_pandas(df)

# Save dataset
out_f = '/nfs/intern_data/yufli/dataset/BGL/dataset.json'
data.to_json(out_f)

# Load dataset
data = load_dataset('json', data_files=out_f)['train']

# Get tag-entity statistics
entity_set = defaultdict(set)
entity_count = defaultdict(list)
for i, instance in enumerate(data):
    for ent, tag in zip(instance[ENTITY_COLUMN_NAME], instance[TAG_COLUMN_NAME]):
            entity_set[tag].add(ent)
            entity_count[tag].append(i)

entity_occ = sum(len(ids) for ids in entity_count.values())
print("Total #entities: {}, average #entities per log: {:.3f}".format(entity_occ, entity_occ/len(data)))
print("Entity distribution ({}): {}".format(len(entity_count), {k:len(v) for k,v in entity_count.items()}))
print('log: "%s",'%data[LOG_COLUMN_NAME][0], 
        'entities: %s,'%data[ENTITY_COLUMN_NAME][0], 
        'tags: %s.'%data[TAG_COLUMN_NAME][0])
for tag, entities in entity_set.items():
    print("\t{} ({}): {}".format(tag, len(entities), entities))

In [None]:
# Split train(10-shot & 5-shot)/val/test data for NER training
n_shots = 10
n_shot_ids = []
ten_shot_ids = []
random.seed(seed)
for tag in entity_count:
    tag_ids = random.choices(entity_count[tag], k=10) 
    ten_shot_ids.extend(tag_ids) # 10-shot
    n_shot_ids.extend(tag_ids[:n_shots])

n_shot_data = data.select(n_shot_ids).shuffle(seed=seed) # n-shot
remain_ids = list(set(range(len(data))) - set(ten_shot_ids)) 
val_ids = random.sample(remain_ids, int(len(remain_ids)*0.3))
val_data = data.select(val_ids)
test_ids = list(set(remain_ids) - set(val_ids))
test_data = data.select(test_ids)
print(n_shot_data)
print(val_data)
print(test_data)

In [None]:
event_anomaly_rate_dict = defaultdict(float)

for eventid, ids in bgl_df.groupby('EventId').groups.items():
    subgroup = bgl_df.loc[ids]
    event_anomaly_rate_dict[eventid] = sum(subgroup.Tag)/len(ids)

event_anomaly_rate = sorted(event_anomaly_rate_dict.items(), key=lambda x: x[1], reverse=True)
print(len(event_anomaly_rate))
print(event_anomaly_rate[:57])

In [None]:
from math import ceil, floor

all_labels = np.array([0 if sum(x) == 0 else 1 for x in bgl_node_data.graph_stats['label']])
anomaly_size = sum(all_labels)
normal_size = len(all_labels) - anomaly_size

n_train = floor(normal_size*0.8) 
val_size = ceil(n_train*0.2)
train_size = floor(n_train*0.8)
test_size = len(bgl_node_data) - train_size - val_size
test_anomaly_rate = anomaly_size/test_size

train_graph_data = bgl_node_data[:train_size]
val_graph_data = bgl_node_data[train_size:train_size + val_size]
test_graph_data = bgl_node_data[train_size + val_size:]

print(len(train_graph_data), len(val_graph_data), len(test_graph_data))

test_loader = DataLoader(
    test_graph_data, 
    batch_size=256, 
    shuffle=False, 
)

event_id = EMBED_SIZE + tag2id['event']

In [None]:
test_gt = []

for batch_idx, batch in enumerate(test_loader):
    is_event_nodes = batch['x'][:, event_id] == 1
    labels = batch['y'][is_event_nodes]
    test_gt.append(labels)

test_gt = torch.cat(test_gt, dim=0).numpy() # N (only for event predictions)
print(len(test_gt), sum(test_gt), sum(test_gt)/len(test_gt))
test_node_stats = bgl_node_data.graph_stats.select(range(train_size + val_size, len(bgl_node_data)))
test_node_stats

In [None]:
test_pred = pd.read_csv('results/bgl/regex-node/GCN-mlp-2min-new/predictions.csv', na_filter=False, memory_map=True)
test_pred.drop(test_pred.columns[test_pred.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)

event_count = []
idx = 0

for graph_id, graph_dict in tqdm(enumerate(test_node_stats)):
    ent_tag_dict = {x[1]:x[0] for x in graph_dict['nodes']} # {entity: tag}
    for ent, tag in ent_tag_dict.items():
        if tag == 'event':
            value = {'eventId': ent, 'graphId': train_size + val_size + graph_id}
            row = test_pred.iloc[idx]
            for col in test_pred.columns:
                value[col] = row[col]
            event_count.append(value)
            idx += 1

event_pred_df = pd.DataFrame(event_count)
event_pred_df

In [None]:
event_stats = {}
TP_graph_set, FN_graph_set, FP_graph_set, TN_graph_set = set(), set(), set(), set()

for idx, row in tqdm(event_pred_df.iterrows()):
    y = 'anomaly' if row.GT == 1 else 'normal'
    if row.eventId not in event_stats:
        event_stats[row.eventId] = {'TP': [], 'FP': [], 'TN': [], 'FN': []}

    if row.GT == 1:
        if row['top80%'] == 1:
            event_stats[row.eventId]['TP'].append(row.graphId)
            if row.graphId not in TP_graph_set:
                # bgl_node_data._visualize(row.graphId, name=f'graph{row.graphId}_{row.eventId}_TP.html')
                TP_graph_set.add(row.graphId)
        else:
            event_stats[row.eventId]['FN'].append(row.graphId)
            if row.graphId not in FN_graph_set:
                # bgl_node_data._visualize(row.graphId, name=f'graph{row.graphId}_{row.eventId}_FN.html')
                FN_graph_set.add(row.graphId)
    else:
        if row['top80%'] == 1:
            event_stats[row.eventId]['FP'].append(row.graphId)
            if row.graphId not in FP_graph_set:
                # bgl_node_data._visualize(row.graphId, name=f'graph{row.graphId}_{row.eventId}_TP.html')
                FP_graph_set.add(row.graphId)
        else:
            event_stats[row.eventId]['TN'].append(row.graphId)
            if row.graphId not in TN_graph_set:
                # bgl_node_data._visualize(row.graphId, name=f'graph{row.graphId}_{row.eventId}_FN.html')
                TN_graph_set.add(row.graphId)

for event, stats in event_stats.items():
    stats['Precision'] = len(stats['TP'])/(len(stats['TP'])+len(stats['FP'])) if len(stats['TP'])+len(stats['FP']) else 0
    stats['Recall'] = len(stats['TP'])/(len(stats['TP'])+len(stats['FN'])) if len(stats['TP'])+len(stats['FN']) else 0
    stats['F1'] = 2*stats['Precision']*stats['Recall']/(stats['Precision']+stats['Recall']) if stats['Precision']+stats['Recall'] else 0

In [None]:
anomaly_events = set([x[0] for x in event_anomaly_rate[:57]])
anomaly_event_stats = []
for event, stats in event_stats.items():
    if event in anomaly_events:
        stats['anomaly_rate'] = event_anomaly_rate_dict[event]
        stats['eventId'] = event
        anomaly_event_stats.append(stats)

anomaly_event_df = pd.DataFrame(anomaly_event_stats)
cols = anomaly_event_df.columns.tolist()
cols = cols[-2:] + cols[:-2]
anomaly_event_df = anomaly_event_df[cols]
anomaly_event_df = anomaly_event_df.sort_values(by=['F1', 'Precision'], ascending=False, ignore_index=True)
anomaly_event_df.to_csv('results/bgl/regex-node/GCN-mlp-2min-new/event-results-analysis.csv', index=False)
anomaly_event_df.head()

In [None]:
for idx, row in tqdm(anomaly_event_df.iterrows()):
    if len(row.TP) and len(row.FN):
        for graphId in row.TP:
            visualize(
                graph_stats=bgl_node_data.graph_stats, 
                root=bgl_node_data.root, 
                out_dir=os.path.join(bgl_node_data.root, 'graph', row.eventId), 
                idx=graphId, 
                name=f'graph_{graphId}_TP.html',
            )
        for graphId in row.FN:
            visualize(
                graph_stats=bgl_node_data.graph_stats, 
                root=bgl_node_data.root, 
                out_dir=os.path.join(bgl_node_data.root, 'graph', row.eventId), 
                idx=graphId, 
                name=f'graph_{graphId}_FN.html',
            )

### New Embedding

In [None]:
event_id = EMBED_SIZE + tag2id['event']
is_event_nodes = batch['x'][:, event_id] == 1
event_x= batch['x'][is_event_nodes]
preds = node_model(
    x=batch['x'], 
    edge_index=batch['edge_index'], 
    batch=batch['batch'],
)
event_preds = preds[is_event_nodes]
event_labels = batch['y'][is_event_nodes]
print(event_x.shape, event_preds.shape, event_labels.shape)

In [None]:
# Epoch end
event2preds = defaultdict(list)
event_x_list = event_x.tolist()

for x, preds in zip(event_x_list, event_preds):
    event2preds[tuple(x)].append(preds)
    
print(len(event2preds))
print([len(v) for k,v in event2preds.items()])
print(sum(len(v) == 1 for k,v in event2preds.items()))

# Average embeddings for each event
event2avg = {k: torch.stack(preds, dim=0).mean(dim=0) for k, preds in event2preds.items()}
print(len(event2avg))

In [None]:
# Training epoch
train_dists = defaultdict(list)
mse_loss = nn.MSELoss(reduction='none')

targets = []
for x in event_x_list:
    targets.append(event2avg[tuple(x)])

targets = torch.stack(targets, dim=0)
print(event_preds.size(), targets.size())
individual_loss = mse_loss(event_preds, targets).sum(dim=-1) # B

for i, x in enumerate(event_x_list):
    train_dists[tuple(x)].append(individual_loss[i].detach().item())
    
print(len(train_dists))

#### AE Baselines

In [None]:
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModel,
    AutoModelForSeq2SeqLM,
)
import pandas as pd
from utils import *
from argparse import Namespace
from graph_dataset import BGLNodeDataset
from torch_geometric.loader import DataLoader, NeighborLoader
from graph_model import NodeConv, GCN, AENodeConv
from SetBart import BartForConditionalGeneration

root = '/nfs/intern_data/yufli/dataset/BGL/regex-node-halfmin-template'

# Test graph dataset
plm = AutoModel.from_pretrained('bert-large-uncased')
ptk = AutoTokenizer.from_pretrained('bert-large-uncased')

# seq2seq = AutoModelForSeq2SeqLM.from_pretrained('t5-large')
config = AutoConfig.from_pretrained('facebook/bart-large')
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large')
seq2seq = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-large')
seq2seq2 = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

# Define tag_to_id dict
tag2id = {ent:i for i, ent in enumerate(LABEL2TEMPLATE.keys())}
tag2id['event'] = len(tag2id)
tag2id['component'] = len(tag2id)
df = pd.DataFrame([])
model_kwargs = {'model_type': 'ae-gcnae'}

# Define model args
in_channels = EMBED_SIZE + len(tag2id)

# Define hyperparameters
hparams = Namespace(
    df=df,
    plm=plm,
    ptk=ptk,
    tag2id=tag2id,
    feature_dim=in_channels,
    model_kwargs=model_kwargs,
)
bgl_node_data = BGLNodeDataset(root, hparams=hparams)
bgl_node_loader = DataLoader(bgl_node_data, batch_size=8, shuffle=False)
node_model = AENodeConv(hparams)

# Modify bart VOCAB
seq2seq.resize_token_embeddings(bgl_node_data.num_nodes)
seq2seq2.resize_token_embeddings(bgl_node_data.num_nodes)

In [None]:
x0 = bgl_node_data[0]
stats0 = bgl_node_data.graph_stats[0]
ent_tag_dict0 = {x[1]:x[0] for x in stats0['nodes']} # {entity: tag}
print(x0)
print(stats0)
print(ent_tag_dict0)
print(list(ent_tag_dict0))

In [None]:
from torch_geometric.data import Data

merged_data = Data(x=x0.x, edge_index=x0.edge_index, y=x0.y, ids=x0.ids, node_strs=list(ent_tag_dict0))
print(merged_data)
print(merged_data.node_strs)

In [None]:
for i, batch in enumerate(bgl_node_loader):
    if i == 0:
        batch0 = batch
    elif i == 1:
        batch1 = batch
    elif i == 2:
        batch2 = batch
    elif i == 3:
        batch3 = batch
    elif i == 4:
        batch4 = batch
    elif i == 5:
        batch5 = batch
    elif i == 6:
        batch6 = batch
    else:
        break

In [None]:
import torch
import torch.nn.functional as F

out_channels = 128
num_layers = 4
embed_dim = 768
# split the number of layers for the encoder and decoders
decoder_layers = int(num_layers / 2)
encoder_layers = num_layers - decoder_layers
num_neigh = -1
alpha = 0.5

gcn_encoder = GCN(in_channels=batch0.x.shape[1],
            hidden_channels=out_channels,
            out_channels=embed_dim,
            num_layers=encoder_layers,
            dropout=0.3,
            act=F.relu)
attr_decoder = GCN(in_channels=embed_dim,
            hidden_channels=out_channels,
            out_channels=batch0.x.shape[1],
            num_layers=decoder_layers,
            dropout=0.3,
            act=F.relu)
struct_decoder = GCN(in_channels=embed_dim,
            hidden_channels=out_channels,
            out_channels=batch0.x.shape[1],
            num_layers=decoder_layers - 1,
            dropout=0.3,
            act=F.relu)

def get_half_index(batch, batch_size=8):
    half_idx = (batch >= int(batch_size/2)).nonzero()[0].item()
    return half_idx

def get_preds_from_batch(node_batch, gcn_model):
    node_idx = torch.arange(node_batch.x.shape[0])
    s = to_dense_adj(node_batch.edge_index)[0] # |V| X |V|
    x_ = gcn_model(x=node_batch.x, edge_index=node_batch.edge_index) # |V| X 1024
    # Split by half number of graphs in this batch
    half_idx = get_half_index(node_batch.batch)
    former_batch = batch0.batch[:half_idx]
    former_x_ = x_[:half_idx]
    former_ids = node_batch.ids[:half_idx]

    latter_batch = batch0.batch[half_idx:]
    latter_x_ = x_[half_idx:]
    latter_ids = node_batch.ids[half_idx:]
    
    return x_.unsqueeze(0), former_ids.unsqueeze(0), former_x_.unsqueeze(0), \
        latter_ids.unsqueeze(0), latter_x_.unsqueeze(0)

def to_sparse_batch(x, mask):
    h = x.shape[-1]
    if x.dim() == 3:
        mask_ = mask.unsqueeze(-1).expand(x.size())
        return torch.masked_select(x, mask_).reshape(-1, h) # |V| X 1024
    elif x.dim() == 2:
        return torch.masked_select(x, mask) # |V|
    

x0_, x0_former_ids, x0_former_embed, x0_latter_ids, x0_latter_embed = get_preds_from_batch(batch0, gcn_encoder)
x1_, x1_former_ids, x1_former_embed, x1_latter_ids, x1_latter_embed = get_preds_from_batch(batch1, gcn_encoder)
x2_, x2_former_ids, x2_former_embed, x2_latter_ids, x2_latter_embed = get_preds_from_batch(batch2, gcn_encoder)
x3_, x3_former_ids, x3_former_embed, x3_latter_ids, x3_latter_embed = get_preds_from_batch(batch3, gcn_encoder)
x4_, x4_former_ids, x4_former_embed, x4_latter_ids, x4_latter_embed = get_preds_from_batch(batch4, gcn_encoder)
x5_, x5_former_ids, x5_former_embed, x5_latter_ids, x5_latter_embed = get_preds_from_batch(batch5, gcn_encoder)
x6_, x6_former_ids, x6_former_embed, x6_latter_ids, x6_latter_embed = get_preds_from_batch(batch6, gcn_encoder)
print([item.shape for item in [x0_, x0_former_ids, x0_former_embed, x0_latter_ids, x0_latter_embed]])
print([item.shape for item in [x1_, x1_former_ids, x1_former_embed, x1_latter_ids, x1_latter_embed]])
print([item.shape for item in [x2_, x2_former_ids, x2_former_embed, x2_latter_ids, x2_latter_embed]])
print([item.shape for item in [x3_, x3_former_ids, x3_former_embed, x3_latter_ids, x3_latter_embed]])
print([item.shape for item in [x4_, x4_former_ids, x4_former_embed, x4_latter_ids, x4_latter_embed]])
print([item.shape for item in [x5_, x5_former_ids, x5_former_embed, x5_latter_ids, x5_latter_embed]])
print([item.shape for item in [x6_, x6_former_ids, x6_former_embed, x6_latter_ids, x6_latter_embed]])

In [None]:
from torch_geometric.utils import subgraph

torch.manual_seed(8)

# handle node and edge index
G = batch5
max_length = 512
perm = torch.randperm(G.num_graphs)
print(perm)
print(G)
for graph_id in perm:
    print(G.get_example(graph_id))
# print(G.get_example(0))
# print(G.get_example(0).edge_index)
# print(G.get_example(2))
# print(G.get_example(2).edge_index)
# print(G.from_data_list([G.get_example(0), G.get_example(2)]))
# print(G.from_data_list([G.get_example(0), G.get_example(2)]).edge_index)

accum_nodes = 0
data_list = []

for graph_id in perm:
    data = G.get_example(graph_id)
    if accum_nodes + data.num_nodes <= max_length:
        accum_nodes += data.num_nodes
        data_list.append(data)

sub_G = G.from_data_list(data_list)

print('Subgraph', sub_G)
# perm = torch.randperm(G.num_nodes)
# print(G.x)
# print(G.edge_index)
# idx = perm[:max_length]
# print(idx)
# sub_G = G.subgraph(idx)
print(sub_G.x)
print(sub_G.edge_index)
# print(sub_G.edge_index.max(), sub_G.edge_index.min())
if not sub_G.edge_index.shape[-1]:
    # Empty edge index
    print("Empty edge index !!!")
    s0 = torch.zeros((sub_G.num_nodes, sub_G.num_nodes))
else:
    s0 = to_dense_adj(sub_G.edge_index, max_num_nodes=len(idx))[0]
print(s0)

In [None]:
# Calculate seq2seq loss
# encoder_output2 = seq2seq.encoder(inputs_embeds=x_2.unsqueeze(0)) # for t5
max_length = config.max_position_embeddings

output = seq2seq(
    inputs_embeds=x5_former_embed[:, :max_length, :], 
    labels=x5_latter_ids[:, :max_length],
    decoder_inputs_embeds=x5_latter_embed[:, :max_length, :],
) # for bart

print(output.keys()) # 'loss', 'logits', 'encoder_last_hidden_state'
print(output.loss) # float
print(output.logits.shape) # B X T (output seq) X vocab_size
print(output.logits)
print(output.encoder_last_hidden_state.shape) # B X T (input seq) X H

In [None]:
# Calculate seq2seq loss
# encoder_output2 = seq2seq.encoder(inputs_embeds=x_2.unsqueeze(0)) # for t5
max_length = config.max_position_embeddings

output2 = seq2seq2(
    inputs_embeds=x5_former_embed[:, :max_length, :], 
    labels=x5_latter_ids[:, :max_length],
    decoder_inputs_embeds=x5_latter_embed[:, :max_length, :],
) # for bart

print(output2.keys()) # 'loss', 'logits', 'encoder_last_hidden_state'
print(output2.loss) # float
print(output2.logits.shape) # B X T (output seq) X vocab_size
print(output2.logits)
print(output2.encoder_last_hidden_state.shape) # B X T (input seq) X H

#### XLNet

In [None]:
xlnet_config = AutoConfig.from_pretrained('xlnet-large-cased')
xnlet_base = AutoModel.from_pretrained('xlnet-base-cased')
xnlet_large = AutoModel.from_pretrained('xlnet-large-cased')

# Modify bart VOCAB
xnlet_base.resize_token_embeddings(bgl_node_data.num_nodes)

In [None]:
# xnlet_model decoder
# from transformers.models.xlnet.modeling_xlnet

decoder_output_final = xnlet_base(
    inputs_embeds=x5_, 
    # decoder_inputs_embeds=x5_latter_embed[:, :max_length, :],
) # for xlnet

print(decoder_output_final.keys()) # 'last_hidden_state', 'mems'
print(decoder_output_final.last_hidden_state.shape) # B X T (input seq) X H

#### GPT-2

In [None]:
gpt2_config = AutoConfig.from_pretrained('gpt2')
gpt2_model = AutoModel.from_pretrained('gpt2')

# Modify bart VOCAB
gpt2_model.resize_token_embeddings(bgl_node_data.num_nodes)

In [None]:
wpe = nn.Embedding(gpt2_config.max_position_embeddings, gpt2_config.hidden_size)
position_ids = torch.arange(0, batch5.num_graphs, dtype=torch.long)
position_ids = position_ids.unsqueeze(0).view(-1, batch5.num_graphs)
print(position_ids)
graph_embed = wpe(position_ids).squeeze(0)
print(graph_embed.shape)

node_embed = torch.stack([graph_embed[graphid] for graphid in batch5.batch], dim=0) # |V| X 1024
print(node_embed.shape)

#### BERT

In [None]:
bert_config = AutoConfig.from_pretrained('bert-base-uncased')
bert_model = AutoModel.from_pretrained('bert-base-uncased')
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# bert_model.resize_token_embeddings(131313)

In [None]:
text = '102.10.67.322 is an IP address.'
ids1 = bert_tokenizer.encode(text, add_special_tokens=False)
ids2 = bert_tokenizer(text, add_special_tokens=False)

print(ids1)
print(ids2)

In [None]:
bert_model.embeddings.word_embeddings.weight[ids1].sum(dim=0).detach().shape

In [None]:
# from transformers.models.bert.modeling_bert

encoder_output_bert = bert_model.encoder(
    x5_, 
    # decoder_inputs_embeds=x5_latter_embed[:, :max_length, :],
) # for xlnet

print(decoder_output_final.keys()) # 'last_hidden_state', 'mems'
print(decoder_output_final.last_hidden_state.shape) # B X T (input seq) X H

In [None]:
# from transformers.models.gpt2.modeling_gpt2

max_length = gpt2_config.max_position_embeddings

decoder_output_gpt2 = gpt2_model(
    inputs_embeds=x5_[:, :max_length, :], 
    # decoder_inputs_embeds=x5_latter_embed[:, :max_length, :],
) # for gpt2

print(decoder_output_gpt2.keys()) # 'last_hidden_state', 'past_key_values'
print(decoder_output_gpt2.last_hidden_state.shape) # B X T (input seq) X H

In [None]:
# Encode sequence
encoder_output2 = seq2seq2.model.encoder(
    inputs_embeds=x5_[:, :max_length, :],
)
print(encoder_output2.keys())
print(encoder_output2.last_hidden_state.shape)
print(encoder_output2.last_hidden_state)

In [None]:
# Encode sequence (BERT)
encoder_output3 = plm.encoder(
    hidden_states=x5_[:, :max_length, :], 
)
print(encoder_output3.keys())
print(encoder_output3.last_hidden_state.shape)
print(encoder_output3.last_hidden_state)

In [None]:
# GCN decode (reconstruction)
pair_max, _ = batch5.edge_index.max(dim=0)
cutted_edge_index = batch5.edge_index.T[pair_max < max_length].T

hidden = encoder_output.last_hidden_state.squeeze(0) # |V_cute| X 1024
x_recover = attr_decoder(x=hidden.detach().cpu(), edge_index=cutted_edge_index)

# Decode adjacency matrix
h_ = struct_decoder(hidden.detach().cpu(), cutted_edge_index)
s_ = h_ @ h_.T

print(x_recover.shape, s_.shape)

In [None]:
# Statistic graph number of nodes
from tqdm import tqdm

node_nums = []
for i, ins in tqdm(enumerate(bgl_node_data.graph_stats)):
    node_set = set([x[1] for x in ins['nodes']])
    node_nums.append(len(node_set))

In [None]:
from collections import Counter

node_nums = np.array(node_nums)
hundred_rate = sum(node_nums > 128)/len(node_nums)
print('{:4f}% of graphs with more than 128 nodes'.format(hundred_rate*100))
thousand_rate = sum(node_nums > 1024)/len(node_nums)
print('{:4f}% of graphs with more than 1024 nodes'.format(thousand_rate*100))
two_thousand_rate = sum(node_nums > 2048)/len(node_nums)
print('{:4f}% of graphs with more than 2048 nodes'.format(two_thousand_rate*100))

node_dist = Counter(node_nums)
# print(node_dist)

In [None]:
TOKENIZE_PATTERN = ' |(=)|(:) |([()])|(,) |([\[\]])|([{}])|([<>])|(\.) |(\.$)'
# s = "Power Good signal deactivated: R73-M1-N5. A service action may be required."
s = "monitor caught java.lang.IllegalStateException: while executing I2C Operation caught java.net.SocketException: Broken pipe and is stopping"
words = list(filter(None, re.split(TOKENIZE_PATTERN, s)))
print(words)

### AIT Node Dataset

In [None]:
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModel,
)
import pandas as pd
from utils import *
from argparse import Namespace
from graph_dataset import BGLNodeDataset
from torch_geometric.loader import DataLoader, NeighborLoader
from graph_model import NodeConv, GCN, AENodeConv

root = '/nfs/intern_data/yufli/dataset/AIT/seq2seq-node-0.5min-template'

# Define tag_to_id dict
tag2id = {ent:i for i, ent in enumerate(LABEL2TEMPLATE.keys())}
tag2id['event'] = len(tag2id)
tag2id['component'] = len(tag2id)
df = pd.DataFrame([])
model_kwargs = {'model_type': 'ae-gcnae'}

# Define model args
in_channels = EMBED_SIZE + len(tag2id)

# Define hyperparameters
hparams = Namespace(
    df=df,
    tag2id=tag2id,
    feature_dim=in_channels,
    model_kwargs=model_kwargs,
)
ait_node_data = BGLNodeDataset(root, hparams=hparams)
ait_node_loader = DataLoader(ait_node_data, batch_size=8, shuffle=False)
b0 = next(iter(ait_node_loader))

In [None]:
import torch.nn as nn

# Transformer forward

# Graph encode
e_ = gcn_encoder(b0.x, b0.edge_index) # |V| X E

# Add position embedding (graph-level)
wpe = nn.Embedding(gpt2_config.max_position_embeddings, gpt2_config.hidden_size)
position_ids = torch.arange(0, b0.num_graphs, dtype=torch.long)
position_ids = position_ids.unsqueeze(0).view(-1, b0.num_graphs) # |G|
graph_embed = wpe(position_ids).squeeze(0) # |G| X E
node_embed = torch.stack([graph_embed[graphid] for graphid in b0.batch], dim=0) # |V| X E
e_ = e_ + node_embed # |V| X E

# GPT2 LM
outputs = gpt2_lm_model(
    inputs_embeds=e_.unsqueeze(0),
    labels=b0.ids.unsqueeze(0),
)
print(outputs.keys()) # ['loss', 'logits', 'past_key_values', 'hidden_states']
print(outputs.loss)
last_hidden_state = outputs.hidden_states[-1].squeeze(0) # |V| X E
print(last_hidden_state.shape)

In [None]:
model_path = 'results/BART_seq2seq/ait/10-shot-0'
config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
seq2seq = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [None]:
# strategy = 0
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
# seq2seq = seq2seq

# def get_attribute(examples):

#     all_graph_ids = []
#     for i, node_pair_list in enumerate(examples['nodes']):
        
#         ent_tag_dict = {x[1]:x[0] for x in node_pair_list} # {entity: tag}
#         # print(ent_tag_dict)

#         # For each graph, gets template list
#         graph_templates = []
#         for ent, tag in ent_tag_dict.items():
#             # print('ent {} tag {}'.format(ent, tag))
#             # First get template
#             if tag == 'event':
#                 template = ent + ' is an event ID .'
#             elif tag == 'component':
#                 template = ent + ' is a log message component .'
#             else:
#                 template = ent + LABEL2TEMPLATE[tag][strategy]
            
#             graph_templates.append(template)

#         # Tokenize and encode each template in a graph
#         tokenized_inputs = tokenizer(
#             graph_templates,
#             padding=False,
#             truncation=True,
#             max_length=1024,
#         )

#         # Pad dynamically 
#         batch = tokenizer.pad(
#             tokenized_inputs,
#             padding=True,
#             max_length=1024,
#             pad_to_multiple_of=8,
#             return_tensors="pt",
#         )

#         # Get Encoder outputs
#         encoder_outputs = seq2seq.model.encoder(**batch)

#         all_graph_ids.append(encoder_outputs.last_hidden_state)

#         # print(tokenized_inputs)
    
#     examples['embeddings'] = all_graph_ids
#     return examples


# subdata = ait_node_data.graph_stats.select(range(100)).map(
#     get_attribute,
#     batched=True,
#     num_proc=None,
#     load_from_cache_file=False,
# )

### Edge Detection

In [None]:
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModel,
)
import pandas as pd
from utils import *
from argparse import Namespace
from graph_dataset import BGLNodeDataset
from torch_geometric.loader import DataLoader, NeighborLoader
from graph_model import NodeConv, GCN, AENodeConv

root = '/nfs/intern_data/yufli/dataset/BGL/seq2seq-edge-0.5min-template-bertembed'

# Define tag_to_id dict
tag2id = {ent:i for i, ent in enumerate(LABEL2TEMPLATE.keys())}
tag2id['event'] = len(tag2id)
tag2id['component'] = len(tag2id)
df = pd.DataFrame([])
model_kwargs = {'model_type': 'dynamic'}

# Define model args
in_channels = EMBED_SIZE + len(tag2id)

# Define hyperparameters
hparams = Namespace(
    df=df,
    tag2id=tag2id,
    feature_dim=in_channels,
    model_kwargs=model_kwargs,
)
ait_node_data = BGLNodeDataset(root, hparams=hparams)
ait_node_loader = DataLoader(ait_node_data, batch_size=4, shuffle=False)
b0 = next(iter(ait_node_loader))

In [None]:
from torch_geometric.utils import degree

p_a = nn.Parameter(torch.randn(1024))
p_b = nn.Parameter(torch.randn(1024))
beta = 0.5
mu = 0.5
gamma = 0.5

def score_func(hidden, i, j, weight):
    dist = (p_a * hidden[i] + p_b * hidden[j]).norm()**2
    print('dist {}, weight {}'.format(dist, weight))
    return weight * (beta * (dist - mu)).sigmoid()

def neg_sampling(degrees, i, j, s):
    # negative sampling
    prob_i = degrees[i]/(degrees[i] + degrees[j]) if degrees[i] + degrees[j] else 0
    if torch.rand(1) <= prob_i:
        # replace node i
        i_prime = j
        while i_prime == j or s[i_prime, j] != 0:
            i_prime = torch.randint(s.size()[0], (1,)).item()
        return i_prime, j
    else:
        # replace node j
        j_prime = i
        while j_prime == i or s[i, j_prime] != 0:
            j_prime = torch.randint(s.size()[0], (1,)).item()
        return i, j_prime

def margin_loss(hidden, edge_index, s, num_nodes):
    degrees = degree(edge_index[0], num_nodes)
    scores = []
    for i, j in edge_index.T.tolist():
        pos_score = score_func(hidden, i, j, s[i, j])
        # negative sampling
        i_prime, j_prime = neg_sampling(degrees, i, j, s)
        neg_score = score_func(hidden, i_prime, j_prime, s[i, j])

        if pos_score <= neg_score:
            edge_loss = max(0, gamma + pos_score - neg_score)
            scores.append(edge_loss)
            # edge_loss = F.relu(gamma + pos_score - neg_score)
            # print(edge_loss)
            # loss += edge_loss
            # print(loss)
    print(torch.stack(scores))
    return sum(scores)

In [None]:
# BATCH
print(b0, b0.num_graphs)
b0.s = to_dense_adj(b0.edge_index)[0]
print('feature', b0.x)
print('edge index', b0.edge_index)
print('adjacency matrix', b0.s)
# margin_loss(b0.x, b0.edge_index, b0.s, b0.num_nodes)

In [None]:
# Graph in BATCH
for i in range(b0.num_graphs): 
    print(b0[i])

i = 0
print('graph #{}: {}'.format(i, b0[i]))
print('# of nodes', b0[i].num_nodes)
print('feature', b0[i].x)
print('edge index', b0[i].edge_index)
print('adjacency matrix', to_dense_adj(b0[i].edge_index)[0])
print('degree', degree(b0[i].edge_index[0], b0[i].num_nodes))

In [None]:
degree(b0.edge_index[0], b0.num_nodes)

In [None]:
print(b0.s)
print(b0.s.size()[0])
print(b0.s[1])
print(b0.s[1].nonzero().size()[0])
print(b0.s[:,1])
print(b0.s[:,1].nonzero().size()[0])

### Sock Shop Data

In [9]:
file = 'dataset/sockshop/database.csv'
df = pd.read_csv(file) # 49442 rows
df['ParameterList'] = [[] for _ in range(len(df))] # create entity list
df['NerList'] = [[] for _ in range(len(df))] # create ner list
df.head()

Unnamed: 0,source,@timestamp,container_id,log,container_name,message,ParameterList,NerList
0,stdout,2023-01-05T19:54:04+00:00,b7a9cccb5f14f1fa7f5c82593112ff5d07523ac0e8b1fe...,2023-01-05 19:54:04.558 INFO 7 --- [ ...,/docker-compose-queue-master-1,,[],[]
1,stdout,2023-01-05T19:54:04+00:00,b7a9cccb5f14f1fa7f5c82593112ff5d07523ac0e8b1fe...,2023-01-05 19:54:04.604 INFO 7 --- [ ...,/docker-compose-queue-master-1,,[],[]
2,stdout,2023-01-05T19:54:04+00:00,b7a9cccb5f14f1fa7f5c82593112ff5d07523ac0e8b1fe...,2023-01-05 19:54:04.634 INFO 7 --- [ ...,/docker-compose-queue-master-1,,[],[]
3,stdout,2023-01-05T19:54:04+00:00,b7a9cccb5f14f1fa7f5c82593112ff5d07523ac0e8b1fe...,2023-01-05 19:54:04.787 INFO 7 --- [ ...,/docker-compose-queue-master-1,,[],[]
4,stdout,2023-01-05T19:54:06+00:00,b7a9cccb5f14f1fa7f5c82593112ff5d07523ac0e8b1fe...,2023-01-05 19:54:06.603 INFO 7 --- [ost-start...,/docker-compose-queue-master-1,,[],[]


In [21]:
df.loc[
    # df['log'].str.contains("item", na = False) &
    df['log'].str.contains("customer", na = False)
].to_csv('dataset/sockshop/database_customer.csv') # select rows with item & customer