In [1]:
import os
import pandas as pd
from argparse import ArgumentParser

from logbert.bert_pytorch import Predictor, Trainer
from logbert.bert_pytorch.dataset import WordVocab
from common import Utils


# Experiment configuration.
# Everything the Trainer / Predictor cells need is carried in the single
# `options` dict. Keys are declared in a fixed order because the dict is
# written out verbatim by Utils.save_parameters at the end of this cell.
# NOTE(review): the meaning of each hyper-parameter is defined inside the
# logbert package, which is not visible from this notebook.
options = {
    # run identity and base locations (paths are expanded further down)
    "model_name": "logbert",
    "dataset_name": "aiia",
    "device": "cuda",
    "output_dir": "~/.output/",
    "model_dir": "logbert_aiia/",

    # data split ratios
    "train_ratio": 1,
    "valid_ratio": 0.1,
    "test_ratio": 1,

    # training loop settings
    "max_epoch": 200,
    "n_epochs_stop": 5,
    "n_warm_up_epoch": 0,
    "batch_size": 32,
    "lr": 0.001,

    # input features / vocabulary
    "is_logkey": True,
    "is_time": False,
    "min_freq": 1,

    # sequence shaping and masking
    "seq_len": 512,
    "min_len": 10,
    "max_len": 512,
    "mask_ratio": 0.5,

    # windowing and loss switches
    "window_size": 20,
    "adaptive_window": True,
    "deepsvdd_loss": False,
    "deepsvdd_loss_test": False,

    # scaling (scale_path is replaced with a real path below) and model size
    "scale": None,
    "scale_path": None,
    "hidden": 256,
    "layers": 4,

    # remaining model / optimiser / logging / prediction settings
    "attn_heads": 4,
    "num_workers": 5,
    "adam_beta1": 0.9,
    "adam_beta2": 0.999,
    "adam_weight_decay": 0.00,
    "log_freq": 100,
    "num_candidates": 15,
}


# Resolve directories: "~" is expanded and the dataset name becomes a
# sub-directory of the output root; the model directory lives beneath that.
options["output_dir"] = os.path.expanduser(
    options["output_dir"] + options["dataset_name"] + "/"
)
options["model_dir"] = options["output_dir"] + options["model_dir"]

# Derived file locations plus the list of evaluation files
# (evalue-0.txt.test ... evalue-9.txt.test).
options.update({
    "train_vocab": options["output_dir"] + "train",
    "vocab_path": options["output_dir"] + "vocab.pkl",  # pickle file
    "model_path": options["model_dir"] + "best_model.pth",
    "scale_path": options["model_dir"] + "scale.pkl",
    "testset_files": ["evalue-{}.txt.test".format(idx) for idx in range(10)],
})

# Make sure the model directory is present before anything is written to it.
if not os.path.exists(options["model_dir"]):
    os.makedirs(options["model_dir"], exist_ok=True)

# Fixed seed so repeated runs start from the same random state.
Utils.seed_everything(seed=1234)

# Persist the full configuration next to the model artefacts.
print("Save options parameters")
Utils.save_parameters(options, options["model_dir"] + "parameters.txt")

Save options parameters


In [2]:
# Build and pickle the vocabulary from the training file, but only when no
# vocab file is present at options["vocab_path"]; an existing file is reused
# untouched, which keeps this cell cheap to re-run.
if not os.path.exists(options["vocab_path"]):
    # Each line of the training file is handed to WordVocab as one text entry.
    with open(options["train_vocab"]) as train_file:
        texts = list(train_file)

    vocab = WordVocab(texts, min_freq=options["min_freq"])

    print("VOCAB SIZE:", len(vocab))
    print("save vocab in", options["vocab_path"])
    print("\n")

    vocab.save_vocab(options["vocab_path"])

In [None]:
# Train the model with the configuration held in `options`. Per the captured
# output below, the run loads the vocab, builds the BERT model, and saves
# best_model.pth under the model directory after validation epochs.
# The Trainer is deliberately not bound to a name, so no reference to it is
# kept in the notebook namespace once training returns.
Trainer(options).train()

  8%|▊         | 3066/36855 [00:00<00:01, 30653.82it/s]

Loading vocab /root/.output/aiia/vocab.pkl
vocab Size:  261
before filtering short session
train size  33170
valid size  3685


100%|██████████| 36855/36855 [00:01<00:00, 30533.18it/s]


Num of train seqs 33170
Num of valid seqs 3685
Building BERT model
Creating BERT Trainer
True
Total Parameters: 2243846
Training Start


Epoch: 0 | phase: train, loss=2.990953335775832
logkey loss: 2.990953335775832, deepsvdd loss: 0.0

Epoch: 0 | phase: valid, loss=1.5545365800028261
logkey loss: 1.5545365800028261, deepsvdd loss: 0.0

Log saved
 Model Saved on: /root/.output/aiia/logbert_aiia/best_model.pth


Epoch: 1 | phase: train, loss=1.249563146051753
logkey loss: 1.249563146051753, deepsvdd loss: 0.0

Epoch: 1 | phase: valid, loss=0.9906563567078632
logkey loss: 0.9906563567078632, deepsvdd loss: 0.0

Log saved
 Model Saved on: /root/.output/aiia/logbert_aiia/best_model.pth


Epoch: 2 | phase: train, loss=0.839844182191208
logkey loss: 0.839844182191208, deepsvdd loss: 0.0

Epoch: 2 | phase: valid, loss=0.6998442209285238
logkey loss: 0.6998442209285238, deepsvdd loss: 0.0

Log saved
 Model Saved on: /root/.output/aiia/logbert_aiia/best_model.pth


Epoch: 3 | phase: train, loss

In [None]:
# Run prediction for the AIIA dataset using the configuration in `options`.
# NOTE(review): presumably this loads the checkpoint at options["model_path"]
# written by the training cell — confirm in Predictor.predict_aiia.
Predictor(options).predict_aiia()

In [None]:
# Predict every evaluation file listed in options["testset_files"] and write
# one summary row per file to <output_dir>/result.csv.
#
# Rows are gathered in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# it also copied the whole frame on every iteration.
result_rows = []

for testset_file in options["testset_files"]:
    print("Now predicting "+testset_file+".")
    evaluefile_path = "evalue/" + testset_file

    # A fresh Predictor is created per file, exactly as before.
    predict_result, elapsed_time = Predictor(options).predict_testset_aiia(
        evaluefile_path, seq_threshold=0.1
    )
    result_rows.append({
        "Result": predict_result,
        # NOTE(review): StartLineNum and Detail are fixed constants here, not
        # derived from the prediction — confirm this is what is intended.
        "StartLineNum": 2,
        "Detail": 1,
        "TimeCost": elapsed_time,
    })

# Passing `columns` keeps the column order (and the header for an empty run)
# identical to the previous incrementally-built frame.
result_df = pd.DataFrame(
    result_rows, columns=['Result','StartLineNum','Detail','TimeCost']
)

result_df.to_csv(options["output_dir"]+"result.csv")