# 1. Importing

In [1]:
import logging
import os
import sys
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.decomposition import FastICA

from CONSTANTS import DEVICE, LOG_ROOT, PROJECT_ROOT, SESSION
from models.gru import AttGRUModel
from module.Common import batch_variable_inst, data_iter, generate_tinsts_binary_label
from module.Optimizer import Optimizer
from preprocessing.AutoLabeling import Probabilistic_Labeling
from preprocessing.datacutter.SimpleCutting import cut_by
from preprocessing.Preprocess import Preprocessor
from representations.sequences.statistics import Sequential_TF
from representations.templates.statistics import (
    Simple_template_TF_IDF,
    Template_TF_IDF_without_clean,
)
from utils.Vocab import Vocab

2024-09-15 06:56:32,631 - AttGRU - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Construct logger for Attention-Based GRU succeeded, current working directory: /Users/minhthienlongvo/research/MetaLog, logs will be written in /Users/minhthienlongvo/research/MetaLog/logs
2024-09-15 06:56:33,003 - Preprocessor - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Construct logger for MetaLog succeeded, current working directory: /Users/minhthienlongvo/research/MetaLog, logs will be written in /Users/minhthienlongvo/research/MetaLog/logs
2024-09-15 06:56:33,005 - StatisticsRepresentation. - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Construct logger for Statistics Representation succeeded, current working directory: /Users/minhthienlongvo/research/MetaLog, logs will be written in /Users/minhthienlongvo/research/MetaLog/logs
2024-09-15 06:56:33,006 - Statistics_Template_Encoder - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Construct logger for Statistics Template Encoder succeede

# 2. Custom default params

In [2]:
# Custom params
lstm_hiddens = 64
num_layer = 4
batch_size = 100
drop_out = 0.2
epochs = 10

word2vec_file = "glove.42B.300d.txt"
dim = 300
alpha = 2e-3
beta = 2
gamma = 2e-3


parser = "IBM"
mode = "train"
min_cluster_size = 100
min_samples = 100
reduce_dimension = 50
threshold = 0.5

# 3. Function for updating model

In [3]:
def get_updated_network(old, new, lr, load=False):
    updated_theta = {}
    state_dicts = old.state_dict()
    param_dicts = dict(old.named_parameters())

    for i, (k, v) in enumerate(state_dicts.items()):
        if k in param_dicts.keys() and param_dicts[k].grad is not None:
            updated_theta[k] = param_dicts[k] - lr * param_dicts[k].grad
        else:
            updated_theta[k] = state_dicts[k]
    if load:
        new.load_state_dict(updated_theta)
    else:
        new = put_theta(new, updated_theta)
    return new


def put_theta(model, theta):
    def k_param_fn(tmp_model, name=None):
        if len(tmp_model._modules) != 0:
            for k, v in tmp_model._modules.items():
                if name is None:
                    k_param_fn(v, name=str(k))
                else:
                    k_param_fn(v, name=str(name + "." + k))
        else:
            for k, v in tmp_model._parameters.items():
                if not isinstance(v, torch.Tensor):
                    continue
                tmp_model._parameters[k] = theta[str(name + "." + k)]

    k_param_fn(model)
    return model

# 4. MetaLog class

In [4]:
class MetaLog:
    def __init__(self, vocab, num_layer, hidden_size, drop_out, label2id):
        self.label2id = label2id
        self.vocab = vocab
        self.num_layer = num_layer
        self.hidden_size = hidden_size
        self.batch_size = 128
        self.test_batch_size = 1024
        self.drop_out = drop_out
        self.model = AttGRUModel(vocab, self.num_layer, self.hidden_size, self.drop_out)
        self.bk_model = AttGRUModel(
            vocab, self.num_layer, self.hidden_size, self.drop_out, is_backup=True
        )
        if torch.cuda.is_available():
            self.model = self.model.cuda(DEVICE)
            self.bk_model = self.bk_model.cuda(DEVICE)
        elif hasattr(torch.mps, "is_available") and torch.mps.is_available():
            self.model = self.model.to(DEVICE)
            self.bk_model = self.bk_model.to(DEVICE)
        self.loss = nn.BCELoss()

    def forward(self, inputs, targets):
        tag_logits = self.model(inputs)
        tag_logits = F.softmax(tag_logits, dim=1)
        loss = self.loss(tag_logits, targets)
        return loss

    def bk_forward(self, inputs, targets):
        tag_logits = self.bk_model(inputs)
        tag_logits = F.softmax(tag_logits, dim=1)
        loss = self.loss(tag_logits, targets)
        return loss

    def predict(self, inputs, threshold=None):
        with torch.no_grad():
            tag_logits = self.model(inputs)
            tag_logits = F.softmax(tag_logits, dim=1)
        if threshold is not None:
            probs = tag_logits.detach().cpu().numpy()
            anomaly_id = self.label2id["Anomalous"]
            pred_tags = np.zeros(probs.shape[0])
            for i, logits in enumerate(probs):
                if logits[anomaly_id] >= threshold:
                    pred_tags[i] = anomaly_id
                else:
                    pred_tags[i] = 1 - anomaly_id

        else:
            pred_tags = tag_logits.detach().max(1)[1].cpu()
        return pred_tags, tag_logits

    def evaluate(self, dataset, instances, threshold=0.5):
        logger.info(f"Start evaluating {dataset} by threshold {threshold}")
        with torch.no_grad():
            self.model.eval()
            globalBatchNum = 0
            TP, TN, FP, FN = 0, 0, 0, 0
            tag_correct, tag_total = 0, 0
            for onebatch in data_iter(instances, self.test_batch_size, False):
                tinst = generate_tinsts_binary_label(onebatch, vocab_BGL, False)
                if torch.cuda.is_available():
                    tinst.to_cuda(DEVICE)
                elif hasattr(torch.mps, "is_available") and torch.mps.is_available():
                    tinst.to_mps(DEVICE)
                self.model.eval()
                pred_tags, tag_logits = self.predict(tinst.inputs, threshold)
                for inst, bmatch in batch_variable_inst(
                    onebatch, pred_tags, tag_logits, processor_BGL.id2tag
                ):
                    tag_total += 1
                    if bmatch:
                        tag_correct += 1
                        if inst.label == "Normal":
                            TN += 1
                        else:
                            TP += 1
                    else:
                        if inst.label == "Normal":
                            FP += 1
                        else:
                            FN += 1
                globalBatchNum += 1
            if TP + FP != 0:
                precision = 100 * TP / (TP + FP)
                recall = 100 * TP / (TP + FN)
                f1_score = 2 * precision * recall / (precision + recall)
                # fpr = 100 * FP / (FP + TN)
                logger.info(
                    f"{dataset}: F1 score = {f1_score} | Precision = {precision} | Recall = {recall}"
                )
            else:
                logger.info(
                    f"{dataset}: F1 score = {0} | Precision = {0} | Recall = {0}"
                )
                precision, recall, f1_score = 0, 0, 0
        return precision, recall, f1_score

# 5. Logging

## 5.1. Logging config

In [5]:
logger = logging.getLogger("MetaLog")
logger.setLevel(logging.DEBUG)
console_handler = logging.StreamHandler(sys.stderr)
console_handler.setLevel(logging.DEBUG)
console_handler.setFormatter(
    logging.Formatter(
        "%(asctime)s - %(name)s - " + SESSION + " - %(levelname)s: %(message)s"
    )
)
file_handler = logging.FileHandler(os.path.join(LOG_ROOT, "MetaLog.log"))
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(
    logging.Formatter(
        "%(asctime)s - %(name)s - " + SESSION + " - %(levelname)s: %(message)s"
    )
)
logger.addHandler(console_handler)
logger.addHandler(file_handler)
logger.info(
    f"Construct logger for MetaLog succeeded, current working directory: {os.getcwd()}, logs will be written in {LOG_ROOT}"
)

2024-09-15 06:56:33,042 - MetaLog - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Construct logger for MetaLog succeeded, current working directory: /Users/minhthienlongvo/research/MetaLog, logs will be written in /Users/minhthienlongvo/research/MetaLog/logs


## 5.2. Log custom params

In [6]:
logger.info(
    f"Network params: lstm_hiddens = {lstm_hiddens} | num_layer = {num_layer} | drop_out = {drop_out}."
)
logger.info(
    f"Hyper-params: alpha = {alpha} | beta = {beta} | gamma = {gamma} | word2vec_file = {word2vec_file}."
)

2024-09-15 06:56:33,047 - MetaLog - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Network params: lstm_hiddens = 128 | num_layer = 4 | drop_out = 0.5.
2024-09-15 06:56:33,048 - MetaLog - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Hyper-params: alpha = 0.002 | beta = 2 | gamma = 0.002 | word2vec_file = glove.42B.300d.txt.


# 6. Import dataset

## 6.1. Import BGL dataset

In [7]:
dataset = "BGL"

# # Mark results saving directories.
save_dir = os.path.join(PROJECT_ROOT, "outputs")
prob_label_res_file_BGL = os.path.join(
    save_dir,
    "results/MetaLog/"
    + dataset
    + "_"
    + parser
    + "/prob_label_res/mcs-"
    + str(min_cluster_size)
    + "_ms-"
    + str(min_samples),
)
rand_state_BGL = os.path.join(
    save_dir,
    "results/MetaLog/" + dataset + "_" + parser + "/prob_label_res/random_state",
)

output_model_dir = os.path.join(
    save_dir, "models/MetaLog/" + dataset + "_" + parser + "/model"
)
output_res_dir = os.path.join(
    save_dir, "results/MetaLog/" + dataset + "_" + parser + "/detect_res"
)

# Training, Validating and Testing instances.
template_encoder_BGL = (
    Template_TF_IDF_without_clean(word2vec_file)
    if dataset == "NC"
    else Simple_template_TF_IDF(word2vec_file)
)
processor_BGL = Preprocessor()
train_BGL, _, test_BGL = processor_BGL.process(
    dataset=dataset,
    parsing=parser,
    cut_func=cut_by(0.3, 0., 0.01),
    template_encoding=template_encoder_BGL.present,
)

# Log sequence representation.
sequential_encoder_BGL = Sequential_TF(processor_BGL.embedding)
train_reprs_BGL = sequential_encoder_BGL.present(train_BGL)
for index, inst in enumerate(train_BGL):
    inst.repr = train_reprs_BGL[index]
test_reprs_BGL = sequential_encoder_BGL.present(test_BGL)
for index, inst in enumerate(test_BGL):
    inst.repr = test_reprs_BGL[index]

# Dimension reduction if specified.
transformer_BGL = None
if reduce_dimension != -1:
    start_time = time.time()
    print(f"Start FastICA, target dimension: {reduce_dimension}.")
    transformer_BGL = FastICA(n_components=reduce_dimension)
    train_reprs_BGL = transformer_BGL.fit_transform(train_reprs_BGL)
    for idx, inst in enumerate(train_BGL):
        inst.repr = train_reprs_BGL[idx]
    print(f"Finished at {round(time.time() - start_time, 2)}.")

# Probabilistic labeling.
# Sample normal instances.
train_normal_BGL = [x for x, inst in enumerate(train_BGL) if inst.label == "Normal"]
normal_ids_BGL = train_normal_BGL[: int(0.5 * len(train_normal_BGL))]
label_generator_BGL = Probabilistic_Labeling(
    min_samples=min_samples,
    min_clust_size=min_cluster_size,
    res_file=prob_label_res_file_BGL,
    rand_state_file=rand_state_BGL,
)
labeled_train_BGL = label_generator_BGL.auto_label(train_BGL, normal_ids_BGL)

# Load Embeddings
vocab_BGL = Vocab()
vocab_BGL.load_from_dict(processor_BGL.embedding)

2024-09-15 06:56:33,054 - Statistics_Template_Encoder - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Loading word2vec dict from glove.42B.300d.txt.
100%|██████████| 1917494/1917494 [00:55<00:00, 34657.57it/s]
2024-09-15 06:57:35,233 - Statistics_Template_Encoder - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Total 1917494 words in glove.42B.300d.txt dict.
2024-09-15 06:57:35,264 - BGLLoader - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Construct self.logger success, current working directory: /Users/minhthienlongvo/research/MetaLog, logs will be written in /Users/minhthienlongvo/research/MetaLog/logs
2024-09-15 06:57:35,272 - BGLLoader - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Start load from previous extraction. File path /Users/minhthienlongvo/research/MetaLog/datasets/BGL/raw_log_seqs.txt
100%|██████████| 85577/85577 [00:00<00:00, 104884.56it/s]
2024-09-15 06:57:36,261 - BGLLoader - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Extraction finished successful

* * * * * * * * * * * * * * * * * * * * * * * * * * * * * ciod * * * * * * * * * * * * * * * * * * * * * * ciodb ciod * * * * * * * * ciod * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * fefff * fefff * * * * * * * * * * * * * ciod * * * * * ciod * * * * * * * * * ciod * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * regctl miscompare * * * * sernum * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * mpgood * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * bglmaster * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * ciod * * * * * * * * * ciod * * * * * * * * * * * * * * * * sernum * * ciod * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * mpgood mpgood * * * * * * * * * * * mpgood mpgood ciod * * *

100%|██████████| 85577/85577 [00:00<00:00, 176389.28it/s]
2024-09-15 06:57:39,175 - Preprocessor - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Train: 16352 Normal, 110 Anomalous instances.
2024-09-15 06:57:39,176 - Preprocessor - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Test: 32922 Normal, 26982 Anomalous instances.
2024-09-15 06:57:39,511 - Preprocessor - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Update train instances' event-idx mapping.
2024-09-15 06:57:39,590 - Preprocessor - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Embed size: 120 in pre dataset.
2024-09-15 06:57:39,591 - Preprocessor - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Update test instances' event-idx mapping.
2024-09-15 06:57:39,724 - Preprocessor - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Embed size: 401 in pre+post dataset.


Start FastICA, target dimension: 50.


2024-09-15 06:57:42,330 - Solitary_HDBSCAN - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Construct logger for Solitary_HDBSCAN succeeded, current working directory: /Users/minhthienlongvo/research/MetaLog, logs will be written in /Users/minhthienlongvo/research/MetaLog/logs
2024-09-15 06:57:42,331 - Prob_Label - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Construct logger for Probabilistic labeling succeeded, current working directory: /Users/minhthienlongvo/research/MetaLog, logs will be written in /Users/minhthienlongvo/research/MetaLog/logs
2024-09-15 06:57:42,332 - Prob_Label - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Found previous labeled file, will load and continue to accelerate the process.
2024-09-15 06:57:42,333 - Prob_Label - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Start load previous clustered results from /Users/minhthienlongvo/research/MetaLog/outputs/results/MetaLog/BGL_IBM/prob_label_res/mcs-100_ms-100
2024-09-15 06:57:42,348 - Vocab - SESSIO

Finished at 1.08.


## 6.2. Import HDFS dataset

In [8]:
dataset = "HDFS"
# Mark results saving directories.
save_dir = os.path.join(PROJECT_ROOT, "outputs")
prob_label_res_file_HDFS = os.path.join(
    save_dir,
    "results/MetaLog/"
    + dataset
    + "_"
    + parser
    + "/prob_label_res/mcs-"
    + str(min_cluster_size)
    + "_ms-"
    + str(min_samples),
)
rand_state_HDFS = os.path.join(
    save_dir,
    "results/MetaLog/" + dataset + "_" + parser + "/prob_label_res/random_state",
)

# Training, Validating and Testing instances.
template_encoder_HDFS = (
    Template_TF_IDF_without_clean(word2vec_file)
    if dataset == "NC"
    else Simple_template_TF_IDF(word2vec_file)
)
processor_HDFS = Preprocessor()
train_HDFS, _, _ = processor_HDFS.process(
    dataset=dataset,
    parsing=parser,
    cut_func=cut_by(0.3, 0.),
    template_encoding=template_encoder_HDFS.present,
)

# Log sequence representation.
sequential_encoder_HDFS = Sequential_TF(processor_HDFS.embedding)
train_reprs_HDFS = sequential_encoder_HDFS.present(train_HDFS)
for index, inst in enumerate(train_HDFS):
    inst.repr = train_reprs_HDFS[index]

# Dimension reduction if specified.
transformer_HDFS = None
if reduce_dimension != -1:
    start_time = time.time()
    print(f"Start FastICA, target dimension: {reduce_dimension}.")
    transformer_HDFS = FastICA(n_components=reduce_dimension)
    train_reprs_HDFS = transformer_HDFS.fit_transform(train_reprs_HDFS)
    for idx, inst in enumerate(train_HDFS):
        inst.repr = train_reprs_HDFS[idx]
    print(f"Finished at {round(time.time() - start_time, 2)}.")

labeled_train_HDFS = train_HDFS

2024-09-15 06:57:42,363 - Statistics_Template_Encoder - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Loading word2vec dict from glove.42B.300d.txt.
100%|██████████| 1917494/1917494 [00:56<00:00, 33911.56it/s]
2024-09-15 06:58:47,051 - Statistics_Template_Encoder - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Total 1917494 words in glove.42B.300d.txt dict.
2024-09-15 06:58:47,067 - HDFSLoader - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Construct self.logger success, current working directory: /Users/minhthienlongvo/research/MetaLog, logs will be written in /Users/minhthienlongvo/research/MetaLog/logs
2024-09-15 06:58:47,073 - HDFSLoader - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Start load from previous extraction. File path /Users/minhthienlongvo/research/MetaLog/datasets/HDFS/raw_log_seqs.txt
100%|██████████| 575061/575061 [00:02<00:00, 244238.29it/s]
2024-09-15 06:58:49,558 - HDFSLoader - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Extraction finished succ

* ipandport block* * * block* ipandport * ipandport ipandport ipandport ipandport ipandport ipandport block* ipandport ipandport ipandport ipandport block* ipandport ipandport ipandport ipandport * * * block* ipandport * ipandport * ipandport * ipandport ipandport ipandport * ipandport * ipandport ipandport block* ipandport block* ipandport ipandport * * * * ipandport ipandport * ipandport ipandport block* * * 

100%|██████████| 575061/575061 [00:02<00:00, 275987.76it/s]
2024-09-15 06:58:57,114 - Preprocessor - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Train: 165180 Normal, 7338 Anomalous instances.
2024-09-15 06:58:57,114 - Preprocessor - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Test: 393043 Normal, 9500 Anomalous instances.
2024-09-15 06:58:58,359 - Preprocessor - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Update train instances' event-idx mapping.
2024-09-15 06:58:58,695 - Preprocessor - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Embed size: 41 in pre dataset.
2024-09-15 06:58:58,696 - Preprocessor - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Update test instances' event-idx mapping.
2024-09-15 06:58:59,095 - Preprocessor - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Embed size: 46 in pre+post dataset.


Start FastICA, target dimension: 50.
Finished at 18.7.




## 6.3. Aggregate vocab and label2id

In [9]:
vocab = Vocab()
new_embedding = {}
for key in processor_BGL.embedding.keys():
    new_embedding[key] = processor_BGL.embedding[key]
for key in processor_HDFS.embedding.keys():
    new_embedding[key + 432] = processor_HDFS.embedding[key]
# Load Embeddings
vocab_HDFS = Vocab()
vocab_HDFS.load_from_dict(processor_HDFS.embedding)
vocab.load_from_dict(new_embedding)

metalog = MetaLog(
    vocab=vocab,
    num_layer=num_layer,
    hidden_size=lstm_hiddens,
    drop_out=drop_out,
    label2id=processor_BGL.label2id,
)

2024-09-15 06:59:20,367 - Vocab - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Total words: 50
2024-09-15 06:59:20,368 - Vocab - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: The dim of pretrained embeddings: 300
2024-09-15 06:59:20,376 - Vocab - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Total words: 482
2024-09-15 06:59:20,377 - Vocab - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: The dim of pretrained embeddings: 300
2024-09-15 06:59:20,426 - AttGRU - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: ==== Model Parameters ====
2024-09-15 06:59:20,427 - AttGRU - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Input Dimension: 300
2024-09-15 06:59:20,427 - AttGRU - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Hidden Size: 128
2024-09-15 06:59:20,427 - AttGRU - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Num Layers: 4
2024-09-15 06:59:20,428 - AttGRU - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Dropout: 0.5


2 256
Orthogonal pretrainer loss: 7.80e-06
2 256
Orthogonal pretrainer loss: 1.03e-05


# 7. Training

In [10]:
# Meta-learning
log = "layer={}_hidden={}_epoch={}".format(num_layer, lstm_hiddens, epochs)
best_model_file = os.path.join(output_model_dir, log + "_best.pt")
last_model_file = os.path.join(output_model_dir, log + "_last.pt")

if not os.path.exists(output_model_dir):
    os.makedirs(output_model_dir)
if mode == "train":
    # Train
    optimizer = Optimizer(
        filter(lambda p: p.requires_grad, metalog.model.parameters()), lr=gamma
    )
    global_step = 0
    best_f1_score = 0
    for epoch in range(epochs):
        metalog.model.train()
        metalog.bk_model.train()
        start = time.strftime("%H:%M:%S")
        logger.info(
            f"Starting epoch: {epoch} | phase: train | start time: {start} | learning rate: {optimizer.lr}."
        )

        batch_num = int(np.ceil(len(labeled_train_HDFS) / float(batch_size)))
        batch_iter = 0
        batch_num_test = int(np.ceil(len(labeled_train_BGL) / float(batch_size)))
        batch_iter_test = 0
        total_bn = max(batch_num, batch_num_test)
        meta_train_loader = data_iter(labeled_train_HDFS, batch_size, True)
        meta_test_loader = data_iter(labeled_train_BGL, batch_size, True)

        for i in range(total_bn):
            optimizer.zero_grad()
            
            # Meta-train on HDFS and BGL
            meta_train_batch = meta_train_loader.__next__()
            meta_test_batch = meta_test_loader.__next__()
            tinst_tr = generate_tinsts_binary_label(meta_train_batch, vocab_HDFS)
            if torch.cuda.is_available():
                tinst_tr.to_cuda(DEVICE)
            elif hasattr(torch.mps, "is_available") and torch.mps.is_available():
                tinst_tr.to_mps(DEVICE)
            loss = metalog.forward(tinst_tr.inputs, tinst_tr.targets)
            loss_value = loss.data.cpu().numpy()
            loss.backward(retain_graph=True)
            batch_iter += 1
            if torch.cuda.is_available():
                metalog.bk_model = (
                    get_updated_network(metalog.model, metalog.bk_model, alpha)
                    .train()
                    .cuda()
                )
            elif hasattr(torch.mps, "is_available") and torch.mps.is_available():
                metalog.bk_model = (
                    get_updated_network(metalog.model, metalog.bk_model, alpha)
                    .train()
                    .to(DEVICE)
                )
            else:
                metalog.bk_model = get_updated_network(
                    metalog.model, metalog.bk_model, alpha
                ).train()
            
            # Meta-test on BGL
            tinst_test = generate_tinsts_binary_label(meta_test_batch, vocab_BGL)
            if torch.cuda.is_available():
                tinst_test.to_cuda(DEVICE)
            elif hasattr(torch.mps, "is_available") and torch.mps.is_available():
                tinst_test.to_mps(DEVICE)
            loss_te = beta * metalog.bk_forward(
                tinst_test.inputs, tinst_test.targets
            )
            loss_value_te = loss_te.data.cpu().numpy() / beta
            loss_te.backward()
            batch_iter_test += 1
            
            # Aggregate the gradients and update the model.
            optimizer.step()
            global_step += 1
            if global_step % 500 == 0:
                logger.info(
                    f"Step: {global_step} | Epoch: {epoch} | Meta-train loss: {loss_value} | Meta-test loss: {loss_value_te}."
                )
            if batch_iter == batch_num:
                meta_train_loader = data_iter(labeled_train_HDFS, batch_size, True)
                batch_iter = 0
            if batch_iter_test == batch_num_test:
                meta_test_loader = data_iter(labeled_train_BGL, batch_size, True)
                batch_iter_test = 0

        if train_BGL:
            metalog.evaluate("Train", train_BGL + train_HDFS)
            
        if test_BGL:
            _, _, f1_score = metalog.evaluate("Test", test_BGL)

            if f1_score > best_f1_score:
                logger.info(
                    f"Exceed best F1 score: history = {best_f1_score}, current = {f1_score}."
                )
                torch.save(metalog.model.state_dict(), best_model_file)
                best_f1_score = f1_score

        logger.info(f"Training epoch {epoch} finished.")
        torch.save(metalog.model.state_dict(), last_model_file)

2024-09-15 06:59:21,163 - MetaLog - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Starting epoch: 0 | phase: train | start time: 06:59:21 | learning rate: [0.002].
  param_grad = param.grad
2024-09-15 07:11:12,692 - MetaLog - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Step: 500 | Epoch: 0 | Meta-train loss: 0.05241364985704422 | Meta-test loss: 0.38199684023857117.
2024-09-15 07:23:08,205 - MetaLog - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Step: 1000 | Epoch: 0 | Meta-train loss: 0.06264618784189224 | Meta-test loss: 0.4360302984714508.
2024-09-15 07:35:02,926 - MetaLog - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Step: 1500 | Epoch: 0 | Meta-train loss: 0.004530108068138361 | Meta-test loss: 0.29435890913009644.
2024-09-15 07:40:20,678 - MetaLog - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Start evaluating Train by threshold 0.5
2024-09-15 07:41:44,468 - MetaLog - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Train: F1 score = 95.10587934233254 | Preci

# 8. Evaluate

In [11]:
if os.path.exists(last_model_file):
    logger.info("=== Final Model ===")
    metalog.model.load_state_dict(torch.load(last_model_file))
    metalog.evaluate("Last model on Test BGL", test_BGL)
if os.path.exists(best_model_file):
    logger.info("=== Best Model ===")
    metalog.model.load_state_dict(torch.load(best_model_file))
    metalog.evaluate("Best model on Test BGL", test_BGL)
logger.info("All Finished!")

2024-09-15 14:12:40,558 - MetaLog - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: === Final Model ===
  metalog.model.load_state_dict(torch.load(last_model_file))
2024-09-15 14:12:40,600 - MetaLog - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Start evaluating Last model on Test BGL by threshold 0.5
2024-09-15 14:13:20,535 - MetaLog - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Last model on Test BGL: F1 score = 68.6776113059186 | Precision = 53.519231565437 | Recall = 95.81572900452153
2024-09-15 14:13:20,538 - MetaLog - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: === Best Model ===
  metalog.model.load_state_dict(torch.load(best_model_file))
2024-09-15 14:13:20,562 - MetaLog - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Start evaluating Best model on Test BGL by threshold 0.5
2024-09-15 14:14:00,294 - MetaLog - SESSION_f56fe5747d539d46f028a40c6b1e8b17 - INFO: Best model on Test BGL: F1 score = 70.75422733357009 | Precision = 56.023966084097594 | Recall = 95.9936

# 9. Export to graph

In [None]:
import re

import matplotlib.pyplot as plt

STATISTICS_TEMPLATE_LOG_PATH = "logs/Statistics_Template.log"
METALOG_LOG_PATH = "logs/MetaLog.log"

with open(STATISTICS_TEMPLATE_LOG_PATH, "r") as file:
    lines = file.readlines()

    for line in lines:
        regex = rf"^.+ - Statistics_Template_Encoder - {SESSION} - INFO: Loading word2vec dict from (.+)\.$"
        match = re.search(regex, line)

        if match is not None:
            word2vec_file = match.group().split()[-1]
            break

TITLE = f"BILATERAL GENERALIZATION TRANSFERRING HDFS TO BGL\n(using {word2vec_file})\n"

train_f1_scores = []
test_f1_scores = []
meta_train_losses = []
meta_test_losses = []
with open(METALOG_LOG_PATH, "r") as file:
    lines = file.readlines()

    for line in lines:
        regex = rf"^.+ - MetaLog - {SESSION} - INFO: Train: F1 score = (.+) \| Precision = (.+) \| Recall = (.+)$"
        match = re.search(regex, line)

        if match is not None:
            params = match.group().split("|")

            f1_score = float(params[0].split()[-1])
            train_f1_scores.append(f1_score)

    for line in lines:
        regex = rf"^.+ - MetaLog - {SESSION} - INFO: Test: F1 score = (.+) \| Precision = (.+) \| Recall = (.+)$"
        match = re.search(regex, line)

        if match is not None:
            params = match.group().split("|")

            f1_score = float(params[0].split()[-1])
            test_f1_scores.append(f1_score)

    for line in lines:
        regex = rf"^.* - MetaLog - {SESSION} - INFO: Step: (.+) \| Epoch: (.+) \| Meta-train loss: (.+) \| Meta-test loss: (.+)\.$"
        match = re.search(regex, line)

        if match is not None:
            params = match.group().split("|")

            meta_train_loss = float(params[2].split()[-1].split()[-1])
            meta_train_losses.append(meta_train_loss)

            meta_test_loss = float(params[3].split()[-1].split()[-1][:-1])
            meta_test_losses.append(meta_test_loss)


fig, axs = plt.subplots(2, 1, figsize=(16, 8))
num_epoches = [i for i in range(len(train_f1_scores))]
num_validations = [i * 500 for i in range(len(meta_train_losses))]

axs[0].set_ylim(0, 110)
axs[0].plot(num_epoches, train_f1_scores, color="tab:blue")
axs[0].plot(num_epoches, test_f1_scores, color="tab:orange")
axs[0].legend(["Train", "Test"])
axs[0].set_xlabel("Epoch")
axs[0].set_ylabel("F1 Score")

max_test_f1_score = max(test_f1_scores)
best_test_f1_score = 0

for i in range(len(num_epoches)):
    if test_f1_scores[i] == max_test_f1_score:
        best_test_f1_score = i
    axs[0].plot(num_epoches[i], train_f1_scores[i], "o", color="tab:blue", zorder=10)
    axs[0].text(
        num_epoches[i],
        train_f1_scores[i] + 5,
        round(train_f1_scores[i], 2),
        color="red" if train_f1_scores[i] == train_f1_scores else "black",
        ha="center",
    )

    axs[0].plot(num_epoches[i], test_f1_scores[i], "o", color="tab:orange", zorder=10)
    axs[0].text(
        num_epoches[i],
        test_f1_scores[i] - 10,
        round(test_f1_scores[i], 2),
        ha="center",
    )

axs[1].set_ylim(0, 1)
axs[1].plot(num_validations, meta_train_losses, color="tab:blue")
axs[1].plot(num_validations, meta_test_losses, color="tab:orange")
axs[1].legend(["Meta-train loss", "Meta-test loss"])
axs[1].set_xlabel("Step")
axs[1].set_ylabel("Loss")

LOSS_EPS = 0.05
min_test_loss = min(meta_test_losses)
for i in range(0, len(num_validations), 2):
    axs[1].plot(
        num_validations[i], meta_train_losses[i], "o", color="tab:blue", zorder=10
    )
    axs[1].text(
        num_validations[i],
        meta_train_losses[i] + LOSS_EPS,
        round(meta_train_losses[i], 4),
        ha="center",
    )

    axs[1].plot(
        num_validations[i], meta_test_losses[i], "o", color="tab:orange", zorder=10
    )
    axs[1].text(
        num_validations[i],
        meta_test_losses[i] + LOSS_EPS,
        round(meta_test_losses[i], 4),
        color="red" if meta_test_losses[i] == min_test_loss else "black",
        ha="center",
    )

fig.suptitle(
    f"{TITLE}\n" + f"Best model F1 Score = {test_f1_scores[best_test_f1_score]}\n"
)

fig.plot()
fig.savefig(f"visualization/graphs/{word2vec_file}-{SESSION}.png")
