# Setup

In [1]:
cd /home/jovyan/assets

/home/jovyan/assets


In [2]:
# -- Base -- #
import os
import random
import joblib
import logging
import time
import re
from copy import deepcopy
from dataclasses import dataclass
import sys
import yaml
import csv
from typing import (
    List,
    Dict,
    Tuple
)
from yaspin import yaspin
from functools import partial

# -- Tokenizer -- #
import tokenizers
from tokenizers.models import WordPiece

from tokenizers import (
    Tokenizer,
    normalizers
)

from tokenizers.normalizers import (
    Lowercase,
    NFD,
    StripAccents
)

from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders

# -- PreTrained BERT -- #
from transformers import create_optimizer
from transformers import PreTrainedTokenizerFast
from datasets import Dataset

# -- Metrics -- #
import numpy as np
import pandas as pd
import sqlite3 as sql

# -- Tensorflow -- #
import tensorflow as tf

# -- Misc Models -- #
import drain3
from gensim.models.phrases import Phrases
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# -- Custom -- #
from libs.transformers.src.transformers.models.bert.modeling_tf_bert import TFBertForPreTraining
from libs.transformers.src.transformers.models.bert.configuration_bert import BertConfig
from libs.transformers.src.transformers.modeling_tf_utils import shape_list
from libs.transformers.src.transformers.data.data_tf_collator import TFDataCollatorForLanguageModeling

tf.__version__

2021-07-16 12:11:35.341827: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


'2.4.1'

In [3]:
def convert_sqlite_to_csv(inputFolder, ext, tableName):
    """ inputFolder - Folder where sqlite files are located. 
        ext - Extension of your sqlite file (eg. db, sqlite, sqlite3 etc.)
        tableName - table name from which you want to select the data.
    """
    csvWriter = csv.writer(open(inputFolder+'/output.csv', 'w', newline=''))
    for file1 in os.listdir(inputFolder):
        if file1.endswith('.'+ext):
            conn = sql.connect(inputFolder+'/'+file1)
            cursor = conn.cursor()
            cursor.execute("SELECT * FROM "+tableName)
            rows = cursor.fetchall()
            for row in rows:
                csvWriter.writerow(row)
            continue
        else:
            continue

# Extensions

In [4]:
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

2021-07-16 12:11:37.314334: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-07-16 12:11:37.315617: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-07-16 12:11:37.366428: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-07-16 12:11:37.366871: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:0a:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.635GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2021-07-16 12:11:37.366889: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-07-16 12:11:37.378942: I tensorflow/stream_executor/platform/defau

In [5]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


## Environmental Variables

In [6]:
SOURCE = '/home/' + os.environ['USER']
CONTAINER = 'core.soaesb'

## Logging

In [7]:
logging.basicConfig(
    format='%(asctime)s %(levelname)s | %(message)s',
    level=logging.INFO,
    stream=sys.stdout
)

logger = logging.getLogger(__name__)

# Define Dataset

## Define Database Functions

In [8]:
def database_builder(path: str) -> pd.DataFrame():
    logger.info('Building DataFrame ...')
    (_, _, files) = next(os.walk(path))
    sql_query = 'SELECT * FROM logs'
    data = []
    for f in files:
        if '.db' in f:
            conn = create_connection(path + f)
            d = pd.read_sql_query(sql_query, conn)
            data.append(d)
    logger.info('...complete!')
    return pd.concat(data)


def create_connection(path: str) -> sql.Connection:
    """
    Creates a database connection
    :param path: str
        path to database object
    :return sql.Connection
        a connection to the database
    """
    try:
        conn = sql.connect(path)
        logger.info('Connected to database ' + path)
        return conn
    except sql.Error as e:
        logger.warning(e)

## Define Dataset Main

In [9]:
dataset = database_builder(SOURCE + '/data/')
container_dataset = dataset[dataset['container_name'] == CONTAINER]

2021-07-16 12:11:37,850 INFO | Building DataFrame ...
2021-07-16 12:11:37,855 INFO | Connected to database /home/jovyan/data/elastic_logs.db
2021-07-16 12:11:38,468 INFO | ...complete!


# W2V Pipeline

## Pipeline Objects

### Configuration

In [10]:
def set_attributes(self, config: dict):
    try:
        config = config[self.__class__.__name__]
    except Exception as e:
        logger.warning(e)
        logger.warning('No configuration found for ' +
                       self.__class__.__name__)

    for attr in config.keys():
        setattr(self, attr, config[attr])


@dataclass
class PreprocessingGlobalConfig:
    embed_size: int = 512
    max_vocab_size: int = 2000
    buffer_size: int = 10000
    global_training: bool = True
    path: str = '/results/'

    def load(self, config):
        set_attributes(self, config)


@dataclass
class PhraserModelConfig:
    min_count: int = 5
    threshold: float = 7
    load_model: bool = True
    save_model: bool = False
    training: bool = True
    model_name: str = 'phrase_model.joblib'

    def load(self, config):
        set_attributes(self, config)


@dataclass
class TextClusteringConfig:
    load_model: bool = True
    save_model: bool = False
    training: bool = True
    model_name: str = 'template_miner.joblib'

    def load(self, config):
        set_attributes(self, config)


class PreprocessingPipelineConfig:
    def __init__(self):
        self.PreprocessingGlobalConfig = PreprocessingGlobalConfig()
        self.PhraserModelConfig = PhraserModelConfig()
        self.TextClusteringConfig = TextClusteringConfig()

    def load(self, path):
        try:
            with open(path) as f:
                preprocessing_config = yaml.load(f, Loader=yaml.FullLoader)
        except FileNotFoundError as e:
            logger.warning(e)
            return None

        self.PreprocessingGlobalConfig.load(preprocessing_config)
        self.PhraserModelConfig.load(preprocessing_config)
        self.TextClusteringConfig.load(preprocessing_config)

### Tokenizer

In [11]:
class PrimeTokenizer:
    def __init__(self, max_seq_length: int):
        self.prime_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
        self.prime_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
        self.prime_tokenizer.pre_tokenizer = Whitespace()
        self.prime_tokenizer.decoder = decoders.WordPiece()
        self.prime_tokenizer.enable_padding(length=max_seq_length)
        self.prime_tokenizer.enable_truncation(max_seq_length)

        self.prime_tokenizer.post_processor = TemplateProcessing(
            single="[CLS] $A [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[
                ("[CLS]", 1),
                ("[SEP]", 2),
            ],
        )

        self.trainer = WordPieceTrainer(
            vocab_size=153411,
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
        )

    def text_to_sequence(self, input_) -> List[tokenizers.Encoding]:
        if type(input_) is list:
            return self.prime_tokenizer.encode_batch(input_)
        return self.prime_tokenizer.encode(input_)

    def sequence_to_text(self, input_) -> List[str]:
        if type(input_) is list:
            return self.prime_tokenizer.decode_batch(batch)
        return self.prime_tokenizer.decode(input_)

    def train(self, data):
        log_itr = iter(data)
        self.prime_tokenizer.train_from_iterator(log_itr, self.trainer)
        self.save()

    def get_tokenizer(self) -> Tokenizer:
        return self.prime_tokenizer

    def get_vocab(self) -> Dict[str, int]:
        return self.prime_tokenizer.get_vocab()
    
    def get_vocab_size(self) -> int:
        return self.prime_tokenizer.get_vocab_size()
    
    def save(self):
        self.prime_tokenizer.save(SOURCE + "/results/prime_tokenizer.json")
        
    def load(self):
        self.prime_tokenizer = Tokenizer.from_file(SOURCE + "/results/prime_tokenizer.json")

### Generic Save Model

In [12]:
def save_model(model, path):
#     if not os.path.exists(path):
#         return

    if os.path.isfile(path):
        os.remove(path)
#     elif os.path.isdir(path):
#         shutil.rmtree(path)
#         return

    joblib.dump(model, path)

### PhraseCaptureLayer

In [13]:
class PhraserModel:

    def __init__(self,
                 config: PhraserModelConfig,
                 global_config: PreprocessingGlobalConfig):

        super(PhraserModel, self).__init__()
        self.min_count = config.min_count
        self.threshold = config.threshold
        self.load_model = config.load_model
        self.save_model = config.save_model
        self.path = global_config.path
        self.model_name = config.model_name
        self.training = config.training

        if self.load_model:
            self.phrase_model = joblib.load(SOURCE +
                                            self.path +
                                            self.model_name)
        else:
            self.phrase_model = Phrases(min_count=self.min_count,
                                        threshold=self.threshold)

    def __call__(self, corpus: pd.DataFrame, training=None) -> list:
        if training is None:
            training = self.training
            
        def reorganize_return(corpus_with_phrases):
            log_list = []
            for tokenized_log in corpus_with_phrases:
                log_list.append(' '.join(tokenized_log))
            return log_list

        split_corpus = [log.split(' ') for log in corpus['log']]

        corpus_with_phrases = None
        if not training:
            frozen_model = self.phrase_model.freeze()
            corpus_with_phrases = self.phrase_model.__getitem__(split_corpus)
        else:
            self.phrase_model.add_vocab(split_corpus)

            if self.save_model:
                save_model(self.phrase_model, SOURCE + self.path + self.model_name)

            corpus_with_phrases = self.phrase_model.__getitem__(split_corpus)
            
        return reorganize_return(corpus_with_phrases)

### TextClusteringLayer

In [14]:
class TextClustering:

    def __init__(self,
                 config: TextClusteringConfig,
                 global_config: PreprocessingGlobalConfig):

        super(TextClustering, self).__init__()
        self.load_model = config.load_model
        self.save_model = config.save_model
        self.path = global_config.path
        self.model_name = config.model_name
        self.training = config.training

        if self.load_model is True:
            self.template_miner = joblib.load(SOURCE +
                                              self.path +
                                              self.model_name)
        else:
            self.template_miner = drain3.TemplateMiner()

    def __call__(self, corpus: list, training=None) -> list:
        if training is None:
            training = self.training
            
        if training:
            for log in corpus:
                self.template_miner.add_log_message(log)
            if self.save_model:
                save_model(self.template_miner,
                           SOURCE + self.path + self.model_name)

            for idx, log in enumerate(corpus):
                template = self.template_miner.match(log).get_template()
                corpus[idx] = template

            return [re.sub(pattern=r' +',
                           repl=' ',
                           string=cluster) for cluster in corpus]
        else:
#             log_list = self.get_unique_templates()
#             print(f'Length of the log list: {len(log_list)}')
#             return log_list
            log_list = []
            log_set = set()
            for log in corpus:
                match_cluster = self.template_miner.match(log)
                if match_cluster is None:
                    match_cluster = self.template_miner.add_log_message(log)['template_mined']
                    log_set.add(match_cluster)
                else:
                    log_set.add(match_cluster.get_template())
        
#             l = [re.sub(pattern=r' +',
#                            repl=' ',
#                            string=cluster) for cluster in log_list]
            return list(log_set)
        
    def get_unique_templates(self) -> list:
        template_list = []
        for cluster in self.template_miner.drain.clusters:
            template_list.append(cluster.get_template())
        return [re.sub(pattern=r' +',
                       repl=' ',
                       string=cluster) for cluster in template_list]

### Preprocessing Pipeline

In [15]:
def process_all_batches(n_iter, log_labels, batch_size):
    batches = []

    for idx in range(n_iter + 1):
        log_batch, labels = process_batch(dataset, idx, log_labels, batch_size)

        batches.append((log_batch, labels))

    return batches

def process_batch(dataset: pd.DataFrame,
                  idx: int,
                  labels: dict,
                  batch_size: int) -> tuple:
    start_window = idx * batch_size
    end_window = (idx + 1) * batch_size
    batched_data = dataset.iloc[start_window:end_window]
    encoded_batch = prime_tokenizer.text_to_sequence(batched_data['log'].to_list())
    id_batch = [log.ids for log in encoded_batch]
#     y_batch = labels[batched_data['label']]
    y_batch = [labels[idx] for idx in batched_data['label']]

    tf_idf = tf.convert_to_tensor(id_batch, dtype=tf.float32)
    y_idf  = tf.convert_to_tensor(y_batch, dtype=tf.float32)
    
    return tf_idf, y_idf

## Unsupervised Learning Pipeline

In [40]:
EncodedSeq = List[int]

def normalize_logs(logs: pd.DataFrame) -> pd.DataFrame:
    # remove timestamps and double spaces
    regexp = re.compile(
        r"""
        (?:               # Match all enclosed
        \d{4}-\d{2}-\d{2} # YYYY-MM-DD
        [\sT]             # Accept either a space or T
        \d{2}:\d{2}:\d{2} # HH:MM:SS
        ([.,]\d{3}|\s)    # Accept either a space or milliseconds
        )                 # End timestamp match
        | (?:\s{2,})      # Remove double spaces   
        | [^a-zA-Z\d:]    # Clean non-alphanumeric characters
        """, re.X)        # re.X enables comments and whitespace

    c_logs = deepcopy(logs)
    c_logs.loc[:, 'log'].replace(
        to_replace=regexp, 
        value=' ', 
        regex=True,
        inplace=True
    )

    return c_logs


def extract_unique_labels(logs: pd.DataFrame) -> dict:
    # -- Labels -- #
    label_unique = logs['label'].unique()
    binary_labels = LabelEncoder().fit_transform(label_unique)

    log_labels = {}
    # TODO: This seems a bit messy, could it be cleaned up? 
    for idx, label in enumerate(label_unique):
        log_labels.update({
            label: binary_labels[idx]
        })
    return log_labels

def create_sentence_pairing(examples):
    first_seqs = []
    nsp_labels = []

    examples["log"] = [
        line for line in examples["log"] if len(line) > 0 and not line.isspace()
    ]
    
    log_list = list(examples['log'])
    for idx in range(len(log_list)):
        first_value = log_list[idx]
        if random.random() > 0.5:
            # Pair with proper following log sequence
            second_value = log_list[(idx + 1) % len(log_list)]

            # IsNext Label
            nsp_labels.append(0)
        else:
            # Pair with random log
            rand_idx = random.randint(0, len(log_list) - 1)
            second_value = log_list[rand_idx]

            # IsNotNext Label
            nsp_labels.append(1)

        first_seqs.append((first_value, second_value))

    return {"log": first_seqs, "next_sentence_label": nsp_labels}


def generate_test_train_split(tokenized_datasets, test_size=.30):
    # Train - Test Split
    train_indices, test_indices = train_test_split(
        list(range(len(tokenized_datasets))), test_size=test_size
    )

    test_dataset = tokenized_datasets.select(test_indices)
    train_dataset = tokenized_datasets.select(train_indices)

    return train_dataset, test_dataset


class UnsupervisedLearningPipeline:
    def __init__(
        self, 
        config: PreprocessingPipelineConfig, 
        epochs: int = 3, 
        batch_size: int = 50, 
        seq_length: int = 200
    ) -> None:

        # Logs
        self.normalized_logs = None
        self.log_labels = None
        self.logs_with_phrases = list()
        
        # Tokenizers
        self.bert_tokenizer = PrimeTokenizer(seq_length)
        self.fast_tokenizer = None
        
        # Dataset for Training/Evaluation
        self.data_collator = None
        self.tf_train_dataset = None
        self.tf_test_dataset = None
        self.token_logs = None
        
        # BERT Model
        self.bert_config = None
        self.BERT = None
        self.optimizer = None
        self.lr_schedule = None
        self.loss_fn = None
        self.train_acc_metric = None
        self.val_acc_metric = None
        
        # Models
        self.pm = PhraserModel(config.PhraserModelConfig, 
                               config.PreprocessingGlobalConfig)
        self.tc = TextClustering(config.TextClusteringConfig,
                                 config.PreprocessingGlobalConfig)
        
        # Hyperparameters
        self.max_seq_len = seq_length
        self.n_logs = 0
        self.n_iter = 0
        self.epochs = epochs
        self.batch_size = batch_size
        self.batches_per_epoch = None
        self.logs_as_templates = None
        

    def initialize_fast_tokenizer(self):
        tokenizer_obj = self.bert_tokenizer.get_tokenizer()
        fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer_obj)
        fast_tokenizer.model_max_length = self.max_seq_len
        fast_tokenizer.unk_token = "[UNK]"
        fast_tokenizer.sep_token = "[SEP]"
        fast_tokenizer.pad_token = "[PAD]"
        fast_tokenizer.cls_token = "[CLS]"
        fast_tokenizer.mask_token = "[MASK]"
        self.fast_tokenizer = fast_tokenizer
        return fast_tokenizer
    
    
    @staticmethod
    def generate_data_signatures(train_dataset, test_dataset):
        # Train Signatures
        train_signature = {
            feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
            for feature in train_dataset.features
            if feature != "special_tokens_mask" and feature != "next_sentence_label"
        }
        train_signature["next_sentence_label"] = tf.TensorSpec(shape=(), dtype=tf.int64)
        train_signature["labels"] = train_signature["input_ids"]
        train_signature = (train_signature, {"labels": train_signature["labels"], "next_sentence_label": train_signature["next_sentence_label"]})
        
        # Test Signatures
        test_signature = {
            feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
            for feature in test_dataset.features
            if feature != "special_tokens_mask" and feature != "next_sentence_label"
        }
        test_signature["next_sentence_label"] = tf.TensorSpec(shape=(), dtype=tf.int64)
        test_signature["labels"] = test_signature["input_ids"]
        test_signature = (test_signature, {"labels": test_signature["labels"], "next_sentence_label": test_signature["next_sentence_label"]})
        
        return train_signature, test_signature
    

    def tokenize_function(self, examples):
        # Remove empty lines
        return self.fast_tokenizer(
            examples["log"],
            padding=False,
            truncation=True,
            max_length=self.max_seq_len,
            return_special_tokens_mask=True,
        )


    def get_pre_training_data(self):
        fast_tokenizer = self.initialize_fast_tokenizer()
        
        dt = self.normalized_logs.drop(["label", "container_name", "timestamp"], axis=1)
        data = Dataset.from_pandas(dt)
        
        tokenized_datasets = data.map(
            create_sentence_pairing,
            batched=True,
            num_proc=1,
            remove_columns=["log"],
            desc="Creating sentence pairings for NSP Head"
        )
        tokenized_datasets = tokenized_datasets.map(
            self.tokenize_function,
            batched=True,
            num_proc=1,
            remove_columns=["log"],
            desc="Running tokenizer on dataset line_by_line"
        )
        
        train_dataset, test_dataset = generate_test_train_split(tokenized_datasets)
        train_signature, test_signature = self.generate_data_signatures(train_dataset, test_dataset)
        
        self.batches_per_epoch = len(train_dataset) // self.batch_size
        
        data_collator = TFDataCollatorForLanguageModeling(
            tokenizer=self.fast_tokenizer,
            padding_length=self.max_seq_len,
            batch_size=self.batch_size
        )
        
        tokenized_generator = partial(data_collator, train_dataset, fast_tokenizer)
        test_generator = partial(data_collator, test_dataset, fast_tokenizer)
        
        self.tf_train_dataset = (
            tf.data.Dataset.from_generator(tokenized_generator, output_signature=train_signature)
            .batch(batch_size=self.batch_size, drop_remainder=True)
            .repeat(int(5))
        )
        
        self.tf_test_dataset = (
            tf.data.Dataset.from_generator(test_generator, output_signature=test_signature)
            .batch(batch_size=self.batch_size, drop_remainder=True)
        )


    def train_bert_tokenizer(self, load_model=False):
        if load_model:
            self.bert_tokenizer.load()
        else:
            self.bert_tokenizer.train(self.logs_as_templates)
            
            
    def compute_loss(self, labels, logits):
        # make sure only labels that are not equal to -100
        # are taken into account as loss
        masked_lm_active_loss = tf.not_equal(tf.reshape(tensor=labels["labels"], shape=(-1,)), -100)
        masked_lm_reduced_logits = tf.boolean_mask(
            tensor=tf.reshape(tensor=logits[0], shape=(-1, shape_list(logits[0])[2])),
            mask=masked_lm_active_loss,
        )
        masked_lm_labels = tf.boolean_mask(
            tensor=tf.reshape(tensor=labels["labels"], shape=(-1,)), mask=masked_lm_active_loss
        )
        next_sentence_active_loss = tf.not_equal(tf.reshape(tensor=labels["next_sentence_label"], shape=(-1,)), -100)
        next_sentence_reduced_logits = tf.boolean_mask(
            tensor=tf.reshape(tensor=logits[1], shape=(-1, 2)), mask=next_sentence_active_loss
        )
        next_sentence_label = tf.boolean_mask(
            tensor=tf.reshape(tensor=labels["next_sentence_label"], shape=(-1,)), mask=next_sentence_active_loss
        )

        return (masked_lm_labels, masked_lm_reduced_logits), (next_sentence_label, next_sentence_reduced_logits)
    
    
    @tf.function
    def train_step(self, x, y):
        with tf.GradientTape() as tape:
            logits = self.BERT(x, training=True)
            mlm, nsp = self.compute_loss(y, (logits["prediction_logits"], logits["seq_relationship_logits"]))
            masked_lm_loss = self.loss_fn(mlm[0], mlm[1])
            next_sentence_loss = self.loss_fn(nsp[0], nsp[1])
            loss_value = masked_lm_loss + next_sentence_loss

        grads = tape.gradient(loss_value, self.BERT.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.BERT.trainable_weights))

        # Update training metric.
        self.train_acc_metric.update_state(mlm[0], mlm[1])
        self.train_acc_metric.update_state(nsp[0], nsp[1])

        return loss_value
    

    @tf.function
    def test_step(self, x, y):
        val_logits = self.BERT(x, training=False)
        mlm, nsp = self.compute_loss(y, (val_logits["prediction_logits"], val_logits["seq_relationship_logits"]))
        # Update val metrics
        self.val_acc_metric.update_state(mlm[0], mlm[1])
        self.val_acc_metric.update_state(nsp[0], nsp[1])
        
        
    def prepare_bert_model(self):
        self.bert_config = BertConfig(
            vocab_size=self.bert_tokenizer.get_vocab_size(),
            hidden_size=512,
            num_hidden_layers=8,
            num_attention_heads=8
        )

        self.optimizer, self.lr_schedule = create_optimizer(
            init_lr=1e-4,
            num_train_steps=int(5 * self.batches_per_epoch),
            num_warmup_steps=2,
            adam_beta1=0.9,
            adam_beta2=0.999,
            adam_epsilon=0.1,
            weight_decay_rate=0.01,
        )
        
        self.BERT = TFBertForPreTraining(self.bert_config)
        self.BERT.resize_token_embeddings(len(self.fast_tokenizer))
        self.loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

        # Prepare the metrics.
        self.train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
        self.val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
    

    def pre_train_bert(self):
        # Prepare BERT Model for Pre-Training
        self.prepare_bert_model()
        
        # Prefetch Data
        self.tf_train_dataset = self.tf_train_dataset.prefetch(tf.data.AUTOTUNE)
        self.tf_test_dataset = self.tf_test_dataset.prefetch(tf.data.AUTOTUNE)
        
        with yaspin(text="", color='blue') as sp:
            # Training BERT
            for epoch in range(self.epochs):
                print("\nStart of epoch %d" % (epoch,))
                start_time = time.time()

                # Iterate over the batches of the dataset.
                for step, (x_batch_train, y_batch_train) in enumerate(self.tf_train_dataset):
                    loss_value = self.train_step(x_batch_train, y_batch_train)

                    # Log every 200 batches.
                    if step % 200 == 0:
                        print(
                            "Training loss (for one batch) at step %d: %.4f"
                            % (step, float(loss_value))
                        )
                        print("Seen so far: %d samples" % ((step + 1) * self.batch_size))

                # Display metrics at the end of each epoch.
                train_acc = self.train_acc_metric.result()
                print("Training acc over epoch: %.4f" % (float(train_acc),))

                # Reset training metrics at the end of each epoch
                self.train_acc_metric.reset_states()

                # Run an evaluation loop at the end of each epoch.
                for x_batch_test, y_batch_test in self.tf_test_dataset:
                    self.test_step(x_batch_test, y_batch_test)
                eval_acc = self.val_acc_metric.result()
                self.val_acc_metric.reset_states()
                print("Testing acc: %.4f" % (float(eval_acc),))
                print("Time taken: %.2fs" % (time.time() - start_time))
            
            sp.text = ""
            sp.ok('✔ Finished Training Epochs')
            
    
    def run_bert(self, batch):
        return self.BERT(batch,
                         output_attentions=True)

    def fit(self, logs: pd.DataFrame):
        assert len(logs.index) > 0, 'process received an empty dataframe!'
        
        with yaspin(text="Normalizing Logs", color='green') as sp:
            self.normalized_logs = normalize_logs(logs)
            sp.text = ""
            sp.ok('✔ Completed log normalization')
            
            sp.text = "Extracting phrases"
            self.logs_with_phrases = self.pm(self.normalized_logs)
            sp.text = ""
            sp.ok('✔ Completed phrase extraction')
            
            sp.text = "Converting to log templates"
            self.logs_as_templates = np.array(self.tc(self.logs_with_phrases))
            sp.text = ""
            sp.ok('✔ Completed log template conversion')
        
            sp.text = "Extracting Unique Labels"
            self.log_labels = extract_unique_labels(self.normalized_logs)
            sp.text = ""
            sp.ok('✔ Completed extracting unique labels')

            sp.text = "Training Tokenizer"
            self.train_bert_tokenizer()
            sp.text = ""
            sp.ok('✔ Completed training of custom tokenizer')
                        
            sp.text = "Processing training dataset"
            self.get_pre_training_data()
            sp.text = ""
            sp.ok('✔ Completed processing training dataset')
            
            sp.text = "Pretraining BERT"
            self.pre_train_bert()
            sp.text = ""
            sp.ok('✔ Completed BERT pretraining')
            
        
    def transform(self, batch: pd.DataFrame):
        x = self.normalize_logs(batch)
        x = self.pm(x, False)
        x = list(self.tc(x, False))
        x = self.serve_batch(x)
        return self.run_bert(x)

## W2V Pipeline Main

In [41]:
config_path = SOURCE + '/assets/notebooks/PreprocessingConfig.yaml'
preprocessing_config = PreprocessingPipelineConfig()
preprocessing_config.load(config_path)

In [42]:
# -- Unsupervised Learning Pipeline -- #

'''
Input: pd.DataFrame with batch_size number of rows 
Seq: 
    Normalize 
    Phraser
    Clustering
    Extract Unique Layers
    BERT
Returns: transformers.TFBertForPreTrainingOutput
'''

# --- SUBWORD TOKENIZER --
w2vp = UnsupervisedLearningPipeline(preprocessing_config, epochs=5, batch_size=2, seq_length=300)
training_outputs = w2vp.fit(dataset[:1000])

2021-07-16 12:35:07,306 INFO | Starting Drain3 template miner
2021-07-16 12:35:07,307 INFO | Loading configuration from drain3.ini
[K[32m✔ Completed log normalization[0m 
2021-07-16 12:35:07,343 INFO | exporting phrases from Phrases<0 vocab, min_count=5, threshold=7, max_vocab_size=40000000>
2021-07-16 12:35:07,343 INFO | FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<0 phrases, min_count=5, threshold=7> from Phrases<0 vocab, min_count=5, threshold=7, max_vocab_size=40000000> in 0.00s', 'datetime': '2021-07-16T12:35:07.343740', 'gensim': '4.0.1', 'python': '3.8.8 | packaged by conda-forge | (default, Feb 20 2021, 16:22:27) \n[GCC 9.3.0]', 'platform': 'Linux-5.8.0-55-generic-x86_64-with-glibc2.10', 'event': 'created'}
[K[32m✔ Completed phrase extraction[0m 
[K[32m✔ Completed log template conversion[0m 
[K[32m✔ Completed extracting unique labels[0m 



[K[32m✔ Completed training of custom tokenizer[0m 


Creating sentence pairings for NSP Head:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset line_by_line:   0%|          | 0/1 [00:00<?, ?ba/s]

[K[32m✔ Completed processing training dataset[0m 
[34m⠋[0m [K
Start of epoch 0
[34m⠇[0m [KTraining loss (for one batch) at step 0: 8.6040
Seen so far: 2 samples
[34m⠸[0m [KTraining loss (for one batch) at step 200: 8.5414
Seen so far: 402 samples
[34m⠇[0m [KTraining loss (for one batch) at step 400: 8.0799
Seen so far: 802 samples
[34m⠸[0m [KTraining loss (for one batch) at step 600: 7.7189
Seen so far: 1202 samples
[34m⠇[0m [KTraining loss (for one batch) at step 800: 7.2637
Seen so far: 1602 samples
[34m⠸[0m [KTraining loss (for one batch) at step 1000: 8.0167
Seen so far: 2002 samples
[34m⠧[0m [KTraining loss (for one batch) at step 1200: 7.4470
Seen so far: 2402 samples
[34m⠹[0m [KTraining loss (for one batch) at step 1400: 7.0914
Seen so far: 2802 samples
[34m⠧[0m [KTraining loss (for one batch) at step 1600: 7.5767
Seen so far: 3202 samples
[34m⠹[0m [KTraining acc over epoch: 0.0574
[34m⠋[0m [KValidation acc: 0.0624
Time taken: 59.09s

Start

NameError: name 'outputs' is not defined

In [None]:
for i in w2vp.tf_train_dataset:
    print(i)
    break

In [None]:
%%capture
!pip install bertviz

In [None]:
from bertviz import head_view, model_view

In [None]:
test_data['log']

In [None]:
sentence = test_data.iloc[0]['log']
sentence

In [None]:
tokens = w2vp.bert_tokenizer.text_to_sequence(sentence).tokens
tokens

In [None]:
head_view(attention, tokens)

In [None]:
model_view(attention, tokens)

In [None]:
w2vp.BERT.summary()

In [None]:
copy_bert = deepcopy(w2vp.BERT)

In [None]:
copy_bert.get_output_at

In [None]:
w2vp.BERT.save(SOURCE + '/results/')

In [None]:
tf_data




