# Setup

In [None]:
# -- Base -- #
import os
import random
import joblib
import logging
import re
from copy import deepcopy
from dataclasses import dataclass
import sys
import yaml
import csv
from typing import (
    List,
    Dict
)
from yaspin import yaspin

# -- Tokenizer -- #
import tokenizers
from tokenizers.models import WordPiece

from tokenizers import (
    Tokenizer,
    normalizers
)

from tokenizers.normalizers import (
    Lowercase,
    NFD,
    StripAccents
)

from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders

# -- PreTrained BERT -- #
from transformers import PreTrainedTokenizerFast

# -- Metrics -- #
import numpy as np
import pandas as pd
import sqlite3 as sql

# -- Tensorflow -- #
import tensorflow as tf

# -- Misc Models -- #
import drain3
from gensim.models.phrases import Phrases
from sklearn.preprocessing import LabelEncoder

# -- Custom -- #
from libs.transformers.src.transformers.models.bert.modeling_tf_bert import TFBertForPreTraining
from libs.transformers.src.transformers.models.bert.configuration_bert import BertConfig
from libs.transformers.src.transformers.modeling_tf_utils import shape_list
from libs.transformers.src.transformers.data.data_tf_collator import TFDataCollatorForLanguageModeling

tf.__version__

In [None]:
# needed to access local files
cd ..

In [None]:
def convert_sqlite_to_csv(inputFolder, ext, tableName):
    """Dump one table from every SQLite file in a folder into a single CSV.

    Rows from each matching database are appended, in ``os.listdir`` order,
    to ``<inputFolder>/output.csv``. The CSV is overwritten on every call and
    no header row is written.

    :param inputFolder: folder where the sqlite files are located
    :param ext: extension of the sqlite files without the leading dot
        (eg. db, sqlite, sqlite3 etc.)
    :param tableName: table name from which the data is selected
    """
    suffix = '.' + ext
    # The context manager guarantees the CSV is flushed and closed even when
    # one of the databases cannot be read.
    with open(os.path.join(inputFolder, 'output.csv'), 'w', newline='') as csv_file:
        csvWriter = csv.writer(csv_file)
        for file1 in os.listdir(inputFolder):
            if not file1.endswith(suffix):
                continue
            conn = sql.connect(os.path.join(inputFolder, file1))
            try:
                # NOTE: SQLite cannot bind a table name as a parameter, so
                # tableName is interpolated; only pass trusted values here.
                cursor = conn.execute("SELECT * FROM " + tableName)
                # Stream rows straight from the cursor instead of fetchall().
                csvWriter.writerows(cursor)
            finally:
                conn.close()

# Extensions

In [None]:
# Turn on memory growth for every GPU TensorFlow reports.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

In [None]:
# Sanity check: report how many GPUs TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

## Environment Variables

In [None]:
# Root directory used to build every data/result path in this notebook.
# NOTE: os.environ['USER'] raises KeyError when USER is not set.
SOURCE = '/home/' + os.environ['USER']
# Value matched against the `container_name` column when filtering the dataset.
CONTAINER = 'core.soaesb'

## Logging

In [None]:
# Send INFO-and-above records to stdout as "<time> <level> | <message>".
logging.basicConfig(
    format='%(asctime)s %(levelname)s | %(message)s',
    level=logging.INFO,
    stream=sys.stdout
)

# Module-level logger shared by the functions and classes below.
logger = logging.getLogger(__name__)

# Define Dataset

## Define Database Functions

In [None]:
def database_builder(path: str) -> pd.DataFrame:
    """
    Load the ``logs`` table of every database file directly inside a folder.
    :param path: str
        folder to scan (not recursive); a trailing separator is optional
    :return pd.DataFrame
        the concatenation of all tables, in the order the files were listed
    :raises ValueError
        when no database in the folder could be read
    """
    logger.info('Building DataFrame ...')
    (_, _, files) = next(os.walk(path))
    sql_query = 'SELECT * FROM logs'
    data = []
    for f in files:
        if '.db' not in f:
            continue
        # os.path.join works with or without a trailing slash on `path`;
        # plain concatenation silently produced a wrong file name without one.
        conn = create_connection(os.path.join(path, f))
        if conn is None:
            # create_connection has already logged the failure.
            continue
        try:
            data.append(pd.read_sql_query(sql_query, conn))
        finally:
            conn.close()
    if not data:
        raise ValueError('No database files could be read from ' + path)
    logger.info('...complete!')
    return pd.concat(data)


def create_connection(path: str) -> sql.Connection:
    """
    Open a SQLite connection and log the outcome.
    :param path: str
        location of the database file
    :return sql.Connection
        the open connection; when ``sql.connect`` raises ``sql.Error`` the
        error is logged as a warning and None is returned instead
    """
    try:
        connection = sql.connect(path)
    except sql.Error as err:
        logger.warning(err)
    else:
        logger.info('Connected to database ' + path)
        return connection

## Define Dataset Main

In [None]:
# Load every database under <SOURCE>/data/ into one DataFrame ...
dataset = database_builder(SOURCE + '/data/')
# ... and keep only the rows whose container_name equals CONTAINER.
container_dataset = dataset[dataset['container_name'] == CONTAINER]

# W2V Pipeline

## Pipeline Objects

### Configuration

In [None]:
def set_attributes(self, config: dict):
    """
    Copy the config section named after the object's class onto the object.
    :param self: object
        instance to update; its class name is the key looked up in `config`
    :param config: dict
        mapping of class name -> {attribute name: value}
    When the section is missing (or `config` is not subscriptable, e.g. None
    from an empty YAML file) a warning is logged and the object is left
    untouched.
    """
    section_name = self.__class__.__name__
    try:
        section = config[section_name]
    except (LookupError, TypeError) as e:
        logger.warning(e)
        logger.warning('No configuration found for ' + section_name)
        # Without this return the loop below would copy every top-level key
        # of the whole config onto the object.
        return

    # An empty section (key present, no values) parses to None in YAML.
    for attr, value in (section or {}).items():
        setattr(self, attr, value)


@dataclass
class PreprocessingGlobalConfig:
    """Settings shared by the preprocessing stages, with their defaults."""
    # The next four fields are not read anywhere in the visible part of this
    # file; their meaning is presumably given by their names - TODO confirm.
    embed_size: int = 512
    max_vocab_size: int = 2000
    buffer_size: int = 10000
    global_training: bool = True
    # Sub-path appended to SOURCE (and followed by a model file name) when the
    # phraser / template-miner models are loaded or saved.
    path: str = '/results/'

    def load(self, config: dict) -> None:
        """Override fields from the section of `config` keyed by this class's name."""
        set_attributes(self, config)


@dataclass
class PhraserModelConfig:
    """Settings consumed by PhraserModel, with their defaults."""
    # Passed to gensim's Phrases(min_count=..., threshold=...) when a new
    # model is created.
    min_count: int = 5
    threshold: float = 7
    # True: load an existing model with joblib instead of creating a new one.
    load_model: bool = True
    # True: dump the model with joblib after its vocabulary was updated.
    save_model: bool = False
    # Default mode for PhraserModel.__call__ when no `training` flag is given.
    training: bool = True
    # File name of the stored model, appended to SOURCE + global path.
    model_name: str = 'phrase_model.joblib'

    def load(self, config: dict) -> None:
        """Override fields from the section of `config` keyed by this class's name."""
        set_attributes(self, config)


@dataclass
class TextClusteringConfig:
    """Settings consumed by TextClustering, with their defaults."""
    # True: load an existing template miner with joblib instead of creating one.
    load_model: bool = True
    # True: dump the template miner with joblib after training.
    save_model: bool = False
    # Default mode for TextClustering.__call__ when no `training` flag is given.
    training: bool = True
    # File name of the stored model, appended to SOURCE + global path.
    model_name: str = 'template_miner.joblib'

    def load(self, config: dict) -> None:
        """Override fields from the section of `config` keyed by this class's name."""
        set_attributes(self, config)


class PreprocessingPipelineConfig:
    """Bundle of the three preprocessing config objects, loadable from YAML."""

    def __init__(self):
        # Each attribute is named after the class it holds; the same name is
        # the section key looked up in the YAML document.
        self.PreprocessingGlobalConfig = PreprocessingGlobalConfig()
        self.PhraserModelConfig = PhraserModelConfig()
        self.TextClusteringConfig = TextClusteringConfig()

    def load(self, path):
        """
        Read a YAML file and let every sub-config pick up its own section.
        :param path: str
            location of the YAML file
        :return None
            a missing file is logged as a warning and all defaults are kept
        """
        try:
            with open(path) as stream:
                parsed = yaml.load(stream, Loader=yaml.FullLoader)
        except FileNotFoundError as missing:
            logger.warning(missing)
            return None

        sections = (
            self.PreprocessingGlobalConfig,
            self.PhraserModelConfig,
            self.TextClusteringConfig,
        )
        for section in sections:
            section.load(parsed)

### Tokenizer

In [None]:
class PrimeTokenizer:
    """WordPiece tokenizer (Hugging Face `tokenizers`) with BERT-style
    post-processing, fixed-length padding and truncation.

    The tokenizer is persisted to / restored from
    ``SOURCE + "/results/prime_tokenizer.json"``.
    """

    def __init__(self, max_seq_length: int):
        """
        :param max_seq_length: int
            every encoding is padded and truncated to exactly this length
        """
        self.prime_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
        self.prime_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
        self.prime_tokenizer.pre_tokenizer = Whitespace()
        self.prime_tokenizer.decoder = decoders.WordPiece()
        # NOTE(review): no pad_id is given, so the library default applies
        # (0 according to the tokenizers docs), while the trainer below lists
        # "[PAD]" as the fourth special token - confirm the intended pad id.
        self.prime_tokenizer.enable_padding(length=max_seq_length)
        self.prime_tokenizer.enable_truncation(max_seq_length)

        # The ids 1 and 2 mirror the positions of [CLS] and [SEP] in the
        # trainer's special_tokens list below.
        self.prime_tokenizer.post_processor = TemplateProcessing(
            single="[CLS] $A [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[
                ("[CLS]", 1),
                ("[SEP]", 2),
            ],
        )

        self.trainer = WordPieceTrainer(
            vocab_size=153411,
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
        )

    def text_to_sequence(self, input_) -> List[tokenizers.Encoding]:
        """Encode text: a list is encoded as a batch (list of Encoding),
        anything else as a single input (one Encoding)."""
        if isinstance(input_, list):
            return self.prime_tokenizer.encode_batch(input_)
        return self.prime_tokenizer.encode(input_)

    def sequence_to_text(self, input_) -> List[str]:
        """Decode token ids back to text.

        A sequence whose first element is itself a list/tuple is decoded as a
        batch (list of str); a flat sequence of ids is decoded as one str.
        """
        # The previous check treated every list as a batch and then referenced
        # an undefined name, so any list input raised NameError.
        is_batch = (
            isinstance(input_, (list, tuple))
            and len(input_) > 0
            and isinstance(input_[0], (list, tuple))
        )
        if is_batch:
            return self.prime_tokenizer.decode_batch(input_)
        return self.prime_tokenizer.decode(input_)

    def train(self, data):
        """Train the tokenizer on an iterable of texts, then save it."""
        log_itr = iter(data)
        self.prime_tokenizer.train_from_iterator(log_itr, self.trainer)
        self.save()

    def get_tokenizer(self) -> Tokenizer:
        """Return the underlying `tokenizers.Tokenizer`."""
        return self.prime_tokenizer

    def get_vocab(self) -> Dict[str, int]:
        """Return the vocabulary reported by the underlying tokenizer."""
        return self.prime_tokenizer.get_vocab()

    def get_vocab_size(self) -> int:
        """Return the vocabulary size reported by the underlying tokenizer."""
        return self.prime_tokenizer.get_vocab_size()

    def save(self):
        """Write the tokenizer to its fixed JSON location under SOURCE."""
        self.prime_tokenizer.save(SOURCE + "/results/prime_tokenizer.json")

    def load(self):
        """Replace the tokenizer with the one stored at the fixed JSON location."""
        self.prime_tokenizer = Tokenizer.from_file(SOURCE + "/results/prime_tokenizer.json")

### Generic Save Model

In [None]:
def save_model(model, path):
    """
    Serialize `model` to `path` with joblib.
    :param model: object
        anything joblib can dump
    :param path: str
        destination file; an existing regular file there is deleted first
    """
    stale_file_present = os.path.isfile(path)
    if stale_file_present:
        os.remove(path)

    joblib.dump(model, path)

### PhraseCaptureLayer

In [None]:
class PhraserModel:
    """Detects multi-token phrases in logs with a gensim `Phrases` model."""

    def __init__(self,
                 config: PhraserModelConfig,
                 global_config: PreprocessingGlobalConfig):
        """
        :param config: PhraserModelConfig
            model hyper-parameters, load/save switches and file name
        :param global_config: PreprocessingGlobalConfig
            supplies the sub-path (under SOURCE) where the model file lives
        """
        super().__init__()
        self.min_count = config.min_count
        self.threshold = config.threshold
        self.load_model = config.load_model
        self.save_model = config.save_model
        self.path = global_config.path
        self.model_name = config.model_name
        self.training = config.training

        if self.load_model:
            # NOTE: joblib.load unpickles the file; only load trusted models.
            self.phrase_model = joblib.load(SOURCE +
                                            self.path +
                                            self.model_name)
        else:
            self.phrase_model = Phrases(min_count=self.min_count,
                                        threshold=self.threshold)

    def __call__(self, corpus: pd.DataFrame, training=None) -> list:
        """
        Apply phrase detection to every entry of the 'log' column.
        :param corpus: pd.DataFrame
            must contain a 'log' column of strings
        :param training: bool or None
            True updates the model vocabulary with `corpus` first (and saves
            the model when configured); None falls back to the configured mode
        :return list
            one space-joined string per log, with detected phrases merged
        """
        if training is None:
            training = self.training

        split_corpus = [log.split(' ') for log in corpus['log']]

        if training:
            self.phrase_model.add_vocab(split_corpus)

            if self.save_model:
                save_model(self.phrase_model, SOURCE + self.path + self.model_name)

            model = self.phrase_model
        else:
            # Previously the frozen model was built and then ignored; use it,
            # since it is the variant meant for inference.
            model = self.phrase_model.freeze()

        return [' '.join(tokenized_log) for tokenized_log in model[split_corpus]]

### TextClusteringLayer

In [None]:
class TextClustering:
    """Maps logs to templates mined by a drain3 `TemplateMiner`."""

    def __init__(self,
                 config: TextClusteringConfig,
                 global_config: PreprocessingGlobalConfig):
        """
        :param config: TextClusteringConfig
            load/save switches, default mode and model file name
        :param global_config: PreprocessingGlobalConfig
            supplies the sub-path (under SOURCE) where the model file lives
        """
        super().__init__()
        self.load_model = config.load_model
        self.save_model = config.save_model
        self.path = global_config.path
        self.model_name = config.model_name
        self.training = config.training

        if self.load_model is True:
            # NOTE: joblib.load unpickles the file; only load trusted models.
            self.template_miner = joblib.load(SOURCE +
                                              self.path +
                                              self.model_name)
        else:
            self.template_miner = drain3.TemplateMiner()

    def __call__(self, corpus: list, training=None) -> list:
        """
        Convert logs to templates.
        :param corpus: list
            log messages; the list itself is no longer modified
        :param training: bool or None
            None falls back to the configured mode
        :return list
            training: one whitespace-collapsed template per input log, in
            input order.
            otherwise: the distinct templates seen, in first-seen order.
        """
        if training is None:
            training = self.training

        if training:
            # First pass: feed every log to the miner.
            for log in corpus:
                self.template_miner.add_log_message(log)
            if self.save_model:
                save_model(self.template_miner,
                           SOURCE + self.path + self.model_name)

            # Second pass: look each log up again after all logs were added.
            templates = []
            for log in corpus:
                cluster = self.template_miner.match(log)
                # The else-branch below shows match() may return None; keep
                # the raw log in that case instead of raising AttributeError.
                templates.append(log if cluster is None else cluster.get_template())

            return [re.sub(r' +', ' ', template) for template in templates]

        # A dict keeps first-seen order, so the result is deterministic
        # (a set gave an arbitrary order).
        unique_templates = {}
        for log in corpus:
            match_cluster = self.template_miner.match(log)
            if match_cluster is None:
                # Unknown logs are added to the miner even outside training;
                # this keeps the original behavior.
                template = self.template_miner.add_log_message(log)['template_mined']
            else:
                template = match_cluster.get_template()
            unique_templates[template] = None
        return list(unique_templates)

    def get_unique_templates(self) -> list:
        """Return the template of every cluster the miner currently holds,
        with runs of spaces collapsed to one."""
        return [re.sub(r' +', ' ', cluster.get_template())
                for cluster in self.template_miner.drain.clusters]

### Preprocessing Pipeline

In [None]:
def process_all_batches(n_iter, log_labels, batch_size):
    """
    Build every batch of the module-level `dataset`.
    :param n_iter: int
        index of the last batch; batches 0..n_iter (inclusive) are produced
    :param log_labels: dict
        label -> numeric value, forwarded to process_batch
    :param batch_size: int
        rows per batch
    :return list
        one (log_batch, labels) tuple per batch index
    """
    produced = (
        process_batch(dataset, batch_idx, log_labels, batch_size)
        for batch_idx in range(n_iter + 1)
    )
    return [(log_batch, labels) for log_batch, labels in produced]

def process_batch(dataset: pd.DataFrame,
                  idx: int,
                  labels: dict,
                  batch_size: int) -> tuple:
    """
    Tokenize one window of rows and look up their numeric labels.
    :param dataset: pd.DataFrame
        needs the columns 'log' and 'label'
    :param idx: int
        zero-based batch index; selects rows [idx*batch_size, (idx+1)*batch_size)
    :param labels: dict
        label value -> numeric value
    :param batch_size: int
        rows per batch
    :return tuple
        (token-id tensor, label tensor), both converted with dtype float32
    Relies on a module-level `prime_tokenizer` that is not defined in the
    visible part of this file.
    """
    window = slice(idx * batch_size, (idx + 1) * batch_size)
    rows = dataset.iloc[window]
    encodings = prime_tokenizer.text_to_sequence(rows['log'].to_list())
    token_ids = [encoding.ids for encoding in encodings]
    targets = [labels[label] for label in rows['label']]

    return (
        tf.convert_to_tensor(token_ids, dtype=tf.float32),
        tf.convert_to_tensor(targets, dtype=tf.float32),
    )

## Unsupervised Learning Pipeline

In [None]:
# Type alias used in annotations below: a sequence of integer token ids.
EncodedSeq = list[int]


def normalize_logs(logs: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `logs` whose 'log' column has been cleaned.
    Every timestamp, every run of two or more whitespace characters and every
    remaining character that is not a letter, digit or colon is replaced by a
    single space.
    :param logs: pd.DataFrame
        must contain a 'log' column; it is not modified
    :return pd.DataFrame
        the cleaned copy
    """
    # remove timestamps and double spaces
    regexp = re.compile(
        r"""
        (?:               # Match all enclosed
        \d{4}-\d{2}-\d{2} # YYYY-MM-DD
        [\sT]             # Accept either a space or T
        \d{2}:\d{2}:\d{2} # HH:MM:SS
        ([.,]\d{3}|\s)    # Accept either a space or milliseconds
        )                 # End timestamp match
        | (?:\s{2,})      # Remove double spaces   
        | [^a-zA-Z\d:]    # Clean non-alphanumeric characters
        """, re.X)        # re.X enables comments and whitespace

    c_logs = deepcopy(logs)
    # Assign the result back to the column. The former chained
    # `.loc[:, 'log'].replace(..., inplace=True)` modified an intermediate
    # object, which leaves the frame untouched under pandas Copy-on-Write.
    c_logs['log'] = c_logs['log'].replace(
        to_replace=regexp,
        value=' ',
        regex=True
    )

    return c_logs


def batch_to_tensor(batch) -> tf.Tensor:
    """
    Stack a batch of id sequences into one tensor.
    :param batch: iterable
        sequences that each convert to an int32 tensor
    :return tf.Tensor
        the per-sequence tensors stacked under the name 'tensor_batch'
    """
    as_tensors = [tf.convert_to_tensor(sequence, dtype=tf.int32) for sequence in batch]
    return tf.stack(as_tensors, name='tensor_batch')


def serve_batch(batch, tokenizer):
    """
    Encode a batch of texts and return their ids as a stacked tensor.
    :param batch: list
        texts handed to `tokenizer.text_to_sequence`
    :param tokenizer: object
        must offer `text_to_sequence`, returning objects with an `ids` attribute
    :return tf.Tensor
        result of batch_to_tensor on the id lists
    """
    encodings = tokenizer.text_to_sequence(batch)
    return batch_to_tensor([encoding.ids for encoding in encodings])


def extract_unique_labels(logs: pd.DataFrame) -> dict:
    """
    Map every distinct value of the 'label' column to an integer code.
    :param logs: pd.DataFrame
        must contain a 'label' column
    :return dict
        label -> code assigned by sklearn's LabelEncoder
    """
    distinct_labels = logs['label'].unique()
    codes = LabelEncoder().fit_transform(distinct_labels)
    # Both arrays are aligned by position, so zipping pairs each label with
    # its own code.
    return dict(zip(distinct_labels, codes))


def create_sentence_pairing(
    logs: pd.DataFrame,
    n_iter: int,
    batch_size: int
) -> tuple[list, list, tf.Tensor]:
    """
    Build next-sentence-prediction pairs from consecutive logs.
    For each log, with probability ~0.5 the partner is the log that follows
    it (wrapping around at the end, label 0); otherwise the partner is a
    randomly drawn log (label 1).
    :param logs: pd.DataFrame
        must contain a 'log' column
    :param n_iter: int
        number of batches; first dimension of the returned label tensor
    :param batch_size: int
        logs per batch; second dimension of the returned label tensor
    :return tuple
        (first sequences, second sequences, label tensor of shape
        (n_iter, batch_size))
    NOTE(review): the reshape requires len(logs) == n_iter * batch_size;
    callers that derive n_iter with floor division must make sure the number
    of logs is a multiple of batch_size.
    """
    # Three separate assignments: the former one-liner
    # `first_seqs = [], second_seqs = [], nsp_labels = []` is a chained
    # assignment that raises ValueError when executed.
    first_seqs = []
    second_seqs = []
    nsp_labels = []
    log_list = logs['log'].to_list()
    n_logs = len(log_list)

    for idx, first_value in enumerate(log_list):
        if random.random() > 0.5:
            # Pair with proper following log sequence
            second_value = log_list[(idx + 1) % n_logs]

            # IsNext Label
            nsp_labels.append(0)
        else:
            # Pair with random log
            # NOTE(review): the random index can equal idx or idx + 1, so a
            # pair labelled IsNotNext may occasionally be the true successor.
            rand_idx = random.randint(0, n_logs - 1)
            second_value = log_list[rand_idx]

            # IsNotNext Label
            nsp_labels.append(1)

        first_seqs.append(first_value)
        second_seqs.append(second_value)

    nsp_labels = tf.reshape(
        tf.constant(nsp_labels),
        (n_iter, batch_size),
        'nsp_labels'
    )

    return first_seqs, second_seqs, nsp_labels


def tokenize_data(tokenizer, groupings):
    """
    Tokenize paired sequences without padding.
    :param tokenizer: callable
        called as tokenizer(first, second, **options)
    :param groupings: sequence
        element 0 holds the first sequences, element 1 the paired sequences
    :return
        whatever `tokenizer` returns
    """
    options = {
        'padding': False,
        'truncation': True,
        'max_length': 1024,
        # The special-tokens mask is requested because the language-modeling
        # data collator is more efficient when it receives it.
        'return_special_tokens_mask': True,
    }
    first_sequences = groupings[0]
    paired_sequences = groupings[1]
    return tokenizer(first_sequences, paired_sequences, **options)


def collate_data(encodings, tokenizer, max_seq_len, batch_size):
    """
    Densify the encodings and run them through the language-modeling collator.
    :param encodings: object
        item-assignable encodings exposing a `.data` mapping; the entries
        'input_ids', 'token_type_ids' and 'attention_mask' are overwritten in
        place with dense tensors padded with 0
    :param tokenizer: object
        forwarded to TFDataCollatorForLanguageModeling
    :param max_seq_len: int
        forwarded as `padding_length`
    :param batch_size: int
        forwarded as `batch_size`
    :return
        result of calling the collator on the sliced dataset
    """
    # Ragged -> dense so that every row has the same length.
    for field in ('input_ids', 'token_type_ids', 'attention_mask'):
        encodings[field] = tf.ragged.constant(encodings[field]).to_tensor(0)

    sliced = tf.data.Dataset.from_tensor_slices(encodings.data)

    collator = TFDataCollatorForLanguageModeling(
        tokenizer,
        padding_length=max_seq_len,
        batch_size=batch_size
    )
    return collator(sliced)


class UnsupervisedLearningPipeline:
    """End-to-end pipeline: normalize logs, merge phrases, convert logs to
    templates, train a WordPiece tokenizer and pre-train a BERT model.

    `fit` runs all stages on a DataFrame of logs; `transform` pushes a new
    batch through the fitted stages and the BERT model.
    """

    def __init__(
        self,
        config: PreprocessingPipelineConfig,
        epochs: int = 3,
        batch_size: int = 50,
        seq_length: int = 200
    ) -> None:
        """
        :param config: PreprocessingPipelineConfig
            settings for the phraser and template-clustering stages
        :param epochs: int
            number of epochs used by `pre_train_bert`
        :param batch_size: int
            logs per batch
        :param seq_length: int
            padded/truncated sequence length of the tokenizer
        """
        self.normalized_logs = None
        self.log_labels = None
        self.bert_tokenizer = PrimeTokenizer(seq_length)
        self.fast_tokenizer = None
        self.train_dataset = None
        self.token_logs = None
        self.bert_config = None
        self.BERT = None
        self.max_seq_len = seq_length
        self.n_logs = 0
        self.n_iter = 0
        self.epochs = epochs
        self.batch_size = batch_size
        self.logs_with_phrases = list()
        self.logs_as_templates = None
        self.data_collator = None

        self.pm = PhraserModel(config.PhraserModelConfig,
                               config.PreprocessingGlobalConfig)

        self.tc = TextClustering(config.TextClusteringConfig,
                                 config.PreprocessingGlobalConfig)

    def initialize_fast_tokenizer(self):
        """Wrap the trained tokenizer in a PreTrainedTokenizerFast, register
        the special tokens, store it on the instance and return it."""
        tokenizer_obj = self.bert_tokenizer.get_tokenizer()
        fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer_obj)
        fast_tokenizer.model_max_length = self.max_seq_len
        fast_tokenizer.unk_token = "[UNK]"
        fast_tokenizer.sep_token = "[SEP]"
        fast_tokenizer.pad_token = "[PAD]"
        fast_tokenizer.cls_token = "[CLS]"
        fast_tokenizer.mask_token = "[MASK]"
        self.fast_tokenizer = fast_tokenizer
        return fast_tokenizer

    def get_pre_training_data(self):
        """Build the tf.data.Dataset of (features, targets) used for BERT
        pre-training from `self.normalized_logs`.

        Targets hold the collator's "labels" entry plus the
        "next_sentence_label" tensor; every other collated entry is a feature.
        """
        fast_tokenizer = self.initialize_fast_tokenizer()

        first_sequences, pair_sequences, nsp_labels = create_sentence_pairing(
            self.normalized_logs,
            self.n_iter,
            self.batch_size
        )

        encodings = fast_tokenizer(
            first_sequences,
            pair_sequences,
            truncation=True,
            padding=False
        )

        # Keyword arguments on purpose: the positional call used to pass
        # batch_size and max_seq_len in swapped order.
        encoded_objs = collate_data(
            encodings,
            self.fast_tokenizer,
            max_seq_len=self.max_seq_len,
            batch_size=self.batch_size
        )

        # Materialize once so every key is read from the same pass over the
        # collated data (presumably masking is random per pass - TODO confirm).
        batches = list(encoded_objs)

        pre_training_data = dict()
        target_values = dict()
        for key in batches[0].keys():
            temp_tensor = tf.constant([x[key].numpy() for x in batches])
            if key == "labels":
                target_values[key] = temp_tensor
            else:
                pre_training_data[key] = temp_tensor

        target_values["next_sentence_label"] = nsp_labels

        pre_train_dataset = tf.data.Dataset.from_tensor_slices((
            pre_training_data,
            target_values
        ))

        pre_train_dataset = pre_train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

        return pre_train_dataset

    def train_bert_tokenizer(self, load_model=False):
        """Load the stored tokenizer, or train a new one on the log templates."""
        if load_model:
            self.bert_tokenizer.load()
        else:
            self.bert_tokenizer.train(self.logs_as_templates)

    def serve_batch_w_idx(self, idx):
        """Encode the idx-th batch of log templates and return its id tensor."""
        start_idx = idx * self.batch_size
        end_idx = start_idx + self.batch_size

        log_batch = self.logs_as_templates[start_idx:end_idx].tolist()
        batch = self.bert_tokenizer.text_to_sequence(log_batch)
        batch_ids = [log.ids for log in batch]
        return batch_to_tensor(batch_ids)

    def custom_compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
        """Masked-LM loss: sparse categorical cross-entropy (from logits, no
        reduction) over the positions whose label is not -100.

        The next-sentence loss is not included.
        """
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
        )
        # make sure only labels that are not equal to -100
        # are taken into account as loss
        masked_lm_active_loss = tf.not_equal(tf.reshape(tensor=labels, shape=(-1,)), -100)
        masked_lm_reduced_logits = tf.boolean_mask(
            tensor=tf.reshape(tensor=logits, shape=(-1, shape_list(logits)[2])),
            mask=masked_lm_active_loss,
        )
        masked_lm_labels = tf.boolean_mask(
            tensor=tf.reshape(tensor=labels, shape=(-1,)), mask=masked_lm_active_loss
        )
        masked_lm_loss = loss_fn(y_true=masked_lm_labels, y_pred=masked_lm_reduced_logits)

        logger.debug('masked LM loss: %s', masked_lm_loss)
        return masked_lm_loss

    def backprop(self, current_loss, learning_rate):
        """Manual gradient step.

        NOTE(review): this method is not called anywhere in the visible file
        and looks non-functional as written: the loss is computed outside the
        tape, and it mixes `self.BERT.bert.w`, `self.BERT.w` and `self.BERT.b`.
        It is kept unchanged pending a decision to fix or remove it.
        """
        with tf.GradientTape() as t:
            # Use GradientTape to calculate the gradients with respect to W and b
            dw, db = t.gradient(current_loss, [self.BERT.bert.w, self.BERT.b])

            # Subtract the gradient scaled by the learning rate
            self.BERT.w.assign_sub(learning_rate * dw)
            self.BERT.b.assign_sub(learning_rate * db)

    def pre_train_bert(self):
        """Create a BERT pre-training model and fit it on `self.train_dataset`.

        :return
            the value returned by `fit` on the model (a Keras History object
            for Keras models)
        """
        self.bert_config = BertConfig(
            vocab_size=self.bert_tokenizer.get_vocab_size(),
            hidden_size=512,
            num_hidden_layers=8,
            num_attention_heads=8
        )

        self.BERT = TFBertForPreTraining(self.bert_config)
        self.BERT.resize_token_embeddings(len(self.fast_tokenizer))
        optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
        self.BERT.compile(optimizer=optimizer, loss=self.BERT.compute_loss)

        with yaspin(text="", color='blue') as sp:
            # Honor the configured epoch count (was hard-coded to 5) and keep
            # the result: the old code returned an undefined name `outputs`.
            history = self.BERT.fit(self.train_dataset, epochs=self.epochs)

            sp.text = ""
            sp.ok('✔ Finished Training Epochs')

        return history

    def run_bert(self, batch):
        """Run the BERT model on a batch, requesting attention outputs."""
        return self.BERT(batch,
                         output_attentions=True)

    def fit(self, logs: pd.DataFrame):
        """Run every stage on `logs` and pre-train BERT.

        :param logs: pd.DataFrame
            non-empty frame with at least the columns 'log' and 'label'
        :return
            the result of `pre_train_bert`
        """
        assert len(logs.index) > 0, 'process received an empty dataframe!'

        with yaspin(text="Normalizing Logs", color='green') as sp:
            # normalize_logs / extract_unique_labels are module-level
            # functions, not methods of this class.
            self.normalized_logs = normalize_logs(logs)
            sp.text = ""
            sp.ok('✔ Completed log normalization')

            sp.text = "Extracting phrases"
            self.logs_with_phrases = self.pm(self.normalized_logs)
            sp.text = ""
            sp.ok('✔ Completed phrase extraction')

            sp.text = "Converting to log templates"
            self.logs_as_templates = np.array(self.tc(self.logs_with_phrases))
            sp.text = ""
            sp.ok('✔ Completed log template conversion')

            sp.text = "Extracting Unique Labels"
            self.log_labels = extract_unique_labels(self.normalized_logs)
            sp.text = ""
            sp.ok('✔ Completed extracting unique labels')

            sp.text = "Training Tokenizer"
            self.train_bert_tokenizer()
            sp.text = ""
            sp.ok('✔ Completed training of custom tokenizer')

            self.n_logs = len(self.normalized_logs['log'])
            self.n_iter = self.n_logs // self.batch_size

            sp.text = "Processing training dataset"
            self.train_dataset = self.get_pre_training_data()
            sp.text = ""
            sp.ok('✔ Completed processing training dataset')

            sp.text = "Pretraining BERT"
            outputs = self.pre_train_bert()
            sp.text = ""
            sp.ok('✔ Completed BERT pretraining')

        return outputs

    def transform(self, batch: pd.DataFrame):
        """Push a batch of raw logs through the fitted stages (without
        training them) and return the BERT output."""
        x = normalize_logs(batch)
        x = self.pm(x, False)
        x = list(self.tc(x, False))
        # serve_batch is a module-level function that needs the tokenizer.
        x = serve_batch(x, self.bert_tokenizer)
        return self.run_bert(x)

In [None]:
# Exploration cell: list the model types registered for masked language
# modeling and instantiate the config class registered under "bert".
from transformers import CONFIG_MAPPING, MODEL_FOR_MASKED_LM_MAPPING
from transformers import DataCollatorForLanguageModeling
MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
print("If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES))
CONFIG_MAPPING["bert"]()

In [None]:
# list(w2vp.train_dataset)[0]

## W2V Pipeline Main

In [None]:
# Start from the default settings and override them from the YAML file;
# a missing file only logs a warning and keeps the defaults.
config_path = SOURCE + '/assets/notebooks/PreprocessingConfig.yaml'
preprocessing_config = PreprocessingPipelineConfig()
preprocessing_config.load(config_path)

In [None]:
from datasets import Dataset

In [None]:
# Display the normalized logs.
# NOTE(review): `w2vp` is not created in any visible cell - presumably an
# UnsupervisedLearningPipeline instance that was already fitted; confirm.
w2vp.normalized_logs

In [None]:
# Drop the label, container_name and timestamp columns from the normalized logs.
dt = w2vp.normalized_logs.drop(["label", "container_name", "timestamp"], axis=1)

In [None]:
# Build a `datasets.Dataset` from the first 1000 rows only.
data = Dataset.from_pandas(dt[:1000])

In [None]:
def sample_generator(dataset, tokenizer, mlm_probability=0.15, pad_to_multiple_of=None):
    """
    Yield masked-LM training samples in random order.
    :param dataset: indexable
        supports len() and integer indexing; each item is passed to
        `tokenizer.pad` and must carry a "next_sentence_label" entry
    :param tokenizer: object
        needs `mask_token`, `pad`, `pad_token_id` plus what mask_tokens uses
    :param mlm_probability: float
        forwarded to mask_tokens
    :param pad_to_multiple_of: int or None
        forwarded to `tokenizer.pad`
    :return generator
        yields (features, targets); features is the padded and masked sample
        with every entry converted to a tensor, targets holds its "labels"
        and "next_sentence_label" entries
    :raises ValueError
        when the tokenizer has no mask token
    """
    if tokenizer.mask_token is None:
        raise ValueError("This tokenizer does not have a mask token which is necessary for masked language modeling. ")

    # Visit every sample exactly once, in shuffled order.
    for position in np.random.permutation(len(dataset)):
        # Pad the single sample and convert it to numpy arrays.
        sample = tokenizer.pad(
            dataset[int(position)],
            return_tensors="np",
            pad_to_multiple_of=pad_to_multiple_of,
        )
        token_mask = sample.pop("special_tokens_mask", None)
        sample["input_ids"], sample["labels"] = mask_tokens(
            sample["input_ids"], mlm_probability, tokenizer, special_tokens_mask=token_mask
        )
        # Label positions equal to the pad id are excluded from the loss.
        if tokenizer.pad_token_id is not None:
            sample["labels"][sample["labels"] == tokenizer.pad_token_id] = -100
        sample = {name: tf.convert_to_tensor(values) for name, values in sample.items()}

        # TF needs some kind of labels, even if we don't use them
        targets = {"labels": sample["labels"], "next_sentence_label": sample["next_sentence_label"]}
        yield sample, targets


def mask_tokens(inputs, mlm_probability, tokenizer, special_tokens_mask):
    """
    Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.

    :param inputs: array-like of int
        token ids; copied, so the caller's array is left untouched
    :param mlm_probability: float
        probability with which a non-special token is selected for masking
    :param tokenizer: object
        must provide `mask_token`, `convert_tokens_to_ids` and `__len__`
    :param special_tokens_mask: array-like or None
        truthy where a token must never be masked; None means no token is
        protected
    :return tuple
        (masked inputs, labels); labels are -100 everywhere except at the
        selected positions, where they hold the original token id
    """
    # Work on a copy so the caller's data is not modified in place.
    inputs = np.copy(inputs)
    labels = np.copy(inputs)
    # We sample a few tokens in each sequence for MLM training (with probability `mlm_probability`)
    probability_matrix = np.random.random_sample(labels.shape)
    if special_tokens_mask is not None:
        special_tokens_mask = np.asarray(special_tokens_mask).astype(np.bool_)
        probability_matrix[special_tokens_mask] = 0.0

    masked_indices = probability_matrix > (1 - mlm_probability)
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = (np.random.random_sample(labels.shape) < 0.8) & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    # (half of the 20% that were not replaced by [MASK])
    indices_random = (np.random.random_sample(labels.shape) < 0.5) & masked_indices & ~indices_replaced
    random_words = np.random.randint(low=0, high=len(tokenizer), size=np.count_nonzero(indices_random), dtype=np.int64)
    inputs[indices_random] = random_words

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels

In [None]:
def create_sentence_pairing(examples):
    """Build (first, second) log pairs with next-sentence-prediction labels.

    Empty and whitespace-only lines are dropped first. Each remaining line is
    then paired, with roughly equal probability, either with the line that
    follows it (label 0, "IsNext") or with a randomly drawn line (label 1,
    "IsNotNext").

    Args:
        examples: Batch mapping whose ``"log"`` entry is a sequence of strings.
            The batch itself is not modified.

    Returns:
        Dict with ``"log"`` (list of ``(first, second)`` tuples) and
        ``"next_sentence_label"`` (list of 0/1 ints of the same length).

    Notes:
        The last line of the batch wraps around and uses the first line as its
        "next" line. With a single line in the batch the only possible partner
        is the line itself, whichever label is drawn.
    """
    log_list = [
        line for line in examples["log"] if line and not line.isspace()
    ]
    n_logs = len(log_list)

    pairs = []
    nsp_labels = []
    for idx, first_value in enumerate(log_list):
        next_idx = (idx + 1) % n_logs
        if random.random() > 0.5:
            # Pair with proper following log sequence (IsNext label)
            second_idx = next_idx
            nsp_labels.append(0)
        else:
            # Pair with a random log (IsNotNext label). The true next index is
            # excluded so a negative pair can never be the real continuation.
            if n_logs > 1:
                second_idx = random.randrange(n_logs - 1)
                if second_idx >= next_idx:
                    second_idx += 1
            else:
                second_idx = idx
            nsp_labels.append(1)

        pairs.append((first_value, log_list[second_idx]))

    return {"log": pairs, "next_sentence_label": nsp_labels}

def tokenize_function(examples):
    """Tokenize the ``"log"`` entry of a batch with ``w2vp.fast_tokenizer``.

    The tokenizer is called without padding, with truncation enabled and
    ``max_length=1024``, and is asked to return the special-tokens mask.
    """
    tokenizer_options = {
        "padding": False,
        "truncation": True,
        "max_length": 1024,
        # Requested so the later masking step receives a ready-made
        # special-tokens mask alongside the ids.
        "return_special_tokens_mask": True,
    }
    return w2vp.fast_tokenizer(examples["log"], **tokenizer_options)



# Step 1: turn each batch of raw log lines into (first, second) pairs plus
# next-sentence labels. The progress description now names this step; it was
# previously a copy of the tokenizer step's description.
tokenized_datasets = data.map(
    create_sentence_pairing,
    batched=True,
    num_proc=1,
    remove_columns=["log"],
    desc="Creating sentence pairs on dataset line_by_line",
)
# Step 2: tokenize the paired logs; the text column is dropped afterwards.
tokenized_datasets = tokenized_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=1,
    remove_columns=["log"],
    desc="Running tokenizer on dataset line_by_line",
)

In [None]:
from sklearn.model_selection import train_test_split

# Split the row indices 70/30 into train and validation sets.
# NOTE(review): no random_state is passed, so the split is not pinned — confirm this is intended.
train_indices, val_indices = train_test_split(
    list(range(len(tokenized_datasets))), test_size=.30
)

eval_dataset = tokenized_datasets.select(val_indices)
train_dataset = tokenized_datasets.select(train_indices)

# One int64 spec of shape (None,) per dataset feature, skipping `special_tokens_mask`.
# NOTE(review): every feature, including `next_sentence_label`, gets shape (None,) —
# verify this matches what the generator yields.
train_signature = {
    feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
    for feature in train_dataset.features
    if feature != "special_tokens_mask"
}
# `labels` reuses the spec object of `input_ids`.
train_signature["labels"] = train_signature["input_ids"]
# Rebind the name to the final (features, targets) pair; the targets hold only
# `labels` and `next_sentence_label`.
train_signature = (train_signature, {"labels": train_signature["labels"], "next_sentence_label": train_signature["next_sentence_label"]} )

In [None]:
from functools import partial

In [None]:
# Pre-bind the training split and the fast tokenizer as the first two
# positional arguments of `sample_generator`.
tokenized_generator = partial(sample_generator, train_dataset, w2vp.fast_tokenizer)

# Stream the generator into batches of 10 * 50 = 500 elements, dropping the
# final partial batch, and repeat the batched dataset 5 times.
# NOTE(review): the 10 * 50 batch size and the count of 5 are repeated as
# literals in other cells — keep them in sync.
tf_train_dataset = (
    tf.data.Dataset.from_generator(tokenized_generator, output_signature=train_signature)
    .batch(batch_size=10 * 50, drop_remainder=True)
    .repeat(int(5))
)

In [None]:
# Same construction as the training pipeline, applied to the validation split.
eval_generator = partial(sample_generator, eval_dataset, w2vp.fast_tokenizer)
# One int64 spec of shape (None,) per dataset feature, skipping `special_tokens_mask`.
eval_signature = {
    feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
    for feature in eval_dataset.features
    if feature != "special_tokens_mask"
}
# `labels` reuses the spec object of `input_ids`.
eval_signature["labels"] = eval_signature["input_ids"]
# Rebind the name to the final (features, targets) pair.
eval_signature = (eval_signature, {"labels": eval_signature["labels"], "next_sentence_label": eval_signature["next_sentence_label"]})
# Batches of 10 * 50 = 500 elements, dropping the final partial batch; no repeat here.
tf_eval_dataset = (
    tf.data.Dataset.from_generator(eval_generator, output_signature=eval_signature)
    .batch(batch_size=10 * 50, drop_remainder=True)
)

In [None]:
from transformers import create_optimizer

# Number of full 500-element batches in the training split.
batches_per_epoch = len(train_dataset) // (10 * 50)

# Build the optimizer and its learning-rate schedule for 5 * batches_per_epoch
# training steps with 2 warm-up steps.
# NOTE(review): adam_epsilon=0.1 is far above the values commonly used for Adam
# (around 1e-8) — confirm it is intentional.
# NOTE(review): a later cell rebinds `optimizer` to a plain Adam instance.
optimizer, lr_schedule = create_optimizer(
    init_lr=1e-4,
    num_train_steps=int(5 * batches_per_epoch),
    num_warmup_steps=2,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=0.1,
    weight_decay_rate=0.01,
)

In [None]:
# Compile the pipeline's BERT model with the current notebook-level `optimizer`,
# passing the model's own `compute_loss` under the "loss" key.
w2vp.BERT.compile(optimizer=optimizer, loss={"loss": w2vp.BERT.compute_loss})

In [None]:
class Phobert(TFBertForPreTraining):
    """``TFBertForPreTraining`` subclass with a hand-written ``train_step``."""

    def train_step(self, input_data):
        """Run one forward/backward pass and apply the gradients.

        Args:
            input_data: ``(x, y)`` pair. ``y`` must provide the ``"labels"``
                and ``"next_sentence_label"`` entries.

        Returns:
            ``{"loss": loss}`` where ``loss`` is the value returned by
            ``self.compute_loss``.
        """
        features, targets = input_data

        with tf.GradientTape() as tape:
            # Forward pass in training mode.
            y_pred = self(features, training=True)
            loss_targets = {
                "labels": targets["labels"],
                "next_sentence_label": targets["next_sentence_label"],
            }
            loss_logits = (
                y_pred["prediction_logits"],
                y_pred["seq_relationship_logits"],
            )
            loss = self.compute_loss(loss_targets, loss_logits)

        # Backward pass followed by the weight update.
        weights = self.trainable_variables
        grads = tape.gradient(loss, weights)
        self.optimizer.apply_gradients(zip(grads, weights))

        # Debug output; compiled metrics are not updated in this step.
        print(f"Metrics: {self.compiled_metrics.metrics}")
        print(f"InputData: {len(input_data)}")
        print(f"y_pred: {len(y_pred)}")

        return {"loss": loss}

In [None]:
# Model configuration: vocabulary size taken from w2vp.bert_tokenizer, hidden
# size 512, 8 hidden layers and 8 attention heads.
bert_config = BertConfig(
    vocab_size=w2vp.bert_tokenizer.get_vocab_size(),
    hidden_size=512,
    num_hidden_layers=8,
    num_attention_heads=8
)

phobert = Phobert(bert_config)
# Resize the token embeddings to the length of w2vp.fast_tokenizer.
phobert.resize_token_embeddings(len(w2vp.fast_tokenizer))
# Rebinds the notebook-level `optimizer` to Adam with a fixed 5e-5 learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
phobert.compile(optimizer=optimizer, loss=phobert.compute_loss)

In [None]:
# Train for 5 epochs inside a yaspin spinner, validating on tf_eval_dataset.
# steps_per_epoch is the number of full 500-element batches in train_dataset.
with yaspin(text="", color='blue') as sp:
    history = phobert.fit(
            tf_train_dataset, 
            validation_data=tf_eval_dataset,
            epochs=int(5),
            steps_per_epoch=len(train_dataset) // (50 * 10)
    )

In [None]:
# -- Unsupervised Learning Pipeline -- #

'''
Input: pd.DataFrame with batch_size number of rows 
Seq: 
    Normalize 
    Phraser
    Clustering
    Extract Unique Layers
    BERT
Returns: transformers.TFBertForPreTrainingOutput
'''

# --- SUBWORD TOKENIZER --
# Build the pipeline from `preprocessing_config` with epochs=10, batch_size=10
# and seq_length=60.
w2vp = UnsupervisedLearningPipeline(preprocessing_config, epochs=10, batch_size=10, seq_length=60)
# Fit on the first 1000 entries of `dataset` only.
training_outputs = w2vp.fit(dataset[:1000])

In [None]:
# Materialise `training_outputs` into a list so the notebook displays its items.
list(training_outputs)

In [None]:
@tf.function
def square_ragged_tensors(examples, key):
    """Convert a ragged value to a dense tensor padded with 0.

    When ``examples`` is a dict the value stored under ``key`` is converted;
    otherwise ``examples`` itself is converted and ``key`` is not used.
    """
    ragged = examples[key] if isinstance(examples, dict) else examples
    return ragged.to_tensor(0)

def some_func(examples: "tf.data.Dataset", is_ragged=False) -> "tf.data.Dataset":
    """Return ``examples`` with ragged components converted to padded dense tensors.

    The previous implementation assigned the converted data into a temporary
    ``list(examples)[0]`` (so the result was discarded) and mapped the
    two-argument ``square_ragged_tensors`` with a single argument for non-dict
    elements. This version builds and returns a new dataset instead.

    Args:
        examples: Dataset whose elements are tensors or (nested) structures of
            tensors.
        is_ragged: When false the dataset is returned untouched. When true,
            elements are batched in pairs, every ``tf.RaggedTensor`` component
            of a batch is padded with 0 via ``to_tensor``, and the batches are
            split back into single elements.

    Returns:
        The original dataset, or the batched/padded/unbatched dataset.
    """
    if not is_ragged:
        return examples

    def _densify(batch):
        # Pad only the ragged components; dense components pass through unchanged.
        return tf.nest.map_structure(
            lambda part: part.to_tensor(0) if isinstance(part, tf.RaggedTensor) else part,
            batch,
        )

    return examples.batch(2).map(_densify).unbatch()
        
# Two 1-D constants of different lengths, stacked into one ragged tensor.
a = tf.constant([0, 1, 2, 3])
b = tf.constant([0, 1, 2, 3, 5, 6])
rg = tf.ragged.stack([a, b])
# Feature dict combining the ragged ids with a plain list of two strings.
ex = {"input_ids": rg, "hello_world": ["goodbye", "world"]}

batch_encodings = tf.data.Dataset.from_tensor_slices(ex)

# Run the helper with is_ragged=True and materialise the resulting dataset.
list(some_func(batch_encodings, True))
# ex["input_ids"].to_tensor(0)

In [None]:
%%capture
!pip install bertviz

In [None]:
from bertviz import head_view, model_view

In [None]:
# Display the 'log' entry of `test_data` (bound outside this section).
test_data['log']

In [None]:
# Use the 'log' value of the first row of `test_data` as the sample sentence, then display it.
sentence = test_data.iloc[0]['log']
sentence

In [None]:
# Encode the sample sentence with the pipeline's tokenizer and keep the `.tokens`
# attribute of the result, then display it.
tokens = w2vp.bert_tokenizer.text_to_sequence(sentence).tokens
tokens

In [None]:
# Render the bertviz head view for the sample tokens.
# NOTE(review): `attention` is not assigned anywhere in this section — confirm it is defined before this cell runs.
head_view(attention, tokens)

In [None]:
# Render the bertviz model view for the same `attention` and `tokens` objects.
model_view(attention, tokens)

In [None]:
# Print the summary of the pipeline's BERT model.
w2vp.BERT.summary()

In [None]:
# Keep an independent deep copy of the pipeline's BERT model.
copy_bert = deepcopy(w2vp.BERT)

In [None]:
# NOTE(review): the attribute is only referenced, not called, so this cell just
# displays the attribute itself.
copy_bert.get_output_at

In [None]:
# Save the pipeline's BERT model under the '/results/' path below `SOURCE`
# (`SOURCE` is bound outside this section).
w2vp.BERT.save(SOURCE + '/results/')

In [None]:
# Display `tf_data` (bound outside this section).
tf_data




