# Setup

Feel free to change

In [1]:
# -- Base -- #
import os
import random
import joblib
import logging
import time
import re
import io
from datetime import datetime
from tqdm import tqdm
from tqdm.auto import trange
import ipdb
from copy import deepcopy
from dataclasses import dataclass
import sys, getopt
import json
from pathlib import Path
import yaml
import shutil
import csv
from typing import (
    List,
    Dict
)
from yaspin import yaspin
from yaspin.spinners import Spinners

# -- Tokenizer -- #
import tokenizers
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers import normalizers

from tokenizers.normalizers import (
    Lowercase,
    NFD,
    StripAccents
)

from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders

# -- PreTrained BERT -- #
import transformers
from transformers import TFAlbertForSequenceClassification
from transformers import PreTrainedTokenizerFast
from transformers import TFTrainer

# -- Metrics -- #
import numpy as np
import pandas as pd
import sqlite3 as sql
import tensorboard

# -- Tensorflow -- #
import tensorflow as tf
from tensorflow.keras.metrics import Mean
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.layers import (
    Softmax,
    Dense,
    AdditiveAttention,
    MultiHeadAttention,
    Layer
)

from tensorflow.keras.layers import (
    LayerNormalization,
    Dropout,
    Embedding
)

from tensorflow.keras import (
    Sequential,
    Model
)
from tensorflow.train import Checkpoint, CheckpointManager
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

# -- Misc Models -- #
import drain3
from gensim.models.phrases import Phrases
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.neighbors import NearestNeighbors
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
from sklearn.random_projection import SparseRandomProjection
from sklearn.svm import SVC

# -- Dash -- #
import dash
import dash_table
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html

from dash.dependencies import (
    Input,
    Output,
    State
)

# import plotly.io as pio
# import plotly.express as px
from dash import no_update
from flask_caching import Cache

%load_ext tensorboard

2021-07-08 12:19:14.192274: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
cd ..

/home/jovyan/assets


In [3]:
from libs.transformers.src.transformers.data.data_tf_collator import TFDataCollatorForLanguageModeling

In [5]:
def convert_sqlite_to_csv(inputFolder, ext, tableName):
    """Dump one table from every SQLite file in a folder into a single CSV.

    The rows of all matching databases are appended, in ``os.listdir``
    order, to ``<inputFolder>/output.csv``. The file is overwritten on every
    call and no header row is written.

    inputFolder - Folder where sqlite files are located.
    ext - Extension of your sqlite file (eg. db, sqlite, sqlite3 etc.)
    tableName - table name from which you want to select the data.
        NOTE: the name is concatenated into the SQL text (identifiers cannot
        be bound as query parameters), so it must come from a trusted source.
    """
    suffix = '.' + ext
    # The context manager guarantees the CSV is flushed and closed even when
    # one of the databases cannot be read.
    with open(inputFolder + '/output.csv', 'w', newline='') as csvFile:
        csvWriter = csv.writer(csvFile)
        for file1 in os.listdir(inputFolder):
            if not file1.endswith(suffix):
                continue
            conn = sql.connect(inputFolder + '/' + file1)
            try:
                # Iterate the cursor directly so rows are streamed to disk
                # instead of being buffered in memory all at once.
                csvWriter.writerows(conn.execute("SELECT * FROM " + tableName))
            finally:
                conn.close()

Extensions

In [6]:
import tensorflow as tf
# Enumerate the physical GPUs TensorFlow can see and switch each one to
# memory-growth mode (set_memory_growth(device, True)).
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

2021-07-08 12:19:25.526853: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-07-08 12:19:25.528362: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-07-08 12:19:25.581405: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-07-08 12:19:25.581831: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:0a:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.635GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2021-07-08 12:19:25.581848: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-07-08 12:19:25.594528: I tensorflow/stream_executor/platform/defau

In [7]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [8]:
tf.__version__

'2.4.1'

## Environment Variables


---



In [9]:
import getpass

# $USER is not exported in every environment (e.g. some containers, cron or
# service contexts); fall back to getpass.getuser() instead of raising
# KeyError at notebook start-up.
try:
    _user = os.environ['USER']
except KeyError:
    _user = getpass.getuser()

# Base directory under which the notebook expects data/, results/ and assets/.
SOURCE = '/home/' + _user
# Value of the 'container_name' column used to filter the dataset.
CONTAINER = 'core.soaesb'

## Logging

In [10]:
# Configure the root logger: "<time> <level> | <message>" records at INFO and
# above, written to stdout.
logging.basicConfig(format='%(asctime)s %(levelname)s | %(message)s',
                    level=logging.INFO,
                    stream=sys.stdout)
# Module-level logger used by the helper functions defined in later cells.
logger = logging.getLogger(__name__)

# Define Dataset

## Define Database Functions

In [11]:
def database_builder(path: str) -> pd.DataFrame:
    """
    Loads the ``logs`` table of every SQLite database directly under ``path``
    :param path: str
        directory holding the ``*.db`` files; only its top level is scanned
    :return pd.DataFrame
        the rows of all readable databases concatenated in directory-listing
        order, or an empty frame when no database could be read
    """
    logger.info('Building DataFrame ...')
    sql_query = 'SELECT * FROM logs'
    # The default makes a missing/unreadable directory behave like an empty
    # one instead of raising StopIteration.
    (_, _, files) = next(os.walk(path), (path, [], []))
    data = []
    for f in files:
        # Match the suffix only: a substring test would also pick up SQLite
        # sidecar files such as "x.db-wal" or "x.db-journal".
        if not f.endswith('.db'):
            continue
        # os.path.join works whether or not ``path`` ends with a separator.
        conn = create_connection(os.path.join(path, f))
        if conn is None:
            # create_connection already logged why the file was skipped.
            continue
        try:
            data.append(pd.read_sql_query(sql_query, conn))
        finally:
            conn.close()
    logger.info('...complete!')
    if not data:
        # pd.concat([]) raises ValueError; hand back an empty frame instead.
        logger.warning('No readable .db file found in ' + str(path))
        return pd.DataFrame()
    return pd.concat(data)


def create_connection(path: str) -> sql.Connection:
    """
    Opens a SQLite database
    :param path: str
        location of the database file
    :return sql.Connection
        the open connection; if sqlite raises ``sql.Error`` the error is
        logged as a warning and ``None`` is returned instead
    """
    try:
        connection = sql.connect(path)
    except sql.Error as err:
        logger.warning(err)
        return None
    else:
        logger.info('Connected to database ' + path)
        return connection

## Define Dataset Main

In [12]:
# Load every log database found under <SOURCE>/data/ into one DataFrame.
dataset = database_builder(SOURCE + '/data/')
# Keep only the rows whose 'container_name' equals CONTAINER.
container_dataset = dataset[dataset['container_name'] == CONTAINER]

2021-07-08 12:19:31,823 INFO | Building DataFrame ...
2021-07-08 12:19:31,828 INFO | Connected to database /home/jovyan/data/elastic_logs.db
2021-07-08 12:19:32,446 INFO | ...complete!


# W2V Pipeline

## Pipeline Objects

### Configuration

In [13]:
def set_attributes(self, config: dict):
    """Copy the settings stored under the object's class name onto the object.

    ``config`` is expected to map class names to ``{attribute: value}``
    mappings; the section used is ``config[type(self).__name__]``. When that
    section is missing, or ``config`` / the section is not a mapping (e.g. an
    empty YAML document parsed to ``None``), a warning is logged and the
    object is left untouched so its existing values stay in effect.

    :param self: object whose attributes are overwritten
    :param config: dict
        parsed configuration keyed by class name
    """
    section_name = self.__class__.__name__
    section = config.get(section_name) if isinstance(config, dict) else None
    if not isinstance(section, dict):
        logger.warning('No configuration found for ' + section_name)
        # Stop here: continuing would copy the top-level keys (the other
        # sections) onto this object as attributes.
        return

    for attr, value in section.items():
        setattr(self, attr, value)


@dataclass
class PreprocessingGlobalConfig:
    """Settings shared by the preprocessing stages.

    The values below are defaults; ``load`` replaces them with whatever the
    configuration holds under the key ``PreprocessingGlobalConfig``.
    """
    # NOTE(review): the next four fields are not read anywhere in the visible
    # part of the notebook; their meaning is inferred from the names only —
    # confirm before relying on them.
    embed_size: int = 512
    max_vocab_size: int = 2000
    buffer_size: int = 10000
    global_training: bool = True
    # Directory (appended to SOURCE) where PhraserModel and TextClustering
    # load/save their joblib files: SOURCE + path + model_name.
    path: str = '/results/'

    def load(self, config: dict) -> None:
        """Apply the matching section of ``config`` via ``set_attributes``."""
        set_attributes(self, config)


@dataclass
class PhraserModelConfig:
    """Settings consumed by ``PhraserModel``.

    The values below are defaults; ``load`` replaces them with whatever the
    configuration holds under the key ``PhraserModelConfig``.
    """
    # Passed as ``min_count`` / ``threshold`` to gensim ``Phrases`` when a
    # fresh model is created.
    min_count: int = 5
    threshold: float = 7
    # True: load the persisted model with joblib at construction time
    # instead of creating a new one.
    load_model: bool = True
    # True: persist the model after its vocabulary was updated in training mode.
    save_model: bool = False
    # Default mode of PhraserModel.__call__ when no ``training`` flag is given.
    training: bool = True
    # File name of the persisted model inside the global ``path`` directory.
    model_name: str = 'phrase_model.joblib'

    def load(self, config: dict) -> None:
        """Apply the matching section of ``config`` via ``set_attributes``."""
        set_attributes(self, config)


@dataclass
class TextClusteringConfig:
    """Settings consumed by ``TextClustering``.

    The values below are defaults; ``load`` replaces them with whatever the
    configuration holds under the key ``TextClusteringConfig``.
    """
    # True: load the persisted template miner with joblib at construction
    # time instead of creating a new drain3.TemplateMiner.
    load_model: bool = True
    # True: persist the template miner after it was fed logs in training mode.
    save_model: bool = False
    # Default mode of TextClustering.__call__ when no ``training`` flag is given.
    training: bool = True
    # File name of the persisted miner inside the global ``path`` directory.
    model_name: str = 'template_miner.joblib'

    def load(self, config: dict) -> None:
        """Apply the matching section of ``config`` via ``set_attributes``."""
        set_attributes(self, config)


class PreprocessingPipelineConfig:
    """Container for the three preprocessing config sections.

    Each attribute is named after the section's class and starts out with
    that class's defaults; ``load`` overrides them from a YAML file whose
    top-level keys are the section class names.
    """

    def __init__(self):
        self.PreprocessingGlobalConfig = PreprocessingGlobalConfig()
        self.PhraserModelConfig = PhraserModelConfig()
        self.TextClusteringConfig = TextClusteringConfig()

    def load(self, path):
        """Override the section defaults with the values stored in a YAML file.

        :param path: location of the YAML configuration file
        :return: always ``None``; a missing file, an empty file or a file
            whose top level is not a mapping is logged as a warning and the
            defaults are kept
        """
        try:
            with open(path) as f:
                # NOTE(review): FullLoader can construct more than plain data
                # types; only load trusted files, or switch to
                # yaml.safe_load if the config never needs custom tags.
                preprocessing_config = yaml.load(f, Loader=yaml.FullLoader)
        except FileNotFoundError as e:
            logger.warning(e)
            return None

        # An empty document parses to None and a list/scalar has no sections;
        # neither can be handed to the section loaders.
        if not isinstance(preprocessing_config, dict):
            logger.warning('No configuration mapping found in ' + str(path))
            return None

        for section in (self.PreprocessingGlobalConfig,
                        self.PhraserModelConfig,
                        self.TextClusteringConfig):
            section.load(preprocessing_config)

### Tokenizer

In [14]:
class PrimeTokenizer:
    """WordPiece tokenizer wrapper with BERT-style special tokens.

    Text is NFD-normalized, lower-cased and accent-stripped, split on
    whitespace, wrapped in ``[CLS] ... [SEP]`` and padded/truncated to
    ``max_seq_length``. The trained tokenizer is stored in and restored from
    ``SOURCE + "/results/prime_tokenizer.json"``.
    """

    def __init__(self, max_seq_length: int):
        """
        :param max_seq_length: fixed length every encoding is padded and
            truncated to
        """
        self.prime_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

        self.prime_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

        self.prime_tokenizer.pre_tokenizer = Whitespace()

        # NOTE(review): ids 1 and 2 presumably rely on the trainer assigning
        # special-token ids in the order of ``special_tokens`` below
        # ([UNK]=0, [CLS]=1, [SEP]=2, ...) — confirm.
        self.prime_tokenizer.post_processor = TemplateProcessing(
            single="[CLS] $A [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[
                ("[CLS]", 1),
                ("[SEP]", 2),
            ],
        )

        self.trainer = WordPieceTrainer(
            vocab_size=153411, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
        )

        self.prime_tokenizer.decoder = decoders.WordPiece()
        # NOTE(review): no pad_id is passed, so the library default is used;
        # if that default is 0 it would coincide with [UNK] under the id
        # order assumed above rather than with [PAD] — verify before relying
        # on padded ids.
        self.prime_tokenizer.enable_padding(length=max_seq_length)
        self.prime_tokenizer.enable_truncation(max_seq_length)

    def text_to_sequence(self, input_) -> List[tokenizers.Encoding]:
        """Encode text.

        :param input_: a single string, or a list of strings
        :return: a list of encodings for a list input, a single encoding
            otherwise
        """
        if isinstance(input_, list):
            return self.prime_tokenizer.encode_batch(input_)
        return self.prime_tokenizer.encode(input_)

    def sequence_to_text(self, input_) -> List[str]:
        """Decode token ids back to text.

        :param input_: what ``decode_batch`` expects when it is a list,
            otherwise what ``decode`` expects
        :return: the decoded text(s)
        """
        if isinstance(input_, list):
            # Fixed: this used to pass the undefined name ``batch``.
            return self.prime_tokenizer.decode_batch(input_)
        return self.prime_tokenizer.decode(input_)

    def train(self, data):
        """Train the tokenizer on an iterable of texts, then persist it."""
        self.prime_tokenizer.train_from_iterator(iter(data), self.trainer)
        self.save()

    def get_tokenizer(self) -> Tokenizer:
        """Return the wrapped ``tokenizers.Tokenizer``."""
        return self.prime_tokenizer

    def get_vocab(self) -> Dict[str, int]:
        """Return the vocabulary as reported by the wrapped tokenizer."""
        return self.prime_tokenizer.get_vocab()

    def get_vocab_size(self) -> int:
        """Return the vocabulary size as reported by the wrapped tokenizer."""
        return self.prime_tokenizer.get_vocab_size()

    def save(self):
        """Write the tokenizer to ``SOURCE + "/results/prime_tokenizer.json"``."""
        self.prime_tokenizer.save(SOURCE + "/results/prime_tokenizer.json")

    def load(self):
        """Replace the tokenizer with the one stored by ``save``."""
        self.prime_tokenizer = Tokenizer.from_file(SOURCE + "/results/prime_tokenizer.json")

### Generic Save Model

In [15]:
def save_model(model, path):
    """Persist ``model`` at ``path`` with joblib.

    A regular file already present at ``path`` is deleted first; anything
    else found there (e.g. a directory) is left alone before dumping.
    """
    stale_file_present = os.path.isfile(path)
    if stale_file_present:
        os.remove(path)

    joblib.dump(model, path)

### PhraseCaptureLayer

In [16]:
class PhraserModel:
    """Phrase detection stage built on gensim ``Phrases``.

    Depending on ``config.load_model`` the model is loaded with joblib from
    ``SOURCE + global_config.path + config.model_name`` or created empty
    with the configured ``min_count`` / ``threshold``.
    """

    def __init__(self,
                 config: PhraserModelConfig,
                 global_config: PreprocessingGlobalConfig):
        """
        :param config: phraser-specific settings
        :param global_config: shared settings; only ``path`` is read
        """
        super().__init__()
        self.min_count = config.min_count
        self.threshold = config.threshold
        self.load_model = config.load_model
        self.save_model = config.save_model
        self.path = global_config.path
        self.model_name = config.model_name
        self.training = config.training

        if self.load_model:
            self.phrase_model = joblib.load(SOURCE +
                                            self.path +
                                            self.model_name)
        else:
            self.phrase_model = Phrases(min_count=self.min_count,
                                        threshold=self.threshold)

    def __call__(self, corpus: pd.DataFrame, training=None) -> list:
        """Apply the phrase model to every entry of ``corpus['log']``.

        :param corpus: frame with a ``log`` column of strings
        :param training: when truthy the model vocabulary is first updated
            with the corpus (and the model saved if ``save_model`` is set);
            ``None`` falls back to the configured default
        :return: one space-joined string per log, in input order
        """
        if training is None:
            training = self.training

        split_corpus = [log.split(' ') for log in corpus['log']]

        if training:
            self.phrase_model.add_vocab(split_corpus)

            if self.save_model:
                save_model(self.phrase_model, SOURCE + self.path + self.model_name)

        # Both modes transform with the live model. The inference branch used
        # to build a frozen copy on every call and then discard it unused;
        # that dead export has been removed.
        corpus_with_phrases = self.phrase_model[split_corpus]

        return [' '.join(tokenized_log) for tokenized_log in corpus_with_phrases]

### TextClusteringLayer

In [17]:
class TextClustering:
    """Log-template mining stage built on a drain3 ``TemplateMiner``.

    Depending on ``config.load_model`` the miner is loaded with joblib from
    ``SOURCE + global_config.path + config.model_name`` or created fresh.
    """

    def __init__(self,
                 config: TextClusteringConfig,
                 global_config: PreprocessingGlobalConfig):
        """
        :param config: clustering-specific settings
        :param global_config: shared settings; only ``path`` is read
        """
        super().__init__()
        self.load_model = config.load_model
        self.save_model = config.save_model
        self.path = global_config.path
        self.model_name = config.model_name
        self.training = config.training

        # Plain truthiness, consistent with PhraserModel.
        if self.load_model:
            self.template_miner = joblib.load(SOURCE +
                                              self.path +
                                              self.model_name)
        else:
            self.template_miner = drain3.TemplateMiner()

    @staticmethod
    def _collapse_spaces(text: str) -> str:
        """Replace every run of spaces with a single space."""
        return re.sub(pattern=r' +', repl=' ', string=text)

    def __call__(self, corpus: list, training=None) -> list:
        """Map logs to their mined templates.

        :param corpus: list of log strings; it is never modified
        :param training: ``None`` falls back to the configured default.
            Truthy: every log is first added to the miner (which is saved if
            ``save_model`` is set) and the result holds one
            space-collapsed template per input log, in input order.
            Falsy: the result holds each distinct template once, in order of
            first appearance; logs without a matching cluster are added to
            the miner, and templates are NOT space-collapsed in this mode.
        :return: list of template strings
        """
        if training is None:
            training = self.training

        if training:
            for log in corpus:
                self.template_miner.add_log_message(log)
            if self.save_model:
                save_model(self.template_miner,
                           SOURCE + self.path + self.model_name)

            # Build a new list: writing the templates back into ``corpus``
            # silently overwrote the caller's data.
            templates = []
            for log in corpus:
                cluster = self.template_miner.match(log)
                # match() can come back empty (the inference branch guards
                # for it too); keep the raw log rather than crash on None.
                templates.append(log if cluster is None
                                 else cluster.get_template())
            return [self._collapse_spaces(template) for template in templates]

        # A dict keeps first-seen order, so the result is reproducible
        # between runs (a set's iteration order is not).
        unique_templates = {}
        for log in corpus:
            match_cluster = self.template_miner.match(log)
            if match_cluster is None:
                template = self.template_miner.add_log_message(log)['template_mined']
            else:
                template = match_cluster.get_template()
            unique_templates[template] = None
        return list(unique_templates)

    def get_unique_templates(self) -> list:
        """Return the space-collapsed template of every cluster in the miner."""
        return [self._collapse_spaces(cluster.get_template())
                for cluster in self.template_miner.drain.clusters]

### Preprocessing Pipeline

In [18]:
def process_all_batches(n_iter, log_labels, batch_size):
    """Run ``process_batch`` on the module-level ``dataset`` for batch
    indices ``0 .. n_iter`` (inclusive).

    :return: list of the ``(log_batch, labels)`` tuples, one per index
    """
    return [process_batch(dataset, batch_idx, log_labels, batch_size)
            for batch_idx in range(n_iter + 1)]

def process_batch(dataset: pd.DataFrame,
                  idx: int,
                  labels: dict,
                  batch_size: int) -> tuple:
    """Tokenize one window of ``dataset`` and look up its labels.

    The window covers rows ``idx * batch_size`` up to (excluding)
    ``(idx + 1) * batch_size``. Logs are encoded with the module-level
    ``prime_tokenizer``; each row's ``label`` is translated through
    ``labels``.

    :return: ``(token_ids, targets)``, both converted to float32 tensors
    """
    window = slice(idx * batch_size, (idx + 1) * batch_size)
    rows = dataset.iloc[window]

    encodings = prime_tokenizer.text_to_sequence(rows['log'].to_list())
    token_ids = [encoding.ids for encoding in encodings]
    targets = [labels[name] for name in rows['label']]

    return (tf.convert_to_tensor(token_ids, dtype=tf.float32),
            tf.convert_to_tensor(targets, dtype=tf.float32))

## Unsupervised Learning Pipeline

In [19]:
from libs.transformers.src.transformers.models.bert.modeling_tf_bert import TFBertForPreTraining
from libs.transformers.src.transformers.models.bert.configuration_bert import BertConfig

In [30]:
class UnsupervisedLearningPipeline:
    """Log pipeline: normalize -> phrases -> templates -> tokenizer -> BERT.

    ``fit`` runs every stage on a frame with ``log`` and ``label`` columns
    and pre-trains a ``TFBertForPreTraining`` model on masked-LM /
    next-sentence data built from the normalized logs. ``transform`` pushes
    a new frame through the fitted stages and the model.
    """

    # Everything normalize_logs replaces with a single space.
    _NOISE_PATTERN = re.compile(
        r"""
        (?:               # Match all enclosed
        \d{4}-\d{2}-\d{2} # YYYY-MM-DD
        [\sT]             # Accept either a space or T
        \d{2}:\d{2}:\d{2} # HH:MM:SS
        ([.,]\d{3}|\s)    # Accept either a space or milliseconds
        )                 # End timestamp match
        | (?:\s{2,})      # Remove double spaces
        | [^a-zA-Z\d:]    # Clean non-alphanumeric characters
        """, re.X)        # re.X enables comments and whitespace

    def __init__(self, config: PreprocessingPipelineConfig, epochs=3, batch_size=50, seq_length=200):
        """
        :param config: section configs for the phraser and clustering stages
        :param epochs: number of passes over the pre-training dataset
        :param batch_size: number of logs per training batch
        :param seq_length: fixed token length used by the tokenizers
        """
        self.normalized_logs = None
        self.log_labels = None
        self.bert_tokenizer = PrimeTokenizer(seq_length)
        self.fast_tokenizer = None
        self.train_dataset = None
        self.token_logs = None
        self.bert_config = None
        self.BERT = None
        self.max_seq_len = seq_length
        self.n_logs = 0
        self.n_iter = 0
        self.epochs = epochs
        self.batch_size = batch_size
        self.logs_with_phrases = list()
        self.logs_as_templates = None

        self.data_collator = None

        self.pm = PhraserModel(config.PhraserModelConfig,
                               config.PreprocessingGlobalConfig)

        self.tc = TextClustering(config.TextClusteringConfig,
                                 config.PreprocessingGlobalConfig)

    def normalize_logs(self, logs: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``logs`` whose ``log`` column has timestamps,
        whitespace runs and characters other than letters, digits and ``:``
        replaced by a space. The input frame is not modified.
        """
        c_logs = logs.copy()
        # Assign the result back explicitly: an in-place replace on the
        # object returned by ``.loc`` is not guaranteed to reach ``c_logs``.
        c_logs['log'] = c_logs['log'].replace(to_replace=self._NOISE_PATTERN,
                                              value=' ',
                                              regex=True)
        return c_logs

    def extract_unique_labels(self) -> dict:
        """Map every distinct value of the ``label`` column to the integer
        assigned by a ``LabelEncoder`` fitted on those distinct values.
        """
        label_unique = self.normalized_logs['label'].unique()
        binary_labels = LabelEncoder().fit_transform(label_unique)
        return dict(zip(label_unique, binary_labels))

    def _full_batch_column(self, column: str) -> list:
        """Return ``normalized_logs[column]`` trimmed to ``n_iter`` whole
        batches, so it can be reshaped to ``(n_iter, batch_size)``.
        """
        usable = self.n_iter * self.batch_size
        return self.normalized_logs[column].to_list()[:usable]

    def create_sentence_pairing(self):
        """Build next-sentence-prediction pairs from the normalized logs.

        With probability one half a log is paired with its successor (label
        0, wrapping around at the end), otherwise with a randomly drawn log
        (label 1). Only the logs that fill whole batches are used.

        :return: ``(first_seqs, second_seqs, nsp_labels)`` where the labels
            are a tensor of shape ``(n_iter, batch_size)``
        """
        first_seqs = []
        second_seqs = []
        nsp_labels = []

        log_list = self._full_batch_column('log')
        n_usable = len(log_list)
        for idx, first_value in enumerate(log_list):
            if random.random() > 0.5:
                # Pair with proper following log sequence
                second_value = log_list[(idx + 1) % n_usable]

                # IsNext Label
                nsp_labels.append(0)
            else:
                # Pair with random log
                second_value = log_list[random.randint(0, n_usable - 1)]

                # IsNotNext Label
                nsp_labels.append(1)

            first_seqs.append(first_value)
            second_seqs.append(second_value)

        nsp_labels = tf.reshape(tf.constant(nsp_labels), (self.n_iter, self.batch_size))

        return first_seqs, second_seqs, nsp_labels

    def collate_data(self, encodings):
        """Densify the encodings and run them through the language-modeling
        data collator (also stored on ``self.data_collator``).
        """
        # Assure data is non-ragged and are of type tf.Tensor
        for key in ('input_ids', 'token_type_ids', 'attention_mask'):
            encodings[key] = tf.ragged.constant(encodings[key]).to_tensor(0)

        self.data_collator = TFDataCollatorForLanguageModeling(self.fast_tokenizer,
                                                               padding_length=self.max_seq_len,
                                                               batch_size=self.batch_size)

        batch_encodings = tf.data.Dataset.from_tensor_slices(encodings.data)

        return self.data_collator(batch_encodings)

    def initialize_fast_tokenizer(self):
        """Wrap the trained tokenizer in a ``PreTrainedTokenizerFast`` with
        the special tokens registered; the wrapper is stored and returned.
        """
        the_tokenizer_obj = self.bert_tokenizer.get_tokenizer()
        fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=the_tokenizer_obj)
        fast_tokenizer.model_max_length = self.max_seq_len
        fast_tokenizer.unk_token = "[UNK]"
        fast_tokenizer.sep_token = "[SEP]"
        fast_tokenizer.pad_token = "[PAD]"
        fast_tokenizer.cls_token = "[CLS]"
        fast_tokenizer.mask_token = "[MASK]"
        self.fast_tokenizer = fast_tokenizer
        return fast_tokenizer

    def get_pre_training_data(self):
        """Assemble the pre-training dataset.

        :return: ``tf.data.Dataset`` of ``(features, log_labels)`` where
            ``features`` holds the collator outputs plus
            ``next_sentence_label``
        """
        labels = self._full_batch_column('label')
        log_labels = [self.log_labels[l] for l in labels]

        fast_tokenizer = self.initialize_fast_tokenizer()

        first_sequences, pair_sequences, nsp_labels = self.create_sentence_pairing()

        encodings = fast_tokenizer(first_sequences, pair_sequences, truncation=True, padding=False)

        # Materialize once: if the collator output is lazy (and possibly
        # randomized), iterating it again for every key could yield
        # mutually inconsistent tensors.
        encoded_batches = list(self.collate_data(encodings))

        pre_training_data = dict()
        for key in encoded_batches[0].keys():
            pre_training_data[key] = tf.constant([x[key].numpy() for x in encoded_batches])
        pre_training_data["next_sentence_label"] = nsp_labels

        log_labels = tf.reshape(tf.constant(log_labels), (self.n_iter, self.batch_size))

        pre_train_dataset = tf.data.Dataset.from_tensor_slices((
            pre_training_data,
            log_labels
        ))

        return pre_train_dataset

    def train_bert_tokenizer(self, load_model=False):
        """Load the persisted tokenizer, or train it on the log templates."""
        if load_model:
            self.bert_tokenizer.load()
        else:
            self.bert_tokenizer.train(self.logs_as_templates)

    @staticmethod
    def batch_to_tensor(batch) -> tf.Tensor:
        """Stack the id sequences of ``batch`` into one int32 tensor."""
        enc_list = [tf.convert_to_tensor(enc_obj, dtype=tf.int32)
                    for enc_obj in batch]
        return tf.stack(enc_list, name='tensor_batch')

    def serve_batch_w_idx(self, idx):
        """Tokenize the ``idx``-th window of ``batch_size`` log templates."""
        start_idx = idx * self.batch_size
        end_idx = start_idx + self.batch_size

        log_batch = self.logs_as_templates[start_idx:end_idx].tolist()
        batch = self.bert_tokenizer.text_to_sequence(log_batch)
        batch_ids = [log.ids for log in batch]
        return self.batch_to_tensor(batch_ids)

    def serve_batch(self, batch):
        """Tokenize a list of texts into an int32 id tensor."""
        log_batch = self.bert_tokenizer.text_to_sequence(batch)
        batch_ids = [log.ids for log in log_batch]
        return self.batch_to_tensor(batch_ids)

    def pre_train_bert(self):
        """Create the BERT model and pre-train it on ``self.train_dataset``.

        :return: the model outputs of the last processed batch, or ``None``
            when the dataset yielded no batch
        """
        self.bert_config = BertConfig(
            vocab_size=self.bert_tokenizer.get_vocab_size(),
            hidden_size=512,
            num_hidden_layers=8,
            num_attention_heads=8
        )

        self.BERT = TFBertForPreTraining(self.bert_config)
        optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
        self.BERT.compile(optimizer=optimizer, loss=self.BERT.compute_loss)

        # Stays None when there is nothing to iterate, instead of leaving
        # the name unbound at the return statement.
        outputs = None
        with yaspin(text="", color='blue') as sp:
            for epoch in range(self.epochs):
                sp.text = f'Training BERT EPOCH {epoch + 1}/{self.epochs}'
                for batch in self.train_dataset.as_numpy_iterator():
                    features = batch[0]
                    # The loop used to run forward passes only, so no weight
                    # was ever updated; record the pass and apply gradients.
                    with tf.GradientTape() as tape:
                        outputs = self.BERT(
                            input_ids=features["input_ids"],
                            attention_mask=features["attention_mask"],
                            token_type_ids=features["token_type_ids"],
                            labels=features["labels"],
                            next_sentence_label=features["next_sentence_label"],
                            training=True,
                            output_attentions=True,
                            output_hidden_states=True,
                            return_dict=True
                        )
                        # reduce_mean copes with both a scalar and a
                        # per-element loss.
                        loss = tf.reduce_mean(outputs.loss)

                    variables = self.BERT.trainable_variables
                    gradients = tape.gradient(loss, variables)
                    # Skip variables that did not take part in the loss.
                    optimizer.apply_gradients(
                        [(grad, var) for grad, var in zip(gradients, variables)
                         if grad is not None]
                    )

            sp.text = ""
            sp.ok('✔ Finished Training Epochs')

        return outputs

    def run_bert(self, batch):
        """Run the pre-trained model on ``batch``, returning attentions too."""
        return self.BERT(batch,
                         output_attentions=True)

    def fit(self, logs: pd.DataFrame):
        """Fit every stage on ``logs`` and pre-train BERT.

        :param logs: non-empty frame with ``log`` and ``label`` columns
        :return: whatever ``pre_train_bert`` returns
        :raises AssertionError: when ``logs`` has no rows
        """
        assert len(logs.index) > 0, 'process received an empty dataframe!'

        with yaspin(text="Normalizing Logs", color='green') as sp:
            self.normalized_logs = self.normalize_logs(logs)
            sp.text = ""
            sp.ok('✔ Completed log normalization')

            sp.text = "Extracting phrases"
            self.logs_with_phrases = self.pm(self.normalized_logs)
            sp.text = ""
            sp.ok('✔ Completed phrase extraction')

            sp.text = "Converting to log templates"
            # Hand over a copy so the clustering stage can never overwrite
            # the phrase-level logs kept on this object.
            self.logs_as_templates = np.array(self.tc(list(self.logs_with_phrases)))
            sp.text = ""
            sp.ok('✔ Completed log template conversion')

            sp.text = "Extracting Unique Labels"
            self.log_labels = self.extract_unique_labels()
            sp.text = ""
            sp.ok('✔ Completed extracting unique labels')

            sp.text = "Training Tokenizer"
            self.train_bert_tokenizer()
            sp.text = ""
            sp.ok('✔ Completed training of custom tokenizer')

            self.n_logs = len(self.normalized_logs['log'])
            self.n_iter = self.n_logs // self.batch_size
            leftover = self.n_logs - self.n_iter * self.batch_size
            if leftover:
                # The training tensors are reshaped to (n_iter, batch_size),
                # which only works for whole batches.
                logger.warning('Ignoring the last %d log(s): they do not fill '
                               'a batch of %d', leftover, self.batch_size)

            sp.text = "Processing training dataset"
            self.train_dataset = self.get_pre_training_data()
            sp.text = ""
            sp.ok('✔ Completed processing training dataset')

            sp.text = "Pretraining BERT"
            outputs = self.pre_train_bert()
            sp.text = ""
            sp.ok('✔ Completed BERT pretraining')

        return outputs

    def transform(self, batch: pd.DataFrame):
        """Normalize, phrase, template and tokenize ``batch`` (all stages in
        inference mode), then run the result through the model.
        """
        x = self.normalize_logs(batch)
        x = self.pm(x, False)
        x = list(self.tc(x, False))
        x = self.serve_batch(x)
        return self.run_bert(x)

## W2V Pipeline Main

In [31]:
# Start from the built-in section defaults, then override them with the
# values found in the YAML file (a missing file only logs a warning).
config_path = SOURCE + '/assets/notebooks/PreprocessingConfig.yaml'
preprocessing_config = PreprocessingPipelineConfig()
preprocessing_config.load(config_path)

In [32]:
# -- Unsupervised Learning Pipeline -- #

'''
Input: pd.DataFrame with batch_size number of rows 
Seq: 
    Normalize 
    Phraser
    Clustering
    Extract Unique Layers
    BERT
Returns: transformers.TFBertForPreTrainingOutput
'''

# --- SUBWORD TOKENIZER --
# Build the pipeline (10 epochs, batches of 10 logs, sequences of 200 tokens)
# and fit it on the first 100000 rows of the dataset.
w2vp = UnsupervisedLearningPipeline(preprocessing_config, epochs=10, batch_size=10, seq_length=200)
training_outputs = w2vp.fit(dataset[:100000])

2021-07-08 12:53:51,803 INFO | Starting Drain3 template miner
2021-07-08 12:53:51,805 INFO | Loading configuration from drain3.ini
[K[32m✔ Completed log normalization[0m 
2021-07-08 12:53:55,072 INFO | exporting phrases from Phrases<0 vocab, min_count=5, threshold=7, max_vocab_size=40000000>
2021-07-08 12:53:55,073 INFO | FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<0 phrases, min_count=5, threshold=7> from Phrases<0 vocab, min_count=5, threshold=7, max_vocab_size=40000000> in 0.00s', 'datetime': '2021-07-08T12:53:55.073507', 'gensim': '4.0.1', 'python': '3.8.8 | packaged by conda-forge | (default, Feb 20 2021, 16:22:27) \n[GCC 9.3.0]', 'platform': 'Linux-5.8.0-55-generic-x86_64-with-glibc2.10', 'event': 'created'}
[K[32m✔ Completed phrase extraction[0m 
[K[32m✔ Completed log template conversion[0m 
[K[32m✔ Completed extracting unique labels[0m 



[K[32m✔ Completed training of custom tokenizer[0m 


NameError: name 'final_list' is not defined

In [None]:
training_outputs

In [None]:
from transformers.modeling_tf_utils import input_processing, shape_list

def compute_loss(labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
    """Sum of the masked-LM and next-sentence cross-entropy losses.

    Despite the annotations, ``labels`` is indexed by the keys ``"labels"``
    and ``"next_sentence_label"`` and ``logits`` by ``[0]`` (masked-LM
    scores, rank 3, classes on the last axis) and ``[1]`` (next-sentence
    scores, two classes). Targets equal to -100 are excluded from the
    respective loss.

    :return: scalar tensor, ``masked_lm_loss + next_sentence_loss``
    """
    # Default reduction: every call below already returns a scalar mean.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    def masked_loss(targets, scores, n_classes):
        """Cross-entropy over the positions whose target is not -100."""
        flat_targets = tf.reshape(tensor=targets, shape=(-1,))
        active = tf.not_equal(flat_targets, -100)
        flat_scores = tf.reshape(tensor=scores, shape=(-1, n_classes))
        return loss_fn(y_true=tf.boolean_mask(tensor=flat_targets, mask=active),
                       y_pred=tf.boolean_mask(tensor=flat_scores, mask=active))

    masked_lm_loss = masked_loss(labels["labels"],
                                 logits[0],
                                 shape_list(logits[0])[2])
    next_sentence_loss = masked_loss(labels["next_sentence_label"],
                                     logits[1],
                                     2)

    # The former reshape/reduce_mean step indexed the shape of the scalar
    # next-sentence loss (an empty list) and failed; it only made sense with
    # unreduced, per-element losses.
    return masked_lm_loss + next_sentence_loss

In [None]:
from torch.nn import CrossEntropyLoss
import torch

In [None]:
# Scratch cell: recompute the pre-training loss with PyTorch.
# NOTE(review): prediction_scores, seq_relationship_score and inputs are not
# defined in this cell; they must already exist in the notebook session.

# Convert the scores to float64 torch tensors.
prediction_scores = torch.Tensor(prediction_scores.numpy())
prediction_scores = prediction_scores.type(torch.float64)
seq_relationship_score = torch.Tensor(seq_relationship_score.numpy())
seq_relationship_score = seq_relationship_score.type(torch.float64)
# Convert both label sets to integer (long) tensors.
labels = torch.Tensor(inputs["labels"].numpy())
labels = labels.type(torch.LongTensor)
next_sentence_label = torch.Tensor(inputs["next_sentence_label"].numpy())
next_sentence_label = next_sentence_label.type(torch.LongTensor)

# Both names were just assigned tensors, so this condition always holds here.
if labels is not None and next_sentence_label is not None:
    loss_fct = CrossEntropyLoss()
    # 2506 is hard-coded as the number of classes — presumably the tokenizer
    # vocabulary size of the run being inspected; TODO confirm.
    masked_lm_loss = loss_fct(prediction_scores.view(-1, 2506), labels.view(-1))
    next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
    total_loss = masked_lm_loss + next_sentence_loss

In [None]:
total_loss

In [None]:
masked_lm_loss

In [None]:
next_sentence_loss

In [None]:
w2vp.bert_tokenizer.get_vocab_size()

In [None]:
from transformers.models.bert.modeling_tf_bert import TFBertMainLayer, TFBertNSPHead, TFBertMLMHead

# BERT configuration whose vocabulary size is taken from the pipeline tokenizer.
config = transformers.BertConfig(
    vocab_size=w2vp.bert_tokenizer.get_vocab_size(),
    hidden_size=512,
    num_hidden_layers=8,
    num_attention_heads=8
)

# Stand-alone encoder plus the two pre-training heads; the MLM head is given
# the encoder's embedding layer as its input embeddings.
bert = TFBertMainLayer(config, name="bert")
nsp = TFBertNSPHead(config, name="nsp___cls")
mlm = TFBertMLMHead(config, input_embeddings=bert.embeddings, name="mlm___cls")

In [None]:
# Scratch re-implementation of the MLM + NSP loss on module-level tensors.
# `one` is an all-zero stand-in for the MLM logits.
# NOTE(review): (64, 32, 2506) presumably means (batch, seq_len, vocab) -- confirm.
one = tf.zeros((64, 32, 2506))
labels = inputs["labels"]
nsp_ = inputs["next_sentence_label"]
# `seq_relationship_score` and `inputs` must already exist from another cell.
logits1 = seq_relationship_score

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True#, reduction=tf.keras.losses.Reduction.NONE
)

# Positions whose label is -100 are excluded from the masked-LM loss.
masked_lm_active_loss = tf.not_equal(tf.reshape(tensor=labels, shape=(-1,)), -100)
# Flatten the logits to (-1, last_dim) so they line up with the flat labels.
tensor=tf.reshape(tensor=one, shape=(-1, shape_list(one)[2]))

masked_lm_reduced_logits = tf.boolean_mask(
    tensor=tensor,#tf.reshape(tensor=logits[0], shape=(-1, shape_list(logits[0])[2])),
    mask=masked_lm_active_loss,
)

masked_lm_labels = tf.boolean_mask(
    tensor=tf.reshape(tensor=labels, shape=(-1,)), mask=masked_lm_active_loss
)

# Same -100 filtering for the next-sentence labels and their logits.
next_sentence_active_loss = tf.not_equal(tf.reshape(tensor=nsp_, shape=(-1,)), -100)

next_sentence_reduced_logits = tf.boolean_mask(
    tensor=tf.reshape(tensor=logits1, shape=(-1, 2)), mask=next_sentence_active_loss
)

next_sentence_label = tf.boolean_mask(
    tensor=tf.reshape(tensor=nsp_, shape=(-1,)), mask=next_sentence_active_loss
)

masked_lm_loss = loss_fn(y_true=masked_lm_labels, y_pred=masked_lm_reduced_logits)


next_sentence_loss = tf.cast(loss_fn(y_true=next_sentence_label, y_pred=next_sentence_reduced_logits), tf.float32)

# masked_lm_loss = tf.reduce_mean(input_tensor=masked_lm_loss, axis=0)
# masked_lm_loss = tf.reshape(tensor=masked_lm_loss, shape=(-1, shape_list(next_sentence_loss)[0]))

# Last expression of the cell: the notebook displays the summed loss.
masked_lm_loss + next_sentence_loss

In [None]:
# Take the feature dict of the first element of the training dataset
# (this materialises the whole dataset into a list first).
inputs = list(w2vp.train_dataset)[0][0]

# Run the stand-alone encoder in training mode, asking for attentions and
# hidden states as well.
outputs = bert(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    token_type_ids=inputs["token_type_ids"],
    position_ids=None,#inputs["position_ids"],
    head_mask=None,#inputs["head_mask"],
    inputs_embeds=None,#inputs["inputs_embeds"],
    output_attentions=True,#inputs["output_attentions"],
    output_hidden_states=True,#inputs["output_hidden_states"],
    return_dict=True,#inputs["return_dict"],
    training=True#inputs["training"],
)

# The first two outputs feed the MLM head and the NSP head respectively.
sequence_output, pooled_output = outputs[:2]
prediction_scores = mlm(sequence_output=sequence_output, training=True)
seq_relationship_score = nsp(pooled_output=pooled_output)

print(prediction_scores.shape)
print(seq_relationship_score.shape)

total_loss = None

print(inputs["labels"].shape)
print(inputs["next_sentence_label"].shape)
print(w2vp.bert_tokenizer.get_vocab_size())

# Both label tensors are needed to compute the combined loss.
if inputs["labels"] is not None and inputs["next_sentence_label"] is not None:
    d_labels = {"labels": inputs["labels"]}
    d_labels["next_sentence_label"] = inputs["next_sentence_label"]
    total_loss = compute_loss(labels=d_labels, logits=(prediction_scores, seq_relationship_score))


In [None]:
@tf.function
def square_ragged_tensors(examples, key):
    """Return a zero-padded dense version of a ragged tensor.

    Args:
        examples: Either a ragged tensor, or a dict holding one under ``key``.
        key: Dict entry to convert; ignored when ``examples`` is not a dict.

    Returns:
        The result of ``to_tensor(0)`` on the selected ragged tensor.
    """
    ragged = examples[key] if isinstance(examples, dict) else examples
    return ragged.to_tensor(0)

def some_func(examples: tf.data.Dataset, is_ragged=False, batch_size=2) -> tf.data.Dataset:
    """Return ``examples`` with every ragged component padded to a dense tensor.

    The previous implementation never changed the dataset: for dict elements
    it assigned the converted data into a throw-away ``list(examples)[0]``,
    and for other elements it mapped a two-argument function with a single
    argument.

    Args:
        examples: Dataset whose elements may contain ragged components.
        is_ragged: When False the dataset is returned untouched.
        batch_size: Number of consecutive elements padded together; padding is
            only applied up to the longest element of each such group.

    Returns:
        A dataset with the same element structure in which ragged components
        have been replaced by zero-padded dense tensors.
    """
    if not is_ragged:
        return examples

    def _to_dense(component):
        # Components only show up as RaggedTensor once they have been batched.
        if isinstance(component, tf.RaggedTensor):
            return component.to_tensor(0)
        return component

    def _densify(*element):
        # Dataset.map unpacks tuple elements into positional arguments, while
        # dict / single-tensor elements arrive as one argument.
        dense = tf.nest.map_structure(_to_dense, element)
        return dense[0] if len(dense) == 1 else dense

    return examples.batch(batch_size).map(_densify).unbatch()
        
# Tiny demo input: two rows of different length stacked into a ragged tensor,
# next to an ordinary list of strings.
a = tf.constant([0, 1, 2, 3])
b = tf.constant([0, 1, 2, 3, 5, 6])
rg = tf.ragged.stack([a, b])
ex = {"input_ids": rg, "hello_world": ["goodbye", "world"]}

batch_encodings = tf.data.Dataset.from_tensor_slices(ex)

# Materialise the processed dataset so the notebook displays its elements.
list(some_func(batch_encodings, True))
# ex["input_ids"].to_tensor(0)

In [None]:
w2vp.train_dataset.as_numpy_iterator().__next__()[0]["next_sentence_label"].shape

In [None]:
dataset[:1000]

In [None]:
# w2vp.train_dataset.as_numpy_iterator().__next__()[0]

In [None]:
training_outputs

In [None]:
test_data = dataset[:1]

In [None]:
test_data.head()

In [None]:
inference_output = w2vp.transform(test_data)

# inference_output = training_outputs

In [None]:
import torch

# Convert every attention entry of the inference output into a torch tensor
# (going through a NumPy array) and freeze the collection as a tuple.
attention = tuple(
    torch.Tensor(np.array(layer_attention))
    for layer_attention in inference_output.attentions
)

print(attention[0].shape)

print(len(attention))

In [None]:
%%capture
!pip install bertviz

In [None]:
from bertviz import head_view, model_view

In [None]:
test_data['log']

In [None]:
sentence = test_data.iloc[0]['log']
sentence

In [None]:
tokens = w2vp.bert_tokenizer.text_to_sequence(sentence).tokens
tokens

In [None]:
head_view(attention, tokens)

In [None]:
model_view(attention, tokens)

In [None]:
w2vp.BERT.summary()

In [None]:
copy_bert = deepcopy(w2vp.BERT)

In [None]:
copy_bert.get_output_at

In [None]:
w2vp.BERT.save(SOURCE + '/results/')

In [None]:
tf_data

## W2V Dash 

### Supporting Functions

In [None]:
def tree_parser(node, inner_list, outer_list, root_node, depth):
    """Recursively flatten a tree of ``key_to_child_node`` dicts into token paths.

    NOTE: the walk is destructive -- every leaf is popped from its parent's
    ``key_to_child_node`` dict once its path has been recorded, so the tree
    passed in is consumed.

    Args:
        node: Node whose children are visited by this call.
        inner_list: Path currently being built; tokens are appended in place.
        outer_list: Accumulator that receives each completed path.
        root_node: Root of the tree; the walk restarts here after every leaf.
        depth: Rows with ``len(row) + 1 == depth`` pass the final filter.

    Returns:
        The filtered rows of ``outer_list`` when ``root_node`` is found to have
        no children left while iterating; ``None`` on every other path
        (the results of nested calls are not propagated).
    """
    d = node.key_to_child_node  # dict
    # Iterate over a snapshot of the keys because the dict is mutated below.
    for token in list(d.keys()):
        if len(root_node.key_to_child_node.keys()) == 0:
            # The whole tree has been consumed: filter and return the rows.
            ret_list = []
            for row in outer_list:
                # presumably row[1] is the token-count level below the root --
                # TODO confirm against the tree layout
                proper_len = int(row[1])
                if len(row) == proper_len+1 or len(row) + 1 == depth:
                    ret_list.append(row)
            return ret_list
        inner_list.append(token)
        child = d[token]
        if child.key_to_child_node:
            # Internal node: keep descending (return value is discarded).
            tree_parser(child, inner_list, outer_list, root_node, depth)
        else:
            # Leaf: remove it from the tree, store the finished path and
            # start a fresh path from the root.
            d.pop(token)
            outer_list.append(inner_list)
            inner_list = ['root']
            tree_parser(root_node, inner_list, outer_list, root_node, depth)

In [None]:
def tree_to_list_parser(node):
    """Return every root-to-leaf path of a ``key_to_child_node`` tree.

    The tree is walked iteratively with an explicit stack. Each stack entry
    carries its own path, so a child whose key equals its parent's key
    (e.g. two consecutive ``<*>`` tokens) is recorded correctly; the previous
    shared-path implementation silently dropped such a child from the path.

    Args:
        node: Root node. Every node must expose a ``key_to_child_node``
            mapping of key -> child node; the tree is not modified.

    Returns:
        A list of paths. Each path is a list starting with ``"root"`` followed
        by the keys leading to one leaf. Children are visited last-inserted
        first, so leaves appear in that order.
    """
    tree_df = []
    # ids of nodes whose children were already pushed: keeps the walk finite
    # and avoids re-expanding a node that is reachable more than once.
    expanded = set()
    stack = [(node, ["root"])]
    while stack:
        curr_node, curr_path = stack.pop()
        children = curr_node.key_to_child_node

        if not children:
            # Leaf: the path is complete.
            tree_df.append(curr_path)
            continue

        if id(curr_node) in expanded:
            continue
        expanded.add(id(curr_node))

        for key, child in children.items():
            stack.append((child, curr_path + [key]))
    return tree_df

In [None]:
def appendSpherical_np(xyz):
    """Append spherical coordinates to an array of Cartesian points.

    Args:
        xyz: 2-D array whose first three columns are x, y and z.

    Returns:
        ``xyz`` stacked side by side with a zero block of the same shape, in
        which columns 3, 4 and 5 hold the radius, the elevation angle
        (measured from the Z-axis down) and the azimuth.
    """
    ptsnew = np.hstack((xyz, np.zeros(xyz.shape)))
    x, y, z = xyz[:, 0], xyz[:, 1], xyz[:, 2]
    planar_sq = x**2 + y**2

    radius = np.sqrt(planar_sq + z**2)
    # Elevation defined from the Z-axis down; measuring from the XY-plane up
    # would be np.arctan2(z, np.sqrt(planar_sq)) instead.
    elevation = np.arctan2(np.sqrt(planar_sq), z)
    azimuth = np.arctan2(y, x)

    ptsnew[:, 3] = radius
    ptsnew[:, 4] = elevation
    ptsnew[:, 5] = azimuth
    return ptsnew

In [None]:
def get_spherical_coords(xyz):
    """Convert Cartesian points to spherical coordinates.

    Args:
        xyz: 2-D array whose first three columns are x, y and z.

    Returns:
        A float array with the shape of ``xyz`` whose columns 0, 1 and 2 hold
        the radius, the angle from the Z-axis and the azimuth in the XY-plane.
    """
    sph = np.zeros(shape=xyz.shape)
    x, y, z = xyz[:, 0], xyz[:, 1], xyz[:, 2]
    planar_sq = x**2 + y**2

    sph[:, 0] = np.sqrt(planar_sq + z**2)
    sph[:, 1] = np.arctan2(np.sqrt(planar_sq), z)
    sph[:, 2] = np.arctan2(y, x)
    return sph

The output of the W2V pipeline is a matrix of size [vocab size x embedding size] 

### Environmental Variables

In [None]:
# -- W2V Dash Environmental Variables -- #

# Number of neighbours requested from NearestNeighbors.
W2V_NEIGHBORS = 20
# Value handed to sys.setrecursionlimit before the Drain tree is processed.
RECURSION_LIMIT = 10**6
# Number of components kept by the dimensionality reductions (3-D projection).
N_PROJ_DIM = 3
# Seed used as random_state for the seeded estimators of the dashboard.
DASH_SEED = 0

### Generate Projection Data

In [None]:
# -- Generate Data for Word Embeddings Projector -- #

# One row per vocabulary entry, one column per embedding dimension.
weights = np.ndarray(shape=(len(embed_weights), w2v_config["embed_size"]))

# -- Populate Matrix for PCA -- #
for idx, weight in enumerate(embed_weights.values()):
    weights[idx, :] = weight

# -- Dimensionality Reduction -- #
# All three reducers are fitted up front; only the PCA projection is used for
# the initial figure.
pca = PCA(n_components=N_PROJ_DIM, random_state=DASH_SEED).fit(weights)
ica = FastICA(n_components=N_PROJ_DIM, random_state=DASH_SEED).fit(weights)
srp = SparseRandomProjection(n_components=N_PROJ_DIM, random_state=DASH_SEED).fit(weights)
reduced_embeddings = pca.transform(weights)

# -- Calculate Nearest Neighbors -- #
model = NearestNeighbors(n_neighbors=W2V_NEIGHBORS, algorithm='auto')
trained_embeddings = model.fit(reduced_embeddings)

# The reduced matrix is vocab size x N_PROJ_DIM. Prepend the vocabulary as a
# first column so every projected vector is labelled with its token.
scatter_plot_3d_cols = ['token', 'x1', 'x2', 'x3']
embedding_vocab_arr = np.expand_dims(np.array(list(embed_weights.keys())), 1)
named_reduced_embeddings = np.hstack((embedding_vocab_arr, reduced_embeddings))
scatter_plot_3d_df = pd.DataFrame(
    data=named_reduced_embeddings,
    columns=scatter_plot_3d_cols)

# Stacking with the token column turned the coordinates into strings;
# convert them back to numbers.
for axis_col in scatter_plot_3d_cols[1:]:
    scatter_plot_3d_df[axis_col] = pd.to_numeric(scatter_plot_3d_df[axis_col])

We will build our plot using the tree_to_list_parser function. This function walks
the drain3.TemplateMiner.drain.Node structure of our **TextClusteringLayer** (TCL)
and collects every root-to-leaf path into a list of lists, which is then used
to build a pandas dataframe that the plotly treemap accepts. A column is
appended to the tail of the dataframe which counts the number of stars
(wild card masks) present in the row. This is used to define the colors shown.

### Generate Treemap Data

In [None]:
# CPython's default recursion limit is 1000, which is too small for our needs
sys.setrecursionlimit(RECURSION_LIMIT)

# Work on a copy of the Drain root node so the miner's own tree is untouched,
# and flatten it into one row per root-to-leaf path.
root_node = deepcopy(w2vp.TCL.template_miner.drain.root_node)
parsed_tree = tree_to_list_parser(root_node)
parsed_tree_df = pd.DataFrame(data=parsed_tree)

# The returned dataframe has generic columns so we will provide custom labels
n_cols = len(parsed_tree_df.columns)
col_name_list = ['level' + str(idx) for idx in range(n_cols)]
parsed_tree_df.columns = col_name_list

# Without a color column our treemap would just be plain. We thought that
# counting the drain wildcard masks would be an interesting way to color the
# treemap: count the cells of each row that contain '<*>' and store the result
# in a new column named 'sum'.
# The pattern must be matched literally: as a regular expression '<*>' means
# "any number of '<' followed by '>'" and would match every token containing
# '>'. na=False makes the padding cells of shorter paths count as "no match".
parsed_tree_df['sum'] = parsed_tree_df.apply(
    lambda x: x.str.contains('<*>', regex=False, na=False),
    axis=1).sum(axis=1)

### Dash Variables

In [None]:
# Dark plotly theme for every figure created afterwards.
pio.templates.default = "plotly_dark"

# External stylesheet handed to the Dash app.
external_stylesheets_url = 'https://drive.google.com/uc?export=view&id=19OXGQ5iJIjRZD4VEZ-xiVChDmj0-SlSF'  # noqa
external_stylesheets = [external_stylesheets_url]

# Filesystem-backed cache configuration for the memoized helpers.
CACHE_CONFIG = {
    'CACHE_TYPE': 'filesystem',
    'CACHE_DIR': SOURCE + '/results/dash_cache',
}

### Colors

In [None]:
# Named palette (CSS rgb strings) shared by the dashboard styles.
color_d = {
    'blue': 'rgb(66, 133, 244)',
    'red': 'rgb(219, 68, 55)',
    'yellow': 'rgb(244, 180, 0)',
    'orange': 'rgb(255, 165, 0)',
    'green': 'rgb(15, 157, 88)',
    'mint': 'rgb(3, 218, 198)',
    'dark mint': 'rgb(1, 135, 134)',
    'dark purple': 'rgb(55, 0, 179)',
    'purple': 'rgb(98, 0, 238)',
}

### Dash Formatting

In [None]:
# ================= #
#  3d Scatter Plot  #
# ================= #

# Line formatting (marker outlines). Each variant is its own dict object.
scatter_plot_3d_line = {'width': 2, 'color': color_d['dark mint']}
scatter_plot_3d_selected_line = {'width': 2, 'color': color_d['dark mint']}
scatter_plot_3d_nonselected_line = {'width': 2, 'color': color_d['dark mint']}
scatter_plot_3d_darker_line = {'width': 2, 'color': color_d['dark purple']}


# Marker formatting
scatter_plot_3d_marker = {
    'size': 5,
    'line': scatter_plot_3d_line,
    'color': color_d['mint'],
}

scatter_plot_3d_selected_marker = {
    'size': 5,
    'color': color_d['mint'],
    'line': scatter_plot_3d_selected_line,
}

# Points outside the current selection are faded out.
scatter_plot_3d_nonselected_marker = {
    'size': 5,
    'color': color_d['mint'],
    'opacity': 0.15,
    'line': scatter_plot_3d_nonselected_line,
}

# No fixed colour: the figure assigns it (used when colouring by label).
scatter_plot_3d_marker_no_color = {
    'size': 5,
    'line': scatter_plot_3d_darker_line,
}

scatter_plot_3d_marker_cluster_center = {
    'size': 10,
    'color': color_d['orange'],
    'opacity': 0.5,
    'line': scatter_plot_3d_darker_line,
}

scatter_plot_3d_selected_table_marker = {
    'size': 5,
    'color': color_d['yellow'],
    'line': scatter_plot_3d_darker_line,
}


# Style
scatter_plot_3d_style = {'height': '100%', 'width': '100%'}


# ========= #
#  Treemap  #
# ========= #

# Style: let the treemap fill its container.
treemap_style = {'height': '100%', 'width': '100%'}


# ============ #
#  Data Table  #
# ============ #

# Style: left-aligned cells on a dark background, long text cut with an ellipsis.
data_table_cell_style = {
    'textAlign': 'left',
    'overflow': 'hidden',
    'textOverflow': 'ellipsis',
    'maxWidth': 0,
    'backgroundColor': 'rgb(20, 20, 20)',
    'color': 'white',
}

data_table_header_style = {'backgroundColor': color_d['purple']}


# ======== #
#  Labels  #
# ======== #

# Style: one independent dict per dropdown label, all rendered in white.
clustering_alg_drop_down_label_style = {'color': 'white'}

coordinate_space_drop_down_label_style = {'color': 'white'}

dim_reduction_drop_down_label_style = {'color': 'white'}

### Dash Configuration

In [None]:
# ================= #
#  3d Scatter Plot  #
# ================= #
scatter_plot_3d_config = {'responsive': True}


# ========= #
#  Treemap  #
# ========= #
treemap_config = {'responsive': True}

### Dash Dropdown Options

In [None]:
# Each dropdown is described by (label shown to the user, value sent to the
# callbacks) pairs and expanded into the option dicts Dash expects.
clustering_alg_drop_down_options = [
    {'label': label, 'value': value}
    for label, value in (
        ('KNN', 'KNN'),
        ('GMM', 'GMM'),
        ('Bayesian GMM', 'BGMM'),
        ('Affinity Prop.', 'AP'),
        ('KMEANS', 'KM'),
        ('SVM', 'SVM'),
    )
]

coordinate_space_drop_down_options = [
    {'label': label, 'value': value}
    for label, value in (
        ('Cartesian', 'CT'),
        ('Spherical', 'SP'),
    )
]

dim_reduction_drop_down_options = [
    {'label': label, 'value': value}
    for label, value in (
        ('PCA', 'PCA'),
        ('ICA', 'ICA'),
        ('LDA', 'LDA'),
        ('Sparse RP', 'SRP'),
        ('Gaussian RP', 'GRP'),
    )
]

### Dash Main

In [None]:
# Dash application plus the cache backing the memoized helpers.
app = JupyterDash(__name__, external_stylesheets=external_stylesheets)
cache = Cache()
cache.init_app(app.server, config=CACHE_CONFIG)


# =============== #
#  Cluster Table  #
# =============== #
# One row per vocabulary token, in embedding order.
table = pd.DataFrame(data=list(embed_weights.keys()), columns=['token'])

# ============= #
#  Scatterplot  #
# ============= #
scatter_plot_3d_fig = px.scatter_3d(
    scatter_plot_3d_df, x='x1', y='x2', z='x3', hover_name='token')

scatter_plot_2d_fig = px.scatter(
    scatter_plot_3d_df, x='x1', y='x2', hover_name='token')

# Both figures share the marker style and a constant uirevision.
for fig in (scatter_plot_3d_fig, scatter_plot_2d_fig):
    fig.update_traces(marker=scatter_plot_3d_marker)
    fig['layout']['uirevision'] = 1


# ========= #
#  Treemap  #
# ========= #
treemap_fig = px.treemap(parsed_tree_df, path=col_name_list, color='sum')


# ============ #
#  App Layout  #
# ============ #
# The page is one container holding: the option dropdowns, the 3-D scatter
# plot with its slider, the treemap, the neighbours table and a hidden store.
app.layout = html.Div([

        html.Div([

            # -- Clustering Technique Dropdown -- #
            html.Label(
                "Clustering Algorithm (TODO)",
                style=clustering_alg_drop_down_label_style),
            dcc.Dropdown(
                id='cluster-dropdown',
                options=clustering_alg_drop_down_options,
                value='KNN'),

            # -- Coordinate Space Dropdown -- #
            html.Label(
                "Coordinate Space",
                style=coordinate_space_drop_down_label_style),
            dcc.Dropdown(
                id='coord-dropdown',
                options=coordinate_space_drop_down_options,
                value='CT'),

            # -- Dimensionality Reduction Technique Dropdown -- #
            html.Label(
                "Dimensionality Reduction (TODO)",
                style=dim_reduction_drop_down_label_style),
            dcc.Dropdown(
                id='dr-dropdown',
                options=dim_reduction_drop_down_options,
                value='PCA'
            )
        ], className='options-graph-container'),

        # -- 3d Scatter Plot -- #
        # The slider value is forwarded by the callbacks as the damping of
        # the AffinityPropagation model.
        html.Div(
            [dcc.Graph(
                id='3d_scat',
                figure=scatter_plot_3d_fig,
                config=scatter_plot_3d_config,
                style=scatter_plot_3d_style),
             dcc.Slider(
                id='my-slider',
                min=0.5,
                max=0.9,
                step=0.05,
                value=0.5)],
            className='main-graph-container',
            id='graph_div'),

        # -- Tree Map -- #
        html.Div(
            dcc.Graph(
                id='3d_tree',
                figure=treemap_fig,
                config=treemap_config,
                style=treemap_style),
            className='secondary-graph-container',
            id='tree_div'),

        # -- Neighbors Datatable -- #
        # Starts empty; its rows are filled in by the point-selection callback.
        html.Div(
            children=[dash_table.DataTable(
                 id='table',
                 columns=[{"name": i, "id": i} for i in table.columns],
                 data=pd.DataFrame().to_dict('records'),
                 style_cell=data_table_cell_style,
                 style_header=data_table_header_style,
             )],
            className='related-graph',
            id='data_table'),

        # signal value to trigger callbacks
        dcc.Store(id='signal')],

    id='report-container')


# ============= #
#  Memoization  #
# ============= #

# Table of Contents:
# -----------------------------
# 1. Projection DataFrame
# 2. Coordinates
# 3. Dimensionality Reductions
# 4. Clustering Algorithms
# -----------------------------

# -- 1. Projection DataFrame -- #
@cache.memoize()
def dataframe_store(embeddings):
    """Build the plotting dataframe for a named embedding matrix (memoized).

    Args:
        embeddings: 2-D array with one column per entry of
            ``scatter_plot_3d_cols`` (token followed by three coordinates).

    Returns:
        A dataframe with the ``scatter_plot_3d_cols`` columns in which
        ``x1``, ``x2`` and ``x3`` have been converted to numeric values.
    """
    new_df = pd.DataFrame(data=embeddings, columns=scatter_plot_3d_cols)
    for axis_col in ('x1', 'x2', 'x3'):
        new_df[axis_col] = pd.to_numeric(new_df[axis_col])
    return new_df


# -- 2. Coordinates -- #
@cache.memoize()
def coordinate_space_store(value, embeddings):
    """Express the embeddings in the requested coordinate space (memoized).

    Args:
        value: ``'SP'`` for spherical or ``"CT"`` for Cartesian coordinates.
        embeddings: Reduced embedding matrix in Cartesian coordinates.

    Returns:
        The vocabulary column stacked in front of the coordinates, or
        ``no_update`` for any other ``value``.
    """
    # calculate new coordinate space
    if value == 'SP':
        coordinates = get_spherical_coords(embeddings)
    elif value == "CT":
        coordinates = embeddings
    else:
        return no_update
    return np.hstack((embedding_vocab_arr, coordinates))


# -- 3. Dimensionality Reduction -- #
@cache.memoize()
def dimension_reduct_store(value):
    """Project ``weights`` with the selected fitted reducer (memoized).

    Args:
        value: ``"PCA"``, ``"ICA"`` or ``"SRP"``.

    Returns:
        The transformed weights, or ``no_update`` for any other ``value``.
    """
    # calculate new dimensionality reduction algorithm
    fitted_reducers = (("PCA", pca), ("ICA", ica), ("SRP", srp))
    for name, reducer in fitted_reducers:
        if value == name:
            return reducer.transform(weights)
    return no_update


# -- 4. Clustering Algorithms -- #
@cache.memoize()
def clustering_algo_store(value, damp_value):
    """Create the (unfitted) estimator for the selected algorithm (memoized).

    Args:
        value: Dropdown value: ``"KNN"``, ``"AP"``, ``"KM"``, ``"GMM"`` or
            ``"SVM"``.
        damp_value: Damping factor; only used by AffinityPropagation.

    Returns:
        A new, unfitted scikit-learn estimator.

    Raises:
        ValueError: If ``value`` is not one of the supported algorithms
            (e.g. the ``"BGMM"`` dropdown entry, which has no implementation
            yet). Previously this surfaced as an ``UnboundLocalError``.
    """
    # calculate new clustering algorithm
    if value == "KNN":
        model = NearestNeighbors(n_neighbors=W2V_NEIGHBORS, algorithm='auto')
    elif value == "AP":
        model = AffinityPropagation(damping=damp_value, random_state=DASH_SEED)
    elif value == "KM":
        # Seeded like the other estimators so cluster labels are reproducible.
        model = KMeans(n_clusters=4, random_state=DASH_SEED)
    elif value == "GMM":
        model = GaussianMixture(n_components=4, random_state=DASH_SEED)
    elif value == "SVM":
        model = SVC(kernel='poly', degree=3, probability=True, random_state=DASH_SEED)
    else:
        raise ValueError(f"Unsupported clustering algorithm: {value!r}")
    return model


# =========== #
#  Callbacks  #
# =========== #

# -- Calculate Projection Data -- #
@app.callback(Output('signal', 'data'),
              Input('dr-dropdown', 'value'),
              Input('cluster-dropdown', 'value'),
              Input('coord-dropdown', 'value'),
              Input('my-slider', 'value'))
def compute_coordinate_space(dr_val, cluster_val, coord_val, damp_value):
    """Bundle the current control values into the ``signal`` store.

    Returns:
        ``(dr_val, cluster_val, coord_val, damp_value)``, in that order.
    """
    selection = (dr_val, cluster_val, coord_val, damp_value)
    return selection


# -- Point Selection Mechanics -- #
@app.callback(Output("table", "data"),
              Output("3d_scat", "figure"),
              Input('3d_scat', 'clickData'),
              Input("signal", "data"),
              Input("table", "selected_rows"))
def select_point(clickData, value, rows):
    """Refresh the neighbours table and the 3-D scatter plot.

    Two triggers are handled:

    * a click on the scatter plot (KNN only): the nearest neighbours of the
      clicked point are highlighted and listed in the table;
    * a change of the ``signal`` store: the figure is redrawn for the new
      reduction / clustering / coordinate-space selection.

    Every other situation (no usable click, unsupported selection, any other
    trigger) leaves both outputs unchanged.

    Args:
        clickData: Click payload of the scatter plot, or ``None``.
        value: ``[dr_val, cluster_val, coord_val, damp_value]`` from the
            ``signal`` store.
        rows: Selected rows of the neighbours table; assumed to be row
            indices into the displayed table (Dash ``selected_rows``) --
            TODO confirm.

    Returns:
        ``(table_data, figure)``; either entry may be ``no_update``.
    """
    ctx = dash.callback_context
    ids = [c['prop_id'] for c in ctx.triggered]

    click_triggered = '3d_scat.clickData' in ids
    if not click_triggered and 'signal.data' not in ids:
        return no_update, no_update

    # The store is empty until compute_coordinate_space has run once.
    if not value:
        return no_update, no_update
    dr_val, cluster_val, coord_val, damp_value = value

    # Neighbour look-ups only make sense for a real click in KNN mode.
    if click_triggered and (
            not clickData
            or not clickData.get('points')
            or cluster_val != "KNN"):
        return no_update, no_update

    # The stores hand back a no_update marker for options that are not
    # implemented (e.g. LDA / GRP); nothing can be drawn in that case.
    embeddings = dimension_reduct_store(dr_val)
    if not isinstance(embeddings, np.ndarray):
        return no_update, no_update
    named_embeddings = coordinate_space_store(coord_val, embeddings)
    if not isinstance(named_embeddings, np.ndarray):
        return no_update, no_update
    df = dataframe_store(named_embeddings)

    # Column 0 holds the token; columns 1-3 are the plotted coordinates.
    # The model is fitted and queried on exactly these features.
    features = named_embeddings[:, 1:4].astype(float)

    if click_triggered:
        clustering_model = clustering_algo_store(cluster_val, damp_value).fit(features)

        p = clickData['points'][0]
        query_arr = np.array([p['x'], p['y'], p['z']]).reshape(1, -1)

        _, neighbors = clustering_model.kneighbors(X=query_arr)
        neighbors_list = neighbors.tolist()[0]
        update = table.iloc[neighbors_list]

        selected_df = df[df.index.isin(neighbors_list)]
        nonselected_df = df.drop(index=neighbors_list)

        ff = px.scatter_3d(
            selected_df,
            x='x1',
            y='x2',
            z='x3',
            hover_name='token')
        ff.update_traces(marker=scatter_plot_3d_selected_marker)

        if rows:
            # Translate the selected table rows into tokens and highlight
            # the matching neighbours.
            picked_tokens = [
                update['token'].iloc[r] for r in rows if 0 <= r < len(update)]
            table_point = selected_df[selected_df['token'].isin(picked_tokens)]
            if not table_point.empty:
                ff2_1 = px.scatter_3d(
                    table_point,
                    x='x1',
                    y='x2',
                    z='x3',
                    text='token')
                ff2_1.update_traces(marker=scatter_plot_3d_selected_table_marker)
                ff.add_trace(ff2_1.data[0])

        ff2 = px.scatter_3d(
            nonselected_df,
            x='x1',
            y='x2',
            z='x3',
            hover_name='token')
        ff2.update_traces(marker=scatter_plot_3d_nonselected_marker)

        ff.add_trace(ff2.data[0])
        ff['layout']['uirevision'] = 1

        return update.to_dict('records'), ff

    # -- The selection in the dropdowns / slider changed -- #
    if cluster_val != "KNN":
        # NOTE(review): "SVM" cannot work here -- SVC.fit needs target labels,
        # which this unsupervised view does not have.
        clustering_model = clustering_algo_store(cluster_val, damp_value).fit(features)
        y_pred = clustering_model.predict(features)

        df.insert(0, "Label", y_pred, True)
        ff = px.scatter_3d(
            df,
            x='x1',
            y='x2',
            z='x3',
            color='Label',
            hover_name='token')
        ff.update_traces(marker=scatter_plot_3d_marker_no_color)

        # Only centroid-based models expose cluster centres (mixtures do not).
        if hasattr(clustering_model, 'cluster_centers_'):
            centers = pd.DataFrame(
                data=clustering_model.cluster_centers_,
                columns=["x1", "x2", "x3"])
            ff2 = px.scatter_3d(
                centers,
                x='x1',
                y='x2',
                z='x3')
            ff2.update_traces(marker=scatter_plot_3d_marker_cluster_center)
            ff.add_trace(ff2.data[0])
    else:
        ff = px.scatter_3d(
            df,
            x='x1',
            y='x2',
            z='x3',
            hover_name='token')
        ff.update_traces(marker=scatter_plot_3d_marker)

    ff['layout']['uirevision'] = 1

    return no_update, ff


# Start the dashboard inside JupyterLab; host '0.0.0.0' listens on all interfaces.
app.run_server(host='0.0.0.0', mode='jupyterlab')

# Transformer Pipeline

In [None]:
@dataclass
class TransformerGlobalConfig:
    """Settings shared by the layers of the transformer pipeline.

    The layers copy these attributes onto themselves next to their own
    layer-specific configuration.
    """
    # Model width handed to every encoder block.
    d_model: int = 512
    # presumably the maximum number of tokens per sequence -- TODO confirm
    max_seq_length: int = 200
    global_training: bool = True
    # NOTE(review): looks like a path suffix for stored results -- confirm
    storage_path: str = '/results/'

    def load(self, config):
        """Apply ``config`` to this instance via ``set_attributes``."""
        set_attributes(self, config)


@dataclass
class BERTLayerConfig:
    """Hyper-parameters of the BERT layer."""
    # Heads per encoder block and number of stacked encoder blocks.
    num_attention_heads: int = 8
    num_encoder_layers: int = 12
    # Width of the first dense layer of each block's feed-forward network.
    dff: int = 2048
    max_seq_len: int = 2048
    # Rate given to the dropout layers of each encoder block.
    dropout_rate: float = 0.1
    load_model: bool = False
    save_model: bool = True
    training: bool = True

    def load(self, config):
        """Apply ``config`` to this instance via ``set_attributes``."""
        set_attributes(self, config)


@dataclass
class HitAnomalyLayerConfig:
    """Hyper-parameters of the HitAnomaly layer."""
    # Heads per encoder block and number of stacked encoder blocks.
    num_attention_heads: int = 12
    num_encoder_layers: int = 3
    # Width of the first dense layer of each block's feed-forward network.
    dff: int = 2048
    max_seq_len: int = 2048
    # Rate given to the dropout layers of each encoder block.
    dropout_rate: float = 0.1
    load_model: bool = False
    save_model: bool = True
    training: bool = True

    def load(self, config):
        """Apply ``config`` to this instance via ``set_attributes``."""
        set_attributes(self, config)


class TransformerConfig:
    """Bundle of the global, BERT and HitAnomaly configuration sections."""

    def __init__(self):
        self._global = TransformerGlobalConfig()
        self.BERT = BERTLayerConfig()
        self.HitAnomaly = HitAnomalyLayerConfig()

    def load(self, path):
        """Load a YAML file and hand its content to every section.

        A missing file is logged as a warning and leaves all sections
        untouched.

        Args:
            path: Location of the YAML configuration file.

        Returns:
            None.
        """
        try:
            with open(path) as f:
                transformer_config = yaml.load(f, Loader=yaml.FullLoader)
        except FileNotFoundError as e:
            logger.warning(e)
            return None

        # Every section receives the complete parsed document.
        for section in (self._global, self.BERT, self.HitAnomaly):
            section.load(transformer_config)
        
def set_attributes_from_object(self, *args):
    """Copy the instance attributes of every object in ``args`` onto ``self``.

    Objects are processed in order, so later objects override attributes of
    the same name. Any exception is logged as a warning instead of being
    raised; attributes copied before the failure stay in place.
    """
    try:
        attribute_items = (
            item for source in args for item in source.__dict__.items()
        )
        for name, attribute in attribute_items:
            setattr(self, name, attribute)
    except Exception as e:
        logger.warning(e)

## Metric Objects

### Loss Function

In [None]:
def loss_function(real, pred):
    """Average ``loss_object(real, pred)`` over the entries where ``real != 0``.

    Args:
        real: Target values; entries equal to 0 are excluded from the loss.
        pred: Predictions passed to ``loss_object`` together with ``real``.

    Returns:
        Sum of the kept loss values divided by the number of kept entries.
    """
    per_entry_loss = loss_object(real, pred)
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_entry_loss.dtype)
    return tf.reduce_sum(per_entry_loss * keep) / tf.reduce_sum(keep)

### Accuracy Function

In [None]:
def accuracy_function(real, pred):
    """Fraction of correct predictions among the entries where ``real != 0``.

    Args:
        real: Target values; entries equal to 0 are ignored.
        pred: Scores; the predicted value is the argmax along axis 1.

    Returns:
        Number of correct, non-ignored entries divided by the number of
        non-ignored entries.
    """
    matches = tf.equal(real, tf.argmax(pred, axis=1))
    keep = tf.math.not_equal(real, 0)

    correct = tf.cast(tf.math.logical_and(keep, matches), dtype=tf.float32)
    counted = tf.cast(keep, dtype=tf.float32)
    return tf.reduce_sum(correct) / tf.reduce_sum(counted)

### Custom Learning Rate Schedule

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up learning-rate schedule.

    ``lr = rsqrt(d_model) * min(rsqrt(step), step * warmup_steps ** -1.5)``,
    i.e. the rate grows linearly for ``warmup_steps`` steps and then decays
    with the inverse square root of the step.
    """

    def __init__(self, d_model: int, warmup_steps=4000):
        """
        Args:
            d_model: Model width; stored as a float32 tensor.
            warmup_steps: Number of steps of the linear warm-up phase.
        """
        super(CustomSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for optimizer step ``step``."""
        # Optimizers may pass the step as an integer tensor; rsqrt and the
        # float multiplication below need a floating-point value.
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

## Pipeline Objects

### PositionalEncodingLayer

In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds fixed sinusoidal positional embeddings to its input."""

    def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):
        """
        Args:
            max_steps: Number of positions in the precomputed table.
            max_dims: Embedding width of the table; rounded up to an even
                number so sine and cosine columns can alternate.
            dtype: Layer dtype; the table is stored in this dtype.
            **kwargs: Forwarded to the base layer.
        """
        super().__init__(dtype=dtype, **kwargs)
        if max_dims % 2 == 1:
            max_dims += 1  # max_dims must be even
        positions, pair_index = np.meshgrid(
            np.arange(max_steps), np.arange(max_dims // 2))
        # Transposed so rows are positions and columns are sin/cos pairs.
        angles = (positions / 10000 ** (2 * pair_index / max_dims)).T

        table = np.empty((1, max_steps, max_dims))
        table[0, :, ::2] = np.sin(angles)    # even columns
        table[0, :, 1::2] = np.cos(angles)   # odd columns
        self.positional_embedding = tf.constant(table.astype(self.dtype))

    def call(self, inputs):
        """Add the table, cropped to the last two input dimensions, to ``inputs``."""
        shape = tf.shape(inputs)
        n_steps, n_dims = shape[-2], shape[-1]
        return inputs + self.positional_embedding[:, :n_steps, :n_dims]

### EncoderBlock

In [None]:
class EncoderBlock(Layer):
    """Transformer encoder block: self-attention followed by a feed-forward
    network, each wrapped in dropout, a residual connection and layer
    normalization.
    """

    def __init__(
        self,
        d_model: int,
        num_heads: int,
        dff: int,
        rate=0.1,
        attention_dropout=0.1):
        """
        Args:
            d_model: Width of the block's input and output.
            num_heads: Number of attention heads; each head uses
                ``d_model // num_heads`` key dimensions.
            dff: Width of the first feed-forward layer.
            rate: Dropout rate applied after the attention and after the
                feed-forward network.
            attention_dropout: Dropout rate inside the attention layer
                (previously hard-coded to 0.1, which remains the default).
        """
        super(EncoderBlock, self).__init__()

        self.multi_headed_attention = MultiHeadAttention(num_heads=num_heads,
                                                         key_dim=d_model // num_heads,
                                                         dropout=attention_dropout,
                                                         attention_axes=1)

        self.feed_forward_network = Sequential([
            Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
            Dense(d_model, activation='relu')  # (batch_size, seq_len, d_model)
        ])

        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)

        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, x, mask=None, training=True):
        """Run the block on ``x``.

        Args:
            x: Input sequence, used as both query and value of the
                self-attention.
            mask: Accepted for interface compatibility but currently unused.
                NOTE(review): callers pass ``None``; wiring it into the
                attention needs the expected mask shape to be confirmed.
            training: Whether the two dropout layers are active. Defaults to
                ``True``, the previously hard-wired behaviour, so existing
                callers are unaffected; pass ``False`` (or let Keras supply
                it) to get deterministic inference.

        Returns:
            ``(output, attention_weights)`` as tensors.
        """
        # (1) - Attention Score
        attn_output, attn_weights = self.multi_headed_attention(
            x,
            x,
            return_attention_scores=True)  # (batch_size, input_seq_len, d_model)

        # (2) - Add & Normalize
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        # (3) - Feed Forward NN
        feed_forward_output = self.feed_forward_network(out1)  # (batch_size, input_seq_len, d_model)

        # (4) - Add & Normalize
        feed_forward_output = self.dropout2(feed_forward_output, training=training)
        out2 = self.layernorm2(out1 + feed_forward_output)  # (batch_size, input_seq_len, d_model)

        return tf.convert_to_tensor(out2), tf.convert_to_tensor(attn_weights)

### BERT

In [None]:
class BERTLayer(Layer):
    """Stack of ``EncoderBlock`` layers applied one after another.

    Hyper-parameters (``d_model``, ``num_attention_heads``, ``dff``,
    ``dropout_rate``, ``num_encoder_layers``) are read from attributes that
    ``set_attributes_from_object`` is expected to copy from the two config
    objects onto ``self``.
    """

    def __init__(
        self,
        global_config: TransformerGlobalConfig,
        config: BERTLayerConfig):
        super(BERTLayer, self).__init__()

        set_attributes_from_object(
            self,
            global_config,
            config)

        self.bert_layer_blocks = [EncoderBlock(
            self.d_model,
            self.num_attention_heads,
            self.dff,
            rate=self.dropout_rate) for _ in range(self.num_encoder_layers)]

    def call(self, input_, **kwargs):
        """Feed ``input_[0]`` through every encoder block in sequence.

        Args:
            input_: Indexable whose first element is the tensor to encode.
                Any further elements are ignored.

        Returns:
            ``(enc_output, attention)``: the output of the last block and
            that block's attention weights. With zero blocks the input is
            returned unchanged and ``attention`` is ``None``.
        """
        enc_output = input_[0]
        encoding_padding_mask = None
        attention = None

        # Each block consumes the previous block's output. The earlier
        # version re-fed the original input to every block, so only the
        # last block influenced the result.
        for block in self.bert_layer_blocks:
            enc_output, attention = block(enc_output, encoding_padding_mask)

        # Return a plain tuple: tf.tuple's second positional parameter is
        # `control_inputs`, not a second tensor.
        return enc_output, attention

### HitAnomaly

In [None]:
class HitAnomalyLayer(Layer):
    """Hierarchical encoder: stacked blocks, mean-pooling, then a final block.

    All but the last ``EncoderBlock`` are applied in sequence to the input.
    Their result is mean-pooled over axis 1, given a new leading axis, and
    passed through the last block; that output is mean-pooled over axis 1
    again to produce the returned representation.

    Hyper-parameters are read from attributes that
    ``set_attributes_from_object`` is expected to copy onto ``self``.
    """

    def __init__(
        self,
        vocab_size: int,
        global_config: TransformerGlobalConfig,
        config: HitAnomalyLayerConfig):
        super(HitAnomalyLayer, self).__init__()

        self.vocab_size = vocab_size
        set_attributes_from_object(
            self,
            global_config,
            config)

        self.encoding_blocks = [EncoderBlock(
            self.d_model,
            self.num_attention_heads,
            self.dff,
            rate=self.dropout_rate
        ) for _ in range(self.num_encoder_layers)]

        # Outputs of every block from the most recent call.
        self.hidden_layer_output = []

#     @tf.function(jit_compile=True)
    def call(self, input_, **kwargs):
        """Encode ``input_[0]`` and return ``(seq_representation, att)``.

        Args:
            input_: Indexable whose first element is the tensor to encode.

        Returns:
            The mean-pooled output of the final block and that block's
            attention weights.
        """
        enc_output = input_[0]
        encoding_padding_mask = None

        # Collect into a fresh list so the attribute holds only this call's
        # outputs instead of growing on every invocation.
        hidden_outputs = []

        # Raises ValueError if the layer was built with zero blocks.
        *hidden_blocks, final_block = self.encoding_blocks

        # Chain the hidden blocks: each one consumes its predecessor's
        # output. With a single configured block this loop is skipped and
        # the raw input is pooled directly.
        for block in hidden_blocks:
            enc_output, _ = block(enc_output, encoding_padding_mask)
            hidden_outputs.append(enc_output)

        # Pool over axis 1, then add a leading axis before the last block.
        pooled = tf.reduce_mean(enc_output, axis=1)
        pooled = tf.expand_dims(pooled, axis=0)

        # Last Encoding Block for Log Sequence Representation
        out, att = final_block(pooled, encoding_padding_mask)
        hidden_outputs.append(out)
        self.hidden_layer_output = hidden_outputs

        # Final Pooling Layer
        seq_representation = tf.reduce_mean(out, axis=1)

        return seq_representation, att

### Transformer

In [None]:
class Transformer(Model):
    """Embedding + positional encoding + ``BERTLayer`` encoder.

    A ``HitAnomalyLayer`` is also constructed, but the current ``call``
    does not route data through it (that step is disabled below).

    Args:
        tokenizer: Provides ``get_vocab_size()`` for the embedding table.
        config: Supplies ``_global``, ``BERT`` and ``HitAnomaly`` sections.
    """

    def __init__(
        self,
        tokenizer: PrimeTokenizer,
        config: TransformerConfig):
        super(Transformer, self).__init__()

        self.vocab_size = tokenizer.get_vocab_size()
        set_attributes_from_object(
            self,
            config._global)

        self.embedding = Embedding(
            self.vocab_size,
            self.d_model,
            input_length=self.max_seq_len)

        self.pos_encoding = PositionalEncoding(
            self.max_seq_len,
            self.d_model)

        self.bert_layer = BERTLayer(
            config._global,
            config.BERT)

        self.hitanomaly_layer = HitAnomalyLayer(
            self.vocab_size,
            config._global,
            config.HitAnomaly)

        #self.dropout = Dropout(rate)

#     @tf.function(jit_compile=True)
    def call(self, input_tuple, **kwargs):
        """Embed ``input_tuple[0]`` and encode it with the BERT layer.

        Args:
            input_tuple: Indexable whose first element is the batch of token
                ids. Other elements are currently ignored.

        Returns:
            Whatever ``self.bert_layer`` returns for the embedded batch.
        """
        log_batch = input_tuple[0]
        encoding_padding_mask = None  # input_tuple[1]

        embedding_tensor = self.embedding(log_batch)  # (batch_size, input_seq_len, d_model)
        embedding_tensor = self.pos_encoding(embedding_tensor)
        #embedding_tensor = self.dropout(embedding_tensor, training=TRAINING)

        # BERT for Log Sequence Embedding.
        # Pack the inputs in a plain tuple. `tf.tuple(tensor, mask)` treats
        # the tensor itself as the list of tensors and the mask as
        # `control_inputs`, so the BERT layer's `input_[0]` would pick the
        # first batch element instead of the whole batch.
        bert_arg = (embedding_tensor, encoding_padding_mask)
        bert_ret = self.bert_layer(bert_arg)

        # Encoder Block Hidden Layers for Log Sequence Representation
#         seq_representation, att = self.hitanomaly_layer(tf.tuple(enc_output, encoding_padding_mask))

        return bert_ret

## Transformer Main

### Batch Processing

### Main (Initialization)

In [None]:
# -- Device Selection -- #
# When at least one GPU is present, expose only the first one to TensorFlow.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as err:
        # Raised when the visible devices are changed after GPU initialization.
        print(err)

# -- Transformer Model -- #
# Load the YAML-backed configuration; model construction is currently disabled.
transformer_config_path = SOURCE + '/assets/notebooks/TransformerConfig.yaml'
transformer_config = TransformerConfig()
transformer_config.load(transformer_config_path)
# optimus_prime = Transformer(prime_tokenizer, transformer_config)

# Shorthand for the global section of the configuration.
t_config = transformer_config._global

# -- Pipeline Info -- #
attns = []

# -- Data Batches -- #
# batched_dataset = process_all_batches(n_iter, log_labels, t_config.batch_size)

# -- Model Metrics -- #
learning_rate = CustomSchedule(t_config.d_model)
epoch_loss = Mean(name='train_loss')
epoch_accuracy = Mean(name='train_accuracy')
loss_object = SparseCategoricalCrossentropy(from_logits=True)
optimizer = Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

# -- Classification Step Layers -- #
add_att_layer = AdditiveAttention()
softmax = Softmax()
# Small classification head: two dense layers followed by a softmax.
s1 = Sequential()
s1.add(Dense(t_config.batch_size, activation=t_config.activation))
s1.add(Dense(4, activation=t_config.activation))
s1.add(Softmax())

# -- Checkpoints -- #
# checkpoint_path = SOURCE + "checkpoints/"
# checkpoint = Checkpoint(step=tf.Variable(1), transformer=optimus_prime, optimizer=optimizer)
# checkpoint_manager = CheckpointManager(checkpoint, checkpoint_path, max_to_keep=5)

# -- Summary Writer -- #
# tf.debugging.set_log_device_placement(True)
writer = tf.summary.create_file_writer(SOURCE + t_config.logdir)

### TrainStep

In [None]:
batch_size = transformer_config._global.batch_size
max_seq_len = transformer_config._global.max_seq_len

# Fixed input signature so `train_step` is traced once:
#   log_batch -> (batch_size, max_seq_len) int32 token ids
#   labels    -> (batch_size,)             int8 class ids
train_step_signature = [
    tf.TensorSpec(shape=([batch_size, max_seq_len]), dtype=tf.int32),
    tf.TensorSpec(shape=([batch_size]), dtype=tf.int8)
]

# `jit_compile` replaces the deprecated `experimental_compile` argument.
@tf.function(input_signature=train_step_signature,
             jit_compile=True)
def train_step(log_batch: tf.Tensor, labels: tf.Tensor):
    """Run one forward pass of ``optimus_prime`` on a batch.

    Loss computation, gradient application and metric updates are currently
    disabled (kept below as comments), so this step only returns the model
    output.

    Args:
        log_batch: int32 tensor of shape (batch_size, max_seq_len).
        labels: int8 tensor of shape (batch_size,).

    Returns:
        The value returned by ``optimus_prime`` for this batch.
    """
    transformer_input = tf.tuple([
        log_batch,  # (batch_size, max_seq_len), int32
        labels  # (batch_size,), int8
    ])

    with tf.GradientTape() as tape:
        # Forward pass in training mode so dropout layers are active.
        transformer_ret = optimus_prime(transformer_input, training=True)
#         a_s = add_att_layer([Rs, Rs])
#         y = softmax(a_s * Rs)
#         print(a_s.shape)
        # y = Rs
#         loss = tf.py_function(loss_function, [labels, y], tf.float32)
#         pred = s1(y)
#         labels = tf.cast(labels, tf.int8)
    # Optimize the model
#     grads = tape.gradient(loss, optimus_prime.trainable_variables)
#     optimizer.apply_gradients(zip(grads, optimus_prime.trainable_variables))

#     acc = accuracy_function(labels, pred)

    # Tracking Progress
#     epoch_loss.update_state(loss)  # Adding Batch Loss
#     epoch_accuracy.update_state(acc)

    return transformer_ret

In [None]:
# Notebook display: echo the TensorSpec list used as train_step's input signature.
train_step_signature

### Main (Training)

In [None]:
# Collects the value returned by `train_step` for every processed batch.
attentions = []

for epoch in range(t_config.epoch):

    start = time.time()
    # Metrics are accumulated per epoch.
    epoch_loss.reset_states()
    epoch_accuracy.reset_states()
    dataset_iter = iter(batched_dataset)

    t = tqdm(range(n_iter), desc="Epoch: {:03d}, Loss: {:.3f}, Accuracy: {:.3%}".format(0, 0, 0), position=0, leave=True)
    for _ in t:
        batch = next(dataset_iter)
        log_batch = batch[0]
        labels = batch[1]

        # Returns Eager Tensor for Predictions
#         tf.summary.trace_on()
#         tf.profiler.experimental.start(SOURCE + t_config.logdir)

#         with writer.as_default():
        # The result was previously bound to a misspelled name
        # (`transforer_ret`), so the append below read an undefined or
        # stale `transformer_ret`.
        transformer_ret = train_step(log_batch, labels)
        attentions.append(transformer_ret)
          # with tf.summary.record_if(True):

#             tf.summary.trace_export(
#               name = "training_trace",
#               step=0,
#               profiler_outdir=SOURCE + t_config.logdir
#             )

#         tf.profiler.experimental.stop()
#         tf.summary.trace_off()

#         checkpoint.step.assign_add(1)

#         if int(checkpoint.step) % 10 == 0:
#             save_path = checkpoint_manager.save()

        # Refresh the progress-bar label with the running epoch metrics.
        t.set_description(desc="Epoch: {:03d}, Loss: {:.3f}, Accuracy: {:.3%} ".format(epoch,
                                                                    epoch_loss.result(),
                                                                    epoch_accuracy.result()))
        t.refresh()

In [None]:
# Persist the per-batch outputs collected by the training loop. The previous
# argument `acc` is not assigned by the training cells, while `attentions` is
# the list they populate and matches the target file name.
joblib.dump(attentions, SOURCE + '/results/attention.joblib')

In [None]:
# Persist `Rs` to disk.
# NOTE(review): `Rs` is only referenced in commented-out code in the training
# cells -- confirm it is defined elsewhere before running this cell.
joblib.dump(Rs, SOURCE + '/results/Rs.joblib')