In [1]:
%%capture
!pip install tensorflow_probability

In [27]:
import numpy as np
from tensorflow.data.ops.dataset_ops import BatchDataset

ModuleNotFoundError: No module named 'tensorflow.data.ops'

In [4]:
tf.executing_eagerly()

True

In [33]:
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import tensorflow as tf
from tensorflow.data import Dataset

from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase


# def _tf_collate_batch2(
#     examples: Union[Dataset, list],
#     tokenizer
#     ):
#     """Collate `examples` into a suitable input for TFBertForPreTraining. Any padding is handled prior to the function call."""
    
#     if isinstance(examples, list):
#         examples = Dataset.from_tensor_slices(l_test)


    
    
    
    
    
    
    
    
    
    
    
    
#     # Tensorize if necessary.
#     if isinstance(examples[0], (list, tuple)):
#         examples = [tf.constant(e, dtype=tf.float64) for e in examples]

#     # Check if padding is necessary.
#     length_of_first = examples[0].shape[0]
#     are_tensors_same_length = all(x.shape[0] == length_of_first for x in examples)
#     if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
#         return tf.stack(examples, axis=0)

#     # If yes, check if we have a `pad_token`.
#     if tokenizer._pad_token is None:
#         raise ValueError(
#             "You are attempting to pad samples but the tokenizer you are using"
#             f" ({tokenizer.__class__.__name__}) does not have a pad token."
#         )

#     # Creating the full tensor and filling it with our data.
#     max_length = max(x.shape[0] for x in examples)
#     if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
#         max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

#     result = tf.fill([len(examples), max_length], tokenizer.pad_token_id)
#     for i, example in enumerate(examples):
#         if tokenizer.padding_side == "right":
#             temp_result = result.numpy()
#             temp_result[i, : example.shape[0]] = example
#             result = tf.convert_to_tensor(temp_result)
#         else:
#             temp_result = result.numpy()
#             result[i, -example.shape[0] :] = example
#             result = tf.convert_to_tensor(temp_result)
#     return result


@dataclass
class TFDataCollatorForLanguageModeling2:
    """
    Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they
    are not all of the same length.

    Args:
        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
            The tokenizer used for encoding the data.
        mlm (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to use masked language modeling. If set to :obj:`False`, the labels are the same as the
            inputs with the padding tokens ignored (by setting them to -100). Otherwise, the labels are -100 for
            non-masked tokens and the value to predict for the masked token.
        mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
            The probability with which to (randomly) mask tokens in the input, when :obj:`mlm` is set to :obj:`True`.
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.

    .. note::

        For best performance, this data collator should be used with a dataset having items that are dictionaries or
        BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a
        :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the
        argument :obj:`return_special_tokens_mask=True`.
    """

    tokenizer: PreTrainedTokenizerBase
    mlm: bool = True
    mlm_probability: float = 0.15
    pad_to_multiple_of: Optional[int] = None

    def __post_init__(self):
        if self.mlm and self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. "
                "You should pass `mlm=False` to train on causal language modeling instead."
            )

    def __call__(self, examples: Union[List[Union[List[int], tf.Tensor, Dict[str, tf.Tensor]]], Dataset]) -> Dataset:
        
        if not isinstance(examples, Dataset):
            if isinstance(examples[0], dict):
                examples = Dataset.from_tensor_slices([x['input_ids'] for x in test_list_dict])                
            else:
                examples = Dataset.from_tensor_slices(examples)
        
        # Handle dict or lists with proper padding and conversion to tensor.
        if isinstance(examples[0], (dict, BatchEncoding)):
            batch = self.tokenizer.pad(examples, return_tensors="tf", pad_to_multiple_of=self.pad_to_multiple_of)
        else:
            batch = {
                "input_ids": _tf_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
            }

        # If special token mask has been preprocessed, pop it from the dict.
        special_tokens_mask = batch.pop("special_tokens_mask", None)
        if self.mlm:
            batch["input_ids"], batch["labels"] = self.tf_mask_tokens(
                batch["input_ids"], special_tokens_mask=special_tokens_mask
            )
        else:
            labels = tf.identity(batch["input_ids"])
            if self.tokenizer.pad_token_id is not None:
                labels = labels.numpy()
                labels[labels == self.tokenizer.pad_token_id] = -100
                labels = tf.constant(labels)
            batch["labels"] = labels
        return batch

    def tf_mask_tokens(
        self, inputs: tf.Tensor, special_tokens_mask: Optional[tf.Tensor] = None
    ) -> Tuple[tf.Tensor, tf.Tensor]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        """
        labels = tf.identity(inputs)
        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
        probability_matrix = tf.fill(labels.shape, self.mlm_probability)
        if special_tokens_mask is None:
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in list(labels)
            ]
            special_tokens_mask = tf.cast(tf.constant(special_tokens_mask, dtype=tf.float16), dtype=tf.bool)
        else:
            special_tokens_mask = tf.cast(special_tokens_mask, dtype=tf.bool)

        probability_matrix = tf.where(~special_tokens_mask, probability_matrix, 0)
        masked_indices = tfp.distributions.Bernoulli(probs=probability_matrix, dtype=tf.bool).sample()

        labels = tf.where(masked_indices, labels, -100)  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])

        indices_replaced = (
            tfp.distributions.Bernoulli(probs=tf.fill(labels.shape, 0.8), dtype=tf.bool).sample() & masked_indices
        )

        mask_token = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
        inputs = tf.where(~indices_replaced, inputs, mask_token)

        # 10% of the time, we replace masked input tokens with random word
        indices_random = (
            tfp.distributions.Bernoulli(probs=tf.fill(labels.shape, 0.5), dtype=tf.bool).sample()
            & masked_indices
            & ~indices_replaced
        )

        random_words = tf.random.uniform(labels.shape, maxval=len(self.tokenizer), dtype=tf.float64)

        inputs = tf.cast(inputs, dtype=tf.float64)
        inputs = tf.where(indices_random, random_words, inputs)

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels


In [38]:
l_test = list(np.random.randint(0, 10, (4,5)))
l_test = tf.data.Dataset.from_tensor_slices(l_test)
l_test = list(l_test.as_numpy_iterator())
l_test

[array([9, 8, 6, 8, 4]),
 array([8, 9, 4, 1, 3]),
 array([7, 2, 4, 3, 2]),
 array([5, 0, 3, 2, 9])]

In [22]:
examples = list(dataset.as_numpy_iterator())
examples

[array([0, 1, 2]), array([3, 4, 5]), array([6, 7])]

In [10]:
import tensorflow as tf
from transformers import BertConfig
from transformers import TFBertForPreTraining
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer
from bertviz import head_view, model_view
from transformers import DataCollatorForLanguageModeling

import torch
from tqdm import tqdm



# -- Tokenizer -- #
import tokenizers
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers import normalizers

from transformers.tokenization_utils_base import BatchEncoding

from tokenizers.normalizers import (
    Lowercase,
    NFD,
    StripAccents
)

from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders

import tensorflow_probability as tfp

In [11]:
class PrimeTokenizer:
    def __init__(self, max_seq_length: int):
        self.prime_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

        self.prime_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

        self.prime_tokenizer.pre_tokenizer = Whitespace()

        self.prime_tokenizer.post_processor = TemplateProcessing(
            single="[CLS] $A [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[
                ("[CLS]", 1),
                ("[SEP]", 2),
            ],
        )

        self.trainer = WordPieceTrainer(
            vocab_size=153411, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
        )

        self.prime_tokenizer.decoder = decoders.WordPiece()
        # self.prime_tokenizer.enable_padding(length=max_seq_length)
        # self.prime_tokenizer.enable_truncation(max_seq_length)

    def text_to_sequence(self, input_):
        if type(input_) is list:
            return self.prime_tokenizer.encode_batch(input_)
        return self.prime_tokenizer.encode(input_)

    def sequence_to_text(self, input_):
        if type(input_) is list:
            return self.prime_tokenizer.decode_batch(batch)
        return self.prime_tokenizer.decode(input_)

    def train(self, data):
        log_itr = iter(data)
#         tqdm_log_itr = tqdm(iterable=log_itr, total=len(data))
# tqdm_log_itr.__iter__()
        self.prime_tokenizer.train_from_iterator(log_itr, self.trainer)
        self.save()

    def get_tokenizer(self):
        return self.prime_tokenizer

    def get_vocab(self):
        return self.prime_tokenizer.get_vocab()
    
    def get_vocab_size(self):
        return self.prime_tokenizer.get_vocab_size()
    
    def save(self):
        self.prime_tokenizer.save("prime_tokenizer.json")
        
    def load(self):
        self.prime_tokenizer = Tokenizer.from_file("prime_tokenizer.json")

In [12]:
tokenizer = PrimeTokenizer(200)

In [13]:
corpus = ["00000067:solr-1.clicls[0016:adfd]", "00000064:solr-1.srvcls[0020:adfd]"]

In [14]:
tokenizer.train(corpus)

In [15]:
the_tokenizer_obj = tokenizer.get_tokenizer()

fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=the_tokenizer_obj)
fast_tokenizer.unk_token = "[UNK]"
fast_tokenizer.sep_token = "[SEP]"
fast_tokenizer.pad_token = "[PAD]"
fast_tokenizer.cls_token = "[CLS]"
fast_tokenizer.mask_token = "[MASK]"
batch_encodings = fast_tokenizer(corpus, truncation=True, padding=True, )

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [71]:
batch_encodings

{'input_ids': [[1, 64, 13, 54, 5, 8, 6, 65, 14, 62, 13, 53, 15, 2], [1, 63, 13, 54, 5, 8, 6, 66, 14, 61, 13, 53, 15, 2]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [104]:
batch = tokenizer.text_to_sequence(corpus)

batch_ids = [log.ids for log in batch for i in range(2)]

batch_ids

[[1, 64, 13, 54, 5, 8, 6, 65, 14, 62, 13, 53, 15, 2],
 [1, 64, 13, 54, 5, 8, 6, 65, 14, 62, 13, 53, 15, 2],
 [1, 63, 13, 54, 5, 8, 6, 66, 14, 61, 13, 53, 15, 2],
 [1, 63, 13, 54, 5, 8, 6, 66, 14, 61, 13, 53, 15, 2]]

In [103]:
batch_ids = tf.convert_to_tensor([batch_ids, batch_ids, batch_ids, batch_ids ])
batch_ids

<tf.Tensor: shape=(4, 4, 14), dtype=int32, numpy=
array([[[ 1, 64, 13, 54,  5,  8,  6, 65, 14, 62, 13, 53, 15,  2],
        [ 1, 64, 13, 54,  5,  8,  6, 65, 14, 62, 13, 53, 15,  2],
        [ 1, 63, 13, 54,  5,  8,  6, 66, 14, 61, 13, 53, 15,  2],
        [ 1, 63, 13, 54,  5,  8,  6, 66, 14, 61, 13, 53, 15,  2]],

       [[ 1, 64, 13, 54,  5,  8,  6, 65, 14, 62, 13, 53, 15,  2],
        [ 1, 64, 13, 54,  5,  8,  6, 65, 14, 62, 13, 53, 15,  2],
        [ 1, 63, 13, 54,  5,  8,  6, 66, 14, 61, 13, 53, 15,  2],
        [ 1, 63, 13, 54,  5,  8,  6, 66, 14, 61, 13, 53, 15,  2]],

       [[ 1, 64, 13, 54,  5,  8,  6, 65, 14, 62, 13, 53, 15,  2],
        [ 1, 64, 13, 54,  5,  8,  6, 65, 14, 62, 13, 53, 15,  2],
        [ 1, 63, 13, 54,  5,  8,  6, 66, 14, 61, 13, 53, 15,  2],
        [ 1, 63, 13, 54,  5,  8,  6, 66, 14, 61, 13, 53, 15,  2]],

       [[ 1, 64, 13, 54,  5,  8,  6, 65, 14, 62, 13, 53, 15,  2],
        [ 1, 64, 13, 54,  5,  8,  6, 65, 14, 62, 13, 53, 15,  2],
        [ 1, 63, 13,

In [105]:
data_collator = TFDataCollatorForLanguageModeling(fast_tokenizer)

In [106]:
data_collator(batch_ids)

{'input_ids': <tf.Tensor: shape=(4, 14), dtype=float64, numpy=
 array([[ 1.        ,  4.        , 13.        ,  4.        ,  4.        ,
          8.        ,  6.        , 65.        , 14.        , 62.        ,
         13.        , 53.        , 15.        ,  2.        ],
        [ 1.        , 64.        , 13.        ,  4.        ,  5.        ,
          8.        ,  6.        , 65.        , 14.        , 62.        ,
         13.        , 53.        , 15.        ,  2.        ],
        [ 1.        , 63.        ,  4.        , 54.        ,  4.        ,
          8.        ,  6.        , 66.        , 25.24303333, 61.        ,
         13.        , 53.        , 15.        ,  2.        ],
        [ 1.        , 63.        , 13.        , 54.        ,  5.        ,
          8.        ,  6.        , 66.        , 14.        , 61.        ,
         13.        , 53.        ,  4.        ,  2.        ]])>,
 'labels': <tf.Tensor: shape=(4, 14), dtype=float64, numpy=
 array([[-100.,   64., -100.,   54

In [107]:
dataset = tf.data.Dataset.from_tensor_slices(batch_ids)

print(dataset)

<TensorSliceDataset shapes: (14,), types: tf.int32>


In [111]:
dataset = dataset.map(lambda x: data_collator([x]))

OperatorNotAllowedInGraphError: in user code:

    <ipython-input-108-8f204293394e>:1 None  *
        lambda x: data_collator([x])
    <ipython-input-2-fafe5a402890>:112 __call__  *
        batch["input_ids"], batch["labels"] = self.tf_mask_tokens(
    <ipython-input-2-fafe5a402890>:134 tf_mask_tokens  *
        special_tokens_mask = [
    /opt/conda/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:505 __iter__  **
        self._disallow_iteration()
    /opt/conda/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:501 _disallow_iteration
        self._disallow_in_graph_mode("iterating over `tf.Tensor`")
    /opt/conda/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:479 _disallow_in_graph_mode
        raise errors.OperatorNotAllowedInGraphError(

    OperatorNotAllowedInGraphError: iterating over `tf.Tensor` is not allowed in Graph execution. Use Eager execution or decorate this function with @tf.function.


In [113]:
tf.executing_eagerly()

True