In [32]:
## Training SentenceTransformers model from scratch
%time
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

with open('sample-logs.txt') as f:
    lines = f.readlines()

paths = [str(x) for x in lines]

## Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

## Customize training
tokenizer.train(files=['sample-logs.txt'], vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.39 µs





In [45]:
## Salva modelo

#!mkdir Models
tokenizer.save_model("./samplelogs", "samplelogs")

['./samplelogs/samplelogs-vocab.json', './samplelogs/samplelogs-merges.txt']

In [46]:
## Testa modelo

from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    "./samplelogs/sample-logs-vocab.json",
    "./samplelogs/sample-logs-merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

print(tokenizer.encode("ERROR MEMORY"))
print(tokenizer.encode("ERROR MEMORY").tokens)

Encoding(num_tokens=10, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['<s>', 'ER', 'RO', 'R', 'ĠM', 'E', 'M', 'O', 'RY', '</s>']


In [28]:
## Check that we have a GPU
#!nvidia-smi

# Check that PyTorch sees it
import torch
torch.cuda.is_available()

False

In [40]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [48]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("/samplelogs", max_len=512)

HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: '/samplelogs'.

In [8]:
## Classe do dataset

import torch
from torch.utils.data import Dataset

class LogsDataset(Dataset):
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "sample-logs-vocab.json",
            "sample-logs-merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []

        src_files = "sample-logs.txt"
        for src_file in src_files:
            print("🔥", src_file)
            lines = src_file.read_text(encoding="utf-8").splitlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # We’ll pad at the batch level.
        return torch.tensor(self.examples[i])