In [None]:
#Instalando a versÃ£o mais nova do transformers direto da master
# !pip install git+https://github.com/huggingface/transformers
# !pip install datasets
!pip list | grep -E 'transformers|tokenizers'

In [1]:
import os

import pandas as pd

from transformers import ElectraTokenizerFast
from tokenizers.processors import BertProcessing
from tokenizers import ByteLevelBPETokenizer, BertWordPieceTokenizer

In [2]:
tokenizer = BertWordPieceTokenizer(
    unk_token='[UNK]',
    sep_token='[SEP]',
    cls_token='[CLS]',
    pad_token='[PAD]',
    mask_token='[MASK]',
    clean_text=False,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True
)

# tokenizer = ByteLevelBPETokenizer()

In [3]:
tokenizer.train(
    files=['data/texto-sm.txt'],
    vocab_size=52000,
    min_frequency=3,
    limit_alphabet=1000,
#     special_tokens=['[PAD]', '[UNK]', '[CLS]', '[MASK]', '[SEP]'],
    wordpieces_prefix='##',
    show_progress=True
)

In [4]:
tokenizer.encode('Sou da paz!').tokens

['sou', 'da', 'paz', '!']

In [5]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ('[SEP]', tokenizer.token_to_id('[SEP]')),
    ('[CLS]', tokenizer.token_to_id('[CLS]')),
)
tokenizer.enable_truncation(max_length=512)

In [6]:
tokenizer.encode('Sou da paz!').tokens

['[CLS]', 'sou', 'da', 'paz', '!', '[SEP]']

In [7]:
token_dir = 'content'
if not os.path.exists(token_dir):
  os.makedirs(token_dir)
tokenizer.save_model(token_dir, 'electranez')

['content/electranez-vocab.txt']

In [20]:
%timeit

from datasets import load_dataset
dataset = load_dataset('csv', data_files={'train':['data/texto-sm.csv']})
print(dataset)

Using custom data configuration default-f88fa38aaf68c077
Reusing dataset csv (/home/modanez/.cache/huggingface/datasets/csv/default-f88fa38aaf68c077/0.0.0/965b6429be0fc05f975b608ce64e1fa941cc8fb4f30629b523d2390f3c0e1a93)


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 528352
    })
})


In [8]:
import torch
if torch.cuda.is_available():  
    device = torch.device("cuda")
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

We will use the GPU: GeForce GTX 1070


In [26]:
from transformers import ElectraTokenizer, ElectraTokenizerFast

electra_tokenizer = ElectraTokenizerFast(
    vocab_file='content/electranez-vocab.txt', 
    do_lower_case=True, 
    do_basic_tokenize=True, 
    never_split=None, 
    unk_token='[UNK]', 
    sep_token='[SEP]', 
    pad_token='[PAD]', 
    cls_token='[CLS]', 
    mask_token='[MASK]', 
    max_length=512, 
    tokenize_chinese_chars=False, 
    strip_accents=False,
    name='electranez-base'
)

In [32]:
# electra_tokenizer.save_vocabulary('electranez/')
electra_tokenizer.save_pretrained('electranez/')

('electranez/tokenizer_config.json',
 'electranez/special_tokens_map.json',
 'electranez/vocab.txt',
 'electranez/added_tokens.json')

In [24]:
from transformers import ElectraConfig, ElectraModel

configuration = ElectraConfig(
    vocab_size=52000,
    embedding_size=128,
    hidden_size=256,    
    num_hidden_layers=12,
    num_attention_heads=4,
    intermediate_size=1024,
    hidden_act='gelu',
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    summary_type='first',
    summary_use_proj=True,
    summary_activation='gelu',
    summary_last_dropout=0.1,
    position_embedding_type='absolute',
)

model = ElectraModel(configuration)
configuration = model.config

# electra_tokenizer = ElectraTokenizer.from_pretrained('electranez/', max_length=512)

In [25]:
# For the discriminator, ElectraForPreTraining was used.
# For the generator, ElectraForMaskedLM was used.

from transformers import ElectraForMaskedLM, ElectraForPreTraining

discrimator = ElectraForPreTraining(config=configuration)
generator = ElectraForMaskedLM(config=configuration)

print(discrimator.num_parameters())
print(generator.num_parameters())

In [36]:
tokenizer2 = ElectraTokenizer.from_pretrained('electranez/')
model = ElectraForPreTraining.from_pretrained('electranez/')

file electranez/config.json not found


OSError: Can't load config for 'electranez/'. Make sure that:

- 'electranez/' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'electranez/' is the correct path to a directory containing a config.json file



In [None]:
#@title Step 11: Defining a Data Collator
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=electra_tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
#@title Step 12: Initializing the Trainer
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='electranez',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=1,
)

trainer = Trainer(
    model=generator,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
temp

In [None]:
#@title Step 13: Pre-training the Model

trainer.train()

In [None]:
#@title Step 14: Saving the Final Model(+tokenizer + config) to disk
trainer.save_model("content")

In [None]:
#@title Step 15: Language Modeling with the FillMaskPipeline
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="content/KantaiBERT",
    tokenizer="content/KantaiBERT"
)