In [1]:
from transformers import BertConfig
from transformers import BertModel
from transformers import BertTokenizer
from transformers import BertForPreTraining
from tokenizers import BertWordPieceTokenizer
from dataclasses import dataclass
from pathlib import Path

In [42]:
out_base = Path('./polished/models/bert/')

In [3]:
data_file = Path('./data/ka_nse_mil.txt')

### Central config for useful hyperparameters

In [4]:
@dataclass
class GeoBertConfig:
    do_lower_case: bool
    do_basic_tokenize: bool
    tokenize_chinese_chars: bool
    word_embedding_size: int
    num_transformer_layers: int
    num_heads: int
    hidden_feed_forward_size: int
    mlm_probability: float

In [5]:
config = GeoBertConfig(
    do_lower_case=False, # georgian doesn't have lower case
    do_basic_tokenize=True, # doesn't really matter, basic tokenizetion speeds up stuff
    tokenize_chinese_chars=False, # we don't care about chinese chars
    word_embedding_size=360,
    num_transformer_layers=12,
    num_heads=12,
    hidden_feed_forward_size=1024,
    mlm_probability=0.15,
)

### Train sub-word tokenizer

In [6]:
wb_tokenizer = BertWordPieceTokenizer(clean_text=True, handle_chinese_chars=True,
                                      strip_accents=True, lowercase=True)

wb_tokenizer.train(str(data_file),
                   vocab_size=30000, min_frequency=5,
                   special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
wb_tokenizer.save_model(str(out_base))






['models/bert/ka/vocab.txt']

In [7]:
wb_tokenizer.encode('Hi broz').tokens

['h', '##i', 'bro', '##z']

In [8]:
wb_tokenizer.encode('შემომეჭამა').tokens

['შემომ', '##ეჭ', '##ამა']

In [43]:
wb_tokenizer.save_model(str(out_base / 'wordpiece'))

['polished/models/bert/wordpiece/vocab.txt']

### Create BertTokenizer which is now aware of special token meanings

In [9]:
tokenizer = BertTokenizer(str(out_base / 'wordpiece' / "vocab.txt"),
                              do_lower_case=config.do_lower_case, do_basic_tokenize=config.do_basic_tokenize, 
                              bos_token='[CLS]', 
                              eos_token='[SEP]', sep_token='[SEP]', 
                              cls_token='[CLS]', unk_token='[UNK]', 
                              pad_token='[PAD]', mask_token='[MASK]',)

In [44]:
tokenizer.save_pretrained(str(out_base / 'berttokenizer'))

tokenizer config file saved in polished/models/bert/berttokenizer/tokenizer_config.json
Special tokens file saved in polished/models/bert/berttokenizer/special_tokens_map.json


('polished/models/bert/berttokenizer/tokenizer_config.json',
 'polished/models/bert/berttokenizer/special_tokens_map.json',
 'polished/models/bert/berttokenizer/vocab.txt',
 'polished/models/bert/berttokenizer/added_tokens.json')

### Load up dataset for NSE task
* Data file format:

```txt
sentence-1 from document-1\n
sentence-2 from document-1\n
sentence-3 from document-1\n
\n
sentence-1 from document-2\n
sentence-2 from document-2\n
...
```
* `TextDatasetForNextSentencePrediction` does pairing of sentences for NSE task for us
    * 50% of times it will pair random chunks of text
    * 50% of times pair will be contigius in underlying data

In [10]:
from transformers import TextDatasetForNextSentencePrediction
dataset = TextDatasetForNextSentencePrediction(
    tokenizer=tokenizer,
    file_path=str(data_file),
    block_size = 256, # max sentence len. 512 because georgian is pretty long compared to others
)



In [32]:
dataset.examples = [ex for ex in dataset.examples if len(ex['input_ids']) < 512]

* `input_ids` - input token ids
* `token_type_ids` - which sequence does each token belong to
* `next_sentence_label` - NSE task expected output

In [33]:
len(dataset)

298706

In [34]:
len(dataset)

298706

In [13]:
tokenizer.decode(dataset[100]['input_ids'])

'[CLS] საჯარო ლექციების ფარგლებში მონაწილეებს შეხვდებიან წარმატებული ადამიანები სხვადასხვა სფეროდან ( ბიზნესი, პოლიტიკა, მეცნიერება და სხვ. ). [SEP] მოწვეული სპიკერები მსმენელებს გაუზიარებენ გამოცდილებას, მისცემენ რეკომენდაციებს და გააცნობენ საკუთარ ხედვალ საჯარო სამსახურის განვითარებასთან დაკავშირებით. გარდა ადგილობრივი ექსპერტებისდ, მსმენელებს შეხვდებიან მაღალი რანგის საერთაშორისო ექსპერტები. [SEP]'

In [14]:
print(f'{tokenizer.vocab_size = }, {len(dataset) = }')

tokenizer.vocab_size = 30000, len(dataset) = 298711


In [15]:
hug_config = BertConfig(tokenizer.vocab_size, 
                    hidden_size=config.word_embedding_size,
                    num_hidden_layers=config.num_transformer_layers, 
                    num_attention_heads=config.num_heads,
                    intermediate_size=config.hidden_feed_forward_size)
model = BertForPreTraining(hug_config)

In [16]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=config.mlm_probability,
)

In [17]:
import torch
torch.cuda.empty_cache()

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    num_train_epochs=5,
    per_device_train_batch_size=16,
    
    prediction_loss_only=True,
    
    logging_steps=100,
    logging_first_step=True,
    
    output_dir= "./models/bert/en/enbert",
    overwrite_output_dir=True,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 298706
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 93350


Step,Training Loss
1,8.7855
100,8.3063
200,8.2786
300,8.3348
400,8.2563
500,8.3111
600,8.2698
700,8.2523
800,8.2687
900,8.2257


Bad pipe message: %s [b"g.\xdaN\xac=l\x9a}\xdb\xbdt2\xc3|3\x8e\xb2 AC\xe8\x8e\x0c\xf9\xfa'\x1et\xdf\xbb\xef\xa0\xdfhjn\x82/7s\xb5\xd2\xd6`\xfe\x82\x88\x04\xb8\xa1\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00"]
Bad pipe message: %s [b'@i\xbcj],\x80v\x1d\xfe\n\xc3.<;\xd0\x85a\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0', b"V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1"]
Bad pipe message: %s [b"cd\xacO\xcc\x9e\xef%>\xa5bB\xd4\xaaj#\xb

In [37]:
# TODO: log testing data loss tommorow

In [38]:
from transformers import pipeline

In [45]:
trainer.save_model(str(out_base / 'model'))

Saving model checkpoint to polished/models/bert/model
Configuration saved in polished/models/bert/model/config.json
Model weights saved in polished/models/bert/model/pytorch_model.bin


In [40]:
tokenizer.save_pretrained('./models/bert/en/katok', max_len=512)

tokenizer config file saved in ./models/bert/en/katok/tokenizer_config.json
Special tokens file saved in ./models/bert/en/katok/special_tokens_map.json


('./models/bert/en/katok/tokenizer_config.json',
 './models/bert/en/katok/special_tokens_map.json',
 './models/bert/en/katok/vocab.txt',
 './models/bert/en/katok/added_tokens.json')