In [56]:
from transformers import BertConfig
from transformers import BertModel
from transformers import BertTokenizer
from transformers import BertForPreTraining
from tokenizers import BertWordPieceTokenizer
from transformers import TextDatasetForNextSentencePrediction

from dataclasses import dataclass
from pathlib import Path
import torch
import torch.nn as nn

In [57]:
# Root directory for this run's artefacts: the trainer checkpoints and the final
# saved model are both written underneath it.
out_base = Path('./polished/models/ka_only_no_w2v_bert/')

In [58]:
# Plain-text corpora for the three splits; each one is handed to
# TextDatasetForNextSentencePrediction further down.
train_file = './no_en_data/ka_nse_train.txt'
test_file = './no_en_data/ka_nse_test.txt'
valid_file = './no_en_data/ka_nse_valid.txt'

### Central config for useful hyperparameters

In [7]:
@dataclass
class GeoBertConfig:
    """Single place for the hyperparameters this notebook is tuned with.

    The first three fields are tokenizer options; the remaining ones describe
    the transformer architecture and the masked-language-modelling collator.
    """

    # --- tokenizer options ---
    do_lower_case: bool
    do_basic_tokenize: bool
    tokenize_chinese_chars: bool

    # --- model architecture ---
    # Used as the BERT hidden size.
    word_embedding_size: int
    num_transformer_layers: int
    num_heads: int
    # Used as the BERT intermediate (feed-forward) size.
    hidden_feed_forward_size: int

    # --- pre-training objective ---
    # Masking probability handed to the MLM data collator.
    mlm_probability: float

In [8]:
# Hyperparameters for this run.
# NOTE(review): the three tokenizer flags are not referenced by any cell shown in
# this notebook (the tokenizer is loaded from disk instead) - presumably they
# document how that tokenizer was trained; confirm.
config = GeoBertConfig(
    do_lower_case=False, # Georgian has no letter case, so lower-casing is pointless
    do_basic_tokenize=True, # doesn't really matter; basic tokenization speeds things up
    tokenize_chinese_chars=False, # the corpus is not expected to contain Chinese characters
    word_embedding_size=300, # same width as the w2v embeddings; becomes BERT's hidden_size
    num_transformer_layers=12,
    num_heads=12,
    hidden_feed_forward_size=1024,
    mlm_probability=0.15,
)

### Open already trained BertTokenizer which is now aware of special token meanings

In [60]:
# Load the previously trained tokenizer from disk. Note that it lives under the
# `v2bert` model directory, not under `out_base`.
tokenizer = BertTokenizer.from_pretrained('./polished/models/v2bert/berttokenizer')

Didn't find file ./polished/models/v2bert/berttokenizer/added_tokens.json. We won't load it.
loading file ./polished/models/v2bert/berttokenizer/vocab.txt
loading file None
loading file ./polished/models/v2bert/berttokenizer/special_tokens_map.json
loading file ./polished/models/v2bert/berttokenizer/tokenizer_config.json


### Load up dataset for NSE task
* Data file format:

```txt
sentence-1 from document-1\n
sentence-2 from document-1\n
sentence-3 from document-1\n
\n
sentence-1 from document-2\n
sentence-2 from document-2\n
...
```
* `TextDatasetForNextSentencePrediction` does pairing of sentences for NSE task for us
    * 50% of the time it pairs two random chunks of text
    * 50% of the time the pair is contiguous in the underlying data

In [10]:
# Value passed as `block_size` to every split. The previous inline comment said
# "512", which contradicted the value actually used (256).
# NOTE(review): block_size is presumably the token budget per sentence pair,
# special tokens included - confirm against the transformers documentation.
NSP_BLOCK_SIZE = 256


def load_nsp_dataset(file_path, block_size=NSP_BLOCK_SIZE):
    """Build a next-sentence-prediction dataset from one text file.

    Args:
        file_path: Path (str or Path) to a corpus in the format described above
            (one sentence per line, blank line between documents).
        block_size: Forwarded unchanged to the dataset constructor.

    Returns:
        A `TextDatasetForNextSentencePrediction` tokenized with the notebook's
        global `tokenizer`.
    """
    return TextDatasetForNextSentencePrediction(
        tokenizer=tokenizer,
        file_path=str(file_path),
        block_size=block_size,
    )


# The three splits are built identically, so they share one helper.
train_dataset = load_nsp_dataset(train_file)
test_dataset = load_nsp_dataset(test_file)
valid_dataset = load_nsp_dataset(valid_file)



### Filter out all sentences with len >= 300 so cuda does not run into OOM

In [11]:
# Examples whose `input_ids` reach this length are dropped from every split.
MAX_INPUT_IDS = 300


def _drop_long_examples(examples, limit=MAX_INPUT_IDS):
    """Return only the examples whose `input_ids` are shorter than `limit`."""
    return [example for example in examples if len(example['input_ids']) < limit]


train_dataset.examples = _drop_long_examples(train_dataset.examples)
# The test split is additionally capped at 100 examples for quick eval prints.
test_dataset.examples = _drop_long_examples(test_dataset.examples)[:100]
valid_dataset.examples = _drop_long_examples(valid_dataset.examples)

* `input_ids` - input token ids
* `token_type_ids` - which sequence does each token belong to
* `next_sentence_label` - NSE task expected output

In [61]:
# Number of examples left in each split after filtering (train, test, valid).
tuple(len(split) for split in (train_dataset, test_dataset, valid_dataset))

(132598, 100, 19241)

In [62]:
# Sanity check: turn one training example back into text to inspect the
# [CLS] ... [SEP] ... [SEP] sentence-pair layout.
sample_input_ids = train_dataset[100]['input_ids']
tokenizer.decode(sample_input_ids)

'[CLS] გადმოგიშლი მთელ ჩემს ცხოვრებას, რომელიც ნამდვილად მაშინ დაიწყო, როდესაც პირველად გიხილე. მანამდე ჩემთვის არსებობდა მხოლოდ რაღაც ბუნდოვანი, გაურკვეველი სამყარო, შემდეგ აინუნშიაც რომ აღარ მომსვლია, რაღაც ბნელი ჯურღმული, აბლაბუდითა და მტვერით დაფარული საგნები და ხალხი, რომელთა შესახებ ჩემმა მეხსიერებამ თითქმის აღარაფრი შემოინახა. შენ რომ გამოჩნდი, ცამეტი წლისა ვიყავი, ვცხოვრობდი იმავე სახლში, სადაც ახლა ცხოვრობ, სწორედ იმ სახლში, სადაც ახლა ზიხარ და ხელთ გიჭირავს ეს წერილი – ჩემი ცხოვრების უკანასკნელი ამოსუნთქვა ; ვცხოვრობდი იმავე სართულზე, შენი კარის მეზობელი ვიყავი. აბა, რაღას გეხსომებით – ფინანსურ საქმეთა მრჩევლის ღატაკი ქვრივი ( ყოველთვის ძაძა ეცვა ) და ოდნავ მოჩიტული გამხდარი გოგონა. [SEP] ქუთაისის მერიის განმარტებით, მათ უკვე მოილაპარაკეს ბაზრობების ხელმძღვანელებთან, რომლებიც მზად არიან გარემოვაჭრეებს სამი თვით უფასოდ გამოუყონ დახლები. მოვაჭრეებისთვის მერიის მიერ შეთავაზებული პირობები მიუღებელი აღმოჩნდა. [SEP]'

In [63]:
# The `=` f-string specifier prints each expression together with its value.
print(f'{tokenizer.vocab_size = }, {len(train_dataset) = }')

tokenizer.vocab_size = 30000, len(train_dataset) = 132598


In [64]:
# Translate the notebook-level hyperparameters into a Hugging Face BertConfig.
hug_config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=config.word_embedding_size,
    num_hidden_layers=config.num_transformer_layers,
    num_attention_heads=config.num_heads,
    intermediate_size=config.hidden_feed_forward_size,
)

# The model is built from the config alone; no pretrained checkpoint is loaded here.
model = BertForPreTraining(hug_config)

### Load up w2v embeddings

**Skip this whole section (do NOT run its cells) if you want the model WITHOUT w2v embeddings.**

In [None]:
from gensim.models import Word2Vec

In [32]:
# Load the gensim Word2Vec model from disk.
# NOTE(review): the file name suggests it was trained on subword tokens - confirm
# that its vocabulary matches the tokenizer's word pieces.
wvmodel = Word2Vec.load('./polished/models/word2vec/subword.model')

In [33]:
# Handle on the model's current word-embedding matrix; tokens that have no w2v
# vector keep the corresponding row from here.
existing = model.bert.embeddings.word_embeddings.weight

In [34]:
# Zero-initialised table with one row per tokenizer vocabulary entry and one
# column per embedding dimension; filled in by the next cell.
pretrained_embeddings = torch.zeros((tokenizer.vocab_size, config.word_embedding_size))

In [35]:
# Fill the table row by row: use the w2v vector when the token has one, otherwise
# keep the model's own (already initialised) embedding for that token.
#
# Fixes:
#   * `tokenizer.get_vo6648cab()` was a corrupted method name and raised
#     AttributeError; the intended call is `tokenizer.get_vocab()`, which is what
#     the sanity-check cell below already uses.
#   * The copy runs under `torch.no_grad()`. `existing` is a model parameter, so
#     without it every fallback assignment would attach an autograd node to the
#     table for no benefit.
with torch.no_grad():
    for token, token_id in tokenizer.get_vocab().items():
        if token in wvmodel.wv:
            pretrained_embeddings[token_id] = torch.tensor(wvmodel.wv[token])
        else:
            pretrained_embeddings[token_id] = existing[token_id]

Check a token that IS in the w2v vocabulary (its row must equal the w2v vector)

In [36]:
# A token known to both vocabularies must carry exactly its w2v vector.
probe_token = 'ა'
probe_row = pretrained_embeddings[tokenizer.get_vocab()[probe_token]]
assert torch.all(probe_row == torch.tensor(wvmodel.wv[probe_token]))

Check a token that is NOT in the w2v vocabulary (its row must keep the model's original embedding)

In [37]:
# Row 1 must still equal the model's own embedding.
# NOTE(review): this assumes token id 1 has no w2v vector - confirm which token that is.
assert torch.all(pretrained_embeddings[1] == existing[1])

### Actually replace embeddings

In [38]:
# Build the replacement embedding layer from the w2v-seeded table, then swap it in.
seeded_word_embeddings = nn.Embedding.from_pretrained(
    embeddings=pretrained_embeddings,
    # Keep the table trainable: the model still has to learn e.g. the [SEP] embedding.
    freeze=False,
    padding_idx=0,
)
model.bert.embeddings.word_embeddings = seeded_word_embeddings

-----------------

### Set up mlm collator

In [65]:
from transformers import DataCollatorForLanguageModeling

# Collator for the masked-language-modelling objective. The masking probability
# comes from the central config (0.15 in this run).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=config.mlm_probability,
)

In [66]:
# `torch` is already imported in the first cell; this repeated import is harmless.
import torch
# Release cached CUDA memory before training starts.
torch.cuda.empty_cache()

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration. Checkpoints go to `<out_base>/trainer`.
training_args = TrainingArguments(
    num_train_epochs=10,
    per_device_train_batch_size=12,
    
    #prediction_loss_only=True,
    # Evaluate on a step schedule. `eval_steps` is not set, so it is initialised
    # from `logging_steps` (the run log below reports eval_steps = 100).
    evaluation_strategy='steps',
    logging_steps=100,
    eval_accumulation_steps=1,
    #eval_steps=2000,
    logging_first_step=True,
    
    output_dir= str(out_base/ 'trainer'),
    # overwrite_output_dir=True,
    # Save a checkpoint every 1000 steps.
    # NOTE(review): save_total_limit=2 presumably keeps only the two most recent - confirm.
    save_steps=1000,
    save_total_limit=2,
)

# NOTE(review): evaluation uses `test_dataset`, which was truncated to 100 examples
# earlier; `valid_dataset` is loaded but not used by any cell shown here - confirm
# this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Long-running: the log below reports 110500 optimization steps in total.
trainer.train()

using `logging_steps` to initialize `eval_steps` to 100
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 132598
  Num Epochs = 10
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 110500


Step,Training Loss,Validation Loss
100,10.2383,9.642043
200,9.5012,9.13376
300,9.1316,8.999308
400,8.9766,8.918166
500,8.9389,8.667453
600,8.8961,8.785086
700,8.8116,8.81385
800,8.8208,8.684848
900,8.7867,8.553764
1000,8.7513,8.638752


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
Saving model checkpoint to polished/models/ka_only_no_w2v_bert/trainer/checkpoint-1000
Configuration saved in polished/models/ka_only_no_w2v_bert/trainer/checkpoint-1000/config.json
Model weights saved in polished/models/ka_only_no_w2v_bert/trainer/checkpoint-1000/pytorch_model.bin
***** Running Evaluation **

In [68]:
# Write the trained model (config and weights, per the log below) to its final
# location under `out_base`.
trainer.save_model(str(out_base / 'ka_only_no_w2v_bert_model'))

Saving model checkpoint to polished/models/ka_only_no_w2v_bert/ka_only_no_w2v_bert_model
Configuration saved in polished/models/ka_only_no_w2v_bert/ka_only_no_w2v_bert_model/config.json
Model weights saved in polished/models/ka_only_no_w2v_bert/ka_only_no_w2v_bert_model/pytorch_model.bin


In [69]:
# Persist the trainer's state.
# NOTE(review): presumably written into the trainer's `output_dir` - confirm.
trainer.save_state()

In [49]:
#assert torch.all(model.bert.embeddings.word_embeddings.weight[tokenizer.get_vocab()['ა']] == torch.tensor(wvmodel.wv['ა']))