In [1]:
#pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

In [2]:
#pip install transformers[sentencepiece]

In [3]:
from transformers import AutoTokenizer, AutoModelForMaskedLM



## Load tokenizer and model

In [4]:
tokenizer = AutoTokenizer.from_pretrained('/home/robert/git/python/LitLat-BERT_transformers/') 
model = AutoModelForMaskedLM.from_pretrained('/home/robert/git/python/LitLat-BERT_transformers/') 
 


Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [None]:
# https://www.kaggle.com/vbookshelf/basics-of-bert-and-xlm-roberta-pytorch
# https://anubhav20057.medium.com/step-by-step-guide-abstractive-text-summarization-using-roberta-e93978234a90

Check vocab size

In [5]:
tokenizer.vocab_size

84196

What are the special tokens

In [8]:
tokenizer.special_tokens_map



{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [9]:
print('bos_token_id <s>:', tokenizer.bos_token_id)
print('eos_token_id </s>:', tokenizer.eos_token_id)
print('sep_token_id </s>:', tokenizer.sep_token_id)
print('pad_token_id <pad>:', tokenizer.pad_token_id)


bos_token_id <s>: 0
eos_token_id </s>: 2
sep_token_id </s>: 2
pad_token_id <pad>: 84196


### How to use tokenizer to create XLM-RoBERTa input

In [16]:
MAX_LEN = 15

sentence1 = 'Laba diena.'
sentence2 = 'Kaip sekasi?'

encoded_dict = tokenizer.encode_plus(
            sentence1, sentence2,      
            add_special_tokens = True,
            truncation= True,
            max_length = MAX_LEN,     
            pad_to_max_length = True,
            return_attention_mask = True,   
            return_tensors = 'pt' # return pytorch tensors
       )


encoded_dict

{'input_ids': tensor([[    0, 22716,  2478,    14,     2,     2,  1021, 24924,  7283,     2,
         84196, 84196, 84196, 84196, 84196]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}

In [17]:
input_ids = encoded_dict['input_ids'][0]
att_mask = encoded_dict['attention_mask'][0]

# These are torch tensors.
print(input_ids)
print(att_mask)

tensor([    0, 22716,  2478,    14,     2,     2,  1021, 24924,  7283,     2,
        84196, 84196, 84196, 84196, 84196])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])


### Decoding a sequence of tokens

In [18]:
input_ids = encoded_dict['input_ids'][0]

print(input_ids)

tensor([    0, 22716,  2478,    14,     2,     2,  1021, 24924,  7283,     2,
        84196, 84196, 84196, 84196, 84196])


In [19]:
# skip_special_tokens – if set to True, will replace special tokens.

a = tokenizer.decode(input_ids,
                skip_special_tokens=False)

b = tokenizer.decode(input_ids,
                skip_special_tokens=True)



print(a)
print(b)

<s> Laba diena.</s></s> Kaip sekasi?</s><pad><pad><pad><pad><pad>
Laba diena. Kaip sekasi?


### Overflowing tokens and Stride
When a sentence is truncated (because it's length exceeds max_length) it's possible to get the tokenizer to return the tokens that were cut off. These truncated tokens will be returned in a list called overflowing_tokens.

In [43]:
MAX_LEN = 15 # This value could be set as 256, 512 etc.

sentence1 = 'Laba diena. Kaip jums sekasi?'
sentence2 = 'šiandien labai grazi diena ir cia dar ne viskas!'


encoded_dict = tokenizer.encode_plus(
            sentence1, sentence2,
            add_special_tokens=True,
            #truncation = True
            max_length = MAX_LEN,
            stride=0,
            padding = 'max_length',
            return_tensors = 'pt',
            return_overflowing_tokens=True,
       )


encoded_dict

{'input_ids': tensor([[    0, 22716,  2478,    14,  1021,  1411, 24924,  7283,     2,     2,
           750,   273,  6317,   303,  4917,  4075,  2478,    11,  5311,   190,
            77,  1860,  7821,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'overflow_to_sample_mapping': tensor([0])}

***
## Tokenize the data

In [62]:
import pandas as pd

d = {'Summary': ['viskas super!', 'man bloga.'], 'Text' : ['Siandien labai grazi sauleta diena.', 'Labai daug valgiau, todel man skauda pilva.']}
df = pd.DataFrame(data=d)

df

Unnamed: 0,Summary,Text
0,viskas super!,Siandien labai grazi sauleta diena.
1,man bloga.,"Labai daug valgiau, todel man skauda pilva."


In [63]:
from datasets import Dataset

In [67]:
train_data = Dataset.from_pandas(df)
train_data


Dataset({
    features: ['Summary', 'Text'],
    num_rows: 2
})

In [68]:
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

#parameter setting
batch_size=256  #
encoder_max_length=40
decoder_max_length=8

def process_data_to_model_inputs(batch):
  # tokenize the inputs and labels
  inputs = tokenizer(batch["Text"], padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer(batch["Summary"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["decoder_attention_mask"] = outputs.attention_mask
  batch["labels"] = outputs.input_ids.copy()

  # because RoBERTa automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
  # We have to make sure that the PAD token is ignored
  batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

  return batch

#processing training data
train_data = train_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["Text", "Summary"]
)
train_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

## processing validation data
#val_data = val_data.map(
#    process_data_to_model_inputs, 
#    batched=True, 
#    batch_size=batch_size, 
#    remove_columns=["Text", "Summary"]
#)
#val_data.set_format(
#    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
#)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [73]:
train_data

Dataset({
    features: ['attention_mask', 'decoder_attention_mask', 'decoder_input_ids', 'input_ids', 'labels'],
    num_rows: 2
})

In [74]:
from transformers import EncoderDecoderModel

In [76]:
roberta_shared = EncoderDecoderModel.from_encoder_decoder_pretrained('EMBEDDIA/litlat-bert', 'EMBEDDIA/litlat-bert', tie_encoder_decoder=True)

Some weights of XLMRobertaForCausalLM were not initialized from the model checkpoint at EMBEDDIA/litlat-bert and are newly initialized: ['roberta.encoder.layer.0.crossattention.self.query.weight', 'roberta.encoder.layer.0.crossattention.self.query.bias', 'roberta.encoder.layer.0.crossattention.self.key.weight', 'roberta.encoder.layer.0.crossattention.self.key.bias', 'roberta.encoder.layer.0.crossattention.self.value.weight', 'roberta.encoder.layer.0.crossattention.self.value.bias', 'roberta.encoder.layer.0.crossattention.output.dense.weight', 'roberta.encoder.layer.0.crossattention.output.dense.bias', 'roberta.encoder.layer.0.crossattention.output.LayerNorm.weight', 'roberta.encoder.layer.0.crossattention.output.LayerNorm.bias', 'roberta.encoder.layer.1.crossattention.self.query.weight', 'roberta.encoder.layer.1.crossattention.self.query.bias', 'roberta.encoder.layer.1.crossattention.self.key.weight', 'roberta.encoder.layer.1.crossattention.self.key.bias', 'roberta.encoder.layer.1.cros

In [77]:
model.eval

<bound method Module.eval of XLMRobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(84201, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
   