In [1]:
from transformers import pipeline
from transformers import BertForMaskedLM
from transformers import BertTokenizer
import torch.nn

In [2]:
model = BertForMaskedLM.from_pretrained('./models/bert/en/enbert/checkpoint-70000/')

Some weights of the model checkpoint at ./models/bert/en/enbert/checkpoint-70000/ were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
import torch

In [4]:
tokenizer = BertTokenizer.from_pretrained ('./models/bert/en/entok/')

In [5]:
token_ids = tokenizer('გარეთ წვიმს. იმედია ხვალ შევძლებთ, რომ')['input_ids']

In [6]:
token_ids = token_ids[:-1]

In [7]:
token_ids = token_ids + [tokenizer.mask_token_id]

In [8]:
tokenizer.convert_ids_to_tokens(token_ids)

['[CLS]',
 'გარეთ',
 'წვიმ',
 '##ს',
 '.',
 'იმედია',
 'ხვალ',
 'შევძლებთ',
 ',',
 'რომ',
 '[MASK]']

In [9]:
token_ids = torch.tensor(token_ids).view((1, -1))
                                        # (n, )   (1, -1) (1, n)

In [11]:
probs = model(input_ids=token_ids).logits[0, -1, :]
probs

tensor([-9.3978, -2.7374, -2.5754,  ..., -3.8149, -2.7380, -1.4564],
       grad_fn=<SliceBackward0>)

In [12]:
tops = list(reversed(list(sorted([(v, i) for i, v in enumerate(probs)]))))[:20]

In [13]:
tops

[(tensor(6.9921, grad_fn=<UnbindBackward0>), 1331),
 (tensor(6.2411, grad_fn=<UnbindBackward0>), 1666),
 (tensor(6.0535, grad_fn=<UnbindBackward0>), 1573),
 (tensor(6.0422, grad_fn=<UnbindBackward0>), 1382),
 (tensor(5.7928, grad_fn=<UnbindBackward0>), 1833),
 (tensor(5.7887, grad_fn=<UnbindBackward0>), 1404),
 (tensor(5.7439, grad_fn=<UnbindBackward0>), 16),
 (tensor(5.5634, grad_fn=<UnbindBackward0>), 1377),
 (tensor(5.5562, grad_fn=<UnbindBackward0>), 2314),
 (tensor(5.4576, grad_fn=<UnbindBackward0>), 2377),
 (tensor(5.3419, grad_fn=<UnbindBackward0>), 1550),
 (tensor(5.3312, grad_fn=<UnbindBackward0>), 2216),
 (tensor(5.2471, grad_fn=<UnbindBackward0>), 2759),
 (tensor(5.1502, grad_fn=<UnbindBackward0>), 1655),
 (tensor(5.1298, grad_fn=<UnbindBackward0>), 1562),
 (tensor(5.0849, grad_fn=<UnbindBackward0>), 2192),
 (tensor(4.8996, grad_fn=<UnbindBackward0>), 1479),
 (tensor(4.7708, grad_fn=<UnbindBackward0>), 1481),
 (tensor(4.7010, grad_fn=<UnbindBackward0>), 2326),
 (tensor(4.672

In [14]:
tokenizer.convert_ids_to_tokens([i for _, i in tops])

['არ',
 'ძალიან',
 'ასე',
 'ის',
 'ისე',
 'ეს',
 ',',
 'მე',
 'კარგი',
 'კარგად',
 'ვერ',
 'ყველაფერი',
 'მინდა',
 'თქვენ',
 'ჩვენ',
 'ბევრი',
 'კი',
 'უნდა',
 'აღარ',
 'ყველაფერს']

In [15]:
def generator(token_ids: list[int]):
    if token_ids[-1] == tokenizer.sep_token_id:
        token_ids = token_ids[:-1]
    token_ids = token_ids + [tokenizer.mask_token_id]
    token_ids = torch.tensor(token_ids).view((1, -1))
    #print(token_ids)
    #print((tokenizer.convert_ids_to_tokens(list(token_ids))))
    probs = model(input_ids=token_ids).logits[0, -1, :]   
    return probs

In [16]:
def get_perplexity(sentence: str):
    token_ids = tokenizer(sentence)['input_ids']
    sent = []
    perp = 1
    for i in range(len(token_ids) - 1):
        sent.append(token_ids[i])
        perp = perp + torch.log(torch.nn.Softmax(dim=0)(generator(sent))[token_ids[i+1]])
    return torch.exp(-perp/tokenizer.vocab_size)

In [17]:
get_perplexity('გამარჯობა, ნახვამდის.')

tensor(1.0017, grad_fn=<ExpBackward0>)

In [155]:
def get_top_k_top_p(sentence: str, k: int):
    probs = generator(tokenizer(sentence)['input_ids'])
    tops = list(reversed(list(sorted([(v, i) for i, v in enumerate(probs)]))))[:k]
    return tokenizer.convert_ids_to_tokens([i for _, i in tops])

In [None]:
get_top_k('დღეს თბილისის 128 საჯარო სკოლაში ჩატარდა', 20)

In [25]:
logits = generator(tokenizer('დღეს თბილისის 128 საჯარო სკოლაში ჩატარდა')['input_ids'])
logits

tensor([-10.6936,  -1.7466,  -2.6995,  ...,  -4.9306,  -5.2190,  -3.6105],
       grad_fn=<SliceBackward0>)

In [23]:
def top_k_top_p(logits, top_k=0, top_p=0.0):
    top_k = min(top_k, logits.size(-1)) # make sure top_k is smaller than logit size
    if top_k > 0:
        # keep only top_k entries
        rm_id = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[rm_id] = -float('inf')
        
    if top_p > 0:
        # keep entries that sum up to top_p percentage
        sorted_logits, sorted_ids = torch.sort(logits, descending=True)
        cum_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
        
        sorted_rm_id = cum_probs > top_p
        sorted_rm_id[..., 1:] = sorted_rm_id[..., :-1].clone()
        sorted_rm_id[..., 0] = 0
        
        rm_id = sorted_ids[sorted_rm_id]
        logits[rm_id] = -float('inf')
        
    return logits

In [53]:
out = top_k_top_p(logits, top_k=50, top_p=0.9)
probs = torch.softmax(logits, dim=-1)
next_token = torch.multinomial(probs, 1)
tokenizer.ids_to_tokens[next_token.data.item()]

'თბილისის'