In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.nn import functional as F
import random

model_name = 'flax-community/papuGaPT2'
device = 'cuda'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [2]:
text = "Ta wiewiórka jest sprytna. Ale ze mną nie wygra!"
ids = tokenizer(text, return_tensors='pt')['input_ids'][0]
tokens = [tokenizer.decode(n) for n in ids]
print(tokens, len(tokens))
print(*tokens, sep='')

['Ta', ' wie', 'wi', 'órka', ' jest', ' spry', 'tna', '.', ' Ale', ' ze', ' mną', ' nie', ' wygra', '!'] 14
Ta wiewiórka jest sprytna. Ale ze mną nie wygra!


In [3]:
input_ids = tokenizer(text, return_tensors='pt')['input_ids'].to(device)
with torch.no_grad():
    output = model(input_ids=input_ids)
print (output.logits.shape)    

torch.Size([1, 14, 50257])


In [4]:
output.logits

tensor([[[-155.0427, -157.5392, -158.5438,  ..., -148.4494, -154.7137,
          -150.1377],
         [-108.9348, -111.8164, -112.2461,  ..., -101.2084,  -99.7926,
           -99.7692],
         [-107.2887, -106.1522, -107.3445,  ..., -104.5265,  -99.1293,
           -96.8490],
         ...,
         [-157.6226, -164.6961, -163.0924,  ..., -152.9383, -153.5791,
          -150.1659],
         [-162.3093, -164.7065, -163.3595,  ..., -148.7633, -151.1780,
          -149.8193],
         [-173.4292, -179.4016, -179.4410,  ..., -155.1117, -155.1993,
          -151.8837]]], device='cuda:0')

In [5]:
import numpy as np

def sample_next_token(sentence_token_ids, allowed_next_token_ids):
	with torch.no_grad():
		output = model(input_ids=sentence_token_ids)
		allowed_token_probs = F.softmax(output.logits.squeeze()[-1][allowed_next_token_ids]).cpu().numpy()
	print(allowed_token_probs)
	return np.random.choice(len(allowed_token_probs), p=allowed_token_probs)

In [11]:
sample_next_token(tokenizer("Ala ma kota", return_tensors='pt').input_ids.to(device), [91,    88,    74,  2986, 42202])

  allowed_token_probs = F.softmax(output.logits.squeeze()[-1][allowed_next_token_ids]).cpu().numpy()


2

In [None]:
torch.gather

In [None]:
def log_probs_from_logits(logits, labels):
    logp = F.log_softmax(logits, dim=-1)
    return torch.gather(logp, 2, labels.unsqueeze(2)).squeeze(-1)

def sentence_prob(sentence_txt):
    input_ids = tokenizer(sentence_txt, return_tensors='pt')['input_ids'].to(device)
    with torch.no_grad():
        output = model(input_ids=input_ids)
        log_probs = log_probs_from_logits(output.logits[:, :-1, :], input_ids[:, 1:])
        seq_log_probs = torch.sum(log_probs)
    return seq_log_probs.cpu().numpy()    


In [None]:
input_ids = tokenizer("wtf abcd", return_tensors='pt')['input_ids'].to(device)

In [None]:
input_ids

tensor([[   91,    88,    74,  2986, 42202]], device='cuda:0')

In [None]:
tokenizer.decode(2986)

' ab'

In [None]:
[tokenizer.decode(id) for id in input_ids]

['wtf abcd']

In [None]:
words = 'Ala ma dwa tłuste koty i ślicznego kanarka'.split()
tords = [tokenizer(word).input_ids for word in words]

In [None]:
class TokenTrie(dict):
	def __init__(self, sequences=[]):
		self.terminal = False
		for sequence in sequences:
			self.insert(sequence)

	def insert(self, sequence):
		if not sequence:
			self.terminal = True
			return self

		prefix, suffix = sequence[0], sequence[1:]
		if prefix in self:
			self[prefix].insert(suffix)
		else:
			self[prefix] = TokenTrie().insert(suffix)
		return self

In [None]:
t = TokenTrie(tords)

In [None]:
t

{37: {314: {}},
 351: {},
 14073: {},
 5838: {666: {}},
 315: {327: {}},
 77: {},
 642: {1054: {}},
 303: {45538: {}}}