In [1]:
# bigscience/bloom-560m

In [9]:
import numpy as np

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
from llm_lang.utils import get_dataset, get_tokens, get_token_stats, get_data_column

# Load the evaluation data through the project helper. `lang="all"` presumably
# requests every language split at once — TODO confirm in llm_lang.utils.
ds = get_dataset(lang="all")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [3]:
# Single source of truth for the checkpoint id, so the model and its tokenizer
# are always loaded from the same place.
BLOOM_CHECKPOINT = "bigscience/bloom-560m"

bloom = AutoModelForCausalLM.from_pretrained(BLOOM_CHECKPOINT)
bloom_tokenizer = AutoTokenizer.from_pretrained(BLOOM_CHECKPOINT)

# Setting `bloom_tokenizer.add_bos_token = True` is not supported by this
# tokenizer, so the BOS token is prepended by hand in a later cell.

In [4]:
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
import torch
from torch import nn


# GPT-2 smoke test: load the tokenizer and LM head model, run one forward pass.
# NOTE: `encoded_input` and `output` are reassigned by the Bloom cells further
# down, so nothing later should rely on the GPT-2 values stored here.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add a bos token so we can compute the conditional likelihood of a sentence starting with nothing
tokenizer.add_bos_token = True

model = GPT2LMHeadModel.from_pretrained('gpt2')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')

# Inference only: disable gradient tracking so the forward pass does not keep
# an autograd graph alive for a result that is never back-propagated.
with torch.no_grad():
    output = model(**encoded_input)


In [118]:
# Sentence under test: element 0 of the "eng_Latn" column of `ds`.
# Other columns tried during exploration: "hin_Deva", "ace_Arab", "bel_Cyrl",
# "kat_Geor", "tgl_Latn" — swap the code below to switch language.
input_text = get_data_column(ds, "eng_Latn")[0]
print(input_text)

# Tokenise the sentence and, separately, the BOS marker on its own; a later
# cell joins the two encodings.
encoded_input = bloom_tokenizer(input_text, return_tensors="pt")
bos = bloom_tokenizer(bloom_tokenizer.bos_token, return_tensors="pt")

On Monday, scientists from the Stanford University School of Medicine announced the invention of a new diagnostic tool that can sort cells by type: a tiny printable chip that can be manufactured using standard inkjet printers for possibly about one U.S. cent each.


In [119]:
# Prepend the BOS encoding to the sentence encoding (ids and attention mask).
# The guard keeps this cell idempotent: it mutates `encoded_input` in place, so
# without the check every re-run would stack one more BOS token at the front.
bos_width = bos["input_ids"].shape[1]
# torch.equal is False when shapes differ, which also covers an encoding that
# is shorter than the BOS prefix.
already_prefixed = torch.equal(encoded_input["input_ids"][:, :bos_width], bos["input_ids"])

if not already_prefixed:
    encoded_input["input_ids"] = torch.concat([bos["input_ids"], encoded_input["input_ids"]], dim=1)
    encoded_input["attention_mask"] = torch.concat([bos["attention_mask"], encoded_input["attention_mask"]], dim=1)

encoded_input

{'input_ids': tensor([[     1,   5534,  69629,     15,  92416,   1485,    368, 112640,  13378,
          28087,    461,  78971,  57774,    368,  28595,    461,    267,   2084,
          51130,  22489,    861,   1400,   5322,  13953,   1331,  47054,    267,
          77941,    643, 214564,  53780,    861,   1400,    722, 175640,   3936,
          16577,  44072,   3367,   7380,    525,    613,  48825,   3638,   2592,
            764,     17,     54,     17,   2538,   5546,     17]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]])}

In [121]:
# Teacher-forced scoring: feed the model each prefix of the BOS-prefixed
# sequence and record the log-probability it assigns to the token that actually
# comes next. `seq_len` and `log_prob` are deliberately left as plain
# cell-level names because later cells inspect their final values.
MAX_PREFIX_LEN = 20  # exclusive cap on prefix length; raise it to score more tokens

total_tokens = encoded_input["input_ids"].shape[1]
assert total_tokens > 1, "need at least one token after BOS to score"

# A prefix of length k is scored against the token at index k, so k must stay
# strictly below the sequence length; without this clamp a sentence shorter
# than the cap would index past the end of `input_ids`.
prefix_stop = min(MAX_PREFIX_LEN, total_tokens)

log_probs = []

with torch.no_grad():
    for seq_len in range(1, prefix_stop):
        _encoded_input = dict(
            input_ids=encoded_input["input_ids"][:, :seq_len],
            attention_mask=encoded_input["attention_mask"][:, :seq_len],
        )

        output = bloom(**_encoded_input)

        # Next-token distribution at the last position of the prefix.
        # log_softmax is computed in one numerically stable step; applying
        # softmax and then log() can underflow to -inf for very unlikely tokens.
        log_prob = torch.log_softmax(output.logits[0, -1], dim=-1)

        log_probs.append(log_prob[encoded_input["input_ids"][0, seq_len]].item())

# (probability of the last scored token, mean log-prob, first ten log-probs)
np.exp(log_probs[-1]), np.mean(log_probs), log_probs[:10]

(0.14960450098928835,
 -3.4218738736388716,
 [-8.767157554626465,
  -9.427632331848145,
  -1.0896586179733276,
  -11.044292449951172,
  -2.226774215698242,
  -1.0207033157348633,
  -5.899941444396973,
  -0.7607376575469971,
  -1.7424187660217285,
  -0.014853817410767078])

In [114]:
# Display the list of per-token log-probabilities collected by the scoring loop.
log_probs

[-31.78192901611328, -2.8252975940704346]

In [84]:
# Decode the vocabulary entry with the highest probability in `log_prob`, the
# next-token distribution left over from the last scoring-loop iteration.
best_token_id = int(log_prob.exp().argmax())
bloom_tokenizer.decode([best_token_id])

' sa'

In [86]:
# Turn the first sequence back into text, leaving off its final token id.
ids_without_last = encoded_input["input_ids"][0, :-1]
bloom_tokenizer.decode(ids_without_last)

'<s>Noong Lunes, inanunsiyo ng mga siyentipiko mula sa Stanford University School of Medicine ang imbensyon ng panibagong kagamitan sa pag-diagnose na makakauri sa mga cell ayon sa uri: isang maliit na chip na maaaring maprint na maaaring magawa gamit ang standard inkjet na mga printer at posibleng nasa isang U.S. sentimo kada isa'

In [49]:
# Convert the most recently recorded log-probability back into a probability.
np.exp(log_probs[-1])

0.4882468870905477

In [55]:
# Probability of the last scored token (exp of the final entry in `log_probs`).
np.exp(log_probs[-1])

0.24645011850618206

In [50]:
# Add up the per-token log-probabilities, i.e. the log of their product.
sum(log_probs)

-201.52620968595147

In [56]:
# Total log-probability over every position currently stored in `log_probs`.
sum(log_probs)

-297.2635999247432

In [88]:
# Average log-probability per scored token.
np.mean(log_probs)

-6.631872419901985

In [23]:
# Look up the text for token id 17, the last id in the encoded sentence
# displayed earlier in the notebook.
bloom_tokenizer.decode([17])

'.'

In [57]:
# Token id at index `seq_len` of the first sequence. `seq_len` is not defined
# in this cell: it holds whatever value the scoring loop's variable ended on.
encoded_input["input_ids"][0, seq_len]

tensor(17)