# Lab 1 playground
Below are a few examples you can run if you wish! These make use of the `transformers` library.
The examples require roughly 1 GB of available storage.

In [None]:
# install requirements:
%pip install transformers
%pip install torch

### GPT

In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "EleutherAI/gpt-neo-125m"
# Load the tokenizer and the causal language model published under the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

print(f"{model_id}'s vocab size is {model.config.vocab_size}")

query = "We study because"
# Tokenize the query with the model's own vocabulary, returning PyTorch tensors ("pt").
encoded_text = tokenizer(query, return_tensors="pt")
# The printed `input_ids` are the vocabulary indices of the tokens in `query`.
print(encoded_text)
print(f"Inspect a vocab id 1135: {tokenizer.decode(1135)}")

# Inference only: run the forward pass without tracking gradients.
with torch.no_grad():
    outputs = model(**encoded_text)

# The two leading indices pick the last batch entry and the last sequence position;
# what remains is one logit per vocabulary entry for the token that would come next.
next_token = outputs.logits[-1, -1]
print("The outputs of the next token will be a distribution over the entire vocabulary")
print(f"Output: {next_token.shape[0]} == Vocab: {model.config.vocab_size}")

# Turn the logits into probabilities by normalizing over the last (vocabulary) axis.
probdist = torch.softmax(next_token, dim=-1)

# Keep only the `k_best` most probable continuations.
k_best = 5
topk_next_tokens = torch.topk(probdist, k_best)

print("-" * 50)
print(query)
for token_id, token_prob in zip(topk_next_tokens.indices, topk_next_tokens.values):
    # Decode the vocabulary index back to text and drop surrounding whitespace for display.
    candidate = tokenizer.decode(token_id).strip()
    print(f"- {candidate} (prob {token_prob.item():.4f})")

EleutherAI/gpt-neo-125m's vocab size is 50257
{'input_ids': tensor([[1135, 2050,  780]]), 'attention_mask': tensor([[1, 1, 1]])}
Inspect a vocab id 1135: We
The outputs of the next token will be a distribution over the entire vocabulary
Output: 50257 == Vocab: 50257
--------------------------------------------------
We study because
- of (prob 0.2096)
- it (prob 0.1508)
- we (prob 0.0938)
- the (prob 0.0833)
- they (prob 0.0553)


In [9]:
# it can also be used in a simple pipeline:
from transformers import pipeline

# Build a text-generation pipeline for the GPT-Neo checkpoint and let it continue a prompt.
gpt_pipe = pipeline(task="text-generation", model="EleutherAI/gpt-neo-125m")
prompt = "This model is pretty bad, but"
output = gpt_pipe(prompt)
# The call result is indexed as a list of dicts; the text lives under "generated_text".
first_result = output[0]
print(first_result["generated_text"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


This model is pretty bad, but I think it's a good one.

I'm not


### A BERT masked example

In [10]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

bert_model_id = "bert-base-cased"
# Load the BERT tokenizer and the masked-language-model head from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(bert_model_id)
model = BertForMaskedLM.from_pretrained(bert_model_id)

# Use the tokenizer's own mask token so the placeholder always matches the vocabulary.
MASK = tokenizer.mask_token
text = f"At NTNU you can study {MASK} engineering"

# Encode the text exactly once. The previous encode -> decode -> tokenize -> ids round trip
# only works when the decoded string happens to re-tokenize to the same ids, which decoding
# does not guarantee in general.
encoded = tokenizer(text, return_tensors="pt")
input_ids = encoded["input_ids"]
# Token strings for the ids that are actually fed to the model (special tokens included),
# so the mask position found below lines up with the model's output positions.
tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
mask_index = tokens.index(MASK)  # raises ValueError if `text` contains no mask token

# Inference only: no gradients needed. Read the logits by name instead of by position.
with torch.no_grad():
    predictions = model(**encoded).logits

# Rank the vocabulary by score at the masked position and keep the five best candidates.
_, top_k_indices = torch.topk(predictions[0, mask_index], k=5)
top_k_tokens = tokenizer.convert_ids_to_tokens(top_k_indices.tolist())

print("-" * 50)
print(text)
for token in top_k_tokens:
    print(f"- {token}")


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


--------------------------------------------------
At NTNU you can study [MASK] engineering
- electrical
- civil
- mechanical
- chemical
- software
