# Lab 1 playground
Below are a few examples you can run if you wish! These make use of the `transformers` library.
The examples download pretrained model weights, so they require roughly 1 GB of available storage.

In [None]:
# install requirements:
%pip install transformers
%pip install torch

### GPT

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load a small causal language model together with its matching tokenizer.
model_id = "EleutherAI/gpt-neo-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

print(f"{model_id}'s vocab size is {model.config.vocab_size}")

query = "We study because"
# this encodes the query according to the model's setup and vocabulary
encoded_text = tokenizer(query, return_tensors="pt")
print(encoded_text)  # notice the input_ids, which are the token indices in the vocab

# Decode one id back to text to show the id -> token mapping. The id is taken
# from the encoded query itself (its first token) instead of being hard-coded,
# so this line stays meaningful if `query` is changed.
first_token_id = encoded_text["input_ids"][0, 0].item()
print(f"Inspect a vocab id {first_token_id}: {tokenizer.decode(first_token_id)}")

with torch.no_grad():  # disable gradient calculation for inference
    outputs = model(**encoded_text)

# The logits are indexed as [batch, position, vocab]. We passed a single query,
# so take batch item 0, then the last position (-1) and every vocab entry (:).
# These are unnormalised scores for the token that would follow the query.
next_token_logits = outputs.logits[0, -1, :]
print("The outputs of the next token will be a distribution over the entire vocabulary")
print(f"Output: {next_token_logits.shape[0]} == Vocab: {model.config.vocab_size}")

# Softmax turns the scores into probabilities; -1 means last axis, or the vocab axis.
probdist = torch.softmax(next_token_logits, -1)

k_best = 5
topk_next_tokens = torch.topk(probdist, k_best)

print("-" * 50)
print(query)
# .tolist() yields plain Python ints/floats, which decode() and the format spec accept directly.
for idx, prob in zip(topk_next_tokens.indices.tolist(), topk_next_tokens.values.tolist()):
    print(f"- {tokenizer.decode(idx).strip()} (prob {prob:.4f})")

In [None]:
# it can also be used in a simple pipeline:
from transformers import pipeline, set_seed

# Text generation may sample tokens at random (depending on the model's
# generation defaults), so fix the seed to get the same text on every re-run.
set_seed(42)

gpt_pipe = pipeline("text-generation", model="EleutherAI/gpt-neo-125m")
output = gpt_pipe("This model is pretty bad, but")
# The pipeline returns a list with one dict per generated sequence.
print(output[0]["generated_text"])

### A BERT masked example

In [None]:
import torch  # imported here as well so this cell runs on its own
from transformers import BertTokenizer, BertForMaskedLM

bert_model_id = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(bert_model_id)
model = BertForMaskedLM.from_pretrained(bert_model_id)

MASK = tokenizer.mask_token
text = f"At NTNU you can study {MASK} engineering"

# Tokenize once, straight to tensors; the tokenizer adds the special tokens itself.
encoded = tokenizer(text, return_tensors="pt")
input_ids = encoded["input_ids"]  # one row holding the token ids of `text`

# Find where the mask token ended up after tokenization.
tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
mask_position = tokens.index(MASK)

with torch.no_grad():  # inference only, no gradients needed
    predictions = model(**encoded).logits

# The model returns unnormalised scores (logits). Apply softmax over the vocab
# axis so that the values printed below really are probabilities.
mask_probs = torch.softmax(predictions[0, mask_position], dim=-1)
top_k_values, top_k_indices = torch.topk(mask_probs, k=10)
top_k_tokens = tokenizer.convert_ids_to_tokens(top_k_indices.tolist())

print("-" * 50)
print(text)
for token, score in zip(top_k_tokens, top_k_values):
    print(f"{token} (prob: {score.item():.4f})")
