# Lab 1 playground
Below are a few examples you can run if you wish! These make use of the `transformers` library.
The examples require about 1 GB of available storage.

In [1]:
# install requirements:
# %pip install transformers
# %pip install torch

### GPT

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load a small causal language model and its matching tokenizer.
model_id = "EleutherAI/gpt-neo-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

print(f"{model_id}'s vocab size is {model.config.vocab_size}")

query = "We study because"
# this encodes the query according to the model's setup and vocabulary
encoded_text = tokenizer(query, return_tensors="pt")
print(encoded_text)  # notice the input_ids, which are the token indices in the vocab

# Inspect the first token id of the query. It is read from the encoding rather
# than hard-coded, so the printout stays correct if `query` is changed.
first_token_id = encoded_text["input_ids"][0, 0].item()
print(f"Inspect a vocab id {first_token_id}: {tokenizer.decode(first_token_id)}")

with torch.no_grad():  # disable gradient calculation for inference
    outputs = model(**encoded_text)
# Last sequence in the batch, last position in that sequence, all vocab logits (:).
next_token = outputs.logits[-1, -1, :]
print("The outputs of the next token will be a distribution over the entire vocabulary")
print(f"Output: {next_token.shape[0]} == Vocab: {model.config.vocab_size}")

# Turn the logits into probabilities; -1 means last axis, or the vocab axis.
probdist = torch.softmax(next_token, -1)

# Keep only the k most probable candidate tokens.
k_best = 5
topk_next_tokens = torch.topk(probdist, k_best)

print("-" * 50)
print(query)
for idx, prob in zip(topk_next_tokens.indices, topk_next_tokens.values):
    # strip() drops surrounding whitespace from the decoded token for display.
    print(f"- {tokenizer.decode(idx).strip()} (prob {prob.item():.4f})")

EleutherAI/gpt-neo-125m's vocab size is 50257
{'input_ids': tensor([[1135, 2050,  780]]), 'attention_mask': tensor([[1, 1, 1]])}
Inspect a vocab id 1135: We
The outputs of the next token will be a distribution over the entire vocabulary
Output: 50257 == Vocab: 50257
--------------------------------------------------
We study because
- of (prob 0.2096)
- it (prob 0.1508)
- we (prob 0.0938)
- the (prob 0.0833)
- they (prob 0.0553)


In [7]:
# it can also be used in a simple pipeline:
from transformers import pipeline

# Builds a text-generation pipeline for the same checkpoint id that the GPT cell
# stores in `model_id`. `pred` calls it with a prompt string and reads the
# "generated_text" entry of the first returned item.
gpt_pipe = pipeline("text-generation", model="EleutherAI/gpt-neo-125m");

### Naive prompting

In [11]:
def pred(text, max_tok=100):
    """Generate one continuation of ``text`` with ``gpt_pipe`` and print it.

    Sampling is enabled (``do_sample=True``), so repeated calls with the same
    prompt can print different text.

    Args:
        text: Prompt string handed to the text-generation pipeline.
        max_tok: Value forwarded to the pipeline as ``max_length`` (default 100).

    Returns:
        None. The generated text of the single returned sequence is printed.
    """
    out = gpt_pipe(
        text,
        truncation=True,
        max_length=max_tok,
        num_return_sequences=1,
        do_sample=True,
        temperature=1.0,
        top_k=50,
        top_p=0.95,
        no_repeat_ngram_size=2,
        num_beams=2,
        early_stopping=True,
        length_penalty=0.5,
        use_cache=True,
        # Pad with the tokenizer's own end-of-sequence id instead of a hard-coded
        # 50256, so the helper keeps working if `gpt_pipe` wraps another checkpoint.
        pad_token_id=gpt_pipe.tokenizer.eos_token_id,
    )
    print(out[0]["generated_text"])


pred("How do I make waffles?")

How do I make waffles?

Waffles are made of flour, sugar, and salt. The ingredients in the batter are the same as those in a waffle iron, but the ingredients are different. In the recipe, I’m going to use sugar. I use a teaspoonful of sugar for the dough. If you want to make your own, you can use powdered sugar instead of the sugar you use when making your batter. You can also use the powdered ingredient in your


### Better prompting:

In [9]:
# Phrase the prompt as the opening of a recipe (title plus an "Ingredients:" line)
# instead of asking a question, so the model is continuing a document.
pred("Recipe for delicious waffles. Ingredients:\n")

Recipe for delicious waffles. Ingredients:

1/2 cup (1 stick) unsalted butter, softened
(about 1/4 cup)
¼ cup plus 2 tablespoons brown sugar
or 1 teaspoon vanilla extract
and 1 cup confectioners' sugar, plus more for sprinkling on top of the batter.
Preheat the oven to 350°F. Place the butter in a bowl and set aside. In a large bowl, whisk together the melted butter and the brown


### A BERT masked example

In [10]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

import logging
# avoid transformers warnings: only ERROR and above pass the "transformers" logger
logging.getLogger("transformers").setLevel(logging.ERROR)

bert_model_id = "bert-base-cased"
# NOTE(review): these two assignments rebind the `tokenizer` and `model` names that
# the GPT cell also assigns; after this cell runs they refer to the BERT objects.
tokenizer = BertTokenizer.from_pretrained(bert_model_id)
model = BertForMaskedLM.from_pretrained(bert_model_id)

# The tokenizer's mask token string; `bert` substitutes it for "_" in its input.
MASK = tokenizer.mask_token
def bert(text, top_k=5):
    """Print the most likely fillers for the first "_" blank in ``text``.

    Every "_" in ``text`` is replaced by the tokenizer's mask token, the masked
    sentence is run through the masked-language model, and the ``top_k``
    highest-scoring vocabulary tokens at the first masked position are printed.

    Args:
        text: Sentence containing at least one "_" placeholder.
        top_k: Number of candidate tokens to print (default 5).

    Raises:
        ValueError: If ``text`` contains no "_" placeholder (and no mask token).
    """
    masked_text = text.replace("_", MASK)

    # Tokenize once. The previous encode -> decode -> tokenize -> convert round
    # trip was redundant, and decoding is not guaranteed to reproduce the input
    # text exactly, which could shift the ids fed to the model.
    token_ids = tokenizer.encode(masked_text)
    mask_id = tokenizer.mask_token_id
    if mask_id not in token_ids:
        raise ValueError('bert() needs a "_" placeholder marking the word to predict')
    mask_index = token_ids.index(mask_id)  # only the first blank is predicted

    input_ids = torch.tensor(token_ids).unsqueeze(0)  # add a batch axis of size 1

    with torch.no_grad():
        predictions = model(input_ids)[0]

    top_k_indices = torch.topk(predictions[0, mask_index], k=top_k).indices
    # Convert to a plain list of ints before looking the tokens up.
    top_k_tokens = tokenizer.convert_ids_to_tokens(top_k_indices.tolist())
    print(masked_text.replace(MASK, "_"))
    for token in top_k_tokens:
        print(f"- {token}")

bert("At NTNU you can study _ engineering")

At NTNU you can study _ engineering
- electrical
- civil
- mechanical
- chemical
- software
