<a href="https://colab.research.google.com/github/tim-a-davis/silly_little_language_modeling_thing_at_utd/blob/main/CurtGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# What is a language model

*italicized text*# New Section

In [549]:
import requests
from collections import defaultdict, Counter
import random
import time
import math
from IPython.display import HTML, display, clear_output
import ipywidgets as widgets
from einops import rearrange
import matplotlib.pyplot as plt

In [274]:
def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

get_ipython().events.register('pre_run_cell', set_css)

In [3]:
class TrigramModel:
    def __init__(self, url):
        self.trigram_freq = defaultdict(Counter)
        self._train(url)

    def _train(self, url):
        r = requests.get(url)
        text = r.text.lower().split()

        # Create trigrams
        for i in range(len(text) - 2):
            trigram = (text[i], text[i + 1], text[i + 2])
            self.trigram_freq[(trigram[0], trigram[1])][trigram[2]] += 1

    def _get_weighted_random_word(self, counter):
        total = sum(counter.values())
        random_choice = random.randint(1, total)

        for word, freq in counter.items():
            random_choice -= freq
            if random_choice <= 0:
                return word

    def predict(self, text, n_words):
        words = text.lower().split()
        output = words.copy()

        for _ in range(n_words):
            last_bigram = tuple(output[-2:])
            if last_bigram in self.trigram_freq:
                next_word = self._get_weighted_random_word(
                    self.trigram_freq[last_bigram]
                )
                output.append(next_word)
            else:
                break

        return " ".join(output)

    def get_frequencies_of_bigram(self, text):
        words = text.lower().split()
        bigram = tuple(words[-2:])
        return bigram, self.trigram_freq[bigram]


In [4]:
model = TrigramModel("http://gutenberg.net.au/ebooks06/0608511.txt")

In [5]:
text = "as it started to sway, the master-at-arms"
n_words = 50  # Number of words ahead to predict

prediction = model.predict(text, n_words)
for i, letter in enumerate(prediction):
    if not i % 100: print("\n")
    print(letter, end='', flush=True)
    time.sleep(0.003)



as it started to sway, the master-at-arms of a singing-bird on the victor of waterloo ventures not t

o handle such breadths of heavy canvas as the handsome sailor, merrily joined in; then addressing hi

s messmates exclaimed, "there now, who says that jimmy legs is down on you." "and what," rejoined bi

lly in spilling the soup just when

In [6]:

model.get_frequencies_of_bigram(text)

(('the', 'master-at-arms'),
 Counter({'of': 1,
          'was': 4,
          'has': 1,
          'in': 1,
          'noticed': 1,
          'that': 1,
          'never': 1,
          'being': 1,
          'acted': 1,
          'about': 1,
          'said.': 1,
          'said': 1,
          'as': 1,
          'and': 1}))

![overly-complicated-diagram](http://www.phon.ox.ac.uk/jcoleman/old_SLP/Lecture_6/figure7-8.png)

http://www.phon.ox.ac.uk/jcoleman/old_SLP/Lecture_6/trigram-modelling.html

In [7]:
!pip install -q trl transformers accelerate peft datasets bitsandbytes einops

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.0/118.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [43]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

In [44]:
torch.set_default_device('cuda')
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", trust_remote_code=True, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True, torch_dtype="auto")

In [567]:
prompt = "For both pretraining and finetuning, we concat"
inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
print(inputs)


{'input_ids': tensor([[ 1890,  1111,  2181, 24674,   290,   957,   316, 46493,    11,   356,
          1673,   265]], device='cuda:0')}


In [568]:
for token_id in inputs["input_ids"][0]:
    id = token_id.item()
    token = tokenizer.decode(id)
    print(f"{id: <5} ----> {token}")

1890  ----> For
1111  ---->  both
2181  ---->  pret
24674 ----> raining
290   ---->  and
957   ---->  fin
316   ----> et
46493 ----> uning
11    ----> ,
356   ---->  we
1673  ---->  conc
265   ----> at


In [571]:
outputs = model.generate(**inputs, max_new_tokens=11)
output_tokens = [tokenizer.decode(id) for id in outputs[0]]

In [572]:
def print_tokens(ids, tokens):
    tokens = [token.replace(" ", "·") for token in tokens]
    max_widths = [max(len(str(id)), len(token)) for id, token in zip(ids, tokens)]
    aligned_ids = [str(id).center(max_widths[i]) for i, id in enumerate(ids)]
    aligned_arrows = ['↓'.center(max_widths[i]) for i in range(len(ids))]
    aligned_tokens = [token.center(max_widths[i]) for i, token in enumerate(tokens)]
    print(' '.join(aligned_ids))
    print(' '.join(aligned_arrows))
    print(repr(' '.join(aligned_tokens))[1:-1])


print("Output:\n" + "".join(output_tokens) + "\n\nToken Mapping:")
print_tokens(outputs.cpu().tolist()[0], output_tokens)

print("\n\n* The · characters represent spaces in the token")

Output:
For both pretraining and finetuning, we concat the input and output of the encoder and decoder

Token Mapping:
1890  1111  2181  24674  290  957  316 46493 11 356  1673 265 262   5128  290    5072  286 262  2207 12342 290  875  12342
 ↓     ↓     ↓      ↓     ↓    ↓    ↓    ↓   ↓   ↓    ↓    ↓   ↓     ↓     ↓      ↓     ↓   ↓    ↓     ↓    ↓    ↓     ↓  
For  ·both ·pret raining ·and ·fin  et uning ,  ·we ·conc  at ·the ·input ·and ·output ·of ·the ·enc  oder ·and ·dec  oder


* The · characters represent spaces in the token


In [564]:
single_forward_pass = model.forward(**inputs)
top_10_token_ids = single_forward_pass.logits[0, -1, :].cpu().argsort().tolist()[-10:][::-1]
top_10_tokens = [tokenizer.decode(token) for token in top_10_token_ids]
top_10_probs = single_forward_pass.logits[0, -1, :].cpu()[top_10_token_ids]

print("Top 10 next possible tokens given our input:\n")
print("token        ~probability")
print("-"*25)
for token, prob in zip(top_10_tokens, top_10_probs):
    print(f"{repr(token)[1:-1]: <7} ----> {prob: >11}")

Top 10 next possible tokens given our input:

token        ~probability
-------------------------
 the    ---->     30.1875
 a      ---->    29.96875
 out    ---->     27.8125
 off    ---->   27.359375
 his    ---->   26.234375
 an     ---->   25.484375
 another ---->   25.328125
 two    ---->     24.5625
 shelter ---->   24.546875
 one    ---->   22.765625


In [573]:
# Initialize input tensor `x` (size: [batch_size, sequence_length])


# Forward through Embedding layer
layer = 5
head = 0
token_idx = 16
def get_attn_weights(inputs, layer, head):
    x = model.layers[0](**inputs)
    for i in range(1, layer):
        x = model.layers[i](x)
    x = model.layers[layer].ln(x)
    model.layers[layer].mixer
    qkv = model.layers[layer].mixer.Wqkv(x)
    qkv = rearrange(qkv, "... (three h d) -> ... three h d", three=3, d=model.layers[layer].mixer.head_dim)
    qkv = model.layers[layer].mixer.rotary_emb(qkv)
    batch_size, seqlen = qkv.shape[0], qkv.shape[1]
    q, k, v = qkv.unbind(dim=2)
    softmax_scale = 1.0 / math.sqrt(q.shape[-1])
    scores = torch.einsum('bthd,bshd->bhts', q, k * softmax_scale)
    causal_mask = torch.triu(torch.full(size=(seqlen, seqlen), fill_value=-10000.0, device=scores.device), 1)
    scores = scores + causal_mask.to(dtype=scores.dtype)
    attention = torch.softmax(scores, dim=-1, dtype=v.dtype)
    output = torch.einsum('bhts,bshd->bthd', attention, v)
    weights = attention[0, head].cpu()
    return weights


def display_attention_weights(inputs, layer, head, token_idx):
    input_tokens = [tokenizer.decode(id) for id in inputs["input_ids"][0]]
    weights = get_attn_weights(inputs, layer, head)
    with out:
        fig, ax = plt.subplots(figsize=(3, 1*(len(input_tokens)//4)))
        ax.axis('off')
        tl = len(input_tokens)
        ax.set_ylim(0, len(input_tokens))
        ax.set_xlim(0, 10)
        for i, token in enumerate(input_tokens):
            ax.text(3, len(input_tokens)-i, token, ha='right', va='top')
            ax.text(8, len(input_tokens)-i, token, ha='left', va='top')
        ax.fill_between([0, 3.3], [tl-token_idx, tl-token_idx], [tl-token_idx-0.75, tl-token_idx-0.75], color='blue', alpha=0.4)
        for i, weight in enumerate(weights[token_idx].cpu().tolist()):
            ax.fill_between([7.7, 13], [tl-i, tl-i], [tl-i-0.75, tl-i-0.75], color='blue', alpha=math.sqrt(weight)*0.7)
            ax.plot([3.35, 7.65], [tl-token_idx - 0.375, tl-i], c="blue", alpha=math.sqrt(weight)*0.7, lw=0.5)
        out.clear_output()
        plt.show()


def handler(_):
    display_attention_weights(inputs, layer.value, head.value, token_idx.value)


In [574]:
layer = widgets.Dropdown(options=list(range(1, 24)), description="Layer")
head = widgets.Dropdown(options=list(range(0, 32)), description="Attn Head:")
token_idx = widgets.Dropdown(options=list(zip([tokenizer.decode(id) for id in inputs["input_ids"][0]], list(range(len(inputs["input_ids"][0]))))), description="Token:")
button = widgets.Button(description="Plot")
button.on_click(handler)

out = widgets.Output()

display(layer, head, token_idx, button)
display(out)

Dropdown(description='Layer', options=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, …

Dropdown(description='Attn Head:', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, …

Dropdown(description='Token:', options=(('For', 0), (' both', 1), (' pret', 2), ('raining', 3), (' and', 4), (…

Button(description='Plot', style=ButtonStyle())

Output()

In [449]:
inputs

{'input_ids': tensor([[  464,  2068,  7586, 21831]], device='cuda:0')}

In [411]:
sum(weights[token_idx].cpu().tolist())

1.0002321600914001

In [403]:
weight

0.0

torch.Size([1, 12])