<a href="https://colab.research.google.com/github/tim-a-davis/silly_little_language_modeling_thing_at_utd/blob/main/CurtGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# What is a language model

*italicized text*# New Section

In [2]:
#@title Installing dependencies
!pip install -q trl transformers accelerate peft datasets bitsandbytes einops

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.0/118.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [3]:
#@title Imports & setup
import requests
from collections import defaultdict, Counter
import random
import time
import math
from IPython.display import HTML, display, clear_output
import ipywidgets as widgets
from einops import rearrange
import matplotlib.pyplot as plt
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

get_ipython().events.register('pre_run_cell', set_css)

In [4]:
class TrigramModel:
    def __init__(self, url):
        self.trigram_freq = defaultdict(Counter)
        self._train(url)

    def _train(self, url):
        r = requests.get(url)
        text = r.text.lower().split()

        # Create trigrams
        for i in range(len(text) - 2):
            trigram = (text[i], text[i + 1], text[i + 2])
            self.trigram_freq[(trigram[0], trigram[1])][trigram[2]] += 1

    def _get_weighted_random_word(self, counter):
        total = sum(counter.values())
        random_choice = random.randint(1, total)

        for word, freq in counter.items():
            random_choice -= freq
            if random_choice <= 0:
                return word

    def predict(self, text, n_words):
        words = text.lower().split()
        output = words.copy()

        for _ in range(n_words):
            last_bigram = tuple(output[-2:])
            if last_bigram in self.trigram_freq:
                next_word = self._get_weighted_random_word(
                    self.trigram_freq[last_bigram]
                )
                output.append(next_word)
            else:
                break

        return " ".join(output)

    def get_frequencies_of_bigram(self, text):
        words = text.lower().split()
        bigram = tuple(words[-2:])
        return bigram, self.trigram_freq[bigram]


In [5]:
model = TrigramModel("http://gutenberg.net.au/ebooks06/0608511.txt")

In [6]:
prompt = "as it started to sway, the master-at-arms"
n_words = 50  # Number of words ahead to predict

prediction = model.predict(prompt, n_words)
for i, letter in enumerate(prediction):
    if not i % 100: print("\n")
    print(letter, end='', flush=True)
    time.sleep(0.003)



as it started to sway, the master-at-arms was a bachelor of forty or thereabouts, a little inclined 

to give up in obedience to captain vere. without movement, he lay near that end of the smokers on th

at primitive basis. small wonder then that the other men when ranged before him showed to ill advant

age after billy, or

In [7]:
model.get_frequencies_of_bigram(prompt)

(('the', 'master-at-arms'),
 Counter({'of': 1,
          'was': 4,
          'has': 1,
          'in': 1,
          'noticed': 1,
          'that': 1,
          'never': 1,
          'being': 1,
          'acted': 1,
          'about': 1,
          'said.': 1,
          'said': 1,
          'as': 1,
          'and': 1}))

![overly-complicated-diagram](http://www.phon.ox.ac.uk/jcoleman/old_SLP/Lecture_6/figure7-8.png)

http://www.phon.ox.ac.uk/jcoleman/old_SLP/Lecture_6/trigram-modelling.html

In [8]:
torch.set_default_device('cuda')
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", trust_remote_code=True, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True, torch_dtype="auto")

Downloading (…)lve/main/config.json:   0%|          | 0.00/880 [00:00<?, ?B/s]

Downloading (…)former_sequential.py:   0%|          | 0.00/2.23k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-1_5:
- configuration_mixformer_sequential.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)former_sequential.py:   0%|          | 0.00/32.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-1_5:
- modeling_mixformer_sequential.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading pytorch_model.bin:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [18]:
prompt = "For both pretraining and finetuning, we concat"
prompt = "as it started to sway, the master-at-arms"
inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
print(inputs)


{'input_ids': tensor([[  292,   340,  2067,   284, 20009,    11,   262,  4958,    12,   265,
            12,  8357]], device='cuda:0')}


In [10]:
for token_id in inputs["input_ids"][0]:
    id = token_id.item()
    token = tokenizer.decode(id)
    print(f"{id: <5} ----> {token}")

292   ----> as
340   ---->  it
2067  ---->  started
284   ---->  to
20009 ---->  sway
11    ----> ,
262   ---->  the
4958  ---->  master
12    ----> -
265   ----> at
12    ----> -
8357  ----> arms


In [11]:
outputs = model.generate(**inputs, max_new_tokens=11)
output_tokens = [tokenizer.decode(id) for id in outputs[0]]

In [13]:
#@title Helper function for printing token ids and tokens
def print_tokens(ids, tokens, line_size=25):
    tokens = [token.replace(" ", "·") for token in tokens]
    def chunk_list(lst, max_size):
        for i in range(0, len(lst), max_size):
            yield lst[i:i + max_size]
    id_chunks = list(chunk_list(ids, line_size))
    token_chunks = list(chunk_list(tokens, line_size))
    for ids, tokens in zip(id_chunks, token_chunks):
        max_widths = [max(len(str(id)), len(token)) for id, token in zip(ids, tokens)]
        aligned_ids = [str(id).center(max_widths[i]) for i, id in enumerate(ids)]
        aligned_arrows = ['↓'.center(max_widths[i]) for i in range(len(ids))]
        aligned_tokens = [token.center(max_widths[i]) for i, token in enumerate(tokens)]
        print(' '.join(aligned_ids))
        print(' '.join(aligned_arrows))
        print(repr(' '.join(aligned_tokens))[1:-1])
        print("\n")




In [14]:
print("Output:\n" + "".join(output_tokens) + "\n\nToken Mapping:")
print_tokens(outputs.cpu().tolist()[0], output_tokens)

print("\n\n* The · characters represent spaces in the token")

Output:
as it started to sway, the master-at-arms, a seasoned veteran of the battlefield, stepped forward.

Token Mapping:
292 340   2067   284 20009 11 262    4958  12 265 12 8357 11 257   29314     9298   286 262     13480     11  10764     2651   13
 ↓   ↓     ↓      ↓    ↓   ↓   ↓      ↓    ↓   ↓  ↓   ↓   ↓   ↓      ↓        ↓      ↓   ↓        ↓       ↓     ↓        ↓     ↓ 
 as ·it ·started ·to ·sway ,  ·the ·master -   at -  arms ,   ·a ·seasoned ·veteran ·of ·the ·battlefield ,  ·stepped ·forward . 




* The · characters represent spaces in the token


In [15]:
single_forward_pass = model.forward(**inputs)

print(f"Shape of outputs: {single_forward_pass.logits.shape}\n\n")

top_10_token_ids = single_forward_pass.logits[0, -1, :].cpu().argsort().tolist()[-10:][::-1]
top_10_tokens = [tokenizer.decode(token) for token in top_10_token_ids]
top_10_probs = single_forward_pass.logits[0, -1, :].cpu()[top_10_token_ids]

print("Top 10 next possible tokens given our input:\n")
print("token        ~probability")
print("-"*25)
for token, prob in zip(top_10_tokens, top_10_probs):
    print(f"{repr(token)[1:-1]: <7} ----> {prob: >11}")

Shape of outputs: torch.Size([1, 12, 51200])


Top 10 next possible tokens given our input:

token        ~probability
-------------------------
,       ---->    16.65625
 quickly ---->     16.4375
 knew   ---->   16.296875
 and    ---->     16.1875
 couldn ---->     16.1875
 swiftly ---->    16.03125
 skill  ---->  15.9140625
 decided ---->   15.828125
 took   ---->  15.7578125
 of     ---->   15.640625


In [16]:
#@title Helper functions for getting and displaying attention weights

def get_attn_weights(inputs, layer, head):
    x = model.layers[0](**inputs)
    for i in range(1, layer):
        x = model.layers[i](x)
    x = model.layers[layer].ln(x)
    model.layers[layer].mixer
    qkv = model.layers[layer].mixer.Wqkv(x)
    qkv = rearrange(qkv, "... (three h d) -> ... three h d", three=3, d=model.layers[layer].mixer.head_dim)
    qkv = model.layers[layer].mixer.rotary_emb(qkv)
    batch_size, seqlen = qkv.shape[0], qkv.shape[1]
    q, k, v = qkv.unbind(dim=2)
    softmax_scale = 1.0 / math.sqrt(q.shape[-1])
    scores = torch.einsum('bthd,bshd->bhts', q, k * softmax_scale)
    causal_mask = torch.triu(torch.full(size=(seqlen, seqlen), fill_value=-10000.0, device=scores.device), 1)
    scores = scores + causal_mask.to(dtype=scores.dtype)
    attention = torch.softmax(scores, dim=-1, dtype=v.dtype)
    output = torch.einsum('bhts,bshd->bthd', attention, v)
    weights = attention[0, head].cpu()
    return weights


def display_attention_weights(inputs, layer, head, token_idx):
    input_tokens = [tokenizer.decode(id) for id in inputs["input_ids"][0]]
    weights = get_attn_weights(inputs, layer, head)
    with out:
        fig, ax = plt.subplots(figsize=(3, 1*(len(input_tokens)//4)))
        ax.axis('off')
        tl = len(input_tokens)
        ax.set_ylim(0, len(input_tokens))
        ax.set_xlim(0, 10)
        for i, token in enumerate(input_tokens):
            ax.text(3, len(input_tokens)-i, token, ha='right', va='top')
            ax.text(8, len(input_tokens)-i, token, ha='left', va='top')
        ax.fill_between([0, 3.3], [tl-token_idx, tl-token_idx], [tl-token_idx-0.75, tl-token_idx-0.75], color='blue', alpha=0.4)
        for i, weight in enumerate(weights[token_idx].cpu().tolist()):
            ax.fill_between([7.7, 13], [tl-i, tl-i], [tl-i-0.75, tl-i-0.75], color='blue', alpha=math.sqrt(weight)*0.7)
            ax.plot([3.35, 7.65], [tl-token_idx - 0.375, tl-i], c="blue", alpha=math.sqrt(weight)*0.7, lw=0.5)
        out.clear_output()
        plt.show()


def handler(_):
    display_attention_weights(inputs, layer.value, head.value, token_idx.value)


In [19]:
#@title Select the Layer, Attention Head, and Token to view the attention weights
layer = widgets.Dropdown(options=list(range(1, 24)), description="Layer")
head = widgets.Dropdown(options=list(range(0, 32)), description="Attn Head:")
token_idx = widgets.Dropdown(options=list(zip([tokenizer.decode(id) for id in inputs["input_ids"][0]], list(range(len(inputs["input_ids"][0]))))), description="Token:")
button = widgets.Button(description="Plot")
button.on_click(handler)

out = widgets.Output()

display(layer, head, token_idx, button)
display(out)

Dropdown(description='Layer', options=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, …

Dropdown(description='Attn Head:', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, …

Dropdown(description='Token:', options=(('as', 0), (' it', 1), (' started', 2), (' to', 3), (' sway', 4), (','…

Button(description='Plot', style=ButtonStyle())

Output()

In [20]:
layer = 22

x = model.layers[0](**inputs)
for i in range(1, layer):
    x = model.layers[i](x)
x = model.layers[layer].ln(x)
x = model.layers[layer].mixer(x)

In [31]:
x[0, -1].tolist()

fig, ax = plt.subplots()

[0.1776123046875,
 0.58203125,
 0.1805419921875,
 -0.0227813720703125,
 -0.57861328125,
 -0.00258636474609375,
 0.321533203125,
 0.11102294921875,
 -0.00759124755859375,
 -0.19580078125,
 -0.08184814453125,
 0.2042236328125,
 -0.0020599365234375,
 0.027435302734375,
 -0.057098388671875,
 0.45068359375,
 0.069091796875,
 -0.392822265625,
 0.383056640625,
 0.2366943359375,
 0.2381591796875,
 0.39208984375,
 0.432373046875,
 0.46630859375,
 0.4931640625,
 0.14404296875,
 -0.642578125,
 -0.0535888671875,
 -0.2054443359375,
 -0.5927734375,
 -0.77978515625,
 -0.482421875,
 0.11041259765625,
 0.36669921875,
 0.1290283203125,
 -0.1754150390625,
 0.08929443359375,
 0.486083984375,
 0.099609375,
 0.08172607421875,
 0.2353515625,
 -0.0765380859375,
 -0.462890625,
 0.029052734375,
 0.163330078125,
 0.322021484375,
 0.036834716796875,
 0.796875,
 0.41650390625,
 -0.52783203125,
 0.250244140625,
 0.07379150390625,
 -0.412109375,
 -0.280029296875,
 0.27978515625,
 0.12548828125,
 -0.236572265625,
 0.

torch.Size([1, 12])