<a href="https://colab.research.google.com/github/tim-a-davis/silly_little_language_modeling_thing_at_utd/blob/main/CurtGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# What is a language model?

*italicized text*

# New Section

In [1]:
import requests
from collections import defaultdict, Counter
import random
import time

In [2]:
from IPython.display import HTML, display

def set_css(info=None):
  """Emit a <style> element that makes <pre> blocks wrap long lines.

  Args:
    info: Execution-info object that IPython hands to `pre_run_cell`
      callbacks. It is not used; the default of None keeps the function
      callable by hand with no arguments.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

# Run set_css before every cell execution so the wrapping rule is emitted
# each time a cell runs.
get_ipython().events.register('pre_run_cell', set_css)

In [3]:
class TrigramModel:
    """Word-level trigram language model built from a text fetched over HTTP.

    Attributes are documented inline in ``__init__``. All text is lower-cased
    and split on whitespace, so punctuation stays attached to words.
    """

    def __init__(self, url):
        """Download the document at ``url`` and count its trigrams.

        Args:
            url: Address of a plain-text document to train on.

        Raises:
            requests.RequestException: If the download fails, times out, or
                the server answers with an HTTP error status.
        """
        # Maps a word pair (w1, w2) to a Counter of the words seen right after it.
        self.trigram_freq = defaultdict(Counter)
        self._train(url)

    def _train(self, url):
        """Fetch ``url`` and add every trigram of its text to ``trigram_freq``."""
        # A timeout keeps a stalled server from hanging the notebook forever.
        r = requests.get(url, timeout=30)
        # Fail loudly instead of silently training on an HTTP error page.
        r.raise_for_status()
        self._count_trigrams(r.text.lower().split())

    def _count_trigrams(self, words):
        """Record one (w1, w2) -> w3 observation per consecutive word triple."""
        for first, second, third in zip(words, words[1:], words[2:]):
            self.trigram_freq[(first, second)][third] += 1

    def _get_weighted_random_word(self, counter):
        """Sample a key of ``counter`` with probability proportional to its count.

        Returns:
            The sampled word, or None when ``counter`` is empty.
        """
        if not counter:
            return None
        candidates = list(counter)
        weights = [counter[word] for word in candidates]
        return random.choices(candidates, weights=weights)[0]

    def predict(self, text, n_words):
        """Extend ``text`` by up to ``n_words`` sampled words.

        Generation stops early as soon as the last two words were never seen
        together in the training text (this includes prompts shorter than two
        words).

        Args:
            text: Prompt; it is lower-cased and split on whitespace.
            n_words: Maximum number of words to append.

        Returns:
            The lower-cased prompt words plus the generated words, joined by
            single spaces.
        """
        output = text.lower().split()

        for _ in range(n_words):
            # .get() never inserts into the defaultdict, and the truthiness
            # check also stops on a context that has no recorded followers.
            followers = self.trigram_freq.get(tuple(output[-2:]))
            if not followers:
                break
            output.append(self._get_weighted_random_word(followers))

        return " ".join(output)

    def get_frequencies_of_bigram(self, text):
        """Return the last two words of ``text`` and their follower counts.

        Args:
            text: Any string; only its final two whitespace-separated words
                (lower-cased) are used.

        Returns:
            A ``(bigram, counter)`` tuple. ``counter`` is the stored Counter
            for a known bigram and a fresh empty Counter for an unknown one.
        """
        bigram = tuple(text.lower().split()[-2:])
        # Indexing the defaultdict here would insert an empty Counter for every
        # unknown bigram that is queried; .get() leaves the model untouched.
        return bigram, self.trigram_freq.get(bigram, Counter())


In [4]:
# Plain-text e-book on gutenberg.net.au used as the training corpus.
CORPUS_URL = "http://gutenberg.net.au/ebooks06/0608511.txt"
model = TrigramModel(CORPUS_URL)

In [5]:
text = "as it started to sway, the master-at-arms"
n_words = 50  # upper bound on how many words the model may append

prediction = model.predict(text, n_words)

# Emit the prediction one character at a time with a short pause between
# characters; a line break is printed before every 100th character,
# including the very first one.
for position, char in enumerate(prediction):
    if position % 100 == 0:
        print("\n")
    print(char, end='', flush=True)
    time.sleep(0.003)



as it started to sway, the master-at-arms of a singing-bird on the victor of waterloo ventures not t

o handle such breadths of heavy canvas as the handsome sailor, merrily joined in; then addressing hi

s messmates exclaimed, "there now, who says that jimmy legs is down on you." "and what," rejoined bi

lly in spilling the soup just when

In [6]:

# Show the prompt's final two words together with the count of every word
# that followed that pair in the training text.
model.get_frequencies_of_bigram(text)

(('the', 'master-at-arms'),
 Counter({'of': 1,
          'was': 4,
          'has': 1,
          'in': 1,
          'noticed': 1,
          'that': 1,
          'never': 1,
          'being': 1,
          'acted': 1,
          'about': 1,
          'said.': 1,
          'said': 1,
          'as': 1,
          'and': 1}))

![overly-complicated-diagram](http://www.phon.ox.ac.uk/jcoleman/old_SLP/Lecture_6/figure7-8.png)

Diagram source: <http://www.phon.ox.ac.uk/jcoleman/old_SLP/Lecture_6/trigram-modelling.html>

In [7]:
!pip install -q trl transformers accelerate peft datasets bitsandbytes einops

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.0/118.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [43]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

In [44]:
# Create new tensors on the GPU by default from here on.
torch.set_default_device('cuda')

PHI_CHECKPOINT = "microsoft/phi-1_5"

# NOTE: this rebinds `model`, replacing the TrigramModel instance created earlier.
# NOTE(review): trust_remote_code=True executes Python code shipped with the
# checkpoint; consider pinning `revision=` to a commit that has been reviewed.
model = AutoModelForCausalLM.from_pretrained(PHI_CHECKPOINT, trust_remote_code=True, torch_dtype="auto")
# torch_dtype selects the dtype of model weights; it means nothing for a
# tokenizer, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(PHI_CHECKPOINT, trust_remote_code=True)

In [153]:
# Tokenize a prompt that stops mid-word; only the token ids are requested
# (no attention mask), returned as PyTorch tensors.
prompt = "For both pretraining and finetuning, we concat"
inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
print(inputs)


{'input_ids': tensor([[ 1890,  1111,  2181, 24674,   290,   957,   316, 46493,    11,   356,
          1673,   265]], device='cuda:0')}


In [154]:
# Print each prompt token id next to the text fragment it decodes to.
for token_id in inputs["input_ids"][0]:
    # .item() turns the 0-d tensor into a plain Python number. The local is
    # deliberately not called `id`, which would shadow the builtin id() for
    # every later cell.
    token_int = token_id.item()
    token = tokenizer.decode(token_int)
    print(f"{token_int: <5} ----> {token}")

1890  ----> For
1111  ---->  both
2181  ---->  pret
24674 ----> raining
290   ---->  and
957   ---->  fin
316   ----> et
46493 ----> uning
11    ----> ,
356   ---->  we
1673  ---->  conc
265   ----> at


In [163]:
# Let the model continue the prompt by at most 20 new tokens.
outputs = model.generate(**inputs, max_new_tokens=20)

# Decode every token of the first (only) sequence on its own so the pieces
# can be lined up with their ids later.
output_tokens = []
for generated_id in outputs[0]:
    output_tokens.append(tokenizer.decode(generated_id))

In [170]:
def print_tokens(ids, tokens):
    """Print token ids above their tokens in aligned, centred columns.

    Three lines are printed: the ids, a row of down arrows, and the tokens.
    Inside tokens, spaces are shown as "·" and characters that ``repr`` would
    escape (for example a newline) are shown in their escaped form, so every
    token stays on a single line.

    Args:
        ids: Sequence of token ids; each is rendered with ``str()``.
        tokens: Token strings, one per id.
    """
    # Escape each token *before* measuring it. Measuring the raw token and
    # escaping afterwards makes escaped tokens wider than their column and
    # shifts every later column out of alignment.
    shown = [repr(token.replace(" ", "·"))[1:-1] for token in tokens]
    id_strs = [str(token_id) for token_id in ids]

    # Each column is as wide as the wider of its id and its displayed token.
    widths = [max(len(id_str), len(token)) for id_str, token in zip(id_strs, shown)]

    for row in (id_strs, ['↓'] * len(widths), shown):
        print(' '.join(cell.center(width) for cell, width in zip(row, widths)))


# First the generated text as one string, then the id-to-token table.
generated_text = "".join(output_tokens)
print(f"Output:\n{generated_text}\n\nToken Mapping:")

generated_ids = outputs.cpu().tolist()[0]
print_tokens(generated_ids, output_tokens)



Output:
For both pretraining and finetuning, we concat the input and output of the encoder and decoder, respectively.

```python


Token Mapping:
1890  1111  2181  24674  290  957  316 46493 11 356  1673 265 262   5128  290    5072  286 262  2207 12342 290  875  12342 11      8148     13 198 198 15506 63 29412  198
 ↓     ↓     ↓      ↓     ↓    ↓    ↓    ↓   ↓   ↓    ↓    ↓   ↓     ↓     ↓      ↓     ↓   ↓    ↓     ↓    ↓    ↓     ↓   ↓        ↓       ↓   ↓   ↓    ↓   ↓    ↓     ↓ 
For  ·both ·pret raining ·and ·fin  et uning ,  ·we ·conc  at ·the ·input ·and ·output ·of ·the ·enc  oder ·and ·dec  oder ,  ·respectively .   \n   \n    ``  `  python  \n 


In [41]:
# Sketch: push an input through the model one layer at a time, stopping just
# before the output head.
# NOTE(review): not runnable as written. `x = ...` binds the Ellipsis
# placeholder, so a real tensor has to be substituted before running.
# NOTE(review): presumably `model.layers` is an indexable sequence laid out as
# [embedding, 24 blocks, head]; TODO confirm against the loaded checkpoint.

# Initialize input tensor `x` (size: [batch_size, sequence_length])
# presumably the token ids, e.g. inputs["input_ids"]; TODO confirm
x = ...

# Forward through Embedding layer
x = model.layers[0](x)

# Forward through ParallelBlock layers
for i in range(1, 25):  # Assuming 24 ParallelBlock layers are there (indices 1..24)
    x = model.layers[i](x)

# Now `x` contains the tensor before it goes into the `CausalLMHead` layer.


In [176]:
# Call the module itself rather than .forward(): nn.Module.__call__ runs any
# registered hooks, which a direct .forward() call silently skips.
# no_grad: only the shape is inspected, so no autograd graph is needed.
with torch.no_grad():
    logits = model(**inputs).logits

# One row of scores per input position, one score per vocabulary entry.
logits.shape

torch.Size([1, 12, 51200])

torch.Size([1, 12])