In [2]:
from utils import get_verdict_story

In [2]:
raw_text = get_verdict_story(force_download=True)

Downloading 'The Verdict' short story...
  Trying: https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt
  ✓ Success!
✓ Saved to the_verdict.txt (20479 characters)


In [4]:
with open("./the_verdict.txt","r", encoding="utf-8") as f:
    raw_text = f.read()

### Naive implementation
1. Split raw_text in to tokens
2. Create dict of words to token id. (this is buidl vocabulary)

In [25]:
import re

def split_text_tokens(input_text: str) -> list:
    # Pattern: match words or punctuation, skip whitespace
    pattern = r'\b\w+\b|[^\w\s]'
    tokens = re.findall(pattern, input_text)
    return tokens

def create_vocabulary(tokens) -> dict:
    tokens.sort()
    unique_tokens = set(tokens)
    vocab = {token:integer for integer, token in enumerate(unique_tokens)}
    return vocab



In [29]:
# Test
tokens = split_text_tokens(raw_text)
print(f"First 200 chars: {raw_text[:200]}")
print(f"\nTokens: {tokens[:30]}")
print(f"Total tokens: {len(tokens)}")

vocab = create_vocabulary(tokens)
print(f"\nVocabulary size: {len(vocab)}")
print(f"Gisburn token ID: {vocab['Gisburn']}")
print(f"Gisburn token ID: {vocab['genius']}")

First 200 chars: I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a

Tokens: ['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '-', '-', 'though', 'a', 'good', 'fellow', 'enough', '-', '-', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that']
Total tokens: 4827

Vocabulary size: 1148
Gisburn token ID: 105
Gisburn token ID: 422


In [30]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
                                
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [44]:
tokenizer = SimpleTokenizerV1(vocab)
token_ids = tokenizer.encode("You monumental Lord!!!")
result = tokenizer.decode(token_ids)
result

'You monumental Lord!!!'

In [1]:
import sys
import importlib

print("tiktoken version:", importlib.metadata.version("tiktoken"))

PackageNotFoundError: No package metadata was found for tiktoken

In [None]:
# Check which Python interpreter is being used
import sys
print("Python executable:", sys.executable)
print("Python version:", sys.version)