In [1]:
import regex as re

In [17]:
SPECIAL_TOKENS = {
    "<|bos|>": 100000,
    "<|eos|>": 100001,
    "<|endoftext|>": 100002,
}

Special_token_id={v:k for k,v in SPECIAL_TOKENS.items()}

In [43]:
NE_Pattern=re.compile(r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""")
def find_patterns(text):
    return NE_Pattern.findall(text)

In [44]:
special_pattern=re.compile("(" + "|".join(re.escape(t) for t in SPECIAL_TOKENS) + ")")
def find_special_pattern(text):
    return special_pattern.split(text)

In [45]:
def get_stats(ids):
    count={}
    for pair in zip(ids,ids[1:]):
        count[pair]=count.get(pair,0)+1
    return count

In [46]:
def merge_arr(arr,pair,idx):
    new_arr=[]
    i=0
    while(i<len(arr)):
        if i < len(arr)-1 and arr[i]==pair[0] and arr[i+1]==pair[1]:
            new_arr.append(idx)
            i+=2
        else :
            new_arr.append(arr[i])
            i+=1
    return new_arr

In [47]:
def train_bpe(corpus,vocab_size=100000):
    ids=[]
    for text in corpus:
        for part in find_special_pattern(text):
            if part in SPECIAL_TOKENS:
                continue
            else:
                for small_part in find_patterns(part):
                    ids.extend(small_part.encode("utf-8"))
    merges={}
    next_id=256
    while(next_id<vocab_size):
        stats=get_stats(ids)
        if not stats:
            break
        pair = max(stats,key=stats.get)
        merges[pair]=next_id
        ids=merge(ids,pair,next_id)
        next_id+=1

        if next_id%1000==0:
            print(f"trained vocab size: {next_id}")
    return merges
    

In [48]:
def build_vocab(merges):
    vocab={idx:bytes([idx]) for idx in range(256)}
    for (p0,p1),idx in merges.items():
        vocab[idx]=vocab[p0]+vocab[p1]
    return vocab

In [103]:
corpus = [
" hello world ! ",
" hello world ? ",
" hello there ! ",
" hello again , world . ",
" what is your name ? ",
" how does this work ? ",
" why is tokenization important ? ",
" can machines understand language ? ",
" yes , they can learn patterns . ",
" this is a simple example . ",

" numbers like 123 and 456 appear often . ",
" floating point values like 3.14 are common . ",
" percentages like 50 % and 100 % exist . ",
" dates like 2024 - 01 - 01 are used . ",
" time formats like 10 : 30 am appear . ",
" phone numbers like 123 - 456 - 7890 exist . ",
" currency values include $ 100 , € 50 , ₹ 500 . ",
" math expressions like 5 + 3 = 8 work . ",
" equations such as x = y + z appear . ",
" brackets ( ) [ ] { } are symbols . ",

" punctuation includes commas , periods . and semicolons ; ",
" question marks ? and exclamation marks ! matter . ",
" quotes like \" hello \" and ' world ' appear . ",
" ellipsis ... is also punctuation . ",
" dashes - and underscores _ exist . ",
" special characters @ # $ % ^ & * appear . ",
" slashes / and backslashes \\ are used . ",
" pipes | and colons : appear . ",
" comparison operators < > <= >= == != exist . ",
" logical operators && || ! appear . ",

" programming languages include python , java , and c ++ . ",
" python code uses indentation . ",
" loops like for i in range ( 10 ) run . ",
" conditionals use if else statements . ",
" functions return values . ",
" variables store data . ",
" arrays store multiple values . ",
" dictionaries map keys to values . ",
" classes define objects . ",
" methods belong to classes . ",

" machine learning is a subset of ai . ",
" deep learning uses neural networks . ",
" transformers use attention mechanisms . ",
" training involves optimizing weights . ",
" loss functions measure error . ",
" gradients update parameters . ",
" backpropagation computes gradients . ",
" datasets contain many samples . ",
" validation checks performance . ",
" testing evaluates models . ",

" data science combines statistics and coding . ",
" probability theory is important . ",
" random variables follow distributions . ",
" mean , median , and mode are statistics . ",
" variance measures spread . ",
" correlation indicates relationships . ",
" visualization helps understanding . ",
" charts include bar , line , and pie . ",
" dashboards summarize information . ",
" reports communicate results . ",

" web development includes frontend and backend . ",
" html , css , and javascript are used . ",
" apis connect systems . ",
" requests use http methods . ",
" responses include status codes . ",
" servers handle traffic . ",
" databases store persistent data . ",
" sql queries retrieve rows . ",
" nosql systems scale horizontally . ",
" caching improves speed . ",

" cloud computing enables scalability . ",
" distributed systems handle failures . ",
" concurrency allows parallel execution . ",
" threads share memory . ",
" processes isolate execution . ",
" synchronization avoids race conditions . ",
" locks control access . ",
" deadlocks must be avoided . ",
" performance optimization matters . ",
" monitoring tracks metrics . ",

" cybersecurity protects systems . ",
" encryption secures communication . ",
" hashing stores passwords . ",
" authentication verifies identity . ",
" authorization controls access . ",
" vulnerabilities can be exploited . ",
" patches fix security issues . ",
" firewalls filter traffic . ",
" attacks include phishing and malware . ",
" secure coding reduces risk . ",

" software engineering requires planning . ",
" version control uses git . ",
" commits track changes . ",
" branches isolate features . ",
" merges combine work . ",
" conflicts must be resolved . ",
" testing improves reliability . ",
" continuous integration automates checks . ",
" deployment releases software . ",
" maintenance keeps systems running . "
"! ! ! ! ! ! ! ! !!  . . . . . . . . . . . . . . . . .12345678909876543234567 87 654321 234 56789 87 6543 23 4567 8"
" 1234 2 3 56 7 6 32 36 8 8 6 323  ! ! !! ! ! ! ! !!  . . . . . . . ."
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! ! ! !! ! ! ! !! !  > > > >  .... . . . . . . . . . . . .  . . . . .. . . . . . "
]



merges = train_bpe(corpus, vocab_size=285)
vocab = build_vocab(merges)

In [104]:
def encode(text,merge):

    ids=[]
    for part in find_special_pattern(text):
        if part in SPECIAL_TOKENS:
            ids.append(SPECIAL_TOKENS[part])
        else:
            for small_part in find_patterns(part):
                token = list (small_part.encode("utf-8"))
            
                while len (token)>=2:
                    status=get_stats(token)
                    pair=min(status,key = lambda p: merges.get(p,float('inf')))
                    if pair not in merges:
                        break
                    num=merges[pair]
                    token=merge_arr(token,pair,num)
                    ids.extend(token)
    return ids

In [105]:
def decode(ids,vocab):
    out=[]
    for idx in ids:
        if idx in Special_token_id:
            out.append(Special_token_id[idx].encode("utf-8"))
        else:
            out.append(vocab[idx])
    return b"".join(out).decode("utf-8", errors="replace")

In [108]:
text = "<|bos|> hello world  <|eos|>"
ids = encode(text, merges)
print(ids)
print(decode(ids, vocab))

[100000, 32, 104, 101, 108, 281, 32, 119, 269, 108, 100, 100001]
<|bos|> hello world<|eos|>
