<a href="https://colab.research.google.com/github/taaha3244/LLM-from-scratch/blob/main/LLM_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [2]:
from importlib.metadata import version
import re
import tiktoken
import torch

# Report the installed versions of the two packages, looked up from package metadata.
print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))

torch version: 2.3.0+cu121
tiktoken version: 0.7.0


In [3]:
# Read the whole training text into memory as a single string.
with open("/content/input.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Sanity check: total character count and the first 99 characters.
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
Yo


In [4]:
text = "Hello, world. Is this-- a test?"

# The capture group keeps the separators (punctuation, "--", whitespace) in the output.
pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
# Strip every piece and drop the ones that end up empty.
result = [token for token in map(str.strip, pieces) if token]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [5]:
# Split the full corpus on punctuation, "--" and whitespace, keeping the separators.
raw_pieces = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
# Strip each piece and discard the ones that are empty or whitespace-only.
preprocessed = [token for token in map(str.strip, raw_pieces) if token]
print(preprocessed[:30])
print(len(preprocessed))

['First', 'Citizen:', 'Before', 'we', 'proceed', 'any', 'further', ',', 'hear', 'me', 'speak', '.', 'All:', 'Speak', ',', 'speak', '.', 'First', 'Citizen:', 'You', 'are', 'all', 'resolved', 'rather', 'to', 'die', 'than', 'to', 'famish', '?']
246527


In [6]:
# The sorted set of distinct tokens defines the vocabulary entries.
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

16583


In [7]:
# Each token's id is its position in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))

In [8]:
# Preview the first 51 (token, id) pairs; zip stops as soon as the range runs out.
for i, item in zip(range(51), vocab.items()):
    print(item)

('!', 0)
('&C:', 1)
('&c', 2)
("'", 3)
(',', 4)
('--', 5)
('.', 6)
('3', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('ABHORSON:', 12)
('ABRAHAM:', 13)
('ADRIAN:', 14)
('AEacides', 15)
('AEdile:', 16)
('AEdiles', 17)
('AEneas', 18)
('AEsop', 19)
('ALL', 20)
('ALL:', 21)
('ALONSO:', 22)
('ANGELO:', 23)
('ANNE:', 24)
('ANOTHER:', 25)
('ANTIGONUS:', 26)
('ANTONIO:', 27)
('ARCHBISHOP', 28)
('ARCHIDAMUS:', 29)
('ARIEL:', 30)
('AUFIDIUS:', 31)
('AUMERLE:', 32)
('AUTOLYCUS:', 33)
('Abase', 34)
('Abate', 35)
('Abated', 36)
('Abbot', 37)
('Abbot:', 38)
('Abel', 39)
('Abhorred', 40)
('Abhorson', 41)
('Abides', 42)
('Able', 43)
('About', 44)
('Above', 45)
('Abraham', 46)
('Absolute', 47)
('Accept', 48)
('Accomplish', 49)
('According', 50)


In [9]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on whitespace, ``--`` and a small set of punctuation
    characters. Every resulting token must already be a key of the vocabulary.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and derive the inverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict(zip(vocab.values(), vocab.keys()))

    def encode(self, text):
        """Return the list of vocabulary ids for the tokens found in ``text``.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        tokens = []
        for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
            piece = piece.strip()
            if piece:
                tokens.append(piece)
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Turn ``ids`` back into text.

        Tokens are joined with single spaces, then the whitespace that ends up
        directly in front of the listed punctuation characters is removed.
        """
        spaced = " ".join(self.int_to_str[i] for i in ids)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

In [10]:
# Build the word-level tokenizer from the vocabulary and encode a sample sentence to ids.
tokenizer = SimpleTokenizerV1(vocab)

text = "All men are abhorred in this world"
ids = tokenizer.encode(text)
print(ids)

[110, 10320, 3626, 3090, 9076, 14840, 16422]


In [11]:
# Map the ids back to text.
tokenizer.decode(ids)

'All men are abhorred in this world'

In [12]:
# Round trip: encode the sample sentence and decode the result straight away.
tokenizer.decode(tokenizer.encode(text))

'All men are abhorred in this world'

In [13]:
# Tokenize the corpus again with the same split pattern the tokenizers use.
raw_pieces = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
preprocessed = [token for token in map(str.strip, raw_pieces) if token]

# Sorted unique tokens, with the two special tokens appended at the end.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]

# Each token's id is its position in all_tokens.
vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [14]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Splitting works on whitespace, ``--`` and a small set of punctuation
    characters; the vocabulary is expected to contain a ``<|unk|>`` entry.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and derive the inverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict(zip(vocab.values(), vocab.keys()))

    def encode(self, text):
        """Return the ids for ``text``, using the ``<|unk|>`` id for unknown tokens.

        Raises:
            KeyError: if an unknown token is met and ``<|unk|>`` is itself
                missing from the vocabulary.
        """
        ids = []
        for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Empty and whitespace-only pieces carry no token.
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Turn ``ids`` back into text.

        Tokens are joined with single spaces, then the whitespace that ends up
        directly in front of the listed punctuation characters is removed.
        """
        spaced = " ".join(self.int_to_str[i] for i in ids)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

In [15]:
# Switch to the tokenizer that falls back to <|unk|> for words missing from the vocabulary.
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the moonlit terraces of the palace lover."

# Join the two sample texts with the <|endoftext|> token between them.
text = " <|endoftext|> ".join((text1, text2))

print(text)

Hello, do you like tea? <|endoftext|> In the moonlit terraces of the palace lover.


In [16]:
# Encode the joined sample text; words missing from the vocabulary get the <|unk|> id.
tokenizer.encode(text)

[16584,
 4,
 6503,
 16553,
 9811,
 16584,
 10,
 16583,
 1383,
 14763,
 16584,
 16584,
 11030,
 14763,
 11281,
 9991,
 6]

In [17]:
# Round trip through the tokenizer to see which words were replaced by <|unk|>.
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like <|unk|>? <|endoftext|> In the <|unk|> <|unk|> of the palace lover.'

In [18]:
# `import importlib` alone does not guarantee that the `metadata` submodule is
# loaded, so import the submodule explicitly before calling into it.
import importlib.metadata
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.7.0


In [19]:
# Rebind `tokenizer` to tiktoken's "gpt2" encoding (replaces the word-level tokenizer above).
tokenizer = tiktoken.get_encoding("gpt2")

In [20]:
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."

# "<|endoftext|>" is passed in allowed_special so it is accepted as a special token in the input.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]


In [22]:
# Re-read the corpus from disk so this cell does not depend on earlier cells for raw_text.
with open("/content/input.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Encode the entire corpus with the current tokenizer and report how many ids it produced.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

338025


In [24]:
# Skip the first 500 token ids and work with the remainder; show its first two ids.
enc_sample = enc_text[500:]
enc_sample[:2]

[307, 39497]

In [25]:
# Number of token ids in one input window.
context_size = 10

# y is x shifted by one position: y[k] is the token that follows x[k] in the sample.
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

x: [307, 39497, 286, 14227, 26, 198, 258, 22027, 31025, 11]
y:      [39497, 286, 14227, 26, 198, 258, 22027, 31025, 11, 351]


In [26]:
# For each prefix length i, show the prefix ids and the id that comes right after them.
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "---->", desired)

[307] ----> 39497
[307, 39497] ----> 286
[307, 39497, 286] ----> 14227
[307, 39497, 286, 14227] ----> 26
[307, 39497, 286, 14227, 26] ----> 198
[307, 39497, 286, 14227, 26, 198] ----> 258
[307, 39497, 286, 14227, 26, 198, 258] ----> 22027
[307, 39497, 286, 14227, 26, 198, 258, 22027] ----> 31025
[307, 39497, 286, 14227, 26, 198, 258, 22027, 31025] ----> 11
[307, 39497, 286, 14227, 26, 198, 258, 22027, 31025, 11] ----> 351


In [27]:
# Same prefix -> next-token pairs as above, decoded back to text.
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    # decode takes a list of ids, so the single target id is wrapped in a list.
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 be ---->  barren
 be barren ---->  of
 be barren of ---->  accusations
 be barren of accusations ----> ;
 be barren of accusations; ----> 

 be barren of accusations;
 ----> he
 be barren of accusations;
he ---->  hath
 be barren of accusations;
he hath ---->  faults
 be barren of accusations;
he hath faults ----> ,
 be barren of accusations;
he hath faults, ---->  with


In [28]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Dataset of (input, target) token-id windows cut from a single text.

    Each target window is its input window shifted one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and slice the ids into windows.

        Args:
            txt: Text to tokenize.
            tokenizer: Object whose ``encode(txt, allowed_special=...)`` returns
                the token ids of ``txt``.
            max_length: Number of token ids per window.
            stride: Distance between the starts of consecutive windows.
        """
        # Encode the full text once; "<|endoftext|>" is allowed as a special token.
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Window start offsets. Stopping before len - max_length leaves room
        # for the one-token shift of the target window.
        starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length]) for start in starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1]) for start in starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [29]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of (input, target) token-id windows over ``txt``.

    Args:
        txt: Text to tokenize with tiktoken's "gpt2" encoding.
        batch_size: Number of windows per batch.
        max_length: Number of token ids per window.
        stride: Distance between the starts of consecutive windows.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` over a ``GPTDatasetV1`` built from ``txt``.
    """
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        # Forward the caller's value; it used to be hard-coded to 0, which
        # silently ignored the num_workers argument.
        num_workers=num_workers,
    )

    return dataloader

In [33]:
# stride == max_length, so consecutive input windows do not overlap; shuffle is off to keep corpus order.
dataloader = create_dataloader_v1(raw_text, batch_size=4, max_length=4, stride=4, shuffle=False)

# Pull a single batch by hand: a pair of (inputs, targets) tensors.
data_iter = iter(dataloader)
print(data_iter )
first_batch = next(data_iter)
print(first_batch)

<torch.utils.data.dataloader._SingleProcessDataLoaderIter object at 0x7f65fe37e860>
[tensor([[ 5962, 22307,    25,   198],
        [ 8421,   356,  5120,   597],
        [ 2252,    11,  3285,   502],
        [ 2740,    13,   198,   198]]), tensor([[22307,    25,   198,  8421],
        [  356,  5120,   597,  2252],
        [   11,  3285,   502,  2740],
        [   13,   198,   198,  3237]])]


In [37]:
# Same setup as before but with 8 windows per batch.
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

# Unpack the first batch into its input and target tensors.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[ 5962, 22307,    25,   198],
        [ 8421,   356,  5120,   597],
        [ 2252,    11,  3285,   502],
        [ 2740,    13,   198,   198],
        [ 3237,    25,   198,  5248],
        [  461,    11,  2740,    13],
        [  198,   198,  5962, 22307],
        [   25,   198,  1639,   389]])

Targets:
 tensor([[22307,    25,   198,  8421],
        [  356,  5120,   597,  2252],
        [   11,  3285,   502,  2740],
        [   13,   198,   198,  3237],
        [   25,   198,  5248,   461],
        [   11,  2740,    13,   198],
        [  198,  5962, 22307,    25],
        [  198,  1639,   389,   477]])


In [None]:
!pip install textblob

