In [1]:
import tiktoken
# Record which tiktoken release this notebook was run with.
print("tiktoken version:", tiktoken.__version__)

titoken version: 0.11.0


In [2]:
# Read the whole story into a single string.
with open("data/the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Sanity check: total size and the first 99 characters.
print("Total number of characters in data:", len(raw_text))
print(raw_text[:99])

Total number of characters in data: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


---
<span style="color:#FF5733">
In this notebook we will create the input–target pair datasets that an LLM is trained on.

By now we know that an LLM predicts one word (token) at a time.

We will now prepare the dataset, wherein we give one word as input and the output (label) is the next word. This output is then 
appended to the original input, and the combined sequence is given as the new input to get the next word.

This is called self-supervised learning, and a model that generates text this way is autoregressive: instead of providing 
labelled data as in supervised learning, the dataset itself is converted into inputs and labels.
</span>

---
<span style="color:#FF5733">
We now initialize the GPT-2 BPE tokenizer.

This will tokenize the input text.

</span>

In [3]:
# Load tiktoken's "gpt2" encoding; it is reused by the cells below.
tokenizer=tiktoken.get_encoding("gpt2")

In [4]:
# Encode the entire text into a list of integer token ids.
encode_text=tokenizer.encode(raw_text)

In [5]:
# Peek at the first ten token ids.
encode_text[:10]

[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138]

In [6]:
# Total number of tokens produced for the text.
print(len(encode_text))

5145


---
<span style="color:#FF5733">
Let's look at an example of how to prepare an input and target pair.

</span>


In [7]:
# Drop the first 50 token ids and keep the remainder as the working sample.

encode_sample = encode_text[50:]

In [8]:
# Number of tokens in one input window (the context size).
context_size = 4

# With a context size of 4, the model looks at a sequence of up to 4 tokens
# to predict the next token in the sequence.
# Example: if the input x is the tokens [1, 2, 3, 4], the target y is the
# same window shifted right by one position: [2, 3, 4, 5].

x = encode_sample[0:context_size]
y = encode_sample[1:1 + context_size]

print(f"x: {x}")
print(f"y:      {y}")  # the targets are x shifted by one position

# How to read the pair:
#   x is the input and y is the target (label).
#   Given the input [290], the expected next token is 4920; that token is
#   appended to the input for the next step.
#   Given the input [290, 4920], the expected next token is 2241, and so on.

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [9]:
# Show how each growing prefix of the sample maps to the token that follows it.

for i in range(1, context_size + 1):
    # Named `context`/`target` rather than `input`/`output` so the built-in
    # input() is not shadowed for the rest of the kernel session.
    context = encode_sample[:i]
    target = encode_sample[i]

    print(f"{context} ---> {target}")

[290] ---> 4920
[290, 4920] ---> 2241
[290, 4920, 2241] ---> 287
[290, 4920, 2241, 287] ---> 257


In [10]:
# Same prefix -> next-token illustration, but decoded back to text so the
# prediction task is readable.

for split in range(1, context_size + 1):
    seen_ids, next_id = encode_sample[:split], encode_sample[split]
    seen_text = tokenizer.decode(seen_ids)
    next_text = tokenizer.decode([next_id])

    print(seen_text, "--->", next_text)

 and --->  established
 and established --->  himself
 and established himself --->  in
 and established himself in --->  a


<span style="color:#FF5733">
This is an example of how to prepare an input and target pair.
We will use PyTorch tensors to make this more performant.

</span>

---

<span style="color:#FF5733">
We will now implement the same idea seen above using a PyTorch
Dataset and DataLoader. They create the same input and label (output/target) tensors.
</span>



In [19]:
from torch.utils.data import Dataset , DataLoader
import torch

In [24]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. A window of ``max_length`` tokens is then
    moved over the token ids in steps of ``stride``; each window becomes one
    input sample and the same window shifted one position to the right
    becomes its target.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token ids.
        max_length: Number of tokens per sample; must be positive.
        stride: Offset between the starts of consecutive windows; must be
            positive.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.

    Note:
        If the text yields ``max_length`` tokens or fewer, no complete
        input/target window exists and the dataset is empty.
    """

    def __init__(self, text , tokenizer , max_length , stride):
        # Fail early with a clear message instead of silently building an
        # empty or malformed dataset (or hitting an opaque range() error).
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        self.input_tensor=[]
        self.output_tensor=[]

        # Tokenize the entire text once; "<|endoftext|>" is allowed as a special token.
        token_ids=tokenizer.encode(text , allowed_special={"<|endoftext|>"})

        # Stop max_length tokens before the end so that every target window
        # (shifted by one) still fits inside token_ids.
        for i in range(0,len(token_ids)-max_length,stride):
            input_chunk=token_ids[i:i+max_length]
            output_chunk=token_ids[i+1:i+max_length+1]
            self.input_tensor.append(torch.tensor(input_chunk))
            self.output_tensor.append(torch.tensor(output_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_tensor)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.input_tensor[idx] , self.output_tensor[idx]

In [25]:
# Smoke test: build a dataset from a short sentence with max_length=4 and stride=4.
test=GPTDatasetV1("I am In the place" , tokenizer ,4,4)

In [27]:
def create_dataloader_v1(text, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over GPTDatasetV1 windows of ``text``.

    Args:
        text: Raw text to tokenize with the "gpt2" tiktoken encoding.
        batch_size: Passed to ``DataLoader`` as ``batch_size``.
        max_length: Window length in tokens, forwarded to ``GPTDatasetV1``.
        stride: Step between windows, forwarded to ``GPTDatasetV1``.
        shuffle: Passed to ``DataLoader`` as ``shuffle``.
        drop_last: Passed to ``DataLoader`` as ``drop_last``.
        num_workers: Passed to ``DataLoader`` as ``num_workers``.

    Returns:
        A ``DataLoader`` wrapping the constructed dataset.
    """
    bpe = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(text, bpe, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)
    

In [38]:
import torch

# Windows of 4 tokens with stride 4, read in order (shuffle=False), 8 per batch.
dataload = create_dataloader_v1(
    raw_text, batch_size=8, max_length=4, stride=4, shuffle=False
)

# Pull just the first batch and show its input and target tensors.
data_iter = iter(dataload)
inputs, outputs = next(data_iter)
print("Inputs:", inputs)
print("Outputs:", outputs)

Inputs: tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Outputs: tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
