<a href="https://colab.research.google.com/github/vektor8891/llm/blob/main/projects/10_gpt/10_gpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !pip install torchtext==0.15.1
# !pip install portalocker

# Text pipeline
## Dataset

In [2]:
from torchtext.datasets import IMDB

# Load the dataset
train_iter, val_iter = IMDB()

In [3]:
data_itr=iter(train_iter)
# retrieving the third first record
next(data_itr)
next(data_itr)
next(data_itr)

(1,
 "If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.<br /><br />One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).<br /><br />One might better spend one's time staring out a window at a tree growing.<br /><br />")

In [8]:
import torch

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cpu')

## Preprocessing data

In [5]:
# Define special symbols and indices
UNK_IDX, PAD_IDX, EOS_IDX = 0, 1, 2
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<|endoftext|>' ]

In [6]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer("basic_english")

In [7]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(data_iter):

    for _,data_sample in data_iter:
        yield  tokenizer(data_sample)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=special_symbols, special_first=True)
vocab.set_default_index(UNK_IDX)



###  Text to index and index to Text

In [9]:
text_to_index=lambda text: [vocab(token) for token in tokenizer(text)]
index_to_en = lambda seq_en: " ".join([vocab.get_itos()[index] for index in seq_en])

In [10]:
#check
index_to_en(torch.tensor([0,1,2]))

'<unk> <pad> <|endoftext|>'

### Collate function

In [11]:
def get_sample(block_size, text):
    # Determine the length of the input text
    sample_leg = len(text)
    # Calculate the stopping point for randomly selecting a sample
    # This ensures the selected sample doesn't exceed the text length
    random_sample_stop = sample_leg - block_size


    # Check if a random sample can be taken (if the text is longer than block_size)
    if random_sample_stop >= 1:
        # Randomly select a starting point for the sample
        random_start = torch.randint(low=0, high=random_sample_stop, size=(1,)).item()
        # Define the endpoint of the sample
        stop = random_start + block_size

        # Create the input and target sequences
        src_sequence = text[random_start:stop]
        tgt_sequence= text[random_start + 1:stop + 1]

    # Handle the case where the text length is exactly equal or less the block size
    elif random_sample_stop <= 0:
        # Start from the beginning and use the entire text
        random_start = 0
        stop = sample_leg
        src_sequence= text[random_start:stop]
        tgt_sequence = text[random_start + 1:stop]
        # Append an empty string to maintain sequence alignment
        tgt_sequence.append( '<|endoftext|>')

    return src_sequence, tgt_sequence

In [12]:
BATCH_SIZE=1

batch_of_tokens=[]

for i in range(BATCH_SIZE):
  _,text =next(iter(train_iter))
  batch_of_tokens.append(tokenizer(text))

In [13]:
text=batch_of_tokens[0][0:100]
text[0:100]
batch_of_tokens

[['i',
  'rented',
  'i',
  'am',
  'curious-yellow',
  'from',
  'my',
  'video',
  'store',
  'because',
  'of',
  'all',
  'the',
  'controversy',
  'that',
  'surrounded',
  'it',
  'when',
  'it',
  'was',
  'first',
  'released',
  'in',
  '1967',
  '.',
  'i',
  'also',
  'heard',
  'that',
  'at',
  'first',
  'it',
  'was',
  'seized',
  'by',
  'u',
  '.',
  's',
  '.',
  'customs',
  'if',
  'it',
  'ever',
  'tried',
  'to',
  'enter',
  'this',
  'country',
  ',',
  'therefore',
  'being',
  'a',
  'fan',
  'of',
  'films',
  'considered',
  'controversial',
  'i',
  'really',
  'had',
  'to',
  'see',
  'this',
  'for',
  'myself',
  '.',
  'the',
  'plot',
  'is',
  'centered',
  'around',
  'a',
  'young',
  'swedish',
  'drama',
  'student',
  'named',
  'lena',
  'who',
  'wants',
  'to',
  'learn',
  'everything',
  'she',
  'can',
  'about',
  'life',
  '.',
  'in',
  'particular',
  'she',
  'wants',
  'to',
  'focus',
  'her',
  'attentions',
  'to',
  'making',
 

In [14]:
block_size=10
src_sequences, tgt_sequence=get_sample( block_size, text)

In [15]:
print("src: ",src_sequences)
print("tgt: ",tgt_sequence)

src:  ['had', 'to', 'see', 'this', 'for', 'myself', '.', 'the', 'plot', 'is']
tgt:  ['to', 'see', 'this', 'for', 'myself', '.', 'the', 'plot', 'is', 'centered']


In [16]:
# Initialize empty lists to store source and target sequences
src_batch, tgt_batch = [], []

# Define the batch size
BATCH_SIZE = 2

# Loop to create batches of source and target sequences
for i in range(BATCH_SIZE):
    # Retrieve the next data point from the training iterator
    _,text = next(iter(train_iter))

    # Generate source and target sequences using the get_sample function
    src_sequence_text, tgt_sequence_text = get_sample(block_size, tokenizer(text))

    # Convert source and target sequences to tokenized vocabulary indices
    src_sequence_indices = vocab(src_sequence_text)
    tgt_sequence_indices = vocab(tgt_sequence_text)

    # Convert the sequences to PyTorch tensors with dtype int64
    src_sequence = torch.tensor(src_sequence_indices, dtype=torch.int64)
    tgt_sequence = torch.tensor(tgt_sequence_indices, dtype=torch.int64)

    # Append the source and target sequences to their respective batches
    src_batch.append(src_sequence)
    tgt_batch.append(tgt_sequence)

    # Print the output for every 2nd sample (adjust as needed)
    print(f"Sample {i}:")
    print("Source Sequence (Text):", src_sequence_text)
    print("Source Sequence (Indices):", src_sequence_indices)
    print("Source Sequence (Shape):", src_sequence.shape)
    print("Target Sequence (Text):", tgt_sequence_text)
    print("Target Sequence (Indices):", tgt_sequence_indices)
    print("Target Sequence (Shape):", tgt_sequence.shape)

Sample 0:
Source Sequence (Text): ['the', 'fact', 'that', 'any', 'sex', 'shown', 'in', 'the', 'film', 'is']
Source Sequence (Indices): [4, 198, 16, 93, 338, 693, 14, 4, 25, 11]
Source Sequence (Shape): torch.Size([10])
Target Sequence (Text): ['fact', 'that', 'any', 'sex', 'shown', 'in', 'the', 'film', 'is', 'shown']
Target Sequence (Indices): [198, 16, 93, 338, 693, 14, 4, 25, 11, 693]
Target Sequence (Shape): torch.Size([10])
Sample 1:
Source Sequence (Text): ['politicians', 'and', 'ordinary', 'denizens', 'of', 'stockholm', 'about', 'their', 'opinions', 'on']
Source Sequence (Indices): [7457, 7, 2318, 29828, 9, 16111, 52, 80, 4554, 28]
Source Sequence (Shape): torch.Size([10])
Target Sequence (Text): ['and', 'ordinary', 'denizens', 'of', 'stockholm', 'about', 'their', 'opinions', 'on', 'politics']
Target Sequence (Indices): [7, 2318, 29828, 9, 16111, 52, 80, 4554, 28, 2407]
Target Sequence (Shape): torch.Size([10])


In [18]:
from torch.nn.utils.rnn import pad_sequence

BLOCK_SIZE=30
def collate_batch(batch):
    src_batch, tgt_batch = [], []
    for _,_textt in batch:
      src_sequence,tgt_sequence=get_sample(BLOCK_SIZE,tokenizer(_textt))
      src_sequence=vocab(src_sequence)
      tgt_sequence=vocab(tgt_sequence)
      src_sequence= torch.tensor(src_sequence, dtype=torch.int64)
      tgt_sequence = torch.tensor(tgt_sequence, dtype=torch.int64)
      src_batch.append(src_sequence)
      tgt_batch.append(tgt_sequence)


    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX, batch_first=False)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX, batch_first=False)

    return src_batch.to(DEVICE), tgt_batch.to(DEVICE)

In [20]:
from torch.utils.data import DataLoader

BATCH_SIZE=1
dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_dataloader= DataLoader(val_iter , batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

### Iterating through data samples