In [20]:
! pip3 install tiktoken
! pip3 install striprtf



In [21]:
from striprtf.striprtf import rtf_to_text

# Read the .rtf file verbatim; the markup is kept under its own name so that
# `raw_text` only ever refers to the cleaned text.
with open("/text.rtf", "r", encoding="utf-8") as f:
    rtf_markup = f.read()

# Convert the RTF content to plain text; this is what gets tokenized below.
raw_text = rtf_to_text(rtf_markup)

# Report the size of the cleaned text and show its first 99 characters.
print("Total number of characters (clean): ", len(raw_text))
print("--- Preview ---")
print(raw_text[:99])

Total number of characters (clean):  2982
--- Preview ---
Personal Statement
Engineering the best solutions to complex problems is rarely a matter of raw com


In [22]:
# `import importlib` alone does not guarantee that the `metadata` submodule is
# loaded; import it explicitly so the version lookup does not depend on some
# other package having imported it first.
import importlib.metadata
import tiktoken
print("Tiktoken version: ", importlib.metadata.version('tiktoken'))

Tiktoken version:  0.12.0


In [23]:
# Load tiktoken's "gpt2" encoding; the encode/decode cells below use this object.
tokenizer = tiktoken.get_encoding("gpt2")

In [24]:
# Encode the cleaned text into token ids and print how many tokens it produced.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

639


In [25]:
# Number of tokens in one input window for the small examples that follow.
context_size = 4

In [26]:
# x holds the first `context_size` tokens; y is the same window shifted one
# position to the right, so y[k] is the token that follows x[k] in the text.
x = enc_text[:context_size]
y = enc_text[1:context_size+1]

print(f"x: {x}")
print(f"y:        {y}")

x: [30228, 21983, 198, 13798]
y:        [21983, 198, 13798, 1586]


In [27]:
# Show the next-token prediction pairs hidden in one window: for every prefix
# length i, the first i tokens are the context and token i is the target.
for i in range(1, context_size+1):
  context = enc_text[:i]
  target = enc_text[i]

  print(context, "---->", target)

[30228] ----> 21983
[30228, 21983] ----> 198
[30228, 21983, 198] ----> 13798
[30228, 21983, 198, 13798] ----> 1586


In [29]:
# Same context/target pairs as in the previous cell, but decoded back to text.
for i in range(1, context_size+1):
  context = enc_text[:i]
  target = enc_text[i]

  # decode() takes a sequence of ids, so the single target id is wrapped in a list.
  print(tokenizer.decode(context), "---->", tokenizer.decode([target]))

Personal ---->  Statement
Personal Statement ----> 

Personal Statement
 ----> Engine
Personal Statement
Engine ----> ering


# **Implementing a Data Loader**

In [30]:
from torch.utils.data import Dataset, DataLoader

In [35]:
class GPTDatasetV1(Dataset):
  """Sliding-window next-token dataset built from a single text.

  The text is tokenized once. Every window of `max_length` token ids becomes
  an input sample, and the same window shifted right by one token becomes its
  target. Consecutive windows start `stride` tokens apart.

  Args:
    txt: Text to tokenize.
    tokenizer: Object whose `encode(txt, allowed_special=...)` returns a
      sliceable sequence of token ids.
    max_length: Number of tokens in each input and each target window.
    stride: Offset, in tokens, between the starts of consecutive windows.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    # Tokenize the whole text up front, letting "<|endoftext|>" through as a special token.
    token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

    # Stopping before len(token_ids) - max_length leaves one token after every
    # window, so each shifted target window is also max_length tokens long.
    window_starts = range(0, len(token_ids) - max_length, stride)

    self.input_ids = [
        torch.tensor(token_ids[start:start + max_length])
        for start in window_starts
    ]
    self.target_ids = [
        torch.tensor(token_ids[start + 1:start + max_length + 1])
        for start in window_starts
    ]

  def __len__(self):
    """Return the number of (input, target) windows."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the (input, target) tensors stored at position `idx`."""
    return (self.input_ids[idx], self.target_ids[idx])

In [37]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
  """Build a DataLoader of sliding-window (input, target) token batches.

  Args:
    txt: Text to tokenize with tiktoken's "gpt2" encoding.
    batch_size: Number of windows per batch.
    max_length: Number of tokens per window.
    stride: Offset, in tokens, between the starts of consecutive windows.
    shuffle: Passed through to `DataLoader`.
    drop_last: Passed through to `DataLoader`.
    num_workers: Passed through to `DataLoader`.

  Returns:
    A `DataLoader` over a `GPTDatasetV1` built from `txt`.
  """
  gpt2_tokenizer = tiktoken.get_encoding("gpt2")
  windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

  return DataLoader(
      windows,
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers,
  )

In [44]:
import torch

# Build a loader that yields one (input, target) window pair per batch:
# batch_size=1, 4-token windows, and stride=1 so consecutive windows start one
# token apart. shuffle=False is passed so the windows are not shuffled.
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4,
                                  stride=1, shuffle=False)

# Wrap the loader in a Python iterator so batches can be pulled one at a time with next().
data_iter = iter(dataloader)
first_batch = next(data_iter)

print(first_batch)

[tensor([[30228, 21983,   198, 13798]]), tensor([[21983,   198, 13798,  1586]])]


In [45]:
# Pull the next batch from the same iterator; with stride=1 its input window is
# the first batch's input window shifted by one token.
second_batch = next(data_iter)
print(second_batch)

[tensor([[21983,   198, 13798,  1586]]), tensor([[  198, 13798,  1586,   262]])]


In [49]:
# experiment with batch_size and stride
dataloader = create_dataloader_v1(raw_text, batch_size=8,
                                  max_length=4, stride=4,
                                  shuffle=False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Inputs:\n", inputs)
print("\nTargets:\n",  targets)

Inputs:
 tensor([[30228, 21983,   198, 13798],
        [ 1586,   262,  1266,  8136],
        [  284,  3716,  2761,   318],
        [ 8365,   257,  2300,   286],
        [ 8246, 14492,  1176,   198],
        [17749,    26,   340,   318],
        [ 8793,   416,  8263,   422],
        [  262, 10084, 45207,   286]])

Targets:
 tensor([[21983,   198, 13798,  1586],
        [  262,  1266,  8136,   284],
        [ 3716,  2761,   318,  8365],
        [  257,  2300,   286,  8246],
        [14492,  1176,   198, 17749],
        [   26,   340,   318,  8793],
        [  416,  8263,   422,   262],
        [10084, 45207,   286,   661]])
