In [1]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
with open('the-verdict.txt','r') as f:
    raw_text=f.read()

In [9]:
len(raw_text)

20480

In [10]:
import re

In [22]:
all_text=re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed=[i.strip() for i in all_text if i.strip()]

In [24]:
all_words=sorted(set(preprocessed))
vocab_size=len(all_words)
vocab_size

1130

In [26]:
vocab={v:idx for idx,v in enumerate(all_words)}


In [55]:
# Listing 2.3 Implementing a simple text tokenizer
class SimpleTokenizerV1:
    def __init__(self,vocab):
        super(SimpleTokenizerV1).__init__()
        self.str_to_int=vocab
        self.int_to_str={v:k for k,v in vocab.items()}
    def encode(self,text):
        prepro=re.split(r'([,.:;?_!"()\']|--|\s)', text)
        prepro=[item.strip() for item in prepro if item.strip()]
        prepro=[item  if item in self.str_to_int else "<|unk|>" for item in prepro ]
        ids=[self.str_to_int[item] for item in prepro]
        return ids
    def decode(self,ids):
        text=' '.join([self.int_to_str[id] for id in ids])
        text=re.sub(r'\s+([,.?!"()\'])', r'\1', text)  
        return text

In [56]:
tokenizer=SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""

ids=tokenizer.encode(text)
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [57]:
# <|unk|> 和 <|endoftext|>  ["<|endoftext|>", "<|unk|>"]
all_tokens=sorted(set(preprocessed))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
len(all_tokens)

vocab={v:idx for idx,v in enumerate(all_tokens)}
len(vocab.items())
vocab["<|endoftext|>"]

1132

In [59]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text=" <|endoftext|> ".join((text1,text2))
text

tokenizer=SimpleTokenizerV1(vocab)
ids=tokenizer.encode(text)
tokenizer.decode(ids)

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

In [60]:
from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.9.0


In [62]:
tokenizer = tiktoken.get_encoding("gpt2")

In [69]:
with open('the-verdict.txt','r') as f:
    text=f.read()
    
enc_text=tokenizer.encode(text)
len(enc_text)
enc_sample=enc_text[50:]


In [68]:
context_size = 4    
x=enc_sample[:context_size]
y=enc_sample[1:context_size+1]

print(f'x:{x}')
print(f'y:   {y}')

x:[290, 4920, 2241, 287]
y:   [4920, 2241, 287, 257]


In [74]:
for i in range(1,context_size+1):
    context=enc_sample[i]
    desired=enc_sample[1:i]
    print('contex:{0:}---desired:{1:}'.format(tokenizer.decode([context]),tokenizer.decode(desired)))
    

contex: established---desired:
contex: himself---desired: established
contex: in---desired: established himself
contex: a---desired: established himself in


In [75]:
# Listing 2.5 A dataset for batched inputs and targets
import torch
from torch.utils.data import Dataset, DataLoader



In [87]:
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        ids=tokenizer.encode(txt)
        self.x,self.y=[],[]
        
        for i in range(0,len(ids)-max_length,stride):
            context=ids[i:i+max_length]
            desire=ids[i+1:i+max_length+1]
            
            self.x.append(torch.tensor(context))
            self.y.append(torch.tensor(desire))
    def __getitem__(self,idx):
        return self.x[idx],self.y[idx]
    def __len__(self):
        return len(self.x)
            
        

In [88]:
# Listing 2.6 A data loader to generate batches with input-with pairs
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    tokenizer=tiktoken.get_encoding('gpt2')
    dataset=GPTDatasetV1(txt,tokenizer,max_length,stride)
    
    return DataLoader(dataset,batch_size=batch_size,shuffle=True,num_workers=num_workers,drop_last=drop_last)
    

In [89]:
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

In [90]:
for x,y in dataloader:
    print(x)
    print(y)
    break

tensor([[  502,    11, 12704,   257]])
tensor([[   11, 12704,   257,  1310]])


In [91]:
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4)

In [92]:
batch=next(iter(dataloader))
batch

[tensor([[ 257, 9254,  286,  922],
         [4544, 9325,  701,    8],
         [ 607, 5229,  338, 1243],
         [ 606,  477, 1497, 2845],
         [3940,  416,  262, 1807],
         [ 373, 9675,  379,  717],
         [3088,  617,  286,  616],
         [ 290, 8104,  465, 1021]]),
 tensor([[ 9254,   286,   922,    12],
         [ 9325,   701,     8,   373],
         [ 5229,   338,  1243,    13],
         [  477,  1497,  2845,   530],
         [  416,   262,  1807,    25],
         [ 9675,   379,   717,    11],
         [  617,   286,   616, 49025],
         [ 8104,   465,  1021,   319]])]