In [3]:
# Read the whole text file into `raw_data`.
# The encoding is given explicitly so the result does not depend on the
# platform's locale default.
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
    raw_data = f.read()

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [4]:
import re

In [6]:
# Tokenize: split on punctuation, double dashes and whitespace. The capture
# group makes re.split keep the delimiters as separate pieces.
# Each piece is stripped; pieces that are empty after stripping are dropped.
process = [
    token
    for token in (piece.strip() for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_data))
    if token
]

In [10]:
# Vocabulary: every distinct token, sorted, mapped to its index in that order.
all_words = sorted(set(process))
vocab_size = len(all_words)
vocab = dict(zip(all_words, range(vocab_size)))


!
"
'


{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '--': 6,
 '.': 7,
 ':': 8,
 ';': 9,
 '?': 10,
 'A': 11,
 'Ah': 12,
 'Among': 13,
 'And': 14,
 'Are': 15,
 'Arrt': 16,
 'As': 17,
 'At': 18,
 'Be': 19,
 'Begin': 20,
 'Burlington': 21,
 'But': 22,
 'By': 23,
 'Carlo': 24,
 'Chicago': 25,
 'Claude': 26,
 'Come': 27,
 'Croft': 28,
 'Destroyed': 29,
 'Devonshire': 30,
 'Don': 31,
 'Dubarry': 32,
 'Emperors': 33,
 'Florence': 34,
 'For': 35,
 'Gallery': 36,
 'Gideon': 37,
 'Gisburn': 38,
 'Gisburns': 39,
 'Grafton': 40,
 'Greek': 41,
 'Grindle': 42,
 'Grindles': 43,
 'HAD': 44,
 'Had': 45,
 'Hang': 46,
 'Has': 47,
 'He': 48,
 'Her': 49,
 'Hermia': 50,
 'His': 51,
 'How': 52,
 'I': 53,
 'If': 54,
 'In': 55,
 'It': 56,
 'Jack': 57,
 'Jove': 58,
 'Just': 59,
 'Lord': 60,
 'Made': 61,
 'Miss': 62,
 'Money': 63,
 'Monte': 64,
 'Moon-dancers': 65,
 'Mr': 66,
 'Mrs': 67,
 'My': 68,
 'Never': 69,
 'No': 70,
 'Now': 71,
 'Nutley': 72,
 'Of': 73,
 'Oh': 74,
 'On': 75,
 'Once': 76,
 'Only': 77,
 '

In [11]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
class SimpleTokenizerV1():
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on punctuation, double dashes and whitespace. Every
    resulting token must already be present in the vocabulary.
    """

    def __init__(self,vocab):
        """Keep the token -> id mapping and build the inverse id -> token mapping."""
        self.str_to_int=vocab
        self.int_to_str={val:key for key,val in vocab.items()}

    def encode(self,text):
        """Split `text` into tokens and return the list of their ids.

        Raises:
            KeyError: if a token is not in the vocabulary.
        """
        # ':' and ';' are part of the delimiter set so that text such as
        # "eyes:" splits into 'eyes' and ':' instead of staying fused as a
        # single token that is missing from the vocabulary.
        process=re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Strip each piece and drop the ones that are empty after stripping.
        process=[item.strip() for item in process if item.strip()]
        return [self.str_to_int[item] for item in process]

    def decode(self,ids):
        """Map ids back to tokens and join them with single spaces."""
        return ' '.join([self.int_to_str[token_id] for token_id in ids])

In [16]:
# Round-trip a sample sentence through the V1 tokenizer.
tokenizer=SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""
ids=tokenizer.encode(text)
# With default notebook settings only the last expression of a cell is
# displayed, so the decoded text is the single result shown here.
tokenizer.decode(ids)

'" It \' s the last he painted , you know , " Mrs . Gisburn said with pardonable pride .'

In [17]:
# Special tokens added to the vocabulary below: "<|endoftext|>", "<|unk|>"

In [22]:
# Append the text-joining token and the unknown-word token after the sorted
# vocabulary, then rebuild the token -> id mapping.
all_tokens = [*sorted(set(process)), "<|endoftext|>", "<|unk|>"]
vocab = dict(zip(all_tokens, range(len(all_tokens))))
len(all_tokens)

1132

In [23]:
class SimpleTokenizerV2():
    """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

    Text is split on punctuation, double dashes and whitespace. Tokens that
    are missing from the vocabulary are encoded as the id of "<|unk|>".
    """

    def __init__(self,vocab):
        """Keep the token -> id mapping and build the inverse id -> token mapping."""
        self.str_to_int=vocab
        self.int_to_str={val:key for key,val in vocab.items()}

    def encode(self,text):
        """Split `text` into tokens and return the list of their ids.

        Raises:
            KeyError: if an unknown token is met and "<|unk|>" itself is not
                in the vocabulary.
        """
        # ':' and ';' are part of the delimiter set so that a known word
        # followed by one of them (e.g. "eyes:") is split into two tokens
        # instead of being encoded as a single unknown token.
        process=re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Strip each piece and drop the ones that are empty after stripping.
        process=[item.strip() for item in process if item.strip()]
        # The "<|unk|>" lookup stays inside the conditional so it is only
        # evaluated when an unknown token actually occurs.
        return [
            self.str_to_int[item] if item in self.str_to_int else self.str_to_int["<|unk|>"]
            for item in process
        ]

    def decode(self,ids):
        """Map ids back to tokens and join them with single spaces."""
        return ' '.join([self.int_to_str[token_id] for token_id in ids])

In [24]:
# Join two sample texts with the "<|endoftext|>" token and round-trip the
# result through the V2 tokenizer.
tokenizer = SimpleTokenizerV2(vocab)
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join([text1, text2])
tokenizer.decode(tokenizer.encode(text))

'<|unk|> , do you like tea ? <|endoftext|> In the sunlit terraces of the <|unk|> .'

In [25]:
# Use an open-source tokenizer library that implements BPE (byte pair encoding)
import tiktoken

In [28]:
# Encode the full text with tiktoken's 'gpt2' encoding and preview the first ids.
tokenizer=tiktoken.get_encoding('gpt2')
enc_text=tokenizer.encode(raw_data)
# With default notebook settings only the last expression of a cell is
# displayed, so the preview slice is the single result shown here.
enc_text[:5]

[40, 367, 2885, 1464, 1807]

In [29]:
import torch
from torch.utils.data import DataLoader,Dataset

In [34]:

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with `tokenizer.encode`. Windows of `max_length`
    ids start every `stride` positions; each target window is the matching
    input window shifted right by one id.
    """

    def __init__(self,txt,tokenizer,max_length,stride):
        token_ids = tokenizer.encode(txt)
        # Stop early enough that the shifted target window still fits.
        window_starts = range(0, len(token_ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.output_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self,id):
        """Return the (input, target) tensor pair stored at index `id`."""
        return self.input_ids[id], self.output_ids[id]

In [39]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over sliding-window (input, target) pairs of `txt`.

    The text is tokenized with tiktoken's 'gpt2' encoding and wrapped in a
    GPTDatasetV1 using `max_length` and `stride`. `batch_size`, `shuffle`,
    `drop_last` and `num_workers` are passed through to DataLoader.
    """
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(
        GPTDatasetV1(txt, tiktoken.get_encoding('gpt2'), max_length, stride),
        **loader_options,
    )
    
    

In [46]:
max_length = 4
# create_dataloader_v1 shuffles by default; seed torch's global RNG first so
# the sampled batch is the same on every Restart & Run All.
torch.manual_seed(123)
dataloader=create_dataloader_v1(raw_data,batch_size=8,max_length=max_length)
next(iter(dataloader))

[tensor([[  314,   550,  3750,   351],
         [  383,  8631,  3872,   373],
         [11061,   340,    11,  3114],
         [ 1092,   517,   621,   611],
         [ 1310,  1165,   881, 40642],
         [ 2612,  4369,    11,   523],
         [  520,  5493,  6776,   878],
         [ 6164,    25,   366, 16773]]),
 tensor([[  550,  3750,   351,   262],
         [ 8631,  3872,   373,    11],
         [  340,    11,  3114,   510],
         [  517,   621,   611,   314],
         [ 1165,   881, 40642,   972],
         [ 4369,    11,   523,   326],
         [ 5493,  6776,   878,   502],
         [   25,   366, 16773,   290]])]

In [47]:
# Token embedding layer: one `output_dim`-sized vector per vocabulary id.
# NOTE(review): 50257 is presumably the size of the 'gpt2' encoding's
# vocabulary; confirm against the tokenizer.
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
# Draw one batch of (inputs, targets) from the loader.
inputs, outputs = next(iter(dataloader))
inputs.shape

torch.Size([8, 4])

In [48]:
# Look up an embedding vector for every token id in the batch.
input_emb = token_embedding_layer(inputs)
input_emb.shape

torch.Size([8, 4, 256])

In [50]:
# Positional embeddings: one vector per position 0..max_length-1.
# Note the naming: `pos_emb` is the embedding layer, while `pos_emb_layer`
# holds the tensor it returns for all positions.
pos_emb = torch.nn.Embedding(max_length, output_dim)
pos_emb_layer = pos_emb(torch.arange(max_length))
pos_emb_layer.shape

torch.Size([4, 256])

In [51]:
# Add the position vectors to the token embeddings; the position tensor is
# broadcast across the batch dimension.
# NOTE(review): the name `input` shadows Python's builtin input(); it is kept
# because later cells may refer to it.
input = torch.add(input_emb, pos_emb_layer)
input.shape

torch.Size([8, 4, 256])