In [1]:
# Use GPT2LMHeadModel for the language model 
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch

device = torch.device('cuda:0')

Load the pretrained model

model parameters are downloaded from https://huggingface.co/gpt2

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('data_disk/gpt2/')
tokenizer.pad_token = tokenizer.eos_token   #   set the padding token
model = GPT2LMHeadModel.from_pretrained('data_disk/gpt2/', pad_token_id=tokenizer.eos_token_id)

Here is a sample dataset

In [3]:
dataset = [
    'This is a sentence',
    '随便写点什么'
    'GPT-2 is a transformers model pretrained on a very large corpus of', 
    'English data in a self-supervised fashion. This means it was pretrained',
    'on the raw texts only, with no humans labelling them in any way (which is',
    'why it can use lots of publicly available data) with an',
    'automatic process to generate inputs and labels from those texts.',
    'More precisely, it was trained to guess the next word in sentences.'
]

In [4]:
# 这个函数用来在dataloader里面把数据先处理好
def collate_fn(batch: list[str]):
    return tokenizer.batch_encode_plus(
        batch, 
        max_length=512,         # 最多512tokens，多的truncate
        padding=True,           # padding成一样长
        truncation=True,        # 砍掉太长的
        return_tensors='pt',    # 返回pytorch tensor
    )

In [5]:
# init the dataloader
loader = DataLoader(
    dataset=dataset,
    batch_size=8,
    shuffle=True,           # 随机打乱数据集
    collate_fn=collate_fn,  # 把str转换成tensor的函数
    num_workers=4,          # 在模型训练的时候多进程并行处理好数据
    prefetch_factor=2,      # 提前处理好两个batch
)

In [6]:
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)  # 推荐用AdamW优化器，这一步之前可以吧模型 to gpu/tpu
optimizer.zero_grad()

In [7]:
epochs = 200
for e in range(epochs):
    for input in loader:
        # 如果模型在gpu，这里要把input里面的matrix也to gpu
        input['input_ids'] = input['input_ids'].to(device)
        input['attention_mask'] = input['attention_mask'].to(device)

        input['labels'] = input['input_ids']    # 语言模型训练，label就是输入id
        
        output = model(**input)
        loss = output.loss
        loss.backward()  # loss 就是 negative loglikelihood
        optimizer.step()
        optimizer.zero_grad()
        print(loss.item())

7.035940170288086
6.419832706451416
6.076235294342041
5.741185188293457
5.414309501647949
5.130519390106201
4.865398406982422
4.583464622497559
4.283649444580078
3.976851224899292
3.669083595275879
3.3634867668151855
3.0705478191375732
2.8095779418945312
2.5956103801727295
2.4250950813293457
2.284105062484741
2.1639373302459717
2.0620718002319336
1.9774707555770874
1.9081190824508667
1.850902795791626
1.8023258447647095
1.7592434883117676
1.719598412513733
1.6820510625839233
1.6457133293151855
1.610085129737854
1.5749411582946777
1.5401504039764404
1.5056320428848267
1.4713009595870972
1.4371025562286377
1.4029947519302368
1.3689444065093994
1.3349045515060425
1.3007946014404297
1.2665187120437622
1.23199462890625
1.197184681892395
1.1621410846710205
1.1270976066589355
1.0923007726669312
1.0577248334884644
1.0231841802597046
0.9884929060935974
0.9538183212280273
0.9198257923126221
0.8866767287254333
0.8537225127220154
0.8203688859939575
0.7863523364067078
0.751674234867096
0.7164652943