In [None]:
import config

In [None]:
from datasets import load_dataset

# Load the SST-2 dataset from the location configured in `config`.
dataset_path = str(config.SST2_PATH)
ds = load_dataset(dataset_path)

# Keep separate handles on the two splits used in the rest of the notebook.
ds_train = ds['train']
ds_val = ds['validation']

# Preview: the whole dataset object, the train split, one row, and a slice.
for preview in (ds, ds_train, ds_train[6], ds_train[:10]):
    print(preview)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Resolve the pretrained GPT-2 location from the project config, then load
# the causal-LM weights and the tokenizer from that same location.
model_path = config.GPT2_PATH
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Only the text content (`sentence`) is used; the sentiment label is ignored.
def tokenize(batch):
    """Run the module-level ``tokenizer`` over the ``sentence`` field of ``batch``."""
    sentences = batch['sentence']
    return tokenizer(sentences)


# Options shared by both `map` calls: process rows in batches of 512 and
# drop the `idx`, `sentence` and `label` columns from the result.
map_kwargs = dict(
    batched=True,
    batch_size=512,
    remove_columns=['idx', 'sentence', 'label'],
)

# Tokenize the train split first, then the validation split.
tokenized_dataset_train, tokenized_dataset_val = (
    split.map(tokenize, **map_kwargs) for split in (ds_train, ds_val)
)

# Peek at one tokenized row and a small slice of rows.
print(tokenized_dataset_train[0])
print(tokenized_dataset_train[5:10])

In [None]:
# Decode rows 5-9 back into text as a sanity check on the tokenization.
sample_ids = tokenized_dataset_train[5:10]['input_ids']
for position, token_ids in enumerate(sample_ids, start=1):
    print(f'{position}: {tokenizer.decode(token_ids)}')

In [None]:
def _has_more_than_five_tokens(example):
    """Return True when the example's ``input_ids`` holds more than 5 tokens."""
    return len(example['input_ids']) > 5


# Split sizes before filtering.
print(len(tokenized_dataset_train), len(tokenized_dataset_val))

# Keep only the examples whose token sequence is longer than 5.
tokenized_dataset_train = tokenized_dataset_train.filter(_has_more_than_five_tokens)
tokenized_dataset_val = tokenized_dataset_val.filter(_has_more_than_five_tokens)

# Split sizes after filtering.
print(len(tokenized_dataset_train), len(tokenized_dataset_val))

In [None]:
# Switch both splits (train first, then validation) to the 'torch' format.
for split in (tokenized_dataset_train, tokenized_dataset_val):
    split.set_format(type='torch')

# Preview one row and the first five rows in the new format.
print(tokenized_dataset_train[0])
print(tokenized_dataset_train[:5])

In [None]:
# Check the pad token setting (expected to be unset at this point).
print(tokenizer.pad_token)
# Check the eos token setting.
print(tokenizer.eos_token)
# The N+ Implementation paper (page 5) describes this differently, but here
# the eos token is reused as the pad token; the attention_mask is what is
# meant to tell a real end-of-text token apart from the ones added as padding.
# NOTE(review): a collator that masks labels by pad_token_id would also mask
# any genuine eos tokens once pad == eos — confirm this is acceptable.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling

# mlm=False makes the collator prepare batches for causal language modeling,
# i.e. GPT-style autoregressive next-token prediction; mlm=True would prepare
# BERT-style masked-language-modeling batches instead.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)  # adds `labels`

# Settings shared by the train and validation loaders.
dataloader_params = {
    'batch_size': 16,  # just fits in 6 GB of GPU memory
    'collate_fn': data_collator
}

# Shuffle only the training split: DataLoader defaults to shuffle=False, which
# would feed the examples in their stored order on every epoch. Validation is
# kept in its original order.
train_dataloader = DataLoader(tokenized_dataset_train, shuffle=True, **dataloader_params)
val_dataloader = DataLoader(tokenized_dataset_val, **dataloader_params)

# Number of batches in the training loader.
print(len(train_dataloader))

# Pull a single collated batch and inspect its structure.
batch = next(iter(train_dataloader))
print(batch.keys())
print(batch['input_ids'].shape)

# First example of the batch: token ids, labels, then attention mask.
for field in ('input_ids', 'labels', 'attention_mask'):
    print(batch[field][0])

In [None]:
import torch

# SFT is usually trained for a single epoch.
num_epochs = 1

# The optimizer updates the model's own parameters.
learning_rate = 5e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


def validate(epoch):
    """Evaluate the model on ``val_dataloader`` and report the mean batch loss.

    Puts the module-level ``model`` into eval mode, runs every validation
    batch through it without gradient tracking, and prints the average of
    the per-batch losses. The model is left in eval mode afterwards.

    Args:
        epoch: Epoch number, used only to label the printed line.

    Returns:
        The mean per-batch loss as a float, or ``nan`` when the validation
        dataloader yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    # A single no_grad context for the whole pass instead of one per batch.
    with torch.no_grad():
        for batch in val_dataloader:
            batch = batch.to(device)
            total_loss += model(**batch).loss.item()
            num_batches += 1
    # Guard the average: an empty dataloader would otherwise divide by zero.
    mean_loss = total_loss / num_batches if num_batches else float('nan')
    print(f'val_loss at {epoch} epoch:', mean_loss)
    return mean_loss

In [None]:
# Move the model to the selected device and record the loss before training.
model.to(device)
validate(0)

# Print the training loss only every `log_every` steps (plus the first and
# last step of each epoch): printing every step floods the notebook output
# and forces a host sync through `.item()` on each iteration.
log_every = 100
steps_per_epoch = len(train_dataloader)

for epoch in range(num_epochs):
    model.train()  # validate() leaves the model in eval mode
    for step, batch in enumerate(train_dataloader, start=1):
        batch = batch.to(device)
        loss = model(**batch).loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if step == 1 or step % log_every == 0 or step == steps_per_epoch:
            print(f'[epoch {epoch + 1} step {step}/{steps_per_epoch}] Loss: {loss.item()}')
    validate(epoch + 1)

In [None]:
# Write the fine-tuned model, then the tokenizer, to the configured SFT path.
sft_model_path = str(config.GPT2_SFT_PATH)
for artifact in (model, tokenizer):
    artifact.save_pretrained(sft_model_path)

In [None]:
from transformers import pipeline, set_seed
from pprint import pprint

# Build a text-generation pipeline from the directory saved above.
g = pipeline(task='text-generation', model=sft_model_path)

# Seed the RNGs before generating, then pretty-print one continuation.
set_seed(42)
generated = g("this is a", max_length=30, num_return_sequences=1)
pprint(generated)