In [14]:
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = '/Users/zhangyf/llm/gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [15]:
from datasets import load_dataset
dataset_path = './sst2'
ds = load_dataset(dataset_path)
ds_train, ds_val = ds['train'], ds['validation']

print(ds)
print(ds_train)
print(ds_train[6])
print(ds_train[:10])

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})
Dataset({
    features: ['idx', 'sentence', 'label'],
    num_rows: 67349
})
{'idx': 6, 'sentence': 'demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . ', 'label': 1}
{'idx': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'sentence': ['hide new secretions from the parental units ', 'contains no wit , only labored gags ', 'that loves its characters and communicates something rather beautiful about human nature ', 'remains utterly satisfied to remain the same throughout ', 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ', "that 's far too tragic to merit such superfic

In [16]:
# 只使用文本内容sentence，不使用情感标签
def tokenize(batch):
    return tokenizer(batch['sentence'])

map_kwargs = {
    'batched': True,
    'batch_size': 512,
    'remove_columns': ['idx', 'sentence', 'label']
}

tokenized_dataset_train = ds_train.map(tokenize, **map_kwargs)
tokenized_dataset_val = ds_val.map(tokenize, **map_kwargs)

print(tokenized_dataset_train[0])
print(tokenized_dataset_train[5:10])

{'input_ids': [24717, 649, 3200, 507, 422, 262, 21694, 4991, 220], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [[5562, 705, 82, 1290, 1165, 15444, 284, 17004, 884, 31194, 3513, 220], [26567, 2536, 689, 326, 262, 3437, 286, 884, 289, 31777, 2512, 30181, 355, 29408, 1830, 460, 991, 1210, 503, 257, 1402, 837, 2614, 2646, 351, 281, 7016, 3355, 404, 764, 220], [1659, 473, 84, 948, 220], [64, 19095, 17280, 12, 1941, 12, 727, 705, 82, 26781, 19518, 220], [533, 517, 7744, 1807, 832, 621, 287, 749, 4600, 826, 12, 28973, 705, 7328, 220]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [17]:
for i, seq in enumerate(tokenized_dataset_train[5:10]['input_ids']):
    print(f'{i+1}: {tokenizer.decode(seq)}')

1: that 's far too tragic to merit such superficial treatment 
2: demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . 
3: of saucy 
4: a depressed fifteen-year-old 's suicidal poetry 
5: are more deeply thought through than in most ` right-thinking ' films 


In [18]:
# 去掉少于 6 个 token 的文本
print(len(tokenized_dataset_train), len(tokenized_dataset_val))

tokenized_dataset_train = tokenized_dataset_train.filter(lambda x: len(x['input_ids']) > 5)
tokenized_dataset_val = tokenized_dataset_val.filter(lambda x: len(x['input_ids']) > 5)

print(len(tokenized_dataset_train), len(tokenized_dataset_val))

67349 872
49401 867


In [19]:
# 准备 dataloader 数据加载器
# 设置为 PyTorch 的数据格式
tokenized_dataset_train.set_format(type='torch')
tokenized_dataset_val.set_format(type='torch')

print(tokenized_dataset_train[0])
print(tokenized_dataset_train[:5])

{'input_ids': tensor([24717,   649,  3200,   507,   422,   262, 21694,  4991,   220]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1])}
{'input_ids': [tensor([24717,   649,  3200,   507,   422,   262, 21694,  4991,   220]), tensor([ 3642,  1299,   645, 20868,   837,   691,  2248,  1850,   308,  3775,
          220]), tensor([ 5562, 10408,   663,  3435,   290, 48556,  1223,  2138,  4950,   546,
         1692,  3450,   220]), tensor([ 2787,  1299, 15950, 11378,   284,  3520,   262,   976,  3690,   220]), tensor([  261,   262,  5290, 15827,    12,  1659,    12,  1169,    12,  1008,
         9310, 35478, 20954,   262, 28303,   714, 47478,   469,   510,   220])], 'attention_mask': [tensor([1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]}


In [20]:
# 检查pad token的设置（应该为空）
print(tokenizer.pad_token)
# # 检查eos token的设置
print(tokenizer.eos_token)
# N+ Implementation论文（第5页）说法不同
# 但我们会使用attention_mask来移除用于填充的额外eos_token
# 通过attention_mask来区分真正的结束token和用于填充的token
tokenizer.pad_token = tokenizer.eos_token

None
<|endoftext|>


In [21]:
# 示例说明
text1 = "Hello world"           # 短文本
text2 = "Hello world how are you today"  # 长文本

# 填充后：
# text1: "Hello world<|endoftext|><|endoftext|><|endoftext|>"  # 后面的是填充
# text2: "Hello world how are you today<|endoftext|>"         # 最后的是真正结束

# attention_mask区分：
# text1: [1, 1, 1, 0, 0, 0]  # 1表示真实token，0表示填充token
# text2: [1, 1, 1, 1, 1, 1, 1, 1]  # 全部都是真实token

In [22]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling
# mlm=False，将数据整理成“因果语言建模”需要的数据格式
# “因果语言建模”就是“预测下一个token”类型的任务，也就是gpt风格的自回归模型
# 如果mlm=True，那么数据整理成bert风格的任务所需的数据格式
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) # labels

dataloader_params = {
    'batch_size': 16, # 6G显存正好够用
    'collate_fn': data_collator
}

train_dataloader = DataLoader(tokenized_dataset_train, **dataloader_params)
val_dataloader = DataLoader(tokenized_dataset_val, **dataloader_params)

print(len(train_dataloader))

batch = next(iter(train_dataloader))
print(batch.keys())
print(batch['input_ids'].shape)
print(batch['input_ids'][0])
print(batch['labels'][0])
print(batch['attention_mask'][0])

3088
KeysView({'input_ids': tensor([[24717,   649,  3200,   507,   422,   262, 21694,  4991,   220, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256],
        [ 3642,  1299,   645, 20868,   837,   691,  2248,  1850,   308,  3775,
           220, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256],
        [ 5562, 10408,   663,  3435,   290, 48556,  1223,  2138,  4950,   546,
          1692,  3450,   220, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256],
        [ 2787,  1299, 15950, 11378,   284,  3520,   262,   976,  3690,   220,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 502

  arr = np.array(obj)
  arr = np.array(obj)


In [23]:
import torch
# 要更新的是model的参数
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
# 一般sft会训练1个epoch
num_epochs = 1

In [24]:
def validate(epoch):
    model.eval() # 评估模式，禁用模型的随机性，例如dropout等特性
    total_loss = 0.0
    for i, batch in enumerate(val_dataloader):
        batch = batch.to(device)
        with torch.no_grad():
            outputs = model(**batch)
            loss = outputs.loss # 损失
            total_loss += loss.item()
    print(f'val_loss at {epoch} epoch:', total_loss / len(val_dataloader))

In [25]:
# 续写文章的sft
device = torch.device('mps' if torch.mps.is_available() else 'cpu')
model.to(device)
validate(0)
for epoch in range(num_epochs):
    model.train()
    for i, batch in enumerate(train_dataloader):
        batch = batch.to(device)
        outputs = model(**batch)
        loss = outputs.loss
        print(f'Loss: {loss.item()}')
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    validate(epoch+1)

  arr = np.array(obj)
  arr = np.array(obj)
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


val_loss at 0 epoch: 5.174781010367654
Loss: 6.016317367553711
Loss: 5.630809307098389
Loss: 5.695191383361816
Loss: 5.2630615234375
Loss: 5.174826145172119
Loss: 4.916262626647949
Loss: 4.7063069343566895
Loss: 5.204032897949219
Loss: 5.0653557777404785
Loss: 5.249309539794922
Loss: 5.162892818450928
Loss: 5.058197498321533
Loss: 5.423579692840576
Loss: 4.559014320373535
Loss: 4.7218546867370605
Loss: 4.710030555725098
Loss: 4.994248867034912
Loss: 4.532787322998047
Loss: 4.631570816040039
Loss: 5.145158767700195
Loss: 5.122682094573975
Loss: 4.3845720291137695
Loss: 4.83773136138916
Loss: 4.355783939361572
Loss: 4.824408054351807
Loss: 4.490377426147461
Loss: 5.0886335372924805
Loss: 4.741083145141602
Loss: 4.1639485359191895
Loss: 4.6926164627075195
Loss: 4.283303260803223
Loss: 4.578400611877441
Loss: 4.436331272125244
Loss: 4.757376670837402
Loss: 4.3918633460998535
Loss: 4.905493259429932
Loss: 4.725922107696533
Loss: 4.398403644561768
Loss: 4.382633686065674
Loss: 4.691870689392

In [26]:
model.save_pretrained('/Users/zhangyf/llm/gpt2-sft')
tokenizer.save_pretrained('/Users/zhangyf/llm/gpt2-sft')

('/Users/zhangyf/llm/gpt2-sft/tokenizer_config.json',
 '/Users/zhangyf/llm/gpt2-sft/special_tokens_map.json',
 '/Users/zhangyf/llm/gpt2-sft/vocab.json',
 '/Users/zhangyf/llm/gpt2-sft/merges.txt',
 '/Users/zhangyf/llm/gpt2-sft/added_tokens.json',
 '/Users/zhangyf/llm/gpt2-sft/tokenizer.json')

In [28]:
from transformers import pipeline, set_seed
from pprint import pprint
g = pipeline('text-generation', model='/Users/zhangyf/llm/gpt2-sft')
set_seed(42)
pprint(g("this is a bad movie", max_length=30, num_return_sequences=1))
# 可以看到，经过了sft，模型不再乱输出了，而是输出了电影相关的描述

Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=30) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': 'this is a bad movie... icky and uninspiring, icky and '
                    'icky icky                              '
                    '.                                                                                                                                                                                                                  '}]
