In [57]:
from transformers import AutoTokenizer,AutoModelForCausalLM

# Checkpoint identifier handed to both from_pretrained calls below, so the
# tokenizer and the causal-LM weights come from the same source.
model_name = "uer/gpt2-chinese-cluecorpussmall"

# The two loads are independent of each other; each only needs model_name.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [58]:
!pip install peft


Looking in indexes: https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple


In [59]:
from datasets import load_dataset
# Load the "train" split, shuffle it with a fixed seed (42) and keep the first
# 50 rows of the shuffled result as a small working subset.
dataset = (
    load_dataset("Iess/chinese_modern_poetry", split="train")
    .shuffle(seed=42)
    .select(range(50))
)

In [60]:
# List the column names of the sampled dataset (the recorded output shows
# 'uuid', 'prompt' and 'response').
print(dataset.column_names)



['uuid', 'prompt', 'response']


In [61]:
def tokenize_fn(examples, max_length=128):
    """Tokenize the ``response`` field of a batch into fixed-length encodings.

    Args:
        examples: Mapping for a batch of rows; only ``examples["response"]``
            is read.
        max_length: Length every sequence is truncated and padded to.
            Defaults to 128, the value that was previously hard-coded, so
            existing single-argument calls behave exactly as before.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        examples["response"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

# Apply tokenize_fn across the dataset; batched=True asks map to hand the
# function batches of rows instead of one row at a time.
tokenized_dataset = dataset.map(tokenize_fn, batched=True)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map: 100%|██████████| 50/50 [00:00<00:00, 296.49 examples/s]


In [62]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings for the GPT-2 style base model loaded earlier.
lora_config = LoraConfig(
    r=8,  # LoRA rank
    lora_alpha=16,
    target_modules=["c_attn"],  # GPT-2 attention layer
    # GPT-2's c_attn is a Conv1D layer whose weight is stored as
    # (fan_in, fan_out); PEFT documents fan_in_fan_out=True for this case.
    # Stating it explicitly avoids relying on PEFT's automatic correction.
    fan_in_fan_out=True,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model with the adapter. NOTE: this rebinds `model`, so
# re-running this cell without reloading the base model wraps an already
# wrapped model.
model = get_peft_model(model, lora_config)


In [63]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

import torch  # cell-local: only needed to probe for CUDA below

training_args = TrainingArguments(
    output_dir="./lora_output",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    learning_rate=3e-4,
    logging_steps=10,
    # The recorded run finished at global_step=21 (50 examples, batch size 8,
    # 3 epochs), so the previous step-based setting of one save every 50 steps
    # never wrote a checkpoint. Saving once per epoch does.
    save_strategy="epoch",
    save_total_limit=2,
    # Mixed-precision fp16 training needs a CUDA device; fall back to full
    # precision elsewhere instead of hard-coding True.
    fp16=torch.cuda.is_available(),
    save_safetensors=False,
)

# mlm=False selects causal-LM collation (no masked-LM objective).
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)


In [64]:
# Run the training loop configured on `trainer`. As the last expression in the
# cell, its return value is displayed as the cell output.
trainer.train()


Step,Training Loss
10,4.0658
20,4.0042


TrainOutput(global_step=21, training_loss=3.998757146653675, metrics={'train_runtime': 277.8213, 'train_samples_per_second': 0.54, 'train_steps_per_second': 0.076, 'total_flos': 9832425062400.0, 'train_loss': 3.998757146653675, 'epoch': 3.0})

In [65]:
prompt = "春风又绿江南水，"

# The model has just been through trainer.train() and generate() does not
# change the module mode itself, so switch to eval mode explicitly. Otherwise
# dropout (including the LoRA dropout of 0.1) stays active while sampling.
model.eval()

# Encode the prompt and move it to the device the model's weights live on, so
# the cell works whether training left the model on CPU or on an accelerator.
device = next(model.parameters()).device
inputs = tokenizer(prompt, return_tensors="pt").to(device)

outputs = model.generate(
    **inputs,
    max_length=50,
    do_sample=True,
    top_k=50,
    # Use the tokenizer's own pad token rather than letting generate() fall
    # back to eos_token_id (50256 in the recorded warning).
    pad_token_id=tokenizer.pad_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


春 风 又 绿 江 南 水 ， 雨 中 又 绿 江 南 水 ， 一 片 白 水 与 江 南 水 共 同 组 成 雨 中 又 绿 江 南 水 ， 是 生 态 性 水 系 ， 是 人 类
