<a href="https://colab.research.google.com/github/yuyu990116/transformers_tutorials/blob/main/P3_llama_lora.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive and make the project folder on it the working directory,
# so relative paths used later in the notebook resolve inside that folder.
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/MyDrive/nlp")
# Install datasets plus pinned versions of accelerate, transformers and peft.
!pip install datasets
!pip install accelerate==0.22.0
!pip install transformers==4.33.1
!pip install peft==0.5.0
from transformers import AutoTokenizer,AutoModelForCausalLM,DataCollatorForSeq2Seq,TrainingArguments,Trainer,pipeline
from datasets import Dataset,load_dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
import torch
# Load the Chinese Alpaca instruction dataset by its hub identifier.
ds = load_dataset("zhengr/alpaca-chinese-dataset")


In [None]:
# Install ModelScope and download the InternLM-20B snapshot into the
# Pretrained_models folder on Google Drive.
!pip install modelscope
from modelscope.hub.snapshot_download import snapshot_download
snapshot_download(model_id="Shanghai_AI_Laboratory/internlm-20b", cache_dir="/content/drive/MyDrive/Pretrained_models")

In [None]:
# Load the base model and tokenizer from the snapshot downloaded to Drive.
# low_cpu_mem_usage=True and torch_dtype=torch.half reduce memory use while loading.
# trust_remote_code=True is needed because the InternLM checkpoint ships its own
# modeling and tokenizer code instead of using a class built into transformers.
model = AutoModelForCausalLM.from_pretrained(
    "/content/drive/MyDrive/Pretrained_models/Shanghai_AI_Laboratory/internlm-20b",
    low_cpu_mem_usage=True,
    torch_dtype=torch.half,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/Pretrained_models/Shanghai_AI_Laboratory/internlm-20b",
    trust_remote_code=True,
)

In [None]:
# Inspect the tokenizer and its current padding token id.
tokenizer
tokenizer.pad_token_id

In [None]:
# The default padding side of llama2 is the left, which causes problems after
# the data_process step, so pad on the right instead.
tokenizer.padding_side = "right"
# NOTE(review): 2 is presumably the EOS token id of this tokenizer — confirm.
tokenizer.pad_token_id = 2
tokenizer

In [None]:
def data_process(example):
    """Convert one dataset record into causal-LM training features.

    The record is processed on its own (not batched) because the label mask
    is easier to build per example.

    Args:
        example: A record with "instruction", "input" and "output" text fields.

    Returns:
        A dict with "input_ids", "attention_mask" and "labels". Prompt
        positions in "labels" are set to -100 and every list is cut to at
        most 256 entries.
    """
    # The Llama tokenizer was not trained on Chinese and splits a single
    # character into several tokens, so the limit is kept generous to avoid
    # cutting samples short.
    limit = 256

    prompt_text = "\n".join(["User:" + example["instruction"], example["input"]]).strip() + "\nAssistant:"
    # add_special_tokens=False has to be set for both tokenizer calls.
    prompt = tokenizer(prompt_text, add_special_tokens=False)
    # The EOS token is appended as an id below rather than as text here;
    # otherwise it would not decode back into the end-of-sequence marker.
    answer = tokenizer(example["output"], add_special_tokens=False)

    prompt_ids = prompt["input_ids"]
    answer_ids = answer["input_ids"]
    features = {
        "input_ids": prompt_ids + answer_ids + [tokenizer.eos_token_id],
        # The trailing 1 is the attention-mask entry for the EOS token.
        "attention_mask": prompt["attention_mask"] + answer["attention_mask"] + [1],
        "labels": [-100] * len(prompt_ids) + answer_ids + [tokenizer.eos_token_id],
    }
    if len(features["input_ids"]) > limit:
        features = {key: ids[:limit] for key, ids in features.items()}
    return features

In [None]:
# Tokenize every record with data_process and drop the raw text columns so
# only the model features remain. ds is a DatasetDict, so the list of column
# names is taken from its "train" split.
tokenized_ds = ds.map(data_process, remove_columns=ds["train"].column_names)
tokenized_ds

In [None]:
# Show the token ids of the first training sample; tokenized_ds is keyed by
# split name, so the "train" split is selected before indexing by row.
print(tokenized_ds["train"][0]["input_ids"])

In [None]:
# Decode the first training sample back to text to check the prompt layout;
# the "train" split is selected before indexing by row.
tokenizer.decode(tokenized_ds["train"][0]["input_ids"])

In [None]:
# Decode only the supervised part of the second training sample: positions
# labelled -100 are masked out of the loss and are skipped here.
tokenizer.decode([token_id for token_id in tokenized_ds["train"][1]["labels"] if token_id != -100])

In [None]:
# Build a LoRA configuration for causal language modeling; no other option is
# set explicitly here.
config = LoraConfig(task_type=TaskType.CAUSAL_LM,)
config

In [None]:
# Wrap the base model with the LoRA configuration, then display the config.
model = get_peft_model(model, config)
config


In [None]:
# torch_dtype was set when the model was loaded, but after wrapping it as a
# PEFT model the LoRA part has not been converted to half precision yet, so
# the model is converted once more.
model = model.half()
# The whole model is now in half precision; to use Adam as the optimizer,
# adam_epsilon has to be raised from its default of 1e-8, e.g. to 1e-4.

In [None]:
model.enable_input_require_grads() # Must be called when gradient_checkpointing=True is set

In [None]:
# args = TrainingArguments(
#     output_dir="./chatbot",
#     per_device_train_batch_size=2,
#     gradient_accumulation_steps=8,
#     logging_steps=10,
#     num_train_epochs=1,
# )

In [None]:
# Training configuration: checkpoints and logs go to ./llama_lora, with a
# checkpoint saved every 5 steps. adam_epsilon is raised to 1e-4 because the
# model was cast to half precision.
args = TrainingArguments(
    output_dir="./llama_lora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    adam_epsilon=1e-4,
    logging_steps=5,
    num_train_epochs=1,
    save_steps=5,
)
# Train on the first 6000 samples of the train split; the collator pads each
# batch using the tokenizer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds["train"].select(range(6000)),
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)
trainer.train()

In [None]:
# Load the LoRA weights saved at training step 10 on top of the model.
# NOTE(review): if this cell runs in the same session as training, `model` has
# already been wrapped by get_peft_model — confirm that wrapping it again is
# intended, or load a fresh base model first.
p_model = PeftModel.from_pretrained(model, model_id="/content/drive/MyDrive/nlp/llama_lora/checkpoint-10")
p_model

p_model = p_model.cuda()
# Build the prompt the same way data_process does for training
# ("User:<instruction>\n<input>" stripped, followed by "\nAssistant:") so the
# model sees the template it was fine-tuned on.
prompt = "\n".join(["User:" + "考试有哪些技巧？", ""]).strip() + "\nAssistant:"
ipt = tokenizer(prompt, return_tensors="pt").to(p_model.device)
tokenizer.decode(p_model.generate(**ipt,max_length=128)[0], skip_special_tokens=True)