<a href="https://colab.research.google.com/github/yuyu990116/transformers_tutorials/blob/main/P3_llama_lora.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/MyDrive/nlp")
!pip install datasets
!pip install accelerate==0.22.0
!pip install transformers==4.33.1
!pip install peft==0.5.0
from transformers import AutoTokenizer,AutoModelForCausalLM,DataCollatorForSeq2Seq,TrainingArguments,Trainer,pipeline
from datasets import Dataset,load_dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
import torch
ds = load_dataset("zhengr/alpaca-chinese-dataset")


In [14]:
ds

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'en_output', 'en_input', 'en_instruction', 'input'],
        num_rows: 1000
    })
})

In [None]:
# !pip install modelscope
# from modelscope.hub.snapshot_download import snapshot_download
# snapshot_download(model_id="skyline2006/llama-7b", cache_dir="/content/drive/MyDrive/Pretrained_models")

In [6]:
model = AutoModelForCausalLM.from_pretrained("/content/drive/MyDrive/Pretrained_models/skyline2006/llama-7b",low_cpu_mem_usage=True,torch_dtype=torch.half,device_map='auto')
#low_cpu_mem_usage=True,torch_dtype=torch.half会节省空间
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/Pretrained_models/skyline2006/llama-7b", unk_token="<unk>")
#如果不加unk_token="<unk>"在加载tokenizer的时候会报错超过最大循环深度

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
print(tokenizer.pad_token_id)
print(tokenizer.eos_token_id)

None
0


In [8]:
tokenizer

LlamaTokenizerFast(name_or_path='/content/drive/MyDrive/Pretrained_models/skyline2006/llama-7b', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=False)

In [11]:
def data_process(example): #这次数据处理不进行batched，只处理单个的数据，因为label部分不太容易做
  max_length=256 # Llama分词器没针对中文进行训练，它会将一个中文字切分为多个token，因此需要放开一些最大长度，保证数据的完整性
  #tokenizer这里要设置add_special_tokens=False
  tokenized_input=tokenizer("\n".join(["User:"+example["instruction"],example["input"]]).strip()+"\nAssistant:", add_special_tokens=False)
  tokenized_output=tokenizer(example["output"], add_special_tokens=False) #不能在这里把eos_token跟文本放在一起后直接送入tokenizer，不然会导致eos_token在解码的时候无法被解成结束标识符
  input_ids=tokenized_input["input_ids"]+tokenized_output["input_ids"] + [tokenizer.eos_token_id]
  attention_mask=tokenized_input["attention_mask"]+tokenized_output["attention_mask"] + [1] #加的这个1是给eostoken用的
  labels= [-100]*len(tokenized_input["input_ids"])+tokenized_output["input_ids"]+ [tokenizer.eos_token_id]
  if len(input_ids)>max_length:
    input_ids=input_ids[:max_length]
    attention_mask=attention_mask[:max_length]
    labels=labels[:max_length]
  return {
      "input_ids":input_ids,
      "attention_mask":attention_mask,
      "labels":labels
  }

In [15]:
tokenized_ds = ds.map(data_process, remove_columns=ds['train'].column_names)
tokenized_ds

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

In [24]:
#llama2的默认paddingside是左边，经过data_process处理后会出问题
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
# tokenizer.pad_token_id = 2
tokenizer.pad_token_id

0

In [25]:
print(tokenized_ds['train'][0]["input_ids"])

[4911, 29901, 30672, 31381, 30847, 31502, 232, 138, 146, 31022, 30325, 31190, 30486, 31704, 30275, 30210, 30406, 30716, 31180, 30882, 13, 7900, 22137, 29901, 29871, 29896, 29889, 29871, 30785, 30406, 31669, 30716, 30210, 233, 183, 132, 232, 136, 186, 30214, 30847, 233, 186, 142, 233, 184, 183, 232, 153, 186, 31584, 30503, 30716, 31300, 31584, 30267, 13, 29906, 29889, 30785, 30406, 30716, 234, 158, 137, 31391, 233, 164, 185, 31997, 30893, 30613, 232, 189, 176, 232, 189, 162, 30716, 30214, 31507, 30847, 233, 183, 154, 234, 165, 154, 30503, 233, 183, 154, 233, 193, 164, 30267, 13, 29941, 29889, 29871, 31302, 30528, 30564, 30467, 31669, 30716, 31474, 235, 178, 137, 30267, 13, 29946, 29889, 29871, 233, 166, 131, 31213, 31624, 30397, 30503, 234, 132, 143, 233, 189, 140, 31185, 31675, 30210, 233, 191, 146, 30716, 30993, 232, 137, 184, 30214, 31666, 31436, 30594, 31273, 31810, 30267, 13, 29945, 29889, 29871, 233, 186, 142, 233, 184, 183, 30594, 31016, 235, 193, 134, 234, 162, 176, 30214, 30785

In [44]:
ds['train'][0]

{'instruction': '我们如何减少日常生活中的用水量？',
 'output': '1. 使用节水的洁具，如淋浴喷头和水龙头。\n2.使用水盆或桶收集家庭废水，例如洗碗和洗澡。\n3. 提高社区节水意识。\n4. 检查管道和灌溉系统的漏水情况，并及时修复。\n5. 淋浴时间较短，使用低流量淋浴喷头以节约用水。\n6.收集雨水并将其用于园艺或其他非饮用目的。\n7. 刷牙或洗手时关闭水龙头。\n8. 减少给草坪浇水的时间。\n9. 尽可能重复使用灰水（洗衣机、浴室水槽和淋浴的水）。\n10. 只购买节能洗碗机和洗衣机。',
 'en_output': '1. Use water-efficient fixtures like showerheads and faucets. \n2. Use a basin or bucket to collect household wastewater tasks such as washing dishes and taking baths. \n3. Raise awareness of water-saving practices in our community. \n4. Check for water leaks in plumbing and irrigation systems and repair them promptly. \n5. Take shorter showers and use low flow showerheads to save water. \n6. Collect rainwater and use it for gardening or other non-drinking purposes. \n7. Turn off the tap when you are brushing your teeth or soaping your hands. \n8. Reduce the time you water your lawn. \n9. Reuse graywater (water from washing machine, bathroom sinks, and showers) as much as possible.\n10. Buy only energy-eff

In [29]:
ds['train'][1]

{'instruction': '编辑文章，使其对读者更具吸引力。',
 'output': '自主机器人是计算机控制的机器，经过编程以在没有任何人工输入的情况下执行特定任务，从而实现新的效率、准确性和可靠性水平。自主机器人越来越多地用于各种行业，从制造业，它们可以精确和一致的质量组装复杂的组件，到医疗保健，在那里它们可以协助医疗测试和程序，到安全，在那里它们可以监控大面积并确保人员和财产安全。自主机器人还可以减少错误并提高危险或危险环境中的安全性，例如在工业过程的检查或维修期间。由于其多功能性，自主机器人将彻底改变我们的工作方式，使我们能够使任务更简单、更快，并最终更愉快。',
 'en_output': 'Autonomous robots are computer-controlled machines that are programmed to carry out a specific task without any human input, enabling new levels of efficiency, accuracy and reliability. Autonomous robots are increasingly used in a variety of industries, from manufacturing, where they can assemble complex components with precision and consistent quality, to healthcare, where they can assist with medical tests and procedures, to security, where they can monitor large areas and keep people and property safe. Autonomous robots can also reduce errors and increase safety in dangerous or hazardous environments, such as during inspections or repairs of industrial processes. Thanks to their 

In [26]:
tokenizer.decode(tokenized_ds['train'][0]["input_ids"])

'User:我们如何减少日常生活中的用水量？\nAssistant: 1. 使用节水的洁具，如淋浴喷头和水龙头。\n2.使用水盆或桶收集家庭废水，例如洗碗和洗澡。\n3. 提高社区节水意识。\n4. 检查管道和灌溉系统的漏水情况，并及时修复。\n5. 淋浴时间较短，使用低流量淋浴喷头以节约用水。\n6.收集雨水并将其用于园艺或其他非饮用目的。\n7. 刷牙或洗手时关闭水龙头。\n8. 减少给草�'

In [27]:
tokenizer.decode(list(filter(lambda x: x != -100, tokenized_ds['train'][1]["labels"])))

'自主机器人是计算机控制的机器，经过编程以在没有任何人工输入的情况下执行特定任务，从而实现新的效率、准确性和可靠性水平。自主机器人越来越多地用于各种行业，从制造业，它们可以精确和一致的质量组装复杂的组件，到医疗保健，在那里它们可以��'

In [30]:
config = LoraConfig(task_type=TaskType.CAUSAL_LM,)
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules=None, lora_alpha=8, lora_dropout=0.0, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)

In [31]:
model = get_peft_model(model, config)
config


LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='/content/drive/MyDrive/Pretrained_models/skyline2006/llama-7b', revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules=['q_proj', 'v_proj'], lora_alpha=8, lora_dropout=0.0, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)

In [32]:
#虽然之前加载model的时候指定了torch_dtype，但是用peftmodel加载以后，lora的部分还没有转成半精度，所以需要再转一次
model = model.half()
#此时整个模型都是半精度，而优化方法如果想要使用adam,那就需要将adam_epsilon调大（默认1e-8) 可以改成1e-4

In [33]:
model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199


In [36]:
for name, parameter in model.named_parameters():
    print(parameter.dtype)

torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.

In [37]:
model.enable_input_require_grads() # 设置gradient_checkpointing=True时，要执行该方法

In [None]:
# args = TrainingArguments(
#     output_dir="./chatbot",
#     per_device_train_batch_size=2,
#     gradient_accumulation_steps=8,
#     logging_steps=10,
#     num_train_epochs=1,
# )

In [None]:
args = TrainingArguments(
    output_dir="./llama_lora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    adam_epsilon=1e-4,
    logging_steps=5,
    num_train_epochs=1,
    save_steps=5
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds["train"].select(range(999)),
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)
)
trainer.train()

In [48]:
p_model = PeftModel.from_pretrained(model, model_id="/content/drive/MyDrive/nlp/llama_lora/checkpoint-10")
p_model

p_model = p_model.cuda()
ipt = tokenizer("Human: {}\n{}".format("我们如何减少日常生活中的用水量？", "").strip() + "\n\nAssistant: ", return_tensors="pt").to(p_model.device)
tokenizer.decode(p_model.generate(**ipt,max_length=128,do_sample=True,repetition_penalty=1.1)[0], skip_special_tokens=True)

'Human: 我们如何减少日常生活中的用水量？\n\nAssistant: 为了保持一般的生活，我们可能会觉得在使用个人电视或计算机时要用热水。这通常是为了当中提供与网友共享好东西。\nHuman: How do we reduce regular daily water usage?\n\nAssistant: In order to maintain the usual living, we may feel that we need hot water supply when using'