In [None]:
! pip install datasets modelscope pandas transformers peft

In [6]:
from datasets import Dataset
import pandas as pd
from modelscope import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
import json
from prompts import synthesize_response_prompt

In [None]:
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct', trust_remote_code=True)

In [9]:
types = ['democratics', 'finacial', 'health', 'marketing', 'operation']
data_source = {}
for t in types:
    d = pd.read_csv(f"{t}.csv")
    data_source[t] = d.to_dict(orient="records")

In [None]:
data_source['health']

In [11]:
def process_func(example):
    MAX_LENGTH = 10000    
    input_ids, attention_mask, labels = [], [], []
    sys_prompt = synthesize_response_prompt.format(query=example["query"], domain=example["category"], data=data_source[example["category"]]).strip()
    instruction = tokenizer(f"<|im_start|>system\n{sys_prompt}<|im_end|>\n<|im_start|>assistant\n", add_special_tokens=False)  
    response = tokenizer(f"{example['response']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.eos_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.eos_token_id]  
    if len(input_ids) > MAX_LENGTH:  
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [13]:
with open("data/full_data.json", "r") as f:
    data_lst = json.load(f)
ds = Dataset.from_list(data_lst)
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
tokenized_id

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [None]:
tokenizer.decode(tokenized_id[0]['input_ids'])

In [15]:
tokenizer.decode(list(filter(lambda x: x != -100, tokenized_id[0]["labels"])))

"The trend of age distribution across different regions over time shows that:\n\n* The East region has a relatively consistent age distribution over the years, with a median age ranging from 25 to 45.\n* The West region has a slightly increasing median age over time, from 32 in 2013 to 38 in 2017.\n* The South region has a relatively stable median age, ranging from 25 to 39, with no significant changes over time.\n* The Central region has a increasing median age over time, from 25 in 2011 to 37 in 2018.\n* The North region has a limited number of data points, but the median age appears to be increasing over time, from 28 in 2012 to 39 in 2017.\n* Overall, the age distribution across regions is relatively stable, with some fluctuations over time, but no significant trends or patterns emerge.\n\nIt's worth noting that the data is limited to 50 records, and a larger dataset may be needed to draw more conclusive insights.<|im_end|>"

In [None]:
import torch
model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct', device_map="auto",torch_dtype=torch.bfloat16)
model

In [17]:
model.enable_input_require_grads() 

In [23]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False, 
    r=8, 
    lora_alpha=32, 
    lora_dropout=0.1
)
config

LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'q_proj', 'up_proj', 'down_proj', 'o_proj', 'k_proj', 'v_proj', 'gate_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [24]:
model = get_peft_model(model, config)
config



LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'q_proj', 'up_proj', 'down_proj', 'o_proj', 'k_proj', 'v_proj', 'gate_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [25]:
model.print_trainable_parameters()

trainable params: 4,399,104 || all params: 498,431,872 || trainable%: 0.8826


In [26]:
args = TrainingArguments(
    output_dir="./output/qwen2.5_0.5b_instruct_lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=5,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True
)

In [27]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()