In [None]:
# %%
!pip install -q h5py typing-extensions wheel
!pip install -q -U bitsandbytes
# !pip install -q -U git+https://github.com/huggingface/transformers.git
# !pip install -q -U git+https://github.com/huggingface/peft.git
# !pip install -q -U git+https://github.com/huggingface/accelerate.git
# !pip install -q datasets

# %%
!nvidia-smi

In [None]:
# ## Load Pre-trained model and tokenizer

# %%
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# Hub identifier of the base chat model that gets fine-tuned below.
model_id = "Qwen/Qwen2.5-3B-Instruct"

# bitsandbytes settings: load the base weights in 4-bit and run the matmuls in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True, # Activate nested quantization for 4-bit base models (double quantization)
    bnb_4bit_quant_type="nf4", # Quantization type (fp4 or nf4). According to the QLoRA paper, nf4 should be used when training on top of a 4-bit base model (e.g. with LoRA adapters).
    bnb_4bit_compute_dtype=torch.bfloat16
)
# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
# device_map="auto" leaves device placement to the library instead of pinning everything to GPU 0.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)


# %% [markdown]
# ## Preprocess the quantized model for training

# %%
from peft import prepare_model_for_kbit_training

# Enable gradient checkpointing, then let peft ready the quantized model for training.
# NOTE(review): prepare_model_for_kbit_training may already enable gradient checkpointing
# by default, which would make the explicit call redundant - confirm against the peft docs.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# %%
from peft import LoraConfig, get_peft_model

# You can try different parameter-efficient strategies for model training; for more info, please check https://github.com/huggingface/peft
# No target_modules are given here, so peft's own default for this architecture applies - TODO confirm which layers that selects.
config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# From here on `model` is the PEFT-wrapped model; the later cells train and generate with this object.
model = get_peft_model(model, config)
print("cell finished")

In [None]:
# %% [markdown]
# ## Chat Template Usage

# %%
from jinja2 import Template

# Render one user message through the tokenizer's chat template to show the exact
# string the model receives.
template = Template(tokenizer.chat_template)
message = "Please introduce yourself"
print(f"message:\n{message}\n")
message_send_to_model = template.render(
    messages=[dict(role="user", content=message)],
    bos_token=tokenizer.bos_token,
    add_generation_prompt=True,
)
print(f"message_send_to_model:\n{message_send_to_model}")

# %%
# Compile the tokenizer's chat template once; `generate` renders every prompt with it.
template = Template(tokenizer.chat_template)

@torch.no_grad()
def generate(prompt, max_new_tokens=500):
    """Greedy-decode a reply to a single user prompt.

    The prompt is wrapped as one user turn with the chat template (generation
    prompt appended), tokenized without extra special tokens and passed to
    ``model.generate``. The rendered input and the full decoded sequence are
    printed for inspection.

    Args:
        prompt: The user message as a plain string.
        max_new_tokens: Upper bound on the number of generated tokens. Passing
            it explicitly keeps the reply length independent of whatever
            default length the model's generation config carries.

    Returns:
        The decoded text of the newly generated tokens only (special tokens kept).
    """
    modelInput=template.render(messages=[{"role": "user", "content": prompt}],bos_token= tokenizer.bos_token,add_generation_prompt=True)
    print("-"*80)
    print(f"model_input_string:\n{modelInput}")
    # Send the inputs to the model's own device instead of assuming "cuda:0"
    # (the model is loaded with device_map="auto").
    input_ids = tokenizer.encode(modelInput, add_special_tokens=False, return_tensors='pt').to(model.device)
    outputs = model.generate(
        input_ids,
        # Single unpadded sequence: every position is a real token.
        attention_mask=torch.ones_like(input_ids),
        do_sample=False,
        max_new_tokens=max_new_tokens,
    )
    # Batch size is always 1 here, so row 0 is the whole result.
    model_return_string = tokenizer.decode(outputs[0], skip_special_tokens=False)
    print("-"*80)
    print(f"model_return_string:\n{model_return_string}")
    # Drop the prompt tokens so only the continuation is returned.
    generated_ids = outputs[:, input_ids.shape[1]:]
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
    return generated_text

# Ask the (not yet fine-tuned) model one question and show query and reply.
query = "Please introduce yourself"
print("-" * 80, f"query:\n{query}", sep="\n")
response = generate(query)
print("-" * 80, f"response:\n{response}", sep="\n")

# %% [markdown]
# ## Data Preparation

# %% [markdown]
# Let's load the DISC-Law-SFT legal instruction dataset and convert it to a conversation format to fine-tune our model on legal question answering.

In [None]:
# original huatuo dataset
# # %%
# from datasets import load_dataset

# data = load_dataset("Abirate/english_quotes")

# dataset = load_dataset("FreedomIntelligence/Huatuo26M-Lite")
# dataset = dataset['train'].map(lambda sample: {"conversations": [{"from": "human", "value": sample['question']}, {"from": "gpt", "value": sample['answer']}]}, batched=False)

# dataset[3]

In [None]:
# Load the legal dataset (DISC-Law-SFT)
from datasets import load_dataset, Features, Value, Sequence, concatenate_datasets

# Column-conversion helper
def convert_to_conversation(sample):
    """Turn one raw record into the two-turn conversation layout.

    The record's 'input' becomes the human turn and its 'output' becomes the
    gpt turn; the result holds both under the 'conversations' key.
    """
    human_turn = {"from": "human", "value": sample['input']}
    gpt_turn = {"from": "gpt", "value": sample['output']}
    return {"conversations": [human_turn, gpt_turn]}

def _load_as_conversations(data_files, features=None):
    """Load the given DISC-Law-SFT files and map their 'train' split to conversations.

    The original columns are removed, so only 'conversations' remains.
    """
    raw_train = load_dataset("ShengbinYue/DISC-Law-SFT", features=features, data_files=data_files)['train']
    return raw_train.map(
        convert_to_conversation,
        remove_columns=raw_train.column_names,  # drop the original columns
        batched=False
    )


# Pair files: loaded with the inferred schema.
dataset = _load_as_conversations(['DISC-Law-SFT-Pair-QA-released.jsonl', 'DISC-Law-SFT-Pair.jsonl'])

# Triplet files: loaded with an explicit schema.
features = Features({
    'id': Value('string'),
    'reference': Sequence(Value('string')),
    'input': Value('string'),
    'output': Value('string')
})

dataset2 = _load_as_conversations(
    ['DISC-Law-SFT-Triplet-QA-released.jsonl', 'DISC-Law-SFT-Triplet-released.jsonl'],
    features=features,
)

merged_dataset = concatenate_datasets([dataset, dataset2])
merged_dataset

In [None]:
# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# model_id = "Qwen/Qwen2.5-3B-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# %%
from torch.utils.data import random_split

# Fixed seed so the small train/validation subsets are identical on every run.
SPLIT_SEED = 42
train_dataset_size, val_dataset_size = 20, 8
# NOTE(review): this splits `dataset` (the Pair files only); `merged_dataset` built
# above is not used here - confirm which of the two is intended.
train_dataset, val_dataset, _ = random_split(
    dataset,
    [train_dataset_size, val_dataset_size, len(dataset) - train_dataset_size - val_dataset_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
# train_dataset, val_dataset, _ = random_split(dataset2, [0.5,0.1,0.4])
# print(train_dataset[0]['conversations'])

# %% [markdown]
# ### Customized Dataset
# Create a specialized dataset class named "InstructDataset" designed to handle our custom dataset.

# %%
import transformers
from typing import Dict, Sequence, List
from torch.utils.data import Dataset
from dataclasses import dataclass

def preprocess(
    sources,
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Tokenize conversations into padded ``input_ids`` / ``labels`` tensors.

    Each conversation is a list of ``{"from": ..., "value": ...}`` turns. A
    leading turn that is not from "human" is dropped, then the turns are read
    as (question, answer) pairs. Every pair becomes one example: question and
    answer are rendered with the tokenizer's chat template and tokenized, and
    the label positions covering the prompt are set to -100 so that only the
    remaining (answer) tokens carry labels.

    Args:
        sources: Iterable of conversations as described above.
        tokenizer: Tokenizer providing ``chat_template``, ``bos_token``,
            ``model_max_length`` and the pad/eos token ids.

    Returns:
        Dict with "input_ids" and "labels", both LongTensors of shape
        (number of examples, longest example), right-padded.

    Raises:
        ValueError: If ``sources`` contains no complete question/answer pair.
    """
    template = Template(tokenizer.chat_template)
    max_seq_len = tokenizer.model_max_length
    # Pad with the pad token when the tokenizer has one, so padded positions agree
    # with the pad id the collator in this file uses; fall back to EOS otherwise.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    examples = []
    for source in sources:
        if source and source[0]["from"] != "human":
            # Skip the first one if it is not from human
            source = source[1:]

        # Walk the turns two at a time; a trailing turn without a partner is ignored.
        for j in range(0, len(source) - 1, 2):
            q = source[j]["value"]
            a = source[j+1]["value"]
            assert q is not None and a is not None, f'q:{q} a:{a}'
            full_text = template.render(messages=[{"role": "user", "content": q},{"role": "assistant", "content": a}],bos_token=tokenizer.bos_token,add_generation_prompt=False)
            input_ids = tokenizer.encode(full_text, add_special_tokens= False)

            # The prompt alone (with generation prompt) tells how many leading tokens to mask.
            query = template.render(messages=[{"role": "user", "content": q}],bos_token=tokenizer.bos_token,add_generation_prompt=True)
            query_ids = tokenizer.encode(query, add_special_tokens= False)

            labels = [-100]*len(query_ids) + input_ids[len(query_ids):]
            assert len(labels) == len(input_ids)
            if len(input_ids) == 0: continue
            # Over-long examples keep their last max_seq_len tokens.
            examples.append({"input_ids": input_ids[-max_seq_len:], "labels": labels[-max_seq_len:]})

    if not examples:
        raise ValueError("preprocess() found no complete question/answer pair in `sources`")

    # Every example is already at most max_seq_len long, so the longest one sets the width.
    max_len = max(len(example["input_ids"]) for example in examples)
    padded_input_ids = [
        example["input_ids"] + [pad_token_id]*(max_len-len(example["input_ids"]))
        for example in examples
    ]
    padded_labels = [
        example["labels"] + [-100]*(max_len-len(example["labels"]))
        for example in examples
    ]

    return {
        "input_ids": torch.LongTensor(padded_input_ids),
        "labels": torch.LongTensor(padded_labels)
    }


class InstructDataset(Dataset):
    """Dataset wrapper that tokenizes conversation records on access.

    ``data`` holds records with a 'conversations' entry; tokenization is
    delegated to ``preprocess`` each time an item is requested.
    """

    def __init__(self, data: Sequence, tokenizer: transformers.PreTrainedTokenizer) -> None:
        super().__init__()
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index) -> Dict[str, torch.Tensor]:
        """Return tokenized tensors for ``index``.

        An int index yields a dict of 1-D tensors for that single record; any
        other index yields the batched dict produced by ``preprocess``.
        """
        is_single = isinstance(index, int)
        records = self.data[index]
        if is_single:
            records = [records]
        conversations = [record['conversations'] for record in records]
        batch = preprocess(conversations, self.tokenizer)
        if not is_single:
            return batch
        return dict(input_ids=batch["input_ids"][0], labels=batch["labels"][0])


@dataclass
class DataCollatorForSupervisedDataset(object):
    """Pad a list of examples into one batch with an attention mask.

    Each instance is a dict with 1-D "input_ids" and "labels" tensors. Inputs
    are right-padded with the tokenizer's pad id (EOS id if it has none) and
    labels with ``ignore_index``.
    """
    tokenizer: transformers.PreTrainedTokenizer
    # Label value written into padded positions. Kept as a field so the collator
    # does not rely on a module-level constant being defined before it is called.
    ignore_index: int = -100

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Collate ``instances`` into padded "input_ids", "labels" and "attention_mask"."""
        input_ids = [instance["input_ids"] for instance in instances]
        labels = [instance["labels"] for instance in instances]

        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = self.tokenizer.eos_token_id

        # Build the mask from the sequence lengths rather than from token values, so a
        # real token that happens to share the pad id is still attended to.
        attention_mask = torch.nn.utils.rnn.pad_sequence(
            [torch.ones_like(ids) for ids in input_ids],
            batch_first=True,
            padding_value=0).bool()
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=pad_id)
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=self.ignore_index)
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=attention_mask,
        )

# %%
# Wrap the raw splits for tokenization. The isinstance guards keep this cell safe to
# re-run: a split that is already wrapped is not wrapped a second time.
if not isinstance(train_dataset, InstructDataset):
    train_dataset = InstructDataset(train_dataset, tokenizer)
if not isinstance(val_dataset, InstructDataset):
    val_dataset = InstructDataset(val_dataset, tokenizer)
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
print("cell finished")

In [None]:
# Show GPU utilization and memory at this point of the notebook.
!nvidia-smi

In [None]:
# %%
# Inspect one tokenized training example: raw ids first, then the decoded text.
sample_data = train_dataset[9]
IGNORE_INDEX = -100

heavy_rule = "=" * 80
light_rule = "-" * 80

print(heavy_rule)
print("Debuging: ")
print(f"Input_ids\n{sample_data['input_ids']}")
print(f"Label_ids\n{sample_data['labels']}")
print(light_rule)
decoded_input = tokenizer.decode(sample_data['input_ids'])
print(f"Input:\n{decoded_input}")
print(light_rule)
# Masked label positions cannot be decoded, so show them as the token for "N".
N_id = tokenizer.encode("N", add_special_tokens= False)[0]
visible_label_ids = [N_id if token_id == IGNORE_INDEX else token_id for token_id in sample_data['labels']]
print(f"Label:\n{tokenizer.decode(visible_label_ids)}")
print(heavy_rule)


# %% [markdown]
# ## Training

# %% [markdown]
# ### General Training Hyperparameters

# %%
# Set training parameters
training_arguments = transformers.TrainingArguments(
    output_dir="./checkpoints" ,
    # resume_from_checkpoint = "./checkpoints-1",
    num_train_epochs=1,
    # 2 samples per device x 2 accumulation steps = 4 samples per optimizer step per device.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    optim='paged_adamw_32bit',
    # NOTE(review): save_steps=0 presumably disables periodic checkpoint saving - confirm against the TrainingArguments docs.
    save_steps=0,
    logging_steps=1,
    # NOTE(review): 2e-7 is a very small learning rate for adapter fine-tuning - confirm it is intended.
    learning_rate=2e-7,
    weight_decay=0.001,
    # NOTE(review): max_steps=-1 presumably lets num_train_epochs decide the run length - confirm.
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    gradient_checkpointing=True,
    # No external experiment tracker.
    report_to="none",
    dataloader_drop_last=True
)

# %%

# Put the model in training mode, build the Trainer from the datasets and collator
# created in the earlier cells, and run training.
model.train()
trainer = transformers.Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
)
trainer.train()

In [None]:
# %%
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Counts the elements of every parameter yielded by ``model.named_parameters()``
    and, separately, of those with ``requires_grad`` set, then prints both counts
    and the trainable share in percent. A model without parameters reports 0.0
    percent instead of dividing by zero.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

# Report the parameter counts through the model's own method.
model.print_trainable_parameters()

# %% [markdown]
# Once the training is completed, we can evaluate our model and get its perplexity on the validation set like this:

# %%
import math
# !pip install -q -U git+https://github.com/huggingface/accelerate.git
eval_results = trainer.evaluate()
# Perplexity is the exponential of the evaluation loss.
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")


# %% [markdown]
# ## Save Trained LoRA

In [None]:
# %%
# Print the working directory so it is clear where the relative output path resolves.
!pwd
# Directory, relative to the working directory, that receives the saved model files.
# NOTE(review): since `model` is PEFT-wrapped this presumably stores only the adapter weights - confirm.
output_path = "ilora"
trainer.save_model(output_path)

# %% [markdown]
# ### Test the trained model

# %%
# Chat template used by `generate` to build the model input.
template = Template(tokenizer.chat_template)
@torch.no_grad()
def generate(prompt):
    """Generate a reply of up to 500 new tokens for a single user prompt.

    The prompt is rendered as one user turn with the chat template, tokenized
    and sent to "cuda:0". The full decoded sequence is printed; only the text
    of the newly generated tokens (special tokens kept) is returned.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    rendered_prompt = template.render(messages=chat_messages, bos_token=tokenizer.bos_token, add_generation_prompt=True)
    prompt_ids = tokenizer.encode(rendered_prompt, add_special_tokens=False, return_tensors='pt').to("cuda:0")
    sequences = model.generate(prompt_ids, temperature=1.0, max_new_tokens = 500)
    full_text = tokenizer.decode(*sequences, skip_special_tokens=False)
    print("-"*80)
    print(f"model_return_string:\n{full_text}")
    # Everything after the prompt length is the model's continuation.
    prompt_length = prompt_ids.shape[1]
    continuation_ids = sequences[:, prompt_length:]
    return tokenizer.decode(continuation_ids[0], skip_special_tokens=False)

# Ask the fine-tuned model a legal question and show the reply.
query = "我被打了，我可以寻求哪些法律援助？"
print(f"query:\n{query}")
response = generate(query)
print("-" * 80, f"response:\n{response}", sep="\n")

# %% [markdown]
# # Clean GPU Memory

# %%
# GPU memory snapshot before the cleanup cell below.
!nvidia-smi

In [None]:
# %%
# Empty VRAM
# del model
# del trainer
import gc
import torch
torch.cuda.empty_cache()
gc.collect()
gc.collect()

# %%
!nvidia-smi

In [None]:
# # %% [markdown]
# # ## Load the trained model back and integrate the trained LoRA within.

# # %%

# from jinja2 import Template
# template = Template(tokenizer.chat_template)
# message = "Please introduce yourself"
# print(f"message:\n{message}\n")
# message_send_to_model=template.render(messages=[{"role": "user", "content": message}],bos_token=tokenizer.bos_token,add_generation_prompt=True)
# print(f"message_send_to_model:\n{message_send_to_model}")

# # %%
# template = Template(tokenizer.chat_template)

# from peft import PeftModel
# output_path = "ilora"
# IGNORE_INDEX=-100

# # model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True, device_map={"":0})
# model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True, device_map='auto')
# model = PeftModel.from_pretrained(model, output_path, is_trainable = True)
# model = model.merge_and_unload()
# model.config.max_length = 512
# # model.eval()

# tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, padding_side="left")
# # tokenizer.pad_token = tokenizer.unk_token


# # ======================== resume from checkpoint ==============================
# from peft import prepare_model_for_kbit_training

# model.gradient_checkpointing_enable()
# model = prepare_model_for_kbit_training(model)

# # %%
# from peft import LoraConfig, get_peft_model

# # You can try different parameter-efficient strategies for model training; for more info, please check https://github.com/huggingface/peft
# config = LoraConfig(
#     r=8,
#     lora_alpha=8,
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM",
# )

# model = get_peft_model(model, config)
# print("cell finished")


# training_arguments = transformers.TrainingArguments(
#     output_dir="./checkpoints_phase2" ,
#     resume_from_checkpoint = True,
#     num_train_epochs=1,
#     per_device_train_batch_size=2,
#     per_device_eval_batch_size=2,
#     gradient_accumulation_steps=2,
#     optim='paged_adamw_32bit',
#     save_steps=0,
#     logging_steps=1,
#     learning_rate=2e-7,
#     weight_decay=0.001,
#     max_steps=-1,
#     warmup_ratio=0.03,
#     group_by_length=True,
#     lr_scheduler_type="cosine",
#     gradient_checkpointing=True,
#     report_to="none",
#     dataloader_drop_last=True
# )

# # %%

# model.train()
# trainer = transformers.Trainer(
#     model=model,
#     tokenizer=tokenizer,
#     args=training_arguments,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     data_collator=data_collator
# )
# trainer.train()

In [None]:
# %% [markdown]
# ## Answer generation

# %%
@torch.no_grad()
def generate(prompts):
    """Generate one reply per prompt in a single padded batch.

    Each prompt is rendered as one user turn with the chat template. The batch
    is padded on the left so that every row ends with its own generation prompt
    and the model continues from real tokens rather than from padding.

    Args:
        prompts: Iterable of user messages (plain strings).

    Returns:
        List of decoded replies (special tokens removed), one per prompt and in
        the same order. An empty input yields an empty list.
    """
    prompts = list(prompts)
    if not prompts:
        return []

    model_inputs = [template.render(messages=[{"role": "user", "content": prompt}], bos_token=tokenizer.bos_token, add_generation_prompt=True) for prompt in prompts]

    # Switch to left padding only for this call and restore the previous setting
    # afterwards, so other users of the tokenizer are not affected.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        # Send the inputs to the model's own device instead of assuming "cuda:0".
        input_ids = tokenizer(model_inputs, add_special_tokens=False, return_tensors='pt', padding=True).to(model.device)
    finally:
        tokenizer.padding_side = previous_padding_side

    outputs = model.generate(input_ids.input_ids,attention_mask=input_ids.attention_mask, max_new_tokens=500)

    # All rows share the same padded prompt width; what follows is the reply.
    prompt_width = input_ids.input_ids.shape[1]
    generated_texts = [
        tokenizer.decode(outputs[i, prompt_width:], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for i in range(len(prompts))
    ]

    return generated_texts

# test
# Smoke test of the batched `generate`: two prompts, replies printed separated by a blank line.
print("\n\n".join(generate(["I get hit", "和妻子发生离婚财产纠纷，属于什么案件？刑事还是民事？"])))


# %%
# Download the exam questions into the working directory; the next cell reads '1.exam.json'.
!wget https://NLP-course-cuhksz.github.io/Assignments/Assignment1/task1/data/1.exam.json

In [None]:
# %%
import json

# Read explicitly as UTF-8 (the results are written back as UTF-8 further down) so
# the Chinese text does not depend on the platform's default encoding.
with open('1.exam.json', encoding='utf-8') as f:
  data = json.load(f)
data = data[:20] # just for demo

print(data[0])

# %%
# Prompt template; {question} and {options} are filled in per exam item.
your_prompt = """请回答下面的多选题，你需要先给出一段解答分析，然后再按照下面的格式样例给出正确答案，格式样例：[答案] B
{question}
{options}"""

def get_query(da):
  """Build the prompt text for one exam item.

  Joins the non-empty entries of ``da['option']`` as "key:value" lines, stores
  that text on the item under 'options', and returns ``your_prompt`` filled in
  from the item's fields.
  """
  option_lines = []
  for key, text in da['option'].items():
    if text:
      option_lines.append(f"{key}:{text}")
  da['options'] = '\n'.join(option_lines)
  return your_prompt.format_map(da)

# Attach the rendered prompt to every exam item.
for record in data:
  record.update(query=get_query(record))


print(data[0]['query'])

# %%
# Answer all prompts in one batched call and show the first reply.
queries = [record['query'] for record in data]
model_answers = generate(queries)
print(f'\n{model_answers[0]}')

# %%
import re
from tqdm import tqdm

def get_ans(ans):
    """Extract the chosen option letters from a model answer.

    Finds every run of option letters A-E, where the letters of one run may be
    separated by "、", ",", "，" or spaces, and returns the last run with the
    separators removed (e.g. "A、C" -> "AC"). Returns '' when the text contains
    no option letter.
    """
    # The separator class accepts the same characters that the split below removes,
    # so a run written with full-width commas ("A，B") stays one answer.
    match = re.findall(r'.*?([A-E]+(?:[、,， ]+[A-E]+)*)', ans)
    if match:
        last_match = match[-1]
        return ''.join(re.split(r'[、, ，]+', last_match))
    return ''

# Compare the letters extracted from each model answer with the reference answer
# and keep the raw model answer on the item for the saved report.
correct_num = 0
total_num = 0
for model_answer, item in tqdm(zip(model_answers, data), total=min(len(model_answers), len(data))):
  if get_ans(model_answer) == item['answer']:
    correct_num += 1
  total_num += 1
  item['model_answer'] = model_answer

# Report 0% instead of dividing by zero when there is nothing to score.
accuracy = correct_num / total_num if total_num else 0.0
print(f"ACC: {accuracy:.2%}")

# The file holds JSON, so it carries the ".json" extension.
result_path = "/content/result.json"
with open(result_path, "w", encoding="utf-8") as file:
    json.dump(data, file, ensure_ascii=False, indent=4)
print(f"Results are save in {result_path}")