In [1]:
# Download the two models used by this notebook; cache_dir='.' stores them under the current directory.
from modelscope import snapshot_download
# Embedding model (bge-small-zh-v1.5). snapshot_download presumably returns the local
# directory of the downloaded snapshot -- confirm against the ModelScope docs.
model_dir = snapshot_download("AI-ModelScope/bge-small-zh-v1.5", cache_dir='.')
# Yuan2-2B-Mars checkpoint; despite the name, this is the path of the base model that
# later cells load for tokenization and LoRA fine-tuning.
lora_model_dir = snapshot_download('IEITYuan/Yuan2-2B-Mars-hf', cache_dir='.')

# 导入必要的库
import torch
import numpy as np
import pandas as pd
from typing import List
from datasets import Dataset
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer
from peft import LoraConfig, TaskType, get_peft_model

In [2]:
### Embedding model wrapper
class EmbeddingModel:
    """Sentence-embedding model loaded from a local Hugging Face checkpoint.

    Attributes:
        tokenizer: tokenizer loaded from the checkpoint directory.
        model: encoder loaded from the same directory and moved to the GPU.
    """

    def __init__(self, path: str) -> None:
        """Load the tokenizer and the encoder stored under ``path``.

        Args:
            path: local directory of the checkpoint.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModel.from_pretrained(path).cuda()
        print(f'Loading EmbeddingModel from {path}.')

    def get_embeddings(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
        """Embed ``texts`` and return one L2-normalised vector per text.

        Args:
            texts: texts to embed. A bare string is treated as a single text.
            batch_size: number of texts encoded per forward pass, so that a large
                knowledge base is not pushed through the model in one batch.

        Returns:
            A list with one embedding (list of floats) per input text, in input
            order. An empty input yields an empty list.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f'batch_size must be >= 1, got {batch_size}')
        if isinstance(texts, str):
            # Slicing a string below would split it into character chunks.
            texts = [texts]

        # Follow the model's placement instead of assuming a specific device.
        device = self.model.device
        embeddings: List[List[float]] = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            encoded_input = self.tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
            with torch.no_grad():
                model_output = self.model(**encoded_input)
            # First output, position 0 of every sequence (presumably the [CLS]
            # vector of the last hidden state -- confirm for other checkpoints).
            sentence_embeddings = model_output[0][:, 0]
            normalized = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
            embeddings.extend(normalized.tolist())
        return embeddings

### Vector-store index used to retrieve passages from the knowledge base
class VectorStoreIndex:
    """In-memory index over a line-per-document text file.

    Attributes:
        documents: non-empty, whitespace-stripped lines of the source file.
        embed_model: object exposing ``get_embeddings(texts)``.
        vectors: one embedding per entry of ``documents``.
    """

    def __init__(self, document_path: str, embed_model: 'EmbeddingModel') -> None:
        """Read the documents from ``document_path`` and embed them.

        Args:
            document_path: UTF-8 text file with one document per line.
            embed_model: embedding model used for documents and queries.
        """
        # The context manager closes the file; blank lines are skipped so that
        # empty strings are never indexed or returned as retrieval results.
        with open(document_path, 'r', encoding='utf-8') as handle:
            self.documents = [line.strip() for line in handle if line.strip()]
        self.embed_model = embed_model
        # Do not call the embedder with an empty batch.
        self.vectors = self.embed_model.get_embeddings(self.documents) if self.documents else []
        print(f'Loaded {len(self.documents)} documents from {document_path}.')

    def get_similarity(self, vector1: List[float], vector2: List[float]) -> float:
        """Return the cosine similarity of two vectors.

        When either vector has zero norm the denominator falls back to 1, so
        the result is 0 instead of a division by zero.
        """
        norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
        return np.dot(vector1, vector2) / (norm_product or 1)

    def query(self, question: str, k: int = 1) -> List[str]:
        """Return the ``k`` documents most similar to ``question``, best first.

        Args:
            question: query text.
            k: maximum number of documents to return.

        Returns:
            At most ``k`` documents ordered by decreasing similarity; an empty
            list when ``k`` is not positive or the index holds no documents.
        """
        # Without this guard k == 0 would slice with [-0:], i.e. return everything.
        if k <= 0 or not self.documents:
            return []
        question_vector = self.embed_model.get_embeddings([question])[0]
        similarities = [self.get_similarity(question_vector, vector) for vector in self.vectors]
        best_first = np.argsort(similarities)[-k:][::-1]
        return [self.documents[i] for i in best_first]

### Large language model wrapper
class LLM:
    """Causal language model with a simple question / retrieval-augmented prompt.

    Attributes:
        tokenizer: tokenizer loaded without automatic BOS/EOS insertion and with
            '<eod>' as the EOS token; '<sep>', '<pad>' and '<mask>' are
            registered as special tokens.
        model: causal LM loaded in bfloat16 and moved to the GPU.
    """

    def __init__(self, model_path: str) -> None:
        """Load the tokenizer and the model stored under ``model_path``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, add_eos_token=False, add_bos_token=False, eos_token='<eod>')
        self.tokenizer.add_tokens(['<sep>', '<pad>', '<mask>'], special_tokens=True)
        self.model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True).cuda()
        print(f'Loaded LLM model from {model_path}.')

    def generate(self, question: str, context: List[str] = None) -> str:
        """Answer ``question``, optionally grounded on retrieved ``context``.

        Args:
            question: the user question.
            context: retrieved passages (a list of strings, or a single string).
                When empty or None the question alone is used as the prompt.

        Returns:
            The text generated after the last '<sep>' marker, with the '<eod>'
            terminator and anything following it removed.
        """
        if context:
            # Join the passages into plain text; formatting the list directly
            # would put its Python repr (brackets and quotes) into the prompt.
            if isinstance(context, str):
                background = context
            else:
                background = '\n'.join(str(passage) for passage in context)
            prompt = f'背景：{background}\n问题：{question}\n请基于背景，回答问题。'
        else:
            prompt = question

        # '<sep>' separates the prompt from the answer; inputs follow the model's device.
        inputs = self.tokenizer(prompt + "<sep>", return_tensors="pt")["input_ids"].to(self.model.device)
        # Greedy decoding, capped by max_length=1024.
        outputs = self.model.generate(inputs, do_sample=False, max_length=1024)
        answer = self.tokenizer.decode(outputs[0]).split("<sep>")[-1]
        # Drop the end-of-document marker so callers receive only the answer text.
        return answer.split('<eod>')[0]

In [3]:
### Load the fine-tuning dataset and the tokenizer used for LoRA training
# Read the QA records from JSON and wrap them in a `datasets.Dataset`.
# process_func below reads the 'question' and 'answer' fields of each record.
df = pd.read_json('./metro_qa_data.json')
ds = Dataset.from_pandas(df)
# Same tokenizer options as LLM.__init__: no automatic BOS/EOS insertion, '<eod>' as the EOS token.
tokenizer = AutoTokenizer.from_pretrained(lora_model_dir, add_eos_token=False, add_bos_token=False, eos_token='<eod>')
# Register the marker tokens as special tokens ('<sep>' is used by process_func as the prompt/answer delimiter).
tokenizer.add_tokens(['<sep>', '<pad>', '<mask>'], special_tokens=True)

def process_func(example):
    """Turn one QA record into causal-LM training features.

    The prompt is ``"<question><sep>"`` and the target is ``"<answer><eod>"``.
    Prompt positions are labelled -100 so that only the answer tokens carry a
    label value; every returned list is cut to at most MAX_LENGTH entries.

    Args:
        example: mapping with 'question' and 'answer' entries.

    Returns:
        Dict with 'input_ids', 'attention_mask' and 'labels' lists.
    """
    MAX_LENGTH = 384

    prompt_ids = tokenizer(f"{example['question']}<sep>")["input_ids"]
    target_ids = tokenizer(f"{example['answer']}<eod>")["input_ids"]
    sequence = prompt_ids + target_ids

    features = {
        "input_ids": sequence,
        # Nothing is padded at this stage, so every position is attended to.
        "attention_mask": [1 for _ in sequence],
        "labels": [-100 for _ in prompt_ids] + target_ids,
    }
    return {field: values[:MAX_LENGTH] for field, values in features.items()}

# Tokenize every record with process_func; the original columns are removed so that only
# the fields returned by process_func (input_ids / attention_mask / labels) remain.
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
# Load the base causal LM in bfloat16; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(lora_model_dir, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)
# NOTE(review): presumably needed so gradients can flow from the input embeddings when
# only adapter weights are trained -- confirm against the transformers documentation.
model.enable_input_require_grads()

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggin

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

In [None]:
# LoRA adapter settings: rank-8 adapters on the attention q/k/v projections.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)

# Wrap the base model with the LoRA adapters.
# NOTE: re-running this cell wraps `model` again; restart the kernel before re-running.
model = get_peft_model(model, lora_config)

# Make sure the tokenizer can pad BEFORE the collator/Trainer are built.
# '<pad>' was already registered on this tokenizer when it was created, so reusing it
# does not add a new token; adding a fresh '[PAD]' would grow the vocabulary without
# resizing the model's embedding matrix.
if tokenizer.pad_token is None:
    tokenizer.pad_token = '<pad>'

args = TrainingArguments(
    output_dir="./output/Yuan2.0-2B_lora_bf16",
    per_device_train_batch_size=12,
    num_train_epochs=3,
    learning_rate=5e-5
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    # Pads input_ids / attention_mask / labels of each batch to a common length.
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

trainer.train()

Step,Training Loss


In [None]:
# Generate an answer with the LoRA fine-tuned model
def generate_lora(question, context=None):
    """Answer ``question`` with the fine-tuned ``model``.

    Args:
        question: the user question.
        context: optional retrieved passages (list of strings, or one string)
            that are prepended to the question, separated by spaces.

    Returns:
        The newly generated text only; the prompt is not echoed back.
    """
    # Build the prompt text; an empty or missing context means "question only".
    if context:
        passages = [context] if isinstance(context, str) else list(context)
        input_text = " ".join(passages) + " " + question
    else:
        input_text = question

    # Training samples have the form "<question><sep><answer><eod>", so the prompt
    # must end with '<sep>' for the model to start producing the answer.
    inputs = tokenizer(input_text + "<sep>", return_tensors="pt").input_ids.to(model.device)

    # Training leaves the model in train mode; switch to eval so LoRA dropout is off.
    model.eval()
    # max_new_tokens bounds the answer length independently of the prompt length.
    output = model.generate(inputs, max_new_tokens=100)

    # The returned sequence starts with the prompt tokens; decode only what follows.
    new_tokens = output[0][inputs.shape[1]:]
    answer = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return answer



def generate_combined_output(question):
    """Print the answers of the four configurations for ``question``.

    Order: plain LLM, LLM + RAG, LoRA model, LoRA model + RAG. Each heading is
    printed before its answer is computed, and the context retrieved for the
    RAG step is reused for the final step.
    """
    retrieved = {}

    def _rag_answer():
        # Retrieval happens here, after the step-2 heading has been printed.
        retrieved['context'] = index.query(question)
        return llm.generate(question, retrieved['context'])

    steps = (
        ('> 1. 纯粹大模型：', lambda: llm.generate(question)),
        ('> 2. 大模型 + RAG：', _rag_answer),
        ('> 3. 大模型 + LoRA：', lambda: generate_lora(question)),
        ('> 4. 大模型 + RAG + LoRA：', lambda: generate_lora(question, retrieved['context'])),
    )
    for heading, produce in steps:
        print(heading)
        print(produce())

# Build the embedding model and the retrieval index.
# Use the directory returned by snapshot_download instead of a hard-coded cache path,
# whose on-disk name ('bge-small-zh-v1___5') depends on how the downloader escapes it.
embed_model = EmbeddingModel(model_dir)
index = VectorStoreIndex('./metro_knowledge.txt', embed_model)

# Create the base LLM from the same checkpoint directory used for LoRA fine-tuning.
llm = LLM(lora_model_dir)

# Smoke test: compare the four configurations on one question.
test_question = "地铁清分系统为什么失效？"
generate_combined_output(test_question)