清理初始文本数据

In [1]:
import os
import json
from glob import glob
import pickle
import re

def remove_repeated_segments(text, min_repeat_len=50):
    """
    移除文本中连续重复的片段。
    """
    pattern = re.compile(r'(.{%d,}?)\1+' % min_repeat_len, re.DOTALL)
    return pattern.sub(lambda m: m.group(1), text)

data_dir = r"C:\Users\15278\CHATBOT\data_crawl-20250703T175549Z-1-001\data_crawl"
json_files = glob(os.path.join(data_dir, "quora_all_scraped_*_extracted.json"))

cleaned_texts = []

for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f).get("data", [])
        for item in data:
            question = item.get("question_text", "").strip()
            answers = item.get("answer_texts", [])
            for ans in answers:
                # 拼接问答文本
                raw_text = f"Q: {question}\nA: {ans.strip()}"
                # 清洗重复内容
                cleaned_text = remove_repeated_segments(raw_text)
                cleaned_texts.append(cleaned_text)

print(f"✅ 清洗后问答对数量：{len(cleaned_texts)}")

# 保存清洗后的文本
with open("qa_chunks_cleaned.pkl", "wb") as f:
    pickle.dump(cleaned_texts, f)

print("✅ 清洗后的文本已保存到 qa_chunks_cleaned.pkl")


✅ 清洗后问答对数量：10088
✅ 清洗后的文本已保存到 qa_chunks_cleaned.pkl


In [3]:
from sentence_transformers import SentenceTransformer
import faiss
import pickle

# 读清洗好的文本
with open("qa_chunks_cleaned.pkl", "rb") as f:
    texts = pickle.load(f)

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# 保存向量库和索引
with open("qa_chunks_cleaned.pkl", "wb") as f:
    pickle.dump(texts, f)
faiss.write_index(index, "qa_index_cleaned.faiss")

print("✅ 新向量库和索引构建完成！")


  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 316/316 [00:17<00:00, 18.46it/s]


✅ 新向量库和索引构建完成！


In [8]:
import faiss
import pickle
import numpy as np

# 1. 加载索引
index = faiss.read_index("qa_index_cleaned.faiss")

# 2. 加载文本块（qa_chunks.pkl）
with open("qa_chunks_cleaned.pkl", "rb") as f:
    qa_chunks = pickle.load(f)

# 3. 检查索引和文本块数量是否一致
assert index.ntotal == len(qa_chunks), f"索引数量 ({index.ntotal}) 和文本块数量 ({len(qa_chunks)}) 不一致"

# 4. 获取前3条向量 + 对应文本
print("索引中的前三条向量及对应文本：\n")
for i in range(3):
    vector = index.reconstruct(i)  # 获取第i条向量
    text = qa_chunks[i]            # 获取第i条对应的文本
    print(f"=== 第 {i} 条 ===")
    print(f"[向量长度]: {len(vector)}")
    print(f"[文本内容]: {text}\n")


索引中的前三条向量及对应文本：

=== 第 0 条 ===
[向量长度]: 384
[文本内容]: Q: As a Java Developer, how can I switch my career to Machine Learning?
A: What skill do you need to learn? Machine learning. If you want to skip the rant and go to my actual advice, scroll to the bottom! Rant Yes, it’s an obvious advantage that you know some programming. Coding proficiency is a basic requirement to do machine learning, although it’s not as important as in a typical software developer job. Mathematical optimization and calculus is somewhat important. Linear algebra and statistics is really important. Probability and combinatorics is really important. But these are just prerequisites. Most importantly, machine learning is an entire branch of computer science Continue ReadingWhat skill do you need to learn? Machine learning. If you want to skip the rant and go to my actual advice, scroll to the bottom! Rant Yes, it’s an obvious advantage that you know some programming. Coding proficiency is a basic requirement to do mach

In [21]:
import pickle
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# 加载原始文本
with open("qa_chunks_cleaned.pkl", "rb") as f:
    texts = pickle.load(f)

# 加载 embedding 模型
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 创建 FAISS 向量库
vectorstore = FAISS.from_texts(texts, embedding_model)

# 存储为 LangChain 格式（带 index.faiss + index.pkl）
vectorstore.save_local("qa_index_cleaned")


In [2]:
import os
import pickle
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, pipeline, BitsAndBytesConfig, AutoModelForCausalLM
from langchain.schema import Document

# ======== 1. 加载 FAISS 向量库 ========
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.load_local("qa_index_cleaned", embedding_model,
    allow_dangerous_deserialization=True)
retriever = vectorstore.as_retriever(search_kwargs={"k": 6})

# ======== 2. 加载本地量化 Qwen 模型 ========
model_name = "unsloth/qwen2-1.5b-bnb-4bit"
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto"
)

generate_text = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    repetition_penalty=1.1
)

llm = HuggingFacePipeline(pipeline=generate_text)

# ======== 3. 构建 RAG Chain ========
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type="stuff"  # 可改为 map_reduce 等
)

# ======== 4. 查询接口 ========
def ask(query):
    result = qa_chain({"query": query})
    print("\n💬 问题:", query)
    print("🤖 答案:\n", result["result"])
    print("\n🔍 来源片段:")
    for i, doc in enumerate(result["source_documents"]):
        print(f"\n--- 片段 {i+1} ---\n{doc.page_content[:300]}...")

# ======== 5. 运行 CLI ========
if __name__ == "__main__":
    while True:
        query = input("\n请输入你的问题（输入 exit 退出）：\n> ")
        if query.lower() in ["exit", "quit"]:
            break
        ask(query)


  from .autonotebook import tqdm as notebook_tqdm
  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=generate_text)



请输入你的问题（输入 exit 退出）：
>  What do software developers, engineers and/or programmers do during times of massive layoffs such as the dot-com bust or the 2008 recession?


  result = qa_chain({"query": query})



💬 问题: What do software developers, engineers and/or programmers do during times of massive layoffs such as the dot-com bust or the 2008 recession?
🤖 答案:
  It dependsSome I know left the field altogether because they had strong interest and/or ability to do something else.I know a few who had the skills to go into roles such as teaching and used those skills to supplement their lowered income.Others had saved enough to ride out the fallout.Others actually never directly suffered any negative issues.Others pretty much died professionally and can now be found as retail sales clerks and such.

🔍 来源片段:

--- 片段 1 ---
Q: What do software developers, engineers and/or programmers do during times of massive layoffs such as the dot-com bust or the 2008 recession?
A: It would dependSome I know left the field altogether because they had strong interest and/or ability to do something else.I know a few who had the skills...

--- 片段 2 ---
Q: What do software developers, engineers and/or programmers d


请输入你的问题（输入 exit 退出）：
>  exit


In [14]:
with open("qa_chunks_cleaned.pkl", "rb") as f:
    texts = pickle.load(f)
with open("qa_chunks_cleaned.pkl", "wb") as f:
    pickle.dump(texts, f)
texts = pickle.load(open("qa_chunks_cleaned.pkl", "rb"))
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)


In [20]:
import random
import numpy as np
import faiss
import pickle

# 假设 texts 和 embeddings 已经准备好了

# 1. 打乱索引顺序
indices = list(range(len(texts)))
random.shuffle(indices)

# 2. 按比例切分
train_ratio = 0.8
train_size = int(len(texts) * train_ratio)

train_indices = indices[:train_size]
test_indices = indices[train_size:]

# 3. 分别获取 train 和 test 的文本和向量
train_texts = [texts[i] for i in train_indices]
test_texts = [texts[i] for i in test_indices]

train_embeddings = embeddings[train_indices]
test_embeddings = embeddings[test_indices]

# 4. 保存文本
with open("train_texts.pkl", "wb") as f:
    pickle.dump(train_texts, f)
with open("test_texts.pkl", "wb") as f:
    pickle.dump(test_texts, f)

# 5. 建立并保存对应的FAISS索引
dimension = embeddings.shape[1]

train_index = faiss.IndexFlatL2(dimension)
train_index.add(train_embeddings)
faiss.write_index(train_index, "train_index.faiss")

test_index = faiss.IndexFlatL2(dimension)
test_index.add(test_embeddings)
faiss.write_index(test_index, "test_index.faiss")

print(f"训练集大小: {len(train_texts)}，测试集大小: {len(test_texts)}")
print("✅ 划分完成并保存！")


训练集大小: 8070，测试集大小: 2018
✅ 划分完成并保存！


In [23]:
import pickle
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
import faiss
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
import torch

# 加载测试集文本
with open("test_texts.pkl", "rb") as f:
    test_texts = pickle.load(f)

# 初始化embedding模型（与你QA时用的同一个）
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# 加载FAISS索引和原始文本库（供检索上下文）
index = faiss.read_index("qa_index_cleaned.faiss")
with open("qa_chunks_cleaned.pkl", "rb") as f:
    texts = pickle.load(f)

# 加载本地Qwen2模型（4bit）
model_name = "unsloth/qwen2-1.5b-bnb-4bit"
max_seq_length = 2048

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=True
)
model.eval()

device = "cuda" if torch.cuda.is_available() else "cpu"


def answer_question(query, top_k=3):
    # 4.1 Encode query
    query_vec = embedder.encode([query], convert_to_numpy=True)
    
    # 4.2 Search top_k relevant chunks
    D, I = index.search(query_vec, top_k)
    
    # 4.3 Get retrieved texts
    retrieved_chunks = [texts[i] for i in I[0]]
    
    # 4.4 Join context
    context = "\n---\n".join(retrieved_chunks)
    
    # 4.5 Construct prompt in English
    prompt = f"""You are an intelligent QA assistant. Please answer the user's question based on the following background knowledge:

Background documents:
{context}

User question:
{query}

Answer:"""
    
    # 4.6 Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    # 4.7 Generate answer with sampling for diversity
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
    )
    
    # 4.8 Decode output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # 4.9 Extract answer only
    answer = generated_text[len(prompt):].strip()
    return answer

# 从 Q&A文本块中提取问题和标准答案（支持多行答案）
def extract_qa(text):
    q_match = re.search(r"Q:\s*(.*)", text)
    a_match = re.search(r"A:\s*(.*)", text, re.DOTALL)
    question = q_match.group(1).strip() if q_match else ""
    answer = a_match.group(1).strip() if a_match else ""
    return question, answer

def evaluate_accuracy(test_texts, threshold=0.7, max_samples=200):
    total = min(len(test_texts), max_samples)
    correct = 0

    for i, text in enumerate(test_texts[:total]):
        question, true_answer = extract_qa(text)
        if not question or not true_answer:
            print(f"跳过样本 {i+1}，缺少问题或答案。")
            continue
        
        pred_answer = answer_question(question)
        if not pred_answer:
            print(f"Warning: Empty prediction for query: {question}")
            pred_answer = ""

        try:
            true_vec = embedder.encode([true_answer], convert_to_numpy=True)
            pred_vec = embedder.encode([pred_answer], convert_to_numpy=True)
            sim = cosine_similarity(true_vec, pred_vec)[0][0]
        except Exception as e:
            print(f"Error calculating similarity for sample {i+1}: {e}")
            sim = 0.0

        print(f"样本 {i+1}/{total}")
        print(f"问题: {question}")
        print(f"标准答案: {true_answer}")
        print(f"模型答案: {pred_answer}")
        print(f"相似度: {sim:.3f}")
        print("-" * 40)

        if sim >= threshold:
            correct += 1

    accuracy = correct / total if total > 0 else 0
    print(f"✅ 测试集准确率（前{total}条）: {accuracy*100:.2f}% （相似度阈值: {threshold}）")

if __name__ == "__main__":
    evaluate_accuracy(test_texts, threshold=0.6, max_samples=200)



Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.3: Fast Qwen2 patching. Transformers: 4.53.2.
   \\   /|    NVIDIA GeForce RTX 4070 Laptop GPU. Num GPUs = 1. Max memory: 7.996 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
样本 1/200
问题: What are the career options for a B.Tech graduate in an IoT company?
标准答案: Yes, pursuing a B.Tech (Hons) in Electronics and Communication Engineering (ECE) will lead to an excellent career in Artificial Intelligence (AI) and the Internet of Things (IoT). Particularly in India there are so many initiatives to promote the Electronics Industry in India. To boost the e

Unsloth: Input IDs of length 2218 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 4/200
问题: Is it possible to learn algorithms and data structures in just two weeks before an interview? What are some shortcuts to learning these topics?
标准答案: One should know that the big companies keep raising the interview bar, becauseThey CANTHEY canIf everyone wants to work for you and every applicant is good at algebra, how do you pick the smartest of the bunch? You add calculus. If everyone is good at algebra + calculus? You add data structures. Then advanced data structures. Then crazy data structures. Until, only maniacs come for the interview. That is the state of art, err, world today. So, basically, you are asking how to go mad in 2 months when people usually take 4–6 years. On a serious note, don’t learn DS & A just for the interviews. You areContinue ReadingOne should know that the big companies keep raising the interview bar, becauseThey CANTHEY canIf everyone wants to work for you and every applicant is good at algebra, how do you pick the smartest of the bunch? You 

Unsloth: Input IDs of length 2165 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 11/200
问题: Is job hopping actually looked down upon as a software engineer? I can make a $30k increase in 1-2 years hopping vs sitting in a position for 4+ years.
标准答案: Like many of you reading this, I’ve been looking for ways to earn money online in addition to my part-time job. But you know how it is – the internet is full of scams and shady-grady stuff, so I spent weeks trying to find something legit. And I finally did! Freecash surprised me in all the right ways. I’ve earned over $1,000 in one month without ‘living’ on the platform. I was skeptical right up until the moment I cashed out to my PayPal. What is Freecash all about? Basically, it’s a platform that pays you for testing apps and games and completing surveys. This helps developers improve their applContinue ReadingLike many of you reading this, I’ve been looking for ways to earn money online in addition to my part-time job. But you know how it is – the internet is full of scams and shady-grady stuff, so I spent weeks tr

Unsloth: Input IDs of length 3095 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 14/200
问题: How can I find a mentor to help guide me through life, and pursue a career in software development?
标准答案: Hi, There are many ways to get mentors! You can search in your college seniors or friends relative who is working on software development You can search on online who is online teaching you will find them on tutorials sites, and youtube. I hope it helps you! Cheers!
模型答案: Hi, There are many ways to get mentors! You can search in your college seniors or friends relative who is working on software development You can search on online who is online teaching you will find them on tutorials sites, and youtube. I hope it helps you! Cheers!
相似度: 1.000
----------------------------------------
样本 15/200
问题: What are some tips to avoid burnout when working in tech (programming, software engineering)?
标准答案: Learn to say “No. I’m not going to do that.” A lot. Your employer wants maximum productivity for the least amount of money. They will gladly accept you working yourself to de

Unsloth: Input IDs of length 2050 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 38/200
问题: What can I do if I don't know which career path to choose in computer science?
标准答案: Choosing a career path in computer science can be challenging due to the field's diversity. Here are several steps you can take to help you make a decision:Self-Assessment: - Interests: Identify what aspects of computer science interest you the most (e.g., programming, data analysis, cybersecurity, AI). - Skills: Evaluate your strengths and weaknesses in various areas like coding, problem-solving, or design.Explore Different Areas: - Courses: Take introductory courses in various subfields (e.g., web development, machine learning, systems programming). - Projects: Work on personal oContinue ReadingChoosing a career path in computer science can be challenging due to the field's diversity. Here are several steps you can take to help you make a decision:Self-Assessment: - Interests: Identify what aspects of computer science interest you the most (e.g., programming, data analysis, cybersecurit

Unsloth: Input IDs of length 4242 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 47/200
问题: What should I add in resume as a new grad while applying for entry level product manger position?
模型答案: In addition to the basic details like academics and internship experience, Talk about things you have built. This can be a software / hardware, an app or a library, a course project or even better something you built on your own. Add positions of responsibilities you have held. e.g. Volunteer in your university's technology fest, Student Representative in Career Cell. Accomplishments - scholarships, awards won in extra-curricular activities. Ensure that whatever you are adding in your Resume highlights your problem solving skills, ability to build and ship products and communication skills. You can find some good CV tips here. Most Important - Your Resume should be in 1 page.
相似度: 0.375
----------------------------------------
样本 48/200
问题: What are the tips for students of Computer Science?
标准答案: The advice I’d give consists of three parts.Average is not bad. In a stan

Unsloth: Input IDs of length 2479 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 51/200
问题: Should a computer science student look for internships in the field of data analysis to get into the field or work more on skills before placements?
标准答案: Always take internships or work experience when you can get it. If your goal is to work in a particular industry, try to find opportunities in the field you want to work in or one closely related to it. If you cannot find work, try to identify why you're not getting callbacks or job offers. Is it that you lack a particular skill that is common to the field you're aiming for? At the same time, when you're unable to find work in your target industry, try to work on projects that will demonstrate competence in that area. As for hiring in data analytics, it may be hard to find internships in that fContinue ReadingAlways take internships or work experience when you can get it. If your goal is to work in a particular industry, try to find opportunities in the field you want to work in or one closely related to it. If you cann

Unsloth: Input IDs of length 2216 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 63/200
问题: How can AI tools like ChatGPT help with podcast interviews?
标准答案: Virtual assistants can streamline the podcast management process by taking over a variety of time-consuming tasks, such as :Scheduling and Coordination : Arranging recording times with guests, managing calendars and sending reminders.Content Research : Preparing questions, topics and show notes to keep episodes engaging and relevant.Editing and Post-Production : Handling basic audio editing, adding intros/outros and ensuring sound quality.Publishing and Distribution : Uploading episodes to platforms like Spotify, Apple Podcasts, and YouTube and writing detailed descriptions.Marketing and PromoContinue ReadingVirtual assistants can streamline the podcast management process by taking over a variety of time-consuming tasks, such as :Scheduling and Coordination : Arranging recording times with guests, managing calendars and sending reminders.Content Research : Preparing questions, topics and show notes to keep 

Unsloth: Input IDs of length 2058 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 71/200
问题: If I have the following skills, MS Excel, Tableau, SQL, Python, Numpy, Pandas, Git & Github, can I get a data analyst job?
标准答案: That’s a great skill set you have started. Particularly the Tableau and SQL skills. I know many businesses use those packages in their day to day needs. I think you have a great shot at it.
模型答案: Yes, you have the skills required to get a data analyst job. However, it's important to note that the decision ultimately comes down to how well you perform during an interview.
相似度: 0.301
----------------------------------------
样本 72/200
问题: What are some of the cultural differences of Asia with Europe and America?
标准答案: I'm going to point out just a couple of things here : 1. Greetings : Differs between different parts of the continent. But mostly, in Asia, people either bow or give a verbal greeting. In Europe, it's handshake in some places while in other, you can find embraces and kiss on the cheek as a greeting. 2. Food : This is an obvious one. A

Unsloth: Input IDs of length 2054 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 79/200
问题: When an employer asks if you are subject to a non-compete agreement on a job application, if you answer yes, does it hurt you even though they are in different industries?
标准答案: It should not hurt but some employers prefer not to hire with non compete since there is the potential for a lawsuit even if it lacks merit. That said, it is always a good idea to attach the non compete document or an excerpt from it ( the clause dealing with the specifics) with your application. Since employers may not read it or may not be comfortable interpreting the legal language, you should provide a brief explanation in your cover letter. It should be noted that non competes have to be reasonable to be enforceable. It cannot be so broad as to prevent someone from earning a livelihood andContinue ReadingIt should not hurt but some employers prefer not to hire with non compete since there is the potential for a lawsuit even if it lacks merit. That said, it is always a good idea to attach the 

Unsloth: Input IDs of length 2149 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 97/200
问题: How would a new grad with no experience in product management prepare for a PM interview?
标准答案: Where do I start? I’m a huge financial nerd, and have spent an embarrassing amount of time talking to people about their money habits. Here are the biggest mistakes people are making and how to fix them: 1. Not having a separate high interest savings account: Having a separate account allows you to see the results of all your hard work and keep your money separate so you're less tempted to spend it. Plus with rates above 5.00%, the interest you can earn compared to most banks really adds up. Here is a list of the top savings accounts available today. Deposit $5 before moving on because this is oneContinue ReadingWhere do I start? I’m a huge financial nerd, and have spent an embarrassing amount of time talking to people about their money habits. Here are the biggest mistakes people are making and how to fix them: 1. Not having a separate high interest savings account: Having a s

Unsloth: Input IDs of length 2205 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 99/200
问题: What are some "soft" skills that a person can learn in order to improve their communication, team work and leadership skills?
标准答案: Leadership is a challenging craft, but it is also an essential one. Not every leader needs to be a manager, but everyone who is a manager should be a leader. Unfortunately, being a manager doesn't automatically make you a are a leader. And the reason is most organisations (and the individuals within them) either don't fully understand or have a limited understanding of what leadership actually means. There is no ready-made recipe for it however I summarised 5 crucial leadership attributes that new managers need to understand to lead their team effectively. Here they are:Self-awareness and leadingContinue ReadingLeadership is a challenging craft, but it is also an essential one. Not every leader needs to be a manager, but everyone who is a manager should be a leader. Unfortunately, being a manager doesn't automatically make you a are a leader.

Unsloth: Input IDs of length 2051 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 126/200
问题: Does data mining job require intensive programming skills?
标准答案: You should be okay. Python has good packages that don't assume a lot of knowledge, and R is structured similarly to Matlab, so it should be easy to pick up (also have good online tutorials for learning R from scratch). I'd focus more on learning the math. You should have a decent background coming from EE, but be sure you have a solid understanding of statistics, linear algebra, and multivariable calculus. I'd also recommend electives in statistics during your program (generalized linear models and experimental design are good choices). I'm attaching a PPT that overviews many areas of data scienContinue ReadingYou should be okay. Python has good packages that don't assume a lot of knowledge, and R is structured similarly to Matlab, so it should be easy to pick up (also have good online tutorials for learning R from scratch). I'd focus more on learning the math. You should have a decent background coming fro

Unsloth: Input IDs of length 2433 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 130/200
问题: What course can I pursue after MBA in Finance, CFA, FRM, CS?
标准答案: That really depends on you, what are your interests, what would you like to do. I make some comments on different programs here: Guzman Safon's answer to Finance Jobs: Which of these certifications is most essential-CFA,CAIA,FRM?
模型答案: There are several courses you can pursue after obtaining a MBA in Finance, CFA, FRM, or CS. The choice depends on your interests, career goals, and desired specialization. Here are some examples:

1. MBA in Finance: If you are interested in pursuing a career in finance, an MBA in Finance can provide you with a broad understanding of financial markets and investment strategies. You can pursue courses in areas such as financial accounting, financial economics, corporate finance, and investment analysis.

2. CFA (Chartered Financial Analyst): The CFA designation is a globally recognized investment management credential. As a CFA, you can pursue a career in investment managemen

Unsloth: Input IDs of length 2088 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 158/200
问题: What are the key components of a robust MLOps pipeline, from model development to deployment and monitoring?
标准答案: Say what? All of them. What do you think they do? Data is the hardest part of machine learning and most of these types are data experts. This idea that modeling is hard is fucking laughable.
模型答案: The key components of a robust MLOps pipeline are data sourcing, data wrangling, data cleansing, model development, model tuning, model deployment, model monitoring, and model maintenance. These components are critical for ensuring that the machine learning process is robust, efficient, and effective.
相似度: 0.300
----------------------------------------


Unsloth: Input IDs of length 3127 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 159/200
问题: What are the top skills that will power the next generation of FinTech?
标准答案: IT jobs are in high demand among unemployed workers or college graduates, as well as qualified candidates are in short supply for most positions within the industry. Fintech, however, takes that to another level. As a booming industry that is continually changing legacy institutions by disruption and the rapid pace of evolution, financial technologies need people to fuel the progress. The interest in the industry is at its highest at the moment, and there is little surprise that many people want to land a job offer in this sphere. Whether one is aiming solely at monetary benefits (which is fine)Continue ReadingIT jobs are in high demand among unemployed workers or college graduates, as well as qualified candidates are in short supply for most positions within the industry. Fintech, however, takes that to another level. As a booming industry that is continually changing legacy institutions by di

Unsloth: Input IDs of length 2743 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 162/200
问题: Will natural language processing engineers find it hard to get work in the future? Once computers are capable of near-perfect text and speech processing and good tools are freely available, will most NLP engineers be out of work?
标准答案: As someone who has bet his career on it, I definitely believe in a bright future of natural language processing. NLP is a big part of what we do at Coseer (Coseer - Welcome!) - automate tedious cognitive tasks in businesses. The fundamental thesis is that human beings think, communicate and understand necessarily in an unstructured and subjective way. The same sentence e.g. "The weather is nice today." comes up with different imagery, thoughts, memories, and decision trees for different people. On the other hand, the decision science is binary (or quantified). "The weather is nice today." meanContinue ReadingAs someone who has bet his career on it, I definitely believe in a bright future of natural language processing. NLP is a big part of

Unsloth: Input IDs of length 2379 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 166/200
问题: How does the USA approach mental health in the workplace, especially in high-stress industries?
标准答案: The ‘USA’ is a very spacious multidimensional term but it is run under a capitalist system. Corporations are entities that have a ‘fiduciary’ responsibility to put making a profit for the shareholders above everything else. There are allowances for some things like mental health, another very large nebulous term, but they are secondary. In comparison with countries like the Scandinavians, this concern is practically non-existent. They also have very strong unions and we don’t, but that is another topic. Most businesses focus on hiring individuals that will produce the most with the least amounContinue ReadingThe ‘USA’ is a very spacious multidimensional term but it is run under a capitalist system. Corporations are entities that have a ‘fiduciary’ responsibility to put making a profit for the shareholders above everything else. There are allowances for some things like m

Unsloth: Input IDs of length 3119 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 188/200
问题: What are some jobs in artificial intelligence? Can they be done remotely?
标准答案: One word - Yes. How? Just keep reading this you'll get to know many things.. Do you know about the .com bubble that increased the price of internet companies shares? What happened then was everyone was trying to have their .com website and then raising money and then eventually we all know what happened… I believe that somewhat similar thing is going to happen now also. A lot of people started doing computer science… in 2021, over hiring happen (due to US Fed Bank lowering interest rates).. chain effect… more people entered software field. I believe this bubble equivalent to dot-com bubble will Continue ReadingOne word - Yes. How? Just keep reading this you'll get to know many things.. Do you know about the .com bubble that increased the price of internet companies shares? What happened then was everyone was trying to have their .com website and then raising money and then eventually we all k

Unsloth: Input IDs of length 2065 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 191/200
问题: How close are we to achieve AGI (artificial General Intelligence)? Is it even a viable idea?
标准答案: Is it even a viable idea?I’ve been working on AGI theory since 2013. In 2017, I worked on the concept of AGI task locking which is something required for things like hunter/killer robots. That ended with a disproof. In all this time I haven’t found any similar contradictionse for AGI theory, so I would say that at this point a disproof is very unlikely. I’ve done some engineering estimates and it seems to be practical with current technology.How close are we to achieve AGI (artificial General Intelligence)?The theory has to be completed first. There has been considerably more progress made in Continue ReadingIs it even a viable idea?I’ve been working on AGI theory since 2013. In 2017, I worked on the concept of AGI task locking which is something required for things like hunter/killer robots. That ended with a disproof. In all this time I haven’t found any similar contradic

Unsloth: Input IDs of length 2133 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


样本 198/200
问题: How can I hire a natural language processing engineer without getting hoodwinked?
标准答案: Here are some things to look for in a candidate and also candidates whom you can immediately remove from your list. What to look for in a candidate?Solid NLP projects even if these are academic projects its OK. But, they should have at least completed two or three projects. This can be text classification, entity extraction, text summarization, topics extraction, sentiment analysis and other relevant topics. If they have a working demo to show, that’s even better. Linking to code on GitHub or Research Papers is also a good thing.Software engineering skills. Back in the days, researchers and proContinue ReadingHere are some things to look for in a candidate and also candidates whom you can immediately remove from your list. What to look for in a candidate?Solid NLP projects even if these are academic projects its OK. But, they should have at least completed two or three projects. This 

In [12]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from ragas import evaluate
from datasets import Dataset
import pickle
from sentence_transformers import SentenceTransformer
import faiss
import re
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
import torch

# 加载测试集 Q/A 文本
with open("test_texts.pkl", "rb") as f:
    test_texts = pickle.load(f)

# 加载检索数据
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
index = faiss.read_index("qa_index_cleaned.faiss")
with open("qa_chunks_cleaned.pkl", "rb") as f:
    texts = pickle.load(f)

# 加载 Qwen 本地模型
model_name = "unsloth/qwen2-1.5b-bnb-4bit"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True
)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"

# 提取 QA
def extract_qa(text):
    q_match = re.search(r"Q:\s*(.*)", text)
    a_match = re.search(r"A:\s*(.*)", text, re.DOTALL)
    question = q_match.group(1).strip() if q_match else ""
    answer = a_match.group(1).strip() if a_match else ""
    return question, answer

# 使用 embedding + faiss 获取上下文段落
def retrieve_contexts(query, top_k=3):
    query_vec = embedder.encode([query], convert_to_numpy=True)
    D, I = index.search(query_vec, top_k)
    return [texts[i] for i in I[0]]

# 执行本地 Qwen 模型回答
def answer_question_with_context(query, context):
    prompt = f"""You are an intelligent QA assistant. Please answer the user's question based on the following background knowledge:

Background documents:
{context}

User question:
{query}

Answer:"""
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
    )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated[len(prompt):].strip()

# 构造 ragas 格式数据
def build_ragas_dataset(test_texts, max_samples=100):
    questions, answers, contexts, references = [], [], [], []
    for i, item in enumerate(test_texts[:max_samples]):
        q, a = extract_qa(item)
        if not q or not a:
            continue
        ctxs = retrieve_contexts(q)
        context = "\n---\n".join(ctxs)
        pred = answer_question_with_context(q, context)
        questions.append(q)
        answers.append(pred)
        contexts.append(ctxs)
        references.append(a)
        print(f"✅ {i+1}/{max_samples} 完成")

    dataset_dict = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": references
    }

    return Dataset.from_dict(dataset_dict)

# 评估
def evaluate_ragas(dataset):
    results = evaluate(
        dataset=dataset,
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
        ]
    )
    print("📊 RAGAS 评估结果：")
    for k, v in results.items():
        print(f"{k}: {v:.3f}")

if __name__ == "__main__":
    print("📦 构建数据集中...")
    ragas_data = build_ragas_dataset(test_texts, max_samples=100)
    print("✅ 数据集准备完成，开始评估...\n")
    evaluate_ragas(ragas_data)



Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.3: Fast Qwen2 patching. Transformers: 4.53.2.
   \\   /|    NVIDIA GeForce RTX 4070 Laptop GPU. Num GPUs = 1. Max memory: 7.996 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
📦 构建数据集中...
✅ 1/100 完成
✅ 2/100 完成
✅ 3/100 完成


Unsloth: Input IDs of length 2218 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


✅ 4/100 完成
✅ 5/100 完成
✅ 6/100 完成
✅ 7/100 完成
✅ 8/100 完成
✅ 9/100 完成
✅ 10/100 完成


Unsloth: Input IDs of length 2165 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


✅ 11/100 完成
✅ 12/100 完成
✅ 13/100 完成


Unsloth: Input IDs of length 3095 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


✅ 14/100 完成
✅ 15/100 完成
✅ 16/100 完成
✅ 17/100 完成
✅ 18/100 完成
✅ 19/100 完成
✅ 20/100 完成
✅ 21/100 完成
✅ 22/100 完成
✅ 23/100 完成
✅ 24/100 完成
✅ 25/100 完成
✅ 26/100 完成
✅ 27/100 完成
✅ 28/100 完成
✅ 29/100 完成
✅ 30/100 完成
✅ 31/100 完成
✅ 32/100 完成
✅ 33/100 完成
✅ 34/100 完成
✅ 35/100 完成
✅ 36/100 完成
✅ 37/100 完成


Unsloth: Input IDs of length 2050 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


✅ 38/100 完成
✅ 39/100 完成
✅ 40/100 完成
✅ 41/100 完成
✅ 42/100 完成
✅ 43/100 完成
✅ 44/100 完成
✅ 45/100 完成
✅ 46/100 完成
✅ 47/100 完成


Unsloth: Input IDs of length 4242 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


✅ 48/100 完成
✅ 49/100 完成
✅ 50/100 完成


Unsloth: Input IDs of length 2479 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


✅ 51/100 完成
✅ 52/100 完成
✅ 53/100 完成
✅ 54/100 完成
✅ 55/100 完成
✅ 56/100 完成
✅ 57/100 完成
✅ 58/100 完成
✅ 59/100 完成
✅ 60/100 完成
✅ 61/100 完成
✅ 62/100 完成


Unsloth: Input IDs of length 2216 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


✅ 63/100 完成
✅ 64/100 完成
✅ 65/100 完成
✅ 66/100 完成
✅ 67/100 完成
✅ 68/100 完成
✅ 69/100 完成
✅ 70/100 完成


Unsloth: Input IDs of length 2058 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


✅ 71/100 完成
✅ 72/100 完成
✅ 73/100 完成
✅ 74/100 完成
✅ 75/100 完成
✅ 76/100 完成
✅ 77/100 完成
✅ 78/100 完成


Unsloth: Input IDs of length 2054 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


✅ 79/100 完成
✅ 80/100 完成
✅ 81/100 完成
✅ 82/100 完成
✅ 83/100 完成
✅ 84/100 完成
✅ 85/100 完成
✅ 86/100 完成
✅ 87/100 完成
✅ 88/100 完成
✅ 89/100 完成
✅ 90/100 完成
✅ 91/100 完成
✅ 92/100 完成
✅ 93/100 完成
✅ 94/100 完成
✅ 95/100 完成
✅ 96/100 完成


Unsloth: Input IDs of length 2149 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


✅ 97/100 完成
✅ 98/100 完成


Unsloth: Input IDs of length 2205 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


✅ 99/100 完成
✅ 100/100 完成
✅ 数据集准备完成，开始评估...



OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [18]:
import pickle
import re
import faiss
import torch
import numpy as np
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
import os
# ===== 1. 加载测试集 Q/A 文本 =====
with open("test_texts.pkl", "rb") as f:
    test_texts = pickle.load(f)

# ===== 2. 加载检索数据 =====
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
index = faiss.read_index("qa_index_cleaned.faiss")
with open("qa_chunks_cleaned.pkl", "rb") as f:
    texts = pickle.load(f)

# ===== 3. 加载本地 Qwen 模型 =====
model_name = "unsloth/qwen2-1.5b-bnb-4bit"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True
)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"

# ===== 4. 辅助函数：提取问题和答案 =====
def extract_qa(text):
    q_match = re.search(r"Q:\s*(.*)", text)
    a_match = re.search(r"A:\s*(.*)", text, re.DOTALL)
    question = q_match.group(1).strip() if q_match else ""
    answer = a_match.group(1).strip() if a_match else ""
    return question, answer

# ===== 5. 检索上下文 =====
def retrieve_contexts(query, top_k=3):
    query_vec = embedder.encode([query], convert_to_numpy=True)
    D, I = index.search(query_vec, top_k)
    return [texts[i] for i in I[0]]

# ===== 6. 用 Qwen 生成回答 =====
def answer_question_with_context(query, context):
    prompt = f"""You are an intelligent QA assistant. Please answer the user's question based on the following background knowledge:

Background documents:
{context}

User question:
{query}

Answer:"""
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
    )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated[len(prompt):].strip()

# ===== 7. 构造用于本地评估的 Dataset =====
def build_ragas_dataset(test_texts, max_samples=100):
    questions, answers, contexts, references = [], [], [], []
    for i, item in enumerate(test_texts[:max_samples]):
        q, a = extract_qa(item)
        if not q or not a:
            print(f"跳过样本 {i+1}：缺少问题或答案")
            continue
        ctxs = retrieve_contexts(q)
        context = "\n---\n".join(ctxs)
        pred = answer_question_with_context(q, context)
        questions.append(q)
        answers.append(pred)
        contexts.append(ctxs)
        references.append(a)
        print(f"✅ 样本 {i+1}/{max_samples} 完成")
    dataset_dict = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": references
    }
    return Dataset.from_dict(dataset_dict)

# ===== 8. 本地版 ragas-lite 评估函数 =====
def average_cos_sim(a, b):
    return cosine_similarity(a, b).mean()

def evaluate_ragas_locally(dataset, embedder):
    print("\n📊 使用本地 embedding 评估 ragas-lite 指标...\n")
    scores = {
        "faithfulness": [],
        "answer_relevancy": [],
        "context_precision": [],
        "context_recall": [],
    }

    for i in range(len(dataset)):
        q = dataset["question"][i]
        a = dataset["answer"][i]
        gt = dataset["ground_truth"][i]
        ctxs = dataset["contexts"][i]

        try:
            q_vec = embedder.encode([q])
            a_vec = embedder.encode([a])
            gt_vec = embedder.encode([gt])
            ctx_vecs = embedder.encode(ctxs)

            # 1. faithfulness: answer 与 context 的平均相似度
            faith = average_cos_sim(a_vec, ctx_vecs)
            scores["faithfulness"].append(faith)

            # 2. answer relevancy: answer 与 question 的相似度
            rel = cosine_similarity(a_vec, q_vec)[0][0]
            scores["answer_relevancy"].append(rel)

            # 3. context precision: GT 与最相关 context 的相似度
            precision = max([cosine_similarity(gt_vec, c.reshape(1, -1))[0][0] for c in ctx_vecs])
            scores["context_precision"].append(precision)

            # 4. context recall: GT 与所有 context 的平均相似度
            recall = average_cos_sim(gt_vec, ctx_vecs)
            scores["context_recall"].append(recall)

        except Exception as e:
            print(f"❌ 第 {i+1} 条评估失败: {e}")

    # 输出平均得分
    print("\n✅ 本地 ragas-lite 评估完成:\n")
    for k, v in scores.items():
        avg = np.mean(v)
        print(f"{k}: {avg:.3f}")

# ===== 9. 主程序入口 =====
if __name__ == "__main__":
    dataset_path = "ragas_data.pkl"
    if os.path.exists(dataset_path):
        print(f"📂 检测到已保存数据集，正在加载 {dataset_path} ...")
        with open(dataset_path, "rb") as f:
            ragas_data = pickle.load(f)
    else:
        print("📦 构建数据集中...")
        ragas_data = build_ragas_dataset(test_texts, max_samples=100)
        print(f"💾 保存构建好的数据集到 {dataset_path} ...")
        with open(dataset_path, "wb") as f:
            pickle.dump(ragas_data, f)

    print("\n✅ 数据集准备完成，开始评估...\n")
    evaluate_ragas_locally(ragas_data, embedder)


==((====))==  Unsloth 2025.7.3: Fast Qwen2 patching. Transformers: 4.53.2.
   \\   /|    NVIDIA GeForce RTX 4070 Laptop GPU. Num GPUs = 1. Max memory: 7.996 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
📦 构建数据集中...
✅ 样本 1/100 完成
✅ 样本 2/100 完成
✅ 样本 3/100 完成
✅ 样本 4/100 完成
✅ 样本 5/100 完成
✅ 样本 6/100 完成
✅ 样本 7/100 完成
✅ 样本 8/100 完成
✅ 样本 9/100 完成
✅ 样本 10/100 完成
✅ 样本 11/100 完成
✅ 样本 12/100 完成
✅ 样本 13/100 完成
✅ 样本 14/100 完成
✅ 样本 15/100 完成
✅ 样本 16/100 完成
✅ 样本 17/100 完成
✅ 样本 18/100 完成
✅ 样本 19/100 完成
✅ 样本 20/100 完成
✅ 样本 21/100 完成
✅ 样本 22/100 完成
✅ 样本 23/100 完成
✅ 样本 24/100 完成
✅ 样本 25/100 完成
✅ 样本 26/100 完成
✅ 样本 27/100 完成
✅ 样本 28/100 完成
✅ 样本 29/100 完成
✅ 样本 30/100 完成
✅ 样本 31/100 完成
✅ 样本 32/100 完成
✅ 样本 33/100 完成
✅ 样本 34/100 完成
✅ 样本 

In [12]:
import pickle
import re
import faiss
import torch
import numpy as np
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi
import jieba
import os

# ===== 1. 文档预处理层 =====
class DocumentPreprocessor:
    """文档预处理层：实现动态分块"""
    
    def __init__(self, chunk_size=512, chunk_overlap=50):
        # 使用 RecursiveCharacterTextSplitter 实现动态分块
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", "。", "！", "？", "；", "，", " ", ""],
            keep_separator=True,
            length_function=len
        )
    
    def dynamic_chunking(self, documents):
        """动态分块：根据语义结构进行文档分割"""
        print("🔧 开始动态分块处理...")
        all_chunks = []
        
        for i, doc in enumerate(documents):
            # 使用递归式动态分块策略
            chunks = self.text_splitter.split_text(doc)
            all_chunks.extend(chunks)
            
            if (i + 1) % 100 == 0:
                print(f"✅ 已处理 {i + 1} 个文档")
        
        print(f"📊 动态分块完成，共生成 {len(all_chunks)} 个文本块")
        return all_chunks

# ===== 2. 嵌入与索引层 =====
class HybridRetriever:
    """混合检索层：集成向量检索和关键词检索"""
    
    def __init__(self, embedder_model="sentence-transformers/all-MiniLM-L6-v2"):
        # 向量化检索组件
        self.embedder = SentenceTransformer(embedder_model)
        self.faiss_index = None
        self.texts = None
        
        # BM25关键词检索组件
        self.bm25 = None
        self.tokenized_docs = None
    
    def build_vector_index(self, texts):
        """构建FAISS向量索引"""
        print("🔧 构建向量索引中...")
        self.texts = texts
        
        # 生成文档嵌入向量
        embeddings = self.embedder.encode(texts, convert_to_numpy=True, show_progress_bar=True)
        
        # 使用FAISS构建高效向量索引
        dimension = embeddings.shape[1]
        self.faiss_index = faiss.IndexFlatIP(dimension)  # 内积相似度
        
        # 归一化向量以使用余弦相似度
        faiss.normalize_L2(embeddings)
        self.faiss_index.add(embeddings.astype('float32'))
        
        print(f"✅ 向量索引构建完成，维度: {dimension}, 文档数: {len(texts)}")
    
    def build_bm25_index(self, texts):
        """构建BM25关键词索引"""
        print("🔧 构建BM25关键词索引中...")
        
        # 对文档进行分词
        self.tokenized_docs = [list(jieba.cut(text)) for text in texts]
        
        # 构建BM25索引
        self.bm25 = BM25Okapi(self.tokenized_docs)
        
        print("✅ BM25索引构建完成")
    
    def vector_search(self, query, top_k=5):
        """向量化检索"""
        if self.faiss_index is None:
            raise ValueError("向量索引未构建，请先调用 build_vector_index")
        
        # 查询向量化
        query_vec = self.embedder.encode([query], convert_to_numpy=True)
        faiss.normalize_L2(query_vec)
        
        # FAISS检索
        scores, indices = self.faiss_index.search(query_vec.astype('float32'), top_k)
        
        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx != -1:  # 有效索引
                results.append({
                    'text': self.texts[idx],
                    'score': float(score),
                    'method': 'vector'
                })
        
        return results
    
    def bm25_search(self, query, top_k=5):
        """BM25关键词检索"""
        if self.bm25 is None:
            raise ValueError("BM25索引未构建，请先调用 build_bm25_index")
        
        # 查询分词
        query_tokens = list(jieba.cut(query))
        
        # BM25检索
        scores = self.bm25.get_scores(query_tokens)
        
        # 获取top_k结果
        top_indices = np.argsort(scores)[::-1][:top_k]
        
        results = []
        for idx in top_indices:
            results.append({
                'text': self.texts[idx],
                'score': float(scores[idx]),
                'method': 'bm25'
            })
        
        return results
    
    def hybrid_search(self, query, top_k=6, vector_weight=0.6, bm25_weight=0.4):
        """混合检索：结合向量检索和BM25检索"""
        vector_results = self.vector_search(query, top_k)
        bm25_results = self.bm25_search(query, top_k)
        
        # 合并结果并重新排序
        combined_results = {}
        
        # 处理向量检索结果
        for result in vector_results:
            text = result['text']
            if text not in combined_results:
                combined_results[text] = {'text': text, 'vector_score': 0, 'bm25_score': 0}
            combined_results[text]['vector_score'] = result['score']
        
        # 处理BM25检索结果
        for result in bm25_results:
            text = result['text']
            if text not in combined_results:
                combined_results[text] = {'text': text, 'vector_score': 0, 'bm25_score': 0}
            combined_results[text]['bm25_score'] = result['score']
        
        # 计算混合得分
        final_results = []
        for item in combined_results.values():
            # 归一化分数并加权组合
            hybrid_score = (vector_weight * item['vector_score'] + 
                          bm25_weight * item['bm25_score'])
            final_results.append({
                'text': item['text'],
                'score': hybrid_score,
                'method': 'hybrid'
            })
        
        # 按得分排序并返回top_k
        final_results.sort(key=lambda x: x['score'], reverse=True)
        return final_results[:top_k]

# ===== 3. 查询增强层 =====
class QueryEnhancer:
    """查询增强层：实现HyDE假设文档生成"""
    
    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
    
    def generate_hypothetical_document(self, query):
        """HyDE：生成假设文档来改善检索效果"""
        hyde_prompt = f"""请基于以下问题生成一个详细的假设性文档，该文档应该包含问题的答案和相关背景信息：

问题: {query}

假设文档:"""
        
        inputs = self.tokenizer(hyde_prompt, return_tensors="pt", truncation=True, max_length=512).to(self.device)
        
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_new_tokens=200,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.eos_token_id
            )
        
        generated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        hypothetical_doc = generated[len(hyde_prompt):].strip()
        
        return hypothetical_doc
    
    def enhanced_query(self, original_query):
        """查询增强：结合原查询和假设文档"""
        hypothetical_doc = self.generate_hypothetical_document(original_query)
        
        # 组合原查询和假设文档
        enhanced_query = f"{original_query} {hypothetical_doc}"
        
        return enhanced_query, hypothetical_doc

# ===== 4. 生成层 =====
class QAChain:
    """QA Chain框架：实现结构化的问答流程"""
    
    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
    
    def format_context(self, retrieved_docs):
        """整合检索文档为结构化上下文"""
        if not retrieved_docs:
            return "暂无相关背景信息。"
        
        context_parts = []
        for i, doc in enumerate(retrieved_docs, 1):
            context_parts.append(f"文档{i}：{doc['text']}")
        
        return "\n\n".join(context_parts)
    
    def generate_answer(self, query, context):
        """QA Chain：生成结构化答案"""
        qa_prompt = f"""你是一个智能问答助手。请基于提供的背景知识回答用户问题。

背景知识：
{context}

用户问题：{query}

请提供准确、详细的答案："""
        
        inputs = self.tokenizer(qa_prompt, return_tensors="pt", truncation=True, max_length=1800).to(self.device)
        
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_new_tokens=256,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.eos_token_id
            )
        
        generated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        answer = generated[len(qa_prompt):].strip()
        
        return answer

# ===== 5. 完整RAG系统集成 =====
class CompleteRAGSystem:
    """完整的四层RAG架构系统"""
    
    def __init__(self, model_name="unsloth/qwen2-1.5b-bnb-4bit"):
        # 初始化各层组件
        self.preprocessor = DocumentPreprocessor()
        self.retriever = HybridRetriever()
        
        # 加载生成模型
        print("🔧 加载生成模型...")
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_name,
            max_seq_length=2048,
            dtype=None,
            load_in_4bit=True
        )
        self.model.eval()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        
        # 初始化查询增强层和生成层
        self.query_enhancer = QueryEnhancer(self.model, self.tokenizer, self.device)
        self.qa_chain = QAChain(self.model, self.tokenizer, self.device)
    
    def build_knowledge_base(self, documents):
        """构建知识库：包含所有索引"""
        # 1. 文档预处理层：动态分块
        chunks = self.preprocessor.dynamic_chunking(documents)
        
        # 2. 嵌入与索引层：构建混合索引
        self.retriever.build_vector_index(chunks)
        self.retriever.build_bm25_index(chunks)
        
        return chunks
    
    def answer_question(self, query, use_hyde=True, use_hybrid=True):
        """完整的问答流程"""
        print(f"\n🤔 用户问题：{query}")
        
        # 3. 查询增强层：HyDE处理（可选）
        if use_hyde:
            enhanced_query, hypothetical_doc = self.query_enhancer.enhanced_query(query)
            print(f"🔍 HyDE假设文档：{hypothetical_doc[:100]}...")
            search_query = enhanced_query
        else:
            search_query = query
        
        # 混合检索获取相关文档
        if use_hybrid:
            retrieved_docs = self.retriever.hybrid_search(search_query, top_k=3)
            print(f"📚 混合检索获取到 {len(retrieved_docs)} 个相关文档")
        else:
            retrieved_docs = self.retriever.vector_search(search_query, top_k=3)
            print(f"📚 向量检索获取到 {len(retrieved_docs)} 个相关文档")
        
        # 4. 生成层：QA Chain生成答案
        context = self.qa_chain.format_context(retrieved_docs)
        answer = self.qa_chain.generate_answer(query, context)
        
        return {
            'question': query,
            'answer': answer,
            'retrieved_docs': retrieved_docs,
            'context': context
        }

# ===== 6. 加载测试数据并运行 =====
def main():
    # 加载测试集 Q/A 文本
    with open("test_texts.pkl", "rb") as f:
        test_texts = pickle.load(f)
    
    # 提取所有文档作为知识库
    documents = [text for text in test_texts]  # 使用前500个作为知识库
    
    # 初始化完整RAG系统
    rag_system = CompleteRAGSystem()
    
    # 构建知识库
    print("📦 构建完整知识库...")
    chunks = rag_system.build_knowledge_base(documents)
    
    # 测试问答
    test_questions = [
        "Is it possible to learn algorithms and data structures in just two weeks before an interview? What are some shortcuts to learning these topics?",
        "Is making more money the primary reason for job hopping as a software engineer?",
        "What are the career options for a B.Tech graduate in an IoT company?"
    ]
    
    print("\n" + "="*60)
    print("🚀 开始测试完整RAG系统")
    print("="*60)
    
    for i, question in enumerate(test_questions, 1):
        print(f"\n【测试 {i}】")
        result = rag_system.answer_question(question, use_hyde=True, use_hybrid=True)
        
        print(f"💬 答案：{result['answer']}")
        print("-" * 40)

if __name__ == "__main__":
    main()

🔧 加载生成模型...
==((====))==  Unsloth 2025.7.3: Fast Qwen2 patching. Transformers: 4.53.2.
   \\   /|    NVIDIA GeForce RTX 4070 Laptop GPU. Num GPUs = 1. Max memory: 7.996 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
📦 构建完整知识库...
🔧 开始动态分块处理...
✅ 已处理 100 个文档
✅ 已处理 200 个文档
✅ 已处理 300 个文档
✅ 已处理 400 个文档
✅ 已处理 500 个文档
✅ 已处理 600 个文档
✅ 已处理 700 个文档
✅ 已处理 800 个文档
✅ 已处理 900 个文档
✅ 已处理 1000 个文档
✅ 已处理 1100 个文档
✅ 已处理 1200 个文档
✅ 已处理 1300 个文档
✅ 已处理 1400 个文档
✅ 已处理 1500 个文档
✅ 已处理 1600 个文档
✅ 已处理 1700 个文档
✅ 已处理 1800 个文档
✅ 已处理 1900 个文档
✅ 已处理 2000 个文档
📊 动态分块完成，共生成 12780 个文本块
🔧 构建向量索引中...


Batches: 100%|██████████| 400/400 [00:06<00:00, 59.40it/s] 


✅ 向量索引构建完成，维度: 384, 文档数: 12780
🔧 构建BM25关键词索引中...
✅ BM25索引构建完成

🚀 开始测试完整RAG系统

【测试 1】

🤔 用户问题：Is it possible to learn algorithms and data structures in just two weeks before an interview? What are some shortcuts to learning these topics?
🔍 HyDE假设文档：Is it possible to learn algorithms and data structures in just two weeks before an interview? What a...
📚 混合检索获取到 1 个相关文档
💬 答案：是的，有可能在两周内学习算法和数据结构。以下是一些学习算法和数据结构的捷径：
1. 通过阅读教材来学习。教材是最基本的、最系统的学习方式，可以帮助你系统地掌握算法和数据结构的基础知识。
2. 参加在线课程。许多在线课程提供算法和数据结构的教程，这些课程的难度和深度可以满足不同的学习需求。
3. 利用在线资源。有许多在线资源可以帮助你学习算法和数据结构，例如网上课程、视频教程、博客文章等。
4. 多做练习。多做算法和数据结构的练习可以帮助你加深理解和掌握知识。练习可以分为两种类型：一类是基础练习，另一类是高级练习。
5. 与导师或同事讨论。与导师或同事讨论算法和数据结构可以帮助你解决实际问题，同时也可以加深你的理解。
6. 通过实践来学习。通过实践，你可以更好地理解算法和数据结构的概念，同时也可以提高自己的实践能力。
7. 利用在线社区。许多在线社区可以帮助你与其他学习者交流，分享学习经验和问题解决方法。
8. 通过参加比赛和竞赛来学习。参加比赛和竞赛可以帮助你更好地
----------------------------------------

【测试 2】

🤔 用户问题：Is making more money the primary reason for job hopping as a software engineer?
🔍 HyDE假设文档：As a software engineer, making more m

In [18]:
import pickle
import re
import faiss
import torch
import numpy as np
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi
import jieba
import os

# ===== 1. 文档预处理层 =====
class DocumentPreprocessor:
    """文档预处理层：实现动态分块"""
    
    def __init__(self, chunk_size=512, chunk_overlap=50):
        # 使用 RecursiveCharacterTextSplitter 实现动态分块
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", "。", "！", "？", "；", "，", " ", ""],
            keep_separator=True,
            length_function=len
        )
    
    def dynamic_chunking(self, documents):
        """动态分块：根据语义结构进行文档分割"""
        print("🔧 开始动态分块处理...")
        all_chunks = []
        
        for i, doc in enumerate(documents):
            # 使用递归式动态分块策略
            chunks = self.text_splitter.split_text(doc)
            all_chunks.extend(chunks)
            
            if (i + 1) % 100 == 0:
                print(f"✅ 已处理 {i + 1} 个文档")
        
        print(f"📊 动态分块完成，共生成 {len(all_chunks)} 个文本块")
        return all_chunks

# ===== 2. 嵌入与索引层 =====
class HybridRetriever:
    """混合检索层：集成向量检索和关键词检索"""
    
    def __init__(self, embedder_model="sentence-transformers/all-MiniLM-L6-v2"):
        # 向量化检索组件
        self.embedder = SentenceTransformer(embedder_model)
        self.faiss_index = None
        self.texts = None
        
        # BM25关键词检索组件
        self.bm25 = None
        self.tokenized_docs = None
    
    def build_vector_index(self, texts):
        """构建FAISS向量索引"""
        print("🔧 构建向量索引中...")
        self.texts = texts
        
        # 生成文档嵌入向量
        embeddings = self.embedder.encode(texts, convert_to_numpy=True, show_progress_bar=True)
        
        # 使用FAISS构建高效向量索引
        dimension = embeddings.shape[1]
        self.faiss_index = faiss.IndexFlatIP(dimension)  # 内积相似度
        
        # 归一化向量以使用余弦相似度
        faiss.normalize_L2(embeddings)
        self.faiss_index.add(embeddings.astype('float32'))
        
        print(f"✅ 向量索引构建完成，维度: {dimension}, 文档数: {len(texts)}")
    
    def build_bm25_index(self, texts):
        """构建BM25关键词索引"""
        print("🔧 构建BM25关键词索引中...")
        
        # 对文档进行分词
        self.tokenized_docs = [list(jieba.cut(text)) for text in texts]
        
        # 构建BM25索引
        self.bm25 = BM25Okapi(self.tokenized_docs)
        
        print("✅ BM25索引构建完成")
    
    def vector_search(self, query, top_k=5):
        """向量化检索"""
        if self.faiss_index is None:
            raise ValueError("向量索引未构建，请先调用 build_vector_index")
        
        # 查询向量化
        query_vec = self.embedder.encode([query], convert_to_numpy=True)
        faiss.normalize_L2(query_vec)
        
        # FAISS检索
        scores, indices = self.faiss_index.search(query_vec.astype('float32'), top_k)
        
        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx != -1:  # 有效索引
                results.append({
                    'text': self.texts[idx],
                    'score': float(score),
                    'method': 'vector'
                })
        
        return results
    
    def bm25_search(self, query, top_k=5):
        """BM25关键词检索"""
        if self.bm25 is None:
            raise ValueError("BM25索引未构建，请先调用 build_bm25_index")
        
        # 查询分词
        query_tokens = list(jieba.cut(query))
        
        # BM25检索
        scores = self.bm25.get_scores(query_tokens)
        
        # 获取top_k结果
        top_indices = np.argsort(scores)[::-1][:top_k]
        
        results = []
        for idx in top_indices:
            results.append({
                'text': self.texts[idx],
                'score': float(scores[idx]),
                'method': 'bm25'
            })
        
        return results
    
    def hybrid_search(self, query, top_k=6, vector_weight=0.6, bm25_weight=0.4):
        """混合检索：结合向量检索和BM25检索"""
        vector_results = self.vector_search(query, top_k)
        bm25_results = self.bm25_search(query, top_k)
        
        # 合并结果并重新排序
        combined_results = {}
        
        # 处理向量检索结果
        for result in vector_results:
            text = result['text']
            if text not in combined_results:
                combined_results[text] = {'text': text, 'vector_score': 0, 'bm25_score': 0}
            combined_results[text]['vector_score'] = result['score']
        
        # 处理BM25检索结果
        for result in bm25_results:
            text = result['text']
            if text not in combined_results:
                combined_results[text] = {'text': text, 'vector_score': 0, 'bm25_score': 0}
            combined_results[text]['bm25_score'] = result['score']
        
        # 计算混合得分
        final_results = []
        for item in combined_results.values():
            # 归一化分数并加权组合
            hybrid_score = (vector_weight * item['vector_score'] + 
                          bm25_weight * item['bm25_score'])
            final_results.append({
                'text': item['text'],
                'score': hybrid_score,
                'method': 'hybrid'
            })
        
        # 按得分排序并返回top_k
        final_results.sort(key=lambda x: x['score'], reverse=True)
        return final_results[:top_k]

# ===== 3. 查询增强层 =====
class QueryEnhancer:
    """查询增强层：实现HyDE假设文档生成"""
    
    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
    
    def generate_hypothetical_document(self, query):
        """HyDE：生成假设文档来改善检索效果"""
        hyde_prompt = f"""请基于以下问题生成一个详细的假设性文档，该文档应该包含问题的答案和相关背景信息：

问题: {query}

假设文档:"""
        
        inputs = self.tokenizer(hyde_prompt, return_tensors="pt", truncation=True, max_length=512).to(self.device)
        
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_new_tokens=200,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.eos_token_id
            )
        
        generated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        hypothetical_doc = generated[len(hyde_prompt):].strip()
        
        return hypothetical_doc
    
    def enhanced_query(self, original_query):
        """查询增强：结合原查询和假设文档"""
        hypothetical_doc = self.generate_hypothetical_document(original_query)
        
        # 组合原查询和假设文档
        enhanced_query = f"{original_query} {hypothetical_doc}"
        
        return enhanced_query, hypothetical_doc

# ===== 4. 生成层 =====
class QAChain:
    """QA Chain框架：实现结构化的问答流程"""
    
    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
    
    def format_context(self, retrieved_docs):
        """整合检索文档为结构化上下文"""
        if not retrieved_docs:
            return "暂无相关背景信息。"
        
        context_parts = []
        for i, doc in enumerate(retrieved_docs, 1):
            context_parts.append(f"文档{i}：{doc['text']}")
        
        return "\n\n".join(context_parts)
    
    def generate_answer(self, query, context):
        """QA Chain：生成结构化答案"""
        qa_prompt = f"""你是一个智能问答助手。请基于提供的背景知识回答用户问题。

背景知识：
{context}

用户问题：{query}

请提供准确、详细的答案："""
        
        inputs = self.tokenizer(qa_prompt, return_tensors="pt", truncation=True, max_length=1800).to(self.device)
        
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_new_tokens=256,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.eos_token_id
            )
        
        generated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        answer = generated[len(qa_prompt):].strip()
        
        return answer

# ===== 5. 完整RAG系统集成 =====
class CompleteRAGSystem:
    """完整的四层RAG架构系统"""
    
    def __init__(self, model_name="unsloth/qwen2-1.5b-bnb-4bit"):
        # 初始化各层组件
        self.preprocessor = DocumentPreprocessor()
        self.retriever = HybridRetriever()
        
        # 加载生成模型
        print("🔧 加载生成模型...")
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_name,
            max_seq_length=2048,
            dtype=None,
            load_in_4bit=True
        )
        self.model.eval()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        
        # 初始化查询增强层和生成层
        self.query_enhancer = QueryEnhancer(self.model, self.tokenizer, self.device)
        self.qa_chain = QAChain(self.model, self.tokenizer, self.device)
    
    def build_knowledge_base(self, documents):
        """构建知识库：包含所有索引"""
        # 1. 文档预处理层：动态分块
        chunks = self.preprocessor.dynamic_chunking(documents)
        
        # 2. 嵌入与索引层：构建混合索引
        self.retriever.build_vector_index(chunks)
        self.retriever.build_bm25_index(chunks)
        
        return chunks
    
    def answer_question(self, query, use_hyde=True, use_hybrid=True):
        """完整的问答流程"""
        print(f"\n🤔 用户问题：{query}")
        
        # 3. 查询增强层：HyDE处理（可选）
        if use_hyde:
            enhanced_query, hypothetical_doc = self.query_enhancer.enhanced_query(query)
            print(f"🔍 HyDE假设文档：{hypothetical_doc[:100]}...")
            search_query = enhanced_query
        else:
            search_query = query
        
        # 混合检索获取相关文档
        if use_hybrid:
            retrieved_docs = self.retriever.hybrid_search(search_query, top_k=3)
            print(f"📚 混合检索获取到 {len(retrieved_docs)} 个相关文档")
        else:
            retrieved_docs = self.retriever.vector_search(search_query, top_k=3)
            print(f"📚 向量检索获取到 {len(retrieved_docs)} 个相关文档")
        
        # 4. 生成层：QA Chain生成答案
        context = self.qa_chain.format_context(retrieved_docs)
        answer = self.qa_chain.generate_answer(query, context)
        
        return {
            'question': query,
            'answer': answer,
            'retrieved_docs': retrieved_docs,
            'context': context
        }

# ===== 6. 加载测试数据并运行 =====
def main():
    # 加载测试集 Q/A 文本
    with open("test_texts.pkl", "rb") as f:
        test_texts = pickle.load(f)
    
    # 使用全部文档作为知识库
    documents = [text for text in test_texts]

    # 初始化完整RAG系统
    rag_system = CompleteRAGSystem()

    # 构建知识库
    print("📦 构建完整知识库...")
    chunks = rag_system.preprocessor.dynamic_chunking(documents)

    # 展示分块后的前5段文本
    print("\n📄 前 5 段动态分块文本：")
    for i, chunk in enumerate(chunks[:5], 1):
        print(f"\n--- Chunk {i} ---")
        print(chunk)

    # 对前5段文本做向量化
    print("\n📈 前 5 段文本的嵌入向量（前10维）：")
    embedder = rag_system.retriever.embedder
    embeddings = embedder.encode(chunks[:5], convert_to_numpy=True)

    for i, emb in enumerate(embeddings, 1):
        print(f"\n--- Embedding {i} ---")
        print(f"Dimension: {emb.shape[0]}")
        print(f"First 10 dimensions: {emb[:10]}")

if __name__ == "__main__":
    main()

🔧 加载生成模型...
==((====))==  Unsloth 2025.7.3: Fast Qwen2 patching. Transformers: 4.53.2.
   \\   /|    NVIDIA GeForce RTX 4070 Laptop GPU. Num GPUs = 1. Max memory: 7.996 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
📦 构建完整知识库...
🔧 开始动态分块处理...
✅ 已处理 100 个文档
✅ 已处理 200 个文档
✅ 已处理 300 个文档
✅ 已处理 400 个文档
✅ 已处理 500 个文档
✅ 已处理 600 个文档
✅ 已处理 700 个文档
✅ 已处理 800 个文档
✅ 已处理 900 个文档
✅ 已处理 1000 个文档
✅ 已处理 1100 个文档
✅ 已处理 1200 个文档
✅ 已处理 1300 个文档
✅ 已处理 1400 个文档
✅ 已处理 1500 个文档
✅ 已处理 1600 个文档
✅ 已处理 1700 个文档
✅ 已处理 1800 个文档
✅ 已处理 1900 个文档
✅ 已处理 2000 个文档
📊 动态分块完成，共生成 12780 个文本块

📄 前 5 段动态分块文本：

--- Chunk 1 ---
Q: What are the career options for a B.Tech graduate in an IoT company?

--- Chunk 2 ---
A: Yes, pursuing a B.Tech (Hons) in Electr

In [12]:
import pickle
import re
import faiss
import torch
import numpy as np
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi
import jieba
import os
import time
from typing import Dict, List, Tuple

# 导入完整RAG系统组件（假设在上面的代码中）
# from complete_rag_framework import CompleteRAGSystem

class EnhancedRAGEvaluator:
    """增强的RAG评估系统，支持完整四层架构评估"""
    
    def __init__(self, model_name="unsloth/qwen2-1.5b-bnb-4bit"):
        self.model_name = model_name
        self.embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        self.rag_system = None
        
    def setup_rag_system(self, documents):
        """初始化完整RAG系统"""
        print("🔧 初始化完整RAG系统...")
        self.rag_system = CompleteRAGSystem(self.model_name)
        
        # 构建知识库
        chunks = self.rag_system.build_knowledge_base(documents)
        print(f"✅ 知识库构建完成，共 {len(chunks)} 个文本块")
        
        return chunks

    def extract_qa(self, text):
        """提取问题和答案"""
        q_match = re.search(r"Q:\s*(.*)", text)
        a_match = re.search(r"A:\s*(.*)", text, re.DOTALL)
        question = q_match.group(1).strip() if q_match else ""
        answer = a_match.group(1).strip() if a_match else ""
        return question, answer

    def build_comprehensive_dataset(self, test_texts, max_samples=100):
        """构建综合评估数据集，包含多种检索策略的结果"""
        print(f"📦 构建综合评估数据集（最大样本数：{max_samples}）...")
        
        # 基础数据收集
        questions, ground_truths = [], []
        
        # 不同策略的结果
        vanilla_answers, vanilla_contexts = [], []  # 基础向量检索
        hybrid_answers, hybrid_contexts = [], []    # 混合检索
        hyde_answers, hyde_contexts = [], []        # HyDE增强
        full_answers, full_contexts = [], []        # 完整RAG（HyDE + 混合检索）
        
        for i, item in enumerate(test_texts[:max_samples]):
            q, a = self.extract_qa(item)
            if not q or not a:
                print(f"⚠️  跳过样本 {i+1}：缺少问题或答案")
                continue
            
            questions.append(q)
            ground_truths.append(a)
            
            try:
                # 1. 基础向量检索
                vanilla_result = self.rag_system.answer_question(
                    q, use_hyde=False, use_hybrid=False
                )
                vanilla_answers.append(vanilla_result['answer'])
                vanilla_contexts.append([doc['text'] for doc in vanilla_result['retrieved_docs']])
                
                # 2. 混合检索
                hybrid_result = self.rag_system.answer_question(
                    q, use_hyde=False, use_hybrid=True
                )
                hybrid_answers.append(hybrid_result['answer'])
                hybrid_contexts.append([doc['text'] for doc in hybrid_result['retrieved_docs']])
                
                # 3. HyDE + 向量检索
                hyde_result = self.rag_system.answer_question(
                    q, use_hyde=True, use_hybrid=False
                )
                hyde_answers.append(hyde_result['answer'])
                hyde_contexts.append([doc['text'] for doc in hyde_result['retrieved_docs']])
                
                # 4. 完整RAG系统（HyDE + 混合检索）
                full_result = self.rag_system.answer_question(
                    q, use_hyde=True, use_hybrid=True
                )
                full_answers.append(full_result['answer'])
                full_contexts.append([doc['text'] for doc in full_result['retrieved_docs']])
                
                print(f"✅ 样本 {i+1}/{max_samples} 处理完成")
                
            except Exception as e:
                print(f"❌ 样本 {i+1} 处理失败: {e}")
                # 添加空值以保持数据对齐
                for lst in [vanilla_answers, vanilla_contexts, hybrid_answers, hybrid_contexts,
                           hyde_answers, hyde_contexts, full_answers, full_contexts]:
                    if len(lst) < len(questions):
                        lst.append("" if "answers" in str(lst) else [])
        
        # 构建多策略数据集
        dataset_dict = {
            "question": questions,
            "ground_truth": ground_truths,
            
            # 基础向量检索
            "vanilla_answer": vanilla_answers,
            "vanilla_contexts": vanilla_contexts,
            
            # 混合检索
            "hybrid_answer": hybrid_answers,
            "hybrid_contexts": hybrid_contexts,
            
            # HyDE增强
            "hyde_answer": hyde_answers,
            "hyde_contexts": hyde_contexts,
            
            # 完整RAG
            "full_answer": full_answers,
            "full_contexts": full_contexts,
        }
        
        return Dataset.from_dict(dataset_dict)

    def evaluate_single_strategy(self, dataset, answer_key, context_key, strategy_name):
        """评估单一策略的性能"""
        print(f"\n📊 评估 {strategy_name} 策略...")
        
        scores = {
            "faithfulness": [],
            "answer_relevancy": [],
            "context_precision": [],
            "context_recall": [],
        }
        
        for i in range(len(dataset)):
            try:
                q = dataset["question"][i]
                a = dataset[answer_key][i]
                gt = dataset["ground_truth"][i]
                ctxs = dataset[context_key][i]
                
                if not a or not ctxs:  # 跳过空答案或空上下文
                    continue
                
                # 生成嵌入向量
                q_vec = self.embedder.encode([q])
                a_vec = self.embedder.encode([a])
                gt_vec = self.embedder.encode([gt])
                ctx_vecs = self.embedder.encode(ctxs)
                
                # 计算各项指标
                # 1. faithfulness: answer与context的相似度
                faith = cosine_similarity(a_vec, ctx_vecs).mean()
                scores["faithfulness"].append(faith)
                
                # 2. answer relevancy: answer与question的相似度
                rel = cosine_similarity(a_vec, q_vec)[0][0]
                scores["answer_relevancy"].append(rel)
                
                # 3. context precision: GT与最相关context的最大相似度
                precision = max([cosine_similarity(gt_vec, c.reshape(1, -1))[0][0] for c in ctx_vecs])
                scores["context_precision"].append(precision)
                
                # 4. context recall: GT与所有context的平均相似度
                recall = cosine_similarity(gt_vec, ctx_vecs).mean()
                scores["context_recall"].append(recall)
                
            except Exception as e:
                print(f"❌ {strategy_name} 第 {i+1} 条评估失败: {e}")
        
        # 计算平均分数
        avg_scores = {k: np.mean(v) if v else 0.0 for k, v in scores.items()}
        
        return avg_scores

    def comparative_evaluation(self, dataset):
        """对比评估不同策略"""
        print("\n" + "="*60)
        print("🏆 RAG系统综合对比评估")
        print("="*60)
        
        strategies = [
            ("基础向量检索", "vanilla_answer", "vanilla_contexts"),
            ("混合检索", "hybrid_answer", "hybrid_contexts"), 
            ("HyDE增强", "hyde_answer", "hyde_contexts"),
            ("完整RAG系统", "full_answer", "full_contexts"),
        ]
        
        all_results = {}
        
        for strategy_name, answer_key, context_key in strategies:
            results = self.evaluate_single_strategy(dataset, answer_key, context_key, strategy_name)
            all_results[strategy_name] = results
            
            print(f"\n🎯 {strategy_name} 评估结果:")
            for metric, score in results.items():
                print(f"  {metric}: {score:.4f}")
        
        # 输出对比表格
        self.print_comparison_table(all_results)
        
        return all_results

    def print_comparison_table(self, results):
        """打印对比表格"""
        print("\n📈 性能对比表格:")
        print("-" * 80)
        
        # 表头
        metrics = list(next(iter(results.values())).keys())
        print(f"{'策略':<15}", end="")
        for metric in metrics:
            print(f"{metric:<18}", end="")
        print()
        print("-" * 80)
        
        # 数据行
        for strategy, scores in results.items():
            print(f"{strategy:<15}", end="")
            for metric in metrics:
                print(f"{scores[metric]:<18.4f}", end="")
            print()
        
        print("-" * 80)
        
        # 找出最佳策略
        print("\n🏅 各指标最佳策略:")
        for metric in metrics:
            best_strategy = max(results.keys(), key=lambda x: results[x][metric])
            best_score = results[best_strategy][metric]
            print(f"  {metric}: {best_strategy} ({best_score:.4f})")

    def analyze_retrieval_quality(self, dataset, sample_size=10):
        """分析检索质量"""
        print(f"\n🔍 检索质量分析（样本数：{sample_size}）...")
        
        strategies = [
            ("基础向量检索", "vanilla_contexts"),
            ("混合检索", "hybrid_contexts"),
            ("HyDE+向量检索", "hyde_contexts"),
            ("完整RAG", "full_contexts")
        ]
        
        for strategy_name, context_key in strategies:
            print(f"\n📚 {strategy_name} 检索质量:")
            
            context_lengths = []
            semantic_similarities = []
            
            for i in range(min(sample_size, len(dataset))):
                ctxs = dataset[context_key][i]
                gt = dataset["ground_truth"][i]
                
                if ctxs and gt:
                    # 统计上下文长度
                    avg_length = np.mean([len(ctx) for ctx in ctxs])
                    context_lengths.append(avg_length)
                    
                    # 计算与ground truth的语义相似度
                    gt_vec = self.embedder.encode([gt])
                    ctx_vecs = self.embedder.encode(ctxs)
                    max_sim = cosine_similarity(gt_vec, ctx_vecs).max()
                    semantic_similarities.append(max_sim)
            
            if context_lengths and semantic_similarities:
                print(f"  平均上下文长度: {np.mean(context_lengths):.1f} 字符")
                print(f"  平均语义相似度: {np.mean(semantic_similarities):.4f}")
                print(f"  语义相似度标准差: {np.std(semantic_similarities):.4f}")

def main():
    """主程序入口"""
    # 加载测试数据
    with open("test_texts.pkl", "rb") as f:
        test_texts = pickle.load(f)
    
    # 初始化评估器
    evaluator = EnhancedRAGEvaluator()
    
    # 设置RAG系统
    documents = [text for text in test_texts[:300]]  # 使用前300个作为知识库
    evaluator.setup_rag_system(documents)
    
    # 构建评估数据集
    dataset_path = "enhanced_ragas_dataset.pkl"
    if os.path.exists(dataset_path):
        print(f"📂 加载已保存的评估数据集...")
        with open(dataset_path, "rb") as f:
            eval_dataset = pickle.load(f)
    else:
        print("📦 构建增强评估数据集...")
        eval_dataset = evaluator.build_comprehensive_dataset(test_texts[300:350], max_samples=50)
        with open(dataset_path, "wb") as f:
            pickle.dump(eval_dataset, f)
        print(f"💾 数据集已保存到 {dataset_path}")
    
    # 执行综合评估
    results = evaluator.comparative_evaluation(eval_dataset)
    
    # 分析检索质量
    evaluator.analyze_retrieval_quality(eval_dataset, sample_size=20)
    
    print("\n🎉 增强RAG评估完成！")
    
    # 保存评估结果
    results_path = "rag_evaluation_results.pkl"
    with open(results_path, "wb") as f:
        pickle.dump(results, f)
    print(f"📊 评估结果已保存到 {results_path}")

if __name__ == "__main__":
    main()

🔧 初始化完整RAG系统...
🔧 加载生成模型...
==((====))==  Unsloth 2025.7.3: Fast Qwen2 patching. Transformers: 4.53.2.
   \\   /|    NVIDIA GeForce RTX 4070 Laptop GPU. Num GPUs = 1. Max memory: 7.996 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
🔧 开始动态分块处理...
✅ 已处理 100 个文档
✅ 已处理 200 个文档
✅ 已处理 300 个文档
📊 动态分块完成，共生成 1949 个文本块
🔧 构建向量索引中...


Batches: 100%|██████████| 61/61 [00:01<00:00, 43.65it/s]


✅ 向量索引构建完成，维度: 384, 文档数: 1949
🔧 构建BM25关键词索引中...
✅ BM25索引构建完成
✅ 知识库构建完成，共 1949 个文本块
📦 构建增强评估数据集...
📦 构建综合评估数据集（最大样本数：50）...

🤔 用户问题：For a first product management job, is it better to work at a small company or a large one?
📚 向量检索获取到 3 个相关文档

🤔 用户问题：For a first product management job, is it better to work at a small company or a large one?
📚 混合检索获取到 3 个相关文档

🤔 用户问题：For a first product management job, is it better to work at a small company or a large one?
🔍 HyDE假设文档：这个问题的答案因人而异，取决于个人的兴趣和经验。一般来说，对于初入职场的人来说，小公司可能更适合。小公司的规模较小，员工之间更加熟悉，更容易进行沟通和协作。此外，小公司的管理方式也更加灵活，员工可以更...
📚 向量检索获取到 3 个相关文档

🤔 用户问题：For a first product management job, is it better to work at a small company or a large one?
🔍 HyDE假设文档：您需要在小公司和大公司之间做出选择，但您不确定哪个选项更好。以下是一些因素需要考虑的。

首先，您需要考虑您的技能和经验。如果您是一名经验丰富的项目管理专业人士，那么在小公司工作可能会更符合您的需求。...
📚 混合检索获取到 3 个相关文档
✅ 样本 1/50 处理完成

🤔 用户问题：Is tech work culture in the UK really that different from the US?
📚 向量检索获取到 3 个相关文档

🤔 用户问题：Is tech work culture in the UK really that different from th

KeyboardInterrupt: 