In [1]:
import os
import json
from glob import glob
import pickle
import re

def remove_repeated_segments(text, min_repeat_len=50):
    """
    Collapse back-to-back duplicated segments of ``text`` into a single copy.

    A segment qualifies when it is at least ``min_repeat_len`` characters long
    and is immediately followed by one or more exact copies of itself. Only the
    first copy is kept. Newlines count as ordinary characters (``re.DOTALL``),
    so a repeated segment may span several lines. Duplicates that are separated
    by any other text are left untouched.

    Args:
        text: The string to clean.
        min_repeat_len: Minimum length, in characters, of a segment that is
            considered for collapsing.

    Returns:
        ``text`` with every qualifying run of adjacent copies reduced to one.
    """
    def _first_copy(match):
        # Group 1 is a single instance of the repeated segment.
        return match.group(1)

    # Lazy quantifier: prefer the shortest segment that still repeats.
    return re.sub(
        r'(.{%d,}?)\1+' % min_repeat_len,
        _first_copy,
        text,
        flags=re.DOTALL,
    )

# Folder holding the scraped Quora exports.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
data_dir = r"C:\Users\15278\CHATBOT\data_crawl-20250703T175549Z-1-001\data_crawl"
json_files = glob(os.path.join(data_dir, "quora_all_scraped_*_extracted.json"))

# One cleaned "Q: ...\nA: ..." string per (question, answer) pair, across all files.
cleaned_texts = []

for json_path in json_files:
    # Each file is a JSON object; its records are read from the "data" key.
    with open(json_path, "r", encoding="utf-8") as fh:
        records = json.load(fh).get("data", [])

    for record in records:
        question_text = record.get("question_text", "").strip()
        # Join the question with each of its answers, then collapse
        # back-to-back repeated passages in the combined text.
        cleaned_texts.extend(
            remove_repeated_segments(f"Q: {question_text}\nA: {answer.strip()}")
            for answer in record.get("answer_texts", [])
        )

print(f"✅ 清洗后问答对数量：{len(cleaned_texts)}")

# Persist the cleaned texts for the embedding step.
with open("qa_chunks_cleaned.pkl", "wb") as f:
    pickle.dump(cleaned_texts, f)

print("✅ 清洗后的文本已保存到 qa_chunks_cleaned.pkl")


✅ 清洗后问答对数量：10088
✅ 清洗后的文本已保存到 qa_chunks_cleaned.pkl


In [3]:
from sentence_transformers import SentenceTransformer
import faiss
import pickle

# File names used by this cell.
CLEANED_CHUNKS_PATH = "qa_chunks_cleaned.pkl"
CLEANED_INDEX_PATH = "qa_index_cleaned.faiss"

# Read the cleaned texts.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(CLEANED_CHUNKS_PATH, "rb") as f:
    texts = pickle.load(f)

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Embed every text; the result is requested as a NumPy array.
embeddings = model.encode(
    texts,
    show_progress_bar=True,
    convert_to_numpy=True,
)

# Flat L2 index sized to the embedding width; row i corresponds to texts[i].
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Save the texts and the index.
# `texts` is written back to the same path it was loaded from; nothing in this
# cell reassigns it in between.
with open(CLEANED_CHUNKS_PATH, "wb") as f:
    pickle.dump(texts, f)
faiss.write_index(index, CLEANED_INDEX_PATH)

print("✅ 新向量库和索引构建完成！")


  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 316/316 [00:17<00:00, 18.46it/s]


✅ 新向量库和索引构建完成！


In [10]:
import faiss
import pickle
import numpy as np

# 1. Load the FAISS index from disk.
# NOTE(review): this reads "qa_index.faiss" / "qa_chunks.pkl", not the
# "*_cleaned" files written by the index-building cell — confirm which pair is
# meant to be inspected here.
index = faiss.read_index("qa_index.faiss")

# 2. Load the text chunks (qa_chunks.pkl).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("qa_chunks.pkl", "rb") as f:
    qa_chunks = pickle.load(f)

# 3. The index and the text list must have the same number of entries,
#    otherwise position i in one does not correspond to position i in the other.
assert index.ntotal == len(qa_chunks), f"索引数量 ({index.ntotal}) 和文本块数量 ({len(qa_chunks)}) 不一致"

# 4. Show the first few vectors together with their texts.
#    The count is capped at the index size so a small index does not raise, and
#    the header reports the number actually shown (it previously said "three"
#    while the loop printed ten).
preview_count = min(10, index.ntotal)
print(f"索引中的前 {preview_count} 条向量及对应文本：\n")
for i in range(preview_count):
    vector = index.reconstruct(i)  # stored vector at position i
    text = qa_chunks[i]            # text at the same position
    print(f"=== 第 {i} 条 ===")
    print(f"[向量长度]: {len(vector)}")
    print(f"[文本内容]: {text}\n")


索引中的前三条向量及对应文本：

=== 第 0 条 ===
[向量长度]: 384
[文本内容]: Q: As a Java Developer, how can I switch my career to Machine Learning?
A: What skill do you need to learn? Machine learning. If you want to skip the rant and go to my actual advice, scroll to the bottom! Rant Yes, it’s an obvious advantage that you know some programming. Coding proficiency is a basic requirement to do machine learning, although it’s not as important as in a typical software developer job. Mathematical optimization and calculus is somewhat important. Linear algebra and statistics is really important. Probability and combinatorics is really important. But these are just prerequisites. Most importantly, machine learning is an entire branch of computer science Continue ReadingWhat skill do you need to learn? Machine learning. If you want to skip the rant and go to my actual advice, scroll to the bottom! Rant Yes, it’s an obvious advantage that you know some programming. Coding proficiency is a basic requirement to do mach