In [1]:
import os
import re
import json
from collections import defaultdict
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader


In [None]:
def get_keywords(file_path='out/QA.outline'):
    """Return the chapter titles listed in an HTML outline file.

    The outline is scanned for ``<a class="l" ... data-dest-detail='[N,...]'>``
    anchors (presumably the outline written by pdf2htmlEX -- confirm); the
    anchor text is taken as the chapter title.

    Args:
        file_path: Path to the outline document. Defaults to
            ``'out/QA.outline'``, the location the notebook has always used.

    Returns:
        list[str]: Chapter titles in order of first appearance, with
        duplicates removed, HTML entities decoded and surrounding
        whitespace stripped.
    """
    import html  # local import: only this function needs it

    # Read explicitly as UTF-8 (like the other text files in this notebook)
    # instead of relying on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        file_content = file.read()

    # Each match is a (leading integer of data-dest-detail, anchor text) pair.
    chapter_details = re.findall(
        r'<a class="l"[^>]*data-dest-detail=\'\[(\d+),[^\]]+\]\'>(.*?)\s*</a>',
        file_content
    )

    # Keyed by title so that repeated titles collapse into one entry; dict
    # ordering keeps the position of the first occurrence.
    chapter_to_number_dict = {
        title.strip(): int(number) for number, title in chapter_details
    }

    # html.unescape decodes every entity (&amp;, &lt;, &#x...;), not just &amp;.
    return [html.unescape(title).strip() for title in chapter_to_number_dict]


In [None]:
# Provenance of 1.txt (commands kept for reference, not executed here):
# os.system("pdf2htmlEX --embed cfijo --dest-dir out data/QA.pdf")
# html2text out/QA.html utf-8 --ignore-links --escape-all > 1.txt

with open("1.txt", 'r', encoding='UTF-8') as src:
    text = src.read()

# Cut the text at every empty-alt markdown image marker `![](...)`, then strip
# the header and footer: everything up to and including the first line that
# consists solely of the chunk's own index is removed from the start of each
# chunk (presumably that index is the printed page number -- confirm against
# 1.txt).
sections = [
    re.sub(rf'^.*?\n{page_no}\n', "", chunk, flags=re.DOTALL)
    for page_no, chunk in enumerate(re.split(r'!\[\]\(.+?\)', text))
]

# Concatenate the chunks and replace each "\n\n" pair with a single "\n".
all_text = "".join(sections).replace("\n\n", "\n")
with open("all.txt", 'w', encoding='UTF-8') as dst:
    dst.write(all_text)

In [None]:
# Characters whose presence marks a line as running prose rather than a
# stand-alone heading / keyword line.
delimiter = ['，', '。', '：', '！', '-', '的', '是']

# A numbered line that ends with one of these is body text, not a heading.
end_delimiter = ['。', '！', '？', '!', '?', '-', '的', '是']

keywords = get_keywords()
MAX_KEYWORD_LEN = 20
MAX_SENTENCE_LEN = 30

# ---------------------------------------------------------------------------
# Step 1: group the lines of all.txt under the chapter title preceding them.
# A chapter title is a line whose stripped text is one of `keywords`.
# ---------------------------------------------------------------------------
sections = defaultdict(str)
chapter_name = ""
tmp = ""
keyword_set = set(keywords)  # constant-time membership test per line
with open("all.txt", 'r', encoding='UTF-8') as f:
    for line in f:
        stripped_line = line.strip()
        if stripped_line in keyword_set:
            # NOTE(review): text seen before the first title is not flushed
            # here, so it ends up inside the first chapter -- kept as-is.
            if chapter_name != "":
                sections[chapter_name] += "<SEP>" + tmp
                tmp = ""
            chapter_name = stripped_line
        else:
            tmp += line

# Flush the final chapter the same way as every other chapter: append (so an
# earlier segment stored under the same title is not overwritten) and prefix
# with "<SEP>".
if chapter_name != "":
    sections[chapter_name] += "<SEP>" + tmp
else:
    # No title was ever matched: keep the whole text under the empty key.
    sections[chapter_name] = tmp

subsection_dict = defaultdict(list)

# ---------------------------------------------------------------------------
# Step 2: re-flow the hard-wrapped lines of every chapter.
# ---------------------------------------------------------------------------
# One to three dot-separated numbers followed by whitespace, e.g. "1 ",
# "1.1 " or "1.1.1 ". (The previous "{0,1,2}" is not a valid quantifier and
# was matched as literal text, so the pattern could never match a heading.)
heading_pattern = re.compile(r"^\d+(\.\d+){0,2}\s+.*$")

for chapter_name, text in sections.items():
    # Split into lines
    sentences = text.split('\n')
    # Process every line
    processed_sentences = ""
    for sentence in sentences:
        if len(sentence) == 0:
            continue
        # Possibly a numbered heading such as "1.1 Title": numbered AND not
        # ending with any sentence-final delimiter.
        elif heading_pattern.match(sentence) and not sentence.endswith(tuple(end_delimiter)):
            processed_sentences += "\n" + sentence + "\n"
        # Shorter than the maximum keyword length and free of delimiters:
        # treat as a stand-alone heading / keyword line.
        elif len(sentence) < MAX_KEYWORD_LEN and not any(it in sentence for it in delimiter):
            processed_sentences += "\n" + sentence + "\n"
        # Long line, or a line with a comma that does not end the sentence:
        # join with the following line (no newline appended).
        elif len(sentence) > MAX_SENTENCE_LEN - 2 or ("，" in sentence and not sentence.endswith("。")):
            processed_sentences += sentence
        # The first character after the line break is a delimiter: glue the
        # line onto the previous one by dropping that line break. Only a
        # trailing newline is removed, so no real character is ever deleted
        # when the previous line was joined without one.
        elif sentence.startswith(tuple(delimiter)):
            if processed_sentences.endswith("\n"):
                processed_sentences = processed_sentences[:-1]
            processed_sentences += sentence + "\n"
        else:
            processed_sentences += sentence + "\n"

    # Store the re-assembled text (overwriting an existing key does not change
    # the dict's size, so this is safe while iterating).
    processed_sentences = processed_sentences.strip("\n")
    if len(processed_sentences) > 0:
        sections[chapter_name] = processed_sentences

In [None]:
def save_docs_to_jsonl(array, file_path:str)->None:
    """Write documents to ``file_path`` in JSON Lines format.

    Args:
        array: Iterable of objects exposing a ``dict()`` method whose result
            is JSON-serialisable; one JSON object is written per element.
        file_path: Destination path. An existing file is overwritten.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of the platform default (which may
    # be unable to encode them).
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for doc in array:
            jsonl_file.write(json.dumps(doc.dict(), ensure_ascii=False) + '\n')


# print(sections["前向碰撞减缓系统"])
chunk_size, chunk_overlap = 200, 20

# Section-level splitter: cuts on blank lines ("\n\n" taken literally, not as
# a regex), measures length with len(), and records each chunk's start index.
text_splitter = CharacterTextSplitter(
    separator="\n\n", chunk_size=chunk_size, chunk_overlap=chunk_overlap,
    add_start_index=True, length_function=len, is_separator_regex=False,
)

# Build the texts and their metadata side by side so they stay aligned: every
# section text is tagged with its chapter title under the "keyword" key.
section_texts = []
section_metadatas = []
for section_keyword, section_text in sections.items():
    section_texts.append(section_text)
    section_metadatas.append({"keyword": section_keyword})

section_docs = text_splitter.create_documents(section_texts, metadatas=section_metadatas)
save_docs_to_jsonl(section_docs, "section_docs.jsonl")

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS 

# Embedding model, given as a path on the local filesystem.
model_name = "/home/lzw/.hf_models/stella-base-zh-v2"
model_kwargs = {"device": "cuda"}
encode_kwargs = {"normalize_embeddings": False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# FAISS index over the section-level documents, wrapped in a retriever that is
# configured with k=5.
db = FAISS.from_documents(section_docs, embeddings)
retriever = db.as_retriever(search_kwargs={'k': 5})



In [None]:
from fuzzywuzzy import fuzz, process
# Ad-hoc check of the section-level retriever.
# NOTE(review): the query ends with both a full-width and an ASCII question
# mark -- presumably a typo; confirm before changing the string.
query = "设置无钥匙解锁中单门和全车的区别在于什么？?"
ret_docs = retriever.get_relevant_documents(query)
# Show only the text of each hit; this list is the cell's displayed output.
all_sent = [doc.page_content for doc in ret_docs]
all_sent
# sent_db.similarity_search(query, top_k=)
# print(ret_docs)

Sentence Cut

In [None]:
# Sentence-level pass: smaller chunks cut on single line breaks.
# Note that this rebinds the module-level chunk_size / chunk_overlap.
chunk_size, chunk_overlap = 100, 10
sentence_splitter = CharacterTextSplitter(
    separator="\n", chunk_size=chunk_size, chunk_overlap=chunk_overlap,
    length_function=len, is_separator_regex=False,
)

# Re-split the section-level documents, persist the result, then index it.
sent_docs = sentence_splitter.split_documents(section_docs)
save_docs_to_jsonl(sent_docs, "sent_docs.jsonl")
sent_db = FAISS.from_documents(sent_docs, embeddings)

In [None]:
query = "驾驶员状态监测系统是如何工作的？"
# Wrap the sentence-level index in a retriever configured with k=5.
sent_retriever = sent_db.as_retriever(search_kwargs={'k': 5})
# The same query is sent to three retrievers. These are bare expression
# statements, so only the value of the last line is displayed by the cell.
sent_retriever.get_relevant_documents(query)
# NOTE(review): `index_retriever` is not assigned in any cell above this one,
# so this line raises NameError on a fresh kernel -- confirm where it is
# supposed to come from.
index_retriever.get_relevant_documents(query)
retriever.get_relevant_documents(query)

In [None]:
from langchain.retrievers import BM25Retriever
query = "如何启用或停用手套箱密码保护功能？"
# The BM25 index is built from the section titles only (the keys of
# `sections`), not from the section bodies.
# NOTE(review): the titles are Chinese and no tokeniser / preprocess function
# is passed -- confirm the retriever's default tokenisation is adequate for
# unsegmented Chinese text.
bm25_retriever = BM25Retriever.from_texts(list(sections.keys()))
bm25_retriever.get_relevant_documents(query)
# NOTE(review): `index_retriever` is not assigned in any cell above this one,
# so this line raises NameError on a fresh kernel -- confirm its origin.
index_retriever.get_relevant_documents(query)

## parent vectorstore
`https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever`

In [7]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.embeddings import HuggingFaceEmbeddings


# Embedding model, given as a path on the local filesystem.
model_name = "/home/lzw/.hf_models/stella-base-zh-v2"
model_kwargs = {"device": "cuda"}
encode_kwargs = {"normalize_embeddings": False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Splitter that produces the child documents.
child_splitter = RecursiveCharacterTextSplitter(
    separators=["\n", "。\n"], chunk_size=100, chunk_overlap=0
)

# Vector store in which the child chunks are indexed.
vectorstore = Chroma(collection_name="full_documents", embedding_function=embeddings)

# Storage layer for the parent documents.
store = InMemoryStore()

# No parent_splitter is passed here, only a child splitter.
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore, docstore=store, child_splitter=child_splitter
)

No sentence-transformers model found with name /home/lzw/.hf_models/stella-base-zh-v2. Creating a new one with MEAN pooling.


In [8]:
from langchain.schema import Document
import json
def load_docs_from_jsonl(file_path):
    """Load documents from a JSON Lines file.

    Args:
        file_path: Path to a file holding one JSON object per line; each
            object is passed to ``Document`` as keyword arguments.

    Returns:
        list: One ``Document`` per non-blank line, in file order.
    """
    array = []
    # Read as UTF-8: the records contain non-ASCII text stored verbatim, so
    # the platform default encoding must not be relied on.
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Tolerate blank lines (e.g. a trailing empty line), which
            # json.loads would reject.
            if not line.strip():
                continue
            array.append(Document(**json.loads(line)))
    return array

# Reload the section-level documents from section_docs.jsonl and add them to
# `retriever`; ids=None leaves the document ids unspecified.
section_docs = load_docs_from_jsonl("section_docs.jsonl")
retriever.add_documents(section_docs, ids=None)

In [9]:
# Query the vector store directly (bypassing `retriever`) and display the hits.
ret = vectorstore.similarity_search("为什么驾驶之前需要确保挡风玻璃无冰渣、积雪或冷凝水？")
ret

[Document(page_content='■ 冬季使用雨刮前，请先清除挡风玻璃上的冰和积雪并确认雨刮片没有冻结在挡风玻璃上。\n■ 当挡风玻璃上有尘沙、鸟粪、昆虫、树浆等异物时，请先清洁挡风玻璃，否则会损坏雨刮片/影响雨刮片清洁效果。', metadata={'doc_id': 'b8c510bb-9ffd-450b-b0f4-941f62fd40a6', 'keyword': '后雨刮和洗涤器', 'start_index': 0}),
 Document(page_content='■ 恶劣天气下能见度降低：强降雪、高吹雪、雨、浓雾和多尘天气对挡风玻璃和前保险杠均有影响，可能会降低系统功能。', metadata={'doc_id': 'd68a3a12-bf2c-465b-85c4-552c8d6bb5f5', 'keyword': '前向碰撞减缓系统（CMSF）局限性', 'start_index': 1117}),
 Document(page_content='<SEP>冬季行驶\n冬季行驶前，有必要做好准备工作和检查工作。冬季来临前，请检查以下几项：\n■ 在发动机冷却液壶中加入足量防冻液。为获取最佳防冻效果，请使用正确规格的冷却液。', metadata={'doc_id': 'acce8e5d-3ec9-4c8f-9373-5353e7233b52', 'keyword': '冬季行驶', 'start_index': 0}),
 Document(page_content='■ 避免在挡风玻璃干燥的情况下开启雨刮，否则可能导致雨刮片和挡风玻璃损坏。■ 定期清洁和检查雨刮片，否则雨刮片使用寿命可能会缩短。', metadata={'doc_id': 'b8c510bb-9ffd-450b-b0f4-941f62fd40a6', 'keyword': '后雨刮和洗涤器', 'start_index': 0})]

In [None]:
# This text splitter is used to create the parent documents
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
# This text splitter is used to create the child documents
# It should create documents smaller than the parent
child_splitter = RecursiveCharacterTextSplitter(chunk_size=20, chunk_overlap=0)
# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="split_parents", embedding_function=embeddings
)
# The storage layer for the parent documents
store = InMemoryStore()
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)
retriever.add_documents(section_docs)

In [None]:
# Number of keys currently held in `store`.
len(list(store.yield_keys()))

In [10]:
# Run the same query against the vector store directly and through
# `retriever`, then display both result lists side by side.
sub_docs = vectorstore.similarity_search("如何打开车辆尾门？")
ret_docs = retriever.get_relevant_documents("如何打开车辆尾门？")
sub_docs, ret_docs

([Document(page_content='<SEP>紧急情况下，您可以使用机械方式从车内应急打开尾门。1 折叠后排座椅靠背，进入后备厢。\n2 使用机械钥匙或类似工具，拆下保护盖。\n3 沿箭头方向拨动控制杆，解锁尾门。\n4 向外推动，打开尾门。', metadata={'doc_id': '03028a02-8d7f-478b-ae37-9cabdfbb861b', 'keyword': '应急打开尾门', 'start_index': 0}),
  Document(page_content='<SEP>紧急情况下，您可以使用机械方式从车内应急打开尾门。1 折叠后排座椅靠背，进入后备厢。\n2 使用机械钥匙或类似工具，拆下保护盖。\n3 沿箭头方向拨动控制杆，解锁尾门。\n4 向外推动，打开尾门。', metadata={'doc_id': '97dac7f1-b7bd-4869-9127-03d0e4b25ae6', 'keyword': '应急打开尾门', 'start_index': 0}),
  Document(page_content='<SEP>打开尾门\n\n尾门开启按键\n车辆处于解锁状态时，轻按尾门下部的开启按键，尾门自动打开。说明！□ 打开/关闭尾门，后备厢灯会相应地自动点亮或熄灭。\n□ 尾门打开时，车辆将自动点亮后部位置灯，以提醒后方车辆的驾驶员，避免发生碰撞。\n\n关闭尾门', metadata={'doc_id': 'd4e26474-248a-4785-b685-855e0774e8c9', 'keyword': '打开尾门', 'start_index': 0}),
  Document(page_content='<SEP>打开尾门\n\n尾门开启按键\n车辆处于解锁状态时，轻按尾门下部的开启按键，尾门自动打开。说明！□ 打开/关闭尾门，后备厢灯会相应地自动点亮或熄灭。', metadata={'doc_id': '640a2964-ba9b-4889-9897-04adab2c898b', 'keyword': '打开尾门', 'start_index': 0})],
 [Document(page_content='<SEP>紧急情况下，您可以使用机械方式从车内应急打开尾门。1 折叠后排座椅靠背，进入后备厢。\