In [1]:
import os
import sys
import json

from typing import Dict, Tuple, Iterable

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain.docstore.document import Document
from langchain.llms import ChatGLM
from langchain.chains import LLMChain

from src.llm.template_manager import template_manager
from src.parse_pdf import parse_page_of_content


In [2]:
def load_docs_from_jsonl(file_path)->Iterable[Document]:
    array = []
    with open(file_path, 'r') as jsonl_file:
        for line in jsonl_file:
            data = json.loads(line)
            obj = Document(**data)
            array.append(obj)
    return array

In [3]:
# %timeit
documents, table_of_content = parse_page_of_content()
all_sub_sections = {sub_k : sub_v for _, v in table_of_content.items() for sub_k, sub_v in v.items() }


all_key_word = list(all_sub_sections.keys()) 


section_key_word = list(table_of_content.keys())
table_of_content, section_key_word

# cut_section
section_start_page_id = dict()
for k, v in table_of_content.items():
    section_start_page_id[k] = list(v.values())[0][0] - 2

section_start_page_id

Not matched chapter name and page_num, source: 车载12V电源......................................................... 128
Not matched chapter name and page_num, source: 车载12V电源......................................................... 129
Not matched chapter name and page_num, source: 4
Not matched chapter name and page_num, source: 360°全景影像 .........................................................226
Not matched chapter name and page_num, source: 6
Not matched chapter name and page_num, source: 123


{'前言': 9,
 '用车前准备': 15,
 '装载货物': 19,
 '上车和下车': 37,
 '驾驶前的准备': 49,
 '仪表和灯光': 67,
 '安全出行': 97,
 '启动和驾驶': 139,
 '驾驶辅助': 167,
 '空调': 239,
 '中央显示屏': 259,
 'Lynk&CoApp': 279,
 '高压系统': 285,
 '保养和维护': 301,
 'OTA升级': 323,
 '紧急情况下': 327,
 '技术资料': 345}

In [4]:
def find_first_key_geq(d, x):
    for key, value in reversed(d.items()):
        if x >= value:
            return key
    return "None"  # 返回 None 如果没有找到满足条件的键

with open("data/all_subsection_keys.txt", "r", encoding="utf-8") as f:
    all_sub_key_word = json.loads(f.read())


documents = load_docs_from_jsonl('data/section_documents.jsonl')
all_key_word = [doc.metadata["key_word"] for doc in documents]

In [5]:
documents[10].metadata['page'] + 1, documents[10].page_content[:10]

KeyError: 'page'

In [None]:
documents = load_docs_from_jsonl('data/page_documents.jsonl')
page_id2doc = {doc.metadata["page"]: doc for doc in documents}

all_text = ""
sub_section_key = ""

for doc in documents:
    # 页码删除 and 页眉删除
    page_id = doc.metadata['page'] + 1
    section_name = find_first_key_geq(section_start_page_id, page_id)
    content = doc.page_content.replace(f"{section_name}\n{page_id}", "")

    new_lines = []
    tmp = ""
    ## 修正换行
    for line in content.split("\n"):
        line = line.strip()
        tmp += line
        if line.endswith("。"):
            new_lines.append(tmp)
            tmp = ""
        elif line in all_key_word:
            # 添加 <sub_section> 标签
            sub_section_key = line
            line = "<\sub_section>\n\n<sub_section>" + line
            new_lines.append(line)
            tmp = ""
        elif line in all_sub_key_word:
            # TODO: 判断是否需要子章节
            line = "<SEP>" + line
            new_lines.append(line)
            tmp = ""

    new_lines.append(tmp)
    content = "\n".join(new_lines)

    content = content.replace("警告！", "<SEP>警告:\n")
    content = content.replace("注意！", "<SEP>注意:\n")
    content = content.replace("说明！", "<SEP>说明:\n")
    # all_text += f"\n<PAGE_SEP> page_id:{page_id}\n" + content
    all_text += content

# 去掉目录
all_text = "\n".join(all_text.split("\n")[12:])

with open("data/all_text.txt", "w") as f:
    f.write(all_text)

In [None]:
from collections import defaultdict
def split_and_deduplicate(s):
    # 按照"。"为分隔符切分字符串
    parts = s.split("。")
    seen = set()
    deduplicated_parts = []
    # 去重，同时保证去重后的顺序不变
    for part in parts:
        if part not in seen:
            seen.add(part)
            deduplicated_parts.append(part)
    return "。".join(deduplicated_parts)

### Build section cut

In [None]:
# build section_documents & sub_section_documents
def save_docs_to_jsonl(array:Iterable[Document], file_path:str)->None:
    with open(file_path, 'w') as jsonl_file:
        for doc in array:
            jsonl_file.write(json.dumps(doc.dict(), ensure_ascii=False) + '\n')


sub_sections = all_text.split("<\sub_section>")

# # section document cut
# section_documents = []
# sections_dict = defaultdict(str)

# for sub_section in sub_sections:
#     key_word = sub_section[2:].split("\n")[0].strip("<sub_section>")
#     sections_dict[key_word] += "\n".join(sub_section[2:].split("\n")[1:])


# for k, v in sections_dict.items():
#     sections_dict[k] = split_and_deduplicate(v)
#     section_documents.append(Document(page_content=sections_dict[k], metadata={"key_word":k}))

# save_docs_to_jsonl(section_documents, "data/section_documents.jsonl")


# sub_section cuts
sub_section_documents = []
sub_sections_dict = defaultdict(str)
for sub_section in sub_sections:
    sub_sub_sections = "\n".join(sub_section.split("\n")[1:]).split("<SEP>")
    if len(sub_sub_sections) < 2:
        continue

    sub_sub_sections = [sec for sec in sub_sub_sections[1:] if not (sec.startswith("警告") or sec.startswith("注意") or sec.startswith("说明") ) ]
    for sub_sub_section in sub_sub_sections:
        if len(sub_sub_section.split("\n")) < 2:
            continue
        sub_key_word = sub_sub_section.split("\n")[0]
        sub_sections_dict[sub_key_word] += "\n".join(sub_sub_section.split("\n")[1:])
        # print(sub_key_word, sub_sections_dict[sub_key_word][:10])

for k, v in sub_sections_dict.items():
    sub_sections_dict[k] = split_and_deduplicate(v)
    sub_section_documents.append(Document(page_content=sub_sections_dict[k], metadata={"key_word":k}))
save_docs_to_jsonl(sub_section_documents, "data/sub_section_documents.jsonl")

In [18]:
sub_section_documents = load_docs_from_jsonl('data/sub_section_documents.jsonl')
section_documents = load_docs_from_jsonl('data/section_documents.jsonl')


# no overlap key
section_key = [doc.metadata["key_word"] for doc in section_documents]
sub_section_key = [doc.metadata["key_word"] for doc in sub_section_documents]

set(section_key) & set(sub_section_key) == set()

True

In [35]:
# all
def save_docs_to_jsonl(array:Iterable[Document], file_path:str)->None:
    with open(file_path, 'w') as jsonl_file:
        for doc in array:
            jsonl_file.write(json.dumps(doc.dict(), ensure_ascii=False) + '\n')


all_documents = sub_section_documents + section_documents

all_index = [Document(page_content=doc.metadata["key_word"]) for doc in all_documents]
all_index_db = FAISS.from_documents(all_index, embeddings)
all_index_db.save_local("vector_store/all_index")


all_content = [Document(page_content=doc.page_content, metadata={"key_word":doc.metadata["key_word"]}) for doc in all_documents]
all_content_db = FAISS.from_documents(all_content, embeddings)
all_content_db.save_local("vector_store/all_content")

In [8]:
model_name = "/home/lzw/.hf_models/stella-base-zh-v2"
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": "cuda"} ,
    encode_kwargs={"normalize_embeddings": False})

sub_section_db = FAISS.from_documents(sub_section_documents, embeddings)
sub_section_db.save_local("vector_store/sub_section_content")

# # db = FAISS.from_documents(key_index, embeddings)
# sub_key_index = [Document(page_content=doc.metadata["key_word"]) for doc in sub_section_documents]
# sub_index_db = FAISS.from_documents(sub_key_index, embeddings)
# sub_index_db.save_local("vector_store/sub_index")

No sentence-transformers model found with name /home/lzw/.hf_models/stella-base-zh-v2. Creating a new one with MEAN pooling.


In [36]:
index_db = FAISS.load_local("vector_store/all_index", embeddings)
content_db = FAISS.load_local("vector_store/all_content", embeddings)
question = "如何通过遥控钥匙启动车辆"
ret1 = index_db.similarity_search_with_relevance_scores(question)
print("search result:", ret1, ret1[0][0].page_content)
related_str = content_db.similarity_search_with_relevance_scores(question, filter={"key_word": ret1[0][0].page_content})
print(ret1, "\n", related_str)

search result: [(Document(page_content='通过遥控钥匙启动车辆'), -15.168591166312694), (Document(page_content='通过蓝牙钥匙启动车辆'), -59.629723803289494), (Document(page_content='使用遥控钥匙打开/关闭车窗'), -66.37679092038796), (Document(page_content='使用遥控钥匙解锁和闭锁'), -86.33092435450425)] 通过遥控钥匙启动车辆
[(Document(page_content='通过遥控钥匙启动车辆'), -15.168591166312694), (Document(page_content='通过蓝牙钥匙启动车辆'), -59.629723803289494), (Document(page_content='使用遥控钥匙打开/关闭车窗'), -66.37679092038796), (Document(page_content='使用遥控钥匙解锁和闭锁'), -86.33092435450425)] 
 [(Document(page_content='1请检查是否携带遥控钥匙。\n<SEP>说明:\n□只有经过正确编码的遥控钥匙才能启动车辆。\n2请将挡位置于驻车挡（P）。\n3将遥控钥匙放置于车内。\n4将制动踏板踩到底。\n5长按START/STOP按钮，车辆启动后松开按钮。\n说明□动力电池电量充足时，纯电启动车辆，并点亮READY指示灯，此时发动机处于未启动状态。\n车辆紧急启动遥控钥匙电池电量低，系统无法检测到钥匙时，仪表显示屏上会弹出消息提示您，未检测到钥匙。此时，请按照以下步骤启动车辆：1将遥控钥匙放入前杯托底部。\n2按照前文中遥控钥匙启动车辆所述步骤启动车辆。\n<SEP>注意:\n■如果您尝试了三次都未能启动车辆，请避免再次尝试，并联系Lynk&Co领克中心。\n<SEP>警告:\n请严格遵循以下注意事项，有助于避免发生事故：■启动车辆前请检查确认方向盘、座椅、内后视镜或外后视镜是否调节到安全、舒适的位置。\n■启动车辆前请检查制动踏板是否可以踩到底。\n■请检查周围环境是否满足车辆启动的条件，若不满足切勿启动车辆。\n■如果发动机还没



## Build Sentence Cut

In [39]:
documents = load_docs_from_jsonl('data/page_documents.jsonl')
page_id2doc = {doc.metadata["page"]: doc for doc in documents}

all_text = ""
sub_section_key = ""

for doc in documents:
    # 页码删除 and 页眉删除
    page_id = doc.metadata['page'] + 1
    section_name = find_first_key_geq(section_start_page_id, page_id)
    content = doc.page_content.replace(f"{section_name}\n{page_id}", "")

    new_lines = []
    tmp = ""
    ## 修正换行
    for line in content.split("\n"):
        line = line.strip()
        tmp += line
        if line.endswith("。"):
            new_lines.append(tmp)
            tmp = ""
        elif line in all_key_word:
            # 添加 <sub_section> 标签
            new_lines.append(line)
            tmp = ""

    new_lines.append(tmp)
    content = "\n".join(new_lines)

    content = content.replace("警告！", "<SEP>警告:\n")
    content = content.replace("注意！", "<SEP>注意:\n")
    content = content.replace("说明！", "<SEP>说明:\n")
    # all_text += f"\n<PAGE_SEP> page_id:{page_id}\n" + content
    all_text += content

In [41]:
from langchain.text_splitter import CharacterTextSplitter
print("all_text_len", len(all_text))


text_splitter = CharacterTextSplitter(
    
)

134198
