### 导包和变量设置

In [1]:
# 引入PyPDFDirectoryLoader，可以从文件夹中一次性加载所有pdf文件
# 然后使用RecursiveCharacterTextSplitter对解析出来的文档进行切分，主要根据分隔符，chunk_size以及overlap等

from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.retrievers.bm25 import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_community.vectorstores import Chroma, FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch
import os
import gc


# --- Model checkpoints (local paths) ---
EMB_MODEL = "/root/autodl-tmp/models/bge-large-zh-v1_5"
RERANK_MODEL = "/root/autodl-tmp/models/bge-reranker-large"

# --- Corpus and vector-store locations ---
DOCS_DIR = "/root/autodl-tmp/dataset/rag/A_document"      # folder of source PDFs
PERSIST_DIR = "/root/autodl-tmp/vectorDatabase/faiss"     # where the FAISS index is saved

# --- Question / submission files ---
# NOTE: despite the `_DIR` suffix, these two are CSV *file* paths, not directories.
QUERY_DIR = "/root/autodl-tmp/dataset/rag/A_question.csv"
SUB_DIR = "/root/autodl-tmp/dataset/rag/submit.csv"

# `query` holds the questions to answer; `sub` is the submission template
# whose `answer` and `embedding` columns are overwritten at the end of the notebook.
query = pd.read_csv(QUERY_DIR)
sub = pd.read_csv("/root/autodl-tmp/dataset/rag/submit_example.csv")

# Preview the first rows of both frames.
display(query.head(3), sub.head(3))

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,ques_id,question
0,1,根据年度报告，2022年中国联通在向数字科技领军企业转变的过程中实现了哪些维度的转型升级？
1,2,告诉我2022年联通产业互联网收入的同比增长速度。
2,3,根据2022年度报告，中国联通的企业定位是什么？


Unnamed: 0,ques_id,question,answer,embedding
0,1,根据年度报告，2022年中国联通在向数字科技领军企业转变的过程中实现了哪些维度的转型升级？,我们坚定践行网络强国、数字中国、智慧社会战略部署，今天的中国联通，正在从传统运营商加速向数字...,"-0.02707982249557972,-0.009818901307880878,-0...."
1,2,告诉我2022年联通产业互联网收入的同比增长速度。,我们坚定践行网络强国、数字中国、智慧社会战略部署，今天的中国联通，正在从传统运营商加速向数字...,"-0.02707982249557972,-0.009818901307880878,-0...."
2,3,根据2022年度报告，中国联通的企业定位是什么？,我们坚定践行网络强国、数字中国、智慧社会战略部署，今天的中国联通，正在从传统运营商加速向数字...,"-0.02707982249557972,-0.009818901307880878,-0...."


### PDF文档解析和切分

In [None]:
# Load every PDF found under DOCS_DIR and split the text into chunks.
loader = PyPDFDirectoryLoader(DOCS_DIR)

# Split only at Chinese sentence-ending punctuation (keep_separator='end'
# keeps the punctuation attached to the chunk it terminates), with
# chunk_size=200 and no overlap between neighbouring chunks.
docs = loader.load_and_split(
    RecursiveCharacterTextSplitter(
        chunk_size=200,
        chunk_overlap=0,
        separators=["。", "！", "？"],
        keep_separator='end',
    ),
)

# Text extracted from the PDFs can contain lone surrogate code points, which
# cannot be encoded as UTF-8 (a previous run failed with
# "UnicodeEncodeError: ... surrogates not allowed" while emitting chunk text).
# Drop them here so that printing the chunks and writing them to the
# submission CSV later on does not fail.
for doc in docs:
    doc.page_content = doc.page_content.encode("utf-8", errors="ignore").decode("utf-8")

# Print the number of chunks.
print(len(docs))
# print(docs[0].page_content)

# # Print every chunk to inspect how well the splitting worked
# for i, item in enumerate(docs):
#     print(f"the {i} doc's content is: {item.page_content}")

ERROR:tornado.general:Uncaught exception in ZMQStream callback
Traceback (most recent call last):
  File "/root/miniconda3/envs/dl/lib/python3.10/site-packages/jupyter_client/session.py", line 95, in json_packer
UnicodeEncodeError: 'utf-8' codec can't encode characters in position 713444-713448: surrogates not allowed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/root/miniconda3/envs/dl/lib/python3.10/site-packages/zmq/eventloop/zmqstream.py", line 560, in _run_callback
    f = callback(*args, **kwargs)
  File "/root/miniconda3/envs/dl/lib/python3.10/site-packages/ipykernel/iostream.py", line 170, in _handle_event
    event_f()
  File "/root/miniconda3/envs/dl/lib/python3.10/site-packages/ipykernel/iostream.py", line 649, in _flush
    self.session.send(
  File "/root/miniconda3/envs/dl/lib/python3.10/site-packages/jupyter_client/session.py", line 852, in send
    elif stream:
  File "/root/miniconda3/envs/dl/lib/pytho

### 文本块向量化（比赛限定使用bge-large-zh-v1.5模型）

In [None]:
# Embed every chunk with the model stored at EMB_MODEL and index the
# resulting vectors in an in-memory FAISS store.
embeddings = HuggingFaceEmbeddings(show_progress=True, model_name=EMB_MODEL)
vectordb = FAISS.from_documents(docs, embeddings)

# Write the index to disk under PERSIST_DIR.
vectordb.save_local(folder_path=PERSIST_DIR)

Batches: 100%|██████████| 154/154 [00:32<00:00,  4.71it/s]


### 混合检索器

#### bm25 
- k1：较高的 k1 值意味着词频对评分的影响更大。
- b：当 b=1 时，文档长度的影响最大；当 b=0 时，文档长度不影响评分。
- langchain 的 BM25Retriever 默认使用 split() 按空白切分（适用于英文），中文需要先用 jieba 分词。

In [None]:
import jieba

# Sparse lexical retriever. jieba.lcut is supplied as the tokenizer because
# the default preprocessing splits on whitespace, which does not segment Chinese.
bm25_retriever = BM25Retriever.from_documents(
    docs,
    bm25_params=dict(k1=1.5, b=0.75),
    preprocess_func=jieba.lcut,
    k=5,
)

# Dense retriever backed by the FAISS index; returns 5 hits per query.
dense_retriever = vectordb.as_retriever(search_kwargs=dict(k=5))

# Hybrid retriever: both result lists are combined with equal weights.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, dense_retriever],
    weights=[0.5, 0.5],
)

### 文本召回和重排

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

def rerank(questions, retriever, top_n=5, cut_len=384):
    """Retrieve, rerank and concatenate context for each question.

    A cross-encoder loaded from ``RERANK_MODEL`` is wrapped in a
    ``CrossEncoderReranker`` and placed on top of ``retriever`` through a
    ``ContextualCompressionRetriever``.

    Args:
        questions: Iterable of query strings.
        retriever: Base retriever whose results are reranked.
        top_n: Passed to ``CrossEncoderReranker`` as the number of documents
            to keep after reranking.
        cut_len: Maximum number of characters kept in each answer.

    Returns:
        A list with one string per question: the ``page_content`` of the
        returned documents joined in order (no separator) and truncated to
        ``cut_len`` characters.
    """
    cross_encoder = HuggingFaceCrossEncoder(model_name=RERANK_MODEL)
    reranking_retriever = ContextualCompressionRetriever(
        base_compressor=CrossEncoderReranker(model=cross_encoder, top_n=top_n),
        base_retriever=retriever,
    )

    def _answer_for(question):
        # Join the reranked passages, then cut to the allowed length.
        hits = reranking_retriever.invoke(question)
        return ''.join(hit.page_content for hit in hits)[:cut_len]

    return [_answer_for(question) for question in tqdm(questions)]

# Run hybrid retrieval + reranking for every question in the query file.
questions = query['question'].tolist()
rerank_answers = rerank(questions, retriever=ensemble_retriever)

# Spot-check the answer produced for the first question.
print(rerank_answers[0])

### 提交

In [None]:
def emb(answers, emb_batch_size = 4):
    """Encode answer strings into normalized sentence embeddings.

    The model at ``EMB_MODEL`` is loaded in half precision, the answers are
    encoded in slices of ``emb_batch_size``, and the model is released
    afterwards so that its memory can be reclaimed.

    Args:
        answers: Sequence of strings to embed.
        emb_batch_size: Number of answers passed to ``model.encode`` at a
            time. Must be a positive integer.

    Returns:
        A 2-D NumPy array with one row per answer. For empty ``answers`` the
        array has zero rows.

    Raises:
        ValueError: If ``emb_batch_size`` is smaller than 1.
    """
    # Fail fast, before paying the cost of loading the model.
    if emb_batch_size < 1:
        raise ValueError("emb_batch_size must be a positive integer")

    model = SentenceTransformer(EMB_MODEL, trust_remote_code=True).half()
    try:
        batch_embeddings = [
            model.encode(answers[start:start + emb_batch_size], normalize_embeddings=True)
            for start in tqdm(range(0, len(answers), emb_batch_size), desc="embedding sentences")
        ]
        if batch_embeddings:
            all_sentence_embeddings = np.concatenate(batch_embeddings, axis=0)
        else:
            # np.concatenate rejects an empty list; return an empty matrix
            # with the model's embedding width instead of crashing.
            dim = model.get_sentence_embedding_dimension() or 0
            all_sentence_embeddings = np.empty((0, dim), dtype=np.float32)
        print('emb_model max_seq_length: ', model.max_seq_length)
        print('emb_model embeddings_shape: ', all_sentence_embeddings.shape[-1])
    finally:
        # Release the model even when encoding fails: in a notebook the stored
        # traceback keeps this frame alive, which would otherwise keep the
        # model (and its device memory) referenced.
        del model
        gc.collect()
        torch.cuda.empty_cache()
    return all_sentence_embeddings

# Embed the final answers and fill in the submission template.
all_sentence_embeddings = emb(rerank_answers)

sub['answer'] = rerank_answers
# Each embedding is stored as a single comma-separated string of its components.
sub['embedding'] = [','.join(map(str, vector)) for vector in all_sentence_embeddings]

# Write the submission file without the DataFrame index, then preview it.
sub.to_csv(SUB_DIR, index=False)
sub.head()

### 后续可能提分点
- 引入LLM
   * LLM 递归判断/抽取
   * rag-fusion 查询改写
   * 构建知识图谱



### 注意：
- 在分块、重排等过程中可以使用公开库和模型，但禁止使用LLM直接生成最终答案。
- 禁止使用LLM继续调整精排得到的文本块，如压缩文本块长度。
- 禁止使用LLM直接从文档获取问题答案。