### 导包和变量设置

In [1]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter,TokenTextSplitter
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.retrievers.bm25 import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_community.vectorstores import Chroma, FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch
import os
import gc
import re


# --- Input data -------------------------------------------------------------
DOCS_DIR = "./A/A_document"        # directory holding the source PDF files
QUERY_DIR = "./A/A_question.csv"   # CSV with the questions to answer
SUB_DIR = "./submit_example.csv"   # example submission CSV

# --- Models -----------------------------------------------------------------
EMB_MODEL = "./bge-large-zh-v1.5"          # local embedding model directory
RERANK_MODEL = "BAAI/bge-reranker-large"   # reranker model identifier

# --- Output -----------------------------------------------------------------
PERSIST_DIR = "./vectordb"         # where the vector index is saved


TypeError: 

In [None]:
# Load the question table and the example submission table, then preview the
# first three rows of each.
query = pd.read_csv(QUERY_DIR)
# Use the SUB_DIR constant rather than repeating the path literal, so the
# location is configured in a single place.
sub = pd.read_csv(SUB_DIR)
display(query.head(3))
display(sub.head(3))

### PDF文档解析和切分

In [None]:
# Read every PDF under DOCS_DIR into LangChain documents.
loader = PyPDFDirectoryLoader(DOCS_DIR)
# Use load() rather than load_and_split(): the latter additionally runs
# LangChain's default recursive splitter (with chunk overlap) over each page.
# The pages are concatenated back into whole documents further down, so any
# overlapping chunks would show up as duplicated text in the corpus.
pages = loader.load()
# File names present in the document directory.
pdf_list = os.listdir(DOCS_DIR)

In [None]:
# Build {pdf file name: full text} by concatenating page contents in load order.
# The key is derived with os.path.basename instead of slicing the last 8
# characters of the path, so file names of any length work, and each page is
# assigned by exact file name rather than by substring match. A single pass over
# the pages replaces the nested (files x pages) loop.
pdf_text = {}
for pdf_page in tqdm(pages):
    pdf_name = os.path.basename(pdf_page.metadata['source'])
    pdf_text.setdefault(pdf_name, []).append(pdf_page.page_content)
# Join once per file instead of growing a string with repeated +=.
pdf_text = {pdf_name: ''.join(parts) for pdf_name, parts in pdf_text.items()}
print('key:pdf value:text')

In [None]:

import torch
import re

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the tokenizer and a sequence-classification model from the local
# ./bert-base-chinese directory using the transformers Auto* classes.
# Both objects are module-level globals read by filter_text below.
# NOTE(review): presumably ./bert-base-chinese is the stock pretrained
# checkpoint; if so it carries no fine-tuned classification head and the
# predicted label is not meaningful -- confirm before relying on it.
tokenizer = AutoTokenizer.from_pretrained('./bert-base-chinese')
model = AutoModelForSequenceClassification.from_pretrained('./bert-base-chinese')

# Boilerplate found in news articles: the corpus disclaimer and publication
# bylines. Raw strings keep `\s` / `\d` as regex escapes without triggering
# Python's invalid-escape warnings; compiling once avoids rebuilding the
# pattern on every call.
_NEWS_BOILERPLATE = re.compile(
    r'本文档为2024CCFBDCI比赛用语料的一部分。[^\s]+仅允许在本次比赛中使用。'
    r'|发布时间：[^\s]+发布人：新闻宣传中心'
    r'|发布时间：[^\s]+发布人：新闻发布人'
    r'|发布时间：\d{4}年\d{1,2}月\d{1,2}日'
)

# Boilerplate found in financial reports: "important notice" sections and
# board / audit-committee preambles.
_REPORT_BOILERPLATE = re.compile(
    r'第一节重要提示[^\s]+本次利润分配方案尚需提交本公司股东大会审议。'
    r'|一重要提示[^\s]+股东大会审议。'
    r'|一、重要提示[^\s]+季度报告未经审计。'
    r'|本公司董事会及全体董事保证本公告内容不存在任何虚假记载、[^\s]+季度财务报表是否经审计□是√否'
    r'|中国联合网络通信股份有限公司（简称“公司”）董事会审计委员会根据相关法律法规、[^\s]+汇报如下：'
)


def filter_text(text):
    """Strip boilerplate from a document's text and keep it only if BERT labels it 1.

    Steps:
      1. Remove all newlines and spaces.
      2. Delete news boilerplate, then report boilerplate, via regex.
      3. Classify the cleaned text (truncated to 512 tokens) with the
         module-level ``tokenizer`` / ``model``; return the cleaned text when
         the arg-max label is 1, otherwise an empty string.

    Args:
        text: Raw text of one document.

    Returns:
        The cleaned text, or ``""`` when nothing is left after cleaning or the
        classifier does not predict label 1.
    """
    text = text.replace('\n', '').replace(' ', '')
    text = _NEWS_BOILERPLATE.sub('', text)
    text = _REPORT_BOILERPLATE.sub('', text)

    # Nothing left to classify: the result is "" regardless of the predicted
    # label, so skip the model call entirely.
    if not text:
        return ""

    # Use BERT for further filtering.
    # NOTE(review): only the first 512 tokens are seen by the classifier, and
    # the label is only meaningful if the loaded checkpoint has a trained
    # classification head -- confirm.
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():  # inference only; do not build the autograd graph
        logits = model(**inputs).logits
    predicted_label = torch.argmax(logits).item()

    return text if predicted_label == 1 else ""

    

In [None]:
# Clean every document exactly once. filter_text runs a BERT forward pass, so
# calling it both in the comprehension's condition and in its value would
# double the work. Entries whose content is None are skipped, and documents
# whose filtered text is empty are dropped.
filtered_pairs = (
    (pdf_id, filter_text(content))
    for pdf_id, content in pdf_text.items()
    if content is not None
)
pdf_text = {pdf_id: cleaned for pdf_id, cleaned in filtered_pairs if cleaned}

# Concatenate all remaining documents into one corpus string. The
# encode/decode round trip with errors='replace' substitutes any characters
# that cannot be encoded as UTF-8 so the write below cannot fail on them.
pdf_all = ''.join(pdf_text.values()).encode('utf-8', 'replace').decode('utf-8')
with open('AZ.txt', 'w', encoding='utf-8') as file:
    file.write(pdf_all)

In [None]:
from langchain_community.document_loaders import TextLoader
# Reload the cleaned corpus written to AZ.txt as LangChain documents.
loader = TextLoader("AZ.txt", encoding="utf-8")
documents = loader.load()

# Split the text into chunks of up to 245 characters with a 128-character
# overlap, breaking only at Chinese sentence-ending punctuation and keeping
# the punctuation mark at the end of the chunk it closes.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["。", "！", "？"],
    keep_separator='end',
    chunk_size=245,
    chunk_overlap=128,
)
docs = text_splitter.split_documents(documents)


In [None]:
# Number of chunks produced by the splitter (shown as the cell output).
len(docs)

### 文本块向量化（比赛限定使用bge-large-zh-v1.5模型）

In [None]:
# Embedding model loaded from the local EMB_MODEL directory, with a progress
# bar while encoding.
embeddings = HuggingFaceEmbeddings(show_progress=True, model_name=EMB_MODEL)

# Embed every chunk and build a FAISS index over the resulting vectors.
vectordb = FAISS.from_documents(docs, embeddings)

# Persist the index to PERSIST_DIR.
vectordb.save_local(PERSIST_DIR)

### 混合检索器

#### bm25 
- k1：k1 值越高，词频对评分的影响越大。
- b：当 b=1 时，文档长度的影响最大；当 b=0 时，文档长度不影响评分。
- langchain 的 BM25Retriever 默认使用 split() 按空白切分（适用于英文），中文需要使用 jieba 分词。

In [None]:
import jieba
# Sparse retriever: BM25 over the same chunks, tokenised with jieba, returning
# the top 5 hits.
bm25_retriever = BM25Retriever.from_documents(
    docs,
    preprocess_func=jieba.lcut,
    bm25_params={"k1": 1.5, "b": 0.75},
    k=5,
)

# Dense retriever: top-5 nearest neighbours from the FAISS index.
dense_retriever = vectordb.as_retriever(search_kwargs={"k": 5})

# Hybrid retriever: combine both result lists, weighting BM25 at 0.4 and the
# dense retriever at 0.6.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, dense_retriever],
    weights=[0.4, 0.6],
)

### 文本召回和重排

In [None]:
# AutoModelForQuestionAnswering is used below but was never imported anywhere
# in this notebook, which raised NameError. Import the Auto* classes actually
# used; the Bert* names are kept so existing references keep working.
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    BertForQuestionAnswering,
    BertTokenizer,
)

# Load the tokenizer and extractive question-answering model from the local
# ./bert-base-chinese directory.
# NOTE(review): presumably this is the stock pretrained checkpoint; if so it
# carries no fine-tuned QA head and the predicted spans are not meaningful --
# confirm.
qa_tokenizer = AutoTokenizer.from_pretrained("./bert-base-chinese")
qa_model = AutoModelForQuestionAnswering.from_pretrained("./bert-base-chinese")


def rerank_with_bert(questions, retriever, top_n=1):
    """Answer each question by extracting a span from its retrieved passages.

    For every question the retriever is queried, the returned passages are
    joined with spaces, and the module-level ``qa_tokenizer`` / ``qa_model``
    predict a start and end token over the (question, passages) pair truncated
    to 512 tokens. The tokens in that span are decoded to a string and cut to
    245 characters.

    Args:
        questions: Iterable of question strings.
        retriever: Object exposing ``invoke(query)`` (LangChain retrievers) or,
            failing that, ``retrieve(query)``; must return documents with a
            ``page_content`` attribute.
        top_n: Currently unused; kept for interface compatibility.

    Returns:
        A list with one answer string per question. An answer is empty when
        the predicted end position precedes the predicted start position.
    """
    # LangChain retrievers expose `invoke`, not `retrieve`; calling `retrieve`
    # on them raised AttributeError. Fall back to `retrieve` only for custom
    # retriever objects that provide it.
    fetch_docs = getattr(retriever, "invoke", None) or retriever.retrieve

    rerank_answers = []
    for question in tqdm(questions):
        relevant_docs = fetch_docs(question)
        combined_text = " ".join(doc.page_content for doc in relevant_docs)

        inputs = qa_tokenizer(question, combined_text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():  # inference only; do not build the autograd graph
            outputs = qa_model(**inputs)
        answer_start = int(torch.argmax(outputs.start_logits))
        answer_end = int(torch.argmax(outputs.end_logits)) + 1  # slice end is exclusive

        span_ids = inputs['input_ids'][0][answer_start:answer_end]
        answer = qa_tokenizer.convert_tokens_to_string(qa_tokenizer.convert_ids_to_tokens(span_ids))

        rerank_answers.append(answer[:245])
    return rerank_answers

# Run every question from the query table through the hybrid retriever and the
# BERT span extractor, then print the first answer as a quick check.
questions = list(query['question'].values)
rerank_answers = rerank_with_bert(questions, ensemble_retriever)
print(rerank_answers[0])

### 提交

### Embedding

In [None]:
def emb(answers, emb_batch_size=4):
    """Encode answers with the EMB_MODEL sentence-transformer in small batches.

    Args:
        answers: Sequence of answer strings to embed.
        emb_batch_size: Number of answers passed to ``encode`` per call.

    Returns:
        The per-batch outputs of ``encode`` (called with
        ``normalize_embeddings=True``) concatenated along axis 0.
    """
    encoder = SentenceTransformer(EMB_MODEL, trust_remote_code=True)

    batch_starts = range(0, len(answers), emb_batch_size)
    encoded_batches = [
        encoder.encode(answers[start:start + emb_batch_size], normalize_embeddings=True)
        for start in tqdm(batch_starts, desc="embedding sentences")
    ]
    all_sentence_embeddings = np.concatenate(encoded_batches, axis=0)

    print('emb_model max_seq_length: ', encoder.max_seq_length)
    print('emb_model embeddings_shape: ', all_sentence_embeddings.shape[-1])

    # Drop the model and release cached memory before returning.
    del encoder
    gc.collect()
    torch.cuda.empty_cache()
    return all_sentence_embeddings

# Embed all extracted answers with emb() using its default batch size.
all_sentence_embeddings = emb(rerank_answers)


In [None]:
# Fill the submission table: one answer string and one comma-separated
# embedding string per row.
sub['answer'] = rerank_answers
# Iterate over the embedding rows directly instead of indexing by range(len()).
sub['embedding'] = [','.join(map(str, vector)) for vector in all_sentence_embeddings]
# index=False is the documented way to omit the row index from the CSV.
sub.to_csv('submit-Bert.csv', index=False)
sub.head()