In [24]:
import json
import os

import jieba
import numpy as np
import pdfplumber
import requests
import sklearn
import torch
import transformers
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

### Load data

In [11]:
# Load the competition questions. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open("questions.json", encoding="utf-8") as questions_file:
    questions = json.load(questions_file)
print(questions[0])

# The PDF handle is intentionally kept open: the following cell reads its pages.
pdf = pdfplumber.open("初赛训练数据集.pdf")
print("pages:", len(pdf.pages))

{'question': '“前排座椅通风”的相关内容在第几页？', 'answer': '', 'reference': ''}
pages: 354


In [12]:
# Extract the text of every page, labelled with its 1-based page number
# ('page_1', 'page_2', ...), which is the format written to 'reference' later.
pdf_content = []
for page_number, page in enumerate(pdf.pages, start=1):
    pdf_content.append({
        'page': 'page_' + str(page_number),
        # Fall back to an empty string when extract_text() yields nothing
        # (reportedly None for text-free pages on some pdfplumber versions —
        # TODO confirm), so jieba.lcut / model.encode never receive None.
        'content': page.extract_text() or ''
    })

In [18]:
# Display the first extracted page record as a quick sanity check of the extraction.
pdf_content[0]

{'page': 'page_1',
 'content': '欢迎\n感谢您选择了具有优良安全性、舒适性、动力性和经济性的Lynk&Co领克汽车。\n首次使用前请仔细、完整地阅读本手册内容，将有助于您更好地了解和使用车辆。\n本手册中的所有资料均为出版时的最新资料，但本公司将对产品进行不断的改进和优化，您所购的车辆可能与本手册中的描述有所不同，请以实际\n接收的车辆为准。\n如您有任何问题，或需要预约服务，请拨打电话4006-010101联系我们。您也可以开车前往Lynk&Co领克中心。\n在抵达之前，请您注意驾车安全。\n©领克汽车销售有限公司'}

### TF-IDF

In [25]:
# Segment each text with jieba and re-join the tokens with single spaces, so the
# vectorizer in the next cell receives whitespace-delimited strings.
question_words = []
for question_item in questions:
    question_words.append(' '.join(jieba.lcut(question_item['question'])))

pdf_content_words = []
for page_item in pdf_content:
    pdf_content_words.append(' '.join(jieba.lcut(page_item['content'])))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ZHONGY~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.582 seconds.
Prefix dict has been built successfully.


In [27]:
# Fit one shared vocabulary over questions and pages so both sides are projected
# into the same TF-IDF feature space.
# NOTE(review): TfidfVectorizer's default token_pattern appears to keep only
# tokens of two or more characters, which would drop single-character Chinese
# words — confirm this is intended.
tfidf = TfidfVectorizer()
tfidf.fit(question_words + pdf_content_words)

# Row-normalise the vectors; the retrieval cell scores pages with a plain
# matrix product of these rows.
question_feat = normalize(tfidf.transform(question_words))
pdf_content_feat = normalize(tfidf.transform(pdf_content_words))

In [28]:
# For every question pick the page with the highest TF-IDF similarity and store
# its 1-based label in 'reference'.
for q_idx in range(question_feat.shape[0]):
    # 1 x n_pages sparse row of similarities, flattened to a dense vector.
    sims = (question_feat[q_idx] @ pdf_content_feat.T).toarray()[0]
    best_page_idx = sims.argsort()[-1]  # last position of the ascending sort = best page

    questions[q_idx]['reference'] = 'page_' + str(best_page_idx + 1)

In [29]:
# with open('submit.json', 'w', encoding='utf8') as up:
#     json.dump(questions, up, ensure_ascii=False, indent=4)

### BM25

In [13]:
# Build the BM25 index over jieba token lists, one list per page.
# Note: this rebinds pdf_content_words to lists of tokens (the TF-IDF section
# stores space-joined strings under the same name).
pdf_content_words = []
for page_item in pdf_content:
    pdf_content_words.append(jieba.lcut(page_item['content']))
bm25 = BM25Okapi(pdf_content_words)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ZHONGY~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.578 seconds.
Prefix dict has been built successfully.


In [14]:

# For every question pick the page with the highest BM25 score and store its
# 1-based label in 'reference'.
for question_item in questions:
    query_tokens = jieba.lcut(question_item["question"])
    ascending_order = bm25.get_scores(query_tokens).argsort()
    best_page_number = ascending_order[-1] + 1  # 0-based index -> 1-based page
    question_item['reference'] = 'page_' + str(best_page_number)

In [32]:
# with open('submit.json', 'w', encoding='utf8') as up:
#     json.dump(questions, up, ensure_ascii=False, indent=4)

### BM25 + BGE

In [5]:
# Dense retrieval: embed questions and pages with BGE and pick, per question,
# the page whose embedding has the largest dot product with the question's.
model = SentenceTransformer('BAAI/bge-small-zh-v1.5')

question_sentences = []
for question_item in questions:
    question_sentences.append(question_item['question'])

pdf_content_sentences = []
for page_item in pdf_content:
    pdf_content_sentences.append(page_item['content'])

# normalize_embeddings=True is requested for both sides before scoring.
question_embeddings = model.encode(question_sentences, normalize_embeddings=True)
pdf_embeddings = model.encode(pdf_content_sentences, normalize_embeddings=True)

for question_item, question_vec in zip(questions, question_embeddings):
    similarities = question_vec @ pdf_embeddings.T
    best_page_idx = similarities.argsort()[-1]
    question_item['reference'] = pdf_content[best_page_idx]['page']

# Persist the predictions in the submission format.
with open('submit.json', 'w', encoding='utf8') as submit_file:
    json.dump(questions, submit_file, ensure_ascii=False, indent=4)

In [None]:

from scipy.stats import rankdata

# Rank fusion: convert the dense (BGE) and sparse (BM25) scores to ranks, add
# them, and keep the page with the largest combined rank.
for question_item, question_vec in zip(questions, question_embeddings):
    dense_ranks = rankdata(question_vec @ pdf_embeddings.T)
    sparse_ranks = rankdata(bm25.get_scores(jieba.lcut(question_item["question"])))

    fused = dense_ranks + sparse_ranks
    best_page_number = fused.argsort()[-1] + 1  # 0-based index -> 1-based page
    question_item['reference'] = 'page_' + str(best_page_number)

# Persist the predictions in the submission format.
with open('submit.json', 'w', encoding='utf8') as submit_file:
    json.dump(questions, submit_file, ensure_ascii=False, indent=4)

### Prompt

In [45]:
def ask_gemini(apikey, text, timeout=60):
    """Send a single-turn prompt to the Gemini ``gemini-pro`` generateContent endpoint.

    Args:
        apikey: API key, sent as the ``key`` query parameter.
        text: Prompt text; sent as the only part of the only content entry.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without it a stalled connection would
            block the calling loop indefinitely.

    Returns:
        The decoded JSON response on HTTP 200; ``None`` for any other status,
        after printing the status code and the error body.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent'
    headers = {'Content-Type': 'application/json'}
    data = {'contents': [{'parts': [{'text': text}]}]}

    # Passing the key through `params` lets requests URL-encode it instead of
    # formatting it into the URL string by hand.
    response = requests.post(
        url,
        params={'key': apikey},
        headers=headers,
        json=data,
        timeout=timeout,
    )
    if response.status_code == 200:
        return response.json()

    print(f"Error: {response.status_code}")
    try:
        print(response.json())
    except ValueError:
        # Error bodies are not guaranteed to be JSON (e.g. proxy/gateway pages).
        print(response.text)
    return None

In [16]:
# Load the BGE cross-encoder reranker together with its tokenizer.
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-base')
rerank_model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-base')
# Use the GPU when one is available and stay on the CPU otherwise; an
# unconditional .cuda() raises on machines without CUDA.
rerank_model.to('cuda' if torch.cuda.is_available() else 'cpu')
# Switch to inference mode. Kept as the last expression so the cell still
# displays the model summary.
rerank_model.eval()

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [55]:
# Read the Gemini API key from the environment. Credentials must never be
# hardcoded in a notebook: they leak through version control and sharing.
apikey = os.environ.get('GEMINI_API_KEY')
if not apikey:
    raise RuntimeError('Set the GEMINI_API_KEY environment variable before running this cell.')

TOP_K = 4          # number of BM25 candidates handed to the reranker
NUM_QUESTIONS = 1  # smoke-test size; use len(questions) for the full run
FALLBACK_ANSWER = '结合给定的资料，无法回答问题。'

# {0} = text of the retrieved page, {1} = the question.
# NOTE(review): the instruction "请无法回答问题" does not name an exact phrase
# to reply with, and the reply recorded for this cell
# ("此问题无法从提供的资料中回答。") does not contain "无法回答", so the
# substring check below can miss refusals — consider tightening the prompt.
PROMPT_TEMPLATE = (
    "你是一个汽车专家，帮我结合给定的资料，回答下面的问题。如果问题无法从资料中获得，或无法从资料中进行回答，请无法回答问题。如果问题可以从资料中获得，则请逐步回答。\n"
    "资料：{0}\n"
    "\n"
    "问题：{1}\n"
)

for query_idx in tqdm(range(NUM_QUESTIONS)):
    question = questions[query_idx]["question"]

    # Stage 1 — recall: keep the TOP_K pages with the highest BM25 scores.
    doc_scores = bm25.get_scores(jieba.lcut(question))
    max_score_page_idxs = doc_scores.argsort()[-TOP_K:]

    # Stage 2 — rerank: score each (question, page) pair with the cross-encoder.
    pairs = [[question, pdf_content[idx]['content']] for idx in max_score_page_idxs]
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    with torch.no_grad():
        # Follow whatever device the model lives on instead of assuming CUDA.
        inputs = {key: tensor.to(rerank_model.device) for key, tensor in inputs.items()}
        scores = rerank_model(**inputs, return_dict=True).logits.view(-1, ).float()
    max_score_page_idx = max_score_page_idxs[scores.cpu().numpy().argmax()]
    questions[query_idx]['reference'] = 'page_' + str(max_score_page_idx + 1)

    # Stage 3 — generate: ground the prompt in the page that was actually
    # retrieved for this question (previously a fixed page/question pair was
    # pasted into the prompt, so the retrieval result was ignored).
    prompt = PROMPT_TEMPLATE.format(pdf_content[max_score_page_idx]['content'], question)
    response = ask_gemini(apikey, prompt)
    try:
        answer = response['candidates'][0]['content']['parts'][0]['text']
    except (TypeError, KeyError, IndexError):
        # ask_gemini returns None on HTTP errors, and a response may carry no
        # usable candidate; keep the loop going with the fallback answer.
        print(f'No usable Gemini answer for question {query_idx}; using fallback.')
        answer = FALLBACK_ANSWER
    else:
        print(answer)

    # Collapse any refusal-style reply into the canonical fallback sentence.
    if '无法回答' in answer:
        answer = FALLBACK_ANSWER

    questions[query_idx]['answer'] = answer

100%|██████████| 1/1 [23:16:33<00:00, 83793.80s/it]

此问题无法从提供的资料中回答。



