In [None]:
import argparse
import getpass
import json
import os
import re
import shutil

from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from tqdm import tqdm
import pdfplumber  # extracts text from PDF files
from rank_bm25 import BM25Okapi  # BM25 ranking used for document retrieval
from ckiptagger import data_utils, construct_dictionary, WS, POS, NER
# Load the CKIP word-segmentation (WS) and part-of-speech (POS) models from ./data.
# Both are module-level globals that BM25_retrieve uses to tokenise and tag queries.
ws = WS("./data")
pos = POS("./data")


  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)
2024-11-09 08:45:05.127822: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2024-11-09 08:45:05.127851: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 36.00 GB
2024-11-09 08:45:05.127856: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 13.50 GB
2024-11-09 08:45:05.127884: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 08:45:05.127897: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-11-09 08:45:05.605204: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could 

In [2]:
# POS tags (CKIP tagset) a *query* token must carry to be kept by BM25_retrieve.
quse_tags = [
    # nouns: common, proper, place
    "Na", "Nb", "Nc",
    # quantitative determiner, measure word
    "Neqa", "Nf",
    # transitive verbs: action, stative, ditransitive
    "VC", "VJ", "VD",
    # adverb, demonstrative determiner, time noun
    "D", "Nep", "Nd",
    # intransitive action verb, verb taking a verbal object, foreign word
    "VA", "VF", "FW",
    # numeral determiner, non-predicative adjective
    "Neu", "A",
]

In [3]:
def load_data(source_path):
    """Read every ``*.pdf`` file in ``source_path`` into a dictionary.

    Args:
        source_path: Directory containing PDF files whose base names are integers
            (e.g. ``12.pdf``).

    Returns:
        dict mapping ``int(<file name without '.pdf'>)`` to the text returned by
        ``read_pdf`` for that file.
    """
    corpus_dict = {}
    pdf_names = [name for name in os.listdir(source_path) if name.endswith('.pdf')]
    for name in tqdm(pdf_names):
        doc_id = int(name.replace('.pdf', ''))
        corpus_dict[doc_id] = read_pdf(os.path.join(source_path, name))
    return corpus_dict

In [4]:
# Substrings removed from every page, applied in exactly this order. Order matters:
# removing one token can join its neighbours into a later token such as '民國'.
_PDF_NOISE_TOKENS = (
    '\n', '\r', ' ', ',', '-', '%', '~', '.', '民國',
    '一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '零',
    '年', '月', '日', '至', '、', '$', '～', '+', '&',
)


def _clean_page_text(text):
    """Strip noise from one page of text and move its English words to the end."""
    # Collect English words first; the letters are deleted from the body below.
    english_words = ' '.join(re.findall(r'[A-Za-z]+', text))
    for token in _PDF_NOISE_TOKENS:
        text = text.replace(token, '')
    text = re.sub(r'\d+', '', text)          # digits
    text = re.sub(r'\(.*?\)', '', text)      # half-width parenthesised spans
    text = re.sub(r'\（.*?\）', '', text)    # full-width parenthesised spans
    text = re.sub(r'[A-Za-z]', '', text)     # remaining Latin letters
    return text + english_words


def read_pdf(pdf_loc, page_infos: list = None):
    """Read one PDF file and return its cleaned text.

    Args:
        pdf_loc: Path of the PDF file to open.
        page_infos: Optional ``[start, stop]`` pair; when given, only
            ``pdf.pages[start:stop]`` are read, otherwise every page is read.

    Returns:
        The concatenation of the cleaned text of every page that yielded text.
        Pages for which ``extract_text()`` returns nothing are skipped.
    """
    # TODO: other loaders, or handling of tables/images in the PDF, can be plugged in here.
    # The context manager guarantees the file is closed even if extraction raises.
    with pdfplumber.open(pdf_loc) as pdf:
        pages = pdf.pages[page_infos[0]:page_infos[1]] if page_infos else pdf.pages
        page_texts = []
        for page in pages:
            text = page.extract_text()
            if text:
                page_texts.append(_clean_page_text(text))
    return ''.join(page_texts)


In [5]:
def BM25_retrieve(qs, source, cut_dict):
    """Return the key of the candidate document that best matches a query.

    The query is segmented with ``ws``, tagged with ``pos``, reduced to tokens
    whose tag is in ``quse_tags``, expanded with adjacent-token bigrams, and
    scored with BM25 against the candidate documents only.

    Args:
        qs: Query string.
        source: Iterable of candidate document ids; ``str(id)`` must be a key
            of ``cut_dict``.
        cut_dict: Mapping from document key (str) to that document's token list.

    Returns:
        ``str(id)`` of the best-scoring candidate in ``source``.

    Raises:
        ValueError: If ``source`` is empty.
        KeyError: If a candidate id is missing from ``cut_dict``.

    Side effects:
        Prints the expanded query tokens.
    """
    source = list(source)
    if not source:
        raise ValueError("source must contain at least one candidate document")

    filtered_cut = [cut_dict[str(file)] for file in source]
    # TODO: another retrieval method can be swapped in here to improve accuracy.
    bm25 = BM25Okapi(filtered_cut)

    # Segment the query and keep only tokens carrying a wanted POS tag.
    segmented = ws([qs])
    tags = pos(segmented)[0]
    words = segmented[0]
    kept = [word for word, tag in zip(words, tags) if tag in quse_tags]

    # Drop pure-punctuation tokens, then trim stray punctuation around the rest.
    punctuation = [",", "？", "，", "。", " "]
    kept = [word for word in kept if word not in punctuation]
    kept = [word.strip(" ").strip("，").strip("？") for word in kept]

    # Unigrams interleaved with the bigram formed with the following token.
    bigramed_query = []
    for idx, word in enumerate(kept):
        bigramed_query.append(word)
        if idx < len(kept) - 1:  # the last token has no successor
            bigramed_query.append(word + kept[idx + 1])
    print(bigramed_query)

    # n is tunable; only the top hit is used here.
    ans = bm25.get_top_n(bigramed_query, list(filtered_cut), n=1)
    best = ans[0]

    # Map the winning token list back to its id by position inside `source`.
    # Searching all of cut_dict by value could return a document that is not a
    # candidate whenever two documents share an identical token list.
    position = next((k for k, doc in enumerate(filtered_cut) if doc is best), None)
    if position is None:
        position = filtered_cut.index(best)
    return str(source[position])

In [6]:
import sys

# Simulate command-line arguments so the argparse-based retrieval cell can run inside the notebook.
# NOTE(review): these are absolute paths on the author's machine; adjust before running elsewhere.
sys.argv = [
    "notebook",
    "--question_path", "/Users/yhk/Desktop/大三上/DataMining/FP/競賽資料集/dataset/preliminary/questions_example.json",
    "--source_path", "/Users/yhk/Desktop/大三上/DataMining/FP/競賽資料集/reference",
    "--output_path", "/Users/yhk/Desktop/大三上/DataMining/FP/競賽資料集/dataset/preliminary/pred_retrieve.json",
]

In [7]:
# POS tags (CKIP tagset) a *document* token must carry to be kept by the cleaning functions.
use_tags = [
    # nouns: common, proper, place
    "Na", "Nb", "Nc",
    # quantitative determiner, measure word
    "Neqa", "Nf",
    # transitive verbs: action, stative, ditransitive
    "VC", "VJ", "VD",
    # adverb, demonstrative determiner, time noun
    "D", "Nep", "Nd",
    # intransitive action verb, verb taking a verbal object, foreign word
    "VA", "VF", "FW",
]

In [8]:
# Maps Chinese numerals to Arabic digit characters, in numeric order 0-9.
chinese_to_arabic = dict(zip("零一二三四五六七八九", "0123456789"))
# Look-alike glyphs that show up in place of the numerals for one and zero.
chinese_to_arabic.update({"ㄧ": "1", "—": "1", "○": "0"})

# 清出關鍵字
使用特定Tag，同時處理年份以及季度

In [9]:
def cleaninsurance_data(cut_insurance_data):
    """Keep the keyword tokens of each segmented insurance document.

    Args:
        cut_insurance_data: Mapping from document key to a dict with parallel
            lists ``'ws'`` (tokens) and ``'pos'`` (their POS tags).

    Returns:
        dict mapping each document key to the list of tokens whose tag is in
        ``use_tags``, with spaces, commas, periods and half-width parentheses
        removed. Tokens that become empty after cleaning are dropped. The input
        is not modified.
    """
    dt = dict()
    for i, v in cut_insurance_data.items():
        dt[i] = list()
        for word, tag in zip(v['ws'], v['pos']):
            if tag not in use_tags:
                continue
            # str.replace returns a new string, so the result must be kept;
            # the previous code discarded it and appended the uncleaned token.
            word = word.replace(" ", "").replace(",", "").replace(".", "").replace("(", "").replace(")", "")
            if word:
                dt[i].append(word)
    return dt

In [None]:
def cleanfinance_data(cut_finance_data):
    """Keep keyword tokens of each segmented finance document, normalising dates.

    Tokens are kept only if their POS tag is in ``use_tags`` and they contain no
    currency/table noise marker; purely numeric tokens are dropped. Time tokens
    (tag ``Nd``) are normalised: Chinese numerals become Arabic digits, year
    tokens are rewritten as ``"<year + 1911>年"`` (Republic-of-China calendar
    year to Gregorian year), and month tokens are replaced by the two tokens
    ``"第<quarter>"`` and ``"季"``.

    Args:
        cut_finance_data: Mapping from document key to a dict with parallel
            lists ``'ws'`` (tokens) and ``'pos'`` (their POS tags).

    Returns:
        dict mapping each document key to its list of cleaned tokens. The input
        lists are not modified.
    """
    # A token containing any of these markers is dropped outright.
    noise_markers = ("$", "|", "/", "~", ",", "新台幣", "%", "百分之", "[", "]")

    dt = dict()
    for i, v in cut_finance_data.items():
        dt[i] = list()
        tags = v['pos']
        # Work on a copy: later tokens look back at the *normalised* form of
        # earlier ones, but the caller's data should stay untouched.
        words = list(v['ws'])
        for j in range(len(words)):
            tag = tags[j]
            if tag not in use_tags or any(marker in words[j] for marker in noise_markers):
                continue

            words[j] = words[j].strip(" ")
            temp = words[j].replace(",", "").replace("(", "").replace(")", "").replace("-", "").replace(" ", "")
            if temp.isdigit():
                continue  # bare numbers carry no keyword information

            if tag == 'Nd':
                # Drop positional numerals, then map Chinese digits to Arabic ones.
                words[j] = words[j].replace("千", "").replace("百", "").replace("十", "").replace(" ", "")
                for chinese_num, arabic_num in chinese_to_arabic.items():
                    words[j] = words[j].replace(chinese_num, arabic_num)

            token = words[j]
            if not token:
                # Normalisation can empty a token (e.g. a lone "十"); indexing it
                # below would raise IndexError, and "" is useless as a keyword.
                continue

            if tag == 'Nd' and token[-1] == "年" and token[-4:-1].isdigit() and int(token[-4:-1]) > 50:
                # "<n>年"
                dt[i].append(str(int(token[-4:-1]) + 1911) + "年")
            elif tag == 'Nd' and len(token) >= 2 and token[-2] == "年" and token[-5:-2].isdigit() and int(token[-5:-2]) > 50:
                # "<n>年度"
                dt[i].append(str(int(token[-5:-2]) + 1911) + "年")
            elif tag == 'Nd' and token.isdigit():
                dt[i].append(str(int(token) + 1911) + "年")
            elif tag == 'Nf' and token[0] == "年":
                # Year digits were split into separate tokens: walk back over up
                # to six preceding tokens, treating each as one decimal place.
                time = 0
                cal = 0
                while time <= 5 and cal < 120:
                    # Stop at the start of the list. The old check (j - time < 0)
                    # let index -1 wrap around to the *last* token.
                    if j - time - 1 < 0:
                        break
                    previous = words[j - time - 1]
                    if previous in chinese_to_arabic:
                        cal += int(chinese_to_arabic[previous]) * (10 ** time)
                    time += 1
                dt[i].append(str(cal + 1911) + "年")
            elif tag == 'Nd' and token[-1] == "年" and len(token) < 5:
                # Only the last digit of the year is available; 110 is added as
                # the assumed base. NOTE(review): presumably all such documents
                # fall in years 110-119 - confirm against the data.
                if len(token) > 1 and token[-2].isdigit():
                    dt[i].append(str(110 + int(token[-2]) + 1911) + "年")
                elif len(token) > 1:
                    digit = chinese_to_arabic.get(token[-2])
                    if digit is None:
                        continue  # character before "年" is not a numeral
                    dt[i].append(str(110 + int(digit) + 1911) + "年")
                elif j - 1 > 0 and words[j - 1][-1:].isdigit():
                    # Lone "年": borrow the digit from the previous token unless
                    # that digit belongs to an ordinal ("第...").
                    previous = words[j - 1]
                    if len(previous) > 1 and previous[-2] == "第":
                        continue
                    if "第" in words[j - 2]:
                        continue
                    # NOTE(review): the original also had a branch for a Chinese
                    # numeral here, but it was unreachable because this path
                    # already requires an Arabic digit; it has been removed.
                    dt[i].append(str(110 + int(previous[-1]) + 1911) + "年")
            elif tag == 'Nd' and token[-1] == "月" and len(token) >= 3 and token[-3:-1].isdigit() and not token[-3].isspace():
                # Two-digit month. 10-12 map to Q4 as before; zero-padded months
                # such as "03月" now map to their real quarter instead of Q4.
                month = int(token[-3:-1])
                quarter = (month - 1) // 3 + 1 if 1 <= month <= 12 else 4
                dt[i].extend(["第" + str(quarter), "季"])
            elif tag == 'Nd' and token[-1] == "月" and token[-2:-1].isdigit():
                # Single-digit month.
                dt[i].extend(["第" + str(int((int(token[-2:-1]) - 1) / 3) + 1), "季"])
            else:
                dt[i].append(token)
    return dt
        


In [11]:
def bigram(corpus):
    """Expand each document's tokens with adjacent-token bigrams.

    For tokens ``[a, b, c]`` the result is ``[a, ab, b, bc, c]``.

    Args:
        corpus: Mapping from document key to a list of string tokens. Each list
            is updated in place so that its tokens have surrounding spaces removed.

    Returns:
        dict mapping each document key to its unigram-plus-bigram token list.
    """
    bigram_dict = {}
    for doc_id, tokens in corpus.items():
        # Same in-place effect as stripping element by element.
        tokens[:] = [tok.strip(" ") for tok in tokens]
        grams = []
        for idx, tok in enumerate(tokens):
            grams.append(tok)
            if idx + 1 < len(tokens):
                grams.append(tok + tokens[idx + 1])
        bigram_dict[doc_id] = grams
    return bigram_dict

In [16]:
# Load the FAQ reference file and re-key it by integer id.
# NOTE: convert_faqdict_to_documents is defined in a later cell of this notebook;
# that cell must be executed before this one.
with open('/Users/yhk/Desktop/大三上/DataMining/FP/競賽資料集/reference/faq/pid_map_content.json', 'rb') as f_s:
    faq_data = json.load(f_s)
    key_to_source_dict = {int(key): value for key, value in faq_data.items()}
faq_documents = convert_faqdict_to_documents(key_to_source_dict, 'faq')
# Split the FAQ documents into chunks (chunk_size=1200, chunk_overlap=100) for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=100) 
splits = text_splitter.split_documents(faq_documents)

Embedding 初次


In [17]:
# Build the FAQ vector store from scratch.
# The API key is taken from the environment, or prompted for without echoing;
# it must never be written into the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

CHROMA_DIR = "/Users/yhk/Desktop/大三上/DataMining/FP/競賽資料集/reference/chroma"

# Remove any previously persisted index so stale vectors are not mixed with new ones.
if os.path.exists(CHROMA_DIR):
    shutil.rmtree(CHROMA_DIR)

embeddings = OpenAIEmbeddings(model='text-embedding-3-large')
# Vector space that FAQ queries are compared against.
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=CHROMA_DIR)

Load the persisted Chroma index from local disk (use this instead of re-embedding)

In [109]:
# Load the Chroma index persisted on local disk.
# The API key is taken from the environment, or prompted for without echoing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Must be the same embedding model the index was built with; a different model
# produces query vectors that do not match the stored ones.
embeddings = OpenAIEmbeddings(model='text-embedding-3-large')
vectorstore = Chroma(persist_directory="/Users/yhk/Desktop/大三上/DataMining/FP/競賽資料集/reference/chroma", embedding_function=embeddings)

In [14]:
# Retrieval with Chroma
def transform_qs_ref_to_filter(source):
    """Build a Chroma metadata filter restricting a search to candidate FAQ keys.

    Args:
        source: Iterable of candidate document keys.

    Returns:
        A filter dict matching documents whose ``category`` is ``'faq'`` and whose
        ``key`` is one of ``source``.

    Raises:
        ValueError: If ``source`` is empty.
    """
    or_conditions = [{"key": key} for key in source]
    if not or_conditions:
        raise ValueError("source must contain at least one candidate key")

    # Chroma rejects "$or" with fewer than two expressions, so a single
    # candidate is passed as a plain equality condition instead.
    if len(or_conditions) == 1:
        key_condition = or_conditions[0]
    else:
        key_condition = {"$or": or_conditions}

    filter_criteria = {
        "$and": [
            key_condition,
            {"category": 'faq'}
        ]
    }

    return filter_criteria

def retrieve_answers_faq(qs, sources):
    """Return the key of the FAQ entry most similar to a query.

    Args:
        qs: Query string.
        sources: Candidate FAQ keys the search is restricted to.

    Returns:
        The ``"key"`` metadata value of the single best match from the
        module-level ``vectorstore``, or ``-1`` when the search returns no
        document or the document carries no ``"key"``.
    """
    filter_criteria = transform_qs_ref_to_filter(sources)
    docs = vectorstore.similarity_search(qs, k=1, filter=filter_criteria)
    try:
        retrieved = docs[0].metadata["key"]
    except (IndexError, KeyError):
        # Only "no hit" is treated as a miss; any other error should surface.
        retrieved = -1

    return retrieved

In [15]:
def convert_faqdict_to_documents(data_dict, category):
    """Turn a FAQ mapping into one ``Document`` per question.

    Args:
        data_dict: Mapping from key to a list of dicts, each having a
            ``'question'`` entry.
        category: Value stored under ``"category"`` in every document's metadata.

    Returns:
        List of ``Document`` objects whose ``page_content`` is the question text
        only and whose metadata is ``{"key": <key>, "category": category}``,
        in the iteration order of ``data_dict`` and of each question list.
    """
    return [
        Document(
            page_content=entry['question'],
            metadata={"key": faq_key, "category": category},
        )
        for faq_key, entries in data_dict.items()
        for entry in entries
    ]

In [18]:
if __name__ == "__main__":
    # Declare and parse the three required path arguments.
    parser = argparse.ArgumentParser(description='Process some paths and files.')
    for flag, help_text in (
        ('--question_path', '讀取發布題目路徑'),            # questions file
        ('--source_path', '讀取參考資料路徑'),              # reference-data directory
        ('--output_path', '輸出符合參賽格式的答案路徑'),    # where predictions are written
    ):
        parser.add_argument(flag, type=str, required=True, help=help_text)
    args = parser.parse_args()

    # Container for the predictions, in the submission format.
    answer_dict = {"answers": []}

    # Questions to answer.
    with open(args.question_path, 'rb') as f:
        qs_ref = json.load(f)

    # Pre-segmented finance corpus -> cleaned tokens -> unigram + bigram tokens.
    finance_json = os.path.join(args.source_path, 'Final_correct_finance_CKIP.json')
    with open(finance_json, 'rb') as f_s:
        cut_finance_data = json.load(f_s)
    cleaned_finance = cleanfinance_data(cut_finance_data)
    bigram_finance = bigram(cleaned_finance)

    # Same pipeline for the insurance corpus.
    insurance_json = os.path.join(args.source_path, 'Final_correct_insurance_CKIP.json')
    with open(insurance_json, 'rb') as f_s:
        cut_insurance_data = json.load(f_s)
    cleaned_insurance = cleaninsurance_data(cut_insurance_data)
    bigram_insurance = bigram(cleaned_insurance)

    count = 0
    for q_dict in qs_ref['questions']:
        category = q_dict['category']
        # finance / insurance use BM25 over the token corpora; faq uses the vector store.
        if category == 'finance':
            retrieved = BM25_retrieve(q_dict['query'], q_dict['source'], bigram_finance)
        elif category == 'insurance':
            retrieved = BM25_retrieve(q_dict['query'], q_dict['source'], bigram_insurance)
        elif category == 'faq':
            retrieved = retrieve_answers_faq(q_dict['query'], q_dict['source'])
        else:
            # Unknown category: abort rather than write a partial answer file.
            raise ValueError("Something went wrong")

        answer_dict['answers'].append({"qid": q_dict['qid'], "retrieve": retrieved, "category": category})
        count += 1
        print(count)

    # Save the predictions as JSON, keeping non-ASCII characters readable.
    with open(args.output_path, 'w', encoding='utf8') as f:
        json.dump(answer_dict, f, ensure_ascii=False, indent=4)


2024-11-09 08:45:54.832454: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 08:45:55.288355: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


['匯款', '匯款銀行', '銀行', '銀行中間行', '中間行', '中間行所', '所', '所收取', '收取', '收取費用', '費用', '費用負擔', '負擔']
1
['公司', '公司應', '應', '應停止', '停止', '停止日前', '日前', '日前多少', '多少', '多少天', '天', '天書面', '書面', '書面通知', '通知', '通知要保人', '要保人']
2
['完全', '完全保險金', '保險金', '保險金受益人', '受益人', '受益人是否', '是否', '是否可以', '可以', '可以變更', '變更']
3
['契約', '契約內容', '內容', '內容變更', '變更', '變更應', '應', '應批註', '批註']
4
['基本', '基本保額', '保額', '保額什麼', '什麼']
5
['要保人', '要保人受益人', '受益人', '受益人應', '應', '應公司', '公司', '公司應', '應', '應負', '負', '負保險', '保險', '保險責任', '責任', '責任事故', '事故', '事故幾', '幾', '幾日', '日', '日通知', '通知', '通知公司', '公司']
6
['保險金', '保險金會', '會', '會契約', '契約', '契約效力', '效力', '效力如何', '如何']
7
['要保', '要保人', '人', '人訂立', '訂立', '訂立契約', '契約', '契約是否', '是否', '是否應', '應', '應公司', '公司', '公司要保書', '要保書', '要保書書面', '書面', '書面事項', '事項', '事項據實', '據實']
8
['受益人', '受益人被保險人', '被保險人', '被保險人未', '未', '未者', '者', '者會', '會', '會喪失', '喪失', '喪失什麼', '什麼', '什麼權利', '權利']
9
['要', '要保', '保', '保人', '人', '人展期', '展期', '展期定期', '定期', '定期是否', '是否', '是否仍', '仍', '仍適用', '適用', '適用回饋', '回饋', '回饋分享金', '分享金']

In [None]:
# Display the predictions collected by the retrieval cell (large output).
answer_dict

{'answers': [{'qid': 1, 'retrieve': '392', 'category': 'insurance'},
  {'qid': 2, 'retrieve': '428', 'category': 'insurance'},
  {'qid': 3, 'retrieve': '83', 'category': 'insurance'},
  {'qid': 4, 'retrieve': '186', 'category': 'insurance'},
  {'qid': 5, 'retrieve': '162', 'category': 'insurance'},
  {'qid': 6, 'retrieve': '116', 'category': 'insurance'},
  {'qid': 7, 'retrieve': '439', 'category': 'insurance'},
  {'qid': 8, 'retrieve': '78', 'category': 'insurance'},
  {'qid': 9, 'retrieve': '62', 'category': 'insurance'},
  {'qid': 10, 'retrieve': '32', 'category': 'insurance'},
  {'qid': 11, 'retrieve': '258', 'category': 'insurance'},
  {'qid': 12, 'retrieve': '66', 'category': 'insurance'},
  {'qid': 13, 'retrieve': '526', 'category': 'insurance'},
  {'qid': 14, 'retrieve': '526', 'category': 'insurance'},
  {'qid': 15, 'retrieve': '536', 'category': 'insurance'},
  {'qid': 16, 'retrieve': '54', 'category': 'insurance'},
  {'qid': 17, 'retrieve': '606', 'category': 'insurance'},
 

In [None]:
# Scratch check left over from debugging: shows the type of an empty list.
type([])

list

In [19]:
# Simulate the command line of the evaluation step, then load ground truths and predictions.
# NOTE(review): this overwrites sys.argv, parser and args from the retrieval cell,
# and the paths are absolute paths on the author's machine.
sys.argv = [
    "notebook",
    "--true_answer_path", "/Users/yhk/Desktop/大三上/DataMining/FP/競賽資料集/dataset/preliminary/ground_truths_example.json",
    "--pred_answer_path", "/Users/yhk/Desktop/大三上/DataMining/FP/競賽資料集/dataset/preliminary/pred_retrieve.json"
]
parser = argparse.ArgumentParser(description='Process some paths and files.')
parser.add_argument('--true_answer_path', type=str, required=True, help='真實答案的答案路徑')  # path of the ground-truth answers
parser.add_argument('--pred_answer_path', type=str, required=True, help='輸出符合參賽格式的答案路徑')  # path of the predicted answers
args = parser.parse_args()
# Per-category counters: wrong category, wrong retrieval, and number of questions.
caterror = {"faq":0, "insurance" :0, "finance" :0}
retrieveerror = {"faq":0, "insurance" :0, "finance" :0}
catelen = {"faq":0, "insurance" :0, "finance" :0}
with open(args.true_answer_path, 'rb') as f:
    true_ans_ref = json.load(f) 
with open(args.pred_answer_path, 'rb') as f:
    pred_ans_ref = json.load(f) 




In [20]:
# Compare predictions with ground truths and print per-category error rates.

# Unwrap the loaded payloads only if they are still wrapped, so that re-running
# this cell does not fail by indexing a list with a string.
if isinstance(true_ans_ref, dict):
    true_ans_ref = true_ans_ref["ground_truths"]
if isinstance(pred_ans_ref, dict):
    pred_ans_ref = pred_ans_ref["answers"]

# Entries are paired by position, so every ground truth needs a prediction.
if len(pred_ans_ref) < len(true_ans_ref):
    raise ValueError(
        f"expected at least {len(true_ans_ref)} predictions, got {len(pred_ans_ref)}"
    )

# Start from zero on every run so counts do not accumulate across re-runs.
caterror = {"faq": 0, "insurance": 0, "finance": 0}
retrieveerror = {"faq": 0, "insurance": 0, "finance": 0}
catelen = {"faq": 0, "insurance": 0, "finance": 0}

for i in range(len(true_ans_ref)):
    truth = true_ans_ref[i]
    pred = pred_ans_ref[i]
    catelen[truth['category']] += 1
    if truth['category'] != pred['category']:
        # A wrong category also counts as a wrong retrieval.
        caterror[truth['category']] += 1
        retrieveerror[truth['category']] += 1
    elif truth['retrieve'] != int(pred['retrieve']):
        retrieveerror[truth['category']] += 1
        print(i)  # position of the mis-retrieved question

print("Error rate")
print("Category Error")
for cate in list(caterror.keys()):
    # A category without questions has no defined rate; report nan instead of dividing by zero.
    rate = caterror[cate] / catelen[cate] if catelen[cate] else float("nan")
    print(f"{cate}: {rate}")
print("Retrieve Error")
for cate in list(caterror.keys()):
    rate = retrieveerror[cate] / catelen[cate] if catelen[cate] else float("nan")
    print(f"{cate}: {rate}")

6
9
10
11
23
34
39
46
49
52
61
67
69
85
88
91
93
94
96
98
134
Error rate
Category Error
faq: 0.0
insurance: 0.0
finance: 0.0
Retrieve Error
faq: 0.02
insurance: 0.18
finance: 0.22


In [21]:
# Inspect the cleaned finance tokens of document '490' as a sanity check.
cleaned_finance['490']

['智邦',
 '科港',
 '子公司',
 '民國',
 '2023年',
 '第1',
 '季',
 '31日',
 '核閱',
 '準',
 '則',
 '單位',
 '元',
 '其他',
 '權',
 '保留',
 '盈餘',
 '國外',
 '機構',
 '財務',
 '報表',
 '換算',
 '代碼',
 '2022年',
 '第1',
 '季',
 '1日',
 '餘額',
 '普通股',
 '股本',
 '資本',
 '公積',
 '盈餘',
 '公積',
 '未',
 '分配',
 '盈餘',
 '兌換',
 '差額',
 '盈餘',
 '公積',
 '其他',
 '損益',
 '按',
 '價值',
 '金融',
 '資產',
 '未',
 '實現',
 '損益',
 '庫',
 '藏',
 '股票',
 '權益',
 'D',
 '2022年',
 '第1',
 '季',
 '1日',
 '第1',
 '季',
 '31日',
 '淨利',
 '2022年',
 '第1',
 '季',
 '1日',
 '第1',
 '季',
 '31日',
 '其他',
 '損益',
 '2022年',
 '第1',
 '季',
 '1日',
 '第1',
 '季',
 '31日',
 '損益',
 '總額',
 'N',
 '股份',
 '給付',
 '基礎',
 '交易',
 'Z',
 '2022年',
 '第1',
 '季',
 '31日',
 '餘額',
 'A',
 '2023年',
 '第1',
 '季',
 '1日',
 '餘額',
 'D',
 '2023年',
 '第1',
 '季',
 '1日',
 '第1',
 '季',
 '31日',
 '淨利',
 '2023年',
 '第1',
 '季',
 '1日',
 '第1',
 '季',
 '31日',
 '其他',
 '損益',
 '2023年',
 '第1',
 '季',
 '1日',
 '第1',
 '季',
 '31日',
 '損益',
 '總額',
 'ōN1',
 '股份',
 '給付',
 '基礎',
 '交易',
 'Q1',
 '處分',
 '其他',
 '損益',
 '按',
 '價值',
 '權益',
 '工具',
 '2023年',
 '第1',
 '季',
 

# 對照區
格式：\
[qid, BM25找到的, 正確答案]\
錯誤的文本\
正確的文本


In [22]:
# For every mis-retrieved insurance question, print the qid / predicted / expected ids,
# the query, and the token lists of the predicted and the expected document.
for idx, answer in enumerate(answer_dict['answers']):
    if answer['category'] != 'insurance':
        continue
    expected = true_ans_ref[idx]['retrieve']
    if int(answer['retrieve']) == expected:
        continue
    print(answer['qid'], answer['retrieve'], expected)
    print(qs_ref['questions'][idx]['query'])
    print(bigram_insurance[answer['retrieve']])
    print(bigram_insurance[str(expected)])

7 439 107
身故保險金的給付會使本契約效力如何變化? 
['南山', '南山人壽', '人壽', '人壽醫療', '醫療', '醫療終身', '終身', '終身保險', '保險', '保險_TPHI5', '_TPHI5', '_TPHI5不得', '不得', '不得申請', '申請', '申請要保人', '要保人', '要保人停止', '停止', '停止日', '日', '日個', '個', '個月', '月', '月提出', '提出', '提出項', '項', '項申請', '申請', '申請要保人', '要保人', '要保人清償', '清償', '清償保險費', '保險費', '保險費扣除', '扣除', '扣除期間', '期間', '期間保險費', '保險費', '保險費餘額', '餘額', '餘額翌日', '翌日', '翌日上午', '上午', '上午零時', '零時', '零時其', '其', '其效力', '效力', '效力要保人', '要保人', '要保人停止', '停止', '停止日', '日', '日個', '個', '個月', '月', '月提出', '提出', '提出項', '項', '項復效', '復效', '復效申請', '申請', '申請者', '者', '者公司', '公司', '公司得', '得', '得要保人', '要保人', '要保人復效', '復效', '復效申請', '申請', '申請送達', '送達', '送達公司', '公司', '公司日', '日', '日日', '日', '日要求', '要求', '要求要保人', '要保人', '要保人提供', '提供', '提供被保險人', '被保險人', '被保險人可', '可', '可保', '保', '保證明', '證明', '證明要保人', '要保人', '要保人未', '未', '未日', '日', '日交齊', '交齊', '交齊公司', '公司', '公司要求', '要求', '要求提供', '提供', '提供可', '可', '可保', '保', '保證明', '證明', '證明者', '者', '者公司', '公司', '公司得', '得', '得退回', '退回', '退回次', '次', '次復效', '復效', '復效被保險人', '被保險人', '

In [None]:
# Show the category names tracked by the error counters.
list(caterror.keys())

['faq', 'insurance', 'finance']

In [None]:
import pdfplumber

# Print the raw text of the first page of one finance PDF (newlines removed) for inspection.
pdf_text = ''
# The context manager closes the file even if text extraction raises.
with pdfplumber.open("/Users/yhk/Desktop/大三上/DataMining/FP/競賽資料集/reference/finance/628.pdf") as pdf:
    page = pdf.pages[0]
    text = page.extract_text()  # may be empty/None when the page has no extractable text
    if text:
        text = text.replace('\n', '')
        pdf_text += text
print(pdf_text)

九、重大或有負債及未認列之合約承諾(一)截至民國 111 年 3 月 31 日止，子公司和泰產險已簽約尚未支付之工程價款為$595,804。(二)截至民國 111 年 3 月 31 日止，本公司與關係人及非關係人所簽訂之重要契約彙總如下：十、重大之災害損失無。十一、重大之期後事項(一)子公司和潤企業於民國 111 年 5 月 5 日董事會決議通過下列事項：~82~代製約經業約長買和代契本理品銷務源賣泰理約商買契委汽契豐商公性契賣約託車約田契質司約契契股物約份料日式日業國國等國、司公有國運TCoo當本會本株瑞都經都國及司限瑞搬yrop豐社日式汽汽銷汽瑞長公汽股toar田野會車車商車汽源司車份aIt自自社(((車汽(有ni事股股股股do動動(車限un)))股)s車車公公公(公公t)股r株工司司司公司司i)e人s1月138、車情1月999除終自雙反續1月111418120130日年1)形01年年年經止9方契有93年1年1除年42年1日(71外日761雙契年同約效日契14日月年依5月月月4月月，月方約1意約。月約野11終111月11日月持1日日日簽外終定1起日日車止51日起1續日起起起署，日止事訖至至)日條至(有至書持起契項日11日起款111效1面續，約外1期35野(規12年年豐。3同有除或，年車定年13田意效經違持32月)5。在之型國權公授內輛委售及國權在在車之主台豐車瑞製司權，與託、販瑞製台台系特灣田輛汽造在經販零本物促汽造灣灣之種地車及車之台銷賣配公流管車之地地各車要區系零公車灣商本件司、理公各區區型輛販、配司輛地在公、代售等司式銷販工及賣日件同及區主司用為前事同車售賣業零內進野等意零銷要所品處、宜意輛之進用配口車。提配售販提等理售。提供用口、件或系供件之賣供。車後供本。之產。容國之其供用區之輛服其公豐業產各授本。域車販務授司田用
