In [1]:
from utils.read_file import *
from utils.model_tool import *
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils.query_handler import extract_time

In [2]:
# --- Evaluation inputs: example questions and their ground-truth answers ---
QUESTION_PATH = "競賽資料集/dataset/preliminary/questions_example.json"
ANS_PATH = "競賽資料集/dataset/preliminary/ground_truths_example.json"

# --- Reference corpora, one location per category ---
INSURANCE_PATH = "競賽資料集/reference/insurance"
FINANCE_PATH = "競賽資料集/reference/finance"
FAQ_PATH = "競賽資料集/reference/faq/pid_map_content.json"

In [3]:
# Parse the question file and keep only the entries stored under its "questions" key.
question = read_json(QUESTION_PATH)["questions"]

In [4]:
import re
import jieba.analyse


def find_most_correct(top10: list[str], query: str, all_predict, similarity: np.array):
    """Re-rank candidate chunks by keyword overlap with the query and return the winner's label.

    Up to 10 keywords are extracted from ``query`` with
    ``jieba.analyse.extract_tags``. Each candidate is scored as
    ``(number of keywords found in its text) * similarity``.

    Args:
        top10: Texts of the candidate chunks.
        query: Question text the keywords are extracted from.
        all_predict: Indexable collection aligned with ``top10``; the element at
            the winning position is returned.
        similarity: Per-candidate similarity values aligned with ``top10``.

    Returns:
        ``all_predict[best]`` for the highest-scoring candidate. Ties on the
        score are broken by the higher similarity, then by the earlier position.

    Raises:
        IndexError: If ``top10`` is empty (there is nothing to choose from).
    """
    if len(top10) == 0:
        raise IndexError("find_most_correct() needs at least one candidate chunk")

    # Purely numeric keywords are ignored, exactly as in the original scoring.
    keywords = [k for k in jieba.analyse.extract_tags(query, topK=10) if not k.isdigit()]

    # Keywords are matched as literal substrings. Feeding them to re.search would
    # interpret characters such as "+", ".", "(" as regex syntax, which either
    # raises re.error (e.g. "c++") or silently matches the wrong text.
    scores = [
        sum(1 for k in keywords if k in text) * similarity[i]
        for i, text in enumerate(top10)
    ]

    # When scores tie (typically all zero because no keyword matched), prefer the
    # candidate with the higher similarity instead of whichever index happens to
    # come last in a reversed argsort.
    best = max(range(len(scores)), key=lambda i: (scores[i], similarity[i]))
    return all_predict[best]
        



In [5]:



chinese_num = ['○', '一', '二', '三', '四', '五', '六', '七', '八', '九']


def get_years(time_data: list[int]):
    """Return every year string from ``time_data[0]`` followed by its Chinese-numeral form.

    Each entry of ``time_data[0]`` is read character by character up to the
    first '年'; every character before it is converted with ``int`` and mapped
    through ``chinese_num`` (e.g. "111年" -> "一一一年"). The result interleaves
    the original string and its converted form, in input order.

    NOTE(review): the ``list[int]`` annotation does not match how the argument
    is used (``time_data[0]`` is iterated and its items are indexed like
    strings) — confirm the real type against the caller.

    Raises:
        IndexError: If an entry contains no '年'.
        ValueError: If a character before '年' is not a digit.
    """

    def _as_chinese(year):
        # Walk the string by position so a missing '年' still surfaces as IndexError.
        converted = []
        position = 0
        while True:
            symbol = year[position]
            if symbol == '年':
                return "".join(converted) + '年'
            converted.append(chinese_num[int(symbol)])
            position += 1

    collected = []
    for original in time_data[0]:
        collected.extend((original, _as_chinese(original)))
    return collected

def get_valid_chunk(chunks: list[str], years: list[str]) -> list:
    """Keep the chunks that mention at least one of the given year strings.

    The entries of ``years`` are joined with "|" into a single regular
    expression without escaping, and a chunk is kept when ``re.search`` finds
    that expression anywhere in it. An empty ``years`` list produces an empty
    pattern, which matches every chunk. Input order is preserved.
    """
    year_regex = r"|".join(years)
    return [piece for piece in chunks if re.search(year_regex, piece)]

In [6]:
# List of 10000 empty strings handed to the PDF reader
# (presumably a per-file text cache that the reader fills in — TODO confirm).
already_read = [""] * 10000

# Read document 10 from the finance corpus as a sample for the cleaning step below;
# the reader is given a one-element id list and its first result is kept.
text = read_target_insurance_pdf(FINANCE_PATH, [10], already_read)[0]

def remove_useless(text):
    """Mask numbers with the digit '1' and delete every space character.

    A number is either a thousands-grouped integer ("1,234,567") or an
    optionally negative integer/decimal ("-12.5"), and must be delimited by
    regex word boundaries. Digits glued to other word characters (such as the
    "111" in "111年") therefore stay untouched.
    """
    number_regex = r"(\b\d{1,3}(,\d{3})+\b)|(-?\b\d+(\.\d+)?\b)"
    masked = re.sub(number_regex, '1', text)
    # Splitting on the space character and re-joining drops all spaces.
    return "".join(str(masked).split(' '))

# Preview the cleaned sample document; as the last expression it becomes the cell output.
remove_useless(text)


"三—'重大或有負債及未認列之合約承諾\n除已於其他附註所述者外，合併公司於資產負債表日重大未認列\n之合約事項如下：\n111年9月30日110年12月31日110年9月30日\n購置不動產、廠房及設備$1$1$1\n三二'具重大影響之外幣資產及負債資訊\n以下資訊係按合併公司各個體功能性貨幣以外之外幣彙總表達，\n所揭露之匯率係指該等外幣換算至功能性貨幣之匯率。具重大影響之\n外幣資產及負債如下：\n111年9月30日\n外幣匯率帳面金額\n金融資產\n貨幣性項目\n美金$11（美金：人民幣）$1\n美金11（美金：新台幣）1\n人民幣1,11（人民幣:新台幣）1\n金融負債\n貨帶性項目\n美金11（美金：人民幣）1\n美金11（美金：新台幣）1\n人民幣11（人民幣:新台幣）1\n110年12月31日\n外幣匯率帳面金額\n金融資產\n貨幣性項目\n美金$11（美金：人民幣）$1\n美金11（美金：新台幣）1\n人民幣11（人民幣：新台幣）1\n金融負債\n貨幣性項目\n美金11（美金：人民幣）1\n美金11（美金：新台幣）1\n人民幣11（人民幣：新台幣）1\n1-"

In [7]:
# from huggingface_hub import snapshot_download

# # 下載模型到指定路徑
# local_model_path = "./models/bge-m3"
# snapshot_download(repo_id="BAAI/bge-m3", revision="main", local_dir=local_model_path)


# from transformers import AutoModel

# reranker_model = 'BAAI/bge-reranker-v2-m3'

# # 下載並儲存模型和 tokenizer 到本地
# model_path = './local_bge_reranker_model'
# model = AutoModel.from_pretrained(reranker_model)

# # 將模型和 tokenizer 儲存到本地目錄
# model.save_pretrained(model_path)

In [8]:
from FlagEmbedding import FlagReranker
from sentence_transformers import SentenceTransformer

# Load the embedding model from the local ./models/bge-m3 directory.
# NOTE(review): device='cuda' is hard-coded, so this cell presumably fails on a
# machine without a CUDA GPU — confirm before running elsewhere.
embbeded_model = SentenceTransformer("./models/bge-m3", device='cuda')

# The reranker below is currently disabled.
#reranker = FlagReranker('BAAI/bge-reranker-v2-m3')

  from .autonotebook import tqdm as notebook_tqdm


In [9]:

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,  # target length of each chunk
    chunk_overlap=300  # number of characters shared between consecutive chunks
)

# Length of the `already_read` list that validate_finance allocates and passes to the PDF reader.
FILE_NUM = 2000

def validate_finance(embbeded_model, question, start: int = 50, stop: int = 100, top_k: int = 10):
    """Measure retrieval accuracy for the questions at indices ``range(start, stop)``.

    For every selected question the documents listed in its "source" entry are
    read from ``FINANCE_PATH``, cleaned with ``remove_useless`` and split into
    overlapping chunks by the module-level ``text_splitter``. ``get_ans``
    returns chunk indices with their similarities, ``find_most_correct`` picks
    one of them, and the source id owning that chunk is compared with
    ``ans["ground_truths"][i]["retrieve"]``. Mismatches are passed to
    ``show_wrong_ans`` with the "output_finance" tag.

    Two percentages are printed: questions answered correctly, and questions
    whose ground truth appears among the returned candidates.

    Args:
        embbeded_model: Embedding model, forwarded unchanged to ``get_ans``.
        question: Indexable collection of question dicts; each is read for its
            "query" and "source" entries.
        start: First question index to evaluate (inclusive). Defaults to 50.
        stop: Question index to stop at (exclusive). Defaults to 100.
        top_k: Fourth argument of ``get_ans`` — presumably the number of chunks
            it returns (TODO confirm). Defaults to 10.

    Raises:
        ValueError: If the range ``start..stop`` is empty.
    """
    total = stop - start
    if total <= 0:
        raise ValueError(f"empty question range: start={start}, stop={stop}")

    ans = read_json(ANS_PATH)
    ground_truths = ans["ground_truths"]
    correct = 0
    # Counts questions whose ground truth is among the candidates returned by get_ans.
    half_correct = 0

    # Handed to the PDF reader on every call (presumably a text cache — TODO confirm).
    already_read = [""] * FILE_NUM

    for i in range(start, stop):
        query = question[i]["query"]
        src = np.array(question[i]["source"])
        document = read_target_insurance_pdf(FINANCE_PATH, src, already_read)

        tokens = []
        # chunk_real_index[c] is the position in `src` of the document chunk c came from.
        chunk_real_index = []
        for j, raw_text in enumerate(document):
            chunks = text_splitter.split_text(remove_useless(raw_text))
            tokens.extend(chunks)
            chunk_real_index.extend([j] * len(chunks))
        chunk_real_index = np.array(chunk_real_index)

        token_index, similarities = get_ans(embbeded_model, question[i], tokens, top_k)
        real_index = chunk_real_index[token_index]
        all_predict = src[real_index]
        all_predict_text = [tokens[p] for p in token_index]
        predict = find_most_correct(all_predict_text, query, all_predict, similarities)

        expected = ground_truths[i]["retrieve"]
        if predict == expected:
            correct += 1
        else:
            show_wrong_ans(i, predict, ans, token_index,
                           src, real_index, similarities, chunk_real_index, tokens, "output_finance")

        if expected in all_predict:
            half_correct += 1

    # Divide by the number of evaluated questions rather than a hard-coded 50 so
    # the percentages stay correct when a different range is requested.
    print(f"acc: {correct / total * 100} %")
    # NOTE(review): the label says "rank 5" while `top_k` candidates (10 by
    # default) are requested — confirm which one is intended.
    print(f"in rank 5: {half_correct / total * 100} %")

### 驗證finance資料集的正確率

In [10]:
import os

# Create the directory named "output_finance/" (presumably where show_wrong_ans
# writes its reports — TODO confirm). exist_ok=True makes re-running the cell
# safe and avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs("output_finance/", exist_ok=True)
# Run the finance evaluation with the loaded embedding model and question list;
# the function prints its two accuracy figures.
validate_finance(embbeded_model, question)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\weiso131\AppData\Local\Temp\jieba.cache
Loading model cost 0.446 seconds.
Prefix dict has been built successfully.


qid: 59, predict: 639, ans: 632
qid: 64, predict: 706, ans: 124
qid: 72, predict: 497, ans: 204
qid: 80, predict: 386, ans: 213
qid: 86, predict: 757, ans: 189
qid: 94, predict: 481, ans: 699
qid: 97, predict: 579, ans: 282


KeyboardInterrupt: 