In [22]:
import os
import re
import json
import yaml
import torch
import pandas as pd

from accelerate import Accelerator

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

from sentence_transformers import (
    SentenceTransformer,
    CrossEncoder,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)

from dotenv import load_dotenv

# Load variables from a local .env file; override=True asks python-dotenv to let
# the .env values replace variables that are already set in the environment.
load_dotenv(override=True)

from huggingface_hub import login

# Authenticate with the Hugging Face Hub using the HUGGINGFACE_TOKEN variable.
# os.getenv returns None when the variable is unset.
# NOTE(review): confirm how `login` behaves with token=None in this environment.
login(token=os.getenv("HUGGINGFACE_TOKEN"))

# Seed the PyTorch random number generators before any sampling takes place.
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [23]:
def l2_format_docs(docs):
    """Join passage texts into one string, separating them with a blank line.

    Args:
        docs: Iterable of strings (the passage texts).

    Returns:
        A single string with the passages in their original order, or an
        empty string when `docs` is empty.
    """
    separator = "\n\n"
    return separator.join(docs)


def l3_format_docs(docs_pairs):
    """Render question/answer records as consecutive "Question/Answer" blocks.

    Args:
        docs_pairs: Iterable of mappings, each providing the keys
            "question" and "answer_list".

    Returns:
        One string in which every record contributes a
        "Question: ...\\nAnswer: ...\\n\\n" block; empty when there are no records.
    """
    return "".join(
        f"Question: {pair['question']}\nAnswer: {pair['answer_list']}\n\n"
        for pair in docs_pairs
    )


def contains_chinese(text):
    """Report whether `text` has any character in the range U+4E00-U+9FFF.

    That range is the CJK Unified Ideographs block, so Hanja/Kanji in that
    block match as well.

    Args:
        text: String to inspect.

    Returns:
        True if at least one such character is present, otherwise False.
    """
    return re.search("[\u4e00-\u9fff]", text) is not None

# Data Load

In [26]:
# Paths
local_path = "."  # your path

data_path = f"{local_path}/data"

test_path = f"{data_path}/ver-1-test.csv"  # processed test set
qe_path = f"{data_path}/ver-1-preprocessed.jsonl"  # question expansion data
prompts_path = f"{data_path}/prompts.yaml"  # prompts
docs_dir = f"{data_path}/md-files-processed-jsonl"  # processed documents (pdf)

answer_filename = "real_answer.jsonl"  # final answers are saved here
submit_filename = "submit.csv"  # submission file is saved here

# Refuse to run when an output file from a previous run is still present.
for output_name in (answer_filename, submit_filename):
    if os.path.exists(f"{data_path}/{output_name}"):
        raise FileExistsError(f"'{output_name}' already exists.")


# CSV inputs
test = pd.read_csv(test_path, encoding="utf-8-sig")
submission = pd.read_csv(f"{data_path}/sample_submission.csv", encoding="utf-8-sig")


# Query Expansion (Ver 1): one JSON record per line, keeping the first three
# questions of each record.
with open(qe_path, "r", encoding="utf-8-sig") as f:
    qe_data = [json.loads(raw_line) for raw_line in f]
for qe_record in qe_data:
    qe_record["questions"] = qe_record["questions"][:3]


# Prompts
with open(prompts_path, "r", encoding="utf-8-sig") as f:
    prompts = yaml.safe_load(f)

l2_prompts = prompts["rag_level_2"]
l2_rag_system_prompt_eng = l2_prompts["rag_system_prompt_eng"]
l2_rag_user_prompt_eng = l2_prompts["rag_user_prompt_eng"]

l3_prompts = prompts["rag_level_3"]
l3_rag_system_prompt_eng = l3_prompts["rag_system_prompt_eng"]
l3_rag_user_prompt_eng = l3_prompts["rag_user_prompt_eng"]


# Documents: read every .jsonl file in sorted name order and keep the records
# whose metadata type is not "intro".
file_list = sorted(os.listdir(docs_dir))
documents = []
for file in file_list:
    if not file.endswith(".jsonl"):
        continue
    with open(f"{docs_dir}/{file}", "r", encoding="utf-8-sig") as f:
        records = [json.loads(raw_line) for raw_line in f]
    documents.extend(
        Document(page_content=record["page_content"], metadata=record["metadata"])
        for record in records
        if record["metadata"]["type"] != "intro"
    )

# Model Load

In [None]:
# LLM
# Causal language model used for answer generation, loaded in bfloat16 with
# FlashAttention-2 attention and automatic device placement.
model_id = "Qwen/Qwen2.5-7B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)


# Embedding
# Embedding model used to build the FAISS index below.
embedding_model_id = "nlpai-lab/KURE-v1"
embedding = HuggingFaceEmbeddings(model_name=embedding_model_id)


# Embedding for Submission
# Note: `embedding_model_id` is reassigned here, so after this cell it names the
# submission model rather than the retrieval model.
embedding_model_id = "jhgan/ko-sbert-sts"
submission_embedding = SentenceTransformer(embedding_model_id)


# Reranking
# Cross-encoder with a sigmoid activation supplied as its default activation function.
reranker_model_id = "dragonkue/bge-reranker-v2-m3-ko"
reranker = CrossEncoder(
    reranker_model_id,
    default_activation_function=torch.nn.Sigmoid(),
)


# Retriever
# Index all loaded documents and expose a similarity retriever configured with k=30.
vector_store = FAISS.from_documents(documents=documents, embedding=embedding)
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 30})

# RUN

In [None]:
top_k = 10
max_retries = 10
real_answer = []


def generate_with_retries(messages, max_new_tokens, level, retries):
    """Sample a chat completion, retrying while it is empty or contains Chinese.

    Args:
        messages: List of {"role", "content"} dicts passed to the tokenizer's
            chat template.
        max_new_tokens: Generation budget forwarded to `model.generate`.
        level: Label printed in the progress messages (2 or 3).
        retries: Maximum number of generation attempts.

    Returns:
        The first stripped response that is non-empty and for which
        `contains_chinese` is False. If every attempt is rejected, the last
        rejected response is returned unchanged (an empty string when
        `retries` is 0).
    """
    # The prompt does not change between attempts, so build and tokenize it once.
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    response = ""
    for attempt in range(1, retries + 1):
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.1,
            top_p=0.9,
        )
        # Drop the prompt tokens so that only the newly generated part is decoded.
        generated_ids = [
            output_ids[len(input_ids) :]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[
            0
        ].strip()

        if not response:
            print(f"[{attempt}/{retries}  LEVEL {level}] Empty Response. Retrying...")
            continue

        if contains_chinese(response):
            print(f"[{attempt}/{retries}  LEVEL {level}] Chinese Detected. Retrying...")
            continue

        print(">>>", response)
        return response

    # Every attempt was rejected: fall back to the last response as-is.
    print(
        f"[{retries}/{retries}  LEVEL {level}] Maximum Retry Count Reached. Last Response Used."
    )
    print(">>>", response)
    return response


for idx in range(test.shape[0]):
    question_answer_pairs = []

    # NOTE(review): assumes qe_data[idx] belongs to the idx-th row of `test` — confirm.
    test_id = qe_data[idx]["test_id"]
    questions = qe_data[idx]["questions"]

    print(f"[{idx+1}/{test.shape[0]}  {test_id}]", "-----" * 10)

    # LEVEL 2: answer each expanded question from its own retrieved context.
    for q_num, question in enumerate(questions):
        print(f"Q{q_num+1}: {question}")

        results = retriever.invoke(question)

        # Score every retrieved passage against the question and keep the
        # top_k highest-scoring passages, best first.
        scores = reranker.predict(
            [[question, result.page_content] for result in results]
        )
        scored_results = sorted(
            zip(scores, results), key=lambda pair: pair[0], reverse=True
        )
        reranked_contents = [
            result.page_content for _, result in scored_results[:top_k]
        ]
        search_results = l2_format_docs(reranked_contents)

        l2_valid_response = generate_with_retries(
            [
                {"role": "system", "content": l2_rag_system_prompt_eng},
                {
                    "role": "user",
                    "content": l2_rag_user_prompt_eng.format(
                        question=question, search_results=search_results
                    ),
                },
            ],
            max_new_tokens=1024,
            level=2,
            retries=max_retries,
        )

        torch.cuda.empty_cache()

        l2_save_data = {
            "question": question,
            "answer_list": l2_valid_response,
            "test_id": test_id,
        }
        question_answer_pairs.append(l2_save_data)

    # LEVEL 3: produce one final response from all question/answer pairs.
    l3_valid_response = generate_with_retries(
        [
            {"role": "system", "content": l3_rag_system_prompt_eng},
            {
                "role": "user",
                "content": l3_rag_user_prompt_eng.format(
                    question_answer_pairs=l3_format_docs(question_answer_pairs)
                ),
            },
        ],
        max_new_tokens=512,
        level=3,
        retries=max_retries,
    )

    l3_save_data = {
        "total_final_response": l3_valid_response,
        "test_id": test_id,
    }
    real_answer.append(l3_save_data)

    # Append immediately so finished answers are on disk if a later item fails.
    with open(f"{data_path}/{answer_filename}", "a", encoding="utf-8-sig") as f:
        f.write(json.dumps(l3_save_data, ensure_ascii=False) + "\n")

    torch.cuda.empty_cache()


# Post-process the final results
def _clean_final_response(text):
    """Strip U+4E00-U+9FFF characters and replace a known stray token.

    The two substitutions are chained, so a response that needs both fixes
    receives both; neither one is computed from the untouched original text.

    Args:
        text: Final response string.

    Returns:
        The cleaned response string.
    """
    without_chinese = re.sub("[\u4e00-\u9fff]", "", text)
    return without_chinese.replace("apply_not_to_be_caused", "발생하지 않도록 하고")


real_answer_df = pd.DataFrame(real_answer)
real_answer_df_cp = real_answer_df.copy()
real_answer_df_cp["total_final_response"] = real_answer_df_cp[
    "total_final_response"
].map(_clean_final_response)


# Submission
# Encode the post-processed answers with the submission embedding model.
test_results = real_answer_df_cp["total_final_response"].tolist()
pred_embeddings = submission_embedding.encode(test_results)

# Column 1 receives the answer text; columns 2 onward receive the embedding values.
# NOTE(review): assumes `submission` has one row per answer and as many trailing
# columns as the embedding dimension — confirm against sample_submission.csv.
submission.iloc[:, 1] = test_results
submission.iloc[:, 2:] = pred_embeddings

submission.to_csv(f"{data_path}/{submit_filename}", index=False, encoding="utf-8-sig")

print("-----" * 10)
print(f"Saved Submission File: '{data_path}/{submit_filename}'")