In [None]:
import os
import re
import unicodedata
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from funcs.logic import process_pdfs_to_faiss_with_positions
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from typing import TypedDict, List, Optional
from langgraph.graph import StateGraph
from funcs.logic import get_qa_score
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from docx import Document as DocxDocument
from tqdm import tqdm  # ÏßÑÌñâ ÏÉÅÌô© ÌëúÏãúÏö©

# Notebook configuration.
source_dir = "path/to/sources"  # passed as the directory scanned for .docx files
output_dir = "data/case2"  # passed as the directory the vector DB is saved to
# Smaller alternative model: "intfloat/multilingual-e5-small"
embedding_model = "intfloat/multilingual-e5-large-instruct"
chunk_size = 500  # size of each chunk (number of characters)
chunk_overlap = 50  # overlap between consecutive chunks (number of characters)

In [None]:
# 1. Extract text from a Word document and preprocess it
def extract_and_preprocess_text_from_docx(file_path):
    """Read a Word document and return its preprocessed text.

    Paragraphs whose text is blank after stripping are dropped; the remaining
    paragraph texts are joined with newlines and passed through
    ``preprocess_text``.

    Args:
        file_path: Path of the .docx file to open.

    Returns:
        The preprocessed text, or ``""`` when any exception occurs while
        opening or processing the file (the error is printed, not raised).
    """
    try:
        non_blank_paragraphs = (
            paragraph.text
            for paragraph in DocxDocument(file_path).paragraphs
            if paragraph.text.strip()
        )
        return preprocess_text("\n".join(non_blank_paragraphs))
    except Exception as e:
        # Best effort: report the failing file and let the caller skip it.
        print(f"‚ö†Ô∏è {file_path} ÌååÏùº Ï≤òÎ¶¨ Ï§ë Ïò§Î•ò Î∞úÏÉù: {str(e)}")
        return ""

# 2. Text preprocessing
def preprocess_text(text):
    """Normalize and tidy raw text while preserving its line structure.

    Steps, in order:
      1. Unicode NFC normalization (composes decomposed Hangul/accents).
      2. Removal of zero-width and BOM characters.
      3. CR / CRLF line endings converted to a plain newline.
      4. Runs of whitespace other than newlines collapsed to one space.
      5. Every line stripped, and empty lines dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; lines are separated by single newlines and the
        result has no leading or trailing whitespace.
    """
    text = unicodedata.normalize('NFC', text)

    # Invisible characters: zero-width space/joiners, word joiner, BOM.
    text = re.sub(r'[\u200b\u200c\u200d\u2060\ufeff]', '', text)

    # Unify line endings so the per-line handling below sees every break.
    text = re.sub(r'\r\n?', '\n', text)

    # Collapse horizontal whitespace only. Using r'\s+' here would also
    # swallow newlines and flatten the whole document into a single line,
    # which makes the line-based cleanup below (and any newline-based
    # chunk separators downstream) ineffective.
    text = re.sub(r'[^\S\n]+', ' ', text)

    stripped_lines = (line.strip() for line in text.split('\n'))
    return '\n'.join(line for line in stripped_lines if line)

# 3. Split texts into chunks
def create_chunks_from_texts(texts, metadata_list, chunk_size=500, chunk_overlap=50):
    """Split each text into chunks and wrap every chunk in a ``Document``.

    Args:
        texts: Source strings to split.
        metadata_list: Metadata objects paired positionally with ``texts``
            (each must support ``.copy()`` and item assignment).
        chunk_size: Passed to the splitter as ``chunk_size``; lengths are
            measured with ``len``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        A flat list of ``Document`` objects. Each one carries a copy of its
        source metadata plus ``chunk_id``, the chunk's index within its text.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", "? ", "! ", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    documents = []
    for source_text, source_metadata in zip(texts, metadata_list):
        if not source_text:
            # Empty texts produce no chunks.
            continue
        for index, piece in enumerate(splitter.split_text(source_text)):
            # Copy so each chunk gets its own metadata object.
            piece_metadata = source_metadata.copy()
            piece_metadata["chunk_id"] = index
            documents.append(Document(page_content=piece, metadata=piece_metadata))

    return documents

# 4. Main entry point: read multiple Word files and build the vector DB
def process_word_documents_to_vector_db(doc_dir, output_dir,
                                        embedding_model_name="intfloat/multilingual-e5-small",
                                        chunk_size=500, chunk_overlap=50):
    """Build a FAISS vector DB from the Word documents in a directory and save it.

    Every ``.docx`` file directly inside ``doc_dir`` is read, preprocessed,
    split into chunks, embedded and indexed. The index is written to
    ``output_dir`` (created if missing).

    Args:
        doc_dir: Directory to scan for Word documents (not recursive).
        output_dir: Directory the FAISS index is saved to.
        embedding_model_name: Model name handed to ``HuggingFaceEmbeddings``.
        chunk_size: Chunk size forwarded to ``create_chunks_from_texts``.
        chunk_overlap: Chunk overlap forwarded to ``create_chunks_from_texts``.

    Returns:
        The FAISS vector store, or ``None`` when there are no Word files or
        no chunks could be produced.
    """
    # Sorted so the index is built in a reproducible order (os.listdir order
    # is arbitrary). Names starting with "~$" are the lock files Word leaves
    # next to an open document; they are not real documents, so skip them
    # instead of failing on each one.
    word_files = sorted(
        f for f in os.listdir(doc_dir)
        if f.lower().endswith(".docx") and not f.startswith("~$")
    )

    if not word_files:
        print("‚ö†Ô∏è Ï≤òÎ¶¨Ìï† Word ÌååÏùºÏù¥ ÏóÜÏäµÎãàÎã§.")
        return None

    print(f"üîç Ï¥ù {len(word_files)}Í∞úÏùò Word ÌååÏùºÏùÑ Ï≤òÎ¶¨Ìï©ÎãàÎã§...")

    # Extract and preprocess the text of each file; texts and metadata are
    # kept as parallel lists.
    texts = []
    metadata_list = []

    for file_name in tqdm(word_files, desc="ÌååÏùº Ï≤òÎ¶¨ Ï§ë"):
        file_path = os.path.join(doc_dir, file_name)
        text = extract_and_preprocess_text_from_docx(file_path)

        if not text:
            # Extraction yielded nothing for this file; leave it out.
            continue

        texts.append(text)
        metadata_list.append({
            "source": file_name,
            "full_path": file_path,
            "file_type": "docx"
        })

    # Chunking
    print(f"üìÑ ÌÖçÏä§Ìä∏Î•º Ï≤≠ÌÅ¨ ÌÅ¨Í∏∞ {chunk_size}(Í≤πÏπ® {chunk_overlap})Î°ú Î∂ÑÌï†Ìï©ÎãàÎã§...")
    documents = create_chunks_from_texts(texts, metadata_list, chunk_size, chunk_overlap)

    if not documents:
        print("‚ö†Ô∏è Ï≤òÎ¶¨Ìï† Î¨∏ÏÑúÍ∞Ä ÏóÜÏäµÎãàÎã§.")
        return None

    print(f"‚úÖ {len(documents)}Í∞úÏùò Ï≤≠ÌÅ¨Í∞Ä ÏÉùÏÑ±ÎêòÏóàÏäµÎãàÎã§.")

    # Load the embedding model
    print(f"üß† ÏûÑÎ≤†Îî© Î™®Îç∏ '{embedding_model_name}'ÏùÑ Î°úÎìúÌï©ÎãàÎã§...")
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

    # Build the FAISS vector DB
    print("üî¢ Î≤°ÌÑ∞ DBÎ•º ÏÉùÏÑ±Ìï©ÎãàÎã§...")
    vector_db = FAISS.from_documents(documents, embeddings)

    # Save the vector DB; exist_ok avoids the check-then-create race.
    os.makedirs(output_dir, exist_ok=True)

    vector_db.save_local(output_dir)
    print(f"‚úÖ '{output_dir}'Ïóê Î≤°ÌÑ∞ Îç∞Ïù¥ÌÑ∞Î≤†Ïù¥Ïä§ Ï†ÄÏû• ÏôÑÎ£å")

    return vector_db

In [None]:
# Load the reranker tokenizer and model; both are handed to get_qa_score
# when retrieved documents are rescored.
qa_tokenizer = AutoTokenizer.from_pretrained("Dongjin-kr/ko-reranker")
qa_model = AutoModelForSequenceClassification.from_pretrained("Dongjin-kr/ko-reranker")


# Build and save the vector DB from the Word documents, using the
# notebook-level configuration values.
vector_db = process_word_documents_to_vector_db(
    doc_dir=source_dir,
    output_dir=output_dir,
    embedding_model_name=embedding_model,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)


In [None]:
# LangGraph pipeline
class QAState(TypedDict):
    """State dictionary passed between the nodes of the QA graph."""

    # Query text; read by both retrieve_documents and rerank_documents.
    question: str
    # (document, score) pairs written by retrieve_documents.
    retrieved_docs: Optional[List]
    # (document, reranker score) pairs sorted by descending score,
    # written by rerank_documents.
    reranked_docs: Optional[List]
    # Documents kept by rerank_documents after its score filter.
    top_docs: Optional[List]
    # Not written by any node visible in this notebook.
    answer: Optional[str]


def retrieve_documents(state: QAState):
    """Graph node: fetch candidate documents for the question.

    Runs ``similarity_search_with_score`` on the module-level ``vector_db``
    with ``k=30`` and returns the hits under the ``retrieved_docs`` key.
    """
    hits = vector_db.similarity_search_with_score(state["question"], k=30)
    return {"retrieved_docs": hits}


def rerank_documents(state: "QAState"):
    """Graph node: rescore the retrieved documents and keep the best ones.

    Each retrieved document is scored against the question with
    ``get_qa_score`` (using the module-level ``qa_tokenizer`` / ``qa_model``);
    the retrieval score is ignored. Documents are then sorted by descending
    score. From the ten highest-scoring documents, those whose score is
    strictly greater than the mean score of *all* retrieved documents are
    kept, so if every score is equal nothing is kept.

    Side effect: the score is stored in ``doc.metadata["score"]`` for each of
    the ten highest-scoring documents, whether or not it is kept.

    Args:
        state: Graph state holding ``question`` and ``retrieved_docs``
            (an iterable of ``(document, score)`` pairs).

    Returns:
        A dict with ``reranked_docs`` (all ``(document, score)`` pairs,
        best first) and ``top_docs`` (the kept documents). Both are empty
        lists when nothing was retrieved.
    """
    question = state["question"]
    retrieved_docs = state.get("retrieved_docs") or []

    if not retrieved_docs:
        # Nothing to score; returning early also avoids dividing by zero
        # when computing the mean below.
        return {
            "reranked_docs": [],
            "top_docs": [],
        }

    scored_docs = [
        (doc, get_qa_score(question, doc.page_content, qa_tokenizer, qa_model))
        for doc, _ in retrieved_docs
    ]
    average_score = sum(score for _, score in scored_docs) / len(scored_docs)

    reranked_docs = sorted(scored_docs, key=lambda pair: pair[1], reverse=True)

    top_docs = []
    for doc, score in reranked_docs[:10]:
        doc.metadata["score"] = score
        if score > average_score:
            top_docs.append(doc)

    return {
        "reranked_docs": reranked_docs,
        "top_docs": top_docs,
    }





In [None]:
# Wire the two-node QA pipeline: retrieve_documents -> rerank_documents.
graph = StateGraph(QAState)

graph.add_node("retrieve_documents", retrieve_documents)
graph.add_node("rerank_documents", rerank_documents)

graph.set_entry_point("retrieve_documents")
graph.add_edge("retrieve_documents", "rerank_documents")

# The graph finishes after reranking. rerank_documents is the last node in
# the chain and the only one that produces "top_docs", which later cells
# read from the invoke() result; finishing at retrieve_documents would
# leave rerank_documents without a path to the end of the graph.
graph.set_finish_point("rerank_documents")

qa_graph = graph.compile()


In [None]:
import csv


# Read the CSV, run every row's "item" through the QA graph, and collect
# one dict per row (identifying columns plus the graph result).
data_list = []
with open("path/to/sources", newline='', encoding="utf-8-sig") as csvfile:
    reader = csv.DictReader(csvfile)  # yields each row as a dict
    for row in reader:
        # Columns carried over from the CSV row, in this order.
        op = {column: row[column] for column in ('code', 'item', 'exp')}
        result = qa_graph.invoke({"question": f"{row['item']}"})

        op['result'] = result
        data_list.append(op)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Flatten the graph results: one record per document kept in "top_docs",
# tagged with the code and item of the CSV row it was retrieved for.
result_list = [
    {
        "code": _documents['code'],
        "item": _documents['item'],
        "page_content": _docs.page_content,
        "score": _docs.metadata['score'],
    }
    for _documents in data_list
    for _docs in _documents['result']['top_docs']
]

df = pd.DataFrame(result_list)

# Mean score of the kept documents for each code.
mean_scores = df.groupby('code')['score'].mean().reset_index()

# Draw a box plot of the score distribution per code.
plt.figure(figsize=(12, 6))
sns.boxplot(x='code', y='score', data=df)
plt.xticks(rotation=90)
plt.title('Score Distribution by Code')
plt.show()

In [None]:
# Display the flattened per-document records.
result_list

In [None]:
# Display the per-document results DataFrame.
df

In [None]:
# Display the per-row records, including the raw graph results.
data_list