In [23]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from funcs.logic import get_qa_score
from typing import TypedDict, List, Optional
from langgraph.graph import StateGraph

# --- Notebook configuration -------------------------------------------------
# Hugging Face model id used for document/query embeddings.
embedding_model = "intfloat/multilingual-e5-small"

# Directory of source PDFs and directory where the FAISS index is persisted.
source_dir = "data/source/pdf"
output_dir = "data/vectordb/dict"



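# ------------------------
# When loading the index later
# ------------------------

# 6. Load the FAISS index with the same embedding model that was used to build it
#    (the actual load call lives in a later cell).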


In [30]:
# 1. Toy corpus: document id -> text content plus metadata.
data = {
    "doc1": {
        "content": "이것은 첫 번째 문서입니다.",
        "metadata": {"category": "공지", "author": "홍길동"},
    },
    "doc2": {
        "content": "두 번째 문서는 여기에 있습니다.",
        "metadata": {"category": "보고서", "author": "이순신"},
    },
    "doc3": {
        "content": "세 번째 문서는 여기에 있습니다.",
        "metadata": {"category": "보고서", "author": "이순신"},
    },
    "doc4": {
        "content": "넷 번째 문서는 여기에 있습니다.",
        "metadata": {"category": "보고서", "author": "이순신"},
    },
    "doc5": {
        "content": "다섯 번째 문서는 여기에 있습니다.",
        "metadata": {"category": "보고서", "author": "이순신"},
    },
}

# 2. Wrap every entry in a LangChain Document; the dict key is recorded under
#    "source" and the entry's own metadata is merged in after it.
documents = [
    Document(
        page_content=entry["content"],
        metadata={"source": doc_id, **entry["metadata"]},
    )
    for doc_id, entry in data.items()
]

# 3. Embedding model wrapper (model id comes from the config cell).
embedding = HuggingFaceEmbeddings(model_name=embedding_model)

# 4. Embed the documents and build the FAISS index in memory.
faiss_index = FAISS.from_documents(documents=documents, embedding=embedding)

# 5. Persist the index to disk so it can be reloaded later.
faiss_index.save_local(folder_path=output_dir)

In [31]:
# Load the reranker used to score (question, passage) pairs.
# The tokenizer and the model must come from the same checkpoint, so the
# checkpoint name is defined once and shared by both calls.
reranker_model = "Dongjin-kr/ko-reranker"
qa_tokenizer = AutoTokenizer.from_pretrained(reranker_model)
qa_model = AutoModelForSequenceClassification.from_pretrained(reranker_model)

In [32]:
# Reload the FAISS index persisted under `output_dir`, reusing the same
# embedding object that was used to build it.
# SECURITY: allow_dangerous_deserialization=True opts in to pickle-based
# loading of the stored index data. Only load index folders produced by this
# notebook or another trusted source.
vector_db = FAISS.load_local(
    folder_path=output_dir,
    embeddings=embedding,
    allow_dangerous_deserialization=True,
)



In [37]:
# LangGraph pipeline: state schema, node functions, and graph wiring
class QAState(TypedDict):
    """State dictionary shared by the nodes of the QA graph.

    Only ``question`` is supplied by the caller; the remaining keys are
    filled in by the nodes as the graph runs.
    """
    # The user's question; read by retrieve_documents and rerank_documents.
    question: str
    # Output of retrieve_documents: the result of
    # vector_db.similarity_search_with_score, consumed downstream as
    # (document, score) pairs.
    retrieved_docs: Optional[List]
    # Output of rerank_documents: (document, reranker score) pairs sorted by
    # score in descending order.
    reranked_docs: Optional[List]
    # Output of rerank_documents: documents from the five best-scored pairs
    # whose score is above the average reranker score.
    top_docs: Optional[List]
    # Not written by any node visible in this notebook section; presumably
    # reserved for a later answer-generation step -- TODO confirm.
    answer: Optional[str]


def retrieve_documents(state: QAState):
    """Graph node: fetch candidate documents for the question from the vector DB.

    Args:
        state: Current graph state; only ``state["question"]`` is read.

    Returns:
        A partial state update ``{"retrieved_docs": ...}`` holding whatever
        ``vector_db.similarity_search_with_score`` returns for ``k=30``.
    """
    hits = vector_db.similarity_search_with_score(state["question"], k=30)
    return dict(retrieved_docs=hits)


def rerank_documents(state: QAState):
    """Graph node: re-score retrieved documents with the reranker and pick the best.

    Every retrieved document is scored against the question with
    ``get_qa_score``. The pairs are sorted by that score (highest first), and
    from the five best pairs only the documents whose score is strictly above
    the average score of all candidates are kept as ``top_docs``.

    Args:
        state: Current graph state. Reads ``state["question"]`` and
            ``state["retrieved_docs"]`` (an iterable of ``(document, score)``
            pairs; the retrieval score itself is ignored here).

    Returns:
        A partial state update with:
            * ``reranked_docs``: list of ``(document, reranker_score)`` pairs,
              sorted by score in descending order.
            * ``top_docs``: list of documents selected as described above.
        Both lists are empty when there is nothing to rerank.

    Side effects:
        Writes the reranker score into ``doc.metadata["score"]`` for each of
        the five best-scored documents (whether or not they pass the
        above-average filter).
    """
    question = state["question"]

    # `retrieved_docs` is Optional in QAState: treat a missing or None value
    # the same as an empty retrieval result.
    retrieved_docs = state.get("retrieved_docs") or []
    if not retrieved_docs:
        # Nothing to score; bail out before the average below divides by zero.
        return {"reranked_docs": [], "top_docs": []}

    scored_docs = [
        (doc, get_qa_score(question, doc.page_content, qa_tokenizer, qa_model))
        for doc, _ in retrieved_docs
    ]
    average_score = sum(score for _, score in scored_docs) / len(scored_docs)

    reranked_docs = sorted(scored_docs, key=lambda pair: pair[1], reverse=True)

    # NOTE(review): the comparison is strict, so when every candidate has the
    # same score (e.g. a single retrieved document) `top_docs` stays empty.
    top_docs = []
    for doc, score in reranked_docs[:5]:
        doc.metadata["score"] = score
        if score > average_score:
            top_docs.append(doc)

    return {
        "reranked_docs": reranked_docs,
        "top_docs": top_docs,
    }


# Wire the two-node pipeline: retrieve_documents -> rerank_documents.
graph = StateGraph(QAState)

graph.add_node("retrieve_documents", retrieve_documents)
graph.add_node("rerank_documents", rerank_documents)

graph.set_entry_point("retrieve_documents")
graph.add_edge("retrieve_documents", "rerank_documents")

# The run ends after reranking. The finish point must be the terminal node:
# marking the entry node as finished would leave "rerank_documents" without
# an outgoing edge and declare the run complete before reranking.
graph.set_finish_point("rerank_documents")

qa_graph = graph.compile()




In [38]:
# Run the compiled graph for a single question; only "question" needs to be
# supplied, the nodes fill in the remaining state keys.
result = qa_graph.invoke({"question": "두번째 문서는?"})


In [40]:
# Show the documents that passed the reranker's above-average filter.
result["top_docs"]

[Document(id='767a97e9-ad3c-4430-8c72-13e0ae077fa6', metadata={'source': 'doc2', 'category': '보고서', 'author': '이순신', 'score': 0.9995869994163513}, page_content='두 번째 문서는 여기에 있습니다.')]