In [19]:
# company_report_agent_final.py

from dotenv import load_dotenv
load_dotenv()

import glob, os
from typing import List, Optional
from dataclasses import dataclass

# Document loaders / splitter / vectorstore
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS

# LCEL
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

# Structured output
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

# Retrieval enhancers
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter


@dataclass
class IndexConfig:
    study_dir: str = "./mystudy"
    index_path: Optional[str] = "./faiss_index"
    chunk_size: int = 500
    chunk_overlap: int = 50
    k: int = 4
    use_multi_query: bool = True
    use_compression: bool = True
    similarity_threshold: float = 0.30

@dataclass
class ReportConfig:
    model: str = "gpt-5-nano"
    temperature: float = 0.2
    audience: str = "임원"
    tone: str = "격식체, 간결"
    length: str = "요약+핵심 문단(각 섹션 5문장 이내)"
    citation_style: str = "말미 인용(source_file, page/section)"


# ----------------------------
# 1) 코퍼스 로드 / 인덱스 구성
# ----------------------------
def load_corpus(study_dir: str):
    docs = []
    for p in glob.glob(os.path.join(study_dir, "*.pdf")):
        for d in PyMuPDFLoader(p).load():
            d.metadata.update(file_type="pdf", source_file=os.path.basename(p))
            docs.append(d)
    for p in glob.glob(os.path.join(study_dir, "*.md")):
        for d in TextLoader(p, encoding="utf-8").load():
            d.metadata.update(file_type="markdown", source_file=os.path.basename(p))
            docs.append(d)
    return docs

def build_vectorstore(docs, cfg: IndexConfig):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=cfg.chunk_size,
        chunk_overlap=cfg.chunk_overlap
    )
    chunks = splitter.split_documents(docs)
    emb = OpenAIEmbeddings()

    if cfg.index_path and os.path.isdir(cfg.index_path):
        vs = FAISS.load_local(
            folder_path=cfg.index_path,
            embeddings=emb,
            allow_dangerous_deserialization=True
        )
        vs.add_documents(chunks)
    else:
        vs = FAISS.from_documents(chunks, emb)
        if cfg.index_path:
            os.makedirs(cfg.index_path, exist_ok=True)
            vs.save_local(cfg.index_path)

    return vs

def build_retriever(vs, cfg: IndexConfig):
    base = vs.as_retriever(search_kwargs={"k": cfg.k})

    if cfg.use_multi_query:
        base = MultiQueryRetriever.from_llm(
            retriever=base,
            llm=ChatOpenAI(temperature=0)
        )

    if cfg.use_compression:
        filt = EmbeddingsFilter(
            embeddings=OpenAIEmbeddings(),
            similarity_threshold=cfg.similarity_threshold
        )
        base = ContextualCompressionRetriever(
            base_retriever=base,
            base_compressor=filt
        )

    return base


# ----------------------------
# 2) 구조화 스키마 정의
# ----------------------------
class ReportSection(BaseModel):
    heading: str = Field(..., description="섹션 제목")
    content: str = Field(..., description="본문(말미 인용 포함)")
    bullets: List[str] = Field(default_factory=list)

class ReportModel(BaseModel):
    title: str
    executive_summary: str
    sections: List[ReportSection]
    key_findings: List[str]
    risks: List[str]
    recommendations: List[str]
    references: List[str]


# ----------------------------
# 3) 프롬프트 템플릿
# ----------------------------
MAP_TEMPLATE = """아래 문서 한 조각과 요청 사양(spec)을 바탕으로, 관련성이 높은 증거 3개 이하로 요약하세요.
각 항목 말미에 (source_file: 파일명, page: 가능 시) 형태로 인용을 붙이세요.

[spec]
{spec}

[doc_page]
{page_content}

[meta]
source_file={source_file}, page={page}

[short_evidence]
"""

REDUCE_TEMPLATE = """다음은 여러 문서 조각에서 추출한 증거 목록입니다.
중복/장황한 내용을 제거하고, 최대 600토큰 내에서 핵심 근거만 남겨 통합하세요.
인용 표기는 유지하세요.

[evidence_list]
{evidence}

[consolidated_evidence]
"""

REPORT_INSTR = """당신은 기업용 보고서 작성 전문가입니다.
규칙:
- 제공된 context 내부 사실만 사용하고, 각 섹션 본문 말미에 인용({citation_style})을 포함하세요.
- 대상({audience}), 문체({tone}), 분량({length})을 준수하세요.
- 최종 출력은 JSON 스키마에 맞춰 JSON만 반환하세요. 추가 텍스트 금지.

[context]
{context}

[spec]
{spec}
"""


# ----------------------------
# 4) 에이전트 정의
# ----------------------------
class CompanyReportAgent:
    def __init__(
        self,
        idx_cfg: IndexConfig = IndexConfig(),
        rpt_cfg: ReportConfig = ReportConfig()
    ):
        # 인덱스 및 리트리버 초기화
        docs = load_corpus(idx_cfg.study_dir)
        vs = build_vectorstore(docs, idx_cfg)
        retriever = build_retriever(vs, idx_cfg)

        # LLM 및 파서
        self.llm = ChatOpenAI(
            model=rpt_cfg.model,
            temperature=rpt_cfg.temperature
        )
        self.parser = PydanticOutputParser(pydantic_object=ReportModel)

        # Map 체인: 문서별 증거 요약
        map_prompt = ChatPromptTemplate.from_template(MAP_TEMPLATE)
        map_chain = map_prompt | self.llm | StrOutputParser()

        def fanout_map_inputs(x):
            items = []
            for d in x["docs"]:
                items.append({
                    "page_content": d.page_content,
                    "source_file": d.metadata.get("source_file", ""),
                    "page": d.metadata.get("page", d.metadata.get("page_number", "")),
                    "spec": x["spec"],
                })
            return items

        self.map_evidence_chain = (
            RunnableLambda(fanout_map_inputs)
            | map_chain.map()
            | RunnableLambda(lambda lst: "\n".join(lst))
        )

        # Reduce 체인: 증거 통합
        reduce_prompt = ChatPromptTemplate.from_template(REDUCE_TEMPLATE)
        self.reduce_chain = (
            {"evidence": self.map_evidence_chain}
            | reduce_prompt
            | self.llm
            | StrOutputParser()
        )

        # 최종 보고서 체인
        final_prompt = ChatPromptTemplate.from_template(
            REPORT_INSTR + "\n{format_instructions}"
        ).partial(
            audience=rpt_cfg.audience,
            tone=rpt_cfg.tone,
            length=rpt_cfg.length,
            citation_style=rpt_cfg.citation_style,
            format_instructions=self.parser.get_format_instructions()
        )

        self.chain = (
            {"docs": retriever, "spec": RunnablePassthrough()}
            | RunnablePassthrough().assign(context=self.reduce_chain)
            | final_prompt
            | self.llm
            | self.parser
        )

    def generate_report(self, spec: str) -> ReportModel:
        return self.chain.invoke(spec)


# ----------------------------
# 5) 사용 예시
# ----------------------------
if __name__ == "__main__":
    spec = (
        "목표: RAG 약관/정책 자료의 기업 보고서화 가능성 평가\n"
        "범위: 검색/인용/운영비용/리스크/개선안\n"
        "요청: 임원 브리핑용 보고서"
    )
    agent = CompanyReportAgent()
    report = agent.generate_report(spec)
    print(report.json(indent=2, ensure_ascii=False))


C:\Users\tipay\AppData\Local\Temp\ipykernel_16284\2834682981.py:257: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  print(report.json(indent=2, ensure_ascii=False))


TypeError: `dumps_kwargs` keyword arguments are no longer supported.