In [None]:
#S1 _ 1차, 2차 chunk
#S1 성공

import fitz  # PyMuPDF
import re
import json

# PDF 열기 및 목차 이후 본문 텍스트 추출 (5페이지부터)
pdf_path = r"D:\TECHLAB\2차프로젝트_Chatbot\data\raw\IFRS_S1.pdf"
doc = fitz.open(pdf_path)

start_page = 4  # 0-indexed → 5페이지부터
full_text = "\n".join(doc[i].get_text() for i in range(start_page, len(doc)))

# 섹션 정의 (start ~ end 기준, 정규표현식)
section_defs = [
    ("IFRS_S1_Objective", r"\nObjective\n", r"\nScope\n"),
    ("IFRS_S1_Scope", r"\nScope\n", r"\nConceptual foundations\n"),
    ("IFRS_S1_Conceptual foundations", r"\nConceptual foundations\n", r"\nCore content\n"), 
    ("IFRS_S1_Governance", r"\nGovernance\n", r"\nStrategy\n"),
    ("IFRS_S1_Strategy", r"\nStrategy\n", r"\nRisk management\n"),
    ("IFRS_S1_Risk management", r"\nRisk management\n", r"\nMetrics and targets\n"),
    ("IFRS_S1_Metrics and targets", r"\nMetrics and targets\n", r"\nGeneral requirements\n"),  
    ("IFRS_S1_General requirements", r"\nGeneral requirements\n", r"\nJudgements\n"),
    ("IFRS_S1_Judgements", r"\nJudgements\n", r"\nMeasurement uncertainty\n"),
    ("IFRS_S1_Measurement uncertainty", r"\nMeasurement uncertainty\n", r"\nErrors\n"),
    ("IFRS_S1_Errors", r"\nErrors\n", r"Appendix A\nDefined terms"),
    ("IFRS_S1_Appendix A : Defined terms", r"Appendix A\nDefined terms", r"Appendix B\nApplication guidance"),
    ("IFRS_S1_Appendix B : Application guidance", r"Appendix B\nApplication guidance", r"Appendix C\nSources of guidance"),
    ("IFRS_S1_Appendix C : Sources of guidance", r"Appendix C\nSources of guidance", r"Appendix D\nQualitative characteristics"),
    ("IFRS_S1_Appendix D : Qualitative characteristics", r"Appendix D\nQualitative characteristics", r"Appendix E\nEffective date and transition"),
    ("IFRS_S1_Appendix E : Effective date and transition", r"Appendix E\nEffective date and transition", None)
]


# 1차 청크 생성
section_chunks = []
for chunk_id, start_pattern, end_pattern in section_defs: 
    start_match = re.search(start_pattern, full_text, flags=re.IGNORECASE)
    if not start_match:
        continue
    start_pos = start_match.start()
    end_pos = len(full_text)
    if end_pattern:
        end_match = re.search(end_pattern, full_text[start_pos + 1:], flags=re.IGNORECASE)
        if end_match:
            end_pos = start_pos + 1 + end_match.start()
    section_text = full_text[start_pos:end_pos].strip()
    section_chunks.append({"id": chunk_id, "text": section_text})

# 확인
print(f"✅ 1차 섹션 수: {len(section_chunks)}")
print(section_chunks[0]["id"], ":", section_chunks[0]["text"][:200], "...")


from langchain.schema import Document

# dict → Document로 변환
documents = [
    Document(page_content=chunk['text'], metadata={"source": chunk['id'], "title": "IFRS_S1"})
    for chunk in section_chunks
]

print(documents[10])

# 2차 청크 생성 (1000자 기준)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def split_into_chunks(documents, max_chars=1000):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chars,
        chunk_overlap=20,
        separators=["\n\n", "\n", ".", " ", ""]
    )

    new_docs = []
    for doc in documents:
        chunks = splitter.split_text(doc.page_content)
        for i, chunk in enumerate(chunks):
            new_docs.append(Document(
                page_content=chunk,
                metadata={
                    "chunk_id": f"{doc.metadata['chunk_id']}_{i+1:02}",
                    "source": doc.metadata["chunk_id"],
                }
            ))
    
    return new_docs

# ✅ dict → Document로 변환한 리스트
documents2 = [
    Document(page_content=chunk['text'], metadata={"chunk_id": chunk['id'], "title": "IFRS_S1"})
    for chunk in section_chunks
]

# ✅ 2차 청크 생성
second_level_docs = split_into_chunks(documents2)  # ✅ 여기 section_chunks ❌ → documents ✅
print(f"✅ 2차 청크 수: {len(second_level_docs)}")

IFRS_01_1ST=documents
IFRS_01_2ND=second_level_docs

✅ 1차 섹션 수: 16
IFRS_S1_Objective : Objective
The objective of IFRS S1 General Requirements for Disclosure of Sustainability-
related Financial Information is to require an entity to disclose information
about its sustainability-related ...
page_content='Errors
An entity shall correct material prior period errors by restating the
comparative amounts for the prior period(s) disclosed unless it is
impracticable to do so.
Prior period errors are omissions from and misstatements in the entity’s
sustainability-related financial disclosures for one or more prior periods. Such
errors arise from a failure to use, or the misuse of, reliable information that:
(a)
was available when the sustainability-related financial disclosures for
that period(s) were authorised for issue; and
(b)
could reasonably be expected to have been obtained and considered in
the preparation of those disclosures.
Corrections of errors are distinguished from changes in estimates. Estimates
are approximations that an entity 

In [111]:
IFRS_01_1ST[1]

Document(metadata={'source': 'IFRS_S1_Scope', 'title': 'IFRS_S1'}, page_content='Scope\nAn entity shall apply this Standard in preparing and reporting\nsustainability-related financial disclosures in accordance with IFRS\nSustainability Disclosure Standards.\nSustainability-related risks and opportunities that could not reasonably be\nexpected to affect an entity’s prospects are outside the scope of this Standard.\nOther IFRS Sustainability Disclosure Standards specify information an entity is\nrequired \nto \ndisclose \nabout \nspecific \nsustainability-related \nrisks \nand\nopportunities.\n1\n2\n3\n4\n5\n6\n7\n1\nThroughout this Standard, the terms ‘primary users’ and ‘users’ are used interchangeably, with\nthe same meaning.\nIFRS S1 GENERAL REQUIREMENTS FOR DISCLOSURE OF SUSTAINABILITY-\nRELATED FINANCIAL INFORMATION—JUNE 2023\n6\n© IFRS Foundation\n\nAn entity may apply IFRS Sustainability Disclosure Standards irrespective\nof whether the entity’s related general purpose financial

In [112]:
IFRS_01_2ND[2]

Document(metadata={'chunk_id': 'IFRS_S1_Scope_01', 'source': 'IFRS_S1_Scope'}, page_content='Scope\nAn entity shall apply this Standard in preparing and reporting\nsustainability-related financial disclosures in accordance with IFRS\nSustainability Disclosure Standards.\nSustainability-related risks and opportunities that could not reasonably be\nexpected to affect an entity’s prospects are outside the scope of this Standard.\nOther IFRS Sustainability Disclosure Standards specify information an entity is\nrequired \nto \ndisclose \nabout \nspecific \nsustainability-related \nrisks \nand\nopportunities.\n1\n2\n3\n4\n5\n6\n7\n1\nThroughout this Standard, the terms ‘primary users’ and ‘users’ are used interchangeably, with\nthe same meaning.\nIFRS S1 GENERAL REQUIREMENTS FOR DISCLOSURE OF SUSTAINABILITY-\nRELATED FINANCIAL INFORMATION—JUNE 2023\n6\n© IFRS Foundation')

In [113]:
#S2 _ 1차, 2차 chunk
#S2 성공

import fitz  # PyMuPDF
import re
import json

# PDF 열기 및 목차 이후 본문 텍스트 추출 (5페이지부터)
pdf_path = r"D:\TECHLAB\2차프로젝트_Chatbot\data\raw\IFRS_S2.pdf"
doc = fitz.open(pdf_path)

start_page = 4  # 0-indexed → 5페이지부터
full_text = "\n".join(doc[i].get_text() for i in range(start_page, len(doc)))

# 섹션 정의 (start ~ end 기준, 정규표현식)
section_defs = [
    ("IFRS_S2_Objective", r"\nObjective\n", r"\nScope\n"),
    ("IFRS_S2_scope", r"\nScope\n", r"\nGovernance\n"),
    ("IFRS_S2_Governance", r"\nGovernance\n", r"\nStrategy\n"),
    ("IFRS_S2_Strategy", r"\nStrategy\n", r"\nRisk management\n"),
    ("IFRS_S2_Risk management", r"\nRisk management\n", r"\nMetrics and targets\n"),
    ("IFRS_S2_Metrics and targets", r"\nMetrics and targets\n", r"Appendix A\nDefined terms"),
    ("IFRS_S2_Appendix A : Defined terms", r"Appendix A\nDefined terms", r"Appendix B\nApplication guidance"),
    ("IFRS_S2_Appendix B : Application guidance", r"Appendix B\nApplication guidance", r"Appendix C\nEffective date and transition"),
    ("IFRS_S2_Appendix C : Effective date and transition", r"Appendix C\nEffective date and transition", None),
]

# 1차 청크 생성
section_chunks = []
for chunk_id, start_pattern, end_pattern in section_defs: 
    start_match = re.search(start_pattern, full_text, flags=re.IGNORECASE)
    if not start_match:
        continue
    start_pos = start_match.start()
    end_pos = len(full_text)
    if end_pattern:
        end_match = re.search(end_pattern, full_text[start_pos + 1:], flags=re.IGNORECASE)
        if end_match:
            end_pos = start_pos + 1 + end_match.start()
    section_text = full_text[start_pos:end_pos].strip()
    section_chunks.append({"id": chunk_id, "text": section_text})

# 확인
print(f"✅ 1차 섹션 수: {len(section_chunks)}")
print(section_chunks[0]["id"], ":", section_chunks[0]["text"][:200], "...")


# ✅ dict → Document 변환
documents = [
    Document(page_content=chunk['text'], metadata={"source": chunk['id'], "title": "IFRS_S2"})
    for chunk in section_chunks
]

print(documents[2])


# ✅ 2차 청크 분할 함수
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# ✅ dict → Document로 변환한 리스트
documents = [
    Document(page_content=chunk['text'], metadata={"chunk_id": chunk['id'], "title": "IFRS_S2"})
    for chunk in section_chunks
]

def split_into_chunks(documents, max_chars=1000):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chars,
        chunk_overlap=20,
        separators=["\n\n", "\n", ".", " ", ""]
    )

    new_docs = []
    for doc in documents:
        chunks = splitter.split_text(doc.page_content)
        for i, chunk in enumerate(chunks):
            new_docs.append(Document(
                page_content=chunk,
                metadata={
                    "chunk_id": f"{doc.metadata['chunk_id']}_{i+1:02}",
                    "source": doc.metadata["chunk_id"],
                }
            ))
    
    return new_docs


# ✅ 2차 청크 생성 및 저장
second_level_docs = split_into_chunks(documents)
print(f"✅ 2차 청크 수: {len(second_level_docs)}")

IFRS_02_1ST=documents
IFRS_02_2ND=second_level_docs

✅ 1차 섹션 수: 9
IFRS_S2_Objective : Objective
The objective of IFRS S2 Climate-related Disclosures is to require an entity to
disclose information about its climate-related risks and opportunities that is
useful to primary users of gene ...
page_content='Governance
The objective of climate-related financial disclosures on governance is to
enable users of general purpose financial reports to understand the
governance processes, controls and procedures an entity uses to monitor,
manage and oversee climate-related risks and opportunities.
To achieve this objective, an entity shall disclose information about:
(a)
the governance body(s) (which can include a board, committee or
equivalent body charged with governance) or individual(s) responsible
for oversight of climate-related risks and opportunities. Specifically,
the entity shall identify that body(s) or individual(s) and disclose
information about:
1
2
3
4
5
6
1
Throughout this Standard, the terms ‘primary users’ and ‘users’ are used inter

In [114]:
IFRS_02_1ST[1]

Document(metadata={'chunk_id': 'IFRS_S2_scope', 'title': 'IFRS_S2'}, page_content='Scope\nThis Standard applies to:\n(a)\nclimate-related risks to which the entity is exposed, which are:\n(i)\nclimate-related physical risks; and\n(ii)\nclimate-related transition risks; and\n(b)\nclimate-related opportunities available to the entity.\nClimate-related risks and opportunities that could not reasonably be\nexpected to affect an entity’s prospects are outside the scope of this\nStandard.\nCore content')

In [115]:
IFRS_02_2ND[7]

Document(metadata={'chunk_id': 'IFRS_S2_Strategy_02', 'source': 'IFRS_S2_Strategy'}, page_content='the reporting period, and their anticipated effects on the entity’s\nfinancial position, financial performance and cash flows over the\nshort, medium and long term, taking into consideration how those\nclimate-related risks and opportunities have been factored into the\nentity’s financial planning (see paragraphs 15–21); and\n(e)\nthe climate resilience of the entity’s strategy and its business model to\nclimate-related changes, developments and uncertainties, taking into\nconsideration the entity’s identified climate-related risks and\nopportunities (see paragraph 22).\nClimate-related risks and opportunities\nAn entity shall disclose information that enables users of general purpose\nfinancial reports to understand the climate-related risks and opportunities\nthat could reasonably be expected to affect the entity’s prospects. Specifically,\nthe entity shall:\n(a)\ndescribe climate-relat

In [116]:
# TCFD 각 1차, 통합 2차 chunk

# TCFD-Implementing_Guidance
import fitz
from langchain.schema import Document

# PDF 로드
pdf_path = r"D:\TECHLAB\2차프로젝트_Chatbot\data\raw\TCFD-Implementing_Guidance.pdf"
doc = fitz.open(pdf_path)

# 섹션 정의
section_defs = [
    ("TCFD_Implementing_Guidance_A. Introduction", 4, 13),
    ("TCFD_Implementing_Guidance_B. Recommendations", 13, 17),
    ("TCFD_Implementing_Guidance_C. Guidance for All Sectors", 17, 24),
    ("TCFD_Implementing_Guidance_D. financial", 24, 56),
    ("TCFD_Implementing_Guidance_E. Supplemental Guidance for the Financial Sector", 56, 70),
    ("TCFD_Implementing_Guidance_F. Fundamental Principles for Effective Disclosure", 70, 74),
    ("TCFD_Implementing_Guidance_Appendix 1 : Climate-Related Risks, Opportunities, and Financial Impacts", 74, 79),
    ("TCFD_Implementing_Guidance_Appendix 2 : Cross-Industry, Climate-Related Metric Categories", 79, 82),
    ("TCFD_Implementing_Guidance_Appendix 3 : Glossary and Abbreviations", 82, 85),
    ("TCFD_Implementing_Guidance_Appendix 4 : References", 85, 88),
]

# ✅ 1차 청크: 페이지 기준 분할
documents1 = []
for section_id, start_page, end_page in section_defs:
    text = "\n".join(doc[i].get_text() for i in range(start_page, end_page)).strip()
    documents1.append(
        Document(
            page_content=text,
            metadata={"source": section_id, "title": "TCFD_Implementing_Guidance"}
        )
    )

# ✅ 확인
print(f"✅ 섹션 수: {len(documents1)}")
print(documents1[0].metadata)
print(documents1[0].page_content[:300])




#TCFD_metrics_targets_guidance
import fitz  # PyMuPDF
from langchain_community.vectorstores import FAISS
from langchain.schema import Document

# 1. PDF 열기
pdf_path = r"D:\TECHLAB\2차프로젝트_Chatbot\data\raw\TCFD-Metrics_Targets_Guidance.pdf"
doc = fitz.open(pdf_path)

# 2. 섹션 정의 (0-indexed page 기준)
section_page_defs = [
    ("TCFD_Metrics_Targets_Guidance_A. Overview and Background", 1, 6),
    ("TCFD_Metrics_Targets_Guidance_B. Scope and Approach", 6, 10),
    ("TCFD_Metrics_Targets_Guidance_C. Climate-Related Metrics", 10, 29),
    ("TCFD_Metrics_Targets_Guidance_D. Climate-Related Targets", 29, 38),
    ("TCFD_Metrics_Targets_Guidance_E. Transition Plans", 38, 45),
    ("TCFD_Metrics_Targets_Guidance_F. Financial Impacts", 45, 54),
    ("TCFD_Metrics_Targets_Guidance_Appendix 1 : Further Information on Select Cross-Industry, Climate-Related Metric Categories", 54, 61),
    ("TCFD_Metrics_Targets_Guidance_Appendix 2 : Example Disclosures", 61, 65),
    ("TCFD_Metrics_Targets_Guidance_Appendix 3 : Glossary and Abbreviations", 65, 68),
    ("TCFD_Metrics_Targets_Guidance_Appendix 4 : References", 68, 79),
]

# ✅ 1차 청크: 페이지 기준 분할
documents2 = []
for section_id, start_page, end_page in section_page_defs:
    text = "\n".join(doc[i].get_text() for i in range(start_page, end_page)).strip()
    documents2.append(
        Document(
            page_content=text,
            metadata={"source": section_id, "title": "TCFD_Metrics_Targets_Guidance"}
        )
    )

# ✅ 확인
print(f"✅ 섹션 수: {len(documents2)}")
print(documents2[0].metadata)
documents2[2]




# TCFD_report
import fitz  # PyMuPDF
from langchain_community.vectorstores import FAISS
from langchain.schema import Document

# 1. PDF 열기
pdf_path = r"D:\TECHLAB\2차프로젝트_Chatbot\data\raw\TCFD-Report.pdf"
doc = fitz.open(pdf_path)

# 2. Section page range 정의 (0-indexed 기준, 실제 목차는 6페이지까지)
section_page_defs = [
    ("TCFD_Report_A. Introduction", 6, 7),
    ("TCFD_Report_B. Climate-Related Risks, Opportunities, and Financial Impacts", 7, 13),
    ("TCFD_Report_C. Recommendations and Guidance", 13, 25),
    ("TCFD_Report_D. Scenario Analysis and Climate-Related Issues", 25, 32),
    ("TCFD_Report_E. Key Issues Considered and Areas for Further Work", 32, 41),
    ("TCFD_Report_F. Conclusion", 41, 44),
    ("TCFD_Report_Appendix 1 : Task Force Members", 44, 46),
    ("TCFD_Report_Appendix 2 : Task Forc e Objectives and Approach", 46, 51),
    ("TCFD_Report_Appendix 3 : Fundamental Principles for Effective Disclosure", 51, 54),
    ("TCFD_Report_Appendix 4 : Select Disclosure Frameworks", 54, 62),
    ("TCFD_Report_Appendix 5 : Glossary and Abbreviations", 62, 65),
    ("TCFD_Report_Appendix 6 : References", 65, 70),  # 끝
]

# ✅ 1차 청크: 페이지 기준 분할
documents3 = []
for section_id, start_page, end_page in section_page_defs:
    text = "\n".join(doc[i].get_text() for i in range(start_page, end_page)).strip()
    documents3.append(
        Document(
            page_content=text,
            metadata={"source": section_id, "title": "TCFD_Report"}
        )
    )

# ✅ 확인
print(f"✅ 섹션 수: {len(documents3)}")
print(documents3[0].metadata)
documents3[2]



# ✅ 1차 전체 병합
all_documents = documents1 + documents2 + documents3

print(f"✅ 전체 1차 청크 수: {len(all_documents)}")


# ✅ 2차 청크로 나누기 (1000자 기준)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def split_into_chunks(all_documents, max_chars=1000, chunk_overlap=20):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chars,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""]
    )

    new_docs = []
    for doc in all_documents:
        chunks = splitter.split_text(doc.page_content)
        for i, chunk in enumerate(chunks):
            new_docs.append(Document(
                page_content=chunk,
                metadata={
                    "chunk_id": f"{doc.metadata['source']}_{i+1:02}",
                    "source": doc.metadata["source"]
                }
            ))
    return new_docs


# ✅ 2차 청크 생성
second_level_docs = split_into_chunks(all_documents)

print(f"✅ 2차 청크 수: {len(second_level_docs)}")
print(second_level_docs[0].metadata)
print(second_level_docs[0].page_content[:200])

TCFD_1ST=all_documents
TCFD_2ND=second_level_docs

✅ 섹션 수: 10
{'source': 'TCFD_Implementing_Guidance_A. Introduction', 'title': 'TCFD_Implementing_Guidance'}
Implementing the Recommendations of the Task Force on Climate-related Financial Disclosures 
 
4
A. 
Introduction 
 
B. 
Recommendations 
 
C. 
Guidance for All Sectors 
 
D. 
Supplemental Guidance 
for the Financial Sector 
 
E. 
Supplemental Guidance 
for Non-Financial Groups 
 
F. 
Fundamental Pr
✅ 섹션 수: 10
{'source': 'TCFD_Metrics_Targets_Guidance_A. Overview and Background', 'title': 'TCFD_Metrics_Targets_Guidance'}
✅ 섹션 수: 12
{'source': 'TCFD_Report_A. Introduction', 'title': 'TCFD_Report'}
✅ 전체 1차 청크 수: 32
✅ 2차 청크 수: 895
{'chunk_id': 'TCFD_Implementing_Guidance_A. Introduction_01', 'source': 'TCFD_Implementing_Guidance_A. Introduction'}
Implementing the Recommendations of the Task Force on Climate-related Financial Disclosures 
 
4
A. 
Introduction 
 
B. 
Recommendations 
 
C. 
Guidance for All Sectors 
 
D. 
Supplemental Guidance 
f


In [117]:
TCFD_1ST[2]

Document(metadata={'source': 'TCFD_Implementing_Guidance_C. Guidance for All Sectors', 'title': 'TCFD_Implementing_Guidance'}, page_content='Implementing the Recommendations of the Task Force on Climate-related Financial Disclosures \n17\nA. \nIntroduction \n \nB. \nRecommendations \n \nC. \nGuidance for All Sectors \n \nD. \nSupplemental Guidance \nfor the Financial Sector \n \nE. \nSupplemental Guidance \nfor Non-Financial Groups \n \nF. \nFundamental Principles  \nfor Effective Disclosure \n \nAppendices \n    C. Guidance for All Sectors \nThe Task Force developed guidance to support all organizations in developing climate-related financial \ndisclosures consistent with its recommendations and recommended disclosures. The guidance assists \npreparers by providing context and suggestions for implementing the recommended disclosures. \n1. Governance \nInvestors, lenders, insurance underwriters, and other users of climate-related financial disclosures \n(collectively referred to as “in

In [118]:
TCFD_2ND[600]

Document(metadata={'chunk_id': 'TCFD_Metrics_Targets_Guidance_Appendix 3 : Glossary and Abbreviations_04', 'source': 'TCFD_Metrics_Targets_Guidance_Appendix 3 : Glossary and Abbreviations'}, page_content='151 \x07BP, “Progressing strategy development, bp revises long-term price assumptions, reviews intangible assets, and, as a result, expects non-cash impairments and write-offs,” June 15, 2020.\n152 \x07Eni, Eni for 2020: Carbon Neutrality by 2050, May 12, 2021, p. 20.\n64')

In [119]:
# GRI 1차, 2차 chunk
# GRI 성공

import fitz
import re
from langchain.schema import Document

# 1. PDF 열기
pdf_path = r"D:\TECHLAB\2차프로젝트_Chatbot\data\raw\Consolidated Set of the GRI Standards.pdf"
doc = fitz.open(pdf_path)

# 2. GRI 전체 문서 페이지 정의 (0-index 기준)
section_page_defs = [
    ("GRI_1", 4, 40),
    ("GRI_2", 40, 94),
    ("GRI_3", 94, 121),
    ("GRI_11", 121, 292),
    ("GRI_13", 292, 383),
    ("GRI_14", 383, 478),
    ("GRI_101", 478, 488),
    ("GRI_102", 488, 519),
    ("GRI_103", 519, 540),
    ("GRI_201", 540, 547),
    ("GRI_202", 547, 550),
    ("GRI_203", 550, 558),
    ("GRI_204", 558, 566),
    ("GRI_205", 566, 569),
    ("GRI_206", 569, 574),
    ("GRI_207", 574, 588),
    ("GRI_301", 588, 663),
    ("GRI_302", 663, 669),
    ("GRI_303", 669, 691),
    ("GRI_305", 691, 703),
    ("GRI_306", 703, 741),
    ("GRI_308", 741, 754),
    ("GRI_401", 754, 764),
    ("GRI_402", 764, 777),
    ("GRI_403", 777, 794),
    ("GRI_404", 794, 800),
    ("GRI_405", 800, 807),
    ("GRI_406", 807, 827),
    ("GRI_407", 827, 837),
    ("GRI_408", 837, 848),
    ("GRI_409", 848, 858),
    ("GRI_410", 858, 866),
    ("GRI_411", 866, 876),
    ("GRI_412", 876, 888),
    ("GRI_414", 888, 895),
    ("GRI_415", 895, 898),
    ("GRI_416", 898, 903),
    ("GRI_417", 903, 926),
    ("GRI_418", 926, 936),
    ("GRI_Glossary", 936, len(doc)),
]

# 3. 헤더 패턴 정의 (Introduction, 1., 2., ..., Glossary, Bibliography, Appendix)
header_pattern = re.compile(
    r"(?P<header>Introduction|Glossary|Bibliography|Appendix[\s\S]*?|^\d+\..+)",
    flags=re.IGNORECASE | re.MULTILINE
)

# 4. 각 문서 내 세부 청크 분할
all_documents = []

for gri_id, start_page, end_page in section_page_defs:
    text = "\n".join(doc[i].get_text() for i in range(start_page, end_page))

    matches = list(header_pattern.finditer(text))
    if not matches:
        print(f"⚠️ 헤더 없음: {gri_id}")
        continue

    for i, match in enumerate(matches):
        start = match.start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        header = match.group("header").strip()
        chunk_text = text[start:end].strip()

        # ✅ 무조건 Document 생성
        doc_obj = Document(
            page_content=chunk_text,
            metadata={
                "source": gri_id+'_'+header,
                "title": gri_id
            }
        )
        all_documents.append(doc_obj)

print(f"\n✅ 총 청크 수: {len(all_documents)}")


# ✅ 2차 청크로 나누기 (1000자 기준)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def split_into_chunks(all_documents, max_chars=1000, chunk_overlap=20):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chars,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""]
    )

    new_docs = []
    for doc in all_documents:
        chunks = splitter.split_text(doc.page_content)
        for i, chunk in enumerate(chunks):
            new_docs.append(Document(
                page_content=chunk,
                metadata={
                    "chunk_id": f"{doc.metadata['source']}_{i+1:02}",
                    "source": doc.metadata["source"]
                }
            ))
    return new_docs

# ✅ 2차 청크 생성
second_level_docs = split_into_chunks(all_documents)

print(f"✅ 2차 청크 수: {len(second_level_docs)}")
print(second_level_docs[0].metadata)
print(second_level_docs[0].page_content[:200])

second_level_docs[2]

GRI_1ST=all_documents
GRI_2ND=second_level_docs


✅ 총 청크 수: 1801
✅ 2차 청크 수: 4160
{'chunk_id': 'GRI_1_Introduction_01', 'source': 'GRI_1_Introduction'}
Introduction
GRI 1: Foundation 2021 introduces the purpose and system of the GRI Sustainability Reporting Standards (GRI
Standards) and explains key concepts for sustainability reporting. It also spec


In [120]:
GRI_1ST[0]

Document(metadata={'source': 'GRI_1_Introduction', 'title': 'GRI_1'}, page_content='Introduction\nGRI 1: Foundation 2021 introduces the purpose and system of the GRI Sustainability Reporting Standards (GRI\nStandards) and explains key concepts for sustainability reporting. It also specifies the requirements and reporting\nprinciples that organizations must comply with to report in accordance with the GRI Standards. GRI 1 is the first\nStandard that organizations should consult to understand how to report using the GRI Standards.\nGRI 1 is structured as follows:')

In [121]:
GRI_2ND[0]

Document(metadata={'chunk_id': 'GRI_1_Introduction_01', 'source': 'GRI_1_Introduction'}, page_content='Introduction\nGRI 1: Foundation 2021 introduces the purpose and system of the GRI Sustainability Reporting Standards (GRI\nStandards) and explains key concepts for sustainability reporting. It also specifies the requirements and reporting\nprinciples that organizations must comply with to report in accordance with the GRI Standards. GRI 1 is the first\nStandard that organizations should consult to understand how to report using the GRI Standards.\nGRI 1 is structured as follows:')

In [122]:
# 전체 벡터 저장

from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
import key
import math

# ✅ 필요한 변수
openai_api_key = key.key["OPEN_API_KEY"]
embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)

def save_faiss_vectorstore(docs, embedding_model, save_path):
    batch_size = 100
    total = len(docs)
    num_batches = math.ceil(total / batch_size)

    vectorstore = None
    for i in range(num_batches):
        start = i * batch_size
        end = min(start + batch_size, total)
        batch = docs[start:end]

        if i == 0:
            vectorstore = FAISS.from_documents(batch, embedding_model)
        else:
            vectorstore.add_documents(batch)
        print(f"✅ 1차 배치 {i+1}/{num_batches} 저장 완료")

    vectorstore.save_local(save_path)
    print(f"✅ 저장 완료: {save_path}")


In [123]:
save_faiss_vectorstore(IFRS_01_1ST, embedding_model, "vectorstores/IFRS_01_1st")

✅ 1차 배치 1/1 저장 완료
✅ 저장 완료: vectorstores/IFRS_01_1st


In [131]:
save_faiss_vectorstore(IFRS_01_2ND, embedding_model, "vectorstores/IFRS_01_2nd")

✅ 1차 배치 1/2 저장 완료
✅ 1차 배치 2/2 저장 완료
✅ 저장 완료: vectorstores/IFRS_01_2nd


In [132]:
save_faiss_vectorstore(IFRS_02_1ST, embedding_model, "vectorstores/IFRS_02_1st")

✅ 1차 배치 1/1 저장 완료
✅ 저장 완료: vectorstores/IFRS_02_1st


In [133]:
save_faiss_vectorstore(IFRS_02_2ND, embedding_model, "vectorstores/IFRS_02_2nd")

✅ 1차 배치 1/2 저장 완료
✅ 1차 배치 2/2 저장 완료
✅ 저장 완료: vectorstores/IFRS_02_2nd


In [None]:
save_faiss_vectorstore(TCFD_1ST, embedding_model, "vectorstores/TFCD_01_1st")

✅ 1차 배치 1/1 저장 완료
✅ 저장 완료: vectorstores/TFCD_1st


In [135]:
save_faiss_vectorstore(TCFD_2ND, embedding_model, "vectorstores/TFCD_01_2nd")

✅ 1차 배치 1/9 저장 완료
✅ 1차 배치 2/9 저장 완료
✅ 1차 배치 3/9 저장 완료
✅ 1차 배치 4/9 저장 완료
✅ 1차 배치 5/9 저장 완료
✅ 1차 배치 6/9 저장 완료
✅ 1차 배치 7/9 저장 완료
✅ 1차 배치 8/9 저장 완료
✅ 1차 배치 9/9 저장 완료
✅ 저장 완료: vectorstores/TFCD_01_2nd


In [129]:
save_faiss_vectorstore(GRI_1ST, embedding_model, "vectorstores/GRI_01_1st")

✅ 1차 배치 1/19 저장 완료
✅ 1차 배치 2/19 저장 완료
✅ 1차 배치 3/19 저장 완료
✅ 1차 배치 4/19 저장 완료
✅ 1차 배치 5/19 저장 완료
✅ 1차 배치 6/19 저장 완료
✅ 1차 배치 7/19 저장 완료
✅ 1차 배치 8/19 저장 완료
✅ 1차 배치 9/19 저장 완료
✅ 1차 배치 10/19 저장 완료
✅ 1차 배치 11/19 저장 완료
✅ 1차 배치 12/19 저장 완료
✅ 1차 배치 13/19 저장 완료
✅ 1차 배치 14/19 저장 완료
✅ 1차 배치 15/19 저장 완료
✅ 1차 배치 16/19 저장 완료
✅ 1차 배치 17/19 저장 완료
✅ 1차 배치 18/19 저장 완료
✅ 1차 배치 19/19 저장 완료
✅ 저장 완료: vectorstores/GRI_01_1st


In [130]:
save_faiss_vectorstore(GRI_2ND, embedding_model, "vectorstores/GRI_01_2nd")

✅ 1차 배치 1/42 저장 완료
✅ 1차 배치 2/42 저장 완료
✅ 1차 배치 3/42 저장 완료
✅ 1차 배치 4/42 저장 완료
✅ 1차 배치 5/42 저장 완료
✅ 1차 배치 6/42 저장 완료
✅ 1차 배치 7/42 저장 완료
✅ 1차 배치 8/42 저장 완료
✅ 1차 배치 9/42 저장 완료
✅ 1차 배치 10/42 저장 완료
✅ 1차 배치 11/42 저장 완료
✅ 1차 배치 12/42 저장 완료
✅ 1차 배치 13/42 저장 완료
✅ 1차 배치 14/42 저장 완료
✅ 1차 배치 15/42 저장 완료
✅ 1차 배치 16/42 저장 완료
✅ 1차 배치 17/42 저장 완료
✅ 1차 배치 18/42 저장 완료
✅ 1차 배치 19/42 저장 완료
✅ 1차 배치 20/42 저장 완료
✅ 1차 배치 21/42 저장 완료
✅ 1차 배치 22/42 저장 완료
✅ 1차 배치 23/42 저장 완료
✅ 1차 배치 24/42 저장 완료
✅ 1차 배치 25/42 저장 완료
✅ 1차 배치 26/42 저장 완료
✅ 1차 배치 27/42 저장 완료
✅ 1차 배치 28/42 저장 완료
✅ 1차 배치 29/42 저장 완료
✅ 1차 배치 30/42 저장 완료
✅ 1차 배치 31/42 저장 완료
✅ 1차 배치 32/42 저장 완료
✅ 1차 배치 33/42 저장 완료
✅ 1차 배치 34/42 저장 완료
✅ 1차 배치 35/42 저장 완료
✅ 1차 배치 36/42 저장 완료
✅ 1차 배치 37/42 저장 완료
✅ 1차 배치 38/42 저장 완료
✅ 1차 배치 39/42 저장 완료
✅ 1차 배치 40/42 저장 완료
✅ 1차 배치 41/42 저장 완료
✅ 1차 배치 42/42 저장 완료
✅ 저장 완료: vectorstores/GRI_01_2nd
