In [1]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from pathlib import Path
from dotenv import load_dotenv
from langchain_upstage import UpstageEmbeddings
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
import time
import re
from langchain_core.documents import Document
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()
embedding = UpstageEmbeddings(model="solar-embedding-1-large")

In [3]:
pc = Pinecone()
index_name = 'codedoc'

In [4]:
all_documents = []

law_folders = [folder for folder in Path("../../내파일/").iterdir() 
                   if folder.is_dir() and folder.name.endswith('법령')]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
                
for folder in law_folders:
    docx_files = list(folder.glob('*.docx'))
    for docx_path in docx_files:
        loader = Docx2txtLoader(str(docx_path))
        document_list = loader.load_and_split(text_splitter=text_splitter)
        full_text = loader.load()[0].page_content

        chunks = re.split(r'(?=제\d+조)', full_text)

        if chunks[0].strip():
                doc = Document(
                    page_content=chunks[0].strip(),
                    metadata={"source": str(docx_path), "chunk_type": "title"}
                )
                all_documents.append(doc)

        for chunk in chunks[1:]:
                if chunk.strip():
                    doc = Document(
                        page_content=chunk.strip(),
                        metadata={"source": str(docx_path), "chunk_type": "article"}
                    )
                    all_documents.append(doc)


In [6]:
if all_documents:
    batch_size = 30
    print(f"\n총 {len(all_documents)}개의 구조적 텍스트 조각을 업로드합니다...")

    for i in tqdm(range(0, len(all_documents), batch_size), desc="배치 업로드 중"):
        batch = all_documents[i:i + batch_size]
        try:
            PineconeVectorStore.from_documents(
                documents=batch,
                embedding=embedding,
                index_name=index_name
            )
        except Exception as e:
            print(f"배치 업로드 중 오류 발생: {e}")
        time.sleep(1) # API 제한을 피하기 위한 지연

    print("구조 기반 청킹 및 업로드가 완료되었습니다!")
else:
    print("처리할 문서가 없습니다.")



총 86103개의 구조적 텍스트 조각을 업로드합니다...


배치 업로드 중: 100%|██████████| 2871/2871 [8:01:11<00:00, 10.06s/it]   

구조 기반 청킹 및 업로드가 완료되었습니다!



