In [1]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from pathlib import Path
from dotenv import load_dotenv
from langchain_upstage import UpstageEmbeddings
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
import time
import re
from langchain_core.documents import Document
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()
embedding = UpstageEmbeddings(model="solar-embedding-1-large")

In [3]:
pc = Pinecone()
index_name = 'codedoc'

In [8]:
all_documents = []

law_folders = [folder for folder in Path("../../내파일/").iterdir() 
                   if folder.is_dir() and folder.name.endswith('법령')]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)                

for folder in law_folders:
    docx_files = list(folder.glob('*.docx'))
    for docx_path in docx_files:
        loader = Docx2txtLoader(str(docx_path))
        full_text = loader.load()[0].page_content

        chunks = re.split(r'(?m)(?=^\s*제\d+조)', full_text)

        for chunk in chunks:
            if not chunk.strip():
                continue

            if len(chunk) > 1500:
                sub_chunks = text_splitter.create_documents(
                    [chunk], 
                    metadatas=[{"source": docx_path.name}]
                )
                all_documents.extend(sub_chunks)
            else: 
                doc = Document(
                    page_content=chunk.strip(),
                    metadata={"source": docx_path.name}
                )
                all_documents.append(doc)

In [7]:
if all_documents:
    batch_size = 30
    print(f"\n총 {len(all_documents)}개의 텍스트 조각을 업로드 시작")

    for i in tqdm(range(0, len(all_documents), batch_size), desc="배치 업로드 중"):
        batch = all_documents[i:i + batch_size]
        try:
            PineconeVectorStore.from_documents(
                documents=batch,
                embedding=embedding,
                index_name=index_name
            )
        except Exception as e:
            print(f"배치 업로드 중 오류 발생: {e}")
        time.sleep(1) # API 제한을 피하기 위한 지연

    print("업로드 완료")
else:
    print("처리할 문서가 없음")



총 23399개의 텍스트 조각을 업로드 시작


배치 업로드 중:   7%|▋         | 58/780 [10:53<2:09:40, 10.78s/it]

배치 업로드 중 오류 발생: Error code: 400 - {'error': {'message': "This model's maximum context length is 4000 tokens, but your request contains 6784 tokens. Please reduce the length of your input text or select only the most relevant portions to include in your request. For information on token counting methods and model-specific limits, please refer to our API reference documentation (https://console.upstage.ai/api/embeddings)", 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_body'}}


배치 업로드 중:   9%|▉         | 72/780 [13:37<2:36:40, 13.28s/it]

배치 업로드 중 오류 발생: Error code: 400 - {'error': {'message': "This model's maximum context length is 4000 tokens, but your request contains 5451 tokens. Please reduce the length of your input text or select only the most relevant portions to include in your request. For information on token counting methods and model-specific limits, please refer to our API reference documentation (https://console.upstage.ai/api/embeddings)", 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_body'}}


배치 업로드 중:  10%|█         | 78/780 [14:43<2:12:33, 11.33s/it]


KeyboardInterrupt: 