In [4]:
from langchain_community.document_loaders import GitLoader


def file_filter(file_path: str) -> bool:
    return file_path.endswith(".mdx")


loader = GitLoader(
    clone_url="https://github.com/langchain-ai/docs",
    repo_path="./langchain",
    branch="main",
    file_filter=file_filter,
)

documents = loader.load()
print(len(documents))

1975


In [5]:
from langchain_text_splitters import CharacterTextSplitter

# 문서를 청크로 분할
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

docs = text_splitter.split_documents(documents)
print(len(docs))

Created a chunk of size 1814, which is longer than the specified 1000
Created a chunk of size 1339, which is longer than the specified 1000
Created a chunk of size 1781, which is longer than the specified 1000
Created a chunk of size 1423, which is longer than the specified 1000
Created a chunk of size 1206, which is longer than the specified 1000
Created a chunk of size 1772, which is longer than the specified 1000
Created a chunk of size 3159, which is longer than the specified 1000
Created a chunk of size 5522, which is longer than the specified 1000
Created a chunk of size 1114, which is longer than the specified 1000
Created a chunk of size 17024, which is longer than the specified 1000
Created a chunk of size 2370, which is longer than the specified 1000
Created a chunk of size 2103, which is longer than the specified 1000
Created a chunk of size 1589, which is longer than the specified 1000
Created a chunk of size 1217, which is longer than the specified 1000
Created a chunk of 

13657


In [6]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# 필터링된 문서로 처리
import os
os.environ["ANONYMIZED_TELEMETRY"] = "False"

from tqdm import tqdm
import time

# chroma_db 폴더가 이미 존재하면 기존 DB 로드, 없으면 새로 생성
db_path = "./chroma_db/langchain-ai-docs"

if os.path.exists(db_path):
    print(f"✓ 기존 DB를 로드합니다: {db_path}")
    db = Chroma(persist_directory=db_path, embedding_function=embeddings)
    print(f"총 {db._collection.count()} 개의 문서가 저장되어 있습니다.")
else:
    print("✗ DB가 없습니다. 새로 생성합니다...")

    # 먼저 문서 길이 확인
    doc_lengths = [len(doc.page_content) for doc in docs]
    print(f"평균 문서 길이: {sum(doc_lengths)/len(doc_lengths):.0f} 문자")
    print(f"최대 문서 길이: {max(doc_lengths)} 문자")
    print(f"총 문서 수: {len(docs)}")

    # 너무 긴 문서 제외 (예: 50,000자 이상)
    docs_filtered = [doc for doc in docs if len(doc.page_content) < 50000]
    print(f"필터링 후: {len(docs_filtered)} 문서")

    batch_size = 20
    db = None

    for i in tqdm(range(0, len(docs_filtered), batch_size)):
        batch = docs_filtered[i:i + batch_size]
        try:
            if db is None:
                db = Chroma.from_documents(batch, embeddings, persist_directory=db_path)
            else:
                db.add_documents(batch)
            time.sleep(0.5)
        except Exception as e:
            print(f"배치 {i} 실패: {e}")

    print(f"✓ DB 생성 완료: {db._collection.count()} 개의 문서 저장됨")

✗ DB가 없습니다. 새로 생성합니다...
평균 문서 길이: 2458 문자
최대 문서 길이: 3429301 문자
총 문서 수: 13657
필터링 후: 13632 문서


  0%|          | 0/682 [00:00<?, ?it/s]Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
100%|██████████| 682/682 [13:24<00:00,  1.18s/it]

✓ DB 생성 완료: 13632 개의 문서 저장됨



