In [None]:
pip install langchain

In [2]:
pip install -qU langchain-text-splitter

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement langchain-text-splitter (from versions: none)
ERROR: No matching distribution found for langchain-text-splitter


# Markdown Header Splitter


In [3]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

In [9]:
# Header markers to split on, paired with the metadata key each level is stored under.
headers_to_split_on = [("#" * level, f"Header {level}") for level in (1, 2, 3)]

# Small sample document with nested headers, written one markdown block per line.
markdown_document = (
    "# Foo\n\n"
    "    ## Bar\n\n"
    "Hi this is Jim\n\n"
    "Hi this is Joe\n\n"
    " ### Boo \n\n"
    " Hi this is Lance \n\n"
    " ## Baz\n\n"
    " Hi this is Molly"
)

# Split the sample on the configured headers and show the resulting documents.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_document)
md_header_splits

[Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}, page_content='Hi this is Jim  \nHi this is Joe'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}, page_content='Hi this is Lance'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Baz'}, page_content='Hi this is Molly')]

In [None]:
# Imports first so the cell's dependencies are visible at a glance.
from langchain_text_splitters import RecursiveCharacterTextSplitter

markdown_document = "# Intro \n\n    ## History \n\n Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \n\n Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. \n\n ## Rise and divergence \n\n As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \n\n additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \n\n #### Standardization \n\n From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort. \n\n ## Implementations \n\n Implementations of Markdown are available for over a dozen programming languages."

# Only the first two header levels are configured as split points.
headers_to_split_on = [("#" * level, f"Header {level}") for level in (1, 2)]

# Pass 1: split on markdown headers, keeping the header lines (strip_headers=False).
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,
)
md_header_splits = markdown_splitter.split_text(markdown_document)

# Pass 2: character-level splitting of each header section.
chunk_size = 250
chunk_overlap = 30
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# Final chunks, displayed as the cell output.
splits = text_splitter.split_documents(md_header_splits)
splits

[Document(metadata={}, page_content='document_result.md')]

## Dividing into chunks


In [None]:
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
import os

def split_markdown_document(input_file_path, chunk_size=500, chunk_overlap=30):
    """
    Chunk a markdown file in two passes: by header, then by character count.

    The file is read as UTF-8, split on level 1-4 markdown headers
    (with ``strip_headers=False``), and each resulting section is passed
    through a recursive character splitter.

    Args:
        input_file_path (str): Path to the input markdown file
        chunk_size (int): Target size of each chunk in characters
        chunk_overlap (int): Number of characters to overlap between chunks

    Returns:
        tuple: ``(splits, markdown_document)`` - the chunks produced by the
        character splitter and the raw text read from the file

    Raises:
        FileNotFoundError: If ``input_file_path`` does not exist
    """
    # Guard clause: fail early with a clear message when the path is missing.
    if not os.path.exists(input_file_path):
        raise FileNotFoundError(f"Input file not found: {input_file_path}")

    with open(input_file_path, 'r', encoding='utf-8') as source:
        raw_markdown = source.read()

    # ("#", "Header 1") ... ("####", "Header 4")
    header_levels = [("#" * depth, f"Header {depth}") for depth in range(1, 5)]

    # Pass 1: one section per header.
    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=header_levels,
        strip_headers=False,
    )
    sections = header_splitter.split_text(raw_markdown)

    # Pass 2: cut each section down by character count.
    size_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        keep_separator=True,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    pieces = size_splitter.split_documents(sections)

    return pieces, raw_markdown

def display_chunks(splits, original_document):
    """
    Print a report of every chunk: its length, metadata and content.

    Args:
        splits (list): Chunks exposing ``page_content`` and ``metadata``
        original_document (str): Original markdown document; accepted by the
            signature but not printed by this function
    """
    banner = "=" * 80
    rule = "-" * 50

    # Report header with the total chunk count.
    print("\n" + banner)
    print(f"DOCUMENT SPLIT INTO {len(splits)} CHUNKS:")
    print(banner)

    # Chunks are numbered from 1 in the report.
    for position, piece in enumerate(splits, start=1):
        print(f"\nCHUNK {position} (Length: {len(piece.page_content)} characters):")
        print(rule)

        print("Metadata:")
        for field, field_value in piece.metadata.items():
            print(f"  {field}: {field_value}")

        print("\nContent:")
        print(piece.page_content)
        print(rule)

def main():
    """
    Split the hard-coded input file and print the resulting chunks.

    Any exception raised while splitting or displaying is caught and
    reported on stdout as ``Error: ...`` instead of propagating.
    """
    # Input file and chunking parameters for this run.
    source_path = "합쳐진_용어집.md"
    target_size = 1000
    overlap = 40

    try:
        chunks, raw_text = split_markdown_document(source_path, target_size, overlap)
        display_chunks(chunks, raw_text)
        print(f"\nTotal chunks created: {len(chunks)}")
    except Exception as err:
        print(f"Error: {err}")

# Run the demo when this code is executed directly (script or notebook cell),
# but not when it is imported as a module.
if __name__ == "__main__":
    main()

# Header Splitter + LLM 

In [None]:
import os
from openai import AzureOpenAI
from dotenv import load_dotenv
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
import json

def split_markdown_document(input_file_path, chunk_size=500, chunk_overlap=30):
    """
    Chunk a markdown file in two passes: by header, then by character count.

    The file is read as UTF-8, split on level 1-4 markdown headers
    (with ``strip_headers=False``), and each resulting section is passed
    through a recursive character splitter.

    Args:
        input_file_path (str): Path to the input markdown file
        chunk_size (int): Target size of each chunk in characters
        chunk_overlap (int): Number of characters to overlap between chunks

    Returns:
        tuple: ``(splits, markdown_document)`` - the chunks produced by the
        character splitter and the raw text read from the file

    Raises:
        FileNotFoundError: If ``input_file_path`` does not exist
    """
    # Guard clause: fail early with a clear message when the path is missing.
    if not os.path.exists(input_file_path):
        raise FileNotFoundError(f"Input file not found: {input_file_path}")

    with open(input_file_path, 'r', encoding='utf-8') as source:
        raw_markdown = source.read()

    # ("#", "Header 1") ... ("####", "Header 4")
    header_levels = [("#" * depth, f"Header {depth}") for depth in range(1, 5)]

    # Pass 1: one section per header.
    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=header_levels,
        strip_headers=False,
    )
    sections = header_splitter.split_text(raw_markdown)

    # Pass 2: cut each section down by character count.
    size_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        keep_separator=True,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    pieces = size_splitter.split_documents(sections)

    return pieces, raw_markdown

def process_file(chunk_content, chunk_number):
    """
    Process a chunk of content through Azure AI to convert HTML tables to text.

    Connection settings are read from the environment after ``load_dotenv()``
    is called: ``ENDPOINT_URL``, ``DEPLOYMENT_NAME`` and ``AZURE_OPENAI_KEY``.
    The chunk is sent as the user message together with a fixed Korean system
    prompt that tells the model to rewrite only ``<table>`` markup as prose
    and to leave all other text unchanged.

    Args:
        chunk_content (str): Content of the chunk to process
        chunk_number (int): Number of the chunk; used to identify the chunk
            in the warning and error messages emitted here

    Returns:
        str: Processed text content

    Raises:
        ValueError: If any of the required environment variables is missing
        RuntimeError: If the response carries no text content (for example a
            filtered reply), so callers never receive ``None``
    """
    load_dotenv()
    endpoint = os.getenv("ENDPOINT_URL")
    deployment = os.getenv("DEPLOYMENT_NAME")
    subscription_key = os.getenv("AZURE_OPENAI_KEY")
    
    if not all([endpoint, deployment, subscription_key]):
        raise ValueError("환경변수 누락: ENDPOINT_URL, DEPLOYMENT_NAME, AZURE_OPENAI_KEY를 확인하세요.")
    
    client = AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=subscription_key,
        api_version="2024-05-01-preview",
    )
    
    # System message: conversion rules; user message: the raw chunk.
    chat_prompt = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "너는 HTML 테이블을 읽고 자연스러운 서술형 텍스트로 변환하는 텍스트 변환 엔진이다.\n\n입력 데이터는 일반 텍스트와 HTML 코드가 혼합된 문서이며, 이 중 HTML 테이블(`<table>`) 형식으로 작성된 표만을 감지하여 사람이 읽기 쉬운 **자연스러운 텍스트**로 변환하라. 표 외의 일반 텍스트는 **절대로 변경하지 않는다**. \n\n출력된 텍스트는 아래의 기준을 모두 따라야 한다:\n\n1. 표의 계층 구조, 제목, 셀의 관계를 모두 파악하여 자연어로 기술한다.\n2. 셀이 병합된 경우 (`rowspan`, `colspan`)에는 의미적으로 내용을 통합하여 풀어서 설명한다.\n3. 표 안에 또 다른 표가 중첩되어 있는 경우에도 각 표를 계층적으로 처리하고, 문맥상 자연스럽게 연결되도록 한다.\n4. 빈 칸이 있는 경우, 내용을 유추하지 않고 \"(빈칸)\" 또는 \"해당 없음\" 등으로 명확하게 표기한다.\n5. 항목 간 구분은 \"■\", \"1.\", \"-\" 등을 사용하여 명확히 구분하고, 계층적으로 정리한다.\n6. 결과 텍스트는 반드시 문맥상 자연스럽고 일관되게 연결되어야 하며, 원래 문서의 흐름과 연결되도록 이어져야 한다.\n7. HTML 태그가 아닌 일반 텍스트 영역은 절대로 수정하거나 재구성하지 않는다.\n8. 결과는 마크다운 문서로 사용 가능한 수준의 가독성을 갖춰야 하며, 표를 설명하는 문장은 공식 문서나 계약서 스타일처럼 명료하고 단정하게 작성한다.\n\n예외나 애매한 구조가 있어도 최대한 의미를 보존하여 사람이 이해할 수 있도록 직관적으로 설명하라.\n\n입력 형식 예시:\n(본문 텍스트)\n<table>...</table>\n(본문 텍스트 계속)\n\n출력 형식 예시:\n(본문 텍스트)\n■ 항목명  \n- 내용1  \n- 내용2  \n이제 아래에 입력된 문서 내 HTML 테이블을 위 기준에 따라 서술형 텍스트로 변환하라."
                }
            ]
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": chunk_content
                }
            ]
        }
    ]
    
    completion = client.chat.completions.create(
        model=deployment,
        messages=chat_prompt,
        max_tokens=1500,
        temperature=0.7,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stream=False
    )
    
    choice = completion.choices[0]
    processed_text = choice.message.content
    
    # `message.content` is optional in the API response; fail here with the
    # chunk number instead of handing None to callers that write it to disk.
    if processed_text is None:
        raise RuntimeError(
            f"Chunk {chunk_number}: model returned no text content "
            f"(finish_reason={choice.finish_reason!r})."
        )
    
    # finish_reason "length" means generation stopped at max_tokens, so the
    # converted text is incomplete; surface that rather than hide it.
    if choice.finish_reason == "length":
        print(f"⚠️ Chunk {chunk_number}: response hit the max_tokens limit; converted text may be truncated.")
    
    return processed_text

def prepare_for_vector_db(processed_text, chunk_metadata):
    """
    Bundle a processed chunk and its metadata into one record.

    Args:
        processed_text (str): The processed text
        chunk_metadata (dict): The metadata for this chunk; stored by
            reference, not copied

    Returns:
        dict: ``{"text": processed_text, "metadata": chunk_metadata}``
    """
    return dict(text=processed_text, metadata=chunk_metadata)

def process_document(input_file_path, output_dir="processed_chunks"):
    """
    Run the full pipeline for one markdown file.

    Steps:
    1. Split the document into chunks (default chunk size and overlap)
    2. Process each chunk with Azure OpenAI
    3. Wrap each result as a vector DB entry
    4. Save, per chunk, the original text, the processed text and the entry
       as JSON, then save all entries to ``all_vector_db_entries.json``

    Args:
        input_file_path (str): Path to the input markdown file
        output_dir (str): Directory to save processed chunks; created if it
            does not exist

    Returns:
        list: One vector DB entry (dict) per chunk, in chunk order
    """
    def _write_text(filename, text):
        # Write ``text`` as UTF-8 to ``filename`` inside the output directory.
        with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as handle:
            handle.write(text)

    def _write_json(path, payload):
        # Dump ``payload`` as indented JSON, keeping non-ASCII characters readable.
        with open(path, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Step 1: header + character splitting; the raw document text is not needed here.
    print(f"🔄 Splitting document '{input_file_path}' into chunks...")
    chunks, _raw_document = split_markdown_document(input_file_path)
    total = len(chunks)
    print(f"✅ Document split into {total} chunks")

    entries = []

    # Steps 2-4, one chunk at a time; chunk numbers start at 1.
    for chunk_number, chunk in enumerate(chunks, start=1):
        print(f"\n🔄 Processing chunk {chunk_number} of {total}...")

        # Keep the untouched chunk next to its processed version for comparison.
        _write_text(f"chunk_{chunk_number}_original.txt", chunk.page_content)

        converted = process_file(chunk.page_content, chunk_number)

        entry = prepare_for_vector_db(converted, chunk.metadata)
        entries.append(entry)

        _write_text(f"chunk_{chunk_number}_processed.txt", converted)
        _write_json(
            os.path.join(output_dir, f"chunk_{chunk_number}_vector_db.json"),
            entry,
        )

        print(f"✅ Chunk {chunk_number} processed and saved")

    # Aggregate file containing every entry.
    all_entries_path = os.path.join(output_dir, "all_vector_db_entries.json")
    _write_json(all_entries_path, entries)

    print(f"\n✅ All processing complete! Results saved in '{output_dir}'")
    print(f"✅ Vector DB entries saved to '{all_entries_path}'")

    return entries

# Entry point: runs the whole pipeline when this code is executed directly
# (script or notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    # Set your input file path here
    input_file_path = "document_result.md"  # Change this to your input file path
    
    # Process the document; results are written to the default output
    # directory and the entries are also kept in `vector_db_entries`.
    vector_db_entries = process_document(input_file_path)