In [None]:
import os
import tiktoken
import requests
import json

from docling.document_converter import DocumentConverter

test_folder = "/content/drive/MyDrive/htx-sample/pdf"

# All PDFs in the input folder; the extension is matched case-insensitively.
# os.listdir returns entries in arbitrary order, so sort to make runs reproducible.
pdf_files = sorted(f for f in os.listdir(test_folder) if f.lower().endswith('.pdf'))

output_path_dir = "/content/drive/MyDrive/htx-sample/md"
# open(..., 'w') does not create missing parent folders, so create the target up front.
os.makedirs(output_path_dir, exist_ok=True)

converter = DocumentConverter()

# Convert every PDF to markdown and write it next to its siblings as <name>.md.
for pdf_file in pdf_files:
    pdf_path = os.path.join(test_folder, pdf_file)

    try:
        print(f"Processing: {pdf_file}")
        result = converter.convert(pdf_path)

        markdown_content = result.document.export_to_markdown()

        # splitext swaps only the trailing extension whatever its case
        # ("Report.PDF" -> "Report.md"); str.replace('.pdf', '.md') missed upper-case
        # extensions and also rewrote ".pdf" occurring in the middle of a name.
        output_filename = os.path.splitext(pdf_file)[0] + '.md'
        output_path = os.path.join(output_path_dir, output_filename)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)

    except Exception as e:
        # Best-effort batch: report the failure and carry on with the remaining files.
        print(f"Error converting {pdf_file}: {e}")

In [None]:
# Folder that holds the markdown files listed below.
md_folder = "/content/drive/MyDrive/htx-sample/md"

# JSONL destinations, one per chunking strategy.
output_path_headers = "/content/drive/MyDrive/htx-sample/chunks/chunks_header_based.jsonl"
output_path_bert = "/content/drive/MyDrive/htx-sample/chunks/chunks_bert_based.jsonl"

# Keep only directory entries whose name ends in ".md", ignoring case.
md_files = list(filter(lambda name: name.lower().endswith('.md'), os.listdir(md_folder)))

In [None]:
## Header Chunking 

import os
import json


def _is_markdown_heading(line):
    """Return True when `line` is an ATX heading: 1-6 '#' followed by whitespace or end of line."""
    stripped = line.strip()
    hashes = len(stripped) - len(stripped.lstrip('#'))
    if not 1 <= hashes <= 6:
        return False
    # "#tag" or "#!shebang" are ordinary text; a heading needs a separator after the hashes.
    return len(stripped) == hashes or stripped[hashes].isspace()


def split_markdown_by_headers(text, source_file):
    """Split markdown `text` into one chunk per heading-delimited section.

    Args:
        text: Full markdown content of one file.
        source_file: Name stored in every chunk's "source_file" field.

    Returns:
        A list of dicts with keys "source_file", "chunk_id" (1-based, per file),
        "chunk" (section text, stripped) and "chunk_size" (character count of "chunk").
        Sections that are empty after stripping are skipped.
    """
    chunks = []
    buffer = []
    in_fence = False

    def flush():
        body = '\n'.join(buffer).strip()
        if body:
            chunks.append({
                "source_file": source_file,
                "chunk_id": len(chunks) + 1,
                "chunk": body,
                "chunk_size": len(body)
            })
        buffer.clear()

    for line in text.split('\n'):
        marker = line.strip()
        if marker.startswith('```') or marker.startswith('~~~'):
            # Fence delimiter: lines inside a fenced code block are never headings,
            # so "# comment" lines in code samples no longer split a section.
            # NOTE: this is a simple toggle; an unclosed fence disables splitting
            # for the remainder of the file.
            in_fence = not in_fence
        elif not in_fence and _is_markdown_heading(line):
            # A new heading closes the section collected so far.
            flush()
        buffer.append(line)

    flush()
    return chunks


# Sorted so chunk order (and therefore downstream grouping) is the same on every run;
# os.listdir makes no ordering guarantee.
md_files = sorted(f for f in os.listdir(md_folder) if f.lower().endswith('.md'))
all_chunks = []

for md_file in md_files:
    with open(os.path.join(md_folder, md_file), 'r', encoding='utf-8') as f:
        all_chunks.extend(split_markdown_by_headers(f.read(), md_file))

# open(..., 'w') does not create missing parent folders.
chunks_dir = os.path.dirname(output_path_headers)
if chunks_dir:
    os.makedirs(chunks_dir, exist_ok=True)

# One JSON object per line (JSONL).
with open(output_path_headers, 'w', encoding='utf-8') as f:
    for chunk in all_chunks:
        json.dump(chunk, f, ensure_ascii=False)
        f.write('\n')

print(f"total chunks: {len(all_chunks)}")

In [None]:
"""
Chunk Grouping 

Adding contextual information
"""

def count_tokens(text, model="gpt-4"):
    """Return how many tokens `text` encodes to with the tiktoken encoding for `model`.

    Args:
        text: String to tokenize.
        model: Model name used to select the tiktoken encoding. Names tiktoken has
            no mapping for fall back to the "cl100k_base" encoding.

    Returns:
        The number of tokens as an int (0 for an empty string).
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it cannot map to an encoding.
        encoding = tiktoken.get_encoding("cl100k_base")
    # By default encode() raises ValueError when the text contains a special-token
    # literal such as "<|endoftext|>"; disallowed_special=() counts it as plain text.
    return len(encoding.encode(text, disallowed_special=()))

def group_chunks_by_tokens(chunks, min_tokens=512):
    """Merge consecutive chunks into groups of at least `min_tokens` tokens.

    Chunks are accumulated in order and a group is closed as soon as its token
    total reaches `min_tokens`. Previously a group was closed *before* the chunk
    that crossed the threshold was added, so groups ended up below the minimum.
    A group is also closed whenever the chunk's "source_file" differs from the
    previous chunk's, so text from different files is never merged. The last
    group of a file may therefore be smaller than `min_tokens`.

    Args:
        chunks: Iterable of dicts with a "chunk" text field and, optionally, a
            "source_file" field.
        min_tokens: Token total at which a group is closed.

    Returns:
        A list of dicts with keys "chunks" (the member chunk dicts),
        "total_tokens" (sum of their token counts) and "combined_text"
        (member texts joined by a blank line).
    """
    grouped_chunks = []
    current_group = []
    current_tokens = 0

    def flush():
        nonlocal current_group, current_tokens
        if current_group:
            grouped_chunks.append({
                'chunks': current_group,
                'total_tokens': current_tokens,
                'combined_text': '\n\n'.join(c['chunk'] for c in current_group)
            })
        current_group = []
        current_tokens = 0

    for chunk in chunks:
        # .get keeps chunks without a "source_file" key working: they all compare equal.
        if current_group and chunk.get('source_file') != current_group[-1].get('source_file'):
            flush()

        current_group.append(chunk)
        current_tokens += count_tokens(chunk['chunk'])

        if current_tokens >= min_tokens:
            flush()

    # Whatever is left over forms the final (possibly undersized) group.
    flush()
    return grouped_chunks

_OPENROUTER_CHAT_URL = "https://openrouter.ai/api/v1/chat/completions"


def _build_context_prompt(combined_text):
    """Return the user prompt asking the model to contextualize `combined_text`."""
    # The template's leading whitespace is part of the text sent to the model and is kept as-is.
    return f"""<document_context>
            You are analyzing a document chunk and need to add helpful contextual information that would make this chunk more useful for retrieval and understanding when used in isolation.

            Add contextual information such as:
            - What this section is about (brief summary)
            - Key concepts or entities mentioned
            - How this relates to the broader document topic
            - Any important background needed to understand this content

            Keep the contextual information concise but informative.
            </document_context>

            Original chunk:
            {combined_text}

            Please provide:
            1. A brief contextual summary (2-3 sentences)
            2. Key concepts/entities mentioned
            3. The enhanced chunk with contextual information added

            Format your response as:
            CONTEXTUAL_SUMMARY: [your summary]
            KEY_CONCEPTS: [comma-separated list]
            ENHANCED_CHUNK: [original content with contextual information seamlessly integrated]
      """


def _parse_enhanced_response(content, fallback_chunk):
    """Split a model reply into (contextual_summary, key_concepts, enhanced_chunk).

    Text following a CONTEXTUAL_SUMMARY: or KEY_CONCEPTS: label, up to the next
    label, belongs to that field (so wrapped summaries are not truncated).
    Everything after the first line-leading ENHANCED_CHUNK: label is the enhanced
    chunk; labels are no longer looked for inside it. `fallback_chunk` is used
    when the reply has no (or an empty) ENHANCED_CHUNK section.
    """
    sections = {'CONTEXTUAL_SUMMARY:': [], 'KEY_CONCEPTS:': []}
    enhanced_chunk = ""
    active_label = None

    lines = content.split('\n')
    for index, raw_line in enumerate(lines):
        stripped = raw_line.lstrip()
        if stripped.startswith('ENHANCED_CHUNK:'):
            remainder = [stripped[len('ENHANCED_CHUNK:'):]] + lines[index + 1:]
            enhanced_chunk = '\n'.join(remainder).strip()
            break

        label = next((name for name in sections if stripped.startswith(name)), None)
        if label is not None:
            active_label = label
            sections[label] = [stripped[len(label):]]
        elif active_label is not None:
            sections[active_label].append(raw_line)

    def joined(parts):
        return ' '.join(part.strip() for part in parts if part.strip())

    return (
        joined(sections['CONTEXTUAL_SUMMARY:']),
        joined(sections['KEY_CONCEPTS:']),
        enhanced_chunk or fallback_chunk,
    )


def add_contextual_information(grouped_chunks, api_key=None, model="qwen/qwen3-235b-a22b-2507", timeout=120):
    """Ask an LLM (via OpenRouter) to add contextual information to every grouped chunk.

    Chunking strategy this step belongs to:

    1. Raw chunks separated by headers are combined into groups sized by a token
       threshold (done by the caller before this function).
    2. Following Anthropic's "Introducing Contextual Retrieval", contextual
       information is attached to each group by prompting a language model.
       NOTE: only the group's own text is included in the prompt; the full
       source document is not sent.
    3. The goal is to ground each chunk with context so it is more useful for
       retrieval and downstream generation when read in isolation.

    Args:
        grouped_chunks: List of dicts with keys "chunks", "total_tokens" and
            "combined_text".
        api_key: OpenRouter API key. Defaults to the OPENROUTER_API_KEY
            environment variable.
        model: Model identifier sent to the API.
        timeout: Seconds passed to requests as the request timeout.

    Returns:
        A list with one dict per successfully processed group, with keys
        "group_id" (1-based position in `grouped_chunks`), "original_chunks",
        "total_tokens", "contextual_summary", "key_concepts", "enhanced_content"
        and "original_content". Groups whose request fails, returns a non-200
        status, or returns an unusable body are reported on stdout and skipped.

    Raises:
        ValueError: If there are groups to process but no API key is available.
    """
    enhanced_chunks = []
    if not grouped_chunks:
        return enhanced_chunks

    # Never hard-code the credential; take it from the caller or the environment.
    api_key = api_key or os.environ.get("OPENROUTER_API_KEY")
    if not api_key:
        raise ValueError(
            "OpenRouter API key missing: pass api_key=... or set the OPENROUTER_API_KEY environment variable"
        )

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    total = len(grouped_chunks)

    for i, group in enumerate(grouped_chunks):
        combined_text = group['combined_text']

        data = {
            "model": model,
            "messages": [
                {"role": "user", "content": _build_context_prompt(combined_text)}
            ],
            "temperature": 0.3,
            "max_tokens": 2000
        }

        try:
            # Without a timeout a stalled connection would block the loop forever.
            response = requests.post(
                _OPENROUTER_CHAT_URL,
                headers=headers,
                json=data,
                timeout=timeout
            )
        except requests.RequestException as e:
            print(f"group {i+1}/{total}: request failed: {e}")
            continue

        if response.status_code != 200:
            print(f"group {i+1}/{total}: API returned HTTP {response.status_code}: {response.text[:200]}")
            continue

        try:
            enhanced_content = response.json()['choices'][0]['message']['content']
        except (KeyError, IndexError, TypeError, ValueError) as e:
            print(f"group {i+1}/{total}: unexpected response body: {e!r}")
            continue

        if not isinstance(enhanced_content, str):
            print(f"group {i+1}/{total}: response contained no text content")
            continue

        contextual_summary, key_concepts, enhanced_chunk = _parse_enhanced_response(
            enhanced_content, combined_text
        )

        enhanced_chunks.append({
            'group_id': i + 1,
            'original_chunks': group['chunks'],
            'total_tokens': group['total_tokens'],
            'contextual_summary': contextual_summary,
            'key_concepts': key_concepts,
            'enhanced_content': enhanced_chunk,
            'original_content': combined_text
        })

        print(f"{i+1}/{total}")

    return enhanced_chunks

# Token threshold handed to the grouping step.
min_token_threshold = 512

# Merge the header-level chunks into groups and report how many groups resulted.
grouped_chunks = group_chunks_by_tokens(all_chunks, min_tokens=min_token_threshold)
print(f"{len(grouped_chunks)} grouped chunks")

enhanced_chunks = add_contextual_information(grouped_chunks)

# One JSON object per line (JSONL); the path is relative to the current working directory.
with open('enhanced_chunks.jsonl', 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(chunk, ensure_ascii=False) + '\n' for chunk in enhanced_chunks)

print(f"{len(enhanced_chunks)} chunks in total")