In [None]:
# PDF to Markdown Converter

This Jupyter notebook provides a comprehensive solution for converting PDF files to well-structured Markdown format. It's particularly useful for converting technical documents, ADRs, and documentation files while preserving structure and readability.

## Features

- ✅ **Clean Text Extraction**: Extracts text from PDFs while preserving structure
- ✅ **Smart Formatting**: Automatically detects and formats headings, lists, and paragraphs
- ✅ **Batch Processing**: Convert multiple PDFs at once
- ✅ **Customizable Output**: Configure formatting options and templates
- ✅ **Error Handling**: Robust error handling and logging
- ✅ **File Management**: Automatic output file naming and organization

## Requirements

- Python 3.7+
- pdfplumber library
- Standard Python libraries (os, re, pathlib, etc.)


In [None]:
# Install required packages (run this once)
import subprocess
import sys

def install_package(package, import_name=None):
    """Install a package with pip unless it can already be imported.

    Args:
        package: Distribution name handed to ``pip install``.
        import_name: Module name used to check for an existing installation.
            Defaults to ``package``; pass it explicitly when the two differ
            (for example ``scikit-learn`` is imported as ``sklearn``).

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a
            non-zero status.
    """
    # Local imports keep this cell self-contained: it runs before the
    # notebook's main import cell.
    import importlib
    import importlib.util

    module_name = import_name or package
    try:
        # find_spec locates the module without executing it.
        already_installed = importlib.util.find_spec(module_name) is not None
    except (ImportError, ValueError):
        # Raised for dotted names with a missing parent, or for modules whose
        # __spec__ is unset; treat both as "not installed".
        already_installed = False

    if already_installed:
        print(f"✅ {package} is already installed")
        return

    print(f"📦 Installing {package}...")
    # Use the kernel's own interpreter so the package lands in its environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    # Let the running interpreter see the freshly installed package.
    importlib.invalidate_caches()
    print(f"✅ {package} installed successfully")

# Third-party packages this notebook needs; install_package only invokes pip
# for the ones that are not already available.
packages = ["pdfplumber"]
for package in packages:
    install_package(package)

# Reached only when every install_package call above returned without raising.
print("\n🎉 All required packages are ready!")


In [None]:
# Import required libraries
import logging
import os
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import pdfplumber

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

print("📚 All libraries imported successfully!")


In [None]:
# Configuration settings
class PDFConverterConfig:
    """Bundle of tunable options for the PDF-to-Markdown pipeline.

    All options are plain instance attributes initialised to their defaults;
    change them on the instance after construction.
    """

    def __init__(self) -> None:
        # --- How extracted text is cleaned ---
        self.remove_page_numbers: bool = True
        self.remove_headers_footers: bool = True
        self.preserve_line_breaks: bool = True
        self.clean_whitespace: bool = True

        # --- How the Markdown is shaped ---
        self.auto_detect_headings: bool = True
        self.create_table_of_contents: bool = True
        self.add_metadata: bool = True
        self.format_lists: bool = True

        # --- Where and how output files are written ---
        self.output_extension: str = '.md'
        self.backup_originals: bool = False
        self.overwrite_existing: bool = True

        # --- Which document template to apply ---
        self.adr_template: bool = True  # ADR = Architecture Decision Record
        self.technical_doc_template: bool = False

        # --- Metadata fallbacks ---
        self.default_status: str = "Proposed"
        # Captured once, when the config object is created.
        created_at = datetime.now()
        self.default_date: str = created_at.strftime("%Y-%m-%d")


# Shared configuration instance used by the rest of the notebook.
config = PDFConverterConfig()
print("⚙️ Configuration loaded successfully!")


In [None]:
class PDFToMarkdownConverter:
    """Extracts text from a PDF, cleans it, and inspects its structure.

    The only state is ``self.config``; of its options, this class reads
    ``remove_page_numbers`` and ``clean_whitespace``.
    """

    # The annotation is quoted so the class does not require
    # PDFConverterConfig to exist at definition time.
    def __init__(self, config: "PDFConverterConfig"):
        self.config = config

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract raw text from every page of a PDF file.

        Args:
            pdf_path: Path to the PDF to read.

        Returns:
            The text of all non-empty pages, each followed by a blank line.
            An empty string is returned (and the error logged) if the file
            cannot be opened or parsed.
        """
        try:
            with pdfplumber.open(pdf_path) as pdf:
                logger.info(f"📖 Processing {len(pdf.pages)} pages from {pdf_path}")

                page_chunks = []
                for page_number, page in enumerate(pdf.pages, start=1):
                    logger.info(f"   Processing page {page_number}...")

                    page_text = page.extract_text()
                    # Pages without a text layer yield nothing; skip them.
                    if page_text:
                        page_chunks.append(page_text + "\n\n")

                full_text = "".join(page_chunks)
                logger.info(f"✅ Extracted {len(full_text)} characters from PDF")
                return full_text

        except Exception as e:
            # Best-effort by design: callers treat "" as "nothing extracted".
            logger.error(f"❌ Error extracting text from {pdf_path}: {e}")
            return ""

    def clean_text(self, text: str) -> str:
        """Clean and normalise extracted text while keeping its line structure.

        Newlines are preserved so that later steps can still split the text
        into lines and blank-line-separated paragraphs.

        Args:
            text: Raw text as returned by ``extract_text_from_pdf``.

        Returns:
            The cleaned text, stripped of leading/trailing whitespace, or an
            empty string when ``text`` is empty.
        """
        if not text:
            return ""

        # Normalise line endings and page breaks before any line-based rule.
        text = text.replace('\r\n', '\n').replace('\r', '\n')
        text = text.replace('\x0c', '\n\n')  # Form feed to paragraph break

        # Re-join words that were hyphenated across a line break.
        text = re.sub(r'(?<=\w)-[ \t]*\n[ \t]*(?=\w)', '', text)

        # Drop lines that contain nothing but a number (page numbers).
        if self.config.remove_page_numbers:
            text = re.sub(r'^[ \t]*\d+[ \t]*$', '', text, flags=re.MULTILINE)

        if self.config.clean_whitespace:
            # Only horizontal whitespace is collapsed; collapsing "\s+" would
            # also swallow every newline and flatten the whole document.
            text = re.sub(r'[ \t]+', ' ', text)
            text = re.sub(r' ?\n ?', '\n', text)    # trim spaces around newlines
            text = re.sub(r'\n{3,}', '\n\n', text)  # at most one blank line

        return text.strip()

    def detect_document_structure(self, text: str) -> Dict[str, Any]:
        """Analyse text to detect document structure and extract metadata.

        Args:
            text: Cleaned document text.

        Returns:
            A dict with the keys ``title`` (str), ``sections`` (list, always
            empty here), ``has_toc`` (bool), ``document_type`` (``'adr'`` or
            ``'general'``) and ``metadata`` (dict that may hold ``status``
            and ``date``).
        """
        structure: Dict[str, Any] = {
            'title': '',
            'sections': [],
            'has_toc': False,
            'document_type': 'general',
            'metadata': {}
        }

        lines = text.split('\n')

        # Title: first line among the first ten that is longer than ten
        # characters and is not purely digits.
        for raw_line in lines[:10]:
            candidate = raw_line.strip()
            if len(candidate) > 10 and not candidate.isdigit():
                structure['title'] = candidate
                break

        # ADR detection looks for "ADR" as a standalone token so that words
        # such as "Madrid" or "quadrant" do not trigger it.
        adr_marker = re.compile(r'(?<![a-z])adr(?![a-z])|architecture decision',
                                re.IGNORECASE)
        if any(adr_marker.search(line) for line in lines[:5]):
            structure['document_type'] = 'adr'

        # Whole-word match avoids hits inside words like "protocol" or "stock".
        if re.search(r'\b(?:table of contents|contents|toc)\b', text, re.IGNORECASE):
            structure['has_toc'] = True

        # Metadata: matched case-insensitively but stored with the original
        # casing. A later occurrence overwrites an earlier one.
        status_pattern = re.compile(r'\bstatus:\s*(.*)', re.IGNORECASE)
        date_pattern = re.compile(r'\bdate:\s*(.*)', re.IGNORECASE)
        for raw_line in lines:
            stripped = raw_line.strip()
            status_match = status_pattern.search(stripped)
            if status_match:
                value = status_match.group(1).strip()
                if value:
                    structure['metadata']['status'] = value
                continue
            date_match = date_pattern.search(stripped)
            if date_match:
                value = date_match.group(1).strip()
                if value:
                    structure['metadata']['date'] = value

        return structure

# Module-level converter built from the global `config`; the conversion
# functions defined later in this notebook call its methods directly.
converter = PDFToMarkdownConverter(config)
print("🔄 PDF Converter initialized!")


In [None]:
# Markdown formatting helpers (a separate class, attached to the converter instance at the end of this cell)
class MarkdownFormatter:
    """Stateless helpers that turn cleaned PDF text into Markdown.

    Every method is a ``@staticmethod``, so the class can be used directly
    or through an instance.
    """

    @staticmethod
    def create_adr_template(title: str, status: str = "Proposed", date: Optional[str] = None) -> str:
        """Build an empty ADR skeleton with placeholder sections.

        Args:
            title: Text for the top-level heading.
            status: Value shown on the ``Status`` line.
            date: Value shown on the ``Date`` line; today's date
                (``YYYY-MM-DD``) is used when omitted or empty.

        Returns:
            The Markdown skeleton as a single string.
        """
        if not date:
            date = datetime.now().strftime("%Y-%m-%d")

        template = f"""# {title}

**Status:** {status}  
**Date:** {date}

---

## Table of Contents

1. [Context](#1-context)
2. [Decision](#2-decision)
3. [Consequences](#3-consequences)
4. [Implementation](#4-implementation)

---

## 1. Context

[Context content will be populated from PDF]

---

## 2. Decision

[Decision content will be populated from PDF]

---

## 3. Consequences

[Consequences content will be populated from PDF]

---

## 4. Implementation

[Implementation details will be populated from PDF]

---

"""
        return template

    @staticmethod
    def format_text_as_markdown(text: str, structure: Dict) -> str:
        """Convert cleaned text to Markdown, paragraph by paragraph.

        Paragraphs are the chunks separated by blank lines. Each one is
        classified as a heading, a list, or plain text (checked in that
        order) and formatted accordingly.

        Args:
            text: Cleaned document text.
            structure: Dict with at least the keys ``document_type``,
                ``title`` and ``metadata``.

        Returns:
            The Markdown document, or an empty string when ``text`` is empty.
        """
        if not text:
            return ""

        source_title = structure['title']

        # Start with title and metadata if it's an ADR
        if structure['document_type'] == 'adr' and source_title:
            title = source_title
            if 'ADR' in title.upper():
                # NOTE(review): this replaces everything from "ADR" up to the
                # first colon with the placeholder "ADR-XXX:", discarding the
                # real ADR number. Confirm that is intended.
                title = re.sub(r'ADR[^:]*:', 'ADR-XXX:', title)

            status = structure['metadata'].get('status', 'Proposed')
            date = structure['metadata'].get('date', datetime.now().strftime("%Y-%m-%d"))

            markdown = f"""# {title}

**Status:** {status}  
**Date:** {date}

---

"""
        else:
            markdown = f"# {source_title}\n\n" if source_title else ""

        # The title has already been emitted as the H1 above; drop its first
        # occurrence as the leading line of a paragraph so it is not repeated.
        title_to_skip = source_title if markdown else ""

        formatted_paragraphs = []
        for para in text.split('\n\n'):
            para = para.strip()
            if not para:
                continue

            if title_to_skip:
                first_line, _, remainder = para.partition('\n')
                if first_line.strip() == title_to_skip:
                    title_to_skip = ""
                    para = remainder.strip()
                    if not para:
                        continue

            if MarkdownFormatter.is_heading(para):
                formatted_paragraphs.append(MarkdownFormatter.format_heading(para))
            elif MarkdownFormatter.is_list_item(para):
                formatted_paragraphs.append(MarkdownFormatter.format_list_item(para))
            else:
                formatted_paragraphs.append(para + "\n")

        markdown += "\n".join(formatted_paragraphs)
        return markdown

    @staticmethod
    def is_heading(text: str) -> bool:
        """Return True when the text looks like a heading.

        A heading is a numbered section starting with a capital letter
        (``1. Context``), an all-caps text under 60 characters, or a
        title-cased text under 80 characters with at most eight words.
        """
        text = text.strip()
        # Numbered sections like "1. Context"
        if re.match(r'^\d+\.?\s+[A-Z]', text):
            return True
        # Short all-caps text
        if text.isupper() and len(text) < 60:
            return True
        # Title case and reasonable length
        if text.istitle() and len(text) < 80 and len(text.split()) <= 8:
            return True
        return False

    @staticmethod
    def format_heading(text: str) -> str:
        """Format text as a Markdown heading.

        The level follows the leading section number: ``1.2.3`` gives
        ``####``, ``1.2`` gives ``###``, anything else gives ``##``.
        """
        text = text.strip()

        if re.match(r'^\d+\.\d+\.\d+', text):  # 1.2.3
            return f"#### {text}\n"
        if re.match(r'^\d+\.\d+', text):       # 1.2
            return f"### {text}\n"
        # "1." and unnumbered headings share the same level.
        return f"## {text}\n"

    @staticmethod
    def is_list_item(text: str) -> bool:
        """Return True when the text starts with a bullet or a number marker."""
        text = text.strip()
        if text.startswith(('•', '-', '*', '✓', '✅', '❗')):
            return True
        # bool() so callers get True/False rather than a Match object or None.
        return bool(re.match(r'^\d+\.', text))

    @staticmethod
    def format_list_item(text: str) -> str:
        """Format a list paragraph as Markdown ``-`` items.

        The first line is converted from any supported bullet or ``N.``
        marker. On the following lines only the unambiguous bullet glyphs
        are converted, so wrapped text that happens to start with a number
        or ``*`` is left untouched.
        """
        text = text.strip()
        if not text.startswith('-'):
            # Convert other bullet types to markdown
            text = re.sub(r'^[•*✓✅❗]\s*', '- ', text)
            text = re.sub(r'^\d+\.\s*', '- ', text)

        head, *tail = text.split('\n')
        tail = [re.sub(r'^\s*[•✓✅❗]\s*', '- ', line) for line in tail]
        return "\n".join([head, *tail]) + "\n"

# Attach a MarkdownFormatter to the existing `converter` object as a new
# `formatter` attribute (set on this one instance, not on the class).
converter.formatter = MarkdownFormatter()
print("📝 Markdown formatter added!")


In [None]:
# Main conversion functions (single-file conversion and ADR templating)
def convert_pdf_to_markdown(pdf_path: str, output_path: Optional[str] = None, use_custom_template: bool = True) -> bool:
    """
    Convert a single PDF file to Markdown format.

    Uses the module-level ``converter`` for extraction, cleaning, structure
    detection and generic formatting.

    Args:
        pdf_path: Path to the input PDF file
        output_path: Path for output markdown file (optional; defaults to the
            PDF's location with a ``.md`` extension)
        use_custom_template: Whether to use the structured ADR template when
            the document is detected as an ADR

    Returns:
        bool: True if conversion successful, False otherwise (missing input,
        no extractable text, existing output with overwriting disabled, or
        any error, which is logged rather than raised)
    """
    try:
        # Validate input file; a directory is not a valid input either.
        if not os.path.isfile(pdf_path):
            logger.error(f"❌ Input file not found: {pdf_path}")
            return False

        # Generate output path if not provided
        if not output_path:
            pdf_file = Path(pdf_path)
            output_path = pdf_file.parent / f"{pdf_file.stem}.md"
        destination = Path(output_path)

        # Honour the overwrite option of the converter's configuration.
        allow_overwrite = getattr(converter.config, 'overwrite_existing', True)
        if destination.exists() and not allow_overwrite:
            logger.warning(f"⚠️ Output file already exists, skipping: {destination}")
            return False

        logger.info(f"🔄 Converting {pdf_path} to {output_path}")

        # Step 1: Extract text from PDF
        raw_text = converter.extract_text_from_pdf(pdf_path)
        if not raw_text:
            logger.error("❌ No text extracted from PDF")
            return False

        # Step 2: Clean the extracted text; it can end up empty when the
        # raw text was only whitespace or page numbers.
        cleaned_text = converter.clean_text(raw_text)
        if not cleaned_text:
            logger.error("❌ No text extracted from PDF")
            return False

        # Step 3: Analyze document structure
        structure = converter.detect_document_structure(cleaned_text)
        logger.info(f"📋 Detected document type: {structure['document_type']}")

        # Step 4: Format as markdown
        if use_custom_template and structure['document_type'] == 'adr':
            markdown_content = create_structured_adr_markdown(structure, cleaned_text)
        else:
            markdown_content = converter.formatter.format_text_as_markdown(cleaned_text, structure)

        # Step 5: Write to output file, creating the target directory first
        # so a new output location does not make the conversion fail.
        destination.parent.mkdir(parents=True, exist_ok=True)
        with open(destination, 'w', encoding='utf-8') as f:
            f.write(markdown_content)

        logger.info(f"✅ Successfully converted to {output_path}")
        logger.info(f"📊 Output file size: {len(markdown_content)} characters")

        return True

    except Exception as e:
        # Best-effort by design: report the failure and let callers continue.
        logger.error(f"❌ Error converting {pdf_path}: {e}")
        return False

def create_structured_adr_markdown(structure: Dict, content: str) -> str:
    """Create a structured ADR document from detected structure and text.

    Each of the six fixed sections is filled by ``extract_section_content``
    using a list of keywords for that section.

    Args:
        structure: Dict with at least ``title`` and ``metadata`` keys.
        content: Cleaned document text the sections are extracted from.

    Returns:
        The complete ADR Markdown as a single string.
    """

    # Extract title and clean it up
    title = structure['title']
    if 'ADR' in title.upper():
        # Drop anything before the "ADR <number>" token. The replacement must
        # be the backreference \1; an escaped backslash would insert the
        # literal text "\1" instead of the captured group.
        title = re.sub(r'.*?(ADR[-\s]*\d+[^:]*)', r'\1', title, count=1, flags=re.IGNORECASE)
        # NOTE(review): this also hyphenates every space in the heading text;
        # confirm that is wanted for a document title.
        title = title.replace(' ', '-').replace('_', '-')

    status = structure['metadata'].get('status', 'Proposed')
    date = structure['metadata'].get('date', datetime.now().strftime("%Y-%m-%d"))

    # Use our proven template structure
    markdown = f"""# {title}

**Status:** {status}  
**Date:** {date}

---

## Table of Contents

1. [Context](#1-context)
2. [Main Use Cases](#2-main-use-cases)
3. [Decision](#3-decision)
4. [Consequences](#4-consequences)
5. [Technical Details](#5-technical-details)
6. [Open Issues / Next Steps](#6-open-issues--next-steps)

---

## 1. Context

{extract_section_content(content, ['context', 'background', 'problem'])}

---

## 2. Main Use Cases

{extract_section_content(content, ['use cases', 'requirements', 'scenarios'])}

---

## 3. Decision

{extract_section_content(content, ['decision', 'solution', 'approach'])}

---

## 4. Consequences

{extract_section_content(content, ['consequences', 'implications', 'pros', 'cons'])}

---

## 5. Technical Details

{extract_section_content(content, ['technical', 'implementation', 'architecture'])}

---

## 6. Open Issues / Next Steps

{extract_section_content(content, ['issues', 'next steps', 'todo', 'future'])}

---

*This document was automatically converted from PDF. Please review and update as needed.*
"""

    return markdown

def extract_section_content(content: str, keywords: List[str]) -> str:
    """Extract the lines that belong to sections matching the given keywords.

    A line matches when it contains any keyword (case-insensitive substring
    test). The matching line is collected together with the lines that follow
    it, up to 19 of them, stopping early at a line that starts with ``#`` or
    with a number followed by a dot.

    Args:
        content: Document text with real newline characters as separators.
        keywords: Keywords that identify the section.

    Returns:
        The collected lines joined by newlines, or a placeholder string when
        nothing matched.
    """
    lines = content.split('\n')
    wanted = [keyword.lower() for keyword in keywords]
    relevant_content = []

    total = len(lines)
    index = 0
    while index < total:
        line = lines[index]
        if not any(keyword in line.lower() for keyword in wanted):
            index += 1
            continue

        # Found a relevant section: keep the matching line itself...
        relevant_content.append(line)

        # ...and the following lines until the window ends or the next
        # major section starts.
        cursor = index + 1
        window_end = min(index + 20, total)
        while cursor < window_end:
            candidate = lines[cursor].strip()
            if candidate and (candidate.startswith('#') or re.match(r'^\d+\.', candidate)):
                break
            relevant_content.append(lines[cursor])
            cursor += 1

        # Resume after the collected lines so that a keyword inside them does
        # not cause the same lines to be collected again.
        index = cursor

    if relevant_content:
        return '\n'.join(relevant_content)
    else:
        return "[Content to be added based on source document]"

print("🔧 Main conversion functions added!")


In [None]:
# Batch processing and utility functions
def batch_convert_pdfs(input_directory: str, output_directory: Optional[str] = None,
                      file_pattern: str = "*.pdf") -> Dict[str, bool]:
    """
    Convert multiple PDF files to Markdown format.

    Files are processed in sorted order, one ``convert_pdf_to_markdown`` call
    per file; a failed file does not stop the batch.

    Args:
        input_directory: Directory containing PDF files
        output_directory: Directory for output files (optional, defaults to
            input_directory); created if it does not exist
        file_pattern: File pattern to match (default: "*.pdf")

    Returns:
        Dict[str, bool]: Dictionary with filename as key and success status
        as value; empty when the directory is missing or nothing matches
    """

    input_path = Path(input_directory)
    # is_dir() also rejects a path that exists but points at a file.
    if not input_path.is_dir():
        logger.error(f"❌ Input directory not found: {input_directory}")
        return {}

    if output_directory:
        output_path = Path(output_directory)
        output_path.mkdir(parents=True, exist_ok=True)
    else:
        output_path = input_path

    # glob() yields entries in arbitrary order; sort for reproducible runs.
    pdf_files = sorted(input_path.glob(file_pattern))

    if not pdf_files:
        logger.warning(f"⚠️ No PDF files found in {input_directory} matching pattern {file_pattern}")
        return {}

    logger.info(f"📁 Found {len(pdf_files)} PDF files to convert")

    results = {}
    for pdf_file in pdf_files:
        output_file = output_path / f"{pdf_file.stem}.md"

        logger.info(f"\n🔄 Processing {pdf_file.name}...")
        results[pdf_file.name] = convert_pdf_to_markdown(str(pdf_file), str(output_file))

    successful_conversions = sum(1 for succeeded in results.values() if succeeded)

    # Summary
    logger.info("\n📊 Batch conversion complete!")
    logger.info(f"✅ Successful: {successful_conversions}/{len(pdf_files)}")
    logger.info(f"❌ Failed: {len(pdf_files) - successful_conversions}/{len(pdf_files)}")

    return results

def find_pdf_files(directory: str, recursive: bool = True) -> List[str]:
    """
    List the ``*.pdf`` files found under a directory.

    Args:
        directory: Directory to search
        recursive: When True, subdirectories are searched as well

    Returns:
        List[str]: Paths of the matching files, as strings
    """
    root = Path(directory)
    # rglob descends into subdirectories; glob stays at the top level.
    search = root.rglob if recursive else root.glob
    return [str(match) for match in search("*.pdf")]

def preview_pdf_content(pdf_path: str, num_pages: int = 2) -> str:
    """
    Preview the first few pages of a PDF file.

    Args:
        pdf_path: Path to PDF file
        num_pages: Number of pages to preview

    Returns:
        str: For each previewed page that has text, a ``--- Page N ---``
        header line followed by the first 500 characters of the page and
        ``...``. If the file cannot be read, a message starting with
        ``Error previewing PDF:`` is returned instead of raising.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            sections = []
            for index in range(min(num_pages, len(pdf.pages))):
                page_text = pdf.pages[index].extract_text()
                # Pages without extractable text are left out of the preview.
                if page_text:
                    # Real newlines, so the preview prints as separate lines.
                    sections.append(f"--- Page {index + 1} ---\n")
                    sections.append(page_text[:500] + "...\n\n")

            return "".join(sections)
    except Exception as e:
        return f"Error previewing PDF: {e}"

def get_pdf_info(pdf_path: str) -> Dict[str, Any]:
    """
    Get basic information about a PDF file.

    Args:
        pdf_path: Path to PDF file

    Returns:
        Dict with the keys ``filename``, ``num_pages``, ``file_size`` (bytes,
        from ``os.path.getsize``), ``metadata`` and ``first_page_preview``.
        If the file cannot be read, a dict with the single key ``error`` is
        returned instead of raising.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            first_page_preview = ""
            if pdf.pages:
                # Guard against a page with no extractable text so one empty
                # page does not turn the whole result into an error dict.
                first_page_text = pdf.pages[0].extract_text() or ""
                if first_page_text:
                    first_page_preview = first_page_text[:200] + "..."

            return {
                'filename': Path(pdf_path).name,
                'num_pages': len(pdf.pages),
                'file_size': os.path.getsize(pdf_path),
                'metadata': pdf.metadata,
                'first_page_preview': first_page_preview
            }
    except Exception as e:
        return {'error': str(e)}

print("🔧 Batch processing and utility functions added!")


In [None]:
## Usage Examples

Below are various ways to use the PDF to Markdown converter:

### 1. Single File Conversion
```python
# Convert a single PDF file
success = convert_pdf_to_markdown(
    pdf_path="path/to/document.pdf",
    output_path="path/to/output.md"
)
```

### 2. Batch Conversion
```python
# Convert all PDFs in a directory
results = batch_convert_pdfs(
    input_directory="path/to/pdfs/",
    output_directory="path/to/markdown_files/"
)
```

### 3. Find and Preview PDFs
```python
# Find all PDF files
pdf_files = find_pdf_files("path/to/search/", recursive=True)

# Preview a PDF before conversion
preview = preview_pdf_content("path/to/document.pdf", num_pages=2)
print(preview)
```

### 4. Customize Configuration
```python
# Modify global configuration
config.adr_template = True
config.create_table_of_contents = True
config.default_status = "Draft"
```


In [None]:
# PRACTICAL EXAMPLE: Convert the existing ADR PDF
# Update these paths based on your project structure

# Example: Convert the Moneta Network Authentication PDF
example_pdf_path = "../docs/adr/MoPrd-ADR-015_ Moneta Network Authentication and Identification-300625-052159.pdf"
example_output_path = "../docs/adr/ADR-015_MonetaNetworkAuthentication_AUTO.md"

# Check if the example PDF exists
if os.path.exists(example_pdf_path):
    print(f"📄 Found example PDF: {example_pdf_path}")

    # Get PDF info first; get_pdf_info returns {'error': ...} on failure.
    pdf_info = get_pdf_info(example_pdf_path)
    if 'error' in pdf_info:
        print(f"⚠️ Could not read PDF info: {pdf_info['error']}")
    else:
        print("📊 PDF Info:")
        print(f"   - Pages: {pdf_info.get('num_pages', 'Unknown')}")
        print(f"   - Size: {pdf_info.get('file_size', 0) / 1024:.1f} KB")

    # Preview content (real newlines, so a blank line separates the sections)
    print("\n👀 Preview:")
    preview = preview_pdf_content(example_pdf_path, num_pages=1)
    print(preview[:300] + "...")

    # Convert the PDF
    print("\n🔄 Converting PDF to Markdown...")
    success = convert_pdf_to_markdown(example_pdf_path, example_output_path)

    if success:
        print(f"✅ Conversion successful! Check: {example_output_path}")
    else:
        print("❌ Conversion failed!")

else:
    print(f"⚠️ Example PDF not found at: {example_pdf_path}")
    print("Please update the path or use your own PDF file.")


In [None]:
# BATCH CONVERSION EXAMPLE
# Convert all PDFs in the docs/adr directory

adr_directory = "../docs/adr/"

print("🔍 Searching for PDF files in ADR directory...")
pdf_files = find_pdf_files(adr_directory, recursive=False)

if pdf_files:
    print(f"📁 Found {len(pdf_files)} PDF file(s):")
    for pdf_file in pdf_files:
        print(f"   - {Path(pdf_file).name}")

    # Real newlines, so each stage is separated by a blank line.
    print("\n🚀 Starting batch conversion...")
    results = batch_convert_pdfs(
        input_directory=adr_directory,
        output_directory=adr_directory,  # Save in same directory
        file_pattern="*.pdf"
    )

    print("\n📋 Conversion Results:")
    for filename, success in results.items():
        status = "✅ Success" if success else "❌ Failed"
        print(f"   {filename}: {status}")

else:
    print("📄 No PDF files found in the ADR directory.")
    print("\nYou can:")
    print("1. Add PDF files to the docs/adr/ directory")
    print("2. Update the path in this cell")
    print("3. Use the single file conversion example above")


In [None]:
## Tips and Best Practices

### 📝 Document Quality
- **Best Results**: PDFs with clear text (not scanned images)
- **Structure**: Documents with consistent heading styles work better
- **Language**: The converter is tuned for English documents; other languages should still work

### ⚙️ Configuration Tweaks
- Set `config.adr_template = True` for Architecture Decision Records
- Use `config.clean_whitespace = True` for cleaner output
- Enable `config.create_table_of_contents = True` for navigation

### 🔧 Troubleshooting
- **Empty Output**: Check if PDF contains text (not just images)
- **Poor Formatting**: Try adjusting the cleaning settings
- **Large Files**: Consider processing in smaller batches

### 🚀 Advanced Usage
```python
# Custom configuration for specific document types
config.adr_template = True
config.default_status = "Under Review"

# Process with custom settings
success = convert_pdf_to_markdown("document.pdf", "output.md", use_custom_template=True)
```

### 📁 File Organization
- Keep PDFs organized in directories by type (ADRs, specifications, etc.)
- Use consistent naming conventions
- Consider backup strategies for important documents

---

## 🎉 You're Ready to Convert!

This notebook provides a comprehensive solution for converting PDF documents to well-structured Markdown. The system is particularly optimized for technical documents and ADRs, but works well with various document types.

**Next Steps:**
1. Update file paths in the example cells
2. Run the conversion on your PDF files
3. Review and refine the output as needed
4. Customize the configuration for your specific needs

Happy converting! 🚀
