A notebook that compiles the results, pulls the full content from the OCR output, and combines everything into a JSON dataset with appropriate formatting, along with some helper functions.
REQUIRES MANUAL CORRECTIONS

1. Get the content from OCR

In [29]:
import os
import re
import glob
import logging
from tqdm import tqdm

# Set up logging: every record goes both to a log file and to the cell output.
# force=True makes this call replace whatever handlers the root logger already
# has. Without it, basicConfig() silently does nothing when the cell is re-run
# or when another cell in the same kernel configured logging first, while the
# FileHandler below is still constructed (and its file left open) for nothing.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('complete_content_processing.log'),
        logging.StreamHandler()
    ],
    force=True,
)

def read_markdown_sections(md_file_path):
    """Read the title, abstract and content sections from a markdown file.

    The title is taken from the first level-1 heading (``# ...``), the
    abstract from the text under ``## Abstract`` and the content from the
    text under ``## Content``. Abstract and content each stop at the next
    ``##`` heading or at the end of the file.

    Parameters
    ----------
    md_file_path : str
        Path to the markdown file to read.

    Returns
    -------
    dict
        Keys ``'title'``, ``'abstract'`` and ``'content'``. A section that is
        not found is ``None``. If the file cannot be read the error is logged
        and all three values are ``None``.
    """
    try:
        with open(md_file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Title: the (?<!#) lookbehind stops the pattern from matching the
        # last '#' of a "##"/"###" heading. Without it a file that has no
        # level-1 heading reported "Abstract" (from "## Abstract") as title.
        title_match = re.search(r'(?<!#)#\s+(.*?)(?:\n|$)', content)
        title = title_match.group(1).strip() if title_match else None

        # Abstract: everything after the "## Abstract" line up to the next
        # "##" heading (or end of file). DOTALL lets '.' span line breaks.
        abstract_match = re.search(r'##\s+Abstract\s*\n+\s*(.*?)(?:\n+\s*##|$)', content, re.DOTALL)
        abstract = abstract_match.group(1).strip() if abstract_match else None

        # Content: same shape as the abstract pattern.
        content_match = re.search(r'##\s+Content\s*\n+\s*(.*?)(?:\n+\s*##|$)', content, re.DOTALL)
        content_text = content_match.group(1).strip() if content_match else None

        return {
            'title': title,
            'abstract': abstract,
            'content': content_text
        }
    except Exception as e:
        # Deliberately broad: callers treat an all-None result as "skip file".
        logging.error(f"Error reading markdown file {md_file_path}: {str(e)}")
        return {'title': None, 'abstract': None, 'content': None}

def read_ocr_text(ocr_file_path):
    """Return the whole contents of an OCR text file.

    Parameters
    ----------
    ocr_file_path : str
        Path to the UTF-8 encoded OCR text file.

    Returns
    -------
    str or None
        The file contents, or ``None`` if reading failed (the error is
        logged rather than raised).
    """
    ocr_text = None
    try:
        with open(ocr_file_path, mode='r', encoding='utf-8') as ocr_handle:
            ocr_text = ocr_handle.read()
    except Exception as exc:
        logging.error(f"Error reading OCR file {ocr_file_path}: {exc}")
    return ocr_text

def clean_ocr_text(text):
    """Remove OCR start/end marker lines and trim the text.

    Parameters
    ----------
    text : str or None
        Raw OCR text, possibly containing ``--- OCR Start ---`` /
        ``--- OCR End ---`` marker lines (matched case-insensitively).

    Returns
    -------
    str
        The text without marker lines, stripped of leading and trailing
        whitespace. Empty or ``None`` input yields ``""``.
    """
    if not text:
        return ""

    # Canonical markers. The trailing newline is optional so that a marker
    # sitting on the very last line of the text (no newline after it) is
    # removed too; requiring '\n' used to leave it behind.
    text = re.sub(r'---\s*OCR Start\s*---[^\n]*\n?', '', text, flags=re.IGNORECASE)
    text = re.sub(r'---\s*OCR End\s*---[^\n]*\n?', '', text, flags=re.IGNORECASE)

    # Marker variations (e.g. "--- OCR Start (page 1) ---"). The leading
    # [ \t-]* also consumes the dashes in front of the keyword, which the
    # previous pattern left in the output.
    text = re.sub(r'[ \t-]*OCR Start[^\n]*\n?', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[ \t-]*OCR End[^\n]*\n?', '', text, flags=re.IGNORECASE)

    return text.strip()

def extract_content_after_abstract(ocr_text, abstract):
    """Return the part of the OCR text that follows the abstract.

    The end of the abstract is located by trying, in order:

    1. the whole (stripped) abstract as an exact substring,
    2. its last two sentences joined by a single space,
    3. its last sentence,
    4. a line consisting only of a typical post-abstract heading
       ("Introduction", "1.", "Methods", ...); in this case the returned
       text starts at that heading.

    If nothing matches, the whole cleaned OCR text is returned.

    Parameters
    ----------
    ocr_text : str
        Full OCR text of the document.
    abstract : str
        Abstract text to look for. Empty, ``None`` or the literal string
        ``"None"`` means "no abstract".

    Returns
    -------
    str
        The extracted text after ``clean_ocr_text`` has been applied.
    """
    if not abstract or abstract == "None":
        logging.warning("No abstract provided for content extraction")
        return clean_ocr_text(ocr_text)  # Return the clean OCR text if no abstract

    if not ocr_text:
        # Nothing to search; also avoids a TypeError from `x in None` below.
        return ""

    # Clean the abstract slightly to help with matching
    clean_abstract = abstract.strip()

    # Candidate anchors, most specific first. The two-sentence suffix must be
    # tried BEFORE the single last sentence: it contains the last sentence,
    # so testing it only after the last sentence failed could never succeed.
    anchors = [clean_abstract]
    abstract_sentences = re.split(r'(?<=[.!?])\s+', clean_abstract)
    if len(abstract_sentences) > 1:
        anchors.append(' '.join(abstract_sentences[-2:]).strip())
        anchors.append(abstract_sentences[-1].strip())

    for anchor in anchors:
        anchor_idx = ocr_text.find(anchor)
        if anchor_idx != -1:
            content_start_idx = anchor_idx + len(anchor)
            return clean_ocr_text(ocr_text[content_start_idx:].strip())

    # Look for common section headings that typically come after the abstract
    section_headings = [
        "Introduction", "1 Introduction", "I. Introduction",
        "1.", "I.", "Background", "Methods", "Methodology"
    ]

    for heading in section_headings:
        # The heading must stand alone on its line.
        heading_pattern = r'\n\s*' + re.escape(heading) + r'\s*\n'
        match = re.search(heading_pattern, ocr_text)
        if match:
            return clean_ocr_text(ocr_text[match.start():].strip())

    # If all matching methods fail, return the clean OCR text with a warning
    logging.warning("Could not find content after abstract - returning full clean OCR text")
    return clean_ocr_text(ocr_text)

def create_complete_markdown_files(md_folder, ocr_folder, output_folder):
    """
    Merge each sectioned markdown file with its OCR text into a complete file.

    For every ``*.md`` file in ``md_folder`` the title and abstract are read
    from the markdown, the body is taken from the same-named ``.txt`` file in
    ``ocr_folder`` (everything after the abstract), and the result is written
    to ``output_folder`` under the original file name.

    Parameters:
    -----------
    md_folder : str
        Path to folder containing markdown files from DeepSeek
    ocr_folder : str
        Path to folder containing OCR text files
    output_folder : str
        Path to folder to save complete markdown files (created if missing)

    Returns:
    --------
    int or None
        Number of files written, or None when ``md_folder`` holds no
        markdown files.
    """
    def _log_file_head(path, line_count):
        # Dump the first lines of a problematic file to help manual fixing.
        with open(path, 'r', encoding='utf-8') as raw:
            head = ''.join(raw.readlines()[:line_count])
        logging.error(f"First {line_count} lines: {head}")

    os.makedirs(output_folder, exist_ok=True)

    md_files = glob.glob(os.path.join(md_folder, "*.md"))
    if len(md_files) == 0:
        logging.error(f"No markdown files found in {md_folder}")
        return

    logging.info(f"Found {len(md_files)} markdown files to process")

    success_count = 0
    for md_file in tqdm(md_files, desc="Creating complete markdown files"):
        try:
            # The OCR file shares the markdown file's stem.
            base_name, _ = os.path.splitext(os.path.basename(md_file))
            ocr_file = os.path.join(ocr_folder, base_name + ".txt")
            if not os.path.exists(ocr_file):
                logging.warning(f"No corresponding OCR file found for {base_name}, skipping")
                continue

            sections = read_markdown_sections(md_file)
            title = sections['title']
            abstract = sections['abstract']

            # Report each missing section together with the file's first lines.
            if not title:
                logging.error(f"Missing title in {md_file}, reading raw file:")
                _log_file_head(md_file, 10)
            if not abstract:
                logging.error(f"Missing abstract in {md_file}, reading raw file:")
                _log_file_head(md_file, 20)
            if not (title and abstract):
                logging.warning(f"Missing title or abstract in {md_file}, skipping")
                continue

            ocr_text = read_ocr_text(ocr_file)
            if not ocr_text:
                logging.warning(f"Empty OCR text for {base_name}, skipping")
                continue

            full_content = extract_content_after_abstract(ocr_text, abstract)

            output_file = os.path.join(output_folder, base_name + ".md")
            with open(output_file, 'w', encoding='utf-8') as out:
                out.write(
                    f"# {title}\n\n"
                    f"## Abstract\n\n{abstract}\n\n"
                    f"## Content\n\n{full_content}\n"
                )

            success_count += 1

        except Exception as exc:
            # One bad file must not abort the whole batch.
            logging.error(f"Error processing {md_file}: {exc}")

    logging.info(f"Processing completed! Successfully created {success_count}/{len(md_files)} complete markdown files")
    return success_count

# Input/output folders for this run: sectioned markdown and OCR text go in,
# merged markdown comes out.
md_folder, ocr_folder = "sectioning_output_100", "ocr_output_100"
output_folder = "sectioning_output_100_complete"

# Build the merged files; `result` is the number of files written
# (falsy when nothing was produced).
result = create_complete_markdown_files(md_folder, ocr_folder, output_folder)

# Summary framed by two 50-character rules.
print("\n" + "=" * 50)
print(
    f"✅ Successfully created {result} complete markdown files in '{output_folder}'"
    if result
    else "❌ Failed to create complete markdown files"
)
print("=" * 50)

2025-02-27 16:51:37,824 - INFO - Found 100 markdown files to process
Creating complete markdown files: 100%|██████████| 100/100 [00:00<00:00, 912.48it/s]
2025-02-27 16:51:37,935 - INFO - Processing completed! Successfully created 100/100 complete markdown files



✅ Successfully created 100 complete markdown files in 'sectioning_output_100_complete'


2. Apply JSON-safe formatting to the markdown files --- BACK UP YOUR INPUT FOLDER JUST IN CASE. DO NOT RUN THIS CODE MULTIPLE TIMES.

In [30]:
import os
import re
import glob
import json
import logging
from tqdm import tqdm

# Set up logging: every record goes both to a log file and to the cell output.
# force=True is required here: when an earlier cell in the same kernel has
# already configured the root logger, a plain basicConfig() call is a silent
# no-op, so 'formatting_check.log' would be created empty and all records
# would keep flowing to the earlier cell's log file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('formatting_check.log'),
        logging.StreamHandler()
    ],
    force=True,
)

def apply_json_safe_formatting(text):
    """Normalize a text section so it serializes cleanly as a JSON string.

    Leftover OCR markers are removed and every run of whitespace (including
    newlines and tabs) is collapsed to a single space. No escaping is done
    here on purpose: characters such as quotes and backslashes are escaped
    exactly once by ``json.dump`` when the dataset is written. Escaping them
    manually at this stage caused cascading backslashes on repeated runs.

    Parameters
    ----------
    text : str or None
        Section text to normalize.

    Returns
    -------
    str
        The normalized text; ``""`` for empty or ``None`` input.
    """
    if not text:
        return ""

    # Remove OCR markers if still present (case-insensitive, matching the
    # marker handling used when the OCR text is first cleaned).
    text = re.sub(r'---\s*OCR Start\s*---', '', text, flags=re.IGNORECASE)
    text = re.sub(r'---\s*OCR End\s*---', '', text, flags=re.IGNORECASE)

    # Normalize whitespace. The former json.loads(json.dumps(text)) round
    # trip returned its input unchanged, so it is dropped.
    return re.sub(r'\s+', ' ', text).strip()

def read_markdown_sections(md_file_path):
    """Read title, abstract and content from a complete markdown file.

    The patterns expect a ``# <title>`` line, an ``## Abstract`` section
    followed by a blank line, and a final ``## Content`` section followed by
    a blank line.

    Parameters
    ----------
    md_file_path : str
        Path to the markdown file.

    Returns
    -------
    dict
        Keys ``'title'``, ``'abstract'`` and ``'content'``; a section that is
        not found is ``None``. On a read error the error is logged and all
        three values are ``None``.
    """
    try:
        with open(md_file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Title: (?<!#) keeps the pattern from matching the last '#' of a
        # "## ..." heading when the file has no level-1 heading.
        title_match = re.search(r'(?<!#)# (.*?)\n', content)
        title = title_match.group(1).strip() if title_match else None

        # Abstract: text between the "## Abstract" and "## Content" headings.
        abstract_match = re.search(r'## Abstract\n\n(.*?)\n\n## Content', content, re.DOTALL)
        abstract = abstract_match.group(1).strip() if abstract_match else None

        # Content is the last section, so take everything to the end of the
        # file. Stopping at the first "\n\n#" silently truncated bodies whose
        # OCR text contains a paragraph that starts with '#'.
        content_match = re.search(r'## Content\n\n(.*)', content, re.DOTALL)
        content_text = content_match.group(1).strip() if content_match else None

        return {
            'title': title,
            'abstract': abstract,
            'content': content_text
        }
    except Exception as e:
        # Deliberately broad: callers treat an all-None result as "skip file".
        logging.error(f"Error reading markdown file {md_file_path}: {str(e)}")
        return {'title': None, 'abstract': None, 'content': None}

def verify_json_compatibility(sections):
    """Check that the three document sections can be serialized as JSON.

    Parameters
    ----------
    sections : dict
        Mapping with the keys ``'title'``, ``'abstract'`` and ``'content'``.

    Returns
    -------
    bool
        True if ``json.dumps`` accepts the sections, False otherwise.

    Raises
    ------
    KeyError
        If one of the three keys is missing from ``sections``.
    """
    test_dict = {key: sections[key] for key in ('title', 'abstract', 'content')}
    try:
        json.dumps(test_dict)
    except (TypeError, ValueError):
        # Serialization fails with TypeError (unsupported type) or ValueError
        # (e.g. circular reference). json.JSONDecodeError, which was caught
        # before, is only raised when *decoding* JSON.
        return False
    return True

def format_all_markdown_files(input_folder, output_folder="jsonsafe_markdowns"):
    """Write a JSON-safe copy of every markdown file in a folder.

    Each ``*.md`` file in ``input_folder`` is split into title, abstract and
    content, every section is passed through ``apply_json_safe_formatting``,
    and the result is written under the same file name in ``output_folder``.
    Files with a missing section or that still fail the JSON check are
    skipped and logged.

    Parameters
    ----------
    input_folder : str
        Folder holding the markdown files to format.
    output_folder : str, optional
        Destination folder (created if missing).

    Returns
    -------
    None
    """
    os.makedirs(output_folder, exist_ok=True)

    md_files = glob.glob(os.path.join(input_folder, "*.md"))
    if len(md_files) == 0:
        logging.error(f"No markdown files found in {input_folder}")
        return

    logging.info(f"Found {len(md_files)} markdown files to format")

    formatted_count = 0
    for md_file in tqdm(md_files, desc="Applying JSON formatting"):
        try:
            sections = read_markdown_sections(md_file)

            # All three sections are required.
            if not all(sections[name] for name in ('title', 'abstract', 'content')):
                logging.warning(f"Missing sections in {md_file}, skipping")
                continue

            sections = {
                name: apply_json_safe_formatting(value)
                for name, value in sections.items()
            }

            if not verify_json_compatibility(sections):
                logging.error(f"JSON encoding error in {md_file} even after formatting")
                continue

            # Same file name, different folder: the input is never modified.
            output_file = os.path.join(output_folder, os.path.basename(md_file))
            with open(output_file, 'w', encoding='utf-8') as out:
                out.write(
                    f"# {sections['title']}\n\n"
                    f"## Abstract\n\n{sections['abstract']}\n\n"
                    f"## Content\n\n{sections['content']}\n\n"
                )

            formatted_count += 1

        except Exception as exc:
            # One bad file must not abort the whole batch.
            logging.error(f"Error formatting {md_file}: {exc}")

    logging.info(f"Formatting completed! Successfully formatted {formatted_count}/{len(md_files)} files.")
    logging.info(f"JSON-safe files saved to: {output_folder}")

if __name__ == "__main__":
    # Format the merged markdown files; the output folder is left at its
    # default ("jsonsafe_markdowns").
    format_all_markdown_files(input_folder="sectioning_output_100_complete")

2025-02-27 17:05:12,041 - INFO - Found 100 markdown files to format
Applying JSON formatting: 100%|██████████| 100/100 [00:00<00:00, 781.92it/s]
2025-02-27 17:05:12,170 - INFO - Formatting completed! Successfully formatted 100/100 files.
2025-02-27 17:05:12,170 - INFO - JSON-safe files saved to: jsonsafe_markdowns


3. Turn it into a JSON dataset

In [32]:
import os
import re
import glob
import json
import logging
from tqdm import tqdm

# Set up logging: every record goes both to a log file and to the cell output.
# force=True is required here: when an earlier cell in the same kernel has
# already configured the root logger, a plain basicConfig() call is a silent
# no-op, so 'combined_json_conversion.log' would be created empty and all
# records would keep flowing to the earlier cell's log file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('combined_json_conversion.log'),
        logging.StreamHandler()
    ],
    force=True,
)

def read_markdown_sections(md_file_path):
    """Read title, abstract and content from a formatted markdown file.

    The patterns expect a ``# <title>`` line, an ``## Abstract`` section
    followed by a blank line, and a final ``## Content`` section followed by
    a blank line.

    Parameters
    ----------
    md_file_path : str
        Path to the markdown file.

    Returns
    -------
    dict
        Keys ``'title'``, ``'abstract'`` and ``'content'``; a section that is
        not found is ``None``. On a read error the error is logged and all
        three values are ``None``.
    """
    try:
        with open(md_file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Title: (?<!#) keeps the pattern from matching the last '#' of a
        # "## ..." heading when the file has no level-1 heading.
        title_match = re.search(r'(?<!#)# (.*?)\n', content)
        title = title_match.group(1).strip() if title_match else None

        # Abstract: text between the "## Abstract" and "## Content" headings.
        abstract_match = re.search(r'## Abstract\n\n(.*?)\n\n## Content', content, re.DOTALL)
        abstract = abstract_match.group(1).strip() if abstract_match else None

        # Content is the last section, so take everything to the end of the
        # file instead of stopping at the first "\n\n#", which would drop the
        # rest of a body containing a paragraph that starts with '#'.
        content_match = re.search(r'## Content\n\n(.*)', content, re.DOTALL)
        content_text = content_match.group(1).strip() if content_match else None

        return {
            'title': title,
            'abstract': abstract,
            'content': content_text
        }
    except Exception as e:
        # Deliberately broad: callers treat an all-None result as "skip file".
        logging.error(f"Error reading markdown file {md_file_path}: {str(e)}")
        return {'title': None, 'abstract': None, 'content': None}

def combine_markdowns_to_json(input_folder="jsonsafe_markdowns", output_file="combined_dataset.json"):
    """Combine all markdown files of a folder into a single JSON dataset.

    Every ``*.md`` file is split into title, abstract and content; files
    missing any of the three are skipped. The documents are written as a JSON
    array of ``{"title", "abstract", "content"}`` objects, ordered by file
    name so that repeated runs produce the same dataset.

    Parameters
    ----------
    input_folder : str, optional
        Folder holding the markdown files.
    output_file : str, optional
        Path of the JSON file to write.

    Returns
    -------
    int or None
        Number of documents written, ``0`` if writing the JSON file failed,
        or ``None`` if ``input_folder`` holds no markdown files.
    """
    # glob returns files in arbitrary order; sort for a reproducible dataset.
    md_files = sorted(glob.glob(os.path.join(input_folder, "*.md")))

    if not md_files:
        logging.error(f"No markdown files found in {input_folder}")
        return

    logging.info(f"Found {len(md_files)} markdown files to combine into a single JSON")

    # One dict per successfully parsed document.
    all_documents = []

    for md_file in tqdm(md_files, desc="Processing documents"):
        try:
            sections = read_markdown_sections(md_file)

            # Skip if any section is missing
            if not sections['title'] or not sections['abstract'] or not sections['content']:
                logging.warning(f"Missing sections in {md_file}, skipping")
                continue

            all_documents.append(sections)

        except Exception as e:
            # One bad file must not abort the whole batch.
            logging.error(f"Error processing {md_file}: {str(e)}")

    success_count = len(all_documents)

    # Write the combined JSON file
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII characters readable; all
            # string escaping happens here, exactly once.
            json.dump(all_documents, f, ensure_ascii=False, indent=2)

        logging.info(f"Combined JSON created successfully with {success_count} documents")
        logging.info(f"Combined JSON saved to: {output_file}")

        file_size_mb = os.path.getsize(output_file) / (1024 * 1024)
        logging.info(f"File size: {file_size_mb:.2f} MB")

        return success_count
    except Exception as e:
        logging.error(f"Error writing combined JSON file: {str(e)}")
        return 0

# Build the combined dataset from the JSON-safe markdown files. As the last
# expression of the cell, the returned document count is displayed.
combine_markdowns_to_json(
    input_folder="jsonsafe_markdowns",
    output_file="combined_dataset.json",
)

2025-02-27 17:10:24,447 - INFO - Found 100 markdown files to combine into a single JSON
Processing documents: 100%|██████████| 100/100 [00:00<00:00, 2171.55it/s]
2025-02-27 17:10:24,506 - INFO - Combined JSON created successfully with 100 documents
2025-02-27 17:10:24,506 - INFO - Combined JSON saved to: combined_dataset.json
2025-02-27 17:10:24,506 - INFO - File size: 2.86 MB


100

In [19]:
import os
import re
import google.generativeai as genai
from PIL import Image
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type
from google.api_core import exceptions
import fitz  # PyMuPDF
from tqdm import tqdm

# Path of the single PDF handled by the script entry point at the bottom of
# this cell. Hard-coded: edit it to point at the PDF you want to OCR.
PDF_PATH = "articles_100_pdf/three_heads_are_better_than_one.pdf"  # Change this to your actual PDF file path

def convert_pdf_to_images(pdf_path, footer_height=180, zoom=3, output_dir='temp_images'):
    """Render every page of a PDF to a PNG file using PyMuPDF.

    Each page is rendered at ``zoom`` times its base resolution and the
    bottom ``footer_height`` pixels (page-number footer) are cropped off. A
    page is saved uncropped when cropping would leave 70% or less of its
    height.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file.
    footer_height : int, optional
        Number of pixels to crop from the bottom of each rendered page.
    zoom : int or float, optional
        Scale factor applied in both directions when rendering.
    output_dir : str, optional
        Folder the page images are written to (created if missing).

    Returns
    -------
    list of str
        Paths of the written images, in page order, named
        ``page_001.png``, ``page_002.png``, ...
    """
    print("Converting PDF to images...")
    os.makedirs(output_dir, exist_ok=True)

    image_paths = []
    pdf_document = fitz.open(pdf_path)
    try:
        for page_num in range(pdf_document.page_count):
            page = pdf_document[page_num]

            # Render at higher resolution for better OCR quality.
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))

            image_path = f'{output_dir}/page_{page_num + 1:03d}.png'
            cropped_height = pix.height - footer_height

            # Safety check - don't crop more than 30% of the image.
            if cropped_height > pix.height * 0.7:
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                img.crop((0, 0, pix.width, cropped_height)).save(image_path)
            else:
                # Too small to crop safely: keep the full page.
                pix.save(image_path)

            image_paths.append(image_path)
    finally:
        # Release the document even if rendering a page fails.
        pdf_document.close()

    return image_paths

# Configure Gemini API
def setup_gemini(api_key=None, model_name='gemini-1.5-flash'):
    """Configure the Gemini client and return a generative model.

    Parameters
    ----------
    api_key : str, optional
        API key to use. When omitted, the key is read from the
        ``GEMINI_API_KEY`` environment variable, falling back to
        ``GOOGLE_API_KEY``.
    model_name : str, optional
        Name of the model to instantiate.

    Returns
    -------
    genai.GenerativeModel
        The configured model object.

    Raises
    ------
    RuntimeError
        If no API key was passed and none is set in the environment.
    """
    print("Initializing Gemini model...")

    # SECURITY: the key must never be written into the notebook. A key was
    # previously hard-coded at this spot; treat it as leaked and revoke it.
    api_key = (
        api_key
        or os.environ.get("GEMINI_API_KEY")
        or os.environ.get("GOOGLE_API_KEY")
    )
    if not api_key:
        raise RuntimeError(
            "No Gemini API key found: set the GEMINI_API_KEY (or GOOGLE_API_KEY) "
            "environment variable or pass api_key=..."
        )

    genai.configure(api_key=api_key)
    return genai.GenerativeModel(model_name)

# Retry decorator for rate limit handling: a call that fails with
# ResourceExhausted or ServiceUnavailable is retried after a fixed wait.
@retry(
    retry=retry_if_exception_type((exceptions.ResourceExhausted, exceptions.ServiceUnavailable)),
    wait=wait_fixed(15),  # Wait 15 seconds between retries
    stop=stop_after_attempt(5)  # Maximum 5 attempts
)
def process_image_with_gemini(model, image_path):
    """Send one page image to the model and return the extracted text.

    Parameters
    ----------
    model : object
        Model exposing ``generate_content``, as returned by ``setup_gemini``.
    image_path : str
        Path to the page image.

    Returns
    -------
    str
        The ``text`` attribute of the model response.
    """
    # Open the image in a context manager so its file handle is closed once
    # the request is done. Left open, the handle can prevent the temporary
    # image from being deleted afterwards on some platforms.
    with Image.open(image_path) as img:
        response = model.generate_content([
            "Extract all text from this image exactly as it appears, preserving all formatting and line breaks. Do not add any additional text or markers:",
            img
        ])
    return response.text

def process_pdf(pdf_path):
    """Extract the text of a PDF page by page and save it to a text file.

    The PDF is rendered to temporary page images, each image is sent to the
    Gemini model, and the returned texts are written to
    ``output/<pdf name>_extracted.txt`` separated by blank lines. A page that
    fails is recorded in the file as an ``[ERROR: ...]`` line instead of
    aborting the run. Temporary images are removed afterwards, also when
    processing fails.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file.

    Returns
    -------
    str
        Path of the written text file.
    """
    os.makedirs('output', exist_ok=True)

    # Output file is named after the PDF.
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_file = os.path.join('output', f'{pdf_name}_extracted.txt')

    image_paths = convert_pdf_to_images(pdf_path)

    try:
        model = setup_gemini()

        print(f"\nProcessing {len(image_paths)} pages...")

        with open(output_file, 'w', encoding='utf-8') as f:
            for img_path in tqdm(image_paths, desc="Extracting text"):
                try:
                    extracted_text = process_image_with_gemini(model, img_path)
                    f.write(extracted_text + '\n\n')  # Add double newline between pages
                except Exception as e:
                    # Best effort per page: note the failure and carry on.
                    print(f"\nError processing {img_path}: {str(e)}")
                    f.write(f"[ERROR: Failed to process this page - {str(e)}]\n\n")
    finally:
        # Clean up temporary images even if model setup or writing failed.
        # Only filesystem errors are ignored (cleanup is best effort); a bare
        # `except:` would also swallow KeyboardInterrupt.
        for img_path in image_paths:
            try:
                os.remove(img_path)
            except OSError:
                pass
        try:
            os.rmdir('temp_images')
        except OSError:
            pass

    print("\nText extraction completed!")
    print(f"Text file saved as: {output_file}")
    return output_file

if __name__ == "__main__":
    # Validate the configured path first, then run the extraction and report
    # any failure instead of raising.
    if os.path.exists(PDF_PATH):
        if PDF_PATH.lower().endswith('.pdf'):
            print("\n" + "=" * 60)
            print("Processing: " + os.path.basename(PDF_PATH))
            print("=" * 60)
            try:
                process_pdf(PDF_PATH)
            except Exception as exc:
                print(f"Error processing {PDF_PATH}: {exc}")
        else:
            print(f"Error: '{PDF_PATH}' is not a PDF file!")
    else:
        print(f"Error: File '{PDF_PATH}' does not exist!")


Processing: three_heads_are_better_than_one.pdf
Converting PDF to images...
Initializing Gemini model...

Processing 6 pages...


Extracting text: 100%|██████████| 6/6 [01:12<00:00, 12.13s/it]


Text extraction completed!
Text file saved as: output/three_heads_are_better_than_one_extracted.txt





In [26]:
import os
import glob

def count_files_in_folder(folder_path, extension=None, show_list=False):
    """
    Count the regular files directly inside a folder.

    Parameters:
    -----------
    folder_path : str
        Folder to inspect
    extension : str, optional
        Only count names ending in this suffix (e.g., '.md', '.txt', '.json')
    show_list : bool, optional
        Also print a numbered list of the counted file names

    Returns:
    --------
    int
        Number of matching files (sub-directories are not counted)
    """
    name_glob = f"*{extension}" if extension else "*"

    # glob also returns directories, so keep regular files only.
    matches = [
        entry
        for entry in glob.glob(os.path.join(folder_path, name_glob))
        if os.path.isfile(entry)
    ]

    if show_list and matches:
        print(f"Files in {folder_path}:")
        for position, entry in enumerate(matches, 1):
            print(f"  {position}. {os.path.basename(entry)}")

    return len(matches)

# Example usage: list and count the markdown files of one folder...
folder_path = "sectioning_output_100"  # Replace with your folder path
md_count = count_files_in_folder(folder_path, ".md", True)
print("\nTotal number of markdown files: {}".format(md_count))

# ...then count every file in it, whatever the extension.
total_count = count_files_in_folder(folder_path)
print("\nTotal number of all files in {}: {}".format(folder_path, total_count))

Files in sectioning_output_100:
  1. a_framework_for_robust_semantic_interpretation_learning_extracted.md
  2. ambiguity_packing_in_constraintbased_parsing_practical_results_extracted.md
  3. the_automatic_translation_of_discourse_structures_extracted.md
  4. evaluating_automatic_dialogue_strategy_adaptation_for_a_spoken_dialogue_system_extracted.md
  5. insights_into_the_dialogue_processing_of_verbmobil_extracted.md
  6. automatic_selection_of_class_labels_from_a_thesaurus_for_an_effective_semantic_tagging_of_corpora_extracted.md
  7. assigning_function_tags_to_parsed_text_extracted.md
  8. an_information_extraction_core_system_for_real_world_german_text_processing_extracted.md
  9. natural_language_in_four_spatial_interfaces_extracted.md
  10. responding_to_semantically_illformed_input_extracted.md
  11. structure_from_anarchy_meta_level_representation_of_expert_system_propositions_for_natural_language_interfaces_extracted.md
  12. forestbased_statistical_sentence_generation_extracte