In [None]:
"""
This notebook is used to create input pages for the classification pipeline and create ground truth for single pages.

Instructions:

Place the original reports (PDFs) in `data/input/`  
   - The folder should contain PDF files from which pages will be extracted.

Specify the following parameters:  
   - filename: Name of the PDF file to extract pages from.  
   - wanted_pages: The page numbers (1-based) to extract.  
   - out_category: The category subfolder of `data/input/single_pages/` where the extracted pages will be stored.

Set out_category to one of the following categories:  
   - "boreprofile"  
   - "maps" 
   - "text" 
   - "title_page" 
   - "unknown"  
"""

import os
import pymupdf
import sys
import random
from tqdm import tqdm
from pathlib import Path
import json

# Configuration
# The repository root is taken to be the parent of the current working directory.
# NOTE(review): presumably the notebook is expected to be launched from a direct
# subfolder of the repository — confirm, otherwise all paths below are off.
repo_root = Path.cwd().parent.resolve()
# Put the repository root on the import path so the project-local imports below can resolve.
sys.path.append(str(repo_root))

# Project-local imports; kept after the sys.path change above on purpose.
from src.classify_scanned_page import classify_pdf
from main import read_params

In [None]:

# --- Manually extract selected pages of one report into a category folder ---

filename = "2155.pdf"
# Pages are matched against 1-based page numbers below, so 0 never matches:
# range(0, 2) therefore selects page 1 only.
wanted_pages = range(0,2)
# Category subfolder under data/input/single_pages/. While this is empty, the
# pages are written directly into data/input/single_pages/.
out_category = ""

pdf_path = os.path.join(repo_root, "data/input/", filename)
out_dir = os.path.join(repo_root, f"data/input/single_pages/{out_category}")
os.makedirs(out_dir, exist_ok=True)

# File name of the report without its extension; used as prefix of every output file.
report_stem = os.path.splitext(filename)[0]

with pymupdf.open(pdf_path) as doc:
    # Only the index is needed, so iterate over page indices instead of loading each page.
    for page_index in range(doc.page_count):
        page_number = page_index + 1  # 1-based page number used for matching and naming
        if page_number not in wanted_pages:
            continue

        out_path = os.path.join(out_dir, f"{report_stem}_{page_number}.pdf")

        # The context manager closes the single-page document even if
        # insert_pdf() or save() raises.
        with pymupdf.open() as new_doc:
            new_doc.insert_pdf(doc, from_page=page_index, to_page=page_index)
            new_doc.save(out_path)
        print(f"saved to {out_path}")

In [None]:
# --- Classify randomly sampled reports and save a few pages per predicted class ---

input_dir = repo_root / "data/input/reports_no_gt"
output_base_dir = repo_root / "data/input/single_pages"
matching_params = read_params(repo_root / "matching_params.yml")

# Number of reports to sample per run
n_reports = 1

# Max number of pages per class per report to save
max_per_class = {
    "text": 1,
    "boreprofile": 2,
    "maps": 2,
    "title_page": 2,
    "unknown": 1
}

# Get list of downloaded PDFs
pdf_files = list(input_dir.glob("*.pdf"))
if not pdf_files:
    print(f"No PDF files found in {input_dir}")

# random.sample() raises ValueError when asked for more items than are available,
# so cap the sample size at the number of PDFs found (0 for an empty folder).
sampled_pdfs = random.sample(pdf_files, min(n_reports, len(pdf_files)))

# Process each PDF
for pdf_path in tqdm(sampled_pdfs, desc="Classifying and saving selected pages"):
    classification_result = classify_pdf(pdf_path, matching_params)
    if not classification_result:
        # Nothing usable came back for this report; move on to the next one.
        continue

    filename_base = pdf_path.stem
    page_by_class = {}

    # Group pages by class: the first key (other than "Page") whose value is 1
    # is taken as the page's class; pages without such a key count as "unknown".
    for page_info in classification_result["classification"]:
        page_num = page_info["Page"]
        page_class = next(
            (cls for cls, val in page_info.items() if cls != "Page" and val == 1),
            "unknown",
        ).lower()
        page_by_class.setdefault(page_class, []).append(page_num)

    # Save pages
    with pymupdf.open(pdf_path) as doc:
        for class_label, page_nums in page_by_class.items():
            # Classes missing from max_per_class fall back to a single page.
            max_pages = max_per_class.get(class_label, 1)
            selected_pages = random.sample(page_nums, min(len(page_nums), max_pages))

            output_dir = output_base_dir / class_label
            output_dir.mkdir(parents=True, exist_ok=True)

            for page_num in selected_pages:
                out_path = output_dir / f"{filename_base}_{page_num}.pdf"

                # "Page" is converted to a 0-based index for insert_pdf
                # (assumes classify_pdf reports 1-based page numbers — TODO confirm).
                page_index = page_num - 1

                # The context manager closes the single-page document even if
                # insert_pdf() or save() raises.
                with pymupdf.open() as new_doc:
                    new_doc.insert_pdf(doc, from_page=page_index, to_page=page_index)
                    new_doc.save(out_path)

In [None]:
## create groundtruth based on input in subfolder of single pages:
# Every PDF inside a recognised class subfolder becomes one ground-truth entry
# with a single page whose label is derived from the folder name.

input_folder = repo_root/ "data/input/single_pages"

# All possible classes (must match ground truth schema)
classes = ["Text", "Boreprofile", "Maps", "Title_Page", "Unknown"]

# Lower-cased folder name -> class label used in the ground truth
folder_to_class = {
    "text": "Text",
    "boreprofile": "Boreprofile",
    "maps": "Maps",
    "title_page": "Title_Page",
    "unknown": "Unknown"
}

ground_truth = []

# iterdir() and glob() yield entries in arbitrary order; sorting keeps the
# generated JSON identical across runs and machines.
for class_folder in sorted(input_folder.iterdir()):
    if not class_folder.is_dir():
        continue

    folder_name = class_folder.name.lower()
    if folder_name not in folder_to_class:
        print(f"Skipping unrecognized folder: {class_folder.name}")
        continue

    class_label = folder_to_class[folder_name]

    for pdf_file in sorted(class_folder.glob("*.pdf")):
        # One-hot encode the class: 1 for the folder's class, 0 for all others.
        one_hot = {cls: int(cls == class_label) for cls in classes}
        ground_truth.append({
            "filename": pdf_file.name,
            "classification": [{"Page": 1, **one_hot}],
        })

output_path = repo_root /"data/gt_single_pages_new.json"
output_path.parent.mkdir(parents=True, exist_ok=True)
# Explicit encoding so the file does not depend on the platform default.
with output_path.open("w", encoding="utf-8") as f:
    json.dump(ground_truth, f, indent=4)

print(f"Saved {len(ground_truth)} entries to {output_path}")
