In [1]:
%load_ext autoreload
%autoreload 2

import os
import pymupdf
import numpy as np
import pandas as pd
import math
import regex
from collections import defaultdict
import logging
from tabulate import tabulate
import sys

repo_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(os.path.join(repo_root, "src"))

logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)

from text import extract_words, create_text_lines, create_text_blocks
from utils import TextWord
from keyword_finding import find_keywords_in_lines
from title_page import title_page_type
from utils import closest_word_distances, y0_word_cluster

In [5]:
input_folder = "maps"
filename ="8627_3.pdf"

pdf_path = os.path.join(repo_root, "data/input", input_folder)
out_dir = os.path.join(repo_root, "data/test", os.path.splitext(filename)[0])

#pdf_path = "/home/lillemor/PycharmProjects/swissgeol-boreholes-dataextraction/data/geoquat/validation/"

ground_truth_path = os.path.join(repo_root,"data/ground_truth_maps.csv")

In [3]:
keywords_boreprofile = ["bohrung", "bohrprofil", "sondage"]

pattern_maps = [
    regex.compile(r"1\s*:\s*[125](25|5)?000+"),
    regex.compile(r"1\s*:\s*[125]((0{1,2})?([',]000)+)")
]

def find_maps_pattern(words: list[TextWord]) -> regex.Match | None:
    return next((match 
                 for pattern in pattern_maps 
                 for word in words 
                 if (match := pattern.search(word.text))), None)


def classify_on_keywords(lines: list[str], words: list[TextWord]) -> str | None:

    if find_keywords_in_lines(lines, keywords_boreprofile):
        return "boreprofile"
    if find_maps_pattern(words):
        return "map"  
    return None


In [6]:
# Classification tracking
classification_counts = {
    "Text": 0,
    "Title Page": 0,
    "Boreprofile": 0,
    "Map": 0,
    "Unknown": 0
}
total_pages = 0
classification_data = []


for filename in os.listdir(pdf_path):
    if filename.lower().endswith('.pdf'):
        file_path = os.path.join(pdf_path, filename)

        with pymupdf.Document(file_path) as doc:
            for page_index, page in enumerate(doc):
                total_pages += 1
                page_number = page_index + 1
                text = page.get_text()

                words = extract_words(page, page_number)
                if not words:
                    classification_counts["Unknown"] += 1
                    classification_data.append({
                        "Filename": filename,
                        "Page Number": page_number,
                        "Classification": "Unknown"
                    })
                    continue

                # Compute word distances and line attributes
                distances = closest_word_distances(words)
                median_distance = np.median(distances) if distances else None
                lines = create_text_lines(page, page_number)
                words_per_line = [len(line.words) for line in lines]
                mean_words_per_line = np.mean(words_per_line) if words_per_line else 0

                # Compute text block attributes
                text_blocks = create_text_blocks(lines)
                block_area = sum(block.rect.get_area() for block in text_blocks)
                word_area = sum(word.rect.get_area()
                                for block in text_blocks
                                for line in block.lines
                                for word in line.words if len(line.words) > 1)

                classification = "Unknown"

                # Rule-based classification
                if block_area > 0 and word_area / block_area > 1 and mean_words_per_line > 3:
                    classification = "Title Page" if title_page_type(text) else "Text"
                else:
                    classify_keywords = classify_on_keywords(lines, words)
                    if classify_keywords in classification_counts:
                        classification = classify_keywords
                    else:
                        clusters = y0_word_cluster(lines)
                        filtered_clusters = [cluster for cluster in clusters if len(cluster) > 1]
                        longest_cluster = max(map(len, filtered_clusters), default=0)

                        if median_distance is not None and median_distance < 20 and longest_cluster > 4:
                            classification = "Boreprofile"
                        else:
                            classification = "Map"

                # Update class counts
                classification_counts[classification] += 1
                classification_data.append({
                    "Filename": filename,
                    "Page Number": page_number,
                    "Classification": classification
                })

df = pd.DataFrame(classification_data)

# classification summary
summary = pd.DataFrame.from_dict(classification_counts, orient='index', columns=['Count'])
summary['Percentage'] = (summary['Count'] / total_pages * 100).round(2)

logging.info("Classification Summary:")
logging.info(tabulate(summary, headers="keys", tablefmt="grid"))

#Save results to CSV
df.to_csv(os.path.join(repo_root,"data/classification_results.csv"), index=False)

# TODO: implement groundtruth
try:
    ground_truth = pd.read_csv(ground_truth_path)
    df = df.merge(ground_truth, on=["Filename", "Page Number"], how="left")
    df["Correct"] = df["Classification"] == df["True Label"]

    from sklearn.metrics import classification_report
    report = classification_report(df["True Label"], df["Classification"], zero_division=0)
    logging.info("\nClassification Report:\n" + report)
except FileNotFoundError:
    logging.info("\nNo ground truth available for evaluation.")


Classification Summary:
+-------------+---------+--------------+
|             |   Count |   Percentage |
| Text        |       0 |         0    |
+-------------+---------+--------------+
| Title Page  |       0 |         0    |
+-------------+---------+--------------+
| Boreprofile |       3 |        13.04 |
+-------------+---------+--------------+
| Map         |      20 |        86.96 |
+-------------+---------+--------------+
| Unknown     |       0 |         0    |
+-------------+---------+--------------+

Classification Report:
              precision    recall  f1-score   support

 Boreprofile       0.00      0.00      0.00         0
         Map       1.00      0.87      0.93        23

    accuracy                           0.87        23
   macro avg       0.50      0.43      0.47        23
weighted avg       1.00      0.87      0.93        23

