In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import pandas as pd
import pymupdf
from tqdm import tqdm
import cv2
import logging
from typing import List
import sys


# Project root is taken to be the parent of the current working directory
# (presumably the notebook is started from a direct subfolder of the repository).
base_dir = Path.cwd().parent.resolve()

# Make the modules under `<root>/src` importable from this notebook.
src_path = base_dir / "src"
if str(src_path) not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(str(src_path))
print(src_path)

from line_detection import extract_geometric_lines

/home/lillemor/Documents/lgd-utils/asset-data-extraction/src


Features that might be interesting:
**edges** 
    - edge density: # edge pixels / total # pixels
**line**
    - line count
    - mean line length 
    - std line length (length distribution)
    - merged long lines
        - straight lines
        - curved lines
**line angles**
    - # angles non 0/90
    - # angles 0/90
    - ratio of angles: (# angles non 0/90) / (# angles 0/90)
**pixel histogram**
    - histogram entropy of pixel intensities

In [None]:
## Extract features that might help to distinguish maps from other page types

def extract_map_features(image: np.ndarray, page, angle_tolerance: float = 2.0) -> dict:
    """Extract geometric and intensity-based features from a map-like image.

    Args:
        image: Page raster as a 3-channel array in BGR channel order (it is
            converted with ``cv2.COLOR_BGR2GRAY``).
        page: Page object that is forwarded unchanged to
            ``extract_geometric_lines``.
        angle_tolerance: Maximum deviation, in degrees, from a multiple of 90
            for a line to be counted as a grid (horizontal/vertical) line.

    Returns:
        A dict with the keys ``edge_density``, ``line_count``, ``line_len``,
        ``line_len_std``, ``grid_angle_count``, ``non_grid_angle_count``,
        ``ratio_angles`` and ``hist_entropy``. ``line_len``, ``line_len_std``
        and ``ratio_angles`` are ``None`` when no lines were detected;
        ``ratio_angles`` is also ``None`` when there is no grid line to
        divide by.
    """
    features = {}

    # Grayscale + light blur; the blurred image feeds the intensity histogram.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), sigmaX=1.2)

    # --- Line and edge features ---
    edges, lines_obj = extract_geometric_lines(page)
    features["edge_density"] = np.mean(edges > 0)  # fraction of non-zero edge pixels
    features["line_count"] = len(lines_obj) if lines_obj is not None else 0

    if lines_obj:
        # Length-based features
        line_lengths = [line.length for line in lines_obj]
        features["line_len"] = np.mean(line_lengths)
        features["line_len_std"] = np.std(line_lengths)

        # Angle-based features: classify every line exactly once instead of
        # searching the grid list for each angle (which was quadratic).
        grid_count = sum(
            1 for line in lines_obj
            if _is_grid_angle(line.line_angle, angle_tolerance)
        )
        non_grid_count = len(lines_obj) - grid_count

        features["grid_angle_count"] = grid_count
        features["non_grid_angle_count"] = non_grid_count
        features["ratio_angles"] = non_grid_count / grid_count if grid_count else None
    else:
        features.update({
            "line_len": None,
            "line_len_std": None,
            "grid_angle_count": 0,
            "non_grid_angle_count": 0,
            "ratio_angles": None,
        })

    # --- Pixel histogram ---
    # 16-bin intensity histogram, normalised to a probability distribution;
    # the small epsilon keeps log2 finite for empty bins.
    hist = cv2.calcHist([blurred], [0], None, [16], [0, 256])
    hist_norm = hist / hist.sum()
    features["hist_entropy"] = -np.sum(hist_norm * np.log2(hist_norm + 1e-7))

    return features


def _is_grid_angle(angle: float, tolerance: float) -> bool:
    """Return True when ``angle`` (degrees) is within ``tolerance`` of a multiple of 90."""
    # Folding modulo 90 treats 0, 90, 180 and also -90 or 270 alike, so the
    # result does not depend on the angle range the line detector reports.
    remainder = angle % 90.0
    return min(remainder, 90.0 - remainder) < tolerance


In [None]:
# Helper functions for extracting and plotting features per subfolder
def classify_pdf_with_features(file_path: Path, subfolder: str) -> List[dict]:
    """Extract one feature record per page of a PDF file.

    Every page is rendered at 150 dpi, converted to BGR channel order and
    passed to ``extract_map_features``. The resulting feature dict is extended
    with the bookkeeping keys ``subfolder``, ``file`` and ``page`` (1-based).

    Args:
        file_path: Path to the PDF file.
        subfolder: Label stored in each record's ``subfolder`` field.

    Returns:
        A list with one dict per page, or an empty list (after logging an
        error) when ``file_path`` is not an existing ``.pdf`` file.
    """
    if not file_path.is_file() or file_path.suffix.lower() != '.pdf':
        logging.error(f"Invalid file path: {file_path}. Must be a valid PDF file.")
        return []

    data = []
    with pymupdf.open(file_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=150)
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)

            # Pixmap samples come in RGB(A) order, while the feature extraction
            # converts with COLOR_BGR2GRAY; normalise every layout to BGR.
            if pix.n == 4:
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
            elif pix.n == 3:
                # Previously passed through unchanged, i.e. with red and blue swapped.
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            elif pix.n == 1:
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

            features = extract_map_features(img, page)
            features["subfolder"] = subfolder
            features["file"] = file_path.name
            features["page"] = page_number
            data.append(features)

    return data


def process_pdfs_with_features(pdf_dirs: List[Path]) -> pd.DataFrame:
    """Build a single feature table from all PDFs found in the given folders.

    For every folder, each ``*.pdf`` file directly inside it is handed to
    ``classify_pdf_with_features`` together with the folder's name, and the
    returned per-page records are concatenated. A progress bar is shown per
    folder.

    Args:
        pdf_dirs: Folders to scan (non-recursively) for PDF files.

    Returns:
        A DataFrame with one row per record returned for the processed files.
    """
    records = []
    for folder in pdf_dirs:
        label = folder.name
        documents = list(folder.glob("*.pdf"))
        with tqdm(total=len(documents)) as progress:
            for document in documents:
                progress.set_description(f"Processing {document.name}")
                records.extend(classify_pdf_with_features(document, label))
                progress.update(1)
    return pd.DataFrame(records)


def plot_feature_distributions(df: pd.DataFrame):
    """Show one box plot per extracted feature, grouped by the ``subfolder`` column.

    A feature is skipped with a warning, instead of aborting the remaining
    plots, when its column is missing from ``df`` or contains no numeric
    value (e.g. a column that is entirely ``None``). Nothing is plotted when
    ``df`` has no ``subfolder`` column, which is the case for an empty table.

    Args:
        df: Feature table with a ``subfolder`` column and one column per feature.
    """
    features = ["edge_density", "line_count", "line_len", "line_len_std",
                "non_grid_angle_count", "grid_angle_count", "ratio_angles", "hist_entropy"]

    if "subfolder" not in df.columns:
        logging.warning("No 'subfolder' column in the feature table; nothing to plot.")
        return

    for feature in features:
        if feature not in df.columns:
            logging.warning(f"Feature '{feature}' is missing from the table; skipping plot.")
            continue

        # All-None columns arrive with object dtype; coerce so they can be
        # detected as empty and so the plot always receives numeric values.
        values = pd.to_numeric(df[feature], errors="coerce")
        if not values.notna().any():
            logging.warning(f"Feature '{feature}' has no numeric values; skipping plot.")
            continue

        plt.figure(figsize=(8, 5))
        sns.boxplot(data=df.assign(**{feature: values}), x="subfolder", y=feature)
        plt.title(f"{feature} by Subfolder")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

In [None]:
# One input folder per page type; the folder name becomes the `subfolder` label.
input_dir = base_dir / "data" / "input" / "single_pages"
pdf_dirs = [input_dir / page_type for page_type in ("maps", "text", "boreprofile", "unknown")]
df = process_pdfs_with_features(pdf_dirs)

In [None]:
# Box plots of every extracted feature, grouped by subfolder.
plot_feature_distributions(df)
# Persist the feature table; the relative path resolves against the current working directory.
df.to_csv("pdf_page_features.csv", index=False)
# Per-subfolder summary statistics of the feature table.
display(df.groupby("subfolder").describe())

In [None]:
## Collect and plot the angle and line-length distributions per page-type subfolder

def collect_angle_length_data(pdf_dir: Path):
    """Gather the angles and lengths of all lines detected in a folder of PDFs.

    Every page of every ``*.pdf`` file directly inside ``pdf_dir`` is passed to
    ``extract_geometric_lines``; pages that yield no lines contribute nothing.

    Args:
        pdf_dir: Folder to scan (non-recursively) for PDF files.

    Returns:
        A tuple ``(angles, lengths)`` of two flat lists holding each detected
        line's ``line_angle`` and ``length`` respectively.
    """
    angles = []
    lengths = []

    for pdf_file in pdf_dir.glob("*.pdf"):
        with pymupdf.open(pdf_file) as document:
            for page in document:
                _edges, detected = extract_geometric_lines(page)
                if not detected:
                    continue
                lengths.extend(segment.length for segment in detected)
                angles.extend(segment.line_angle for segment in detected)

    return angles, lengths

def plot_angle_and_length_histograms_by_subfolder(pdf_dirs: list[Path]):
    """Show line-angle and line-length histograms for each folder of PDFs.

    For every folder the line data is gathered with
    ``collect_angle_length_data``, summary statistics of the line lengths are
    printed, and two figures are shown: the angle distribution (36 bins) and
    the line-length distribution (30 bins, logarithmic y-axis).

    Args:
        pdf_dirs: Folders to process; each folder's name appears in the plot titles.
    """

    def _show_histogram(values, bins, title, xlabel, log_y=False):
        # One stand-alone figure per histogram.
        plt.figure(figsize=(10, 5))
        sns.histplot(values, bins=bins)
        if log_y:
            plt.yscale("log")
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel("Frequency")
        plt.tight_layout()
        plt.show()

    for folder in pdf_dirs:
        label = folder.name
        angles, lengths = collect_angle_length_data(folder)
        print(pd.Series(lengths).describe())

        _show_histogram(angles, 36, f"Angle Distribution - {label}", "Angle (degrees)")
        _show_histogram(
            lengths,
            30,
            f"Line Length Distribution (Log Scale)- {label}",
            "Line Length (pixels)",
            log_y=True,
        )

In [None]:
# Rebuild the list of page-type folders so this cell does not depend on `pdf_dirs` from above.
pdf_dirs = [input_dir / page_type for page_type in ("maps", "text", "boreprofile", "unknown")]
plot_angle_and_length_histograms_by_subfolder(pdf_dirs)