In [1]:
%load_ext autoreload
%autoreload 2

import pymupdf
import cv2
from pathlib import Path
import sys
import numpy as np
import matplotlib.pyplot as plt
from math import degrees, atan2,sqrt
from dataclasses import dataclass

from skimage import io as io
from skimage.morphology import skeletonize
from skimage.color import rgb2gray
from skimage.filters import gaussian,threshold_otsu
from skimage.measure import find_contours,approximate_polygon,label
from skimage.transform import probabilistic_hough_line

# Project root: the parent of the current working directory, as an absolute path.
base_dir = Path.cwd().parent.resolve()
# Make the project root importable. Guarded so that re-running this cell does
# not keep appending the same entry to sys.path.
if str(base_dir) not in sys.path:
    sys.path.append(str(base_dir))

from src.line_detection import turn_page_to_image, extract_geometric_lines
from src.text_objects import create_text_lines
from src.geometric_objects import Line, Point

In [2]:
@dataclass
class ConnectedLine:
    """Polyline described by an ordered list of points.

    After construction, ``start`` and ``end`` hold the first and last point
    when at least two points are given; otherwise both are ``None``.
    """

    points: list[Point]

    def __post_init__(self):
        # A polyline needs two points before it has meaningful end points.
        has_segment = len(self.points) >= 2
        self.start = self.points[0] if has_segment else None
        self.end = self.points[-1] if has_segment else None

    @property
    def naive_length(self):
        """Sum of the distances between consecutive points (0.0 for fewer than two points)."""
        total = 0.0
        # Pair every point with its successor; yields nothing for < 2 points.
        for previous, current in zip(self.points, self.points[1:]):
            total += current.distance_to(previous)
        return total
    
def calc_angle(point1, point2) -> float:
    """Return the undirected angle of the segment point1 -> point2 in degrees.

    The result is reduced modulo 180, so a segment and its reverse give the
    same value. Both arguments must expose ``x`` and ``y`` attributes.
    """
    delta_x, delta_y = point2.x - point1.x, point2.y - point1.y
    heading = degrees(atan2(delta_y, delta_x))
    return heading % 180

def connect_lines_by_nearest_angle(
    lines: list[Line],
    max_neighbors: int = 5,
    max_dist: float = 100,
    angle_thresh: float = 35
) -> list[ConnectedLine]:
    """Greedily chain line segments whose endpoints are close and similarly oriented.

    Every not-yet-used line seeds a chain ``[line.start, line.end]``. The chain
    is then extended from its current end point: the nearest endpoints of other
    lines are looked up in a KD-tree, and the first unused candidate whose
    connection angle deviates from the chain's current direction by at most
    ``angle_thresh`` degrees is appended (matched endpoint, then the opposite
    endpoint of that line). Chains only grow forward from ``line.end``.

    Args:
        lines: Segments to connect. Each must be hashable and expose ``start``,
            ``end`` (points with ``x``, ``y`` and ``tuple``) and ``line_angle``.
        max_neighbors: Number of nearest endpoints examined per extension step.
        max_dist: Maximum distance between the chain end and a candidate endpoint.
        angle_thresh: Maximum allowed direction change in degrees.

    Returns:
        One ``ConnectedLine`` per seed line; every input line is used exactly once.
        An empty input yields an empty list.
    """
    # No segments means no endpoints to index and no chains to build.
    if not lines:
        return []

    from scipy.spatial import cKDTree

    # Endpoint coordinates for the KD-tree and, at the same index, the owning
    # line, the endpoint itself and the opposite endpoint. A parallel list
    # (rather than a dict keyed by coordinates) keeps endpoints that several
    # lines share distinct, so none of those lines becomes unreachable.
    all_points = []
    endpoint_owner = []
    for line in lines:
        for pt, other_end in ((line.start, line.end), (line.end, line.start)):
            all_points.append(pt.tuple)
            endpoint_owner.append((line, pt, other_end))

    # Build the tree and initialise the set of used lines and the result list.
    tree = cKDTree(all_points)
    visited_lines = set()
    connected_lines = []

    for line in lines:
        # Lines already absorbed into an earlier chain do not seed a new one.
        if line in visited_lines:
            continue

        # Seed the chain with this line and remember where and in which
        # direction it currently ends.
        visited_lines.add(line)
        path = [line.start, line.end]
        # NOTE(review): assumes line_angle is in degrees like calc_angle — confirm.
        current_angle = line.line_angle
        current_point = line.end

        while True:
            # Nearest endpoints around the chain end, closest first.
            dists, idxs = tree.query(current_point.tuple, k=max_neighbors, distance_upper_bound=max_dist)
            # With k=1 the query returns scalars; normalise to 1-D arrays.
            dists = np.atleast_1d(dists)
            idxs = np.atleast_1d(idxs)

            found_match = False
            for dist, idx in zip(dists, idxs):
                if np.isinf(dist):
                    break  # No more valid neighbors within max_dist

                neighbor_line, matched_point, extension_point = endpoint_owner[idx]
                if neighbor_line in visited_lines:
                    continue  # skip if already used (includes the chain's own end point)

                if dist > 0:
                    # Direction of the gap between the chain end and the candidate endpoint.
                    angle = calc_angle(current_point, matched_point)
                else:
                    # The endpoints coincide, so the gap has no direction
                    # (atan2(0, 0) would report 0 degrees). Use the direction
                    # of the candidate segment itself instead.
                    angle = calc_angle(matched_point, extension_point)

                # Undirected angular difference in [0, 90]. The modulo keeps the
                # comparison valid even if line_angle lies outside [0, 180).
                angle_diff = abs(angle - current_angle) % 180
                angle_diff = min(angle_diff, 180 - angle_diff)

                if angle_diff <= angle_thresh:
                    path.append(matched_point)
                    path.append(extension_point)
                    current_point = extension_point
                    # The chain now points along the segment just appended.
                    current_angle = calc_angle(matched_point, current_point)

                    visited_lines.add(neighbor_line)
                    found_match = True
                    break  # Found a valid extension, continue from here

            if not found_match:
                break  # No further extension possible

        connected_lines.append(ConnectedLine(points=path))

    return connected_lines

In [3]:
# Page to analyse. Swap in one of the commented alternatives to try another document type.
pdf_path = base_dir / "data/input/single_pages/maps/45004_28.pdf"
# pdf_path = base_dir / "data/input/single_pages/boreprofile/29192_250.pdf"
# pdf_path = base_dir / "data/input/single_pages/text/1801_9.pdf"
# pdf_path = base_dir / "data/input/single_pages/unknown/1432_8.pdf"

# Page number handed to the text-line helper further down.
page_number = 1

# Open the document and take its first page.
doc = pymupdf.open(pdf_path)
page = doc[0]

In [4]:
# Page rendered as an image; used below as the shape/dtype template for the canvas.
image = turn_page_to_image(page)

# Edge map and line segments from the project helper, plus the page's text lines.
edges, lines = extract_geometric_lines(page)
text_lines = create_text_lines(page, page_number)

# Mean font size serves as the length scale for filtering. Without any text
# line np.mean([]) would be NaN (every comparison below would then be False
# and all lines would be dropped), so fall back to 0: no length filter.
text_font_size = np.mean([line.font_size for line in text_lines]) if text_lines else 0.0

# Keep segments longer than sqrt(2) times the mean font size.
longer_lines = [line for line in lines if line.length > text_font_size * sqrt(2)]

# Draw the kept segments on a blank canvas.
line_image = np.zeros_like(image)
for lin in longer_lines:
    cv2.line(line_image, (int(lin.start.x), int(lin.start.y)),
                         (int(lin.end.x), int(lin.end.y)),
                         (225, 225, 225), 2)

# Save the image with LSD lines. The filename is passed as str because not
# every OpenCV build accepts pathlib.Path objects.
cv2.imwrite(str(base_dir / "data/lines_LSD.png"), line_image)

True

In [None]:
# Probabilistic Hough transform on the edge map; each result is a pair of end points.
lines_HL = probabilistic_hough_line(edges, threshold=10, line_length=5, line_gap=3)

# Draw on a fresh canvas so that lines_HL.png contains only the Hough lines.
# Reusing the canvas of the previous cell would mix in the lines drawn there
# and make this cell accumulate strokes on every re-run.
hough_image = np.zeros_like(image)
for p0, p1 in lines_HL:
    # Cast to plain ints so cv2.line accepts the coordinates regardless of their integer type.
    cv2.line(hough_image, tuple(map(int, p0)), tuple(map(int, p1)), (225, 225, 225), 2)

# Save the image with lines and processed images. Filenames are passed as str
# because not every OpenCV build accepts pathlib.Path objects.
cv2.imwrite(str(base_dir / "data/lines_HL.png"), hough_image)

# Intermediate stages of the preprocessing, written out for visual inspection.
image = turn_page_to_image(page)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, (5, 5), sigmaX=1.2)
cv2.imwrite(str(base_dir / "data/image.png"), image)

cv2.imwrite(str(base_dir / "data/gray.png"), gray)
cv2.imwrite(str(base_dir / "data/blurred.png"), blurred)
cv2.imwrite(str(base_dir / "data/edges.png"), edges)




True

In [6]:
# Chain the detected segments into polylines.
connected_lines = connect_lines_by_nearest_angle(lines)
# Keep only chains whose summed segment length exceeds 4 * sqrt(2) times the mean font size.
longer_connected_lines = [
    chain
    for chain in connected_lines
    if chain.naive_length > 4 * text_font_size * sqrt(2)
]

In [7]:
# Fixed seed so that every chain gets the same colour on each run.
rng = np.random.default_rng(seed=42)
canvas = np.zeros_like(image)
for cl in longer_connected_lines:
    if len(cl.points) < 2:
        continue  # nothing to draw without at least one segment
    # One random colour per chain; the lower bound of 50 avoids near-black strokes on the black canvas.
    color = tuple(rng.integers(50, 255, size=3).tolist())
    # Integer pixel coordinates of the chain's vertices.
    pixel_points = [tuple(map(int, pt.tuple)) for pt in cl.points]
    for pt1, pt2 in zip(pixel_points, pixel_points[1:]):
        cv2.line(canvas, pt1, pt2, color, 2)
# The filename is passed as str because not every OpenCV build accepts pathlib.Path objects.
cv2.imwrite(str(base_dir / "data/visualized_mask.png"), canvas)


True

In [None]:
# Grey-level version of the page image and an all-zero canvas of the same shape.
gray_image = rgb2gray(image)
filtered_mask = np.zeros_like(gray_image)

# Smooth, then split at the Otsu threshold; pixels darker than it are foreground.
denoised_image = gaussian(gray_image, sigma=2)
thresh = threshold_otsu(denoised_image)
binary_image = denoised_image < thresh

# The same mask as a float array of zeros and ones.
binary = binary_image.astype(np.float64)

# Iso-contours of the mask and the number of connected foreground regions.
contours = find_contours(binary, level=0.2)
labs, count = label(binary, return_num=True)
print('{} shapes were found'.format(count))

# Contours are (row, col) arrays, hence column 1 on the x axis.
for contour in contours:
    plt.plot(contour[:, 1], contour[:, 0], linewidth=0.5)
# NOTE(review): this shows the all-zero canvas, not gray_image, although the
# title says "Grayscale Image" — confirm which background is intended.
plt.imshow(filtered_mask, cmap='gray')
plt.axis('off')
plt.title('Grayscale Image with Detected Contours')
plt.savefig(Path(base_dir) / "data" / "contours.png", bbox_inches='tight', dpi=300)

plt.close()

1059 shapes were found


In [None]:
## Approximated polygons
# Draw on an explicit figure instead of whatever figure happens to be current.
fig, ax = plt.subplots()
for contour in contours:
    if contour.ndim == 3 and contour.shape[1] == 1:
        # OpenCV layout (N, 1, 2) holding (x, y) points: drop the singleton axis.
        xy = contour[:, 0]
    elif contour.ndim == 2 and contour.shape[1] == 2:
        # Plain (N, 2) array, assumed to come from skimage's find_contours,
        # which stores (row, col): reverse the columns to get (x, y).
        xy = contour[:, ::-1]
    else:
        print(f" invalid contour with shape: {contour.shape}")
        continue

    coords = approximate_polygon(xy, tolerance=2.5)
    ax.plot(coords[:, 0], coords[:, 1], '-r')

# Image coordinates grow downwards; flip the y axis and use equal scaling so
# the polygons appear in the same orientation and proportions as the page.
ax.invert_yaxis()
ax.set_aspect('equal')
fig.savefig(Path(base_dir) / "data" / "polygon.png", bbox_inches='tight', dpi=300)

In [9]:
### Morphological cleaning
# Convert to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Denoise with a small Gaussian blur
blurred = cv2.GaussianBlur(gray, (5, 5), sigmaX=1.2)

# Adaptive thresholding to enhance lines: each pixel is compared with the mean
# of its 15x15 neighbourhood minus 10, and the result is inverted.
thresh = cv2.adaptiveThreshold(
    blurred, 255,
    cv2.ADAPTIVE_THRESH_MEAN_C,
    cv2.THRESH_BINARY_INV,
    blockSize=15,
    C=10
)

# Morphological cleanup: one opening pass with a 3x3 rectangular kernel.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
cleaned = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
# The filename is passed as str because not every OpenCV build accepts pathlib.Path objects.
cv2.imwrite(str(base_dir / "data/cleaned.png"), cleaned)

True

In [10]:
### Morphological extraction

# Minimum contour length (in pixels) for a contour to be drawn. 0 keeps every
# contour; raise it (e.g. to 50) to drop short fragments.
MIN_CONTOUR_LENGTH = 0.0

# Close small gaps in the edge map. Relies on `kernel` from the morphological cleaning cell.
closed = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel)

# Skeletonize the result to 1-pixel width lines (stored as 0/255 uint8).
skeleton = skeletonize(closed > 0).astype(np.uint8) * 255

# Outer contours of the skeleton, with every contour point kept.
contours, hierarchy = cv2.findContours(skeleton, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

# Create blank canvas to draw filtered contours
filtered_mask = np.zeros_like(skeleton)

for cnt in contours:
    # Length of the contour treated as an open curve.
    length = cv2.arcLength(cnt, False)
    if length >= MIN_CONTOUR_LENGTH:
        cv2.drawContours(filtered_mask, [cnt], -1, 255, thickness=1)

# Save the final filtered contour line map. The filename is passed as str
# because not every OpenCV build accepts pathlib.Path objects.
cv2.imwrite(str(base_dir / "data/contour_line_mask_grayscale.png"), filtered_mask)

True