In [None]:
%load_ext autoreload
%autoreload 2

import os
import pymupdf
import numpy as np
import math
import regex
from collections import defaultdict

from text import extract_words, create_text_lines, create_text_blocks
from utils import classify_wordpos, classify_text_density, TextWord
from keyword_finding import find_keywords_in_lines


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [106]:
# --- Run configuration -------------------------------------------------------
# All paths are resolved relative to the directory the kernel was started in.
base_dir = os.getcwd()
input_folder = "maps"
filename = "8627_3.pdf"

# Folder that is scanned for PDF files.
pdf_path = os.path.join(base_dir, "data/input", input_folder)
# Output folder named after `filename` without its extension.
out_dir = os.path.join(base_dir, "data/test", os.path.splitext(filename)[0])

# To analyse another dataset, reassign `pdf_path` to that folder here.

In [89]:

def calculate_distance(word1, word2):
    """Return the Euclidean distance between the (x0, y0) corners of two words.

    Args:
        word1: Object exposing ``rect.x0`` and ``rect.y0``.
        word2: Object exposing ``rect.x0`` and ``rect.y0``.

    Returns:
        The straight-line distance between the two (x0, y0) points as a float.
    """
    first_rect, second_rect = word1.rect, word2.rect
    delta_x = first_rect.x0 - second_rect.x0
    delta_y = first_rect.y0 - second_rect.y0
    squared_length = delta_x**2 + delta_y**2
    return math.sqrt(squared_length)

def closest_word_distances(words):
    """Return, for every word, the distance to its nearest other word.

    Distances are measured with ``calculate_distance``. The result has one
    entry per input word, in input order.

    Args:
        words: Sequence of words; it is sliced and concatenated, so it must
            support both operations (e.g. a list).

    Returns:
        A list of nearest-neighbour distances, or an empty list when fewer
        than two words are given.
    """
    if not words:
        return []
    if len(words) < 2:
        return []

    nearest = []
    for position, current in enumerate(words):
        # Every word except the one at `position`.
        candidates = words[:position] + words[position + 1:]
        nearest.append(min(calculate_distance(current, other) for other in candidates))
    return nearest

In [90]:
# Keywords passed to find_keywords_in_lines by classify_on_keywords; a hit
# labels the page as "boreprofile".
keywords_boreprofile = ["bohrung", "bohrprofil"]


def find_maps_pattern(words: list[TextWord]):
    """Return the first map-scale match (e.g. ``1:25000``) found in ``words``.

    Each word's ``text`` is searched for ``1``, optional whitespace, ``:``,
    optional whitespace, then a denominator that starts with 1, 2 or 5 and
    continues with any run of the digits 0 and 5.

    Args:
        words: Words to scan in order; only their ``text`` attribute is read.

    Returns:
        The match object for the first word containing such an expression, or
        ``None`` when no word matches.
    """
    # Character classes list their members without separators. The earlier
    # "[1,2,5,]" / "[0,5]" also accepted a literal comma, so text such as
    # "1:," was reported as a scale.
    pattern = regex.compile(r"1\s*:\s*[125][05]*")
    for word in words:
        match = pattern.search(word.text)
        if match:
            return match
    return None
            
def classify_on_keywords(lines, words):
    """Label a page from keyword hits in its lines or a scale pattern in its words.

    Args:
        lines: Text lines handed to ``find_keywords_in_lines`` together with
            ``keywords_boreprofile``.
        words: Words handed to ``find_maps_pattern``.

    Returns:
        ``"boreprofile"`` when the keyword search returns a truthy result,
        otherwise ``"map"`` when ``find_maps_pattern`` finds a match,
        otherwise ``None``.
    """
    # The keyword check takes precedence over the scale pattern.
    if find_keywords_in_lines(lines, keywords_boreprofile):
        return "boreprofile"
    return "map" if find_maps_pattern(words) else None


In [None]:

def y0_word_cluster(all_words, tolerance: int = 10):
    """Group items whose ``rect.y0`` values lie close to each other.

    Items are processed in input order. Each item joins the first existing
    cluster whose reference y0 (the y0 of the item that opened the cluster)
    differs from its own y0 by at most ``tolerance``; otherwise it opens a new
    cluster. The reference value of a cluster never changes afterwards.

    Args:
        all_words: Items exposing ``rect.y0``.
        tolerance: Maximum absolute y0 difference to a cluster's reference.

    Returns:
        A list of clusters (lists of items) in the order the clusters were
        opened, or an empty list when ``all_words`` is empty.
    """
    if not all_words:
        return []

    # Maps a cluster's reference y0 to its members; dict order = creation order.
    members_by_reference = {}

    for item in all_words:
        top = item.rect.y0
        # First reference within tolerance, falling back to the item's own y0.
        reference = next(
            (candidate for candidate in members_by_reference if abs(candidate - top) <= tolerance),
            top,
        )
        members_by_reference.setdefault(reference, []).append(item)

    return list(members_by_reference.values())

In [107]:
# Classify every page of every PDF in `pdf_path` as text / boreprofile / map /
# unknown and print the share of each class.
count = 0  # total number of pages visited
text_count = 0
boreprofile_count = 0
map_count = 0
unknown_count = 0

# NOTE(review): the loop variable reuses the name `filename` from the
# configuration cell, so that value is overwritten once this cell has run.
for filename in os.listdir(pdf_path):
    # if count >=1:
    #     break
    if not filename.lower().endswith('.pdf'):
        continue
    file_path = os.path.join(pdf_path, filename)

    with pymupdf.Document(file_path) as doc:
        for page_index, page in enumerate(doc):
            count += 1

            # page info
            page_number = page_index + 1
            page_size = (page.rect.width, page.rect.height)

            words = extract_words(page, page_number)
            if not words:
                # No words extracted: nothing to base a classification on.
                unknown_count += 1
                continue

            # words attributes
            word_attributes = classify_text_density(words, page_size)
            words_position = classify_wordpos(words)
            distances = closest_word_distances(words)
            # `distances` is empty for a single-word page, hence the None fallback.
            median_distance = np.median(distances) if distances else None

            # line attributes
            lines = create_text_lines(page, page_number)
            words_per_line = [len(line.words) for line in lines]

            # textblock attributes
            text_blocks = create_text_blocks(lines)
            block_area = sum(block.rect.get_area() for block in text_blocks)
            # Area of all words that sit on lines holding more than one word.
            word_area = sum(
                word.rect.get_area()
                for block in text_blocks
                for line in block.lines
                for word in line.words
                if len(line.words) > 1
            )

            classified = False
            # classify based on word density within textblock and words per line
            # `block_area` is 0 when there are no text blocks; guard the
            # division so such a page falls through to the other checks
            # instead of raising ZeroDivisionError.
            if block_area > 0 and word_area / block_area > 1 and np.mean(words_per_line) > 3:
                text_count += 1
                classified = True

            else:
                # classify by keywords
                classify_keywords = classify_on_keywords(lines, words)
                if classify_keywords == "boreprofile":
                    boreprofile_count += 1
                    classified = True
                # elif classify_keywords == "map":
                #     map_count += 1
                #     classified = True

                # classify based on y0 alignment
                if not classified:
                    ## could also do text_blocks...
                    clusters = y0_word_cluster(lines)
                    ## filter out clusters with only one line
                    filtered_clusters = [cluster for cluster in clusters if len(cluster) >= 2]
                    if filtered_clusters:
                        ### boreprofile: smaller Euclidean distance + longer clusters
                        # `median_distance` may be None (see above); comparing
                        # None with a number would raise TypeError.
                        if (
                            median_distance is not None
                            and median_distance < 20
                            and max(len(cluster) for cluster in filtered_clusters) > 4
                        ):
                            boreprofile_count += 1
                        else:
                            map_count += 1
                    else:
                        map_count += 1

# With no pages processed every counter is 0; dividing by 1 instead of 0 then
# reports a share of 0.0 rather than raising ZeroDivisionError.
share_denominator = max(count, 1)
print(f"text: {text_count, round(text_count / share_denominator, 4)}")
print(f"boreprofile: {boreprofile_count, round(boreprofile_count / share_denominator, 4)}")
print(f"map: {map_count, round(map_count / share_denominator, 4)}")
print(f"unknown: {unknown_count, round(unknown_count / share_denominator, 4)}")
print(f"total {count}")

text: (0, 0.0)
boreprofile: (3, 0.1304)
map: (20, 0.8696)
unknown: (0, 0.0)
total 23


In [93]:
import numpy as np
import matplotlib.pyplot as plt

from skimage import measure

# Synthetic test field: two open 100-point axes spanning [-pi, pi], combined
# by broadcasting into a 2-D array `r`.
x, y = np.ogrid[-np.pi:np.pi:100j, -np.pi:np.pi:100j]
r = np.sin(np.exp(np.sin(x) ** 3 + np.cos(y) ** 2))

# Contours of `r` at the constant value 0.8.
contours = measure.find_contours(r, 0.8)