In [17]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar

In [None]:
class PDFSearch:
    """Keyword search over the text blocks of a single PDF file.

    The PDF is parsed with pdfminer's ``extract_pages``; every text container
    found directly on a page becomes one "block" (a dict holding its text and
    its bounding box).
    """

    def __init__(self, file_path):
        """Remember the path of the PDF to search.

        Args:
            file_path: Path handed unchanged to ``extract_pages``.
        """
        self.file_path = file_path

    def extract_text(self):
        """Extract text from the PDF along with positional data.

        Returns:
            list[dict]: One dict per ``LTTextContainer`` found directly in a
            page layout, in iteration order. Keys: ``"text"`` (the element's
            text with surrounding whitespace stripped) and ``"x"``, ``"y"``,
            ``"x1"``, ``"y1"`` (the four values of ``element.bbox``, in order).
        """
        text_data = []

        for page_layout in extract_pages(self.file_path):
            for element in page_layout:
                if isinstance(element, LTTextContainer):
                    text = element.get_text().strip()
                    x, y, x1, y1 = element.bbox
                    text_data.append({
                        "text": text,
                        "x": x,
                        "y": y,
                        "x1": x1,
                        "y1": y1
                    })

        return text_data

    def search(self, keywords):
        """Search for keywords in the PDF and return blocks containing them.

        Matching is a case-insensitive substring test against each block's
        text. The PDF is re-parsed via ``extract_text`` on every call.

        Args:
            keywords: An iterable of keyword strings, or a single string
                (treated as one keyword rather than as a sequence of
                characters).

        Returns:
            list[dict]: Matching blocks as produced by ``extract_text``. Each
            block appears at most once even if several keywords match it.
            Blocks are ordered by the first keyword that matched them, then by
            their order in the document. Distinct blocks that happen to share
            the same text are all kept, each with its own coordinates.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(keywords, str):
            keywords = [keywords]

        text_data = self.extract_text()
        # Lower-case every block once instead of once per keyword.
        lowered_texts = [block["text"].lower() for block in text_data]

        results = []
        seen_indices = set()  # de-duplicate by block position, not by text
        for keyword in keywords:
            needle = keyword.lower()
            for index, haystack in enumerate(lowered_texts):
                if index not in seen_indices and needle in haystack:
                    seen_indices.add(index)
                    results.append(text_data[index])

        return results

# Usage example
# pdf_path is assigned here so this cell runs on a fresh kernel instead of
# depending on a later cell having been executed first.
# NOTE(review): the relative location mirrors the '../raw_data/literature'
# layout used in the Testing section — confirm it matches your checkout.
literature_dir = '../raw_data/literature'
pdf_path = f'{literature_dir}/Lynn_2008_The_Ciliated_Protozoa.pdf'

pdf_searcher = PDFSearch(pdf_path)
keywords = ['Litostomatea', "feeding mode"]
found_blocks = pdf_searcher.search(keywords)
# Print each matching block's text followed by its bounding-box values.
for block in found_blocks:
    print(block["text"])
    print("Position:", block["x"], block["y"], block["x1"], block["y1"])
    print("------")


In [None]:
# Usage
# Use a location relative to the notebook's working directory rather than an
# absolute home-directory path, so the cell runs on other machines.
# NOTE(review): assumes the notebook is run from the same directory as the
# Testing section, which reads from '../raw_data/literature' — confirm.
literature_dir = '../raw_data/literature'
pdf_path = f'{literature_dir}/Lynn_2008_The_Ciliated_Protozoa.pdf'

pdf_searcher = PDFSearch(pdf_path)
keywords = ['Paramecium']
found_paragraphs = pdf_searcher.search(keywords)
# Each result is a dict holding the block text and its bounding-box values.
for paragraph in found_paragraphs:
    print(paragraph)

In [None]:
# Display the second search result (index 1) from the previous cell;
# raises IndexError if the search returned fewer than two blocks.
found_paragraphs[1]

## Testing

In [18]:
from pdfminer.high_level import extract_text

In [15]:
# Paths for the extraction experiments below, relative to the working directory.
# Note: this rebinds literature_dir, which an earlier cell also assigns.
raw_data = '../raw_data'
literature_dir = f'{raw_data}/literature'
# PDF file used by the following test cells.
path = f'{literature_dir}/test.pdf'

In [19]:
# Extract the text of the test PDF with pdfminer's high-level helper.
text = extract_text(path)

In [22]:
# Show the extracted text to check how the PDF content comes through.
print(text)

Vol. 627: 49–60, 2019
https://doi.org/10.3354/meps13086

MARINE ECOLOGY PROGRESS SERIES
Mar Ecol Prog Ser

Published September 26§

OPENPEN
 ACCESS
CCESS

Spatial patterns of functional diversity and
composition in marine benthic ciliates along
the coast of China

1State Key Laboratory of Estuarine and Coastal Research, East China Normal University, Shanghai 200241, P. R. China
2Department of Geosciences and Geography, University of Helsinki, Helsinki 00014, Finland

Y. Xu1,*, J. Soininen2

ABSTRACT: Large-scale patterns of community composition and diversity along environmental
gradients have been well studied for macroorganisms. However, the biogeography of microorgan-
isms,  especially  ciliated  protozoa,  remains  understudied.  Here,  we  analyzed  a  comprehensive
database of marine benthic ciliates found along the coast of China from 1991 to 2018 to examine
the geographical patterns in species and trait composition and functional diversity. According to
redundancy analysis cond

In [23]:
# extract_pages returns a generator (see the printed repr in the next cell),
# so it has to be iterated to obtain the page layouts.
pages = extract_pages(path)

In [25]:
# Printing the generator only shows its repr, not the page layouts themselves.
print(pages)

<generator object extract_pages at 0x7fba0cd0b370>


In [45]:
# Walk every page layout of the test PDF and print the fourth bbox value (y1)
# of each text container found directly on the page.
for page_layout in extract_pages(path):
    for element in page_layout:
        # Anything that is not a text container is skipped.
        if not isinstance(element, LTTextContainer):
            continue
        # text, x, y and x1 are not printed here, but they stay bound in the
        # notebook namespace after the loop finishes.
        text = element.get_text().strip()
        x, y, x1, y1 = element.bbox
        print(y1)

732.1641999999999
732.269
730.9692
699.21732
657.4191
546.92288
571.55462
480.87600000000003
290.47610000000003
229.6163
205.34144
229.53911
53.7953
52.7435
19.17457999999999
749.1188
749.2007
701.4421
616.7404899999999
338.43519999999967
108.5308299999995
701.4309399999996
616.7293299999995
423.2009799999994
399.0014499999994
374.7265899999994


In [None]:
x, y, x1, y1 = element.bbox
                    text_data.append({
                        "text": text,
                        "x": x,
                        "y": y,
                        "x1": x1,
                        "y1": y1