# PDF Utils: extending current functionalities for PDF

This module depends on `pdfminer.six`: run `pip install unpackai[PDF]` to install the required dependencies.

In [None]:
# default_exp pdf

In [None]:
# export
import re
from pathlib import Path
from typing import List, Union

from pdfminer.high_level import extract_text
from unpackai import utils
from unpackai import nlp

PathStr = Union[Path, str]

In [None]:
# export
class TextualPDF(nlp.Textual):
    """Extend Textual so that it can be created from PDF documents.

    The text is extracted with `extract_text` from `pdfminer.six` and can
    optionally be cleaned up (line returns inside paragraphs and end-of-line
    hyphenation are removed).
    """

    @classmethod
    def from_url_pdf(
        cls,
        url: str,
        password: str = "",
        page_numbers: Union[List[int], None] = None,
        cleanup=True,
    ):
        """Create a Textual object from a PDF URL with specific options

        The URL is passed to `utils.download` and the result is handed over to
        `from_path_pdf` together with all the other options.

        Args:
            url: url of PDF
            password: password, if the PDF is protected
            page_numbers: list of pages to extract (first page = 0);
                None (default) is forwarded as is to the PDF extraction
            cleanup: remove messy characters and line returns (default=True)
        """
        # NOTE(review): assumes utils.download returns a local path usable by
        # from_path_pdf -- confirm against unpackai.utils
        return cls.from_path_pdf(
            utils.download(url),
            password=password,
            page_numbers=page_numbers,
            cleanup=cleanup,
        )

    @classmethod
    def from_path_pdf(
        cls,
        pdf_file: PathStr,
        password: str = "",
        page_numbers: Union[List[int], None] = None,
        cleanup=True,
    ):
        """Create a Textual object from a PDF

        Args:
            pdf_file: path of PDF
            password: password, if the PDF is protected
            page_numbers: list of pages to extract (first page = 0);
                None (default) is forwarded as is to the PDF extraction
            cleanup: remove messy characters and line returns (default=True)

        Returns:
            An instance of the class built from the extracted text and the
            path of the PDF.
        """
        txt = extract_text(pdf_file, password=password, page_numbers=page_numbers)
        if cleanup:
            # Normalize "\r\n" and lone "\r" to "\n" first: otherwise a single
            # Windows line return would count as 2 characters and be mistaken
            # for a paragraph break, and a lone "\r" would never become a space.
            txt = re.sub(r"\r\n?", "\n", txt)
            # 2+ consecutive line returns = paragraph break: protect them with
            # a placeholder while single line returns are being removed.
            txt = re.sub(r"\n{2,}", "<line_break>", txt)
            # A hyphen at the end of a line is a word split over 2 lines:
            # drop the hyphen and the line return to join both halves.
            txt = re.sub(r"- *\n", "", txt)
            # Remaining line returns are inside a paragraph => plain spaces;
            # then restore paragraph breaks as exactly one empty line.
            txt = txt.replace("\n", " ").replace("<line_break>", "\n\n")

        return cls(txt, Path(pdf_file))

    @classmethod
    def from_path(cls, path: PathStr):
        """Create a Textual object from a path, including PDF

        Files with a ".pdf" extension (case-insensitive) are read with
        `from_path_pdf` using its default options; any other file is
        delegated to the parent class.

        Args:
            path: path of the file to load
        """
        path = Path(path)
        if path.suffix.lower() == ".pdf":
            return cls.from_path_pdf(path)
        return super().from_path(path)


## How to use it?

```python

from unpackai.pdf import TextualPDF

textual = TextualPDF.from_url("http://islamicblessings.com/upload/A-Thousand-And-One-Nights-1.pdf")
# OR...
textual = TextualPDF.from_path("C:/my_doc.pdf")
# OR ... if there is a password or you want to extract specific pages...
textual = TextualPDF.from_url_pdf("https://my_company.com/my_protected_doc.pdf", password="P@ssW0rd")
textual = TextualPDF.from_path_pdf("C:/my_doc.pdf", page_numbers=range(10))
```

In [None]:
# Build a TextualPDF straight from the URL of a PDF; the bare `t` on the last
# line of the cell makes the notebook display the resulting object.
t = TextualPDF.from_url("http://islamicblessings.com/upload/A-Thousand-And-One-Nights-1.pdf")
t

Text (1514113 chars), textual(),
    train_path, val_path = textual.create_train_val()

# Tests

In [None]:
# hide
# To be able to run the tests in the Notebook
from pathlib import Path
import ipytest
import sys

# Configure ipytest so that pytest test cases can be run from notebook cells
ipytest.autoconfig()

# Make the "test" directory of the parent folder importable
# (presumably where `test_common` and `test_utils` imported below live -- TODO confirm)
root_dir = Path("..").resolve()
sys.path.append(str(root_dir / "test"))

In [None]:
# exportest
# For Test Cases (might have duplicate import because it will be in a dedicated file)
from pathlib import Path
from typing import List

import pytest
from test_common.utils_4_tests import DATA_DIR, IMG_DIR, check_no_log, check_only_warning
from test_utils import GITHUB_TEST_DATA_URL, check_connection_github

In [None]:
# exportest
# URL of the sample PDF under the GitHub test data (the space is encoded as %20)
GITHUB_TEST_PDF = f"{GITHUB_TEST_DATA_URL}/Deep%20learning.pdf"
# Local sample PDF in the test data directory, used as reference in the tests
LOCAL_TEST_PDF = DATA_DIR / "Deep learning.pdf"


@pytest.fixture(scope="session")
def local_textual():
    """Textual parsed from the local sample PDF, shared by the whole test session"""
    reference_textual = TextualPDF.from_path(LOCAL_TEST_PDF)
    return reference_textual


class Test_TextualPDF:
    """Check the constructors of TextualPDF against the local sample PDF"""

    def test_from_path(self, local_textual):
        """Generic `from_path` shall extract the text of a local PDF"""
        parsed_text = local_textual.text
        assert "Deep learning" in parsed_text, f"Text parsed:\n{parsed_text}"

    def test_from_path_pdf(self, local_textual):
        """`from_path_pdf` shall give the same text as `from_path` for a PDF"""
        from_pdf_path = TextualPDF.from_path_pdf(LOCAL_TEST_PDF)
        assert from_pdf_path.text == local_textual.text

    def test_from_url(self, check_connection_github, local_textual):
        """Generic `from_url` shall give the same text as the local PDF"""
        from_url = TextualPDF.from_url(GITHUB_TEST_PDF)
        assert from_url.text == local_textual.text, f"URL text: {from_url.text}"

    def test_from_url_pdf(self, check_connection_github, local_textual):
        """`from_url_pdf` shall give the same text as the local PDF"""
        from_url_pdf = TextualPDF.from_url_pdf(GITHUB_TEST_PDF)
        assert (
            from_url_pdf.text == local_textual.text
        ), f"URL text: {from_url_pdf.text}"


In [None]:
# hide
# Run the test cases collected from this notebook
ipytest.run()