# PDF Utils: extending the existing functionality to PDF files

This module depends on `pdfminer.six`: use `pip install unpackai[PDF]` to install the required dependencies.

In [None]:
# default_exp pdf

In [None]:
# export
from pathlib import Path
from typing import List, Union

from pdfminer.high_level import extract_text
from unpackai import utils
from unpackai import nlp

# Alias for arguments that accept a filesystem path either as a `Path` or as a plain string.
PathStr = Union[Path, str]

In [None]:
# export
class TextualPDF(nlp.Textual):
    """Textual subclass adding constructors that read PDF documents."""

    @classmethod
    def from_url_pdf(
        cls, url: str, password: str = "", page_numbers: List[int] = None
    ):
        """Build an instance from a PDF located at a URL.

        The URL is handed to `utils.download`; whatever that call returns is
        then used as the PDF path for `from_path_pdf`.

        Args:
            url: url of PDF
            password: password, if the PDF is protected
            page_numbers: list of pages to extract (first page = 0)

        Returns:
            The object produced by `from_path_pdf` for the downloaded file.
        """
        downloaded = utils.download(url)
        return cls.from_path_pdf(
            downloaded, password=password, page_numbers=page_numbers
        )

    @classmethod
    def from_path_pdf(
        cls, pdf_file: PathStr, password: str = "", page_numbers: List[int] = None
    ):
        """Build an instance from a PDF file.

        The text is extracted with pdfminer's `extract_text`, forwarding
        `password` and `page_numbers` unchanged.

        Args:
            pdf_file: path of PDF
            password: password, if the PDF is protected
            page_numbers: list of pages to extract (first page = 0)

        Returns:
            `cls` instantiated with the extracted text and `Path(pdf_file)`.
        """
        extracted = extract_text(
            pdf_file, password=password, page_numbers=page_numbers
        )
        source = Path(pdf_file)
        return cls(extracted, source)

    @classmethod
    def from_path(cls, path: PathStr):
        """Build an instance from a path, with dedicated handling of PDF files.

        A path whose suffix is ".pdf" (compared case-insensitively) goes
        through `from_path_pdf` with default options; any other path is
        delegated to the parent class' `from_path`.
        """
        path = Path(path)
        is_pdf = path.suffix.lower() == ".pdf"
        if not is_pdf:
            # Not a PDF: let the parent implementation deal with it.
            return super().from_path(path)
        return cls.from_path_pdf(path)


## How to use it?

```python

from unpackai.pdf import TextualPDF

textual = TextualPDF.from_url("http://islamicblessings.com/upload/A-Thousand-And-One-Nights-1.pdf")
# OR...
textual = TextualPDF.from_path("C:/my_doc.pdf")
# OR ... if there is a password or you want to extract specific pages...
textual = TextualPDF.from_url_pdf("https://my_company.com/my_protected_doc.pdf", password="P@ssW0rd")
textual = TextualPDF.from_path_pdf("C:/my_doc.pdf", page_numbers=range(10))
```

In [None]:
# Demo: build a TextualPDF from a remote PDF.
# NOTE(review): `from_url` is not defined in TextualPDF itself; presumably it is
# inherited from nlp.Textual -- confirm it handles PDF URLs as intended.
t = TextualPDF.from_url("http://islamicblessings.com/upload/A-Thousand-And-One-Nights-1.pdf")
# Bare expression so the notebook displays the object's representation.
t

Text (1514456 chars), textual(),
    train_path, val_path = textual.create_train_val()