# PDF Importing 📄
In this notebook we will focus on importing PDF files into the vector database.
We will cover importing documents from local storage, Google Drive and Microsoft OneDrive.

## Importing 100% text-based PDFs with Unstructured

In [5]:
from unstructured.partition.auto import partition
from unstructured.cleaners.core import clean
from unstructured.chunking.basic import chunk_elements

In [None]:
# Split the text-based PDF into document elements, then group those elements
# into chunks using a max_characters limit of 1000.
elements = partition(filename="./documents/pdf_with_extractabletext.pdf")
chunks = chunk_elements(elements, max_characters=1000)

In [None]:
# Show every chunk, separated by a '---' line.
# clean() only strips leading/trailing whitespace unless individual cleaners
# are switched on, so the whitespace and bullet cleaners are enabled explicitly.
for chunk in chunks:
    print('---')
    print(clean(chunk.text, extra_whitespace=True, bullets=True))

In [None]:
# Dump the string form of every element, separated by a blank line.
print("\n\n".join(map(str, elements)))

## Importing scanned PDF files

Poppler and libmagic need to be installed first (macOS, via Homebrew):
```
brew install poppler
brew install libmagic
```

In [4]:
from unstructured.partition.auto import partition

In [None]:
# Run the same partition() entry point on the scanned document.
# Note: this rebinds `elements`, replacing any result from an earlier cell.
elements = partition(filename="./documents/scan_example.pdf")

In [None]:
# Dump the string form of every element, separated by a blank line.
print("\n\n".join(map(str, elements)))

## Extracting PDF files with images

In [None]:
from unstructured.partition.pdf import partition_pdf

# Input document and the directory that receives the extracted image blocks.
pdf_with_images = "./documents/pdf_with_images.pdf"
image_output_dir = "./documents/images/"

# Image extraction needs the "hi_res" strategy together with
# extract_images_in_pdf=True; the remaining arguments are optional.
elements = partition_pdf(
    filename=pdf_with_images,
    strategy="hi_res",
    extract_images_in_pdf=True,
    # Element types to export as image blocks.
    extract_image_block_types=["Image", "Table"],
    # Keep extracted blocks out of the element payload; the output directory
    # below is only honoured when this is False.
    extract_image_block_to_payload=False,
    extract_image_block_output_dir=image_output_dir,
)

In [None]:
# Dump the string form of every element, separated by a blank line.
print("\n\n".join(map(str, elements)))

## Extracting PDF files with tables

In [None]:
from IPython.display import HTML, display
from unstructured.partition.pdf import partition_pdf

fname = "./documents/pdf_with_tables.pdf"

# Partition with the "hi_res" strategy and table-structure inference enabled;
# the HTML rendering read from metadata.text_as_html below relies on this.
elements = partition_pdf(
    filename=fname,
    infer_table_structure=True,
    strategy="hi_res",
)

# Keep only the elements that were classified as tables.
tables = [el for el in elements if el.category == "Table"]

if not tables:
    # Indexing tables[0] on an empty list would raise IndexError, so report
    # the situation instead of failing the cell.
    print(f"No tables detected in {fname}")
else:
    first_table = tables[0]

    # Plain-text content of the first table.
    print(first_table.text)

    # HTML representation of the same table, if one was produced.
    html_table = first_table.metadata.text_as_html
    if html_table:
        display(HTML(html_table))
    else:
        print("No HTML rendering available for the first table.")