## 1. Install packages to cloud

In [0]:
%pip install azure.storage.blob
%pip install azure.ai.formrecognizer

## 2. Connect to Azure Storage Container

In [0]:
from azure.storage.blob import ContainerClient

# URL of the container that holds the raw PDF invoices.
container_url = "https://formrecognizerdemo070621.blob.core.windows.net/pdf-raw"
# No credential is passed here, so the container must be readable without one
# (presumably anonymous/public read access is enabled -- confirm in the portal).
container = ContainerClient.from_container_url(container_url)

# Sanity check: print the URL of every blob in the container.
for blob in container.list_blobs():
  # Ask the SDK for the blob URL instead of concatenating strings, so blob
  # names with spaces or other reserved characters are percent-encoded and any
  # query string on the container URL ends up in the right place.
  blob_url = container.get_blob_client(blob.name).url
  print(blob_url)

## 3. Enable Cognitive Services

Additional documentation [here](https://docs.microsoft.com/en-us/python/api/azure-ai-formrecognizer/azure.ai.formrecognizer.formrecognizerclient?view=azure-python)

In [0]:
import requests
from azure.ai.formrecognizer import FormRecognizerClient
from azure.core.credentials import AzureKeyCredential

# Read the Cognitive Services endpoint and key from the "formrec" secret scope
# so that neither value is hard-coded in the notebook.
endpoint = dbutils.secrets.get(scope="formrec", key="cognitiveServicesEndpoint")
key = dbutils.secrets.get(scope="formrec", key="cognitiveServicesKey")

# Client used by later cells to submit invoices for recognition.
credential = AzureKeyCredential(key)
form_recognizer_client = FormRecognizerClient(endpoint=endpoint, credential=credential)

## 4. Send files to Cognitive Services
The results are saved as a pandas DataFrame. For production use, we recommend converting it to a Koalas DataFrame (the pandas API on Apache Spark) to leverage the power of Apache Spark for big data analytics.

In [0]:
import pandas as pd

# Invoice fields to extract from each recognized invoice; these become the
# columns of the result frame, followed by a "FileName" column.
field_list = ["InvoiceId", "VendorName", "VendorAddress", "CustomerName", "CustomerAddress", "CustomerAddressRecipient", "InvoiceDate", "InvoiceTotal", "DueDate"]

# One dict per recognized invoice. The DataFrame is built once after the loop:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and growing
# a frame row by row copies it on every iteration.
records = []

for blob in container.list_blobs():
  # Log before the blocking service call so progress is visible while waiting.
  print("Scanning " + blob.name + "...")

  # Let the SDK build the URL so blob names are percent-encoded correctly.
  blob_url = container.get_blob_client(blob.name).url
  poller = form_recognizer_client.begin_recognize_invoices_from_url(invoice_url=blob_url)
  invoices = poller.result()

  for invoice in invoices:
    record = {}

    for field in field_list:
      entry = invoice.fields.get(field)
      # Fields the service did not find are stored as missing values.
      record[field] = entry.value if entry else None

    # Always keep a row per invoice, even if no field was recognized, so the
    # source file still shows up in the results.
    record["FileName"] = blob.name
    records.append(record)

# Passing `columns` fixes the column order and keeps the schema stable even
# when the container is empty.
df = pd.DataFrame(records, columns=field_list + ["FileName"])
df

Unnamed: 0,InvoiceId,VendorName,VendorAddress,CustomerName,CustomerAddress,CustomerAddressRecipient,InvoiceDate,InvoiceTotal,DueDate,FileName
0,INV-3337,DEMO - Sliced Invoices,Suite 5A-1204 123 Somewhere Street Your City A...,Test Business,"123 Somewhere St Melbourne, VIC 3000",Test Business,2016-01-25,93.5,2016-01-31,pdf1.pdf
1,00001,,,,,,2020-01-12,8039.64,,pdf10.pdf
2,0000005,,,Aden Matchett Vandelay Group,,,2019-06-03,5500.0,2019-07-03,pdf11.pdf
3,0023 - 2020,H&H APPAREL,"4805 JEFFERSON ST. HAMPTON, VA",MRS. LAURA JOHNSONS,"2330 BETTER STREET KANSAS CITY, KS",MRS. LAURA JOHNSONS,,762.39,,pdf12.pdf
4,000231,,"582 Grand Drive Lithonia, GA 30038",Acme Industries,,,2020-11-05,12512.0,2020-11-05,pdf13.pdf
5,1001,,"2601 Airport Drive, Suite 380 Los Angeles, CA ...",Ms. Lynn Tracey Allied Technology,"1616 Adventure Way Sunnyside, CA 95000",Ms. Lynn Tracey Allied Technology,2011-03-01,,,pdf2.pdf
6,#000015,"General Goods, LLC",,,,,2016-10-26,21.77,,pdf3.pdf
7,1234567,ADVANCED SYSTEMS,"3341 Shinn Street Long Beach, CA 90807",Widget Tec,"3545 Long Beach Blvd. Long Beach, CA 90807",Widget Tec,2020-02-04,944.52,2016-01-18,pdf4.pdf
8,3.14,,,P. Sherman,,,2015-10-21,8.46,,pdf5.pdf
9,US-001,East Repair Inc.,"1912 Harvest Lane New York, NY 12210",John Smith,,,2019-11-02,154.06,,pdf6.pdf


## 5. Upload dataframe of results to Azure

Mount an Azure Blob storage container [(link)](https://docs.databricks.com/data/data-sources/azure/azure-storage.html#mount-an-azure-blob-storage-container)

In [0]:
account_name = "formrecognizerdemo070621"
# Despite its name, this is the configuration *key* under which the storage
# account's access key is passed to the mount; the secret itself is read from
# the "formrec" secret scope below.
account_key = "fs.azure.account.key." + account_name + ".blob.core.windows.net"
mount_point = "/mnt/pdf-recognized"

# Mount the output container only if it is not mounted yet. Checking the
# existing mounts explicitly (instead of a bare `except:`) lets genuine
# failures -- wrong secret, missing container, network errors -- surface here
# rather than as a confusing error from the write below.
if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
  print('Directory already mounted')
else:
  dbutils.fs.mount(
    source = "wasbs://pdf-recognized@" + account_name + ".blob.core.windows.net",
    mount_point = mount_point,
    extra_configs = {account_key: dbutils.secrets.get(scope = "formrec", key = "formreckey")} )

# pandas writes through the local /dbfs path, which exposes the mount point.
df.to_csv("/dbfs" + mount_point + "/output.csv", index=False)