##### The following template demonstrates how to:
##### 1. Upload a document
##### 2. Retrieve the document
##### 3. Manage workflows
##### 4. Perform form operations

In [1]:
from pprint import pprint

TODO: Set `DOCUMENT_PATH` below to the path of the document you want to upload.

In [2]:
# Path of the document handed to create_document() in the upload cell; replace it with your own file.
DOCUMENT_PATH = "AAPL_10Q.pdf"

## Setting config

In [3]:
from weavaidev import Config

import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment; override=True
# lets values from the file replace variables that are already set.
load_dotenv(override=True)

# Read both settings from the environment so no credentials are written in the notebook.
environment = os.environ
config = Config(
    auth_token=environment.get("AUTH_TOKEN"),
    env=environment.get("ENV"),
)

## Uploading document

In [4]:
from weavaidev.documents import DocumentOperations

# Show only the file name rather than the whole path. os.path.basename follows
# the platform's separator rules, so Windows-style paths are handled as well
# (splitting on '/' by hand is not).
print(f"Uploading document {os.path.basename(DOCUMENT_PATH)}")

# Client for the document endpoints; it is reused by the later document cells.
document_operation = DocumentOperations(config=config)
document_create_response = document_operation.create_document(file_path=DOCUMENT_PATH)

print("Uploaded document!")

Uploading document AAPL_10Q.pdf
Uploaded document!


In [5]:
# Keep the ID of the uploaded document; later cells pass it to get_document() and run_workflow().
document_id = document_create_response.id
print(f"Document ID: {document_id}")

Document ID: 671994d79e2cac3c191b2798


## Get information about the document

In [6]:
# Fetch the stored record for the document uploaded above.
get_document_response = document_operation.get_document(document_id=document_id)
print("Fetched document")

# Dump the response model so pprint can lay out its fields.
document_details = get_document_response.model_dump()
pprint(document_details)



Fetched document
{'ai_tags': [],
 'category': '',
 'created_at': datetime.datetime(2024, 10, 24, 0, 29, 11, tzinfo=TzInfo(UTC)),
 'download_url': '/doc-proc-service/local_store/google-oauth2|117349365869611297391/671994d79e2cac3c191b2798/671994d79e2cac3c191b2798',
 'file_name': 'AAPL_10Q.pdf',
 'form_instances': None,
 'id': '671994d79e2cac3c191b2798',
 'in_folders': [],
 'media_type': 'application/pdf',
 'pages': [],
 'redacted_summary': '',
 'size': 654929,
 'source': 'application',
 'status': 'NEW',
 'step_status': {'FORM_EXTRACTION': {'error': '',
                                     'modified_at': datetime.datetime(2024, 10, 24, 0, 29, 11, tzinfo=TzInfo(UTC)),
                                     'response': {},
                                     'status': 'NOT_STARTED'}},
 'summary': '',
 'summary_status': '',
 'tags': [],
 'tenant_id': '',
 'user_id': 'google-oauth2|117349365869611297391'}


In [7]:
# Category reported for the document. In the response printed above it is an
# empty string while the document status is still 'NEW'.
# NOTE(review): presumably the category is filled in by the process_document
# workflow — confirm, and re-fetch the document after processing if the form
# definition created later needs a non-empty category.
document_category = get_document_response.category

print(document_category)




## Workflow management

### 1. Getting all workflows

In [8]:
from weavaidev.workflows import WorkflowService

# Client for the workflow endpoints, built from the shared config.
workflow_operation = WorkflowService(config=config)
print("GETTING ALL WORKFLOWS")

# Only the workflow names are shown, so pull them out of the response.
workflow_response = workflow_operation.get_all_workflows()
workflows = [entry.name for entry in workflow_response.workflows]
pprint(workflows)

GETTING ALL WORKFLOWS
['dagtasktest',
 'dagtest',
 'download_from_connector',
 'download_mail_using_connector',
 'extract_form_and_compliance',
 'process_brain_document',
 'process_compliance_exclude_entity',
 'process_custom_queries',
 'process_document',
 'process_document_create_decision_tree',
 'process_document_multi_forms',
 'process_document_sensors',
 'process_document_summary_compliance',
 'process_document_summary_generation',
 'process_document_tabular_data',
 'process_form_data_cleanup',
 'process_form_multi_page_workflow',
 'process_form_workflow',
 'run_agent']


### 2. Running specific workflows

#### Process document workflow

In [17]:
print("RUNNING 'process_document' WORKFLOW")

# Start a 'process_document' run for the uploaded document; an empty data
# payload is passed for this workflow.
process_document_response = workflow_operation.run_workflow(
    workflow_name="process_document", doc_id=document_id, data={}
)

pprint(process_document_response.model_dump())

# The run ID (together with the workflow ID) is what the status cell polls with.
print(f"RUN ID: {process_document_response.run_id}")

RUNNING 'process_document' WORKFLOW
{'created_at': '2024-10-24T00:10:51.000000+00:00',
 'document_id': '67198ffc7be19a06e59a8edb',
 'document_name': 'AAPL_10Q.pdf',
 'end_date': None,
 'in_folders': [],
 'run_id': '67198ffc7be19a06e59a8edb_d2f25a48-8992-4683-9429-8af46010bcbc',
 'start_date': None,
 'state': None,
 'workflow_id': 'process_document'}
RUN ID: 67198ffc7be19a06e59a8edb_d2f25a48-8992-4683-9429-8af46010bcbc


#### The cell below utilizes the get workflow status API to check if the workflow has completed.

In [18]:
import time
import random

# Upper bound on how long this cell keeps polling. Without one the loop never
# ends when a run stays in a non-terminal state and has to be interrupted by hand.
PROCESS_DOCUMENT_TIMEOUT_SECONDS = 15 * 60
process_document_deadline = time.monotonic() + PROCESS_DOCUMENT_TIMEOUT_SECONDS

finished = False

while not finished:
    # Randomised 5-10 second pause between status requests.
    time.sleep(random.randint(5, 10))
    workflow_status_response = workflow_operation.get_workflow_status(
        show_internal_steps=False,
        workflow_id=process_document_response.workflow_id,
        workflow_run_id=process_document_response.run_id,
    )
    current_status = workflow_status_response.status
    print(f"Processing status: {current_status}")
    if current_status in ("success", "failed"):
        # Terminal state reached: stop polling.
        finished = True
    elif time.monotonic() >= process_document_deadline:
        raise TimeoutError(
            f"Workflow run {process_document_response.run_id} is still "
            f"'{current_status}' after {PROCESS_DOCUMENT_TIMEOUT_SECONDS} seconds"
        )

# Full status payload of the finished run (successful or failed).
pprint(workflow_status_response.model_dump())

Processing status: running
Processing status: running
Processing status: running
Processing status: running
Processing status: running
Processing status: running
Processing status: running
Processing status: running
Processing status: running
Processing status: running
Processing status: running


KeyboardInterrupt: 

##### Form processing can be run only after the `process_document` workflow has finished.

##### First, we need to create a form definition; form extraction cannot run without one.

#### Creating form definition

In [18]:
from weavaidev.forms import FormOperations
from weavaidev.forms.models import CreateFormRequest


# Client for the form endpoints, built from the shared config.
form_operation = FormOperations(config=config)

# The definition has a single numeric field that holds a list of values;
# fill_by_search is switched off for it.
fields_data = [
    dict(
        name="Total Sales",
        field_type="Number",
        description="Net Sales for the quarter",
        is_array=True,
        fill_by_search=False,
    )
]

# Request describing the form definition to create.
# NOTE(review): document_category was read before the process_document workflow
# ran and may still be empty here — confirm that is the intended category.
body = CreateFormRequest(
    name="APPLE FORM DEFINITION 2",
    description="Apple form definition 2",
    category=document_category,
    fields=fields_data,
    is_shared=True,
    is_searchable=True,
)

form_create_response = form_operation.create_form(form_data=body)

In [10]:
# Keep the new definition's ID; the form workflow receives it as input.
form_id = form_create_response.id

print(f"Form ID: {form_id}")

Form ID: 671995226d31d047e6c82300


#### Process form workflow

In [25]:
print("RUNNING 'process_form_workflow' WORKFLOW")

# The data payload tells the workflow which form definition to use.
process_form_response = workflow_operation.run_workflow(
    workflow_name="process_form_workflow",
    doc_id=document_id,
    data={"form_id": form_id},
)
pprint(process_form_response.model_dump())

# Keep both identifiers; the status cell polls with them.
workflow_id = process_form_response.workflow_id
run_id = process_form_response.run_id

print(f"FORM EXTRACTION RUN ID: {run_id}")

RUNNING 'process_form_workflow' WORKFLOW
{'created_at': '2024-10-24T00:36:37.000000+00:00',
 'document_id': '671994d79e2cac3c191b2798',
 'document_name': 'AAPL_10Q.pdf',
 'end_date': None,
 'in_folders': [],
 'run_id': '671994d79e2cac3c191b2798_9882a822-b629-46c6-ae95-c41b34afaa74',
 'start_date': None,
 'state': None,
 'workflow_id': 'process_form_workflow'}
FORM EXTRACTION RUN ID: 671994d79e2cac3c191b2798_9882a822-b629-46c6-ae95-c41b34afaa74



### Get workflow status

In [26]:
import time
import random

# Upper bound on how long this cell keeps polling. Without one the loop never
# ends when a run stays in a non-terminal state and has to be interrupted by hand.
PROCESS_FORM_TIMEOUT_SECONDS = 15 * 60
process_form_deadline = time.monotonic() + PROCESS_FORM_TIMEOUT_SECONDS

finished = False

while not finished:
    # Randomised 5-10 second pause between status requests.
    time.sleep(random.randint(5, 10))
    workflow_status_response = workflow_operation.get_workflow_status(
        show_internal_steps=False,
        workflow_id=workflow_id,
        workflow_run_id=run_id,
    )
    current_status = workflow_status_response.status
    print(f"Processing status: {current_status}")
    if current_status in ("success", "failed"):
        # Terminal state reached: stop polling.
        finished = True
    elif time.monotonic() >= process_form_deadline:
        raise TimeoutError(
            f"Workflow run {run_id} is still '{current_status}' "
            f"after {PROCESS_FORM_TIMEOUT_SECONDS} seconds"
        )

# Full status payload of the finished run (successful or failed).
pprint(workflow_status_response.model_dump())

Processing status: success
{'document_id': '671994d79e2cac3c191b2798',
 'end_date': datetime.datetime(2024, 10, 24, 0, 37, 32, 827528, tzinfo=TzInfo(UTC)),
 'start_date': datetime.datetime(2024, 10, 24, 0, 36, 37, 138307, tzinfo=TzInfo(UTC)),
 'status': 'success',
 'tasks': [{'end_date': datetime.datetime(2024, 10, 24, 0, 36, 43, 7539, tzinfo=TzInfo(UTC)),
            'failed_task_ids': [],
            'name': 'extract_by_search',
            'start_date': datetime.datetime(2024, 10, 24, 0, 36, 42, 474470, tzinfo=TzInfo(UTC)),
            'status': 'success',
            'task_status_summary': {'failed': 0,
                                    'queued': 0,
                                    'running': 0,
                                    'skipped': 0,
                                    'success': 1}},
           {'end_date': datetime.datetime(2024, 10, 24, 0, 36, 40, 356711, tzinfo=TzInfo(UTC)),
            'failed_task_ids': [],
            'name': 'get_document',
            'star

### Get single workflow

In [24]:
# Fetch the definition of one workflow by name; the printed result lists its
# tasks together with each task's downstream tasks.
single_workflow_response = workflow_operation.get_single_workflow(
    show_internal_steps=False,
    workflow_name="process_form_workflow",
)
pprint(single_workflow_response.model_dump())

2024-10-23 17:36:28.624 | INFO     | weavaidev.workflows:get_single_workflow:61 - 200


{'name': 'process_form_workflow',
 'params': [],
 'tasks': [{'downstream_tasks': ['extract_form_values_normal',
                                 'extract_form_values_list'],
            'is_active': True,
            'name': 'get_document'},
           {'downstream_tasks': ['update_doc'],
            'is_active': True,
            'name': 'extract_by_search'},
           {'downstream_tasks': ['update_doc'],
            'is_active': True,
            'name': 'extract_form_values_list'},
           {'downstream_tasks': ['update_doc'],
            'is_active': True,
            'name': 'extract_form_values_normal'},
           {'downstream_tasks': ['set_processing_to_in_state__1',
                                 'set_processing_to_failed_state'],
            'is_active': True,
            'name': 'update_doc'},
           {'downstream_tasks': [],
            'is_active': True,
            'name': 'run_next_workflow'},
           {'downstream_tasks': ['run_next_workflow'],
            'is

### Rerun failed workflow (Optional)

In [None]:
# Submit the form workflow once more for the same document and form definition.
rerun_workflow = workflow_operation.rerun_workflow(
    workflow_name="process_form_workflow",
    doc_id=document_id,
    data={"form_id": form_id},
)

pprint(rerun_workflow.model_dump())

{'created_at': '2024-10-03T04:01:26.000000+00:00',
 'document_id': '66fe11c5927ce8c0ebda42a3',
 'document_name': 'MCS-CS-Handbook-2022-2023Publish.pdf',
 'end_date': None,
 'in_folders': [],
 'run_id': '66fe11c5927ce8c0ebda42a3_c0d7d1fd-70be-4aba-a2bf-1bc1b960392c',
 'start_date': None,
 'state': None,
 'workflow_id': 'process_form_workflow'}


In [13]:
# Status of the rerun started in the previous cell. This cell reads
# `rerun_workflow`, so it raises NameError when that optional cell was skipped.
workflow_status_response = workflow_operation.get_workflow_status(
    show_internal_steps=False,
    workflow_id=rerun_workflow.workflow_id,
    workflow_run_id=rerun_workflow.run_id,
)
pprint(workflow_status_response.model_dump())

NameError: name 'rerun_workflow' is not defined

### Get form

In [27]:
from weavaidev.forms.models import FilterFormInstanceRequest


# Filter request for the form instances of this document, limited to the
# "my_documents" scope.
body = FilterFormInstanceRequest(
    scope="my_documents",
    doc_id=document_id,
)

# NOTE(review): this rebinds `form_create_response` (earlier the create_form
# result, read through `.id`) to the dumped filter result, which the next cell
# indexes with string keys. Re-running the "Form ID" cell after this one would
# likely fail; a dedicated name would avoid that, but the next cell would have
# to be renamed together with it.
form_create_response = form_operation.filter_form_instances(form_data=body).model_dump()

pprint(form_create_response)

{'form_instances': [{'category': 'UNITED STATES SECURITIES AND EXCHANGE '
                                 'COMMISSION',
                     'doc_id': '671994d79e2cac3c191b2798',
                     'file_name': 'AAPL_10Q.pdf',
                     'form_id': '671995226d31d047e6c82300',
                     'form_instance': {'data': [{'identifier': 'a0c65e1b-89d2-432a-99f9-96535e0ad176',
                                                 'name': 'Total Sales',
                                                 'value': [81797.0,
                                                           19881.0,
                                                           19442.0,
                                                           60274.0,
                                                           74039.0,
                                                           19881.0,
                                                           74439.0,
                                                           

In [28]:
# Guard against an empty result so that a missing extraction is reported with a
# clear message instead of a bare "list index out of range". The exception type
# stays IndexError, as it was for the unguarded lookup.
form_instances = form_create_response["form_instances"]
if not form_instances:
    raise IndexError(
        f"No form instances were returned for document {document_id}; "
        "check that 'process_form_workflow' finished successfully."
    )

# ID of the form definition behind the first returned instance.
form_id = form_instances[0]["form_id"]
form_id

'671995226d31d047e6c82300'

### View form definition

In [29]:
# Fetch the form definition for the ID taken from the form instance above and print its fields.
form_definition_response = form_operation.get_form_definition(form_id=form_id)
pprint(form_definition_response.model_dump())

{'category': '',
 'created_at': '2024-10-24T00:30:26Z',
 'description': 'Apple form definition 2',
 'fields': [{'description': 'Net Sales for the quarter',
             'field_type': 'Number',
             'fill_by_search': False,
             'is_array': True,
             'name': 'Total Sales'}],
 'id': '671995226d31d047e6c82300',
 'is_searchable': True,
 'is_shared': True,
 'name': 'APPLE FORM DEFINITION 2',
 'user_id': 'google-oauth2|117349365869611297391'}
