In [5]:
import sys
import os
from pprint import pprint

# Add the parent of the current working directory to the import path
# (presumably so the project-local packages imported below resolve when the
# notebook is run from its own folder -- TODO confirm the repo layout).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from config_models import LoadConfigurations, ServiceType

from documents.service import DocumentOperations

**TODO:** Set `ENV_FILE_PATH` and `DOCUMENT_PATH` in the cell below to the correct paths for your environment.

In [6]:
# Path to the .env file passed to LoadConfigurations in later cells.
ENV_FILE_PATH = "../.env"
# Path of the PDF that the upload cell sends to create_document.
DOCUMENT_PATH = "../AAPL_10Q.pdf"

### Uploading document

In [7]:
# Build the document-service configuration from the .env file and create the
# client object used by the document cells of this notebook.
document_configs = LoadConfigurations(env_file_path=ENV_FILE_PATH).set_config(service=ServiceType.DOCUMENT)
document_operation = DocumentOperations(configs=document_configs)

# os.path.basename extracts the file name portably (it also handles Windows
# separators), unlike a manual split on "/".
print(f"Uploading document {os.path.basename(DOCUMENT_PATH)}")

# Keep the response: the next cell reads the new document's id from it.
document_create_response = document_operation.create_document(file_path=DOCUMENT_PATH)

print("Uploaded document!")

[32m2024-10-03 01:56:54.893[0m | [1mINFO    [0m | [36mconfig_models[0m:[36mset_config[0m:[36m100[0m - [1mConfig set.[0m


Uploading document AAPL_10Q.pdf
Uploaded document!


In [8]:
# Keep the uploaded document's id; later cells pass it to get_document and to
# WorkflowRequest(doc_id=...).
document_id = document_create_response.id
print(f"Document ID: {document_id}")

Document ID: 66fe5c58b1d0dfb13c9975f3


### Get information about the document

In [31]:
# Fetch the stored document record by its id.
get_document_response = document_operation.get_document(document_id=document_id)

print("Fetched document")

# Dump the full response model for inspection.
pprint(get_document_response.model_dump())

# The category is reused later as the `category` of the form definition.
document_category = get_document_response.category

print(document_category)

Fetched document
{'ai_tags': [],
 'category': 'UNITED STATES SECURITIES AND EXCHANGE COMMISSION',
 'created_at': datetime.datetime(2024, 10, 3, 8, 56, 56, tzinfo=TzInfo(UTC)),
 'download_url': '/doc-proc-service/local_store/google-oauth2|117349365869611297391/66fe5c58b1d0dfb13c9975f3/66fe5c58b1d0dfb13c9975f3',
 'file_name': 'AAPL_10Q.pdf',
 'form_instances': None,
 'id': '66fe5c58b1d0dfb13c9975f3',
 'in_folders': [],
 'media_type': 'application/pdf',
 'pages': [],
 'redacted_summary': '',
 'size': 654929,
 'source': 'application',
 'status': 'PROCESSING',
 'step_status': {'FORM_EXTRACTION': {'error': None,
                                     'modified_at': datetime.datetime(2024, 10, 3, 9, 26, 37, tzinfo=TzInfo(UTC)),
                                     'response': {'_id': '66fe6295eb87303bc52bbac7',
                                                  'category': '',
                                                  'created_at': '2024-10-03T09:23:33Z',
                                

## Workflow management

### 1. Getting all workflows

In [10]:
from workflows.service import WorkflowService
from workflows.models import WorkflowRequest


# Configure a workflow-service client from the same .env file.
workflow_configs = LoadConfigurations(
    env_file_path=ENV_FILE_PATH,
).set_config(service=ServiceType.WORKFLOWS)
workflow_operation = WorkflowService(configs=workflow_configs)

print("GETTING ALL WORKFLOWS")

workflow_response = workflow_operation.get_all_workflows()

# Keep only the workflow names for a compact listing.
workflows = [entry.name for entry in workflow_response.workflows]

pprint(workflows)

[32m2024-10-03 01:57:02.866[0m | [1mINFO    [0m | [36mconfig_models[0m:[36mset_config[0m:[36m100[0m - [1mConfig set.[0m


GETTING ALL WORKFLOWS
['dagtasktest',
 'dagtest',
 'download_from_connector',
 'extract_form_and_compliance',
 'process_brain_document',
 'process_compliance_exclude_entity',
 'process_custom_queries',
 'process_document',
 'process_document_create_decision_tree',
 'process_document_multi_forms',
 'process_document_sensors',
 'process_document_summary_compliance',
 'process_document_summary_generation',
 'process_document_tabular_data',
 'process_form_data_cleanup',
 'process_form_workflow',
 'run_agent']


### 2. Running specific workflows

#### Process document workflow

In [30]:
print("RUNNING 'process_document' WORKFLOW")

# Request for the uploaded document; this workflow is sent an empty data payload.
running_workflow_data = WorkflowRequest(
    doc_id=document_id,
    data={},
)

# Start the workflow; the response carries the run id used for status polling.
process_document_response = workflow_operation.run_workflow(
    workflow_name="process_document",
    data=running_workflow_data,
)

pprint(process_document_response.model_dump())

print(f"RUN ID: {process_document_response.run_id}")

RUNNING 'process_document' WORKFLOW
{'created_at': '2024-10-03T09:29:34.000000+00:00',
 'document_id': '66fe5c58b1d0dfb13c9975f3',
 'document_name': 'AAPL_10Q.pdf',
 'end_date': None,
 'in_folders': [],
 'run_id': '66fe5c58b1d0dfb13c9975f3_c163fcea-d479-4b36-a8a4-ad82ba5268fe',
 'start_date': None,
 'state': None,
 'workflow_id': 'process_document'}
RUN ID: 66fe5c58b1d0dfb13c9975f3_c163fcea-d479-4b36-a8a4-ad82ba5268fe


#### The cell below polls the workflow status API until the workflow reaches a final state (`success` or `failed`).

In [17]:
import time
import random

# Upper bound on how long this cell may keep polling before giving up, so an
# unexpected or stuck status cannot hang the kernel forever.
POLL_TIMEOUT_SECONDS = 30 * 60
# Statuses this notebook treats as final.
TERMINAL_STATUSES = ("success", "failed")

poll_deadline = time.monotonic() + POLL_TIMEOUT_SECONDS
finished = False

while not finished:
    # Wait a random 5-10 seconds between status requests.
    time.sleep(random.randint(5, 10))
    workflow_status_response = workflow_operation.get_workflow_status(
        show_internal_steps=False,
        workflow_id=process_document_response.workflow_id,
        workflow_run_id=process_document_response.run_id,
    )
    current_status = workflow_status_response.status
    print(f"Processing status: {current_status}")
    finished = current_status in TERMINAL_STATUSES
    if not finished and time.monotonic() >= poll_deadline:
        raise TimeoutError(
            f"Workflow run {process_document_response.run_id} did not reach a final "
            f"status within {POLL_TIMEOUT_SECONDS} seconds (last status: {current_status})"
        )

# Show the final status payload, including the per-task summary.
pprint(workflow_status_response.model_dump())

Processing status: success
{'document_id': '66fe5c58b1d0dfb13c9975f3',
 'end_date': datetime.datetime(2024, 10, 3, 9, 8, 23, 126762, tzinfo=TzInfo(UTC)),
 'start_date': datetime.datetime(2024, 10, 3, 8, 58, 42, 785002, tzinfo=TzInfo(UTC)),
 'status': 'success',
 'tasks': [{'end_date': datetime.datetime(2024, 10, 3, 9, 6, 24, 898440, tzinfo=TzInfo(UTC)),
            'failed_task_ids': [],
            'name': 'form_extraction',
            'start_date': datetime.datetime(2024, 10, 3, 9, 6, 12, 275298, tzinfo=TzInfo(UTC)),
            'status': 'success',
            'task_status_summary': {'failed': 0,
                                    'queued': 0,
                                    'running': 0,
                                    'skipped': 0,
                                    'success': 1}},
           {'end_date': datetime.datetime(2024, 10, 3, 9, 5, 36, 890536, tzinfo=TzInfo(UTC)),
            'failed_task_ids': [],
            'name': 'vectorize_page',
            'start_date'

##### Form processing can be run only after the `process_document` workflow has finished.

##### First, we need to create a form definition so that form extraction can be run.

In [19]:
from documents.service import FormOperations
from documents.models import CreateFormRequest

# Client for form operations, configured for the document service.
configs = LoadConfigurations(
    env_file_path=ENV_FILE_PATH,
).set_config(service=ServiceType.DOCUMENT)
form_operation = FormOperations(configs=configs)

# Field list for the form definition: a single array-valued Number field.
fields_data = [
    {
        "name": "Total Sales",
        "field_type": "Number",
        "description": "Net Sales for the quarter",
        "is_array": True,
        "fill_by_search": False,
    },
]

# The form is created under the category read from the uploaded document.
body = CreateFormRequest(
    name="APPLE FORM DEFINITION 2",
    description="Apple form definition 2",
    category=document_category,
    fields=fields_data,
    is_shared=True,
    is_searchable=True,
)
form_create_response = form_operation.create_form(form_data=body)

[32m2024-10-03 02:23:32.797[0m | [1mINFO    [0m | [36mconfig_models[0m:[36mset_config[0m:[36m100[0m - [1mConfig set.[0m


In [23]:
print(f"Form ID: {form_create_response.id}")

# Keep the id of the form definition just created; the process_form_workflow
# cell sends it as `form_id` in the request data.
form_id = form_create_response.id

Form ID: 66fe6295eb87303bc52bbac7


#### Process form workflow

In [24]:
print("RUNNING 'process_form_workflow' WORKFLOW")

# The request pairs the uploaded document with the form definition to extract.
running_workflow_data = WorkflowRequest(
    doc_id=document_id,
    data={"form_id": form_id},
)

process_form_response = workflow_operation.run_workflow(
    workflow_name="process_form_workflow",
    data=running_workflow_data,
)

pprint(process_form_response.model_dump())

print(f"FORM EXTRACTION RUN ID: {process_form_response.run_id}")

# Identifiers consumed by the status-polling cell that follows.
workflow_id = process_form_response.workflow_id
run_id = process_form_response.run_id

RUNNING 'process_form_workflow' WORKFLOW
{'created_at': '2024-10-03T09:25:46.000000+00:00',
 'document_id': '66fe5c58b1d0dfb13c9975f3',
 'document_name': 'AAPL_10Q.pdf',
 'end_date': None,
 'in_folders': [],
 'run_id': '66fe5c58b1d0dfb13c9975f3_833c2300-7fbe-42e4-8e6b-7b885fa80a07',
 'start_date': None,
 'state': None,
 'workflow_id': 'process_form_workflow'}
FORM EXTRACTION RUN ID: 66fe5c58b1d0dfb13c9975f3_833c2300-7fbe-42e4-8e6b-7b885fa80a07



### Get workflow status

In [25]:
import time
import random

# Upper bound on how long this cell may keep polling before giving up, so an
# unexpected or stuck status cannot hang the kernel forever.
POLL_TIMEOUT_SECONDS = 30 * 60
# Statuses this notebook treats as final.
TERMINAL_STATUSES = ("success", "failed")

poll_deadline = time.monotonic() + POLL_TIMEOUT_SECONDS
finished = False

while not finished:
    # Wait a random 5-10 seconds between status requests.
    time.sleep(random.randint(5, 10))
    workflow_status_response = workflow_operation.get_workflow_status(
        show_internal_steps=False,
        workflow_id=workflow_id,
        workflow_run_id=run_id,
    )
    current_status = workflow_status_response.status
    print(f"Processing status: {current_status}")
    finished = current_status in TERMINAL_STATUSES
    if not finished and time.monotonic() >= poll_deadline:
        raise TimeoutError(
            f"Workflow run {run_id} did not reach a final status within "
            f"{POLL_TIMEOUT_SECONDS} seconds (last status: {current_status})"
        )

# Show the final status payload, including the per-task summary.
pprint(workflow_status_response.model_dump())

Processing status: success
{'document_id': '66fe5c58b1d0dfb13c9975f3',
 'end_date': datetime.datetime(2024, 10, 3, 9, 26, 42, 419716, tzinfo=TzInfo(UTC)),
 'start_date': datetime.datetime(2024, 10, 3, 9, 25, 46, 190941, tzinfo=TzInfo(UTC)),
 'status': 'success',
 'tasks': [{'end_date': datetime.datetime(2024, 10, 3, 9, 26, 35, 407564, tzinfo=TzInfo(UTC)),
            'failed_task_ids': [],
            'name': 'extract_form_values_normal',
            'start_date': datetime.datetime(2024, 10, 3, 9, 26, 16, 324496, tzinfo=TzInfo(UTC)),
            'status': 'success',
            'task_status_summary': {'failed': 0,
                                    'queued': 0,
                                    'running': 0,
                                    'skipped': 0,
                                    'success': 1}},
           {'end_date': datetime.datetime(2024, 10, 3, 9, 25, 48, 124827, tzinfo=TzInfo(UTC)),
            'failed_task_ids': [],
            'name': 'set_processing_to_in_state

### Get single workflow

In [26]:
# Fetch the definition of one workflow by name (internal steps hidden) and
# print its model dump, which lists each task with its downstream tasks.
single_workflow_response = workflow_operation.get_single_workflow(
    show_internal_steps=False,
    workflow_name="process_form_workflow",
)
pprint(single_workflow_response.model_dump())

{'name': 'process_form_workflow',
 'params': [],
 'tasks': [{'downstream_tasks': ['extract_form_values_normal',
                                 'extract_form_values_list'],
            'is_active': True,
            'name': 'get_document'},
           {'downstream_tasks': ['update_doc'],
            'is_active': True,
            'name': 'extract_by_search'},
           {'downstream_tasks': ['update_doc'],
            'is_active': True,
            'name': 'extract_form_values_list'},
           {'downstream_tasks': ['update_doc'],
            'is_active': True,
            'name': 'extract_form_values_normal'},
           {'downstream_tasks': ['set_processing_to_failed_state',
                                 'set_processing_to_in_state__1'],
            'is_active': True,
            'name': 'update_doc'},
           {'downstream_tasks': [],
            'is_active': True,
            'name': 'run_next_workflow'},
           {'downstream_tasks': ['run_next_workflow'],
            'is

### Rerun failed workflow (Optional)

In [None]:
# Build the rerun request explicitly instead of reusing `running_workflow_data`:
# that name is assigned both by the process_document cell (empty data) and by
# the process_form_workflow cell (form_id payload), so its value depends on
# which of those cells was executed last.
rerun_workflow_data = WorkflowRequest(doc_id=document_id, data={"form_id": form_id})

rerun_workflow = workflow_operation.rerun_workflow(
    workflow_name="process_form_workflow",
    data=rerun_workflow_data,
)
pprint(rerun_workflow.model_dump())

{'created_at': '2024-10-03T04:01:26.000000+00:00',
 'document_id': '66fe11c5927ce8c0ebda42a3',
 'document_name': 'MCS-CS-Handbook-2022-2023Publish.pdf',
 'end_date': None,
 'in_folders': [],
 'run_id': '66fe11c5927ce8c0ebda42a3_c0d7d1fd-70be-4aba-a2bf-1bc1b960392c',
 'start_date': None,
 'state': None,
 'workflow_id': 'process_form_workflow'}


In [None]:
# Single status request for the rerun (no polling loop in this cell); the
# workflow and run ids come from the rerun response.
workflow_status_response = workflow_operation.get_workflow_status(
    show_internal_steps=False,
    workflow_id=rerun_workflow.workflow_id,
    workflow_run_id=rerun_workflow.run_id,
)
pprint(workflow_status_response.model_dump())

{'document_id': '66fe11c5927ce8c0ebda42a3',
 'end_date': None,
 'start_date': datetime.datetime(2024, 10, 3, 3, 58, 9, 22120, tzinfo=TzInfo(UTC)),
 'status': 'running',
 'tasks': [{'end_date': datetime.datetime(2024, 10, 3, 3, 58, 58, 918824, tzinfo=TzInfo(UTC)),
            'failed_task_ids': [-1],
            'name': 'set_processing_to_in_state__1',
            'start_date': datetime.datetime(2024, 10, 3, 3, 58, 58, 918824, tzinfo=TzInfo(UTC)),
            'status': 'failed',
            'task_status_summary': {'failed': 1,
                                    'queued': 0,
                                    'running': 0,
                                    'skipped': 0,
                                    'success': 0}},
           {'end_date': None,
            'failed_task_ids': [],
            'name': 'extract_form_values_list',
            'start_date': None,
            'status': 'queued',
            'task_status_summary': {'failed': 0,
                                    'queu

### Get form

In [None]:
from documents.service import FormOperations
from documents.models import FilterFormInstanceRequest

# Client for form operations, configured for the document service.
configs = LoadConfigurations(
    env_file_path=ENV_FILE_PATH,
).set_config(service=ServiceType.DOCUMENT)
form_operation = FormOperations(configs=configs)

# Filter form instances for this document, using the "all_documents" scope.
body = FilterFormInstanceRequest(scope="all_documents", doc_id=document_id)

# NOTE: despite its name, `form_create_response` holds the filter result here,
# already converted to a plain dict by model_dump(); the following cell
# indexes into it.
form_create_response = form_operation.filter_form_instances(
    form_data=body,
).model_dump()

pprint(form_create_response)

In [28]:
# Take the form id of the first returned form instance. Fail with an explicit
# message when the filter returned no instances, instead of an opaque
# IndexError/TypeError/KeyError from the chained subscripts.
form_instances = form_create_response.get("form_instances") or []
if not form_instances:
    raise ValueError(f"No form instances found for document {document_id}")
form_id = form_instances[0]["form_id"]
form_id

'66fe234870dd6d497d9b8ba5'

### View form definition

In [29]:
# Fetch the form definition identified by `form_id` and print its model dump.
form_definition_response = form_operation.get_form_definition(form_id=form_id)
pprint(form_definition_response.model_dump())

{'category': 'UNITED STATES SECURITIES AND EXCHANGE COMMISSION',
 'created_at': '2024-10-03T04:53:28Z',
 'description': 'Apple form',
 'fields': [{'description': 'Net Sales for the quarter',
             'field_type': 'Number',
             'fill_by_search': False,
             'is_array': True,
             'name': 'Net Sales'},
            {'description': '',
             'field_type': 'Number',
             'fill_by_search': False,
             'is_array': False,
             'name': 'Total sales'}],
 'id': '66fe234870dd6d497d9b8ba5',
 'is_searchable': False,
 'is_shared': False,
 'name': 'APPLE FORM',
 'user_id': 'google-oauth2|117349365869611297391'}
