<td>
   <a target="_blank" href="https://labelbox.com" ><img src="https://labelbox.com/blog/content/images/2021/02/logo-v4.svg" width=256/></a>
</td>

<td>
<a href="https://colab.research.google.com/github/Labelbox/labelbox-python/blob/master/examples/annotation_import/pdf.ipynb" target="_blank"><img
src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>
</td>

<td>
<a href="https://github.com/Labelbox/labelbox-python/tree/master/examples/annotation_import/pdf.ipynb" target="_blank"><img
src="https://img.shields.io/badge/GitHub-100000?logo=github&logoColor=white" alt="GitHub"></a>
</td>

# PDF Prediction Import

Supported predictions for PDF assets

Python Annotation Types 
- Checklist classification 
- Radio classification 
- Free text classifications 
- Entities 

NDJson 
- Checklist classification (including nested classifications)
- Radio classifications (including nested classifications)
- Free text classifications 
- Bounding box 
- Entities

### Setup

In [None]:
!pip install -q 'labelbox[data]'

In [None]:
import os
import uuid

import labelbox as lb
import labelbox.types as lb_types
from labelbox.schema.queue_mode import QueueMode

### Replace with your API key 
Guides on how to create an API key : https://docs.labelbox.com/docs/create-an-api-key

In [None]:
# Add your api key, or leave API_KEY empty to read it from the
# LABELBOX_API_KEY environment variable (keeps the secret out of the saved notebook).
API_KEY = ""
if not API_KEY:
    API_KEY = os.environ.get("LABELBOX_API_KEY", "")
client = lb.Client(api_key=API_KEY)

### Supported Predictions

In [None]:
########## Entity ##########

# Python annotation type: an entity prediction on page 1.
# The empty token_ids / group_id are placeholders; a later cell rebuilds this
# object with the real ids read from the text layer.
entities_prediction = lb_types.ObjectAnnotation(
    name="named_entity",
    confidence=0.5,
    value=lb_types.DocumentEntity(
        name="named_entity",
        textSelections=[
            lb_types.DocumentTextSelection(token_ids=[], group_id="", page=1),
        ],
    ),
)

# NDJSON form of the same entity. The "<UUID>" placeholders are overwritten
# later with the ids read from the text layer.
entities_prediction_ndjson = {
    "name": "named_entity",
    # "confidence": 0.5,
    "textSelections": [
        {"tokenIds": ["<UUID>"], "groupId": "<UUID>", "page": 1},
    ],
}

In [None]:
########### Radio Classification #########

# Python annotation type: one selected radio answer with its confidence score.
radio_prediction = lb_types.ClassificationAnnotation(
    name="radio_question",
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(
            name="first_radio_answer",
            confidence=0.5,
        ),
    ),
)

# NDJSON form of the same radio prediction.
radio_prediction_ndjson = {
    "name": "radio_question",
    "answer": {"name": "first_radio_answer", "confidence": 0.5},
}

In [None]:
############ Checklist Classification ###########

# Python annotation type: two checked answers, each with its own confidence.
checklist_prediction = lb_types.ClassificationAnnotation(
    name="checklist_question",
    value=lb_types.Checklist(
        answer=[
            lb_types.ClassificationAnswer(
                name="first_checklist_answer",
                confidence=0.5,
            ),
            lb_types.ClassificationAnswer(
                name="second_checklist_answer",
                confidence=0.5,
            ),
        ],
    ),
)

# NDJSON form of the same checklist prediction.
checklist_prediction_ndjson = {
    "name": "checklist_question",
    "answer": [
        {"name": "first_checklist_answer", "confidence": 0.5},
        {"name": "second_checklist_answer", "confidence": 0.5},
    ],
}

In [None]:
############ Bounding Box ###########

# Python annotation type. `page` is 0 so that this object and the NDJSON payload
# below describe the same box on the same page (previously page=1 here versus
# 'page': 0 in the NDJSON).
# NOTE(review): assumes the NDJSON value (0) is the intended page index for
# bounding boxes — confirm against the Labelbox document import docs.
bbox_prediction = lb_types.ObjectAnnotation(
    name="bounding_box",  # must match your ontology feature's name
    confidence=0.5,
    value=lb_types.DocumentRectangle(
        start=lb_types.Point(x=86.498, y=42.799),  # x = left, y = top
        end=lb_types.Point(x=389.693, y=184.71),  # x = left + width, y = top + height
        page=0,
        unit=lb_types.RectangleUnit.POINTS
        )
    )

# NDJSON form: the same rectangle expressed as top/left/height/width.
bbox_prediction_ndjson = {
  'name': 'bounding_box',
  "confidence" : 0.5,
  'bbox': {
          "top": 42.799,
          "left": 86.498,
          "height": 141.911,
          "width": 303.195
      },
  'page': 0,
  'unit': "POINTS"
}

In [None]:
# ############ nested classifications ###########

# Checklist question whose first answer carries a nested sub-question.
nested_checklist_prediction_ndjson = {
    "name": "nested_checklist_question",
    "answer": [
        {
            "name": "first_checklist_answer",
            "confidence": 0.5,
            "classifications": [
                {
                    "name": "sub_checklist_question",
                    "answer": {
                        "name": "first_sub_checklist_answer",
                        "confidence": 0.5,
                    },
                },
            ],
        },
    ],
}

# Radio question whose selected answer carries a nested sub-question.
nested_radio_prediction_ndjson = {
    "name": "nested_radio_question",
    "answer": {
        "name": "first_radio_answer",
        "confidence": 0.5,
        "classifications": [
            {
                "name": "sub_radio_question",
                "answer": {
                    "name": "first_sub_radio_answer",
                    "confidence": 0.5,
                },
            },
        ],
    },
}

In [None]:
############## Classification Free-form text ##############
# Confidence scores are not supported for Text classifications

# Python annotation type: free-form text answer.
text_prediction = lb_types.ClassificationAnnotation(
    name="free_text",  # must match your ontology feature's name
    value=lb_types.Text(answer="sample text"),
)

# NDJSON form of the same text prediction.
text_prediction_ndjson = {
    "name": "free_text",
    "answer": "sample text",
}

In [None]:
############ NER with nested classifications ########

# Entity prediction carrying a nested checklist classification.
# The "<UUID>" placeholders are overwritten later with ids from the text layer.
ner_with_checklist_subclass_prediction_ndjson = {
    "name": "ner_with_checklist_subclass",
    "classifications": [
        {
            "name": "sub_checklist_question",
            "confidence": 0.5,
            "answer": [
                {"name": "first_sub_checklist_answer", "confidence": 0.5},
            ],
        },
    ],
    "textSelections": [
        {"tokenIds": ["<UUID>"], "groupId": "<UUID>", "page": 1},
    ],
}

In [None]:
######### BBOX with nested classifications #########

# Python annotation type: a bounding box on page 1 with a radio sub-question,
# whose selected answer itself carries a second radio sub-question.
bbox_with_radio_subclass_prediction = lb_types.ObjectAnnotation(
    name="bbox_with_radio_subclass",
    confidence=0.5,
    value=lb_types.DocumentRectangle(
        start=lb_types.Point(x=189.215, y=214.894),  # x = left, y = top
        end=lb_types.Point(x=429.788, y=478.894),  # x = left + width, y = top + height
        unit="POINTS",
        page=1,
    ),
    classifications=[
        lb_types.ClassificationAnnotation(
            name="sub_radio_question",
            value=lb_types.Radio(
                answer=lb_types.ClassificationAnswer(
                    name="first_sub_radio_answer",
                    confidence=0.5,
                    classifications=[
                        lb_types.ClassificationAnnotation(
                            name="second_sub_radio_question",
                            value=lb_types.Radio(
                                answer=lb_types.ClassificationAnswer(
                                    name="second_sub_radio_answer",
                                    confidence=0.5,
                                ),
                            ),
                        ),
                    ],
                ),
            ),
        ),
    ],
)

# NDJSON form: same rectangle as top/left/height/width, same nested answers.
bbox_with_radio_subclass_prediction_ndjson = {
    "name": "bbox_with_radio_subclass",
    "classifications": [
        {
            "name": "sub_radio_question",
            "answer": {
                "name": "first_sub_radio_answer",
                "confidence": 0.5,
                "classifications": [
                    {
                        "name": "second_sub_radio_question",
                        "answer": {
                            "name": "second_sub_radio_answer",
                            "confidence": 0.5,
                        },
                    },
                ],
            },
        },
    ],
    "bbox": {
        "top": 214.894,
        "left": 189.215,
        "height": 264,
        "width": 240.573,
    },
    "page": 1,
    "unit": "POINTS",
}

## Step 1: Import data rows into Catalog 


In [None]:
## Text layer url is required for uploading entity annotations
# A per-run suffix keeps the global key from colliding with a data row created
# by an earlier run; a repeated key makes the import fail with
# "Duplicate global keys found".
global_key = "0801.3483.pdf-" + str(uuid.uuid4())
img_url = {
    "row_data": {
      "pdf_url": "https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483.pdf",
      "text_layer_url": "https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483-lb-textlayer.json"
    },
    "global_key": global_key
}


# Create the dataset, import the single PDF data row and wait for the task.
dataset = client.create_dataset(name="pdf_demo_dataset")
task = dataset.create_data_rows([img_url])
task.wait_till_done()
print("Errors:",task.errors)
print("Failed data rows:", task.failed_data_rows)

There are errors present. Please look at `task.errors` for more details


Errors: Duplicate global keys found: 0801.3483.pdf
Failed data rows: [{'message': 'Duplicate global keys found: 0801.3483.pdf', 'failedDataRows': [{'globalKey': '0801.3483.pdf', 'rowData': 'https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483.pdf', 'attachmentInputs': [], 'mediaAttributes': {'textLayerUrl': 'https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483-lb-textlayer.json'}}]}]


## Step 2: Create/select an Ontology for your project

In [None]:
## Build an ontology whose feature names match the predictions defined above.

ontology_builder = lb.OntologyBuilder(
    # Global classifications
    classifications=[
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="radio_question",
            scope=lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(value="first_radio_answer"),
                lb.Option(value="second_radio_answer"),
            ],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.CHECKLIST,
            name="checklist_question",
            scope=lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(value="first_checklist_answer"),
                lb.Option(value="second_checklist_answer"),
            ],
        ),
        lb.Classification(
            class_type=lb.Classification.Type.TEXT,
            name="free_text",
            scope=lb.Classification.Scope.GLOBAL,
        ),
        # Radio question with a nested radio sub-question
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="nested_radio_question",
            scope=lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(
                    "first_radio_answer",
                    options=[
                        lb.Classification(
                            class_type=lb.Classification.Type.RADIO,
                            name="sub_radio_question",
                            options=[lb.Option("first_sub_radio_answer")],
                        ),
                    ],
                ),
            ],
        ),
        # Checklist question with a nested checklist sub-question
        lb.Classification(
            class_type=lb.Classification.Type.CHECKLIST,
            name="nested_checklist_question",
            scope=lb.Classification.Scope.GLOBAL,
            options=[
                lb.Option(
                    "first_checklist_answer",
                    options=[
                        lb.Classification(
                            class_type=lb.Classification.Type.CHECKLIST,
                            name="sub_checklist_question",
                            options=[lb.Option("first_sub_checklist_answer")],
                        ),
                    ],
                ),
            ],
        ),
    ],
    # Object tools
    tools=[
        lb.Tool(tool=lb.Tool.Type.BBOX, name="bounding_box"),
        lb.Tool(tool=lb.Tool.Type.NER, name="named_entity"),
        lb.Tool(
            tool=lb.Tool.Type.NER,
            name="ner_with_checklist_subclass",
            classifications=[
                lb.Classification(
                    class_type=lb.Classification.Type.CHECKLIST,
                    name="sub_checklist_question",
                    options=[
                        lb.Option(value="first_sub_checklist_answer"),
                    ],
                ),
            ],
        ),
        lb.Tool(
            tool=lb.Tool.Type.BBOX,
            name="bbox_with_radio_subclass",
            classifications=[
                lb.Classification(
                    class_type=lb.Classification.Type.RADIO,
                    name="sub_radio_question",
                    options=[
                        lb.Option(
                            value="first_sub_radio_answer",
                            options=[
                                lb.Classification(
                                    class_type=lb.Classification.Type.RADIO,
                                    name='second_sub_radio_question',
                                    options=[lb.Option("second_sub_radio_answer")],
                                ),
                            ],
                        ),
                    ],
                ),
            ],
        ),
    ],
)

# Register the ontology for document (PDF) assets.
ontology = client.create_ontology(
    "Document Annotation Import Demo",
    ontology_builder.asdict(),
    media_type=lb.MediaType.Document,
)

## Step 3: Create a Model and Model Run

In [None]:
# Create a Model tied to the ontology; the uuid suffix gives each run its own name.
model = client.create_model(
    name=f"pdf_model_run_{uuid.uuid4()}",
    ontology_id=ontology.uid,
)
# Create a Model Run on that Model
model_run = model.create_model_run("iteration 1")

## Step 4: Send data rows to the Model Run

In [None]:
# Attach the imported data row (looked up by its global key) to the Model Run.
model_run.upsert_data_rows(
    global_keys=[global_key],
)

True

## Step 5: Create the predictions payload
Create the prediction payload using the snippets of code in ***Supported Predictions*** section. 

The resulting `label_ndjson` should have exactly the same content as the Python annotation payload for predictions that are supported by both formats (with the exception of the generated uuid strings).

First, populate the text selections in the entity predictions.

In [None]:
import requests
import json

# Helper method
def update_text_selections(annotation, group_id, list_tokens, page):
  """Overwrite ``annotation['textSelections']`` with a single selection.

  Args:
    annotation: NDJSON annotation dict, modified in place.
    group_id: id of the text-layer group (stored under ``groupId``).
    list_tokens: ids of the tokens in that group (stored under ``tokenIds``).
    page: page number stored under ``page``.

  Returns:
    None; the annotation is changed in place.
  """
  selection = {
    'groupId': group_id,
    'tokenIds': list_tokens,
    'page': page
  }
  annotation['textSelections'] = [selection]
  return None
  

text_layer = "https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483-lb-textlayer.json"

# Fetch the content of the text layer
res = requests.get(text_layer)
# Fail fast on an HTTP error instead of hitting a confusing JSON parse error below.
res.raise_for_status()


# Phrases that we want to annotate, obtained from the text layer url
content_phrases = ["Metal-insulator (MI) transitions have been one of the" , 
                   "T. Sasaki,* N. Yoneyama, and N. Kobayashi", 
                   "Organic charge transfer salts based on the donor",
                   "the experimental investigations on this issue have not"]

# Parse the text layer
text_selections = []
text_selections_source = []
text_selections_target = []

# Walk every group in the text layer and pick up the ids of the groups whose
# content matches the first two phrases.
for obj in json.loads(res.text):
  for group in obj['groups']:
    if group['content'] == content_phrases[0]:
      list_tokens = [x['id'] for x in group['tokens']]
      # build text selections for Python Annotation Types
      document_text_selection = lb_types.DocumentTextSelection(groupId=group['id'], tokenIds=list_tokens, page=1)
      text_selections.append(document_text_selection)
      # build text selection for the NDJson annotations
      update_text_selections(annotation=entities_prediction_ndjson,
                             group_id=group['id'], # id representing group of words 
                             list_tokens=list_tokens, # ids representing individual words from the group
                             page=1)
    if group['content'] == content_phrases[1]:
      list_tokens_2 = [x['id'] for x in group['tokens']]
      update_text_selections(annotation=ner_with_checklist_subclass_prediction_ndjson,
                             group_id=group['id'], # id representing group of words 
                             list_tokens=list_tokens_2, # ids representing individual words from the group
                             page=1)

# re-write the entity prediction with the real text selections (python annotation types)
entities_prediction_document_entity = lb_types.DocumentEntity(name="named_entity", 
                                          textSelections = text_selections)
entities_prediction = lb_types.ObjectAnnotation(name="named_entity",
                                                value=entities_prediction_document_entity)

print(f"entities_prediction_ndjson={entities_prediction_ndjson}")
print(f"entities_prediction={entities_prediction}")
print(f"nested_entities_annotation={ner_with_checklist_subclass_prediction_ndjson}")


entities_prediction_ndjson={'name': 'named_entity', 'confidence': 0.5, 'textSelections': [{'groupId': '2f4336f4-a07e-4e0a-a9e1-5629b03b719b', 'tokenIds': ['3f984bf3-1d61-44f5-b59a-9658a2e3440f', '3bf00b56-ff12-4e52-8cc1-08dbddb3c3b8', '6e1c3420-d4b7-4c5a-8fd6-ead43bf73d80', '87a43d32-af76-4a1d-b262-5c5f4d5ace3a', 'e8606e8a-dfd9-4c49-a635-ad5c879c75d0', '67c7c19e-4654-425d-bf17-2adb8cf02c30', '149c5e80-3e07-49a7-ab2d-29ddfe6a38fa', 'b0e94071-2187-461e-8e76-96c58738a52c'], 'page': 1}], 'dataRow': {'globalKey': '0801.3483.pdf'}}
entities_prediction=confidence=None name='named_entity' feature_schema_id=None extra={} value=DocumentEntity(text_selections=[DocumentTextSelection(token_ids=['3f984bf3-1d61-44f5-b59a-9658a2e3440f', '3bf00b56-ff12-4e52-8cc1-08dbddb3c3b8', '6e1c3420-d4b7-4c5a-8fd6-ead43bf73d80', '87a43d32-af76-4a1d-b262-5c5f4d5ace3a', 'e8606e8a-dfd9-4c49-a635-ad5c879c75d0', '67c7c19e-4654-425d-bf17-2adb8cf02c30', '149c5e80-3e07-49a7-ab2d-29ddfe6a38fa', 'b0e94071-2187-461e-8e76-96c5

In [None]:
# Build the list of Labels: one Label holding the Python-annotation predictions
# for the PDF data row identified by its global key.
label = [
    lb_types.Label(
        data=lb_types.DocumentData(global_key=global_key),
        annotations=[
            entities_prediction,
            radio_prediction,
            checklist_prediction,
            bbox_prediction,
            bbox_with_radio_subclass_prediction,
        ],
    ),
]

If using NDJSON

In [None]:
label_prediction_ndjson = []

# Stamp every NDJSON prediction (in place) with a fresh uuid and the target
# data row, then collect it into the upload payload.
for annot in (
    entities_prediction_ndjson,
    radio_prediction_ndjson,
    checklist_prediction_ndjson,
    bbox_prediction_ndjson,
    bbox_with_radio_subclass_prediction_ndjson,
    nested_radio_prediction_ndjson,
    nested_checklist_prediction_ndjson,
):
    annot['uuid'] = str(uuid.uuid4())
    annot['dataRow'] = {'globalKey': global_key}
    label_prediction_ndjson.append(annot)

## Step 6: Upload the predictions payload to the Model Run

In [None]:
# Send the NDJSON prediction payload to the Model Run; the uuid suffix makes
# the job name different on every run.
upload_job_prediction = model_run.add_predictions(
    name=f"prediction_upload_job{uuid.uuid4()}",
    predictions=label_prediction_ndjson,
)

# Errors will appear for prediction uploads that failed.
print("Errors:",  upload_job_prediction.errors)
print("Status of uploads: ", upload_job_prediction.statuses)

Errors: [{'uuid': '20859877-8c68-4df8-a3d1-4bcc8da38326', 'dataRow': {'id': 'clfh1n29e1mh2078d3jt3b5cx', 'globalKey': '0801.3483.pdf'}, 'status': 'FAILURE', 'errors': [{'name': 'ValidationError', 'message': "{'_schema': ['Confidence is not supported for this type of annotation']}", 'additionalInfo': None}]}, {'uuid': '6d4963f2-5d0e-48e0-a6c0-7e7dd1b08329', 'dataRow': {'id': 'clfh1n29e1mh2078d3jt3b5cx', 'globalKey': '0801.3483.pdf'}, 'status': 'FAILURE', 'errors': [{'name': 'ValidationError', 'message': "{'_schema': ['Confidence is not supported for this type of annotation']}", 'additionalInfo': None}]}, {'uuid': 'b923e24d-6c30-41b4-a1b9-e61a9460585e', 'dataRow': {'id': 'clfh1n29e1mh2078d3jt3b5cx', 'globalKey': '0801.3483.pdf'}, 'status': 'FAILURE', 'errors': [{'name': 'ValidationError', 'message': "{'_schema': ['Confidence is not supported for this type of annotation']}", 'additionalInfo': None}]}]
Status of uploads:  [{'uuid': '20859877-8c68-4df8-a3d1-4bcc8da38326', 'dataRow': {'id': 

In [None]:
# Keep the per-prediction upload statuses in a variable; the bare name on the
# last line displays them as the cell output.
status_output = upload_job_prediction.statuses
status_output

[{'uuid': '20859877-8c68-4df8-a3d1-4bcc8da38326',
  'dataRow': {'id': 'clfh1n29e1mh2078d3jt3b5cx', 'globalKey': '0801.3483.pdf'},
  'status': 'FAILURE',
  'errors': [{'name': 'ValidationError',
    'message': "{'_schema': ['Confidence is not supported for this type of annotation']}",
    'additionalInfo': None}]},
 {'uuid': '30033980-9eeb-48eb-9665-259bf66f41a6',
  'dataRow': {'id': 'clfh1n29e1mh2078d3jt3b5cx', 'globalKey': '0801.3483.pdf'},
  'status': 'SUCCESS'},
 {'uuid': '83d91a66-19d1-4802-be84-3ca5cefa0ed9',
  'dataRow': {'id': 'clfh1n29e1mh2078d3jt3b5cx', 'globalKey': '0801.3483.pdf'},
  'status': 'SUCCESS'},
 {'uuid': '6d4963f2-5d0e-48e0-a6c0-7e7dd1b08329',
  'dataRow': {'id': 'clfh1n29e1mh2078d3jt3b5cx', 'globalKey': '0801.3483.pdf'},
  'status': 'FAILURE',
  'errors': [{'name': 'ValidationError',
    'message': "{'_schema': ['Confidence is not supported for this type of annotation']}",
    'additionalInfo': None}]},
 {'uuid': 'b923e24d-6c30-41b4-a1b9-e61a9460585e',
  'dataRow

In [None]:
# uuids of the predictions whose upload status is FAILURE
errors_job = [
    status['uuid']
    for status in status_output
    if status['status'] == 'FAILURE'
]
errors_job

['20859877-8c68-4df8-a3d1-4bcc8da38326',
 '6d4963f2-5d0e-48e0-a6c0-7e7dd1b08329',
 'b923e24d-6c30-41b4-a1b9-e61a9460585e']

In [None]:
# NDJSON payload entries that correspond to the failed uploads (matched by uuid)
errors_ndjson = [
    prediction
    for prediction in label_prediction_ndjson
    if prediction['uuid'] in errors_job
]
errors_ndjson

[{'name': 'named_entity',
  'confidence': 0.5,
  'textSelections': [{'groupId': '2f4336f4-a07e-4e0a-a9e1-5629b03b719b',
    'tokenIds': ['3f984bf3-1d61-44f5-b59a-9658a2e3440f',
     '3bf00b56-ff12-4e52-8cc1-08dbddb3c3b8',
     '6e1c3420-d4b7-4c5a-8fd6-ead43bf73d80',
     '87a43d32-af76-4a1d-b262-5c5f4d5ace3a',
     'e8606e8a-dfd9-4c49-a635-ad5c879c75d0',
     '67c7c19e-4654-425d-bf17-2adb8cf02c30',
     '149c5e80-3e07-49a7-ab2d-29ddfe6a38fa',
     'b0e94071-2187-461e-8e76-96c58738a52c'],
    'page': 1}],
  'dataRow': {'globalKey': '0801.3483.pdf'},
  'uuid': '20859877-8c68-4df8-a3d1-4bcc8da38326'},
 {'name': 'bounding_box',
  'confidence': 0.5,
  'bbox': {'top': 42.799, 'left': 86.498, 'height': 141.911, 'width': 303.195},
  'page': 0,
  'unit': 'POINTS',
  'dataRow': {'globalKey': '0801.3483.pdf'},
  'uuid': '6d4963f2-5d0e-48e0-a6c0-7e7dd1b08329'},
 {'name': 'bbox_with_radio_subclass',
  'classifications': [{'name': 'sub_radio_question',
    'answer': {'name': 'first_sub_radio_answer'

## Step 7: Send annotations to a model run 
To visualize both annotations and predictions in the model run we will create a project with ground truth annotations. 
To send annotations to a Model Run, we must first import them into a project, create a label payload and then send them to the Model Run.

##### 7.1 Create a labelbox project 

In [None]:
# Create a Labelbox project
# The media type is Document: the data row is a PDF and the ontology above was
# created with lb.MediaType.Document, so an Image project would not match them.
project = client.create_project(name="pdf_prediction_demo",
                                    auto_audit_percentage=1,
                                    auto_audit_number_of_labels=1,
                                    media_type=lb.MediaType.Document)
project.setup_editor(ontology)

##### 7.2 Create a batch to send to the project

In [None]:
# Queue the PDF data row in the project as a batch.
project.create_batch(
    "batch_predictions_demo",  # each batch in a project must have a unique name
    global_keys=[global_key],  # data rows to send, referenced by global key
    priority=5,  # priority between 1 (highest) and 5 (lowest)
)

##### 7.3 Create the annotations payload

In [None]:
# Create a batch-queued Document project and attach the ontology.
# This rebinds `project`, so later cells operate on this project.
project = client.create_project(
    name="PDF_annotation_demo",
    queue_mode=QueueMode.Batch,
    media_type=lb.MediaType.Document,
)
project.setup_editor(ontology)

##### 7.4 Send a batch of data rows to the project 

In [None]:
# Queue the PDF data row in the annotation project as a batch.
project.create_batch(
    "PDF_annotation_batch",  # each batch in a project must have a unique name
    global_keys=[global_key],  # data rows to send, referenced by global key
    priority=5,  # priority between 1 (highest) and 5 (lowest)
)

##### 7.5 Create the annotation payload 
Create the annotations payload using the snippets of code in Supported predictions section.

Labelbox supports NDJSON only for this data type.

The resulting label should have exactly the same content for annotations that are supported by both (with exception of the uuid strings that are generated)

First, we need to populate the text selections for Entity annotations

To learn how to generate a text layer for your documents please refer to the following repositories/files:   https://github.com/Labelbox/PDF-OCR-Transform-CLI/blob/main/src/scripts/gcloud/gcp-vision-to-lb-text-layer.py  https://github.com/Labelbox/PDF-OCR-Transform-CLI/blob/main/src/scripts/adobe/adobe-ocr-to-lb-text-layer.py 

In [None]:
import requests
import json

# Helper method
def update_text_selections(annotation, group_id, list_tokens, page):
  """Overwrite ``annotation['textSelections']`` with a single selection.

  Args:
    annotation: NDJSON annotation dict, modified in place.
    group_id: id of the text-layer group (stored under ``groupId``).
    list_tokens: ids of the tokens in that group (stored under ``tokenIds``).
    page: page number stored under ``page``.

  Returns:
    None; the annotation is changed in place.
  """
  selection = {
    'groupId': group_id,
    'tokenIds': list_tokens,
    'page': page
  }
  annotation['textSelections'] = [selection]
  return None
  

text_layer = "https://storage.googleapis.com/labelbox-datasets/arxiv-pdf/data/99-word-token-pdfs/0801.3483-lb-textlayer.json"

# Fetch the content of the text layer
res = requests.get(text_layer)
# Fail fast on an HTTP error instead of hitting a confusing JSON parse error below.
res.raise_for_status()


# Phrases that we want to annotate, obtained from the text layer url
content_phrases = ["Metal-insulator (MI) transitions have been one of the" , 
                   "T. Sasaki,* N. Yoneyama, and N. Kobayashi", 
                   "Organic charge transfer salts based on the donor",
                   "the experimental investigations on this issue have not"]

# Parse the text layer
text_selections = []
text_selections_source = []
text_selections_target = []

# Walk every group in the text layer and pick up the ids of the groups whose
# content matches the first two phrases.
for obj in json.loads(res.text):
  for group in obj['groups']:
    if group['content'] == content_phrases[0]:
      list_tokens = [x['id'] for x in group['tokens']]
      # build text selections for Python Annotation Types
      document_text_selection = lb_types.DocumentTextSelection(groupId=group['id'], tokenIds=list_tokens, page=1)
      text_selections.append(document_text_selection)
      # build text selection for the NDJson annotations
      update_text_selections(annotation=entities_prediction_ndjson,
                             group_id=group['id'], # id representing group of words 
                             list_tokens=list_tokens, # ids representing individual words from the group
                             page=1)
    if group['content'] == content_phrases[1]:
      list_tokens_2 = [x['id'] for x in group['tokens']]
      update_text_selections(annotation=ner_with_checklist_subclass_prediction_ndjson,
                             group_id=group['id'], # id representing group of words 
                             list_tokens=list_tokens_2, # ids representing individual words from the group
                             page=1)

# re-write the entity annotation with the real text selections (python annotation types)
entities_annotation_document_entity = lb_types.DocumentEntity(name="named_entity", 
                                          textSelections = text_selections)
entities_annotation = lb_types.ObjectAnnotation(name="named_entity",
                                                value=entities_annotation_document_entity)

print(f"entities_annotations_ndjson={entities_prediction_ndjson}")
print(f"entities_annotation={entities_annotation}")
print(f"nested_entities_annotation={ner_with_checklist_subclass_prediction_ndjson}")


Python annotation

Here we create the complete label payload using only the Python annotation format. There is one annotation for each reference to an annotation that we created. Note that only a handful of Python annotation types are supported for PDF documents.

In [None]:
# Ground-truth annotations for the label. Apart from `entities_annotation`
# (built in the previous cell), none of the *_annotation objects were defined
# anywhere in this notebook, so they are created here. They use the same feature
# names and values as the predictions above, without confidence scores.
radio_annotation = lb_types.ClassificationAnnotation(
    name="radio_question",
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(name="first_radio_answer")))

checklist_annotation = lb_types.ClassificationAnnotation(
    name="checklist_question",
    value=lb_types.Checklist(answer=[
        lb_types.ClassificationAnswer(name="first_checklist_answer"),
        lb_types.ClassificationAnswer(name="second_checklist_answer")
    ]))

text_annotation = lb_types.ClassificationAnnotation(
    name="free_text",
    value=lb_types.Text(answer="sample text"))

bbox_annotation = lb_types.ObjectAnnotation(
    name="bounding_box",
    value=lb_types.DocumentRectangle(
        start=lb_types.Point(x=86.498, y=42.799),  # x = left, y = top
        end=lb_types.Point(x=389.693, y=184.71),  # x = left + width, y = top + height
        page=0,  # same page as the uploaded bbox_prediction_ndjson
        unit=lb_types.RectangleUnit.POINTS))

bbox_with_radio_subclass_annotation = lb_types.ObjectAnnotation(
    name="bbox_with_radio_subclass",
    value=lb_types.DocumentRectangle(
        start=lb_types.Point(x=189.215, y=214.894),  # x = left, y = top
        end=lb_types.Point(x=429.788, y=478.894),  # x = left + width, y = top + height
        unit="POINTS",
        page=1),
    classifications=[
        lb_types.ClassificationAnnotation(
            name="sub_radio_question",
            value=lb_types.Radio(
                answer=lb_types.ClassificationAnswer(
                    name="first_sub_radio_answer",
                    classifications=[
                        lb_types.ClassificationAnnotation(
                            name="second_sub_radio_question",
                            value=lb_types.Radio(
                                answer=lb_types.ClassificationAnswer(
                                    name="second_sub_radio_answer")))
                    ])))
    ])

# The relationship annotations previously listed here (bbox_source, bbox_target,
# bbox_relationship, entity_source, entity_target, entity_relationship) are left
# out: they are not defined in this notebook and the ontology created above has
# no relationship tool.
python_labels = []

python_labels.append(
    lb_types.Label(
        data=lb_types.DocumentData(
            global_key=global_key),
        annotations = [
            entities_annotation,
            checklist_annotation, 
            text_annotation,
            radio_annotation,
            bbox_annotation,
            bbox_with_radio_subclass_annotation
        ]
  )
)