# Agentic NER Pipeline

## Initial Imports

In [1]:
# Warning control
import warnings
warnings.filterwarnings('ignore')

# Load environment variables
from helper import load_env
load_env()

import os
import yaml
import json

from pydantic import BaseModel, Field
from typing import List

from crewai import Agent, Task, Crew

## Define the OpenAI Model

In [2]:
os.environ['OPENAI_MODEL_NAME'] = 'gpt-4o-mini'

## Loading Tasks and Agents YAML files

In [3]:
# Define file paths for YAML configurations
files = {
    'lead_agents': 'config/ner_annotation_agents.yaml',
    'lead_tasks': 'config/ner_annotation_tasks.yaml',
}

# Load configurations from YAML files
configs = {}
for config_type, file_path in files.items():
    with open(file_path, 'r') as file:
        configs[config_type] = yaml.safe_load(file)

# Assign loaded configurations to specific variables
ner_agents_config = configs['lead_agents']
ner_tasks_config = configs['lead_tasks']

## Create Pydantic Models for Structured Output

In [4]:
class Entity(BaseModel):
    text: str = Field(description="the exact text substring that was recognized as an entity")
    labels: List[str] = Field(description="A list of valid entities associated with the text substring.")

class EntitiesResult(BaseModel):
    # summary: List[str] = Field("list of topics for summarizing the received text")
    entities: List[Entity] = Field(description="A list of entities containing the identified text substring and a list of labels associated with the substring.")

## Importing Tools

In [5]:
from crewai_tools import FileReadTool
read_entities = FileReadTool(file_path='config/entities.md')

## NER Crew, Agents and Tasks

In [6]:
# Creating Agents
ner_annotator_agent = Agent(
  config=ner_agents_config['ner_annotator_specialist'],
  tools=[read_entities]
)

ner_reviewer_agent = Agent(
  config=ner_agents_config['senior_ner_reviewer_specialist'],
  tools=[read_entities]
)

# Creating Tasks
ner_annotation_task = Task(
  config=ner_tasks_config['ner_annotation'],
  agent=ner_annotator_agent,
  output_pydantic=EntitiesResult
)

ner_review_task = Task(
  config=ner_tasks_config['ner_review_and_consolidation'],
  agent=ner_reviewer_agent,
  context=[ner_annotation_task],
  output_pydantic=EntitiesResult
)

# Creating Crew
ner_annotation_crew = Crew(
  agents=[
    ner_annotator_agent,
    ner_reviewer_agent
  ],
  tasks=[
    ner_annotation_task,
    ner_review_task
  ],
  verbose=True
)

## Running the Crew

- Set the input for the execution of the crew.

In [7]:
sentence = '''
Neosaldina é um medicamento com atividade analgésica (diminui a dor) e antiespasmódica (diminui contração involuntária) indicado para o tratamento de diversos tipos de dor de cabeça, incluindo enxaquecas ou para o tratamento de cólicas.
'''

In [8]:
result = ner_annotation_crew.kickoff(inputs={'sentence': sentence})

[1m[95m# Agent:[00m [1m[92mNamed Entity Recognition (NER) Annotator Specialist[00m
[95m## Task:[00m [92mAnnotate clinical sentences in Portuguese by identifying and tagging entities belonging to the assigned cluster of tags. Use the descriptions and examples of the entities as a guide to ensure precise and consistent annotation. Utilize tools to know more about the entities to detect and a use the description to recognize the entities. - text to be analyzed: 
Neosaldina é um medicamento com atividade analgésica (diminui a dor) e antiespasmódica (diminui contração involuntária) indicado para o tratamento de diversos tipos de dor de cabeça, incluindo enxaquecas ou para o tratamento de cólicas.

[00m


[1m[95m# Agent:[00m [1m[92mNamed Entity Recognition (NER) Annotator Specialist[00m
[95m## Thought:[00m [92mI need to gather information about the specific entity tags for clinical annotations. This will help me accurately identify and tag the entities in the provided text

## Usage Metrics and Costs

In [9]:
import pandas as pd

costs = 0.150 * (ner_annotation_crew.usage_metrics.prompt_tokens + ner_annotation_crew.usage_metrics.completion_tokens) / 1_000_000
print(f"Total costs: ${costs:.4f}")

# Convert UsageMetrics instance to a DataFrame
df_usage_metrics = pd.DataFrame([ner_annotation_crew.usage_metrics.dict()])
df_usage_metrics

Total costs: $0.0011


Unnamed: 0,total_tokens,prompt_tokens,cached_prompt_tokens,completion_tokens,successful_requests
0,7435,6547,0,888,4


## Results

In [10]:
result.dict()

{'raw': '{\n  "entities": [\n    {\n      "text": "Neosaldina",\n      "labels": ["Drug_Brand_Name"]\n    },\n    {\n      "text": "enxaquecas",\n      "labels": ["Disease_Syndrome_Disorder"]\n    },\n    {\n      "text": "cólicas",\n      "labels": ["Disease_Syndrome_Disorder"]\n    }\n  ]\n}',
 'pydantic': {},
 'json_dict': None,
 'tasks_output': [{'description': 'Annotate clinical sentences in Portuguese by identifying and tagging entities belonging to the assigned cluster of tags. Use the descriptions and examples of the entities as a guide to ensure precise and consistent annotation. Utilize tools to know more about the entities to detect and a use the description to recognize the entities. - text to be analyzed: \nNeosaldina é um medicamento com atividade analgésica (diminui a dor) e antiespasmódica (diminui contração involuntária) indicado para o tratamento de diversos tipos de dor de cabeça, incluindo enxaquecas ou para o tratamento de cólicas.\n\n',
   'name': None,
   'expect

In [11]:
json.loads(result.raw)

{'entities': [{'text': 'Neosaldina', 'labels': ['Drug_Brand_Name']},
  {'text': 'enxaquecas', 'labels': ['Disease_Syndrome_Disorder']},
  {'text': 'cólicas', 'labels': ['Disease_Syndrome_Disorder']}]}