In [None]:
# Install necessary libraries
!pip install transformers
!pip install torch

# Import libraries
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load the model and tokenizer
model_name = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Create a NER pipeline
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Sample medical text
text = """The patient was diagnosed with diabetes and prescribed metformin. 
They reported frequent urination and fatigue, and later developed hypertension."""

# Run the NER pipeline
ner_results = nlp(text)

# Display the extracted named entities
print("Named Entities Found:\n")
for entity in ner_results:
    print(f"{entity['word']}: {entity['entity_group']}")

In [None]:
!pip install transformers torch -q

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load model and tokenizer
model_name = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# NER pipeline with basic config
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=False)

# Input clinical text
text = """The patient was diagnosed with diabetes and prescribed metformin.
They reported frequent urination and fatigue, and later developed hypertension."""

# Run the pipeline
ner_results = ner_pipeline(text)

# Custom post-processing: merge word-piece fragments ("##...") and B-/I- runs
# into whole entities. Each entry of `entities` is (entity_text, entity_type).
entities = []
current_entity = ""
current_label = ""  # entity type of the span being built, BIO prefix removed
last_index = -1     # token index of the last token added to that span

for item in ner_results:
    word = item["word"]
    label = item["entity"]
    index = item["index"]

    # "B-Sign_symptom" -> prefix "B", type "Sign_symptom". Splitting on "_"
    # instead would cut the type name itself and keep the B-/I- prefix on
    # labels that contain no underscore.
    prefix, sep, entity_type = label.partition("-")
    if not sep:
        prefix, entity_type = "", label

    is_subword = word.startswith("##")
    piece = word[2:] if is_subword else word

    # Only a token that directly follows the open span may extend it; this keeps
    # two separate mentions with the same tag from being glued together.
    follows_span = bool(current_entity) and index == last_index + 1

    if is_subword and follows_span:
        # Remaining fragment of the word that is already in the span
        current_entity += piece
    elif follows_span and prefix != "B" and entity_type == current_label:
        # Next word of the same entity (I- tag or an unprefixed repeated label)
        current_entity += " " + piece
    else:
        # Anything else closes the open span and starts a new one
        if current_entity:
            entities.append((current_entity.strip(), current_label))
        current_entity = piece
        current_label = entity_type

    last_index = index

# Append last entity
if current_entity:
    entities.append((current_entity.strip(), current_label))

# Display results
print("Named Entities Found:\n")
for word, label in entities:
    print(f"{word}: {label}")

In [None]:
!pip install transformers torch -q

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load model and tokenizer
model_name = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=False)

# Input clinical text
text = """The patient was diagnosed with diabetes and prescribed metformin.
They reported frequent urination and fatigue, and later developed hypertension."""

# Run NER pipeline
ner_results = ner_pipeline(text)

# Improved post-processing to handle B- and I- tags and merge subwords.
# Each entry of `entities` is (entity_text, entity_type).
entities = []
current_entity = ""
current_label = ""  # entity type of the open span (BIO prefix already removed)
last_index = -1     # token index of the most recent token placed in the span

for item in ner_results:
    word = item["word"]
    label = item["entity"]
    index = item["index"]

    # Split on the first hyphen only: "I-Sign_symptom" -> ("I", "Sign_symptom")
    prefix, sep, label_type = label.partition("-")
    if not sep:
        prefix, label_type = "", label

    is_subword = word.startswith("##")
    piece = word[2:] if is_subword else word

    # A token can extend the open span only if it is the very next token
    contiguous = bool(current_entity) and index == last_index + 1

    # Merge subword tokens. last_index must advance here as well, otherwise the
    # word after a multi-piece word would never count as contiguous.
    if is_subword and contiguous:
        current_entity += piece
        last_index = index
        continue

    # Start of a new entity. Types are compared without the B-/I- prefix so an
    # I- token can continue the span opened by the matching B- token.
    if prefix == "B" or label_type != current_label or not contiguous:
        if current_entity:
            entities.append((current_entity.strip(), current_label))
        current_entity = piece
        current_label = label_type
    else:  # continuation of an entity (I-tag or repeated label)
        current_entity += " " + piece

    last_index = index

# Append the last entity if exists
if current_entity:
    entities.append((current_entity.strip(), current_label))

# Display results
print("Named Entities Found:\n")
for word, label in entities:
    print(f"{word}: {label}")

In [None]:
!pip install transformers torch -q

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load model and tokenizer
model_name = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=False)

# Input clinical text
text = """Diarrhea, also spelled diarrhoea, is the condition of having at least three loose, liquid, or watery bowel movements each day. It often lasts for a few days and can result in dehydration due to fluid loss. Signs of dehydration often begin with loss of the normal stretchiness of the skin and irritable behaviour. This can progress to decreased urination, loss of skin color, a fast heart rate, and a decrease in responsiveness as it becomes more severe. Loose but non-watery stools in babies who are exclusively breastfed, however, are normal."""

# Run NER pipeline
ner_results = ner_pipeline(text)

# Improved post-processing using character positions.
# Token-level predictions are grouped into spans; every finished span is stored
# in `entities` as (entity_text, entity_type, start_offset, end_offset).
entities = []
current_entity = ""
current_label = ""
current_start = None
current_end = None

for item in ner_results:
    word = item["word"]
    label = item["entity"]
    start = item["start"]
    end = item["end"]

    # Split entity label on the first hyphen only, so a hyphen inside the type
    # name cannot break the two-value unpacking.
    if "-" in label:
        prefix, entity_type = label.split("-", 1)
    else:
        prefix, entity_type = "", label

    # Non-entity tokens close whatever span is currently open
    if entity_type == "O":
        if current_entity:
            entities.append((current_entity, current_label, current_start, current_end))
            current_entity = ""
            current_label = ""
        continue

    is_subword = word.startswith("##")
    piece = word[2:] if is_subword else word

    # Decide whether this token extends the open span
    continues_span = False
    if current_entity and entity_type == current_label:
        if start == current_end:
            # Token abuts the previous one (word piece, attached punctuation, ...)
            continues_span = True
        elif start > current_end and prefix != "B":
            # Next word of the same entity: nothing but whitespace may separate
            # the two tokens, and a B- tag always opens a new entity.
            continues_span = not text[current_end:start].strip()

    if continues_span:
        current_entity += piece if is_subword else " " + piece
        current_end = end
        continue

    # Otherwise finalize the open span (if any) and start a new one here
    if current_entity:
        entities.append((current_entity, current_label, current_start, current_end))
    current_entity = piece
    current_label = entity_type
    current_start = start
    current_end = end

# Add the last entity
if current_entity:
    entities.append((current_entity, current_label, current_start, current_end))

# Reconcile each rebuilt entity string with the slice of `text` it spans.
# The rebuilt string is kept when it equals the slice ignoring case; otherwise
# the slice taken from the source text wins. Offsets are dropped afterwards, so
# `final_entities` holds (entity_text, entity_type) pairs.
final_entities = []
for rebuilt, entity_kind, span_start, span_end in entities:
    source_slice = text[span_start:span_end]
    matches_source = rebuilt.lower() == source_slice.lower()
    final_entities.append((rebuilt if matches_source else source_slice, entity_kind))

# Merge a Detailed_description with the Sign_symptom that directly follows it
# in the text (e.g. a qualifier and the symptom it describes).
# `final_entities[k]` was built from `entities[k]`, so the character offsets of
# each pair are read from `entities` to verify that the two spans are really
# neighbours; being consecutive in the list alone is not enough, because the
# two mentions may be far apart in the text.
merged_entities = []
i = 0
while i < len(final_entities):
    if i < len(final_entities) - 1:
        current_ent, next_ent = final_entities[i], final_entities[i + 1]
        # Check for Detailed_description followed by Sign_symptom
        if current_ent[1] == "Detailed_description" and next_ent[1] == "Sign_symptom":
            gap_start = entities[i][3]     # end offset of the description
            gap_end = entities[i + 1][2]   # start offset of the symptom
            # Only whitespace (or nothing) may lie between the two spans
            if gap_start <= gap_end and not text[gap_start:gap_end].strip():
                merged_text = f"{current_ent[0]} {next_ent[0]}"
                merged_entities.append((merged_text, "Sign_symptom"))
                i += 2  # Skip next element
                continue
    merged_entities.append(final_entities[i])
    i += 1

# Keep the first occurrence of every entity text and drop "O" entries.
# Duplicates are detected on the entity text alone.
unique_entities = []
seen = set()
for ent in merged_entities:
    surface, tag = ent[0], ent[1]
    if tag == "O" or surface in seen:
        continue
    seen.add(surface)
    unique_entities.append(ent)

# Show the result, one "text: label" line per entity
print("Named Entities Found:\n")
for word, label in unique_entities:
    print(f"{word}: {label}")