# Summary Nu-Extract, actual waste of time not worth reporting for healthcare tasks

In [1]:

import os
import sys
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)
# Set the parent directory as the current directory
os.chdir(parent_dir)

In [2]:
from utils.data import read_json_file, print_json_structure
hpo_data = read_json_file('data/dataset/mine_hpo.json')
print(len(hpo_data))
# print_json_structure(hpo_data)
truth = hpo_data["53"]["phenotypes"]
# sample 5 texts and their ground truth phenotypes for testing.
ids = ["53", "54", "55", "56", "57"] 
texts = []
ground_truths = []
for id in ids:
    text = hpo_data[id]["clinical_text"]
    texts.append(text)
    truth = hpo_data[id]["phenotypes"]
    ground_truth = []
    for item in truth:
        ground_truth.append(item["phenotype_name"])
    ground_truths.append(ground_truth)

    sanity_check_list = []
    for item in ground_truth:
        # print(item)
        if item in text:
            sanity_check_list.append(item)
    print("Pairwise checks:")
    print(len(sanity_check_list))
    print(len(truth))
# ground_truth = []
# for item in truth:
#     ground_truth.append(item["phenotype_name"])



116
Pairwise checks:
31
48
Pairwise checks:
16
25
Pairwise checks:
19
22
Pairwise checks:
15
22
Pairwise checks:
10
21


In [7]:
from typing import List, Dict, Optional, Union, Any
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from hporag.entity import BaseEntityExtractor
class NuExtractor(BaseEntityExtractor):
    """Entity extraction using the NuExtract model with structured schema output.
    
    NuExtract extracts comprehensive phenotype-related information from clinical text,
    aggregating findings by category (medical conditions, lab results, etc.).
    """
    
    def __init__(self, 
                 model_name: str = "numind/NuExtract", 
                 device: str = "cuda:0",
                 max_length: int = 4000,
                 schema: Optional[str] = None,
                 examples: List[str] = None,
                 cache_dir: Optional[str] = None):
        """
        Initialize the NuExtract extractor.
        
        Args:
            model_name: Name of the NuExtract model to use
            device: Device to use for inference (e.g., "cuda:0", "cpu")
            max_length: Maximum token length for input
            schema: Custom schema for extraction (if None, uses default phenotype schema)
            examples: List of example extractions in JSON string format
            cache_dir: Directory to cache the model
        """
        self.model_name = model_name
        self.device = device
        self.max_length = max_length
        self.examples = examples or ["", "", ""]
        self.cache_dir = cache_dir
        
        # Default schema for phenotype extraction
        self.schema = schema or self._get_default_schema()
        
        # Initialize model
        self._initialize_model()
    
    def _initialize_model(self):
        """Initialize the NuExtract model and tokenizer."""
        print(f"Initializing NuExtract model from {self.model_name}...")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name, 
                trust_remote_code=False,
                cache_dir=self.cache_dir
            )
            
            # Load model with appropriate precision
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name, 
                torch_dtype=torch.bfloat16, 
                trust_remote_code=False,
                cache_dir=self.cache_dir
            )
            
            # Move model to specified device
            self.model.to(self.device)
            self.model.eval()  # Set to evaluation mode
            print(f"NuExtract model loaded successfully on {self.device}")
            
        except Exception as e:
            print(f"Error initializing NuExtract model: {e}")
            raise
    
    def _get_default_schema(self) -> str:
        """Get the default schema for comprehensive phenotype extraction."""
        schema = {
            "findings": {
                "medical_conditions": [],    # Medical diagnoses, diseases, disorders
                "syndromes": [],             # Named syndromes and genetic disorders
                "lab_measurements": [],      # Laboratory test results and abnormal values
                "symptoms": [],              # Patient-reported symptoms
                "signs": [],                 # Clinician-observed signs
                "anatomical_abnormalities": [], # Structural or morphological abnormalities
                "developmental_features": [], # Developmental delays, growth issues
                "neurological_features": [], # Neurological findings and abnormalities
                "dysmorphic_features": [],   # Distinctive physical features
                "congenital_anomalies": [],  # Birth defects and anomalies
                "genetic_variants": [],      # Genetic variants, mutations, chromosomal abnormalities
                "family_history": [],        # Relevant family history findings
                "negated_findings": [],      # Explicitly negated clinical findings
                "behavioral_features": [],   # Behavioral and psychiatric features
                "metabolic_abnormalities": [], # Metabolic disorders and findings
                "functional_limitations": [], # Functional impairments and disabilities
                "diseases" : []

            }
        }
        return json.dumps(schema)

    def extract_entities(self, text: str) -> List[str]:
        """
        Extract entities from text using the NuExtract model.
        
        Args:
            text: Clinical text to extract entities from
            
        Returns:
            List of extracted entities as strings
        """
        if not text:
            return []
            
        # Get full structured extraction
        extraction = self._extract_structured_data(text)
        
        # Aggregate all relevant findings into a single list for the pipeline
        entities = []
        print(extraction)
        # Categories directly relevant to phenotypes
        phenotype_relevant_categories = [
            'medical_conditions', 'syndromes', 'lab_measurements', 'symptoms', 'signs',
            'anatomical_abnormalities', 'developmental_features',
            'neurological_features', 'dysmorphic_features',
            'congenital_anomalies', 'genetic_variants',
            'behavioral_features', 'metabolic_abnormalities',
            'functional_limitations', 'diseases'
        ]
        
        # Process findings if they exist
        if 'findings' in extraction:
            findings = extraction['findings']
            
            # Add entities from each relevant category
            for category in phenotype_relevant_categories:
                if category in findings and isinstance(findings[category], list):
                    for item in findings[category]:
                        if item and isinstance(item, str):
                            entities.append(item)
        
        # Filter out duplicates while preserving order
        unique_entities = []
        seen = set()
        for entity in entities:
            entity_lower = entity.lower().strip()
            if entity_lower and entity_lower not in seen:
                seen.add(entity_lower)
                unique_entities.append(entity)
        
        return unique_entities
    
    def _extract_structured_data(self, text: str) -> Dict[str, Any]:
        """
        Perform the actual extraction using NuExtract.
        
        Args:
            text: Clinical text to extract from
            
        Returns:
            Dictionary with structured extraction results
        """
        # Prepare the input
        schema_formatted = json.dumps(json.loads(self.schema), indent=4)
        input_text = "<|input|>\n### Template:\n" + schema_formatted + "\n"
        
        # Add examples if provided
        for example in self.examples:
            if example:
                input_text += "### Example:\n" + json.dumps(json.loads(example), indent=4) + "\n"
        
        # Add the actual text to process
        input_text += "### Text:\n" + text + "\n<|output|>\n"
        
        # Tokenize and generate
        try:
            input_ids = self.tokenizer(
                input_text, 
                return_tensors="pt", 
                truncation=True, 
                max_length=self.max_length
            ).to(self.device)
            
            # Generate output
            with torch.no_grad():
                output_ids = self.model.generate(
                    **input_ids,
                    max_new_tokens=1024,  # Ensure enough tokens for complex extractions
                    temperature=0.1       # Lower temperature for more deterministic output
                )
                
            # Decode output
            output = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
            
            # Extract the structured output
            extraction_text = output.split("<|output|>")[1].split("<|end-output|>")[0].strip()
            
            # Parse the extraction as JSON
            try:
                extraction = json.loads(extraction_text)
                return extraction
            except json.JSONDecodeError as e:
                print(f"Error parsing NuExtract output as JSON: {e}")
                print(f"Raw output: {extraction_text}")
                return {}
                
        except Exception as e:
            print(f"Error during NuExtract inference: {e}")
            return {}
    
    def process_batch(self, texts: List[str]) -> List[List[str]]:
        """
        Process a batch of texts for entity extraction.
        
        Args:
            texts: List of clinical texts to process
            
        Returns:
            List of lists containing extracted entities for each text
        """
        results = []
        for text in texts:
            entities = self.extract_entities(text)
            results.append(entities)
        return results
    
    def extract_detailed_entities(self, text: str) -> Dict[str, List[str]]:
        """
        Extract entities and categorize them by type.
        
        Args:
            text: Clinical text to extract entities from
            
        Returns:
            Dictionary with categorized entities (medical_conditions, lab_measurements, etc.)
        """
        extraction = self._extract_structured_data(text)
        
        result = {
            'medical_conditions': [],
            'lab_measurements': [],
            'symptoms': [],
            'anatomical_sites': [],
            'genetic_variants': [],
            'medications': [],
            'diagnostic_procedures': [],
            'diseases': []
        }
        
        # Process findings if they exist
        if 'findings' in extraction:
            findings = extraction['findings']
            
            # Copy found categories to result
            for category in result.keys():
                if category in findings:
                    result[category] = findings[category]
        
        return result

In [8]:
extractor = NuExtractor()
extractor.extract_entities(text)

Initializing NuExtract model from numind/NuExtract...


Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  3.70it/s]


NuExtract model loaded successfully on cuda:0




{'findings': {'medical_conditions': ['myeloproliferative disorders', 'renal dysfunction'], 'syndromes': [], 'lab_measurements': ['urine microscopy', '24-hour urinary protein', 'serum creatinine', 'haemoglobin', 'white cell count', 'platelet count', 'serum lactate dehydrogenase (LDH)'], 'symptoms': ['frothy urine', 'leg oedema'], 'signs': ['bilateral pedal oedema', 'spleen palpable'], 'anatomical_abnormalities': ['mild renal parenchymal disease'], 'developmental_features': [], 'neurological_features': [], 'dysmorphic_features': [], 'congenital_anomalies': [], 'genetic_variants': [], 'family_history': ['no family member was suffering from similar illness'], 'negated_findings': ['viral markers (HbsAg, antihepatitis C virus, HIV) were negative', 'serum electrophoresis study and antinuclear antibodies were negative'], 'behavioral_features': [], 'metabolic_abnormalities': [], 'functional_limitations': [], 'diseases': ['myeloproliferative disorders', 'renal dysfunction']}}


['myeloproliferative disorders',
 'renal dysfunction',
 'urine microscopy',
 '24-hour urinary protein',
 'serum creatinine',
 'haemoglobin',
 'white cell count',
 'platelet count',
 'serum lactate dehydrogenase (LDH)',
 'frothy urine',
 'leg oedema',
 'bilateral pedal oedema',
 'spleen palpable',
 'mild renal parenchymal disease']

In [9]:
print(text)

A 56-year-old man presented to us with a history of frothy urine and leg oedema for the last 6 months. There was no history of fever, cough, shortness of breath and decrease urine output. There is no significant past medical history. No family member was suffering from similar illness. On examination, bilateral pedal oedema was present, the spleen was palpable 2 cm below left costal margin, rest examination was unremarkable. On further biochemical and haematological investigations, urine microscopy revealed significant proteinuria, a 24-hour urinary protein was 3.1 g and serum creatinine was 2.1 mg/dL. Complete blood count revealed haemoglobin (Hb)—13.0 g/dL, white cell count—28×109/L and platelet count of 842×109/L (table 1). The peripheral blood smear was suggestive of the leucoerythroblastic picture with few tear drop cells. Serum lactate dehydrogenase (LDH) was raised (1142 IU/L). To confirm the diagnosis of myeloproliferative disorders, bone marrow biopsy was done which was sugges