# Conversion of weirdly formatted excel to csv.

In [9]:
import pandas as pd
import numpy as np
import json

def load_and_display_raw_table(file_path="labels.csv"):
    """
    Loads CSV file and displays first 3 columns as a raw table.
    """
    df = pd.read_csv(file_path, header=None, usecols=[0, 1, 2])
    print(f"Raw table shape: {df.shape}")
    print("\nFirst few rows of raw table:")
    print(df.head())
    return df

def find_phenotype_header_rows(df):
    """
    Finds all rows containing 'Phenotype name'.
    Returns a list of row indices where new patient sections start.
    """
    header_indices = []
    for idx, row in df.iterrows():
        if row.astype(str).str.contains('Phenotype name').any():
            header_indices.append(idx)
    
    print(f"\nFound {len(header_indices)} patient sections")
    for i, idx in enumerate(header_indices):
        print(f"Patient {i+1} section starts at row {idx}")
    
    return header_indices

def process_table(df, header_indices):
    """
    Processes the table using header indices as section delimiters.
    Returns a structured dictionary of patient data.
    """
    result = {}
    
    # Process each patient section
    for patient_num, start_idx in enumerate(header_indices, 1):
        # Determine the end of this section
        if patient_num < len(header_indices):
            end_idx = header_indices[patient_num]
            section_df = df.iloc[start_idx:end_idx].copy()
        else:
            section_df = df.iloc[start_idx:].copy()
        
        # Set column names for this section
        section_df.columns = ['Col1', 'Phenotype name', 'HPO ID']
        
        # Initialize patient entry
        patient_id = str(patient_num)
        result[patient_id] = []
        
        # Skip the header row and process remaining rows
        for _, row in section_df.iloc[1:].iterrows():
            phenotype = str(row['Phenotype name']).strip()
            hpo_id = str(row['HPO ID']).strip()
            
            if pd.notna(phenotype) and pd.notna(hpo_id) and phenotype and hpo_id.startswith('HP:'):
                entry = {
                    'phenotype_name': phenotype,
                    'hpo_id': hpo_id
                }
                # Only add if not duplicate
                if entry not in result[patient_id]:
                    result[patient_id].append(entry)
    
    return result

def main():
    try:
        # Step 1: Load and display raw table
        print("Loading raw table...")
        df = load_and_display_raw_table()
        
        # Step 2: Find all header rows
        print("\nLocating patient sections...")
        header_indices = find_phenotype_header_rows(df)
        if not header_indices:
            raise ValueError("Could not find any 'Phenotype name' headers in the table")
        
        # Step 3: Process the table
        print("\nProcessing table...")
        result = process_table(df, header_indices)
        
        # Step 4: Save the result
        with open('labels.json', 'w') as f:
            json.dump(result, f, indent=2)
        
        # Print summary
        total_patients = len(result)
        total_phenotypes = sum(len(phenotypes) for phenotypes in result.values())
        print(f"\nProcessed {total_patients} patients with {total_phenotypes} total phenotypes")
        print("Results saved to labels.json")
        
        # Print sample of first patient
        if result:
            first_patient = next(iter(result))
            print(f"\nSample data for patient {first_patient}:")
            print(json.dumps({first_patient: result[first_patient][:2]}, indent=2))
            
    except Exception as e:
        print(f"Error: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Loading raw table...
Raw table shape: (3110, 3)

First few rows of raw table:
     0                             1           2
0    1   Manually Assigned HPO Terms         NaN
1  NaN                Phenotype name      HPO ID
2   11  Multicystic kidney dysplasia  HP:0000003
3  NaN      Decreased fetal movement  HP:0001558
4  NaN          Subglottic stenosis   HP:0001607

Locating patient sections...

Found 116 patient sections
Patient 1 section starts at row 1
Patient 2 section starts at row 22
Patient 3 section starts at row 50
Patient 4 section starts at row 64
Patient 5 section starts at row 84
Patient 6 section starts at row 113
Patient 7 section starts at row 142
Patient 8 section starts at row 169
Patient 9 section starts at row 204
Patient 10 section starts at row 223
Patient 11 section starts at row 244
Patient 12 section starts at row 263
Patient 13 section starts at row 290
Patient 14 section starts at row 324
Patient 15 section starts at row 344
Patient 16 section starts at r

In [20]:
import pandas as pd
import numpy as np
from typing import Dict, Any, Tuple
import matplotlib.pyplot as plt

def analyze_word_counts(filepath: str) -> Tuple[float, Dict[str, Any]]:
    """
    Analyze word counts in clinical notes from a CSV file.
    
    Args:
        filepath: Path to the input CSV file
        
    Returns:
        Tuple containing:
        - float: Average word count
        - Dict: Additional statistics including:
            - min_words: Minimum word count
            - max_words: Maximum word count
            - median_words: Median word count
            - std_dev: Standard deviation of word counts
            - total_notes: Total number of clinical notes analyzed
            - word_counts: List of individual word counts
            
    Raises:
        ValueError: If required columns are missing
        FileNotFoundError: If file cannot be found or read
    """
    try:
        # Read and validate CSV using logic from main.py
        df = pd.read_csv(filepath)
        if 'clinical_note' not in df.columns:
            raise ValueError('Input CSV must have a "clinical_note" column')
        
        # Handle patient IDs (following main.py logic)
        if 'patient_id' not in df.columns and 'case number' not in df.columns:
            df['patient_id'] = range(1, len(df) + 1)
        elif 'case number' in df.columns:
            df['patient_id'] = df['case number']
            df = df.drop('case number', axis=1)
            
        # Clean notes and calculate word counts
        df['clinical_note'] = df['clinical_note'].astype(str)
        df = df.dropna(subset=['clinical_note'])
        
        # Calculate word counts for each note
        word_counts = df['clinical_note'].apply(lambda x: len(str(x).split()))
        
        # Calculate statistics
        stats = {
            'min_words': int(word_counts.min()),
            'max_words': int(word_counts.max()),
            'median_words': int(word_counts.median()),
            'std_dev': float(word_counts.std()),
            'total_notes': len(df),
            'word_counts': word_counts.tolist()
        }
        
        average_words = float(word_counts.mean())
        
        # Print summary
        print(f"\nWord Count Analysis Results:")
        print(f"- Total clinical notes analyzed: {stats['total_notes']}")
        print(f"- Average words per note: {average_words:.2f}")
        print(f"- Median words per note: {stats['median_words']}")
        print(f"- Word count range: {stats['min_words']} to {stats['max_words']}")
        print(f"- Standard deviation: {stats['std_dev']:.2f}")
        
        return average_words, stats
        
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        raise
    except Exception as e:
        print(f"Error analyzing word counts: {e}")
        raise

def plot_word_count_distribution(stats: Dict[str, Any], output_path: str = None) -> None:
    """
    Create a histogram of word count distribution.
    
    Args:
        stats: Statistics dictionary from analyze_word_counts
        output_path: Optional path to save the plot
    """
    plt.figure(figsize=(10, 6))
    plt.hist(stats['word_counts'], bins=30, edgecolor='black')
    plt.title('Distribution of Word Counts in Clinical Notes')
    plt.xlabel('Word Count')
    plt.ylabel('Number of Notes')
    
    # Add vertical lines for key statistics
    plt.axvline(np.mean(stats['word_counts']), color='red', linestyle='dashed', label='Mean')
    plt.axvline(stats['median_words'], color='green', linestyle='dashed', label='Median')
    
    plt.legend()
    
    if output_path:
        plt.savefig(output_path)
    plt.close()

# Example usage:

try:
    avg_words, statistics = analyze_word_counts('data/dataset/Test_Cases.csv')
    plot_word_count_distribution(statistics, 'word_count_distribution.png')
except Exception as e:
    print(f"Analysis failed: {e}")



Word Count Analysis Results:
- Total clinical notes analyzed: 116
- Average words per note: 278.10
- Median words per note: 253
- Word count range: 50 to 675
- Standard deviation: 118.43


In [21]:
import json
import numpy as np
from typing import Dict, Any, Tuple
import matplotlib.pyplot as plt
from pathlib import Path

def analyze_json_word_counts(filepath: str) -> Tuple[float, Dict[str, Any]]:
    """
    Analyze word counts in clinical notes from a JSON file with nested structure.
    
    Args:
        filepath: Path to the JSON file containing clinical notes
        
    Returns:
        Tuple containing:
        - float: Average word count
        - Dict: Additional statistics including:
            - min_words: Minimum word count
            - max_words: Maximum word count
            - median_words: Median word count
            - std_dev: Standard deviation of word counts
            - total_notes: Total number of clinical notes analyzed
            - word_counts: List of individual word counts
            - by_category: Word count statistics grouped by note category
            
    Raises:
        FileNotFoundError: If file cannot be found or read
        json.JSONDecodeError: If JSON is invalid
    """
    try:
        # Read JSON file
        with open(filepath, 'r') as file:
            data = json.load(file)
        
        # Extract text and calculate word counts
        word_counts = []
        categories = {}  # To track statistics by category
        
        for note_id, note_data in data.items():
            if isinstance(note_data, dict) and 'note_details' in note_data:
                text = note_data['note_details'].get('text', '')
                category = note_data['note_details'].get('category', 'Unknown')
                
                if text:  # Only process if text exists
                    word_count = len(str(text).split())
                    word_counts.append(word_count)
                    
                    # Track category statistics
                    if category not in categories:
                        categories[category] = []
                    categories[category].append(word_count)
        
        if not word_counts:
            raise ValueError("No valid text fields found in the JSON file")
            
        # Calculate overall statistics
        word_counts = np.array(word_counts)
        average_words = float(np.mean(word_counts))
        
        stats = {
            'min_words': int(np.min(word_counts)),
            'max_words': int(np.max(word_counts)),
            'median_words': int(np.median(word_counts)),
            'std_dev': float(np.std(word_counts)),
            'total_notes': len(word_counts),
            'word_counts': word_counts.tolist(),
            'by_category': {}
        }
        
        # Calculate statistics by category
        for category, counts in categories.items():
            counts_array = np.array(counts)
            stats['by_category'][category] = {
                'count': len(counts),
                'average': float(np.mean(counts_array)),
                'median': int(np.median(counts_array)),
                'min': int(np.min(counts_array)),
                'max': int(np.max(counts_array)),
                'std_dev': float(np.std(counts_array))
            }
        
        # Print summary
        print(f"\nWord Count Analysis Results:")
        print(f"- Total clinical notes analyzed: {stats['total_notes']}")
        print(f"- Average words per note: {average_words:.2f}")
        print(f"- Median words per note: {stats['median_words']}")
        print(f"- Word count range: {stats['min_words']} to {stats['max_words']}")
        print(f"- Standard deviation: {stats['std_dev']:.2f}")
        
        print("\nBy Category:")
        for category, cat_stats in stats['by_category'].items():
            print(f"\n{category}:")
            print(f"  - Notes: {cat_stats['count']}")
            print(f"  - Average words: {cat_stats['average']:.2f}")
            print(f"  - Median words: {cat_stats['median']}")
            
        return average_words, stats
        
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        raise
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON format - {e}")
        raise
    except Exception as e:
        print(f"Error analyzing word counts: {e}")
        raise

def plot_word_count_distribution(stats: Dict[str, Any], output_dir: str = None) -> None:
    """
    Create visualizations of word count distributions.
    
    Args:
        stats: Statistics dictionary from analyze_json_word_counts
        output_dir: Optional directory to save the plots
    """
    if output_dir:
        Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    # Overall distribution
    plt.figure(figsize=(12, 6))
    plt.hist(stats['word_counts'], bins=30, edgecolor='black')
    plt.title('Overall Distribution of Word Counts in Clinical Notes')
    plt.xlabel('Word Count')
    plt.ylabel('Number of Notes')
    
    # Add vertical lines for key statistics
    plt.axvline(np.mean(stats['word_counts']), color='red', linestyle='dashed', label='Mean')
    plt.axvline(stats['median_words'], color='green', linestyle='dashed', label='Median')
    plt.legend()
    
    if output_dir:
        plt.savefig(Path(output_dir) / 'overall_distribution.png')
    plt.close()
    
    # Distribution by category
    if stats['by_category']:
        plt.figure(figsize=(12, 6))
        categories = list(stats['by_category'].keys())
        averages = [cat_stats['average'] for cat_stats in stats['by_category'].values()]
        
        plt.bar(categories, averages)
        plt.title('Average Word Count by Note Category')
        plt.xlabel('Category')
        plt.ylabel('Average Word Count')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        
        if output_dir:
            plt.savefig(Path(output_dir) / 'category_comparison.png')
        plt.close()

# Example usage:

try:
    avg_words, statistics = analyze_json_word_counts('/home/johnwu3/projects/rare_disease/workspace/repos/RareDiseaseMention/filtered_rd_annos.json')
    plot_word_count_distribution(statistics, 'word_count_plots')
except Exception as e:
    print(f"Analysis failed: {e}")



Word Count Analysis Results:
- Total clinical notes analyzed: 117
- Average words per note: 1897.26
- Median words per note: 1722
- Word count range: 541 to 6726
- Standard deviation: 932.66

By Category:

Discharge summary:
  - Notes: 117
  - Average words: 1897.26
  - Median words: 1722


In [19]:
import pandas as pd
import numpy as np
import json

def load_test_cases(file_path="Test_Cases.csv"):
    """Loads all test cases."""
    return pd.read_csv(file_path, encoding='utf-8')

def load_labels(file_path="labels.json"):
    """Loads labels from JSON file."""
    with open(file_path) as f:
        labels = json.load(f)
    return labels  # The JSON is already in the format we want

def combine_data(labels_data, test_cases_df):
    combined_data = {}
    n_cases = len(test_cases_df)
    
    # First 96 cases (indices 21-116)
    for idx in range(96):
        label_id = str(idx + 21)  # This is the ID used to lookup in labels_data
        if idx < n_cases - 20:
            case_id = str(test_cases_df.iloc[idx]['Case'])  # Get original Case ID from CSV
            clinical_text = test_cases_df.iloc[idx]['clinical_note']
            phenotypes = labels_data.get(label_id, [])  # Still use label_id (21-116) for phenotype lookup
            combined_data[case_id] = {  # Store using original Case ID
                'clinical_text': clinical_text, 
                'phenotypes': phenotypes
            }
            
    # Last 20 cases (indices 1-20)
    for label_idx in range(1, 21):
        label_id = str(label_idx)  # This is the ID used to lookup in labels_data
        case_idx = n_cases - (21 - label_idx)
        
        if case_idx >= 0 and case_idx < n_cases:
            case_id = str(test_cases_df.iloc[case_idx]['Case'])  # Get original Case ID from CSV
            clinical_text = test_cases_df.iloc[case_idx]['clinical_note']
            phenotypes = labels_data.get(label_id, [])  # Use label_id (1-20) for phenotype lookup
            combined_data[case_id] = {  # Store using original Case ID
                'clinical_text': clinical_text,
                'phenotypes': phenotypes
            }
    
    sorted_data = dict(sorted(combined_data.items(), 
                            key=lambda x: int(x[0])))
    
    return sorted_data

def main():
    try:
        # Load labels data
        print("Loading labels data...")
        labels_data = load_labels()
        print(f"Loaded {len(labels_data)} label sections")
        
        # Load test cases
        print("\nLoading test cases...")
        test_cases_df = load_test_cases()
        print(f"Loaded {len(test_cases_df)} test cases")
        
        # Combine the data
        print("\nCombining datasets...")
        combined_data = combine_data(labels_data, test_cases_df)
        
        # Save combined result
        with open('data.json', 'w') as f:
            json.dump(combined_data, f, indent=2)
        
        print(f"\nProcessed {len(combined_data)} total cases")
        print("Results saved to data.json")
        
        # Print sample
        first_id = "1"
        print(f"\nSample data for case {first_id}:")
        print(json.dumps({first_id: combined_data[first_id]}, indent=2))
        
        # Verify all cases have both clinical_text and phenotypes
        missing_data = []
        for patient_id, data in combined_data.items():
            if 'clinical_text' not in data or 'phenotypes' not in data:
                missing_data.append(patient_id)
        
        if missing_data:
            print("\nWarning: The following cases are missing data:", missing_data)
        else:
            print("\nVerification: All cases have both clinical_text and phenotypes")
        
    except Exception as e:
        print(f"Error: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Loading labels data...
Loaded 116 label sections

Loading test cases...
Loaded 116 test cases

Combining datasets...

Processed 116 total cases
Results saved to data.json

Sample data for case 1:
{
  "1": {
    "clinical_text": "A 44-year- old super-morbidly- obese man body mass index (BMI 63) underwent sleeve gastrectomy for weight loss and was found to have multiple adenomatous fundic gland polyps on final pathology. Subsequent workup included esophagogastroduodenoscopy which revealed innumerable polyps of the remaining gastric fundus and body consistent with fundic gland polyps, normal duodenum without polyps, and Barrett\u2019s oesophagus. Colonoscopy was significant for innumerable polyps of varying sizes up to 1.5 cm throughout the colon, with relative rectal sparing. Biopsies were consistent with tubular adenoma and hyperplastic polyps. Thyroid ultrasound was within normal limits and abdominal CT was significant for left-sided 3.4 cm mesenteric mass representing scarring versus 

In [1]:
import pandas as pd

def reorder_results(input_file="output_results.csv", output_file="reordered_results.csv"):
    # Read the CSV
    df = pd.read_csv(input_file)
    
    # Convert patient_id to integer for proper sorting
    df['patient_id'] = df['patient_id'].astype(int)
    
    # Create a mapping for the reordering
    # Last 20 patients should be first (1-20), then the rest (21-116)
    def map_patient_id(pid):
        if pid > 96:  # Last 20 patients should be first
            return pid - 96
        else:  # First 96 patients should be shifted up
            return pid + 20
            
    # Apply the mapping
    df['sort_order'] = df['patient_id'].apply(map_patient_id)
    
    # Sort by the new order
    df_sorted = df.sort_values('sort_order')
    
    # Drop the temporary sort column
    df_sorted = df_sorted.drop('sort_order', axis=1)
    
    # Save the reordered results
    df_sorted.to_csv(output_file, index=False)
    
    print(f"Reordered {len(df)} rows from {df['patient_id'].nunique()} patients")
    print(f"Results saved to {output_file}")

# Run the reordering
reorder_results(output_file="reordered_results_final.csv")

Reordered 115 rows from 115 patients
Results saved to reordered_results_final.csv


In [4]:
import json
import pandas as pd
import ast
from typing import List, Dict, Set, Tuple
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch
from tqdm import tqdm

class SemanticHPOEvaluator:
    def __init__(self, 
                 similarity_threshold: float = 0.85,
                 model_name: str = 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb',
                 device: str = None):
        """
        Initialize evaluator with semantic similarity model and threshold
        
        Args:
            similarity_threshold: Threshold for considering terms semantically similar
            model_name: Name of the sentence transformer model to use
            device: Device to run the model on ('cuda', 'cuda:0', 'cuda:1', etc., or 'cpu').
                   If None, will use CUDA if available, else CPU.
        """
        self.similarity_threshold = similarity_threshold
        
        # Determine device
        if device is None:
            import torch
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        
        # Initialize model on specified device
        self.model = SentenceTransformer(model_name, device=device)
        self.device = device
        
        print(f"Using device: {device}")
        if device.startswith('cuda'):
            import torch
            print(f"GPU: {torch.cuda.get_device_name(self.model.device.index)}")
        
    def compute_term_similarity(self, term1: str, term2: str) -> float:
        """
        Compute semantic similarity between two terms using multiple methods
        
        Args:
            term1: First term
            term2: Second term
            
        Returns:
            Combined similarity score
        """
        # String similarity using fuzzywuzzy
        fuzzy_score = fuzz.ratio(term1.lower(), term2.lower()) / 100
        
        # Semantic similarity using sentence transformers
        with torch.no_grad():  # Disable gradient computation for inference
            embeddings = self.model.encode(
                [term1, term2],
                convert_to_tensor=True,
                device=self.device
            )
            # Move to CPU for numpy conversion if needed
            if self.device != 'cpu':
                embeddings = embeddings.cpu()
            embeddings = embeddings.numpy()
            semantic_score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
        
        # Combine scores (weighted average)
        combined_score = (0.3 * fuzzy_score) + (0.7 * semantic_score)
        
        return combined_score
    
    def find_best_matching_term(self, predicted_term: str, 
                              ground_truth_terms: Set[str]) -> Tuple[str, float]:
        """
        Find the best matching ground truth term for a prediction
        
        Args:
            predicted_term: The predicted term
            ground_truth_terms: Set of ground truth terms
            
        Returns:
            Tuple of (best matching term, similarity score)
        """
        best_match = None
        best_score = 0
        
        for gt_term in ground_truth_terms:
            similarity = self.compute_term_similarity(predicted_term, gt_term)
            if similarity > best_score:
                best_score = similarity
                best_match = gt_term
                
        return best_match, best_score
    
    def compute_metrics(self, output_results_df: pd.DataFrame, 
                       ground_truth: Dict[str, Dict], 
                       verbose: bool = False) -> Dict:
        """
        Compute metrics using semantic similarity
        """
        all_true_positives = 0
        all_false_positives = 0
        all_false_negatives = 0
        all_partial_matches = []  # Track partial matches for analysis
        patient_metrics = {}
        
        for _, row in output_results_df.iterrows():
            patient_id = str(row['patient_id'])
            
            if patient_id not in ground_truth:
                if verbose:
                    print(f"Warning: No ground truth found for patient {patient_id}")
                continue
                
            true_phenotypes = {item['phenotype_name'].lower() 
                             for item in ground_truth[patient_id]['phenotypes']}
            
            if not true_phenotypes:
                continue
                
            # Parse predictions
            try:
                hpo_terms = json.loads(row['HPO_Terms'])
            except:
                try:
                    hpo_terms = ast.literal_eval(row['HPO_Terms'])
                except:
                    continue
                    
            predictions = {item['HPO_Term'].lower() for item in hpo_terms 
                         if 'HPO_Term' in item}
            
            # Track matches for this patient
            patient_matches = {
                'exact_matches': set(),
                'partial_matches': [],
                'unmatched_predictions': set(),
                'unmatched_ground_truth': set()
            }
            
            # Process each prediction
            for pred in predictions:
                best_match, score = self.find_best_matching_term(pred, true_phenotypes)
                
                if score >= self.similarity_threshold:
                    if score == 1.0:  # Exact match
                        all_true_positives += 1
                        patient_matches['exact_matches'].add(pred)
                    else:  # Partial match
                        all_true_positives += score
                        patient_matches['partial_matches'].append({
                            'predicted': pred,
                            'matched_to': best_match,
                            'similarity': score
                        })
                        all_partial_matches.append({
                            'patient_id': patient_id,
                            'predicted': pred,
                            'matched_to': best_match,
                            'similarity': score
                        })
                else:
                    all_false_positives += 1
                    patient_matches['unmatched_predictions'].add(pred)
            
            # Count unmatched ground truth terms
            matched_gt = {match['matched_to'] for match in patient_matches['partial_matches']}
            matched_gt.update(patient_matches['exact_matches'])
            patient_matches['unmatched_ground_truth'] = true_phenotypes - matched_gt
            all_false_negatives += len(patient_matches['unmatched_ground_truth'])
            
            patient_metrics[patient_id] = patient_matches
            
            if verbose:
                print(f"\nPatient {patient_id}:")
                print(f"Exact Matches: {patient_matches['exact_matches']}")
                print(f"Partial Matches: {patient_matches['partial_matches']}")
                print(f"Unmatched Predictions: {patient_matches['unmatched_predictions']}")
                print(f"Unmatched Ground Truth: {patient_matches['unmatched_ground_truth']}")
        
        # Calculate final metrics
        precision = all_true_positives / (all_true_positives + all_false_positives) \
            if (all_true_positives + all_false_positives) > 0 else 0
        recall = all_true_positives / (all_true_positives + all_false_negatives) \
            if (all_true_positives + all_false_negatives) > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) \
            if (precision + recall) > 0 else 0
        
        return {
            'precision': precision,
            'recall': recall,
            'f1_score': f1_score,
            'true_positives': all_true_positives,
            'false_positives': all_false_positives,
            'false_negatives': all_false_negatives,
            'patient_metrics': patient_metrics,
            'partial_matches': all_partial_matches
        }
    
    def evaluate_results(self, output_results_path: str, 
                        ground_truth_path: str,
                        verbose: bool = False) -> Dict:
        """
        Main evaluation function
        """
        try:
            df = pd.read_csv(output_results_path)
            with open(ground_truth_path, 'r') as f:
                ground_truth_data = json.load(f)
            
            metrics = self.compute_metrics(df, ground_truth_data, verbose)
            
            print("\nEvaluation Results:")
            print(f"Precision: {metrics['precision']:.4f}")
            print(f"Recall: {metrics['recall']:.4f}")
            print(f"F1 Score: {metrics['f1_score']:.4f}")
            
            print("\nPartial Match Analysis:")
            partial_matches_df = pd.DataFrame(metrics['partial_matches'])
            if not partial_matches_df.empty:
                print("\nTop 10 Partial Matches by Similarity:")
                print(partial_matches_df.sort_values('similarity', ascending=False).head(10))
            
            return metrics
            
        except Exception as e:
            print(f"Error during evaluation: {str(e)}")
            import traceback
            traceback.print_exc()
            return None

# Example usage:
if __name__ == "__main__":
    # Set device number here (e.g., 0, 1, 2, etc.)
    device_num = 0
    device = f'cuda:{device_num}' if torch.cuda.is_available() else 'cpu'
    
    evaluator = SemanticHPOEvaluator(
        similarity_threshold=0.85,
        device=device
    )
    
    metrics = evaluator.evaluate_results(
        'output_results.csv',
        'data.json',
        verbose=True
    )

Using device: cuda:0
GPU: NVIDIA RTX A6000

Patient 1:
Exact Matches: {'mesenteric mass'}
Partial Matches: [{'predicted': 'barrett esophagus', 'matched_to': "barret's esophagus", 'similarity': np.float32(0.9077887)}]
Unmatched Predictions: {'colorectal cancer', 'tubular adenoma', 'hyperplastic polyp', 'fundic gland polyps', 'fundic gland polyp', 'thyroid cancer', 'desmoid tumor', 'severe obesity'}
Unmatched Ground Truth: {'super-morbid obesity', 'obesity', 'multiple adenomatous fundic gland polyps', 'colonic polyp hyperplasia', 'innumerable polyps', 'colonic tubular adenoma'}

Patient 2:
Exact Matches: {'chronic gastritis', 'anemia', 'foveolar hyperplasia', 'small bowel polyps'}
Partial Matches: [{'predicted': 'telangiectasia', 'matched_to': 'telangiectasia', 'similarity': np.float32(1.0000002)}, {'predicted': 'stromal edema', 'matched_to': 'stromal oedema', 'similarity': np.float32(0.89529765)}, {'predicted': 'irregular microvascular pattern', 'matched_to': 'irregular microvasculature

# Evaluation Metrics

In [2]:
import json
import pandas as pd
from typing import List, Dict, Set, Tuple, Optional
from fuzzywuzzy import fuzz
from tqdm import tqdm
import numpy as np
from collections import defaultdict

class CombinedHPOEvaluator:
    def __init__(self, similarity_threshold: float = 50.0, verbose=False):
        """
        Initialize evaluator with configurable similarity threshold
        
        Args:
            similarity_threshold: Threshold for fuzzy string matching (0-100)
        """
        self.similarity_threshold = similarity_threshold
        self.verbose = verbose

    def analyze_csv_structure(self, file_path: str) -> None:
        """
        Analyze and print the structure of the results CSV file
        """
        try:
            df = pd.read_csv(file_path)
            if self.verbose:
                print("\nCSV File Structure Analysis:")
                print("-" * 50)
                print(f"Number of rows: {len(df)}")
                print(f"Columns: {', '.join(df.columns)}")
                
                print("\nSample row analysis:")
                print("-" * 50)
                sample_row = df.iloc[0]
                for col in df.columns:
                    print(f"\n{col}:")
                    print(f"Type: {type(sample_row[col]).__name__}")
                    print(f"Value: {sample_row[col]}")
                
            return df
            
        except Exception as e:
            print(f"Error analyzing CSV: {str(e)}")
            return None

    def compute_term_similarity(self, term1: str, term2: str) -> float:
        """
        Compute string similarity between two terms using fuzzy matching
        """
        if not term1 or not term2:
            return 0.0
            
        # Clean and normalize terms
        term1 = str(term1).lower().strip()
        term2 = str(term2).lower().strip()
        
        # Try different fuzzy matching methods
        ratio_score = fuzz.ratio(term1, term2)
        partial_score = fuzz.partial_ratio(term1, term2)
        token_sort_score = fuzz.token_sort_ratio(term1, term2)
        
        return max(ratio_score, partial_score, token_sort_score) / 100.0

    def process_predictions_df(self, df: pd.DataFrame) -> Dict[str, Dict]:
        """
        Process predictions DataFrame to extract HPO IDs, terms, and entity information
        """
        predictions = defaultdict(lambda: {'hpo_ids': set(), 'terms': [], 'entities': {}})
        
        total_rows = len(df)
        debug_rows = df['debug_output'].notna().sum() if 'debug_output' in df.columns else 0
        print(f"\nProcessing {total_rows} total rows ({debug_rows} debug rows will be skipped)")
        
        for _, row in df.iterrows():
            # Skip debug output rows
            if pd.notna(row.get('debug_output')):
                continue
            patient_id = str(row['patient_id'])
            
            # Extract HPO term
            hpo_term = row.get('hpo_term', '')
            entity = row.get('entity', '')  # Get the entity
            
            if pd.notna(hpo_term) and isinstance(hpo_term, str):
                hpo_term = hpo_term.strip().upper()
                if hpo_term.startswith('HP:'):
                    predictions[patient_id]['hpo_ids'].add(hpo_term)
                    # Store entity associated with this HPO ID
                    if pd.notna(entity):
                        predictions[patient_id]['entities'][hpo_term] = entity
            
            # Extract corresponding term info from top_candidates
            try:
                candidates = eval(str(row.get('top_candidates', '[]')))
                if isinstance(candidates, list) and candidates:
                    matching_candidate = next(
                        (c for c in candidates 
                        if c.get('metadata', {}).get('hp_id', '').upper() == hpo_term),
                        candidates[0]
                    )
                    term_info = matching_candidate.get('metadata', {}).get('info', '')
                    if term_info:
                        predictions[patient_id]['terms'].append(term_info)
            except (SyntaxError, ValueError) as e:
                print(f"Error processing candidates for patient {patient_id}: {e}")
                continue
                
        return dict(predictions)

    def extract_ground_truth(self, ground_truth_data: Dict) -> Dict[str, Dict]:
        """
        Extract both HPO IDs and phenotype terms from ground truth
        """
        processed_truth = {}
        
        for patient_id, data in ground_truth_data.items():
            phenotypes = data.get('phenotypes', [])
            processed_truth[patient_id] = {
                'hpo_ids': {item['hpo_id'] for item in phenotypes if 'hpo_id' in item},
                'terms': {item['phenotype_name'] for item in phenotypes if 'phenotype_name' in item}
            }
            
        return processed_truth

    def compute_metrics(self, predictions: Dict[str, Dict], 
                       ground_truth: Dict[str, Dict],
                       verbose: bool = False) -> Dict:
        """
        Compute both exact and fuzzy matching metrics
        """
        exact_metrics = {'tp': 0, 'fp': 0, 'fn': 0}
        fuzzy_metrics = {'tp': 0, 'fp': 0, 'fn': 0}
        patient_details = {}
        
        for patient_id, pred_data in predictions.items():
            if patient_id not in ground_truth:
                continue
                
            gt_data = ground_truth[patient_id]
            patient_metrics = {
                'exact_matches': set(),
                'fuzzy_matches': [],
                'unmatched_predictions': set(),
                'unmatched_ground_truth': set()
            }
            
            # Exact matching on HPO IDs
            exact_matches = pred_data['hpo_ids'].intersection(gt_data['hpo_ids'])
            exact_metrics['tp'] += len(exact_matches)
            exact_metrics['fp'] += len(pred_data['hpo_ids'] - gt_data['hpo_ids'])
            exact_metrics['fn'] += len(gt_data['hpo_ids'] - pred_data['hpo_ids'])
            
            patient_metrics['exact_matches'] = exact_matches
            
            # Fuzzy matching on terms
            matched_gt_terms = set()
            for pred_term in pred_data['terms']:
                best_match = None
                best_score = 0
                
                for gt_term in gt_data['terms']:
                    if gt_term in matched_gt_terms:
                        continue
                        
                    similarity = self.compute_term_similarity(pred_term, gt_term)
                    if similarity > best_score:
                        best_score = similarity
                        best_match = gt_term
                
                if best_score >= (self.similarity_threshold / 100):
                    fuzzy_metrics['tp'] += best_score
                    matched_gt_terms.add(best_match)
                    patient_metrics['fuzzy_matches'].append({
                        'predicted': pred_term,
                        'matched_to': best_match,
                        'similarity': best_score
                    })
                else:
                    fuzzy_metrics['fp'] += 1
                    patient_metrics['unmatched_predictions'].add(pred_term)
            
            # Count unmatched ground truth terms
            unmatched_gt = gt_data['terms'] - matched_gt_terms
            fuzzy_metrics['fn'] += len(unmatched_gt)
            patient_metrics['unmatched_ground_truth'] = unmatched_gt
            
            patient_details[patient_id] = patient_metrics
            
            if verbose:
                print(f"\nPatient {patient_id}:")
                print(f"Exact Matches: {len(patient_metrics['exact_matches'])}")
                print(f"Fuzzy Matches: {len(patient_metrics['fuzzy_matches'])}")
                print(f"Unmatched Predictions: {len(patient_metrics['unmatched_predictions'])}")
                print(f"Unmatched Ground Truth: {len(patient_metrics['unmatched_ground_truth'])}")
                print("Exact:", patient_metrics['exact_matches'])
                print("Fuzzy:", patient_metrics['fuzzy_matches'])
                print("Unmatched Ground Truth:", patient_metrics['unmatched_ground_truth'])
                print("Unmatched Predictions:", patient_metrics['unmatched_predictions'])
                print("-" * 50)
        # Calculate final metrics
        metrics = {}
        for method, m in [('exact', exact_metrics), ('fuzzy', fuzzy_metrics)]:
            precision = m['tp'] / (m['tp'] + m['fp']) if (m['tp'] + m['fp']) > 0 else 0
            recall = m['tp'] / (m['tp'] + m['fn']) if (m['tp'] + m['fn']) > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
            
            metrics[method] = {
                'precision': precision,
                'recall': recall,
                'f1_score': f1,
                'true_positives': m['tp'],
                'false_positives': m['fp'],
                'false_negatives': m['fn']
            }
        
        metrics['patient_details'] = patient_details
        return metrics

    def evaluate_results(self, output_results_path: str, 
                        ground_truth_path: str,
                        verbose: bool = False) -> Dict:
        """
        Main evaluation function that combines exact and fuzzy matching
        """
        try:
            # Analyze CSV structure
            print("\nAnalyzing results file structure...")
            df = self.analyze_csv_structure(output_results_path)
            if df is None:
                return None
            
            # Load ground truth
            print("\nLoading ground truth data...")
            with open(ground_truth_path, 'r') as f:
                ground_truth_data = json.load(f)
            
            # Process predictions and ground truth
            predictions = self.process_predictions_df(df)
            processed_truth = self.extract_ground_truth(ground_truth_data)
            
            # Compute metrics
            print("\nComputing metrics...")
            metrics = self.compute_metrics(predictions, processed_truth, verbose)
            if verbose:
                # Print results
                print("\nEvaluation Results:")
                print("-" * 50)
                for method in ['exact', 'fuzzy']:
                    print(f"\n{method.title()} Matching Metrics:")
                    print(f"Precision: {metrics[method]['precision']:.4f}")
                    print(f"Recall: {metrics[method]['recall']:.4f}")
                    print(f"F1 Score: {metrics[method]['f1_score']:.4f}")
                
            return metrics
            
        except Exception as e:
            print(f"Error during evaluation: {str(e)}")
            import traceback
            traceback.print_exc()
            return None

# Example usage:
if __name__ == "__main__":
    files = ["mistral_24b_optimized.csv", "mistral_24b_optimized_iter.csv","mistral_24b.csv", "mixtral_24b.csv", "llama70b_groq.csv","llama8b_updated.csv", "llama70b_updated.csv", "i2b2_llama70b_updated.csv", "llama70b_medcpt_updated.csv", "llama70b_updated.csv", "llama70b_medembed_updated.csv", "openbio70b_medembed_updated.csv"]
    directory = "data/results/hporag/"
    evaluator = CombinedHPOEvaluator(similarity_threshold=50.0)
    for file in files:
        file = directory + file
        print(file)
        metrics = evaluator.evaluate_results(
            file,
            'data/dataset/mine_hpo.json',
            verbose=False
        )
        print(file)
        print(metrics['exact'])
        print(metrics['fuzzy'])

data/results/hporag/mistral_24b_optimized.csv

Analyzing results file structure...

Loading ground truth data...

Processing 2093 total rows (0 debug rows will be skipped)

Computing metrics...
data/results/hporag/mistral_24b_optimized.csv
{'precision': 0.487062404870624, 'recall': 0.534521158129176, 'f1_score': 0.5096894080169897, 'true_positives': 960, 'false_positives': 1011, 'false_negatives': 836}
{'precision': 0.6855835036885605, 'recall': 0.8237838415122546, 'f1_score': 0.7483567391884347, 'true_positives': 1234.1599999999967, 'false_positives': 566, 'false_negatives': 264}
data/results/hporag/mistral_24b_optimized_iter.csv

Analyzing results file structure...

Loading ground truth data...

Processing 3403 total rows (0 debug rows will be skipped)

Computing metrics...
data/results/hporag/mistral_24b_optimized_iter.csv
{'precision': 0.34117268041237114, 'recall': 0.5946097697922516, 'f1_score': 0.43357215967246676, 'true_positives': 1059, 'false_positives': 2045, 'false_negative

In [20]:
import pandas as pd

df = pd.read_csv("data/results/llama70b_medembed.csv")
df.head()
df = df[df["patient_id"] == 53]

# Save the matches to a text file
df["matches"].to_csv('matches.txt', index=False, header=True)
df["hpo_term"].to_csv('matches_output.txt', index=False, header=True)

In [1]:
import json
import pandas as pd
from typing import List, Dict, Set, Tuple, Optional
from fuzzywuzzy import fuzz
from tqdm import tqdm
import numpy as np
from collections import defaultdict

class EnhancedHPOEvaluator(CombinedHPOEvaluator):
    def __init__(self, similarity_threshold: float = 50.0):
        super().__init__(similarity_threshold)
        
    def analyze_errors(self, predictions: Dict[str, Dict], 
                      ground_truth: Dict[str, Dict]) -> Dict:
        """
        Analyze errors for each patient and identify worst cases
        """
        error_analysis = {}
        
        for patient_id, pred_data in predictions.items():
            if patient_id not in ground_truth:
                continue
                
            gt_data = ground_truth[patient_id]
            
            # Analyze exact matches (HPO IDs)
            false_positives = pred_data['hpo_ids'] - gt_data['hpo_ids']
            false_negatives = gt_data['hpo_ids'] - pred_data['hpo_ids']
            true_positives = pred_data['hpo_ids'].intersection(gt_data['hpo_ids'])
            
            error_analysis[patient_id] = {
                'false_positives': len(false_positives),
                'false_negatives': len(false_negatives),
                'true_positives': len(true_positives),
                'fp_terms': false_positives,
                'fn_terms': false_negatives,
                'tp_terms': true_positives,
                'total_errors': len(false_positives) + len(false_negatives)
            }
            
        return error_analysis
            
    def print_worst_cases(self, error_analysis: Dict, 
                     predictions: Dict[str, Dict],
                     ground_truth: Dict[str, Dict],
                     top_n: int = 5):
        """
        Print detailed analysis of the worst performing cases, including entity information
        """
        sorted_patients = sorted(
            error_analysis.items(),
            key=lambda x: x[1]['total_errors'],
            reverse=True
        )
        
        print(f"\nTop {top_n} Worst Performing Cases:")
        print("=" * 80)
        
        for patient_id, stats in sorted_patients[:top_n]:
            print(f"\nPatient ID: {patient_id}")
            print("-" * 40)
            print(f"Total Errors: {stats['total_errors']}")
            print(f"False Positives: {stats['false_positives']}")
            print(f"False Negatives: {stats['false_negatives']}")
            print(f"True Positives: {stats['true_positives']}")
            
            print("\nFalse Positives (Incorrectly Predicted):")
            for hpo_id in stats['fp_terms']:
                # Get entity associated with this HPO ID
                entity = predictions[patient_id]['entities'].get(hpo_id, "Entity not found")
                # Find corresponding term info from predictions
                term_info = next(
                    (term for term in predictions[patient_id]['terms']
                    if any(hpo_id in str(candidate) for candidate in ground_truth[patient_id]['phenotypes'])),
                    "Term info not found"
                )
                print(f"  - {hpo_id}: {term_info}")
                print(f"    Entity: {entity}")
            
            # Rest of the method remains the same...
            print("\nFalse Negatives (Missed):")
            for hpo_id in stats['fn_terms']:
                term_info = next(
                    (pheno['phenotype_name'] 
                    for pheno in ground_truth[patient_id]['phenotypes']
                    if pheno['hpo_id'] == hpo_id),
                    "Term info not found"
                )
                print(f"  - {hpo_id}: {term_info}")
            
            print("\nTrue Positives (Correct Predictions):")
            for hpo_id in stats['tp_terms']:
                term_info = next(
                    (pheno['phenotype_name'] 
                    for pheno in ground_truth[patient_id]['phenotypes']
                    if pheno['hpo_id'] == hpo_id),
                    "Term info not found"
                )
                print(f"  - {hpo_id}: {term_info}")
            
            print("\n" + "=" * 80)
    
    def evaluate_results(self, output_results_path: str, 
                        ground_truth_path: str,
                        verbose: bool = False) -> Dict:
        """
        Enhanced evaluation function that includes error analysis
        """
        try:
            # Load and process data
            df = self.analyze_csv_structure(output_results_path)
            if df is None:
                return None
            
            with open(ground_truth_path, 'r') as f:
                ground_truth_data = json.load(f)
            
            predictions = self.process_predictions_df(df)
            processed_truth = self.extract_ground_truth(ground_truth_data)
            
            # Compute metrics
            metrics = self.compute_metrics(predictions, processed_truth, verbose)
            
            # Perform error analysis
            error_analysis = self.analyze_errors(predictions, processed_truth)
            self.print_worst_cases(error_analysis, predictions, ground_truth_data)
            
            # Add error analysis to metrics
            metrics['error_analysis'] = error_analysis
            
            return metrics
            
        except Exception as e:
            print(f"Error during evaluation: {str(e)}")
            import traceback
            traceback.print_exc()
            return None

# Example usage:
if __name__ == "__main__":
    evaluator = EnhancedHPOEvaluator(similarity_threshold=50.0)
    metrics = evaluator.evaluate_results(
        'data/results/llama70b_r1.csv',
        'data/dataset/data.json',
        verbose=False
    )

NameError: name 'CombinedHPOEvaluator' is not defined

In [1]:
from utils.llm_client import LocalLLMClient

# Initialize the client
client = LocalLLMClient(model_type="llama3_70b", device="cuda:0", temperature=0.1)

  from .autonotebook import tqdm as notebook_tqdm
2025-02-21 13:53:00.704438: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740167580.718815  501546 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740167580.723073  501546 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-21 13:53:00.740411: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kw

Initialized ModelLoader with cache directory: /shared/rsaas/jw3/rare_disease/model_cache
Loading LLM!
Device configuration: cuda:0
Using device map: {'': 'cuda:0'}
Loading 70B model with quantization: llama3_70b
Generated cache path: /shared/rsaas/jw3/rare_disease/model_cache/Llama-3.3-70B-Instruct_4bit_nf4
Valid cache found at /shared/rsaas/jw3/rare_disease/model_cache/Llama-3.3-70B-Instruct_4bit_nf4
Loading cached quantized model from /shared/rsaas/jw3/rare_disease/model_cache/Llama-3.3-70B-Instruct_4bit_nf4


Loading checkpoint shards: 100%|██████████| 8/8 [00:10<00:00,  1.28s/it]
Device set to use cuda:0


Hello. How can I assist you with medical information or answer any questions you may have related to healthcare or the biomedical domain?


In [2]:
user_input = "A 35-year-old woman presented to the medical emergency department with low-grade fever for 3 weeks, vomiting for 1 week and anuria for 3 days. She also reported dysuria and breathlessness for 1 week. There was no history of decreased urine output, dialysis, effort intolerance, chest pain or palpitation, dyspnoea and weight loss. Menstrual history was within normal limit but she reported gradually progressive loss of appetite. Family history included smoky urine in her younger brother in his childhood, who died in an accident. On general survey, the patient was conscious and alert. She was dyspnoeic and febrile. Severe pallor was present with mild pedal oedema. Blood pressure was 180/100 mm Hg and pulse rate of 116/min regular. No evidence of jaundice, clubbing cyanosis or lymphadenopathy was found. Physical examination revealed bibasilar end-inspiratory crepitations in lungs and suprapubic tenderness. There was no hepatosplenomegaly or ascites. Cardiac examination was normal. She was found to have severe bilateral hearing loss, which was gradually progressive for 5 years. The fundi were bilaterally pale. The patient was referred to the department of ophthalmology for a comprehensive eye examination. Her visual acuity was documented as 6/18 in both eyes with no obvious lenticular opacity. Slit-lamp examination showed bilateral anterior lentiglobus with posterior lenticonus. Distant direct ophthalmoscopy revealed oil droplet sign (a suggestive confirmation of the presence of lenticonus); and peripheral retina revealed multiple yellowish white lesion-like flecks in the mid-periphery, and few blot haemorrhages indicative of hypertensive changes. Haemogram showed haemoglobin (Hb) 5.7 g/dL, erythrocyte sedimentation rate 15 mm in first hour, white cell count 17 200/\u00b5L with 82% polymorphs, adequate platelets and mean corpuscular volume 83.3 fL. Peripheral smear showed normocytic normochromic anaemia with mild anisocytosis. Fasting sugar 78 mg/dL, blood urea 325 mg/dL, serum creatinine 11.2 mg/dL and uric acid 8.3 mg/dL. Liver function tests were within normal limit as were serum electrolytes, except serum calcium (conc.) 5.8 mg/dL (adjusted with serum albumin). Lipid profile and iron profile were also normal. HIV and viral markers for HbsAg and hepatitis C virus were non-reactive. ECG showed sinus tachycardia with features of left ventricular hypertrophy and chest X-ray posteroanterior view revealed cardiomegaly. Urinalysis showed full field of pus cells with 35\u201340 RBCs/hpf and 3(+) proteinuria. Urine samples for cultures were sent which reported pure growth of Escherichia coli. Spot urine for protein:creatinine ratio was 2.07 g/g Cr. She underwent pure tone audiometry which revealed features suggestive of severe bilateral sensorineural hearing loss (SHNL). Ultrasound of the abdomen showed bilateral contracted kidneys: right measured 6.7\u00d72.3 cm and left 7.8\u00d73 cm, with increased cortical echogenecity and loss of corticomedullary differentiation, suggestive of medical renal disease. Two-dimensional Echo reported dilated left ventricular cavity with mild mitral regurgitation and ejection fraction of 55%. Renal and skin biopsies were conducted and specimens were sent for light and electron microscopy (EM). Renal tissue on H&E stain was reported as focal segmental glomerulonephritis (FSGS). Ultrathin sections of EM study of renal tissue revealed disruption of glomerular basement membrane (GBM) with diffuse thickening of glomerular capillary wall. Dermal tissue depicted discontinuity of lamina densa with basket weaving pattern under EM."
system_prompt = "You are a rare disease expert with extensive medical knowledge. Carefully review every sentence of the clinical passage to identify terms related to genetic inheritance patterns, anatomical anomalies, clinical symptoms, diagnostic findings, test results, and specific conditions or syndromes. Completely ignore negative findings, normal findings (i.e. 'normal' or 'no'), procedures and family history. Include appropriate context based only on the passage. Return the extracted terms in a JSON object with a single key 'findings', which contains the list of extracted terms spelled correctly. Ensure the output is concise without any additional notes, commentary, or meta explanations."
# client.query(user_input, system_prompt)

In [3]:
from hporag.entity import LLMEntityExtractor
from hporag.hpo_match import RAGHPOMatcher
from hporag.pipeline import HPORAG
from utils.embedding import EmbeddingsManager


retriever = 'fastembed'
retriever_model = 'BAAI/bge-small-en-v1.5'
embeddings_manager = EmbeddingsManager(
model_type=retriever,
model_name=retriever_model if retriever in ['fastembed', 'sentence_transformer'] else None,
device="cpu"
)

texts =[user_input]

entity_extractor = LLMEntityExtractor(llm_client=client, system_message=system_prompt)
# Extract entities using existing entity extractor
batch_entities = entity_extractor.process_batch(texts)
print(batch_entities)

# # Process matches with contexts
# batch_matches = []
# for entities, contexts in zip(batch_entities, batch_contexts):
#     # Pass embedded_documents directly, not as a list
#     matches = self.hpo_matcher.match_hpo_terms(entities, embedded_documents, contexts)
#     batch_matches.append(matches)

# # Store results
# for j, matches in enumerate(batch_matches):
#     if matches:  # matches is a list of dictionaries with full match info
#         results.append({
#             'patient_id': batch.iloc[j][id_column],
#             'matches': matches,
#             'original_text': batch.iloc[j][text_column]
#         })

Loading model...
Model type: fastembed
Model name: BAAI/bge-small-en-v1.5
Device: cpu
[['anuria', 'dysuria', 'breathlessness', 'dyspnoea', 'severe pallor', 'mild pedal oedema', 'hypertension', 'bibasilar end-inspiratory crepitations', 'suprapubic tenderness', 'severe bilateral hearing loss', 'bilateral anterior lentiglobus', 'posterior lenticonus', 'lenticular opacity', 'oil droplet sign', 'hypertensive changes', 'normocytic normochromic anaemia', 'mild anisocytosis', 'elevated blood urea', 'elevated serum creatinine', 'elevated uric acid', 'left ventricular hypertrophy', 'cardiomegaly', 'proteinuria', 'haematuria', 'Escherichia coli infection', 'severe bilateral sensorineural hearing loss', 'bilateral contracted kidneys', 'medical renal disease', 'focal segmental glomerulonephritis', 'disruption of glomerular basement membrane', 'diffuse thickening of glomerular capillary wall', 'discontinuity of lamina densa']]


In [4]:
# Extract contexts for all entities
system_prompt_ii = "You are a rare disease expert with extensive medical knowledge. Identify the most appropriate Human Phenotype Ontology (HPO) term for the given patient data and additional context provided. Prioritize terms that are concise and directly relevant to the primary symptom or condition described. Focus on the core subject of each phrase and avoid selecting options with additional descriptive or situational details unless they are essential for accurately capturing the phenotype. Ensure the chosen HPO term closely matches the patient's condition as described, without adding any new or extraneous terms. If multiple phenotypes are present, select and return the single most pertinent HPO term that best represents the primary condition or symptom. Provide only the HPO term itself, with no extra context or commentary."
hpo_matcher = RAGHPOMatcher(embeddings_manager, client, system_prompt_ii)
pipeline = HPORAG(entity_extractor, hpo_matcher, False)
batch_contexts = pipeline._extract_contexts(batch_entities, texts)
print(len(batch_contexts[0]))

32


In [5]:
batch_matches = []
embedded_documents = embeddings_manager.load_documents("data/vector_stores/G2GHPO_metadata_medembed.npy")
for entities, contexts in zip(batch_entities, batch_contexts):
    # Pass embedded_documents directly, not as a list
    matches = hpo_matcher.match_hpo_terms(entities, embedded_documents, contexts)
    batch_matches.append(matches)

Data type: <class 'numpy.ndarray'>
Shape or length: (61401,)
First element type: <class 'dict'>
Query: breathlessness
Original Sentence: She also reported dysuria and breathlessness for 1 week.
Context: The following related information is available to assist in determining the appropriate HPO terms:
- exertional breathlessness (HP:0002875)
- breathlessness at rest (HP:0033710)
- a sensation of breathlessness in the recumbent position, relieved by sitting or standing. (HP:0012764)
- attacks of breathlessness that occur at night and may awaken the sleeping patient. (HP:0034807)
- dyspnea (HP:0002094)
- perceived difficulty to breathe that occurs with exercise or exertion and improves with rest. (HP:0002875)
- shortness of breath (HP:0002094)
- breathing difficulty (HP:0002094)
- decreased breath sounds (HP:4000214)
- exertional dyspnea (HP:0002875)
- breathing difficulties (HP:0002098)
- noticeably unpleasant odors exhaled in breathing. (HP:0100812)
- hyperventilation (HP:0002883)
- abn

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Query: oil droplet sign
Original Sentence: Distant direct ophthalmoscopy revealed oil droplet sign (a suggestive confirmation of the presence of lenticonus); and peripheral retina revealed multiple yellowish white lesion-like flecks in the mid-periphery, and few blot haemorrhages indicative of hypertensive changes.
Context: The following related information is available to assist in determining the appropriate HPO terms:
- holster sign (HP:6000006)
- rope sign (HP:0031924)
- empty delta sign (HP:0032267)
- shawl sign (HP:0025535)
- railroad track sign (HP:0009897)
- air crescent sign (HP:0032172)
- champagne cork sign (HP:0034226)
- trousseau sign (HP:6000919)
- snowflake sign (HP:6000341)
- silhouette sign (HP:0033647)
- chandelier sign (HP:6000107)
- gowers sign (HP:0003391)
- froment sign (HP:0032121)
- comet tail sign (HP:0033658)
- dural tail sign (HP:0032268)
- lemon sign (HP:0032269)
- carpet tack sign (HP:0032152)
- trident sign (HP:0034044)
- mcconnell sign (HP:0034331)
- oppe

In [7]:
batch_matches = []

embedded_documents = embeddings_manager.load_documents("data/vector_stores/G2GHPO_metadata_medembed.npy")
hpo_matcher.prepare_index(embedded_documents)
for entities, contexts in zip(batch_entities, batch_contexts):
    # Pass embedded_documents directly, not as a list
    matches = []
    print(len(entities))
    for entity, original_sentence in zip(entities, contexts):
        # Get candidate terms using vector retrieval
        
        candidates = hpo_matcher._retrieve_candidates(entity)
        match_info = {
            'entity': entity,
            'top_candidates': candidates[:5]
        }
        print("entity:", entity)
        print("match_info:", match_info)
        # Try enriched exact matching process
        hpo_term = hpo_matcher._try_enriched_matching(entity, candidates)
        print("hpo_term:", hpo_term)
        if hpo_term:
            match_info.update({
                'hpo_term': hpo_term,
                'match_method': 'exact',
                'confidence_score': 1.0
            })
            matches.append(match_info)
        # If no exact match found, try LLM matching with original sentence context
        elif client:
            hpo_term = hpo_matcher._try_llm_match(entity, candidates, original_sentence)
            print("LLM response:", hpo_term)
            if hpo_term:
                match_info.update({
                    'hpo_term': hpo_term,
                    'match_method': 'llm',
                    'confidence_score': 0.7
                })
                matches.append(match_info)
        print(len(matches))
        print("-----------------")
        
    batch_matches.append(matches)

Data type: <class 'numpy.ndarray'>
Shape or length: (61401,)
First element type: <class 'dict'>
32
entity: anuria
match_info: {'entity': 'anuria', 'top_candidates': [{'metadata': {'info': 'anuria', 'hp_id': 'HP:0100519'}, 'similarity_score': 0.971534315495728}, {'metadata': {'info': 'nocturia', 'hp_id': 'HP:0000017'}, 'similarity_score': 0.7103948435237569}, {'metadata': {'info': 'polyuria', 'hp_id': 'HP:0000103'}, 'similarity_score': 0.7015884729540331}, {'metadata': {'info': 'alaninuria', 'hp_id': 'HP:0020078'}, 'similarity_score': 0.6859778720297504}, {'metadata': {'info': 'valinuria', 'hp_id': 'HP:0033088'}, 'similarity_score': 0.6802457270398027}]}
hpo_term: HP:0100519
1
-----------------
entity: dysuria
match_info: {'entity': 'dysuria', 'top_candidates': [{'metadata': {'info': 'dysuria', 'hp_id': 'HP:0100518'}, 'similarity_score': 0.9568687598737801}, {'metadata': {'info': 'lysinuria', 'hp_id': 'HP:0003297'}, 'similarity_score': 0.797467700704248}, {'metadata': {'info': 'dysmorph

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Prompt: Query: lenticular opacity
Original Sentence: Her visual acuity was documented as 6/18 in both eyes with no obvious lenticular opacity.
Context: The following related information is available to assist in determining the appropriate HPO terms:
- punctate lenticular opacities (HP:0007648)
- juvenile posterior subcapsular lenticular opacities (HP:0007935)
- lenticular astigmatism (HP:0031791)
- subcapsular opacities (HP:0000523)
- lens opacity (HP:0000518)
- opacity refers to any area that preferentially attenuates the x-ray beam and therefore appears more opaque than the surrounding area. it is a nonspecific term that does not indicate the size or pathologic nature of the abnormality. (HP:0031457)
- lens opacities (HP:0000518)
- centrilobular groundglass opacity (HP:0025180)
- corneal stromal opacity (HP:0007759)
- corneal opacity (HP:0007957)
- enamel opacity (HP:0033786)
- anterior lenticonus (HP:0011501)
- a type of posterior cortical cataract characterized by dense lenticula

In [6]:
print(len(batch_matches[0]))

32


In [6]:
import json
labels = json.load(open("data/dataset/data.json", "r"))
phenos = labels["53"]["phenotypes"]
len(phenos)

48

In [7]:
from typing import List, Dict
import pandas as pd

def evaluate_hpo_matches(batch_matches: List[Dict], ground_truth: List[Dict]) -> Dict:
    """
    Evaluate HPO term predictions against ground truth based on HPO IDs.
    
    Args:
        batch_matches: List of prediction dictionaries with 'entity' and 'hpo_term' keys
        ground_truth: List of dictionaries with 'phenotype_name' and 'hpo_id' keys
    
    Returns:
        Dictionary containing evaluation metrics
    """
    # Convert ground truth to a set of HPO IDs
    true_hpo_ids = set(pheno['hpo_id'] for pheno in ground_truth)
    
    # Get predicted HPO terms, handling None values
    pred_hpo_ids = set(
        match['hpo_term'] for match in batch_matches 
        if match['hpo_term'] is not None
    )
    
    # Calculate metrics based on HPO IDs
    true_positives = len(true_hpo_ids.intersection(pred_hpo_ids))
    false_positives = len(pred_hpo_ids - true_hpo_ids)
    false_negatives = len(true_hpo_ids - pred_hpo_ids)
    
    # Calculate precision, recall, and F1
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    # Create detailed matches analysis
    matches_analysis = []
    for match in batch_matches:
        entity = match['entity']
        pred_hpo = match['hpo_term']
        match_type = match['match_method']
        confidence = match['confidence_score']
        
        # Check if predicted HPO ID is in ground truth
        correct = pred_hpo in true_hpo_ids if pred_hpo else False
        
        matches_analysis.append({
            'entity': entity,
            'predicted_hpo': pred_hpo,
            'correct': correct,
            'match_type': match_type,
            'confidence': confidence
        })
    
    # Identify missed and incorrect HPO IDs
    missed_hpo_ids = true_hpo_ids - pred_hpo_ids
    incorrect_hpo_ids = pred_hpo_ids - true_hpo_ids
    
    # Map these back to phenotype names
    missed_phenotypes = [
        pheno for pheno in ground_truth
        if pheno['hpo_id'] in missed_hpo_ids
    ]
    
    incorrect_predictions = [
        match for match in batch_matches
        if match['hpo_term'] in incorrect_hpo_ids
    ]
    
    return {
        'metrics': {
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'true_positives': true_positives,
            'false_positives': false_positives,
            'false_negatives': false_negatives
        },
        'matches_analysis': matches_analysis,
        'missed_phenotypes': missed_phenotypes,
        'incorrect_predictions': incorrect_predictions,
        'ground_truth_hpo_ids': list(true_hpo_ids),
        'predicted_hpo_ids': list(pred_hpo_ids)
    }

def print_evaluation_results(eval_results: Dict):
    """
    Print formatted evaluation results.
    
    Args:
        eval_results: Dictionary containing evaluation metrics and analysis
    """
    # Print overall metrics
    metrics = eval_results['metrics']
    print("\n=== Overall Metrics ===")
    print(f"F1 Score: {metrics['f1_score']:.3f}")
    print(f"Precision: {metrics['precision']:.3f}")
    print(f"Recall: {metrics['recall']:.3f}")
    print(f"True Positives: {metrics['true_positives']}")
    print(f"False Positives: {metrics['false_positives']}")
    print(f"False Negatives: {metrics['false_negatives']}")
    
    # Print HPO ID sets
    print("\n=== HPO ID Comparison ===")
    print("Ground Truth HPO IDs:", eval_results['ground_truth_hpo_ids'])
    print("Predicted HPO IDs:", eval_results['predicted_hpo_ids'])
    
    # Print detailed matches analysis
    print("\n=== Detailed Matches Analysis ===")
    matches_df = pd.DataFrame(eval_results['matches_analysis'])
    if not matches_df.empty:
        print("\nPredictions:")
        print(matches_df.to_string(index=False))
    
    # Print missed phenotypes
    if eval_results['missed_phenotypes']:
        print("\n=== Missed Phenotypes (False Negatives) ===")
        for pheno in eval_results['missed_phenotypes']:
            print(f"- {pheno['phenotype_name']} ({pheno['hpo_id']})")
    
    # Print incorrect predictions
    if eval_results['incorrect_predictions']:
        print("\n=== Incorrect Predictions (False Positives) ===")
        for pred in eval_results['incorrect_predictions']:
            print(f"- {pred['entity']} -> {pred['hpo_term']} (confidence: {pred['confidence_score']:.3f})")

# Example usage:

# With your existing variables:
eval_results = evaluate_hpo_matches(batch_matches[0], phenos)
print_evaluation_results(eval_results)



=== Overall Metrics ===
F1 Score: 0.538
Precision: 0.700
Recall: 0.438
True Positives: 21
False Positives: 9
False Negatives: 27

=== HPO ID Comparison ===
Ground Truth HPO IDs: ['HP:0025242', 'HP:4000141', 'HP:0011134', 'HP:0001640', 'HP:0000097', 'HP:0005564', 'HP:0011527', 'HP:0012085', 'HP:0000365', 'HP:0003138', 'HP:0001730', 'HP:0001017', 'HP:0001897', 'HP:0008625', 'HP:0100519', 'HP:0001903', 'HP:0011897', 'HP:0000822', 'HP:0012614', 'HP:0025005', 'HP:0030506', 'HP:0011502', 'HP:0002094', 'HP:0011273', 'HP:0033132', 'HP:0003259', 'HP:0001895', 'HP:0004396', 'HP:0010741', 'HP:0000093', 'HP:0100518', 'HP:0000407', 'HP:0012586', 'HP:0031998', 'HP:0032037', 'HP:0002013', 'HP:0001712', 'HP:0002027', 'HP:0012587', 'HP:0011501', 'HP:0012461', 'HP:0001653', 'HP:6000027', 'HP:0002149', 'HP:0011703', 'HP:0033803', 'HP:0033485', 'HP:0002901']
Predicted HPO IDs: ['HP:0001640', 'HP:0000097', 'HP:0011527', 'HP:0003138', 'HP:0008625', 'HP:0001897', 'HP:0100519', 'HP:0000822', 'HP:0000790', 'H