In [2]:
import json
import requests
import logging
import numpy as np
import os
from typing import Dict, List, Optional

# Root logging setup: timestamped records at DEBUG verbosity, which includes
# detailed internal state. Switch the level to logging.INFO for quieter output.
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)

# Embedding endpoint of the Ollama server running on this machine.
EMBED_URL = "http://localhost:11434/api/embed"

def get_embedding_for_drug(drug: str, model: str, timeout: float = 120.0) -> Optional[np.ndarray]:
    """
    Get the embedding of a drug name by calling the Ollama embedding API.

    Args:
        drug (str): The drug name.
        model (str): The model to use for embeddings (e.g. "llama3:8b").
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout, a stalled server would block the caller forever.

    Returns:
        Optional[np.ndarray]: The embedding vector as a NumPy array, or None if the
        request fails or the response does not contain a usable embedding.
    """
    payload = {
        "model": model,
        "input": drug,
    }
    logger.debug("Requesting embedding for drug '%s' using model '%s'", drug, model)

    try:
        response = requests.post(EMBED_URL, json=payload, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a body that is not valid JSON on requests versions
        # whose JSON decode error is not a RequestException subclass.
        logger.error("Error fetching embedding for drug '%s': %s", drug, err)
        return None

    try:
        # The embedding is expected in the first (and only) element of "embeddings".
        embedding_list = data["embeddings"][0]
        vector_length = len(embedding_list)
    except (KeyError, IndexError, TypeError) as err:
        # Keep the documented contract: a malformed response yields None
        # instead of aborting the whole pipeline.
        logger.error("Unexpected embedding response for drug '%s': %r", drug, err)
        return None

    logger.info("Embedding for drug '%s' received (vector length: %d)", drug, vector_length)
    return np.array(embedding_list)

def embed_drugs(drug_list: List[str], model: str) -> Dict[str, np.ndarray]:
    """
    Request an embedding for every drug in the list and collect the successes.

    Args:
        drug_list (List[str]): List of drug names.
        model (str): Embedding model to use.

    Returns:
        Dict[str, np.ndarray]: Mapping from drug name to its embedding (as a NumPy array).
        Drugs whose embedding request failed are left out.
    """
    vectors_by_drug = {}
    for name in drug_list:
        vector = get_embedding_for_drug(name, model)
        if vector is None:
            # A failed request is reported and the drug is dropped from the result.
            logger.warning("Skipping drug '%s' due to failed embedding.", name)
            continue
        vectors_by_drug[name] = vector
    return vectors_by_drug

def run_through_all_approaches(approach_files: Dict[str, str], model: str) -> Dict[str, Dict[str, np.ndarray]]:
    """
    Process all approaches by loading the respective JSON files containing drugs,
    generating embeddings for each, and storing the results.

    An approach whose file is missing, unreadable, not valid JSON, or whose
    top-level JSON value is neither an object nor an array is logged and skipped,
    so one bad file does not abort the remaining approaches.

    Args:
        approach_files (Dict[str, str]): Mapping of approach names to JSON filenames.
        model (str): Embedding model to use.

    Returns:
        Dict[str, Dict[str, np.ndarray]]: Mapping from approach name to a dictionary of (drug -> embedding) pairs.
    """
    all_approach_embeddings = {}

    for approach, file_path in approach_files.items():
        # Open directly rather than checking existence first, which avoids a
        # second filesystem lookup and also reports other I/O failures.
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                data = json.load(file)
        except FileNotFoundError:
            logger.error("File %s for approach '%s' not found!", file_path, approach)
            continue
        except (OSError, ValueError) as err:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.error("Could not load drugs for approach '%s' from %s: %s", approach, file_path, err)
            continue

        # Handle both dictionary (drug names are the keys) and list structures.
        if isinstance(data, dict):
            drug_list = list(data.keys())
        elif isinstance(data, list):
            drug_list = data
        else:
            # A bare string would otherwise be embedded character by character.
            logger.error(
                "File %s for approach '%s' must contain a JSON object or array, got %s",
                file_path, approach, type(data).__name__,
            )
            continue
        logger.info("Loaded %d drugs for approach '%s' from %s", len(drug_list), approach, file_path)

        all_approach_embeddings[approach] = embed_drugs(drug_list, model)
    return all_approach_embeddings

def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """
    Calculate the cosine similarity between two embedding vectors.

    Args:
        vec1 (np.ndarray): First embedding vector.
        vec2 (np.ndarray): Second embedding vector.

    Returns:
        float: Cosine similarity value (range [-1, 1]). Returns 0.0 when either
        vector has zero norm, because the similarity is undefined in that case.
    """
    logger.debug("Calculating cosine similarity.")
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)

    if norm1 == 0 or norm2 == 0:
        logger.error("Zero norm encountered.")
        return 0.0

    # Floating-point rounding can push the ratio marginally outside [-1, 1]
    # (e.g. 1.0000000000000002 for identical vectors), which would make
    # "1 - similarity" slightly negative. Clip to honour the documented range.
    similarity = np.clip(dot_product / (norm1 * norm2), -1.0, 1.0)
    logger.debug("Cosine similarity: %f", similarity)
    return similarity

def calculate_minimal_distances_to_literature(approach_embeddings: Dict[str, np.ndarray], 
                                           literature_embeddings: Dict[str, np.ndarray]) -> Dict[str, float]:
    """
    Find, for every drug of one approach, how close its nearest literature drug is.

    Distance is the cosine distance (1 - cosine similarity). When the literature
    set is empty, the reported distance is infinity.

    Args:
        approach_embeddings (Dict[str, np.ndarray]): Mapping of drug to embedding for one approach.
        literature_embeddings (Dict[str, np.ndarray]): Mapping of drug to embedding for literature comparison.

    Returns:
        Dict[str, float]: Mapping from drug name to its minimal distance to any literature drug.
    """
    nearest_by_drug = {}

    for name, vector in approach_embeddings.items():
        # The leading (inf, None) sentinel is the starting point; min() only moves
        # to a later candidate that is strictly smaller, so the first best match wins.
        candidates = [(float('inf'), None)]
        candidates += [
            (1 - cosine_similarity(vector, lit_vector), lit_name)
            for lit_name, lit_vector in literature_embeddings.items()
        ]
        best_distance, best_match = min(candidates, key=lambda pair: pair[0])

        nearest_by_drug[name] = best_distance
        logger.debug("Drug '%s': minimal cosine distance to literature = %f (closest to '%s')", 
                    name, best_distance, best_match)

    return nearest_by_drug

def compare_approaches_to_literature(all_approach_embeddings: Dict[str, Dict[str, np.ndarray]], 
                                   literature_embeddings: Dict[str, np.ndarray]) -> None:
    """
    For each approach, calculate the minimal distances between its drug embeddings
    and the literature comparison embeddings. Then compare these distances
    to determine which approach is closest to the literature comparison set.

    The entry named "literature" is skipped, as are approaches without any
    embeddings (their statistics would be undefined). If nothing remains to
    compare, a short notice is printed and the function returns.

    Args:
        all_approach_embeddings (Dict[str, Dict[str, np.ndarray]]): Mapping of approach names to embeddings dict.
        literature_embeddings (Dict[str, np.ndarray]): Embeddings of literature comparison drugs.
    """
    approach_results = {}

    for approach, embeddings in all_approach_embeddings.items():
        if approach == "literature":  # Skip the literature set itself
            continue
        if not embeddings:
            # Avoids a division by zero and min()/max() on an empty sequence below.
            logger.warning("Approach '%s' has no embeddings; skipping comparison.", approach)
            continue

        minimal_distances = calculate_minimal_distances_to_literature(embeddings, literature_embeddings)
        distances = list(minimal_distances.values())

        # Calculate statistics
        num_drugs = len(distances)
        total_distance = sum(distances)
        avg_distance = total_distance / num_drugs
        min_distance = min(distances)
        max_distance = max(distances)

        approach_results[approach] = {
            "total_distance": total_distance,
            "avg_distance": avg_distance,
            "min_distance": min_distance,
            "max_distance": max_distance,
            "num_drugs": num_drugs,
        }

        logger.info("Approach '%s' results:", approach)
        logger.info("  Total distance (sum of minimal distances for all drugs in the approach): %f", total_distance)
        logger.info("  Average distance (mean of the minimal distances across all drugs): %f", avg_distance)
        logger.info("  Min distance (smallest minimal distance observed among all drugs.): %f", min_distance)
        logger.info("  Max distance (largest minimal distance observed among all drugs): %f", max_distance)
        logger.info("  Number of drugs (count of drugs successfully embedded and evaluated): %d", num_drugs)

    if not approach_results:
        # min() over an empty mapping would raise ValueError; report and stop instead.
        print("\nNo approaches with embeddings to compare against the literature.")
        return

    # Find the approach with the smallest total distance.
    # NOTE: the total is a sum over drugs, so it also depends on how many drugs
    # each approach contributed, not only on how close they are.
    closest_name, closest_stats = min(approach_results.items(), key=lambda x: x[1]["total_distance"])

    # Print summary results
    print("\nComparison of approaches to literature comparison:")
    print("=" * 50)
    for approach, results in approach_results.items():
        print(f"\n{approach.upper()}:")
        print(f"  Total distance: {results['total_distance']:.4f}")
        print("    → Sum of minimal distances between each drug in this approach and its closest literature drug")
        print(f"  Average distance: {results['avg_distance']:.4f}")
        print("    → Mean cosine distance across all drugs, lower values indicate better alignment with literature")
        print(f"  Min distance: {results['min_distance']:.4f}")
        print("    → Distance of the drug most similar to any literature drug")
        print(f"  Max distance: {results['max_distance']:.4f}")
        print("    → Distance of the drug least similar to any literature drug")
        print(f"  Number of drugs: {results['num_drugs']}")
        print("    → Total drugs successfully embedded and compared from this approach")

    print(f"\nApproach closest to literature comparison: {closest_name}")
    print(f"Total distance: {closest_stats['total_distance']:.4f}")

if __name__ == "__main__":
    # JSON files holding the drug list of each approach. The "literature" entry
    # is the reference set that the other approaches are compared against.
    approach_files = {
        "zero_shot": "drug_lists/zero_shot_prompt_top_50_drugs.json",
        "ontological_prompt": "drug_lists/ontological_prompt_top_50_drugs.json",
        "literature": "drug_lists/cummings_eta_al_AD_DR_candidates.json"
    }

    # Specify the embedding model to use
    embedding_model = "llama3:8b"

    # Run the embedding pipeline for all approaches
    all_embeddings = run_through_all_approaches(approach_files, embedding_model)

    # Extract the literature comparison embeddings
    literature_embeddings = all_embeddings.get("literature", {})
    if not literature_embeddings:
        logger.error("No literature embeddings found. Exiting.")
        # The bare exit() helper is injected by the `site` module for interactive
        # use and is not guaranteed to exist; SystemExit needs no import.
        raise SystemExit(1)

    # Compare approaches to literature comparison
    compare_approaches_to_literature(all_embeddings, literature_embeddings)


2025-05-27 23:10:06,492 [INFO] Loaded 50 drugs for approach 'zero_shot' from drug_lists/zero_shot_prompt_top_50_drugs.json
2025-05-27 23:10:06,493 [DEBUG] Requesting embedding for drug 'Aducanumab' using model 'llama3:8b'
2025-05-27 23:10:06,501 [DEBUG] Starting new HTTP connection (1): localhost:11434
2025-05-27 23:10:09,292 [DEBUG] http://localhost:11434 "POST /api/embed HTTP/1.1" 200 None
2025-05-27 23:10:09,295 [INFO] Embedding for drug 'Aducanumab' received (vector length: 4096)
2025-05-27 23:10:09,297 [DEBUG] Requesting embedding for drug 'Lecanemab' using model 'llama3:8b'
2025-05-27 23:10:09,300 [DEBUG] Starting new HTTP connection (1): localhost:11434
2025-05-27 23:10:09,426 [DEBUG] http://localhost:11434 "POST /api/embed HTTP/1.1" 200 None
2025-05-27 23:10:09,428 [INFO] Embedding for drug 'Lecanemab' received (vector length: 4096)
2025-05-27 23:10:09,428 [DEBUG] Requesting embedding for drug '6-[3-(4-Morpholinyl)Propyl]-2-(3-Nitrophenyl)-5-Thioxo-5,6,-Dihydro-7h-Thienol[2',3'


Comparison of approaches to literature comparison:

ZERO_SHOT:
  Total distance: 18.8076
    → Sum of minimal distances between each drug in this approach and its closest literature drug
  Average distance: 0.3762
    → Mean cosine distance across all drugs, lower values indicate better alignment with literature
  Min distance: 0.1105
    → Distance of the drug most similar to any literature drug
  Max distance: 0.6926
    → Distance of the drug least similar to any literature drug
  Number of drugs: 50
    → Total drugs successfully embedded and compared from this approach

ONTOLOGICAL_PROMPT:
  Total distance: 14.7721
    → Sum of minimal distances between each drug in this approach and its closest literature drug
  Average distance: 0.2954
    → Mean cosine distance across all drugs, lower values indicate better alignment with literature
  Min distance: 0.1039
    → Distance of the drug most similar to any literature drug
  Max distance: 0.6928
    → Distance of the drug least simi