# MedCompare: Using Bio+ClinicalBERT for comparative analysis of medications



## Requirements

In [6]:
!pip install datasets
!pip install transformers
!pip install huggingface_hub
!pip install ipywidgets

Collecting ipywidgets
  Using cached ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Using cached widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Using cached jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Using cached ipywidgets-8.1.5-py3-none-any.whl (139 kB)
Using cached jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)
Using cached widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.1.5 jupyterlab-widgets-3.0.13 widgetsnbextension-4.0.13


In [4]:
from huggingface_hub import notebook_login

In [7]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## 1. Data Processing

In [8]:
import re
import json
import requests
from transformers import AutoTokenizer, pipeline, AutoModel, pipeline
import torch
from datasets import load_dataset

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


ModuleNotFoundError: No module named 'torch'

Load the necessary models and data


In [None]:
# Load Bio+ClinicalBERT tokenizer, model and data
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
dataset = load_dataset("MattBastar/Medicine_Details")
data = dataset['train']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Medicine_Details.csv:   0%|          | 0.00/4.36M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11825 [00:00<?, ? examples/s]

We get the ontology mapping via open source platform BioPortal bioontology. For this u need a file (api-key.txt) that contains an api-key for BioPortal. For this create an account5 at https://bioportal.bioontology.org/ and get your own key.

In [None]:
BASE_URL = "http://data.bioontology.org"
file = open("api-key.txt", "r")
API_KEY = file.read().strip()
file.close()

headers = {
    "Authorization": f"apikey token={API_KEY}"
}

In [None]:
# Function to clean text

def clean_text(text):
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    text = re.sub(r"\b\d+\b", "", text)  # Remove standalone numbers
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

In [None]:
# Function to look up ontology mappings from BioPortal API

def get_bioportal_mapping(term):

    params = {
        "q": term,
        "require_exact_match": "true"
    }
    response = requests.get(f"{BASE_URL}/search", headers=headers, params=params)

    if response.status_code != 200:
        return {term: "unknown"}  # Default to "unknown" if the API call fails

    data = response.json()

    # Filter relevant mappings based on ontology prefixes else words without medical context get mapped as well
    relevant_prefixes = [
        "http://purl.bioontology.org/ontology",  # BioPortal's main prefix
        "http://www.co-ode.org/ontologies/galen",  # GALEN ontology
        "http://ncicb.nci.nih.gov"  # NCI Thesaurus
    ]

    for result in data.get("collection", []):
        label = result.get("prefLabel")
        ontology_id = result.get("@id")

        if label and ontology_id and any(ontology_id.startswith(prefix) for prefix in relevant_prefixes):
            return {label.lower(): ontology_id}

    # Default
    return {term: "unknown"}

### Processing User Input


In [None]:
pipe = pipeline("token-classification", model="Clinical-AI-Apollo/Medical-NER", aggregation_strategy="simple")

def preprocess_user_input_with_ner(user_input):

    # Step 1: Use the NER pipeline to extract medical entities
    ner_results = pipe(user_input)

    # Step 2: Filter relevant entities Disease Symptom
    relevant_entity_groups = {"DISEASE_DISORDER", "SIGN_SYMPTOM", "MEDICATION"}
    relevant_entities = [entity["word"] for entity in ner_results if entity["entity_group"] in relevant_entity_groups]

    # Step 3: Map entities to ontology terms
    mapped_terms = {}
    for entity in relevant_entities:
        mapping = get_bioportal_mapping(entity.lower())
        mapped_terms.update(mapping)

    # Step 4: Reconstruct the mapped input
    mapped_input = " ".join(mapped_terms.keys())

    # Step 5: Generate embeddings for the mapped input
    embeddings = None
    if mapped_input.strip():
        inputs = tokenizer(mapped_input, return_tensors="pt", padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
            embeddings = outputs.last_hidden_state[:, 0, :]  # Use [CLS] token for embeddings

    return {
        "cleaned_input": user_input,
        "ner_results": ner_results,
        "relevant_entities": relevant_entities,
        "mapped_terms": mapped_terms,
        "mapped_input": mapped_input,
        "embeddings": embeddings.squeeze(0).tolist() if embeddings is not None and embeddings.numel() > 0 else []
    }

Device set to use cpu


### Generate and store Dataset embeddings

In [None]:
import numpy as np

# Function to combine and preprocess relevant fields
def combine_text(record):
    composition = clean_text(record['Composition'])
    uses = clean_text(record['Uses'])
    return f"{composition} {uses}"

# Process dataset: Combine and preprocess text
processed_texts = [combine_text(record) for record in data]

# Generate embeddings for the dataset
embeddings = []
for text in processed_texts:
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    embeddings.append(embedding)

# Convert embeddings to a NumPy array
embeddings = np.array(embeddings)

# Store medicine names and side effects
medicine_names = [record['Medicine Name'] for record in data]
side_effects = [record['Side_effects'] for record in data]


KeyboardInterrupt: 

read from file

In [None]:
# Store medicine names and side effects
medicine_names = [record['Medicine Name'] for record in data]
side_effects = [record['Side_effects'] for record in data]

embeddings = np.load("medication_embeddings.npy")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
query = "The patient has asthma and needs Budecort"


# Preprocess the query using NER
preprocessed_query = preprocess_user_input_with_ner(query)

print("NER Results:", preprocessed_query["ner_results"])
print("Relevant Entities:", preprocessed_query["relevant_entities"])
print("Mapped Terms:", preprocessed_query["mapped_terms"])
print("Mapped Input:", preprocessed_query["mapped_input"])
print("Embeddings:", preprocessed_query["embeddings"])


# Extract query embedding
query_embedding = np.array(preprocessed_query["embeddings"])
# Compute cosine similarity
similarities = cosine_similarity([query_embedding], embeddings)

# Get top-K matches
top_k = 3
top_indices = np.argsort(similarities[0])[-top_k:][::-1]

# Print results
print("Top matching medications with side effects:")
for idx in top_indices:
    medicine_name = medicine_names[idx]
    side_effect = side_effects[idx]
    similarity = similarities[0][idx]
    print(f"Medicine: {medicine_name}, Similarity: {similarity:.4f}, Side Effects: {side_effect}")

NER Results: [{'entity_group': 'DISEASE_DISORDER', 'score': 0.19648945, 'word': 'asthma', 'start': 15, 'end': 22}, {'entity_group': 'MEDICATION', 'score': 0.7326823, 'word': 'Budecort', 'start': 32, 'end': 41}]
Relevant Entities: ['asthma', 'Budecort']
Mapped Terms: {'asthma': 'http://purl.bioontology.org/ontology/CST/ASTHMA', 'budesonide': 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C1027'}
Mapped Input: asthma budesonide
Embeddings: [0.18023738265037537, 0.3017248511314392, -0.13882333040237427, 0.0818009003996849, -0.014251728542149067, -0.21599873900413513, 0.15633946657180786, 0.4638438820838928, 0.5831120014190674, -0.4617716670036316, -0.1947605460882187, 0.3148210644721985, -0.7621767520904541, 0.06914067268371582, -0.10264098644256592, 0.47087588906288147, 0.0049354806542396545, -0.03586035966873169, 0.7566759586334229, -0.4439232051372528, -0.18401242792606354, 0.16389189660549164, -0.28692930936813354, -0.5211812853813171, -0.14588117599487305, -0.17900367081165314, 

save embeddings

In [None]:
np.save("medication_embeddings.npy", embeddings)

## 2. Similarity Ranking

## 3. Evaluation