# MedCompare: Using Bio+ClinicalBERT for comparative analysis of medications



## Requirements

Attention: before running, switch to gpu execution

In [1]:
!pip install datasets
!pip install transformers
!pip install huggingface_hub
!pip install tqdm


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
from huggingface_hub import notebook_login

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## 1. Data Processing

In [4]:
import re
import json
import requests
from transformers import AutoTokenizer, pipeline, AutoModel, pipeline
import torch
from datasets import load_dataset
from functools import lru_cache

Load the necessary models and data


In [5]:
# Load Bio+ClinicalBERT tokenizer, model and data
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
dataset = load_dataset("MattBastar/Medicine_Details")
data = dataset['train']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Medicine_Details.csv:   0%|          | 0.00/4.36M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11825 [00:00<?, ? examples/s]

We get the ontology mapping via open source platform BioPortal bioontology. For this u need a file (api-key.txt) that contains an api-key for BioPortal. For this create an account5 at https://bioportal.bioontology.org/ and get your own key.

In [6]:
BASE_URL = "http://data.bioontology.org"
file = open("api-key.txt", "r")
API_KEY = file.read().strip()
file.close()

headers = {
    "Authorization": f"apikey token={API_KEY}"
}

In [7]:
# Function to clean text

# def clean_text(text):
#     text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
#     text = re.sub(r"\b\d+\b", "", text)  # Remove standalone numbers
#     text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
#     return text

def clean_text(text):
    # Step 1: Remove special characters except for alphanumeric and spaces
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Keep only letters and spaces

    # Step 2: Remove standalone numbers
    text = re.sub(r"\b\d+\b", "", text)

    # Step 3: Remove specific unwanted terms
    unwanted_terms = ["mg", "treatment", "Treatment", "MG", "mg", "ML", "ml", "of", "mgml"]  # Add other terms to this list as needed
    for term in unwanted_terms:
        text = re.sub(rf"\b{term}\b", "", text, flags=re.IGNORECASE)

    # Step 4: Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text



In [8]:
# Function to look up ontology mappings from BioPortal API
 #caching ap calls maybe helpful

#@lru_cache(maxsize=1000)
def get_bioportal_mapping(term):

    params = {
        "q": term,
        "require_exact_match": "false" # false or true  deoending on exact matching
    }
    response = requests.get(f"{BASE_URL}/search", headers=headers, params=params)

    if response.status_code != 200:
        return {term: "unknown"}  # Default to "unknown" if the API call fails

    data = response.json()

    # Filter relevant mappings based on ontology prefixes else words without medical context get mapped as well
    relevant_prefixes = [
        "http://purl.bioontology.org/ontology",  # BioPortal's main prefix
        "http://www.co-ode.org/ontologies/galen",  # GALEN ontology
        "http://ncicb.nci.nih.gov"  # NCI Thesaurus
    ]

    for result in data.get("collection", []):
        label = result.get("prefLabel")
        ontology_id = result.get("@id")

        if label and ontology_id and any(ontology_id.startswith(prefix) for prefix in relevant_prefixes):
            return {label.lower(): ontology_id}

    # Default
    return {term: "unknown"}

### Processing User Input


In [9]:
pipe = pipeline("token-classification", model="Clinical-AI-Apollo/Medical-NER", aggregation_strategy="simple")

def preprocess_user_input_with_ner(user_input):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Step 1: Use the NER pipeline to extract medical entities
    ner_results = pipe(user_input)

    # Step 2: Group all entities by their types
    grouped_entities = {}
    for entity in ner_results:
        entity_group = entity["entity_group"]
        if entity_group not in grouped_entities:
            grouped_entities[entity_group] = []
        grouped_entities[entity_group].append(entity["word"])

    # Step 3: Collect all identified words
    all_words = []
    for group, words in grouped_entities.items():
        all_words.extend(words)

    # Step 4: Combine all words into a single string
    combined_text = " ".join(all_words)
    print(f"Combined Text: {combined_text}")

    # Step 5: Generate a single embedding for the combined text
    inputs = tokenizer(combined_text, padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        combined_embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()  # CLS token embedding

    # Step 6: Return results
    return {
        "cleaned_input": user_input,
        "ner_results": ner_results,
        "grouped_entities": grouped_entities,
        "all_words": all_words,
        "combined_text": combined_text,
        "combined_embedding": combined_embedding  # Single combined embedding
    }


config.json:   0%|          | 0.00/5.14k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/736M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

Device set to use cpu


### Generate and store Dataset embeddings

In [10]:
def preprocess_with_ner(text_input):
    ner_results = pipe(text_input)
    all_words = [entity["word"] for entity in ner_results]
    combined_text = " ".join(all_words)
    return combined_text

In [11]:
from tqdm import tqdm
def combine_text(record):
    composition = clean_text(record['Composition'])
    uses = clean_text(record['Uses'])
    return f"{composition}{uses}"


processed_texts = [preprocess_with_ner(combine_text(record)) for record in tqdm(data, desc="Processing records")]



Processing records:   0%|          | 0/11825 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Processing records: 100%|██████████| 11825/11825 [1:24:18<00:00,  2.34it/s]


In [12]:
import csv

output_file = "processed_texts.csv"

# Save to a CSV file
with open(output_file, "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Processed Text"])  # Header row
    for text in processed_texts:
        writer.writerow([text])

print(f"Processed texts saved to {output_file}")

from google.colab import files

# Download the CSV file
files.download(output_file)


Processed texts saved to processed_texts.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
import numpy as np


def generate_embeddings(processed_text, model, tokenizer):
  print("Executing with Cuda GPU: " + str(torch.cuda.is_available()))
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = model.to(device)
  embeddings = []
  for text in tqdm(processed_text, desc="Generating embeddings"):
      inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
      with torch.no_grad():
          outputs = model(**inputs)
      embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
      embeddings.append(embedding)

  # Convert embeddings to a NumPy array
  entity_embedding = np.array(embeddings)
  return entity_embedding


data_embeddings = generate_embeddings(processed_texts, model, tokenizer)


# Store medicine entities
medicine_names = [record['Medicine Name'] for record in data]
medicine_usage = [record['Uses'] for record in data]
medicine_composition = [record['Composition'] for record in data]
side_effects = [record['Side_effects'] for record in data]


Executing with Cuda GPU: False


Generating embeddings:   0%|          | 0/11825 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Generating embeddings: 100%|██████████| 11825/11825 [24:41<00:00,  7.98it/s]


In [14]:
import numpy as np

# Save the embeddings to a .npy file
embeddings_file = "data_embeddings.npy"
np.save(embeddings_file, data_embeddings)

print(f"Embeddings saved to {embeddings_file}")



Embeddings saved to data_embeddings.npy


save embeddings

In [None]:
# for key, value in data_embeddings.items():
#     np.save(f"{key}.npy", value)

In [None]:
# from google.colab import files
# files.download("uses_embeddings.npy")
# files.download("composition_embeddings.npy")
# files.download("medication_embeddings.npy")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [46]:
# Store medicine names and side effects
#print(data[:5])


medicine_name = [record['Medicine Name'] for record in data]

compositions = [record['Composition'] for record in data]

uses= [record['Uses'] for record in data]

ratings = [record['Average Review %'] for record in data]


side_effects = [record['Side_effects'] for record in data]

#embeddings = np.load("medication_embeddings.npy")
# data_embeddings = {}
# for key in ['uses_embeddings', 'composition_embeddings', 'medication_embeddings']:
#     data_embeddings[key] = np.load(f"{key}.npy")

In [70]:
from sklearn.metrics.pairwise import cosine_similarity

# Example query
query = "bloating"

query = clean_text(query)

# Preprocess the query using NER
preprocessed_query = preprocess_user_input_with_ner(query)

# Print extracted information
print("NER Results:", preprocessed_query["ner_results"])
print("Relevant Entities:", preprocessed_query["grouped_entities"])
print(f"Combined Text: {preprocessed_query['combined_text']}")
group_embedding = preprocessed_query["combined_embedding"]

#entity_embedding = data_embeddings['uses_embeddings']

reshaped_embedding = np.array(group_embedding).reshape(1, -1)

# Calculate cosine similarity
similarities = cosine_similarity(reshaped_embedding, data_embeddings)

top_k = 5
ranked_indices = np.argsort(similarities[0])[::-1]  

# Print results
print("\nTop matching medications with side effects:")
unique_compositions = set()
final_results = []  

for idx in ranked_indices:
    medicine_name = medicine_names[idx]
    composition = compositions[idx]
    use = uses[idx]
    side_effect = side_effects[idx]
    average_review = ratings[idx]

    # Check for uniqueness based on composition
    if clean_text(composition) not in unique_compositions:
        unique_compositions.add(clean_text(composition))
        final_results.append({
            "Medicine": medicine_name,
            "Composition": composition,
            "Use": use,
            "Side Effects": side_effect,
            "Similarity": similarities[0][idx],
            "Rating": average_review
        })
    if len(final_results) == top_k:
        break

# Print the results
for result in final_results:
    print(f"Medicine: {result['Medicine']},\nComposition: {result['Composition']}, Use: {result['Use']}, \nSide Effects: {result['Side Effects']}, Similarity: {result['Similarity']:.4f}, Rating: {result['Rating']} %\n")

print("\n" + "=" * 50 + "\n")



Combined Text: bloating
NER Results: [{'entity_group': 'DETAILED_DESCRIPTION', 'score': 0.06905991, 'word': 'bloating', 'start': 0, 'end': 8}]
Relevant Entities: {'DETAILED_DESCRIPTION': ['bloating']}
Combined Text: bloating

Top matching medications with side effects:
Medicine: Tinnitod 20mg Capsule,
Composition: Caroverine (20mg), Use:  Tinnitus (ringing in the ears)Treatment of Abdominal pain, 
Side Effects: nausea itching skin rash, Similarity: 0.8988, Rating: 17 %

Medicine: EME OD Tablet MD,
Composition: Palonosetron (0.5mg), Use:  Nausea Vomiting, 
Side Effects: constipation headache diarrhea dizziness, Similarity: 0.8933, Rating: 28 %

Medicine: Dimol 40 Tablet,
Composition: Dimethicone (40mg), Use: Treatment of BloatingTreatment of Abdominal pain, 
Side Effects: coating on tongue diarrhea dehydration, Similarity: 0.8897, Rating: 29 %

Medicine: Akynzeo 300mg/0.5mg Capsule,
Composition: Netupitant (300mg) + Palonosetron (0.5mg), Use:  Nausea and vomiting due to chemotherapy, 
S

save embeddings

In [None]:
np.save("medication_embeddings.npy", embeddings)

## 2. Similarity Ranking

## 3. Evaluation