# MedCompare

## Requirements

Attention: before running, switch to gpu execution

In [1]:
!pip install datasets
!pip install transformers
!pip install huggingface_hub
!pip install tqdm


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
from huggingface_hub import notebook_login

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## 1. Data Processing


In [4]:
import re
import json
import requests
import numpy as np
from transformers import AutoTokenizer, pipeline, AutoModel, pipeline
import torch
from datasets import load_dataset
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer


Load Data

In [5]:
dataset = load_dataset("MattBastar/Medicine_Details")
data = dataset['train']
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
vectorizer = TfidfVectorizer(max_features=1000, stop_words=None)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Medicine_Details.csv:   0%|          | 0.00/4.36M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11825 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

We get the ontology mapping via open source platform BioPortal bioontology. For this u need a file (api-key.txt) that contains an api-key for BioPortal. For this create an account5 at https://bioportal.bioontology.org/ and get your own key.

In [6]:
# Function to clean text

# def clean_text(text):
#     text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
#     text = re.sub(r"\b\d+\b", "", text)  # Remove standalone numbers
#     text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
#     return text

def clean_text(text):
    # Step 1: Remove special characters except for alphanumeric and spaces
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Keep only letters and spaces

    # Step 2: Remove standalone numbers
    text = re.sub(r"\b\d+\b", "", text)

    # Step 3: Remove specific unwanted terms
    unwanted_terms = ["mg", "treatment", "Treatment", "MG", "mg", "ML", "ml", "of", "mgml"]  # Add other terms to this list as needed
    for term in unwanted_terms:
        text = re.sub(rf"\b{term}\b", "", text, flags=re.IGNORECASE)

    # Step 4: Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text



In [7]:
# #BASE_URL = "http://data.bioontology.org"
# #file = open("api-key.txt", "r")
# API_KEY = file.read().strip()
# #file.close()

# headers = {
#     "Authorization": f"apikey token={API_KEY}"
# }

# Function to look up ontology mappings from BioPortal API

# def get_bioportal_mapping(term):

#     params = {
#         "q": term,
#         "require_exact_match": "false" # false or true  deoending on exact matching
#     }
#     response = requests.get(f"{BASE_URL}/search", headers=headers, params=params)

#     if response.status_code != 200:
#         return {term: "unknown"}  # Default to "unknown" if the API call fails

#     data = response.json()

#     # Filter relevant mappings based on ontology prefixes else words without medical context get mapped as well
#     relevant_prefixes = [
#         "http://purl.bioontology.org/ontology",  # BioPortal's main prefix
#         "http://www.co-ode.org/ontologies/galen",  # GALEN ontology
#         "http://ncicb.nci.nih.gov"  # NCI Thesaurus
#     ]

#     for result in data.get("collection", []):
#         label = result.get("prefLabel")
#         ontology_id = result.get("@id")

#         if label and ontology_id and any(ontology_id.startswith(prefix) for prefix in relevant_prefixes):
#             return {label.lower(): ontology_id}

#     # Default
#     return {term: "unknown"}

### Processing User Input


In [8]:
pipe = pipeline("token-classification", model="Clinical-AI-Apollo/Medical-NER", aggregation_strategy="simple")

def preprocess_user_input_with_ner(user_input):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Step 1: Use the NER pipeline to extract medical entities
    ner_results = pipe(user_input)

    # Step 2: Group all entities by their types
    grouped_entities = {}
    for entity in ner_results:
        entity_group = entity["entity_group"]
        if entity_group not in grouped_entities:
            grouped_entities[entity_group] = []
        grouped_entities[entity_group].append(entity["word"])

    # Step 3: Collect all identified words
    all_words = []
    for group, words in grouped_entities.items():
        all_words.extend(words)

    # Step 4: Combine all words into a single string
    combined_text = " ".join(all_words)
    print(f"Combined Text: {combined_text}")

    # Step 5: Generate a single embedding for the combined text
    inputs = tokenizer(combined_text, padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        combined_embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()  # CLS token embedding

    # Step 6: Return results
    return {
        "cleaned_input": user_input,
        "ner_results": ner_results,
        "grouped_entities": grouped_entities,
        "all_words": all_words,
        "combined_text": combined_text,
        "combined_embedding": combined_embedding  # Single combined embedding
    }

def preprocess_user_input_with_ner_tfidf(user_input):
    # Step 1: Use the NER pipeline to extract medical entities
    ner_results = pipe(user_input)

    # Step 2: Group all entities by their types
    grouped_entities = {}
    for entity in ner_results:
        entity_group = entity["entity_group"]
        if entity_group not in grouped_entities:
            grouped_entities[entity_group] = []
        grouped_entities[entity_group].append(entity["word"])

    # Step 3: Collect all identified words
    all_words = []
    for group, words in grouped_entities.items():
        all_words.extend(words)

    # Step 4: Combine all words into a single string
    combined_text = " ".join(all_words)
    print(f"Combined Text: {combined_text}")

    tfidf_embeddings = vectorizer.transform([combined_text])

    print(tfidf_embeddings.shape)
    # Step 6: Return results
    return {
        "cleaned_input": user_input,
        "ner_results": ner_results,
        "grouped_entities": grouped_entities,
        "all_words": all_words,
        "combined_text": combined_text,
        "combined_embedding": tfidf_embeddings  # Feature names for interpretation
    }


config.json:   0%|          | 0.00/5.14k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/736M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

Device set to use cuda:0


###Preprocessing for User Input aswell as Dataset

In [9]:
def preprocess_with_ner(text_input):
    ner_results = pipe(text_input)
    all_words = [entity["word"] for entity in ner_results]
    combined_text = " ".join(all_words)
    return combined_text

In [10]:
def combine_text(record):
    composition = clean_text(record['Composition'])
    uses = clean_text(record['Uses'])
    return f"{composition}{uses}"


processed_texts = [preprocess_with_ner(combine_text(record)) for record in tqdm(data, desc="Processing records")]



Processing records:   0%|          | 0/11825 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Processing records:   0%|          | 9/11825 [00:02<31:50,  6.19it/s]  You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing records: 100%|██████████| 11825/11825 [04:56<00:00, 39.95it/s]


In [11]:
import csv

output_file = "processed_texts.csv"

# Save to a CSV file
with open(output_file, "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Processed Text"])  # Header row
    for text in processed_texts:
        writer.writerow([text])

print(f"Processed texts saved to {output_file}")
from google.colab import files

# Download the CSV file
files.download(output_file)


Processed texts saved to processed_texts.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Generate and store Dataset embeddings with Clinical-BERT

In [12]:
def generate_embeddings(processed_text, model, tokenizer):
  print("Executing with Cuda GPU: " + str(torch.cuda.is_available()))
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = model.to(device)
  embeddings = []
  for text in tqdm(processed_text, desc="Generating embeddings"):
      inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
      with torch.no_grad():
          outputs = model(**inputs)
      embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
      embeddings.append(embedding)

  # Convert embeddings to a NumPy array
  entity_embedding = np.array(embeddings)
  return entity_embedding


data_embeddings_bert = generate_embeddings(processed_texts, model, tokenizer)


Executing with Cuda GPU: True


Generating embeddings:   0%|          | 0/11825 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Generating embeddings: 100%|██████████| 11825/11825 [01:40<00:00, 117.96it/s]


In [13]:
embeddings_file = "data_embeddings_bert.npy"
np.save(embeddings_file, data_embeddings_bert)

print(f"BERT Embeddings saved to {embeddings_file}")


BERT Embeddings saved to data_embeddings_bert.npy


### Generate and store Dataset embeddings with TFIDF

In [14]:
def generate_tfidf_embeddings(processed_texts):
    processed_texts = [text for text in processed_texts if text.strip()]

    tfidf_embeddings = vectorizer.fit_transform(processed_texts)

    return tfidf_embeddings


data_embeddings_tfidf = generate_tfidf_embeddings(processed_texts)


In [15]:
embeddings_file = "data_embeddings_tfidf.npy"
np.save(embeddings_file, data_embeddings_tfidf)

print(f"TFIDF Embeddings saved to {embeddings_file}")


TFIDF Embeddings saved to data_embeddings_tfidf.npy


## 2. Similarity Ranking

Prepare Data Embeddings and Query Embeddings for Ranking

TODO: maybe add normalization before embedding

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

function for user query preprocessing

In [17]:
def prepare_query_for_similarity_ranking(query, embedding_type = 'bert'):

  query = clean_text(query)

  if(embedding_type == 'tfidf'):
    preprocessed_query = preprocess_user_input_with_ner_tfidf(query)
  else:
    preprocessed_query = preprocess_user_input_with_ner(query)

  print("NER Results:", preprocessed_query["ner_results"])
  print(f"NER Query for Embedding: {preprocessed_query['combined_text']}")

  query_embedding = preprocessed_query["combined_embedding"]

  # print(group_embedding.shape)
  # print(embedding_type)
  if(embedding_type == 'bert'):
    query_embedding = np.array(query_embedding).reshape(1, -1)

  return query_embedding


function for ranking best Matches and Eliminate Duplicates, because necessary only Substances and Dataset provides multiple solutions with different dosages


In [18]:
def similarity_ranking(similarities, data, top_k = 5):

  medicine_names = [record['Medicine Name'] for record in data]
  compositions = [record['Composition'] for record in data]
  uses= [record['Uses'] for record in data]
  side_effects = [record['Side_effects'] for record in data]
  ratings = [record['Average Review %'] for record in data]


  ranked_indices = np.argsort(similarities[0])[::-1]  # Sort indices by similarity in descending order

  unique_compositions = set()
  final_results = []  # To store the final unique top-k results

  for idx in ranked_indices:
      medicine_name = medicine_names[idx]
      composition = compositions[idx]
      use = uses[idx]
      side_effect = side_effects[idx]
      average_review = ratings[idx]

      # Check for uniqueness based on composition
      if clean_text(composition) not in unique_compositions:
          unique_compositions.add(clean_text(composition)) # Mark composition as seen
          final_results.append({
              "Medicine": medicine_name,
              "Composition": composition,
              "Use": use,
              "Side Effects": side_effect,
              "Similarity": similarities[0][idx]
          })

      if len(final_results) == top_k:
          break
  return final_results

function for printing top results

In [19]:
def print_results(similarity_results):
  print("\nTop matching medications with side effects:")
  for result in similarity_results:
      print(f"Medicine: {result['Medicine']},\nComposition: {result['Composition']}, Use: {result['Use']}, \nSide Effects: {result['Side Effects']},\n Similarity: {result['Similarity']:.4f}\n")

  print("\n" + "=" * 50 + "\n")

###Calculating Cosine Similarities with Sklearn
and combine all together

In [20]:
def similarity(query, data_embedding, ground_data, embedding_type = 'bert'):
    query_embedding = prepare_query_for_similarity_ranking(query, embedding_type)
    #cosine similarity sklearn
    similarities = cosine_similarity(query_embedding, data_embedding)
    similarity_results = similarity_ranking(similarities, ground_data)
    print_results(similarity_results)


###Testing of Similarity Ranking

In [26]:
query = "antibiotics"
similarity(query, data_embeddings_tfidf, data, embedding_type = 'tfidf')
similarity(query, data_embeddings_bert, data)

Combined Text: antibiotics
(1, 1000)
NER Results: [{'entity_group': 'DETAILED_DESCRIPTION', 'score': 0.04926286, 'word': 'antibiotics', 'start': 0, 'end': 11}]
NER Query for Embedding: antibiotics

Top matching medications with side effects:
Medicine: Avastin 400mg Injection,
Composition: Bevacizumab (400mg), Use:  Cancer of colon and rectum Non-small cell lung cancer Kidney cancer Brain tumor Ovarian cancer Cervical cancer, 
Side Effects: Rectal bleeding Taste change Headache Nosebleeds Back pain Dry skin High blood pressure Protein in urine Inflammation of the nose,
 Similarity: 0.0000

Medicine: Zedruff Shampoo,
Composition: Ketoconazole (2% w/v) + Zinc pyrithione (1% w/v), Use: Treatment of Dandruff, 
Side Effects: Application site reactions burning irritation itching and redness,
 Similarity: 0.0000

Medicine: Zestasil 100 Tablet,
Composition: Sildenafil (100mg), Use: Treatment of Erectile dysfunction, 
Side Effects: Flushing sense of warmth in the face ears neck and trunk Headach

## 3. Evaluation of Clinical-BERT & TFIDF results


Get Query Dictonary with evaluation:
rating_precision -> bool relevant or not
rating_relevance -> output top 5 (1,2,3,4,5) gets a relevant rating like following:
5: Highly relevant (most relevant item to the query).
4: Very relevant (still strongly related to the query).
3: Moderately relevant (somewhat useful for the query).
2: Slightly relevant (not very useful but loosely related).
1: Not relevant (unlikely to satisfy the query).

In [22]:
#model = 'tfidf'
query = query
rating_precision_tfidf = (0, 0, 0, 1, 1)
rating_precision_bert = (1, 1, 1, 0, 1)
relevance_rating_tfidf = (0, 0, 0, 3, 3)
relevance_rating_bert = (5, 5, 5, 0, 5)
model_rating_tfidf = {'query': query, 'rating_prec': rating_precision_tfidf, 'sum_relevant_items': 14, 'relevance_rating': relevance_rating_tfidf}
model_rating_bert = {'query': query, 'rating_prec': rating_precision_bert, 'sum_relevant_items': 14, 'relevance_rating': relevance_rating_bert}

###Precision@K and Recall@K

$
\text{Precision@K} = \frac{\text{Number of relevant items in top-K}}{K}
$


In [23]:
def precision(model_rating):
    #model_rating['query']
    relevant_in_top_k = sum(model_rating['rating_prec'])
    return relevant_in_top_k / 5

value_tfidf = precision(model_rating_tfidf)
value_bert = precision(model_rating_bert)
print("precision values:")
print(f'TFIDF: {value_tfidf}')
print(f'BERT: {value_bert}')

precision values:
TFIDF: 0.4
BERT: 0.8


$\text{Recall@K} = \frac{\text{Number of relevant items in top-K}}{\text{Total number of relevant items}}$

In [24]:
def recall(model_rating):
    relevant_in_top_k = sum(model_rating['rating_prec'])
    return relevant_in_top_k / model_rating['sum_relevant_items']

value_tfidf = recall(model_rating_tfidf)
value_bert = recall(model_rating_bert)
print("recall values:")
print(f'TFIDF: {value_tfidf}')
print(f'BERT: {value_bert}')

recall values:
TFIDF: 0.14285714285714285
BERT: 0.2857142857142857


### NDCG Normalized discounted cumulative gain

$\text{IDCG@K} = \sum_{i=1}^{K} \frac{\text{ideal relevance}_i}{\log_2(i + 1)}$

$\text{NDCG@K} = \frac{\text{DCG@K}}{\text{IDCG@K}}$

In [25]:
def ndcg(model_rating):
    dcg = 0.0
    idcg = 0.0
    relevance_scores = model_rating['relevance_rating']
    for i, score in enumerate(relevance_scores):
        dcg += score / np.log2(i + 2)

    ideal_scores = sorted(relevance_scores, reverse=True)
    for i, score in enumerate(ideal_scores):
        idcg += score / np.log2(i + 2)
    return dcg / idcg if idcg > 0 else 0.0

value_tfidf = ndcg(model_rating_tfidf)
value_bert = ndcg(model_rating_bert)
print("recall values:")
print(f'TFIDF: {value_tfidf}')
print(f'BERT: {value_bert}')

recall values:
TFIDF: 0.5012658353418872
BERT: 0.9828920819566878


###Compare Models
Plotting etc for certain querys