In [2]:
import torch
from transformers import AutoTokenizer, AutoModel
import random
import numpy as np
from sklearn.manifold import TSNE
import umap
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import plotly.express as px
import pandas as pd

# Seed every random number generator used in this notebook from a single
# constant so that the shuffle / sampling steps are reproducible on a fresh
# kernel run.
SEED = 0
np.random.seed(SEED)
random.seed(SEED)
# Trailing semicolon: torch.manual_seed returns the Generator object, which
# would otherwise be echoed as this cell's output.
torch.manual_seed(SEED);

<torch._C.Generator at 0x10d37d0f0>

In [3]:
# Never truncate long cell contents when rendering DataFrames.
pd.options.display.max_colwidth = None

In [4]:
"""
References:
- https://arxiv.org/pdf/1908.10084
- https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens
- https://huggingface.co/microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext
- https://huggingface.co/NeuML/pubmedbert-base-embeddings
"""

'\nReferences:\n- https://arxiv.org/pdf/1908.10084\n- https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens\n- https://huggingface.co/microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext\n- https://huggingface.co/NeuML/pubmedbert-base-embeddings\n'

In [None]:
"""
Set options here
"""
# Encoder checkpoint used to embed the texts; uncomment exactly one line.
# encoding_model_str = "google-bert/bert-base-uncased"
# encoding_model_str = "sentence-transformers/bert-base-nli-mean-tokens"
# encoding_model_str = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
encoding_model_str = "NeuML/pubmedbert-base-embeddings"

# Input text files come from the Canadian adverse drug reaction database, see:
# https://www.canada.ca/en/health-canada/services/drugs-health-products/medeffect-canada/adverse-reaction-database/canada-vigilance-online-database-data-extract.html
base_dir = "./"
# Maps each dataset name to the CSV file holding it under `base_dir`.
text_path_dict = {
    dataset: f"{base_dir}{dataset}.csv"
    for dataset in ("AlternateDrugs", "diseases", "drugnames")
}

# Dataset to embed, and the maximum number of rows kept after shuffling.
text_path = text_path_dict["diseases"]
limit_num_rows_text = 500

In [6]:
# Read the selected file line by line. The encoding is given explicitly so the
# result does not depend on the platform's default locale encoding.
with open(text_path, "r", encoding="utf-8") as f:
    raw_lines = f.readlines()

# Skip the header row, normalise whitespace/case, and drop blank lines so that
# empty strings are never embedded as if they were real terms.
texts = [line.strip().lower() for line in raw_lines[1:]]
texts = [text for text in texts if text]

# Random subsample: shuffle (seeded in the first cell), then keep the first
# `limit_num_rows_text` entries.
random.shuffle(texts)
texts = texts[:limit_num_rows_text]

In [7]:
# Load the tokenizer and the encoder from the same checkpoint so that the
# vocabulary matches the model weights.
tokenizer, model = (
    loader.from_pretrained(encoding_model_str)
    for loader in (AutoTokenizer, AutoModel)
)

In [8]:
# `truncation=True` only has an effect when a maximum length is known. This
# tokenizer does not define one (it warns and silently disables truncation),
# so take the limit from the encoder's position-embedding size. If the config
# does not expose it, `None` keeps the previous (untruncated) behaviour.
max_tokens = getattr(model.config, "max_position_embeddings", None)
encoded_inputs = tokenizer(
    texts,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=max_tokens,
)

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(**encoded_inputs)[0]  # get the last hidden state vector

# Add a trailing axis so the mask broadcasts against the hidden states, and
# cast it to the hidden-state dtype so later sums / clamps are done in floating
# point rather than on an integer tensor.
attention_mask = encoded_inputs["attention_mask"][..., None].to(outputs.dtype)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
# Masked mean pooling: sum the hidden states of the non-padding tokens and
# divide by how many such tokens each text has (clamped to avoid dividing by 0).
masked_token_sum = (outputs * attention_mask).sum(dim=1)
real_token_count = attention_mask.sum(dim=1).clamp(min=1e-9)
sentence_embedding = masked_token_sum / real_token_count

In [10]:
# Inspect the pooled embeddings (one row per input text).
sentence_embedding

tensor([[-0.2993, -0.1205, -0.4987,  ...,  0.8281, -0.1082, -0.9280],
        [-0.2576, -0.1558,  0.1248,  ...,  0.7627,  1.3915, -0.0368],
        [-0.8967,  0.3811, -0.2032,  ..., -0.0026, -0.3606, -0.5183],
        ...,
        [ 0.2720, -0.0714, -0.1720,  ...,  0.5993, -1.2595, -0.2090],
        [-0.4956,  0.0176, -0.7110,  ...,  1.2511, -1.0651, -0.3460],
        [ 0.1386,  0.0774, -0.2331,  ...,  1.7877, -0.8214,  0.0385]])

In [11]:
# Sanity check: (number of texts, embedding dimension).
sentence_embedding.shape

torch.Size([500, 768])

In [12]:
# Scale every embedding to unit L2 length so that a plain dot product between
# two rows equals their cosine similarity. The norm is clamped so an all-zero
# row cannot cause a division by zero.
squared_length = sentence_embedding.pow(2).sum(dim=1)
norms = squared_length.sqrt().clamp(min=1e-9).unsqueeze(-1)
sentence_embedding_normalized = sentence_embedding / norms

In [13]:
# Visualization, step 1: reduce the embeddings with PCA before projecting
# them to 2-D. `pca_components` is the number of principal components kept.
pca_components = 50

In [14]:
embedding_matrix = sentence_embedding_normalized.numpy()
# PCA cannot extract more components than min(n_samples, n_features); cap the
# requested number so that a small `limit_num_rows_text` does not crash here.
n_pca_components = min(pca_components, *embedding_matrix.shape)
pca = PCA(n_components=n_pca_components, svd_solver="full")
X_pca = pca.fit_transform(embedding_matrix)

In [15]:
# Step 2: project the PCA output with UMAP. t-SNE is the alternative this was
# swapped in for: TSNE(n_components=2).
# NOTE: the `tsne` / `X_tsne` names are historical; the reducer is UMAP.
umap_settings = {"random_state": 42}
tsne = umap.UMAP(**umap_settings)
X_tsne = tsne.fit_transform(X_pca)

  warn(
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
# Tabulate the 2-D coordinates next to the text each point represents.
X_tsne_pd = pd.DataFrame(X_tsne, columns=["tsne_x", "tsne_y"]).assign(text=texts)

In [17]:
# Preview the first rows: 2-D coordinates plus the source text.
X_tsne_pd.head()

Unnamed: 0,tsne_x,tsne_y,text
0,7.862282,4.145305,road traffic accident
1,6.996767,5.301177,chylothorax
2,6.294518,3.694638,foreign body in gastrointestinal tract
3,8.049612,6.591092,prophylactic chemotherapy
4,5.2344,5.507451,vaccination site infection


In [18]:
# K-means clustering on the 2-D projection, used to colour and browse groups
# of neighbouring points.
n_clusters = 50

kmeans = KMeans(
    # K-means needs at least as many samples as clusters; cap the request so a
    # small `limit_num_rows_text` does not raise.
    n_clusters=min(n_clusters, len(X_tsne)),
    # Set explicitly: relying on the default triggers a warning because the
    # default value is changing between scikit-learn releases.
    n_init=10,
    random_state=42,
)
kmeans_classes = kmeans.fit_predict(X_tsne)
X_tsne_pd["KMeans_classes"] = kmeans_classes
kmeans_classes.shape

  super()._check_params_vs_input(X, default_n_init=10)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


(500,)

In [19]:
# Interactive scatter of the 2-D projection: colour encodes the K-means class,
# hovering over a point reveals its text.
scatter_options = dict(
    x="tsne_x",
    y="tsne_y",
    color="KMeans_classes",
    hover_data=["text"],
)
fig = px.scatter(X_tsne_pd, **scatter_options)
fig.show()

In [20]:
# List (up to 20) members of a single K-means class; change `class_num` to
# browse another cluster.
class_num = 9
in_selected_class = X_tsne_pd["KMeans_classes"].eq(class_num)
X_tsne_pd.loc[in_selected_class].head(20)

Unnamed: 0,tsne_x,tsne_y,text,KMeans_classes
27,6.95951,0.640372,plasmablastic lymphoma,9
89,7.148324,0.763391,lymphomatoid papulosis,9
100,6.836546,0.498588,b-cell unclassifiable lymphoma low grade,9
236,6.760018,0.470636,primary mediastinal large b-cell lymphoma stage ii,9
315,6.933728,0.651776,b precursor type acute leukaemia,9
433,6.808297,0.457208,nodular lymphocyte predominant hodgkin lymphoma,9
457,6.847237,0.577801,primary effusion lymphoma,9
480,6.659011,0.411188,non-hodgkin's lymphoma stage i,9


In [32]:
# Use Cosine Similarity for RAG -> This is the Retrieval part of Retrieval-augmented Generation
# Cosine similarity is just a dot-product after normalizing vectors to be unit norm
# For each embedding, we can compute the top K related sentences based on cosine similarity

topK = 10

print(sentence_embedding_normalized.shape)  # already normalized before

num_texts = sentence_embedding_normalized.shape[0]

# One matrix product gives every pairwise cosine similarity at once
# (entry [i, j] compares text i with text j). This is an N x N matrix, which
# is fine for the few hundred rows used here.
cosine_matrix = sentence_embedding_normalized @ sentence_embedding_normalized.T

# Exclude each text from its own neighbour list *by position*. Dropping
# "whatever ranks first" is not reliable: an exact duplicate text also scores
# 1.0 and may be ranked ahead of the query itself.
cosine_matrix.fill_diagonal_(float("-inf"))

# There are at most N - 1 other texts to return.
num_neighbours = max(min(topK, num_texts - 1), 0)
# torch.topk returns the largest values per row in descending order.
topK_cosines, topK_indices = torch.topk(cosine_matrix, k=num_neighbours, dim=1)

all_texts = X_tsne_pd["text"].to_numpy()
topK_similar_texts = [
    ", ".join(all_texts[neighbour_ids].tolist())
    for neighbour_ids in topK_indices.numpy()
]
# Fixed three-decimal formatting; slicing the string form of a float breaks on
# negative values and on scientific notation.
topK_similar_texts_cosines = [
    ", ".join(f"{cosine:.3f}" for cosine in neighbour_cosines)
    for neighbour_cosines in topK_cosines.tolist()
]

X_tsne_pd["topK_similar_texts"] = topK_similar_texts
X_tsne_pd["topK_similar_cosines"] = topK_similar_texts_cosines

torch.Size([500, 768])


In [33]:
# Preview the first rows, now including the top-K neighbour texts and cosines.
X_tsne_pd.head()

Unnamed: 0,tsne_x,tsne_y,text,KMeans_classes,topK_similar_texts,topK_similar_cosines
0,7.862282,4.145305,road traffic accident,41,"cerebrovascular accident, accident at work, neck injury, mouth injury, animal bite, intentional self-injury, aneurysm ruptured, carbon monoxide poisoning, vulvovaginal injury, aortic aneurysm rupture","0.663, 0.578, 0.504, 0.397, 0.392, 0.336, 0.331, 0.313, 0.303, 0.301"
1,6.996767,5.301177,chylothorax,40,"aspiration pleural cavity, bronchostenosis, infectious pleural effusion, sympathectomy, pericardial excision, phlebotomy, splenectomy, ulcer haemorrhage, bursitis infective, abscess drainage","0.469, 0.439, 0.426, 0.407, 0.394, 0.388, 0.371, 0.370, 0.370, 0.370"
2,6.294518,3.694638,foreign body in gastrointestinal tract,12,"sensation of foreign body, gastrointestinal tube insertion, gastrointestinal erosion, metastases to biliary tract, intestinal transit time abnormal, frequent bowel movements, genitourinary tract neoplasm, reproductive tract disorder, gastric adenoma, perforation","0.629, 0.562, 0.503, 0.417, 0.416, 0.413, 0.360, 0.352, 0.332, 0.314"
3,8.049612,6.591092,prophylactic chemotherapy,47,"antibiotic prophylaxis, prophylaxis against solar radiation, radioimmunotherapy, psychiatric disorder prophylaxis, appendicectomy, antidiarrhoeal supportive care, colectomy, colorectal cancer metastatic, blood folate decreased, drug eruption","0.537, 0.466, 0.380, 0.366, 0.284, 0.264, 0.264, 0.262, 0.255, 0.254"
4,5.2344,5.507451,vaccination site infection,5,"systemic infection, injection site plaque, nipple infection, streptococcal infection, lymph gland infection, genital infection bacterial, rhinovirus infection, herpes zoster immunisation, influenza immunisation, retroviral infection","0.570, 0.501, 0.450, 0.432, 0.432, 0.406, 0.388, 0.387, 0.385, 0.354"
