In [17]:
import torch
import polars as pl
import utils.model as model_utils
from tqdm.auto import tqdm
import pickle
import torch.nn.functional as F
import numpy as np 
# Root folders: inputs are read from FILE_PATH; the model checkpoint lives
# under FILE_PATH_SAVE. Both are plain strings ending in "/" because later
# cells build file names by string concatenation.
FILE_PATH = "D:/UMLS/"
FILE_PATH_SAVE = "D:/finetune_sbert_new/1Membeddings/lora_16_quantized/"

# Full path of the model checkpoint loaded further down.
model_name = f"{FILE_PATH_SAVE}quantized_lora_16_1000_5.pt"

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load mappings and embeddings

In [7]:
# Descriptive tables for the two vocabularies. Paths are built from FILE_PATH
# (same location as before) so every input in this cell shares one root.
snomed_info = pl.read_csv(FILE_PATH + "snomed_info.csv")
icd_info = pl.read_csv(FILE_PATH + "icd_info.csv")

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at a file you created or otherwise trust.
with open(FILE_PATH + "icd_snomed_mappings.pkl", "rb") as f:
    loaded_mappings = pickle.load(f)

# Unpack the individual lookup tables stored in the pickle.
id2snomed = loaded_mappings["id2snomed"]
id2icd = loaded_mappings["id2icd"]
snomed2id = loaded_mappings["snomed2id"]
icd2id = loaded_mappings["icd2id"]
icd2snomed = loaded_mappings["icd2snomed"]
snomed2icd = loaded_mappings["snomed2icd"]

# map_location=device keeps the load working on a CPU-only machine even if the
# file contains tensors that were saved from a GPU.
embeddings = torch.load(
    FILE_PATH + "sapbert_lora_triplet_rank16_merged.pt", map_location=device
)

# as_tensor accepts arrays, lists and tensors alike and avoids the
# copy-construct warning torch.tensor() emits when given a tensor;
# detach() keeps the result free of autograd history, as torch.tensor() did.
icd_embeddings = torch.as_tensor(embeddings["icd_embeddings"]).detach().to(device)
snomed_embeddings = torch.as_tensor(embeddings["snomed_embeddings"]).detach().to(device)

## 1. ICD to SNOMED

In [30]:
def get_similarity_score_icd2snomed(icd_embeddings, snomed_embeddings, icd2snomed):
    """
    Score each mapped ICD code against the SNOMED concept(s) it maps to.

    For every key of ``icd2snomed`` the ICD embedding row is compared, by
    cosine similarity, with the sum of the mapped SNOMED embedding rows.
    Cosine similarity ignores positive scaling, so comparing against the sum
    gives the same score as comparing against the mean of those rows.

    Args:
        icd_embeddings: tensor whose rows are selected with ``int(icd_key)``;
            expected to be 2-D (one embedding vector per row).
        snomed_embeddings: tensor whose rows are selected with the mapped
            SNOMED ids; expected to be 2-D with the same embedding width.
        icd2snomed: dict mapping an ICD row id (anything ``int()`` accepts)
            to either a single SNOMED row id or a sequence/array of them.

    Returns:
        np.ndarray with one cosine similarity per key of ``icd2snomed``, in
        the dict's iteration order. An empty mapping yields an empty array.
    """
    cosine_sim_all = []
    for icd, snomed_ids in icd2snomed.items():
        icd_embedding = icd_embeddings[int(icd)]

        # Normalise the target ids to a 1-D index tensor. Without this a bare
        # int would select a single row that `.sum(dim=0)` then collapses to a
        # scalar, and a tuple would be read as a multi-dimensional index.
        snomed_index = torch.as_tensor(
            snomed_ids, dtype=torch.long, device=snomed_embeddings.device
        ).reshape(-1)
        pooled_snomed = snomed_embeddings[snomed_index].sum(dim=0)

        cosine_sim = F.cosine_similarity(icd_embedding, pooled_snomed, dim=0)
        cosine_sim_all.append(cosine_sim.item())
    return np.array(cosine_sim_all)

In [31]:
# Per-ICD cosine similarity on the embeddings loaded above.
cosine_sim_all = get_similarity_score_icd2snomed(
    icd_embeddings=icd_embeddings,
    snomed_embeddings=snomed_embeddings,
    icd2snomed=icd2snomed,
)

In [32]:
# Summary of the scores: (mean, std, min, max).
np.mean(cosine_sim_all), np.std(cosine_sim_all), np.min(cosine_sim_all), np.max(cosine_sim_all)


(0.7801330139307064,
 0.1735793493375094,
 -0.0774565041065216,
 0.9965282678604126)

## 2. Quantize and evaluate

In [None]:
# Path of the checkpoint to load (restated here so this cell can be re-run
# on its own after the config cell).
model_name = FILE_PATH_SAVE + "quantized_lora_16_1000_5.pt"

# load model
model = model_utils.initialize_model()
# map_location=device lets a checkpoint saved from a GPU load on a CPU-only
# machine, matching the cuda/cpu fallback used when `device` was chosen.
model.load_state_dict(torch.load(model_name, map_location=device)['model_state_dict'])

# Presumably an nn.Module (it exposes load_state_dict/eval/to): switch it to
# evaluation mode and move it to the selected device.
model.eval()
model.to(device)

In [35]:
# Run both embedding sets through the loaded model. `model_utils.evaluate`
# returns a pair; only the first element is kept and the second is discarded.
# NOTE(review): presumably the first element is the quantized/reconstructed
# embeddings and the call prints the "Average cosine similarity" line shown
# below — confirm against utils/model.py.
icd_embeddings_q,_ = model_utils.evaluate(model, icd_embeddings)
snomed_embeddings_q,_ = model_utils.evaluate(model, snomed_embeddings)

Average cosine similarity: 0.7804
Average cosine similarity: 0.7864


In [37]:
# Same ICD -> SNOMED scoring as in section 1, on the model-processed embeddings.
cosine_sim_all_q = get_similarity_score_icd2snomed(
    icd_embeddings=icd_embeddings_q,
    snomed_embeddings=snomed_embeddings_q,
    icd2snomed=icd2snomed,
)
# Summary of the scores: (mean, std, min, max).
np.mean(cosine_sim_all_q), np.std(cosine_sim_all_q), np.min(cosine_sim_all_q), np.max(cosine_sim_all_q)


(0.7967629565292343,
 0.16910495326887415,
 -0.096503846347332,
 1.000000238418579)