In [10]:
import pandas as pd
import numpy as np
import torch
import transformers
from transformers import AutoTokenizer, AutoModel


LLM

In [11]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id


gpu = 7
device = f"cuda:{gpu}" if torch.cuda.is_available() else "cpu"

model_nf4 = transformers.AutoModelForCausalLM.from_pretrained(model_id, 
                                                 device_map={"": device})

pipeline = transformers.pipeline(
    "text-generation",
    model= model_nf4, #model_id,
    tokenizer=tokenizer)

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.36s/it]


In [12]:
def questions_answers(text):
    messages = [
    #    {"role": "system", "content": "You are an expert in creating key questions from a medical text and extract the answers from the text. Extract 3-10 Q/A pairs without repititions of key entities in the Q/As. Avoid general questions like 'What is the exclusion criteria?'. Make sure an answer is NO MORE than 5 tokens/words. Output as json format like this: {'Question': 'question1', 'Answer': 'answer1', 'Question': 'question2' , 'Answer': 'answer2', ...} \n Input: "},
        {"role": "system", "content": "You are an expert in creating key questions from a medical text and extract the answers from the text. Extract 3-10 Q/A pairs without repititions of key entities in the Q/As. Avoid general questions like 'What is the exclusion criteria?'. Make sure an answer is NO MORE than 5 tokens/words. Output ONLY json formated Q/A pairs like this: {'Question': 'question1', 'Answer': 'answer1'} \n {'Question': 'question2' , 'Answer': 'answer2'} \n ... \n Input: "},
        {"role": "user", "content": text}]
    
    prompt = pipeline.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
    )

    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = pipeline(
        prompt,
        max_new_tokens=1024,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.1,
        top_p=0.9,
    )
    #print(outputs[0]["generated_text"][len(prompt):])
    return outputs[0]["generated_text"][len(prompt):]

In [13]:
import ast

def parse_q_a_criteria(q_a_criteria):
    """
    Converts a string of question-answer pairs into a list of formatted strings.

    Parameters:
    q_a_criteria (str): Input string containing question-answer pairs.

    Returns:
    list: List of strings combining questions and answers.
    """
    result = []
    # Split by newline to handle individual JSON-like entries
    for entry in q_a_criteria.split('\n'):
        try:
            # Safely evaluate the string to a dictionary
            qa_dict = ast.literal_eval(entry)
            if 'Question' in qa_dict and 'Answer' in qa_dict:
                # Format the question-answer pair
                result.append(f"{qa_dict['Question']} {qa_dict['Answer']}")
        except (ValueError, SyntaxError):
            continue
    return result

Trials

In [14]:
#read /home/trishad2/trial_searching/PoC/for_submission/data/demo_train_data.csv

df = pd.read_csv("/home/trishad2/trial_searching/PoC/for_submission/data/demo_data.csv")

df.head()	

Unnamed: 0,nct_id,description,title,intervention_name,disease,keywords,outcome_measures,criteria,overall_status
0,NCT04167371,Background. Rumination syndrome is characteriz...,Treament of Rumination,Biofeedback,Rumination Disorders,Not Available,Number of rumination events after a challenge ...,Inclusion Criteria:~* Rumination syndrome~Excl...,COMPLETED
1,NCT01645722,The objective of this pilot study is to assess...,Enriched Autologous Fat Grafting for Treating ...,Procedure/Surgery: Enriched Fat grafting,Amputation Stumps,Fat Grafts,"Treatment of Painful Amputation Sites, 1) Trea...",Inclusion Criteria:~1. Aged 18 years or older ...,COMPLETED
2,NCT04519957,The primary objective of this study is to asse...,Multicentre Study To Assess Safety And Efficac...,Not Available,Treatment Resistant Depression,Not Available,"Long-term efficacy of psilocybin, Use of new a...",Inclusion Criteria:~Signed ICF Each participan...,COMPLETED
3,NCT04768985,"This study is a multicenter, Phase I, open-lab...","A Phase I, Open-Label, Randomized, 2-Treatment...",Treatment A: Acalabrutinib tablet,Bioequivalence,Pharmacokinetics,Area under plasma concentration time curve fro...,Inclusion Criteria:~* Females must have a nega...,COMPLETED
4,NCT03593941,This research project will address a desperate...,Ageing Gut Brain Interactions,Standard Diet,Dementia Alzheimers,Ageing,Faecal sample Short chain fatty acid (SCFA) pr...,Inclusion Criteria:~* Resident in a care home~...,COMPLETED


Q/A generation using LLM

In [15]:
from tqdm import tqdm
tqdm.pandas()

#apply the function to each row in the 'criteria' column
df['q_a_criteria'] = df['criteria'].progress_apply(lambda x: parse_q_a_criteria(questions_answers(x)))

Predefined Q/A

In [18]:
"""q_a_set.append('What are the drugs used? '+ intervention)
q_a_set.append('What is the disease treated in this trial? '+ disease)
q_a_set.append('What is the title of the trial? '+ title)
q_a_set.append('What are the outcome measures? '+ outcome_measures)
q_a_set.append('What are the keywords? '+ keywords)"""

#predefined questions for title, intervention, disease, outcome measures, keywords

predefined_questions = [
    'What are the drugs used? ',
    'What is the disease treated in this trial? ',
    'What is the title of the trial? ',
    'What are the outcome measures? ',
    'What are the keywords? '
]

#create q_a_intervention, q_a_disease, q_a_title, q_a_outcome_measures, q_a_keywords columns
df['q_a_intervention'] = predefined_questions[0] + df['intervention_name']
df['q_a_disease'] = predefined_questions[1] + df['disease']
df['q_a_title'] = predefined_questions[2] + df['title']
df['q_a_outcome_measures'] = predefined_questions[3] + df['outcome_measures']
df['q_a_keywords'] = predefined_questions[4] + df['keywords']

In [19]:
#create a new column 'all_q_a' that combines all q_a columns
df['all_q_a'] = df['q_a_criteria'] + df['q_a_intervention'].apply(lambda x: [x]) + df['q_a_disease'].apply(lambda x: [x]) + df['q_a_title'].apply(lambda x: [x]) + df['q_a_outcome_measures'].apply(lambda x: [x]) + df['q_a_keywords'].apply(lambda x: [x])
df['all_q_a'] = df['all_q_a'].apply(lambda x: ' '.join(x))  # Join list of strings into a single string

Load SECRET

In [21]:
# Load the BioBERT model and tokenizer
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

state_dict = torch.load('models/global_model.pth')

# Remove `module.` prefix if present
from collections import OrderedDict
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    name = k[7:] if k.startswith('module.') else k  # remove 'module.' prefix
    new_state_dict[name] = v

model.load_state_dict(new_state_dict)
model.to(device)
model.eval()

  state_dict = torch.load('models/global_model.pth')


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [22]:
def embed_text(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    
    # Pass through the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the embeddings (hidden states from the last layer)
    # outputs.last_hidden_state -> (batch_size, sequence_length, hidden_size)
    embeddings = outputs.last_hidden_state.to(device)

    # Pool the embeddings (e.g., by taking the mean across the sequence length)
    pooled_embeddings = embeddings.mean(dim=1)

    return pooled_embeddings

Get embedding

In [25]:

#progress apply the embedding function to the 'all_q_a' column
df['embedding'] = df['all_q_a'].apply(lambda x: embed_text(x))

Save the dataframe

In [28]:
#save as pickle
df.to_pickle("/home/trishad2/trial_searching/PoC/for_submission/data/demo_data_with_embeddings.pkl")

KNN for each test trial

In [36]:
#get the embeddings for all trials from /home/trishad2/trial_searching/search/emb_dict_255572_secret_2_0.pkl

import pickle
with open('/home/trishad2/trial_searching/search/emb_dict_216587_secret_1_0.pkl', 'rb') as f:
    emb_dict = pickle.load(f)

#show first 5 keys
list(emb_dict.keys())[:5]

['NCT04167371', 'NCT01645722', 'NCT04519957', 'NCT04768985', 'NCT03593941']

In [37]:
#now for the fifth trial in df, find the 5 nearest neighbors in emb_dict using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
test_embedding = df.iloc[4]['embedding'].cpu().numpy()  # Convert to numpy array
all_embeddings = np.array([emb_dict[key].cpu().numpy() for key in emb_dict])  # Convert all to numpy arrays

all_embeddings.shape, test_embedding.shape

((216587, 1, 768), (1, 768))

In [38]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_k_nearest_neighbors(test_embedding, emb_dict, k=5):
    """
    Returns the k nearest neighbors (keys) from emb_dict for the given test_embedding.

    Parameters:
    - test_embedding: torch.Tensor or np.ndarray, the embedding to compare (shape: [hidden_dim] or [1, hidden_dim])
    - emb_dict: dict, keys are IDs and values are torch.Tensor embeddings
    - k: int, number of nearest neighbors to return

    Returns:
    - List of tuples: (key, similarity_score), sorted by descending similarity
    """
    # Convert test_embedding to numpy array and flatten to 1D
    if hasattr(test_embedding, "cpu"):
        test_embedding = test_embedding.cpu().numpy()
    test_embedding = np.squeeze(test_embedding)  # shape: [hidden_dim]

    all_keys = list(emb_dict.keys())
    # Flatten all embeddings to 1D
    all_embeddings = np.array([
        np.squeeze(emb_dict[key].cpu().numpy() if hasattr(emb_dict[key], "cpu") else emb_dict[key])
        for key in all_keys
    ])  # shape: [n_samples, hidden_dim]

    similarities = cosine_similarity([test_embedding], all_embeddings)[0]
    top_k_idx = np.argsort(similarities)[::-1][:k]
    return [(all_keys[i], similarities[i]) for i in top_k_idx]

In [39]:
neighbors = get_k_nearest_neighbors(test_embedding, emb_dict, k=5)
print(neighbors)

[('NCT03593941', 0.98154485), ('NCT03256929', 0.8254221), ('NCT01020617', 0.82448864), ('NCT05008770', 0.81799597), ('NCT04135066', 0.8174179)]
