In [1]:
import chromadb
from sentence_transformers import SentenceTransformer

client = chromadb.PersistentClient(path="./clinical_trials_chroma")
model = SentenceTransformer("malteos/scincl")
collection = client.get_or_create_collection("clinical_trials_studies")

  from tqdm.autonotebook import tqdm, trange


In [2]:
import re
def fix_invalid_json(input_str):
    ## add double quotes around elements inside square brackets if not already quoted
    fixed_str = re.sub(r'(?<=\[)([^\[\],]+)(?=\])', lambda x: '"' + x.group(0).strip() + '"', input_str)
    
    ## add double quotes around words in Conditions and Interventions
    fixed_str = re.sub(r'(?<=\[)([^\"\]]+?)(?=\])', lambda x: '"' + x.group(0).strip().replace(", ", '", "') + '"', fixed_str)
    
    ## fix key-value pairs inside Interventions
    fixed_str = re.sub(r'"([A-Za-z]+): ([A-Za-z0-9\s]+)"', r'"\1: \2"', fixed_str)
    
    # fix dictionary keys
    fixed_str = re.sub(r'(?<!")(\b[A-Za-z_]+\b)(?=\s*:)', r'"\1"', fixed_str)
    
    return fixed_str

In [3]:
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

model_id = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16"
number_gpus = 1

tokenizer = AutoTokenizer.from_pretrained(model_id)

llm = LLM(model=model_id, tensor_parallel_size=number_gpus, max_model_len=20000)

def pipe(messages):
    sampling_params = SamplingParams(temperature=0, top_p=0.9, max_tokens=4096)
    prompts = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    outputs = llm.generate(prompts, sampling_params)
    return [i.outputs[0].text for i in outputs]

INFO 10-25 09:58:45 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16', speculative_config=None, tokenizer='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=20000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 10-25 09:58:47 model_runner.

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 10-25 09:58:50 model_runner.py:732] Loading model weights took 8.4927 GB
INFO 10-25 09:58:55 gpu_executor.py:102] # GPU blocks: 4915, # CPU blocks: 2048
INFO 10-25 09:58:58 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 10-25 09:58:58 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 10-25 09:59:11 model_runner.py:1225] Graph capturing finished in 13 secs.


In [4]:
query_embedding = model.encode("Effect of Kinesiotaping on Edema Management, Pain and Function on Patients With Bilateral Total Knee Arthroplasty [SEP] After being informed about the study and potential risk, all patients undergoing inpatient rehabilitation after bilateral total knee arthroplasty will have Kinesio(R)Tape applied to one randomly selected leg while the other leg serves as a control. Measurement of bilateral leg circumference, knee range of motion, numerical rating scale for pain, and selected questions from the Knee Injury and Osteoarthritis Outcome Score will occur at regular intervals throughout the rehabilitation stay. Patients will receive standard rehabilitation.").tolist()
    
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5
)
results

{'ids': [['NCT03032679',
   'NCT00686764',
   'NCT00130962',
   'NCT01632709',
   'NCT02283957']],
 'distances': [[110.8730239868164,
   113.80130004882812,
   115.11090087890625,
   115.89321899414062,
   122.37474060058594]],
 'metadatas': [[{'detailedDescription': "Study Description \nBrief Summary \nAfter Institutional Review Board approval, adult patients scheduled for total knee arthroplasty (TKA) will be enrolled into the study after obtaining informed consent over a 24 month period. Interim analyses of cases who have completed the 6 month follow up as on 31st October, 2017 will be analyzed as part of the co- principle investigator's thesis. All consenting patients will be assessed for pain preoperatively, the first 3 post-operative days, at 1, 4 and 6 months from the date of surgery. Pain scores will be assessed using the Numerical Rating Scale during the 3 postoperative days. Pain will be managed by the Acute Pain Service team as per standard protocols.\n\nThe Brief pain inven

In [5]:
# retrieve relevent studies from chromadb but exclude the ones that are already is the query
def retrieve_relevant_studies(query, existing_study, n_results=5):
    query_embedding = model.encode(query).tolist()
    
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results + 1,
    )
    
    filtered_results = []
    for id, distance, document in zip(results['ids'][0], results['distances'][0], results['documents'][0]):
        if id != existing_study:
            filtered_results.append({
                "id": id,
                "distance": distance,
                "document": document,
            })
        
        if len(filtered_results) == n_results:
            break
    
    return filtered_results

print(
    retrieve_relevant_studies("Effect of Kinesiotaping on Edema Management, Pain and Function on Patients With Bilateral Total Knee Arthroplasty [SEP] After being informed about the study and potential risk, all patients undergoing inpatient rehabilitation after bilateral total knee arthroplasty will have Kinesio(R)Tape applied to one randomly selected leg while the other leg serves as a control. Measurement of bilateral leg circumference, knee range of motion, numerical rating scale for pain, and selected questions from the Knee Injury and Osteoarthritis Outcome Score will occur at regular intervals throughout the rehabilitation stay. Patients will receive standard rehabilitation.", 
                              "NCT05013879"))

[{'id': 'NCT03032679', 'distance': 110.8730239868164, 'document': '{"metadata": {"NCT_ID": "NCT03032679", "Brief_Title": "Pain and Impact of Chronic Pain on Function After Total Knee Replacements", "Official_title": "Postoperative Pain Severity, Incidence Of Chronic Pain And Its Impact On Daily Function Following Total Knee Replacements (TKR) At A Tertiary Cancer Institute", "Conditions": ["Pain", "Postoperative", "Pain", "Chronic", "Osteogenic Sarcoma", "Knee Arthropathy"], "Interventions": ["Other: Non interventional study"], "Location_Countries": ["India"], "Study_Design": {"Study_Type": "OBSERVATIONAL"}}, "description": "Study Description \\nBrief Summary \\nAfter Institutional Review Board approval, adult patients scheduled for total knee arthroplasty (TKA) will be enrolled into the study after obtaining informed consent over a 24 month period. Interim analyses of cases who have completed the 6 month follow up as on 31st October, 2017 will be analyzed as part of the co- principle 

In [6]:
import json
import json_repair
def related_studies_template(title: str, description: str, criteria: str):
    return f"""Example Title: {title}
Example Description: {description}
Example Criteria: {criteria}
"""

def craft_context_from_studies_documents(related_studies: list[str]):
    json_related_studies = [json.loads(i) for i in related_studies]
    context = ""
    for i in json_related_studies:
        title = i.get('metadata', {}).get('Official_title', "")
        description = i.get('description', "")
        criteria = i.get('criteria', "")
        if title and description:
            context += f"""<STUDY>
{related_studies_template(title, description, criteria)}
</STUDY>"""
    return context

def get_messages_for_create_CoT(encoded_related_studies: str, title: str, description: str, desired_criteria: str):
    user_prompt_template = """<EXAMPLE_STUDIES>{encoded_related_studies}</EXAMPLE_STUDIES>

Title: {title}
Description: {description}
Desired criteria: {desired_criteria}

Task Instructions:
1. Derive a step-by-step justification starting from the Title and Description provided, gradually building up to support the Desired criteria.
2. Could use example studies (in the <EXAMPLE_STUDIES> section) if they support your justifications, but ensure the reasoning is well-explained and relevant to the study's context.
4. Avoid mentioning that the criteria were already provided, and please do not cite the given criteria directly in your justification.
5. You should give the justification first before giving out the criteria.

Response Format:
<STEP-BY-STEP-JUSTIFICATION>
Your step by step justification here.
</STEP-BY-STEP-JUSTIFICATION>
<Criteria>
The copied desired criteria here.
</Criteria>
"""
    return [
        {"role": "system", "content": "You are a justifier chatbot designed to generate step-by-step justifications that derived form the Title and Description of a study and then gradually build up to the Desired criteria. Your task is to analyze the title and description of a study and build logical, step-by-step justifications that connect the study’s key elements to the desired criteria. Reference related example studies if they reinforce your justifications. You must assume the desired criteria are correct (as it was already reviewed by specialists) and develop arguments to support them based on the study context and relevant research insights."},
        {"role": "user", "content": user_prompt_template.format(encoded_related_studies=encoded_related_studies, title=title, description=description, desired_criteria=desired_criteria)},
    ]

def get_prompt_from_studies(study_info: dict):
    metadata = json_repair.loads(fix_invalid_json(study_info.get('metadata')))
    title = metadata.get('Official_title')
    description = study_info.get('data')
    study_id = metadata.get('NCT_ID')
    desired_criteria = study_info.get('criteria')

    # Ensure we have the minimum required information
    if not title or not description or not desired_criteria or not study_id:
        print(f"Skipping study {study_id}: Missing title or description or desired criteria or study id")
        return None

    query = f'{title} [SEP] {description}'
    relevant_studies = retrieve_relevant_studies(query, study_id)
    encoded_related_studies = craft_context_from_studies_documents([i['document'] for i in relevant_studies])
    messages = get_messages_for_create_CoT(encoded_related_studies, title, description, desired_criteria)
    return messages




In [7]:
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
# Load the JSON data

ravis_dataset = load_dataset("ravistech/clinical-trial-llm-cancer-restructure")

## use the function to get the CoT for the first 10 studies
for i in tqdm(ravis_dataset['train']):
    print(i)
    messages = get_prompt_from_studies(i)
    print(messages)
    print(pipe(messages))


  0%|          | 0/31840 [00:00<?, ?it/s]

{'metadata': '{\n"NCT_ID" : "NCT00933777",\n"Brief_Title" : "SORAVE - Sorafenib and Everolimus in Solid Tumors",\n\n"Official_title" : "SORAVE-Sorafenib and Everolimus in Solid Tumors. A Phase I Clinical Trial to Evaluate the Safety of Combined Sorafenib and Everolimus Treatment in Patients With Relapsed Solid Tumors",\n\n"Conditions" : [Unspecified Adult Solid Tumor, Protocol Specific, Non-Small Cell Lung Cancer],\n\n"Interventions" : [Drug: Combination of sorafenib and everolimus],\n\n"Location_Countries" : [Germany],\n\n"Study_Design" : {\n"Study_Type" : "INTERVENTIONAL",\n"Phase" : [PHASE1],\n"Primary_Purpose" : "TREATMENT",\n"Allocation" : "NA",\n"Interventional Model" : "SINGLE_GROUP",\n"Masking" : "NONE"\n }\n}', 'data': 'Study Description \nBrief Summary \nDose finding part: A phase I clinical trial to evaluate the safety of combined sorafenib and everolimus treatment in patients with relapsed solid tumors (finished).\n\nExtension part:Treatment of non-small cell lung cancer (N

Processed prompts: 100%|██████████| 1/1 [00:20<00:00, 20.64s/it, est. speed input: 386.98 toks/s, output: 62.89 toks/s]
  0%|          | 1/31840 [00:20<183:05:34, 20.70s/it]

['<STEP-BY-STEP-JUSTIFICATION>\n\nStep 1: The study aims to evaluate the safety of combined sorafenib and everolimus treatment in patients with relapsed solid tumors. This is evident from the title "SORAVE-Sorafenib and Everolimus in Solid Tumors. A Phase I Clinical Trial to Evaluate the Safety of Combined Sorafenib and Everolimus Treatment in Patients With Relapsed Solid Tumors." The description further supports this objective, stating that the study will be conducted in a dose-finding part to establish the maximal tolerated dose of the combination.\n\nStep 2: To achieve this objective, the study will recruit patients with solid tumors that have relapsed after and/or refractory to standard therapy. This is a common approach in phase I clinical trials, where the primary goal is to assess the safety and tolerability of a new treatment combination. As seen in the study "A Phase I Safety and Tolerability Study of Vorinostat in Combination With Sorafenib in Patients With Advanced Solid Tum

Processed prompts: 100%|██████████| 1/1 [00:13<00:00, 13.55s/it, est. speed input: 261.79 toks/s, output: 72.09 toks/s]
  0%|          | 2/31840 [00:34<145:59:17, 16.51s/it]

['<STEP-BY-STEP-JUSTIFICATION>\nStep 1: The study aims to evaluate the efficacy and safety of a paclitaxel-cisplatin combination regimen in the neoadjuvant setting for locally advanced head and neck cancer. This is evident from the title and description of the study, which mentions the purpose of learning if the combination regimen is active in locally advanced head and neck cancer and studying its safety.\n\nStep 2: To achieve this goal, the study will involve administering paclitaxel and cisplatin to patients with locally advanced head and neck cancer. The use of paclitaxel and cisplatin is supported by previous studies, such as the TAX 323 and TAX 324 trials, which demonstrated the efficacy of the TPF regimen (docetaxel, cisplatin, and 5-fluorouracil) in improving overall survival in patients with locally advanced head and neck cancer.\n\nStep 3: The study will involve patients with locally advanced, Stage II-IV (except M1), head and neck cancer, with at least one measurable lesion.

Processed prompts: 100%|██████████| 1/1 [00:18<00:00, 18.12s/it, est. speed input: 220.44 toks/s, output: 71.99 toks/s]
  0%|          | 3/31840 [00:52<152:33:10, 17.25s/it]

['<STEP-BY-STEP-JUSTIFICATION>\n\nStep 1: The study aims to compare the efficacy of two doses of Multihance (0.10 mmol/kg and 0.05 mmol/kg) in Magnetic Resonance Imaging (MRI) of the Central Nervous System (CNS). This is evident from the title and description of the study, which highlights the comparison of two doses of Multihance for CNS imaging.\n\nStep 2: To evaluate the efficacy of the two doses, the study will collect existing data and images from patients who have undergone MRI for CNS diseases with Multihance administration at either the standard dose of 0.10 mmol/kg or the half dose of 0.05 mmol/kg. This approach is similar to the retrospective design of the study "The Safety and Efficacy of ProHance at the Dose of 0.10 mmol/kg in Magnetic Resonance Imaging of the Central Nervous System in Pediatric Patients Who Are Younger Than 2 Years of Age", which also collected existing data for a retrospective analysis.\n\nStep 3: The study will include patients who are at least 2 years o

  0%|          | 3/31840 [00:56<165:48:43, 18.75s/it]


KeyboardInterrupt: 