In [1]:
import chromadb
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; later cells call `model.encode(...)` to build query vectors.
model = SentenceTransformer("malteos/scincl")

# On-disk Chroma store and the study collection that the retrieval cells query.
client = chromadb.PersistentClient(path="./clinical_trials_chroma")
collection = client.get_or_create_collection(name="clinical_trials_studies")

  from tqdm.autonotebook import tqdm, trange


In [2]:
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# Checkpoint used for generation, and how many GPUs the engine is split across.
model_id = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16"
number_gpus = 1

# Tokenizer of the same checkpoint; `pipe` uses its chat template to render prompts.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# vLLM engine settings collected in one mapping before the engine is built.
engine_options = {
    "model": model_id,
    "tensor_parallel_size": number_gpus,
    "max_model_len": 20000,
}
llm = LLM(**engine_options)

def pipe(messages):
    """Run one chat conversation through the vLLM engine and return the generated text.

    Args:
        messages: Chat messages handed to the tokenizer's chat template
            (callers in this notebook pass a list of role/content dicts).

    Returns:
        A list with the text of the first candidate of every request output
        produced by ``llm.generate``.
    """
    # Render the conversation to a plain prompt string, ending with the
    # generation prompt so the model continues as the assistant.
    rendered_prompt = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )
    decoding = SamplingParams(temperature=0, top_p=0.9, max_tokens=4096)

    texts = []
    for request_output in llm.generate(rendered_prompt, decoding):
        texts.append(request_output.outputs[0].text)
    return texts

INFO 10-08 02:10:59 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16', speculative_config=None, tokenizer='neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=20000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 10-08 02:11:00 model_runner.

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 10-08 02:11:03 model_runner.py:732] Loading model weights took 8.4927 GB
INFO 10-08 02:11:09 gpu_executor.py:102] # GPU blocks: 4915, # CPU blocks: 2048
INFO 10-08 02:11:12 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 10-08 02:11:12 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 10-08 02:11:25 model_runner.py:1225] Graph capturing finished in 13 secs.


In [3]:
# Ad-hoc lookup: embed one "<title> [SEP] <description>" text and fetch its 5 nearest studies.
query_text = "Effect of Kinesiotaping on Edema Management, Pain and Function on Patients With Bilateral Total Knee Arthroplasty [SEP] After being informed about the study and potential risk, all patients undergoing inpatient rehabilitation after bilateral total knee arthroplasty will have Kinesio(R)Tape applied to one randomly selected leg while the other leg serves as a control. Measurement of bilateral leg circumference, knee range of motion, numerical rating scale for pain, and selected questions from the Knee Injury and Osteoarthritis Outcome Score will occur at regular intervals throughout the rehabilitation stay. Patients will receive standard rehabilitation."
query_embedding = model.encode(query_text).tolist()

results = collection.query(query_embeddings=[query_embedding], n_results=5)
results

{'ids': [['NCT05013879',
   'NCT05847725',
   'NCT02830958',
   'NCT04164927',
   'NCT05457686']],
 'distances': [[1.5859999424883142e-10,
   39.03214645385742,
   40.20624923706055,
   43.7834587097168,
   44.530799865722656]],
 'metadatas': [[{'detailed_description': 'After being informed about the study and potential risk, all patients undergoing inpatient rehabilitation after bilateral total knee arthroplasty will have Kinesio(R)Tape applied to one randomly selected leg while the other leg serves as a control. Measurement of bilateral leg circumference, knee range of motion, numerical rating scale for pain, and selected questions from the Knee Injury and Osteoarthritis Outcome Score will occur at regular intervals throughout the rehabilitation stay. Patients will receive standard rehabilitation.',
    'nctId': 'NCT05013879',
    'officialTitle': 'Effect of Kinesiotaping on Edema Management, Pain and Function on Patients With Bilateral Total Knee Arthroplasty'},
   {'detailed_descri

In [4]:
# Retrieve relevant studies from ChromaDB, excluding the study that the query itself was built from.
def retrieve_relevant_studies(query, existing_study, n_results=5):
    """Return the nearest stored studies for ``query``, leaving out ``existing_study``.

    Args:
        query: Text that is embedded with the module-level ``model``.
        existing_study: Id to drop from the hits (the study the query was built from).
        n_results: Maximum number of hits to return.

    Returns:
        A list of at most ``n_results`` dicts with the keys ``"id"``,
        ``"distance"`` and ``"document"``, in the order the collection
        returned them.
    """
    query_embedding = model.encode(query).tolist()

    # Ask for one extra hit so that dropping `existing_study` still leaves
    # up to `n_results` candidates.
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results + 1,
    )

    hits = [
        {"id": study_id, "distance": distance, "document": document}
        for study_id, distance, document in zip(
            results['ids'][0], results['distances'][0], results['documents'][0]
        )
        if study_id != existing_study
    ]

    # Slice instead of "break once len == n_results": the equality check was
    # only evaluated after an append, so n_results=0 used to return one hit.
    return hits[:max(n_results, 0)]

# Smoke test: neighbours of study NCT05013879, with that study itself filtered out.
kinesio_query = "Effect of Kinesiotaping on Edema Management, Pain and Function on Patients With Bilateral Total Knee Arthroplasty [SEP] After being informed about the study and potential risk, all patients undergoing inpatient rehabilitation after bilateral total knee arthroplasty will have Kinesio(R)Tape applied to one randomly selected leg while the other leg serves as a control. Measurement of bilateral leg circumference, knee range of motion, numerical rating scale for pain, and selected questions from the Knee Injury and Osteoarthritis Outcome Score will occur at regular intervals throughout the rehabilitation stay. Patients will receive standard rehabilitation."
kinesio_study_id = "NCT05013879"
similar_studies = retrieve_relevant_studies(kinesio_query, kinesio_study_id)
print(similar_studies)

[{'id': 'NCT05847725', 'distance': 39.03214645385742, 'document': '{"protocolSection": {"identificationModule": {"nctId": "NCT05847725", "orgStudyIdInfo": {"id": "HU-15/666-21"}, "organization": {"fullName": "Hacettepe University", "class": "OTHER"}, "briefTitle": "Effects of Bandaging and Kinesiotaping\\u00ae in Patients With Total Knee Arthroplasty", "officialTitle": "Effects of Bandaging and Kinesiotaping\\u00ae on Pain, Edema, and Functional Level in Patients With Total Knee Arthroplasty in the Early Postoperative Period: A Randomized Clinical Trial"}, "statusModule": {"statusVerifiedDate": "2023-05", "overallStatus": "COMPLETED", "expandedAccessInfo": {"hasExpandedAccess": false}, "startDateStruct": {"date": "2015-05", "type": "ACTUAL"}, "primaryCompletionDateStruct": {"date": "2016-06", "type": "ACTUAL"}, "completionDateStruct": {"date": "2016-12", "type": "ACTUAL"}, "studyFirstSubmitDate": "2023-04-01", "studyFirstSubmitQcDate": "2023-04-27", "studyFirstPostDateStruct": {"date":

In [13]:
import json
def related_studies_template(title: str, description: str, criteria: str):
    """Format one example study as three labelled lines, each ending in a newline."""
    labelled_lines = (
        f"Example Title: {title}",
        f"Example Description: {description}",
        f"Example Criteria: {criteria}",
    )
    return "\n".join(labelled_lines) + "\n"

def craft_context_from_studies_documents(related_studies: list[str]):
    """Turn JSON study documents into one concatenated ``<STUDY>`` context string.

    Args:
        related_studies: JSON strings, each decoded with ``json.loads``. Title,
            detailed description and eligibility criteria are read from under
            the ``protocolSection`` key.

    Returns:
        The ``<STUDY>...</STUDY>`` blocks of every document that has both a
        title and a description, joined without a separator. Returns an empty
        string when nothing qualifies.
    """
    study_blocks = []
    for raw_document in related_studies:
        protocol = json.loads(raw_document).get('protocolSection', {})
        title = protocol.get('identificationModule', {}).get('officialTitle', '')
        description = protocol.get('descriptionModule', {}).get('detailedDescription', "")
        criteria = protocol.get('eligibilityModule', {}).get('eligibilityCriteria', "")

        # Criteria may be empty, but a study without title or description is left out.
        if not (title and description):
            continue

        rendered = related_studies_template(title, description, criteria)
        study_blocks.append(f"<STUDY>\n{rendered}\n</STUDY>")
    return "".join(study_blocks)

def get_messages_for_create_CoT(encoded_related_studies: str, title: str, description: str, desired_criteria: str):
    """Build the two chat messages that ask for a justification of the criteria.

    Args:
        encoded_related_studies: Text placed inside the ``<EXAMPLE_STUDIES>`` tags.
        title: Study title inserted into the user prompt.
        description: Study description inserted into the user prompt.
        desired_criteria: Criteria the justification is meant to arrive at.

    Returns:
        A two-element list: a ``system`` message with the fixed instructions,
        followed by a ``user`` message holding the filled-in prompt.
    """
    system_message = {
        "role": "system",
        "content": "You are a justifier chatbot designed to generate step-by-step justifications that derived form the Title and Description of a study and then gradually build up to the Desired criteria. Your task is to analyze the title and description of a study and build logical, step-by-step justifications that connect the study’s key elements to the desired criteria. Reference related example studies if they reinforce your justifications. You must assume the desired criteria are correct (as it was already reviewed by specialists) and develop arguments to support them based on the study context and relevant research insights.",
    }

    user_prompt_template = """<EXAMPLE_STUDIES>{encoded_related_studies}</EXAMPLE_STUDIES>

Title: {title}
Description: {description}
Desired criteria: {desired_criteria}

Task Instructions:
1. Derive a step-by-step justification starting from the Title and Description provided, gradually building up to support the Desired criteria.
2. Could use example studies (in the <EXAMPLE_STUDIES> section) if they support your justifications, but ensure the reasoning is well-explained and relevant to the study's context.
3. Focus on creating only justifications that support the desired criteria.
4. Avoid mentioning that the criteria were already provided, and please do not cite the given criteria directly in your justification.
5. You should give the justification first before giving out the criteria.
"""
    # Substituted values are inserted verbatim; braces inside them are not re-interpreted.
    template_fields = {
        "encoded_related_studies": encoded_related_studies,
        "title": title,
        "description": description,
        "desired_criteria": desired_criteria,
    }
    user_message = {
        "role": "user",
        "content": user_prompt_template.format(**template_fields),
    }
    return [system_message, user_message]

def get_prompt_from_studies(study_info: dict):
    """Build the justification chat messages for one study record.

    Args:
        study_info: Study record; title, nctId, detailed description and
            eligibility criteria are read from under ``protocolSection``.

    Returns:
        The message list from ``get_messages_for_create_CoT``, or ``None``
        (after printing a notice) when any of the four fields is missing or empty.
    """
    protocol = study_info.get('protocolSection', {})
    identification = protocol.get('identificationModule', {})

    title = identification.get('officialTitle', '')
    study_id = identification.get('nctId', '')
    description = protocol.get('descriptionModule', {}).get('detailedDescription', "")
    desired_criteria = protocol.get('eligibilityModule', {}).get('eligibilityCriteria', "")

    # All four fields are required to build the query and the prompt.
    if not all((title, description, desired_criteria, study_id)):
        print(f"Skipping study {study_id}: Missing title or description or desired criteria or study id")
        return None

    # The query has the same "<title> [SEP] <description>" shape as the other
    # retrieval calls in this notebook; the study's own id is excluded from the hits.
    neighbours = retrieve_relevant_studies(f'{title} [SEP] {description}', study_id)
    neighbour_documents = [hit['document'] for hit in neighbours]
    return get_messages_for_create_CoT(
        craft_context_from_studies_documents(neighbour_documents),
        title,
        description,
        desired_criteria,
    )




In [14]:
import pandas as pd
import os
import pickle
import json
from tqdm import tqdm
# Load the first page of study records.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default.
# NOTE(review): assumes the file is UTF-8 (the JSON default) - confirm against
# the step that wrote it.
with open('clinical_trials_data/studies_page_1.json', 'r', encoding='utf-8') as file:
    studies = json.load(file)

# Generate the chain-of-thought justification for the first 10 studies.
for study in tqdm(studies[:10]):
    messages = get_prompt_from_studies(study)
    if messages is None:
        # get_prompt_from_studies has already printed why this study is
        # skipped, so avoid also printing a bare "None".
        continue
    print(messages)
    print(pipe(messages))


Processed prompts:   0%|          | 0/1 [7:11:02<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts:   0%|          | 0/1 [03:12<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
  0%|          | 0/10 [00:00<?, ?it/s]

[{'role': 'system', 'content': 'You are a justifier chatbot designed to generate step-by-step justifications that derived form the Title and Description of a study and then gradually build up to the Desired criteria. Your task is to analyze the title and description of a study and build logical, step-by-step justifications that connect the study’s key elements to the desired criteria. Reference related example studies if they reinforce your justifications. You must assume the desired criteria are correct (as it was already reviewed by specialists) and develop arguments to support them based on the study context and relevant research insights.'}, {'role': 'user', 'content': "<EXAMPLE_STUDIES><STUDY>\nExample Title: Effects of Bandaging and Kinesiotaping® on Pain, Edema, and Functional Level in Patients With Total Knee Arthroplasty in the Early Postoperative Period: A Randomized Clinical Trial\nExample Description: The study was conducted on 30 volunteer subjects who underwent unilateral

Processed prompts: 100%|██████████| 2/2 [00:13<00:00,  6.61s/it, est. speed input: 638.56 toks/s, output: 139.40 toks/s]
 10%|█         | 1/10 [00:13<01:59, 13.23s/it]

["**Step 1: Understanding the Importance of Reducing Anxiety and Fear in Childbirth**\n\nThe study aims to evaluate the effect of Emotional Freedom Technique (EFT) on reducing anxiety, fear of surgery, and traumatic birth perception in women undergoing cesarean section. This is crucial because anxiety and fear experienced before surgery can lead to uneasiness, restlessness, and dissatisfaction due to uncertainty (1). In the context of childbirth, anxiety and fear can play a significant role in the formation of traumatic perception towards birth (2).\n\n**Step 2: The Role of Midwives in Reducing Negative Feelings**\n\nMidwives play a vital role in providing care to pregnant women during the prenatal period, and their approach can significantly impact the woman's experience. A study by Öznur HAYAT ÖKTEM (3) highlights the importance of emotional support from healthcare professionals in reducing anxiety and fear during childbirth. Similarly, a study by Öznur HAYAT ÖKTEM (3) suggests that 

Processed prompts: 100%|██████████| 1/1 [00:13<00:00, 13.88s/it, est. speed input: 241.64 toks/s, output: 69.47 toks/s]
 40%|████      | 4/10 [00:27<00:37,  6.26s/it]

['**Justification for Inclusion Criteria:**\n\n1. **Age Range (12-21 years)**: The study aims to assess the short and long-term consequences of alcohol exposure on brain, cognitive, and emotional/regulatory development during preadolescence and adolescence. This age range is crucial for understanding the effects of alcohol on brain development, as adolescence is a critical period of brain maturation (Giedd et al., 1999). Studies have shown that early alcohol exposure can lead to long-term changes in brain structure and function, particularly in regions involved in reward processing and impulse control (Squeglia et al., 2011; Squeglia et al., 2012).\n\n2. **Ability to Read and Understand English**: The study involves cognitive and emotional assessments, which require participants to understand and follow instructions. Ensuring that participants can read and understand English is essential for accurate data collection and interpretation. This criterion is also consistent with the inclusi

Processed prompts: 100%|██████████| 1/1 [00:12<00:00, 12.67s/it, est. speed input: 330.47 toks/s, output: 66.87 toks/s]
 50%|█████     | 5/10 [00:39<00:40,  8.05s/it]

["**Justification 1: Targeting High-Risk Adolescents**\n\nThe study aims to leverage noninvasive transcutaneous vagus nerve stimulation (tVNS) and smartphone technology to reduce suicidal behaviors and suicide among highly vulnerable adolescents. This approach is justified by the increasing trend of suicide rates among adolescents, with a 35% increase over the past two decades in the U.S. (Leveraging Biomarkers and New Technologies to Reduce Self-Injury and Substance Abuse Risk Among Highly Vulnerable Adolescents). The study's focus on primary prevention is essential, as most deaths by suicide occur on the first attempt, and current primary prevention programs are often intensive, expensive, and delivered by highly trained mental health providers who are in short supply (Redesigning Mental Health Services to Reduce Adolescent Suicide).\n\n**Justification 2: Emotion Dysregulation and Social Isolation as Key Risk Factors**\n\nThe study targets emotion dysregulation and social isolation a

Processed prompts: 100%|██████████| 1/1 [00:51<00:00, 51.55s/it, est. speed input: 91.67 toks/s, output: 68.04 toks/s]
 60%|██████    | 6/10 [01:31<01:21, 20.44s/it]

["**Justification for Inclusion Criteria 1: Age ≥18**\n\nThe study aims to investigate the effectiveness of the Specific Carbohydrate Diet (SCD) and the Mediterranean Diet (MD) in inducing remission in patients with Crohn's disease (CD). The SCD has been previously studied in pediatric patients with CD, but its effectiveness in adults is not well established. The MD has been associated with anti-inflammatory properties and has been studied in various populations, including adults. Therefore, it is essential to include adults in this study to determine the efficacy of these diets in this age group.\n\nA study by the investigators of the Specific Carbohydrate and Mediterranean Diets to Induce Remission in Patients With Crohn's Disease (2020) included patients aged 18 and above, which supports the inclusion of adults in this study. This study aimed to compare the effectiveness of the SCD and the MD in managing symptoms and reducing inflammatory markers in patients with CD.\n\n**Justificat

 70%|███████   | 7/10 [02:05<00:53, 17.90s/it]


KeyboardInterrupt: 