In [None]:
!pip install chromadb sentence-transformers

In [4]:
from sentence_transformers import SentenceTransformer
import chromadb

class GeneListRag:
    """Embedding index over a newline-delimited gene list, stored in ChromaDB.

    On construction the persistent collection is opened (or created). If it is
    empty, the gene list file is embedded and indexed; otherwise the existing
    collection is reused as-is.

    NOTE: a non-empty collection is never refreshed, so edits to the gene list
    file (or a different ``gene_list_path``) are not picked up until the
    collection is removed.
    """

    def __init__(self, gene_list_path='../../ref/genes.txt', db_path='./gene_db',
                 collection_name='genes', model_name='all-MiniLM-L6-v2'):
        """Open the vector store and index the gene list if the store is empty.

        Args:
            gene_list_path: Text file with one gene name per line.
            db_path: Directory used by the persistent ChromaDB client.
            collection_name: Name of the ChromaDB collection to open or create.
            model_name: SentenceTransformer model used for all embeddings.
        """
        self.embed_model = SentenceTransformer(model_name)
        self.client = chromadb.PersistentClient(path=db_path)
        self.collection = self.client.get_or_create_collection(name=collection_name)
        self.gene_list_path = gene_list_path

        # Query the store once and reuse the value for both the test and the message.
        indexed_count = self.collection.count()
        if indexed_count == 0:
            print("Indexing gene list...")
            self._index_gene_list(self.gene_list_path)
        else:
            print(f"Gene list already indexed: {indexed_count} genes chunks found.")

    def _index_gene_list(self, file_path, chunk_size=1, batch_size=256):
        """Embed the gene list and add it to the collection.

        Args:
            file_path: Text file with one gene name per line; blank lines are
                skipped and surrounding whitespace is removed from each name.
            chunk_size: Number of genes joined (by newline) into one document.
            batch_size: Number of documents embedded and added per call.

        Raises:
            ValueError: If ``chunk_size`` or ``batch_size`` is smaller than 1.
        """
        if chunk_size < 1 or batch_size < 1:
            raise ValueError("chunk_size and batch_size must be >= 1")

        # 'utf-8-sig' reads plain UTF-8 and also drops a leading BOM, which
        # would otherwise end up glued to the first gene name.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            genes = [line.strip() for line in f if line.strip()]

        # create chunks : groups of `chunk_size` genes per document
        chunk_texts = [
            '\n'.join(genes[i:i + chunk_size])
            for i in range(0, len(genes), chunk_size)
        ]

        # Embed and store in batches instead of one model call and one
        # database write per gene. IDs keep the `gene_chunk_<index>` scheme.
        for start in range(0, len(chunk_texts), batch_size):
            batch = chunk_texts[start:start + batch_size]
            embeddings = self.embed_model.encode(batch).tolist()
            self.collection.add(
                documents=batch,
                embeddings=embeddings,
                ids=[f"gene_chunk_{start + offset}" for offset in range(len(batch))]
            )
        print(f"Indexed {len(chunk_texts)} gene chunks.")

    def retrieve_relevant_genes(self, trial_criteria, n_results=5):
        """Return gene names from the chunks nearest to the criteria embedding.

        Args:
            trial_criteria: Free-text eligibility criteria used as the query.
            n_results: Maximum number of chunks requested from the collection.

        Returns:
            Gene names in retrieval order, without blanks or duplicates. An
            empty list when ``trial_criteria`` is empty or whitespace only.

        NOTE(review): ranking is purely embedding similarity between the whole
        criteria text and bare gene names, so a gene named verbatim in the
        text is not guaranteed to be returned - consider adding an exact-match
        pass over the indexed names. TODO confirm with retrieval results.
        """
        # Nothing meaningful to embed: avoid returning arbitrary neighbours.
        if not trial_criteria or not trial_criteria.strip():
            return []

        # create embeddings for the trial criteria
        query_embedding = self.embed_model.encode(trial_criteria).tolist()

        # query the vector database for relevant gene chunks
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results
        )
        # One query embedding was sent, so the hits are in the first result list.
        documents = results['documents'][0]
        print(f"Retrieved {len(documents)} relevant gene chunks.")

        retrieved_genes = []
        seen = set()
        for doc in documents:
            print(f"Retrieved chunk:\n{doc}\n")
            for gene in doc.split('\n'):
                gene = gene.strip()
                # Keep first occurrence only, preserving retrieval order.
                if gene and gene not in seen:
                    seen.add(gene)
                    retrieved_genes.append(gene)
        return retrieved_genes


In [2]:
# ClinicalTrials.gov identifier of the trial used as the worked example.
nct_id = "NCT05410145"

# Free-text eligibility criteria for the trial, written as adjacent string
# literals (one criterion per line) that concatenate into a single string.
eligibilityCriteria = (
    "Inclusion:\n\n"
    "* Subject must have a histologically or cytologically confirmed metastatic or locally advanced solid tumor which is progressing.\n"
    "* Subject must have documented KRAS p.G12C mutation identified within the last 5 years by a local test on tumor tissue or blood.\n"
    "* Subject must have measurable disease per RECIST v1.1.\n"
    "* Subject must have Eastern Cooperative Oncology Group (ECOG) performance status of 0 or 1.\n"
    "* Subject must have adequate organ and marrow function within the screening period.\n\n"
    "Exclusion:\n\n"
    "* Subject has any prior treatment with other treatments without adequate washout periods as defined in the protocol.\n"
    "* Subject has uncontrolled intercurrent illness, including but not limited to, ongoing or active infection, uncontrolled or significant cardiovascular disease, serious chronic gastrointestinal conditions associated with diarrhea, or psychiatric illness/social situations that would limit compliance with study requirements, substantially increase risk of incurring AEs, or compromise the ability of the subject to give written informed consent.\n"
    "* Subject has unresolved treatment-related toxicities from previous anticancer therapy of NCI CTCAE Grade \u22652 (with exception of vitiligo or alopecia).\n"
    "* Subject has active gastrointestinal disease or other that could interfere significantly with the absorption, distribution, metabolism, or excretion of oral therapy.\n"
    "* Concurrent participation in any clinical research study involving treatment with any investigational drug, radiotherapy, or surgery, except for the nontreatment phases of these studies (e.g., follow-up phase).\n\n"
    "Other protocol inclusion/exclusion criteria may apply"
)

In [5]:

# Open the gene index (it is built on first use) and fetch the genes whose
# embeddings are closest to the trial's eligibility text.
rag_system = GeneListRag()
relevant_genes_list = rag_system.retrieve_relevant_genes(
    trial_criteria=eligibilityCriteria,
)
print(f'Retrieved gene subset: {relevant_genes_list}')

Gene list already indexed: 754 genes chunks found.
Retrieved 5 relevant gene chunks.
Retrieved chunk:
MED12

Retrieved chunk:
BCL11A

Retrieved chunk:
NCOR1

Retrieved chunk:
CTCF

Retrieved chunk:
TNFRSF14

Retrieved gene subset: ['MED12', 'BCL11A', 'NCOR1', 'CTCF', 'TNFRSF14']


In [7]:

import sys
import os
import yaml

# Make the directory two levels up importable so the project-local `utils`
# package resolves. Guarded so that re-running this cell does not keep
# appending the same entry to sys.path.
_project_root = os.path.abspath('../../')
if _project_root not in sys.path:
    sys.path.append(_project_root)
from utils.ai_helper import send_ai_request, parse_ai_response, get_genomic_criteria_prompt

# Build the JSON schema and prompt from the retrieved genes and the trial text,
# send the request tagged with the trial ID, then parse the reply.
json_schema, prompt = get_genomic_criteria_prompt(relevant_genes_list, eligibilityCriteria)
ai_response = send_ai_request(nct_id, prompt, json_schema)
genomic_criteria_dict = parse_ai_response(ai_response)

# Render the parsed result as YAML, keeping the keys in their original order.
yaml_data = yaml.dump(genomic_criteria_dict, sort_keys=False)
print(yaml_data)


[32m2026-01-09 17:52:08.573[0m | [34m[1mDEBUG   [0m | [36mutils.ai_helper[0m:[36msend_ai_request[0m:[36m95[0m - [34m[1mAI request | ID:NCT05410145 | {"model": "neuralmagic/DeepSeek-R1-Distill-Qwen-32B-quantized.w4a16", "messages": [{"role": "system", "content": "You are a biomedical researcher."}, {"role": "user", "content": "Task: Evaluate the EligibilityCriteria to return a JSON-formatted eligibility criteria involving genetic variants in any genes such as those in the GeneList.\n    EligibilityCritieria: Inclusion:\n\n* Subject must have a histologically or cytologically confirmed metastatic or locally advanced solid tumor which is progressing.\n* Subject must have documented KRAS p.G12C mutation identified within the last 5 years by a local test on tumor tissue or blood.\n* Subject must have measurable disease per RECIST v1.1.\n* Subject must have Eastern Cooperative Oncology Group (ECOG) performance status of 0 or 1.\n* Subject must have adequate organ and marrow func

Creating SGLang platform...
http://localhost:30000/v1/chat/completions


[32m2026-01-09 17:52:16.886[0m | [34m[1mDEBUG   [0m | [36mutils.ai_helper[0m:[36msend_ai_request[0m:[36m105[0m - [34m[1mAI response | ID:NCT05410145 | {'id': 'b1e628f3d1014273b8a523de076ddc77', 'object': 'chat.completion', 'created': 1767952336, 'model': 'neuralmagic/DeepSeek-R1-Distill-Qwen-32B-quantized.w4a16', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'Okay, so I\'m trying to figure out how to evaluate the EligibilityCriteria provided and create a JSON-formatted output based on the given GeneList. Let me break this down step by step.\n\nFirst, I\'ll look at the inclusion and exclusion criteria provided. The inclusion criteria mention that the subject must have a KRAS p.G12C mutation. The exclusion criteria don\'t mention any specific genes from the GeneList, so I don\'t need to consider them for exclusion.\n\nNow, the GeneList includes [\'MED12\', \'BCL11A\', \'NCOR1\', \'CTCF\', \'TNFRSF14\']. I need to check if any of these genes are mention

<Response [200]>
200
and: []

