In [None]:
!pip install chromadb sentence-transformers

In [35]:
from sentence_transformers import SentenceTransformer
import chromadb

class GeneListRag:
    """Retrieve gene-list chunks from a persistent ChromaDB collection.

    The gene list file is read line by line, grouped into fixed-size chunks in
    file order, embedded with a SentenceTransformer model and stored in a
    ChromaDB collection. Queries embed free text with the same model and return
    the genes of the nearest chunks.

    Note: indexing only happens when the collection is empty. An existing,
    non-empty collection is reused as-is and is NOT compared against
    ``gene_list_path``; delete the database directory (or use another
    ``collection_name``) to force a re-index after the gene list changes.
    """

    def __init__(self, gene_list_path='../ref/genes.txt', db_path='./gene_db',
                 collection_name='genes', model_name='all-MiniLM-L6-v2',
                 chunk_size=20):
        """Open (or create) the collection and index the gene list if needed.

        Args:
            gene_list_path: Text file with one gene symbol per line.
            db_path: Directory of the persistent ChromaDB store.
            collection_name: Name of the ChromaDB collection to use.
            model_name: SentenceTransformer model used for all embeddings.
            chunk_size: Number of genes per indexed chunk (only used when the
                collection is empty and has to be built).
        """
        self.embed_model = SentenceTransformer(model_name)
        self.client = chromadb.PersistentClient(path=db_path)
        self.collection = self.client.get_or_create_collection(name=collection_name)
        self.gene_list_path = gene_list_path

        if self.collection.count() == 0:
            print("Indexing gene list...")
            self._index_gene_list(self.gene_list_path, chunk_size=chunk_size)
        else:
            print(f"Gene list already indexed: {self.collection.count()} genes chunks found.")

    def _index_gene_list(self, file_path, chunk_size=20):
        """Read the gene list, chunk it, embed the chunks and store them.

        Args:
            file_path: Text file with one gene symbol per line. Blank lines are
                skipped and surrounding whitespace is removed from each symbol.
            chunk_size: Number of genes per chunk; must be >= 1.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")

        # Strip every symbol so padded lines ("KRAS ") are stored clean;
        # whitespace-only lines are dropped.
        with open(file_path, 'r') as f:
            genes = [line.strip() for line in f if line.strip()]

        # Consecutive groups of `chunk_size` genes, in file order.
        gene_chunks = [genes[i:i + chunk_size] for i in range(0, len(genes), chunk_size)]

        # Embed all chunks in one batch and store them with a single add().
        # Skipped for an empty gene list so add() is never called with no records.
        if gene_chunks:
            chunk_texts = ['\n'.join(chunk) for chunk in gene_chunks]
            embeddings = self.embed_model.encode(chunk_texts).tolist()
            self.collection.add(
                documents=chunk_texts,
                embeddings=embeddings,
                ids=[f"gene_chunk_{i}" for i in range(len(chunk_texts))],
            )
        print(f"Indexed {len(gene_chunks)} gene chunks.")

    def retrieve_relevant_genes(self, trial_criteria, n_results=5):
        """Return the genes contained in the chunks nearest to the query text.

        Args:
            trial_criteria: Free-text query (e.g. trial eligibility criteria).
            n_results: Maximum number of chunks to retrieve; must be >= 1.

        Returns:
            A flat list of gene symbols, chunk by chunk in the order the
            vector store returned them. Empty if the collection is empty.

        Raises:
            ValueError: If ``n_results`` is smaller than 1.
        """
        if n_results < 1:
            raise ValueError("n_results must be a positive integer")

        # Never ask for more chunks than exist; nothing to query when empty.
        available = self.collection.count()
        if available == 0:
            print("Retrieved 0 relevant gene chunks.")
            return []

        # Embed the query with the same model used for indexing.
        query_embedding = self.embed_model.encode(trial_criteria).tolist()

        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=min(n_results, available),
        )
        # One query was sent, so the matching documents are in results['documents'][0].
        matched_chunks = results['documents'][0]
        print(f"Retrieved {len(matched_chunks)} relevant gene chunks.")

        retrieved_genes = []
        for doc in matched_chunks:
            retrieved_genes.extend(doc.split('\n'))
        return retrieved_genes


In [61]:
# Trial identifier; passed as the request ID to send_ai_request in a later cell.
nct_id = "NCT05410145"
# Free-text eligibility criteria of the trial. Used both as the retrieval query
# for the gene-list RAG and as input to the genomic-criteria prompt.
eligibilityCriteria = "Inclusion:\n\n* Subject must have a histologically or cytologically confirmed metastatic or locally advanced solid tumor which is progressing.\n* Subject must have documented KRAS p.G12C mutation identified within the last 5 years by a local test on tumor tissue or blood.\n* Subject must have measurable disease per RECIST v1.1.\n* Subject must have Eastern Cooperative Oncology Group (ECOG) performance status of 0 or 1.\n* Subject must have adequate organ and marrow function within the screening period.\n\nExclusion:\n\n* Subject has any prior treatment with other treatments without adequate washout periods as defined in the protocol.\n* Subject has uncontrolled intercurrent illness, including but not limited to, ongoing or active infection, uncontrolled or significant cardiovascular disease, serious chronic gastrointestinal conditions associated with diarrhea, or psychiatric illness/social situations that would limit compliance with study requirements, substantially increase risk of incurring AEs, or compromise the ability of the subject to give written informed consent.\n* Subject has unresolved treatment-related toxicities from previous anticancer therapy of NCI CTCAE Grade \u22652 (with exception of vitiligo or alopecia).\n* Subject has active gastrointestinal disease or other that could interfere significantly with the absorption, distribution, metabolism, or excretion of oral therapy.\n* Concurrent participation in any clinical research study involving treatment with any investigational drug, radiotherapy, or surgery, except for the nontreatment phases of these studies (e.g., follow-up phase).\n\nOther protocol inclusion/exclusion criteria may apply"

In [62]:

# Build the retriever with its defaults (it reuses the on-disk index when one
# already exists) and fetch the gene chunks closest to the eligibility text.
# NOTE(review): in the recorded output of this cell the retrieved subset does
# not contain KRAS, although the criteria explicitly name KRAS p.G12C —
# verify that the retrieval step surfaces the genes the trial mentions.
rag_system = GeneListRag()
relevant_genes_list = rag_system.retrieve_relevant_genes(eligibilityCriteria)
print(f'Retrieved gene subset: {relevant_genes_list}')

Gene list already indexed: 38 genes chunks found.
Retrieved 5 relevant gene chunks.
Retrieved gene subset: ['CSMD3', 'CTCF', 'CTNNA1', 'CTNNA2', 'CTNNB1', 'CTNND1', 'CTNND2', 'CUL3', 'CUX1', 'CXCR4', 'CYLD', 'CYP2C8', 'CYSLTR2', 'DAXX', 'DCAF12L2', 'DCC', 'DCTN1', 'DDB2', 'DDIT3', 'DDR2', 'CCNC', 'CCND1', 'CCND2', 'CCND3', 'CCNE1', 'CCR4', 'CCR7', 'CD209', 'CD274', 'CD28', 'CD74', 'CD79A', 'CD79B', 'CDC73', 'CDH1', 'CDH10', 'CDH11', 'CDH17', 'CDK12', 'CDK4', 'BTG2', 'BTK', 'BUB1B', 'C15orf65', 'CACNA1D', 'CALR', 'CAMTA1', 'CANT1', 'CARD11', 'CARS', 'CASP3', 'CASP8', 'CASP9', 'CBFA2T3', 'CBFB', 'CBL', 'CBLB', 'CBLC', 'CCDC6', 'CCNB1IP1', 'TNFRSF17', 'TNRC18', 'TOP1', 'TP53', 'TP63', 'TPM3', 'TPM4', 'TPR', 'TRA', 'TRAF7', 'TRB', 'TRD', 'TRIM24', 'TRIM27', 'TRIM33', 'TRIP11', 'TRRAP', 'TSC1', 'TSC2', 'TSHR', 'ATF1', 'ATIC', 'ATM', 'ATP1A1', 'ATP2B3', 'ATR', 'ATRX', 'AXIN1', 'AXIN2', 'B2M', 'BAP1', 'BARD1', 'BAX', 'BAZ1A', 'BCL10', 'BCL11A', 'BCL11B', 'BCL2', 'BCL2L12', 'BCL3']


In [63]:

import sys
import os
import yaml
# Make the parent directory importable so the project-local `utils` package resolves.
sys.path.append(os.path.abspath('../'))
from utils.ai_helper import send_ai_request, parse_ai_response, get_genomic_criteria_prompt

# Build the prompt (and the JSON schema for the answer) from the retrieved gene
# subset and the eligibility text, send it to the model, then parse the reply.
# NOTE(review): the helpers live in utils.ai_helper and are not shown here;
# presumably parse_ai_response returns a plain dict/list structure — confirm.
# NOTE(review): the recorded output lists many retrieved genes as '!Mutation'
# although the eligibility text only names KRAS — confirm this is intended.
json_schema, prompt = get_genomic_criteria_prompt(relevant_genes_list, eligibilityCriteria)
ai_response = send_ai_request(nct_id, prompt, json_schema)
genomic_criteria_dict = parse_ai_response(ai_response)

# sort_keys=False keeps the keys in the order of the parsed structure.
yaml_data = yaml.dump(genomic_criteria_dict, sort_keys=False)
print(yaml_data)


[32m2026-01-05 20:51:15.035[0m | [34m[1mDEBUG   [0m | [36mutils.ai_helper[0m:[36msend_ai_request[0m:[36m112[0m - [34m[1mAI request | ID:NCT05410145 | {"model": "hf.co/unsloth/gemma-3-27b-it-GGUF:Q4_K_M", "messages": [{"role": "system", "content": "You are a biomedical researcher specializing in cancer genomics and clinical trials."}, {"role": "user", "content": "\n    Task: Evaluate the clinical trial description against the provided gene list and variant categories to determine whether any mutations in the listed genes are included or excluded based on the eligibility criteria.\n\n    Instructions:\n    1. Identify if the clinical trial description mentions mutations in the given genes (inclusion) or specifies exclusions.\n    2. Use the variant categories:\n        Mutation\n        Copy Number Variation\n        Structural Variation\n        Any Variation\n        !Mutation\n        !Copy Number Variation\n        !Structural Variation\n    \n    Logic:\n    1. If the g

http://localhost:11434/api/chat


[32m2026-01-05 20:52:58.832[0m | [34m[1mDEBUG   [0m | [36mutils.ai_helper[0m:[36msend_ai_request[0m:[36m122[0m - [34m[1mAI response | ID:NCT05410145 | {'model': 'hf.co/unsloth/gemma-3-27b-it-GGUF:Q4_K_M', 'created_at': '2026-01-05T12:52:58.830086909Z', 'message': {'role': 'assistant', 'content': '{\n    "and": [\n        {\n            "or": [\n                {\n                    "genomic": {\n                        "hugo_symbol": "KRAS",\n                        "variant_category": "Mutation"\n                    }\n                }\n            ]\n        },\n        {\n            "and": [\n                {\n                    "genomic": {\n                        "hugo_symbol": "CSMD3",\n                        "variant_category": "!Mutation"\n                    }\n                },\n                {\n                    "genomic": {\n                        "hugo_symbol": "CTCF",\n                        "variant_category": "!Mutation"\n                    }

<Response [200]>
200
and:
- or:
  - genomic:
      hugo_symbol: KRAS
      variant_category: Mutation
- and:
  - genomic:
      hugo_symbol: CSMD3
      variant_category: '!Mutation'
  - genomic:
      hugo_symbol: CTCF
      variant_category: '!Mutation'
  - genomic:
      hugo_symbol: CTNNA1
      variant_category: '!Mutation'
  - genomic:
      hugo_symbol: CTNNA2
      variant_category: '!Mutation'
  - genomic:
      hugo_symbol: CTNNB1
      variant_category: '!Mutation'
  - genomic:
      hugo_symbol: CTNND1
      variant_category: '!Mutation'
  - genomic:
      hugo_symbol: CTNND2
      variant_category: '!Mutation'
  - genomic:
      hugo_symbol: CUL3
      variant_category: '!Mutation'
  - genomic:
      hugo_symbol: CUX1
      variant_category: '!Mutation'
  - genomic:
      hugo_symbol: CXCR4
      variant_category: '!Mutation'
  - genomic:
      hugo_symbol: CYLD
      variant_category: '!Mutation'
  - genomic:
      hugo_symbol: CYP2C8
      variant_category: '!Mutation'
 