In [None]:
!pip install chromadb sentence-transformers

In [17]:
from sentence_transformers import SentenceTransformer
import chromadb
import json

class GeneListRag:
    """Retrieve the gene symbols most relevant to a piece of trial text.

    The gene list file is split into fixed-size chunks; each chunk is embedded
    with a SentenceTransformer model and stored in a persistent ChromaDB
    collection. A query embeds the trial text and returns the genes contained
    in the nearest chunks.
    """

    def __init__(self, gene_list_path = '../ref/genes.txt', db_path='./gene_db',
                 collection_name='genes', model_name='all-MiniLM-L6-v2'):
        """Open (or create) the collection and index the gene list if it is empty.

        Args:
            gene_list_path: Text file with one gene symbol per line.
            db_path: Directory used by the persistent ChromaDB client.
            collection_name: Name of the ChromaDB collection holding the chunks.
            model_name: SentenceTransformer model used for all embeddings.
        """
        self.embed_model = SentenceTransformer(model_name)
        self.client = chromadb.PersistentClient(path=db_path)
        self.collection = self.client.get_or_create_collection(name=collection_name)
        self.gene_list_path = gene_list_path

        # Count once: the value is used both for the decision and the message.
        indexed_chunks = self.collection.count()
        if indexed_chunks == 0:
            print("Indexing gene list...")
            self._index_gene_list(self.gene_list_path)
        else:
            print(f"Gene list already indexed: {indexed_chunks} genes chunks found.")

    def _index_gene_list(self, file_path, chunk_size=200):
        """Read the gene list, chunk it, and store one embedding per chunk.

        Args:
            file_path: Text file with one gene symbol per line; blank lines are
                ignored and surrounding whitespace is removed from each symbol.
            chunk_size: Number of gene symbols per stored chunk.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")

        # Strip every symbol so trailing spaces/tabs do not end up in the
        # stored documents (and later in the returned gene names).
        with open(file_path, 'r', encoding='utf-8') as f:
            genes = [line.strip() for line in f if line.strip()]

        gene_chunks = [genes[i:i + chunk_size] for i in range(0, len(genes), chunk_size)]

        # NOTE(review): all-MiniLM-L6-v2 is documented to truncate long inputs
        # (256 word pieces) -- confirm that a whole chunk fits, otherwise the
        # embedding only reflects the first part of the chunk.
        for i, chunk in enumerate(gene_chunks):
            chunk_text = '\n'.join(chunk)
            embedding = self.embed_model.encode(chunk_text).tolist()
            self.collection.add(
                documents=[chunk_text],
                embeddings=[embedding],
                ids=[f"gene_chunk_{i}"]
            )
        print(f"Indexed {len(gene_chunks)} gene chunks.")

    def retrieve_relevant_genes(self, trial_criteria, n_results=5):
        """Return the gene symbols of the chunks closest to ``trial_criteria``.

        Args:
            trial_criteria: Free text (e.g. eligibility criteria) to embed and search with.
            n_results: Maximum number of chunks to retrieve.

        Returns:
            A flat list of gene symbols, in the order the chunks are returned
            by the collection. Empty if nothing has been indexed.

        Raises:
            ValueError: If ``n_results`` is smaller than 1.
        """
        if n_results < 1:
            raise ValueError("n_results must be a positive integer")

        available_chunks = self.collection.count()
        if available_chunks == 0:
            return []

        query_embedding = self.embed_model.encode(trial_criteria).tolist()

        # Never ask for more neighbours than there are stored chunks.
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=min(n_results, available_chunks)
        )

        retrieved_genes = []
        for doc in results['documents'][0]:
            retrieved_genes.extend(doc.split('\n'))
        return retrieved_genes


In [18]:
def get_genomic_criteria_prompt(relevant_genes, eligibilityCriteria):
    """Build the LLM prompt that maps trial eligibility text to genomic criteria.

    Args:
        relevant_genes: Gene symbols to offer to the model. A list/iterable is
            rendered as a comma-separated line; a string is inserted as given;
            ``None`` is rendered as an empty gene list.
        eligibilityCriteria: Free-text trial description / eligibility criteria.

    Returns:
        A ``(json_schema, prompt)`` tuple. ``json_schema`` is always ``None``;
        ``prompt`` is the fully formatted prompt string.
    """
    # Render the symbols as plain comma-separated text rather than a Python
    # list repr: the quotes and brackets only add tokens to an already long prompt.
    if relevant_genes is None:
        gene_list_text = ''
    elif isinstance(relevant_genes, str):
        gene_list_text = relevant_genes
    else:
        gene_list_text = ', '.join(str(gene) for gene in relevant_genes)

    # Doubled braces below are f-string escapes and render as literal JSON braces.
    prompt = f"""
    Task: Evaluate the clinical trial description against the provided gene list and variant categories to determine whether any mutations in the listed genes are included or excluded based on the eligibility criteria.

    Instructions:
    1. Identify if the clinical trial description mentions mutations in the given genes (inclusion) or specifies exclusions.
    2. Use the variant categories:
        Mutation
        Copy Number Variation
        Structural Variation
        Any Variation
        !Mutation
        !Copy Number Variation
        !Structural Variation
    
    Logic:
    1. If the genes are mentioned in trial's inclusion criteria, use the 'or' operator along with appropriate variant categories.
    2. If the genes are mentioned in trial's exclusion criteria, use the 'and' operator along with variant categories (!Mutation/!Copy Number Variation/!Structural Variation).
    3. If applicable, combine inclusion and exclusion with a top level 'and' operator.
    
    Gene list: {gene_list_text}

    Clinical Trial Description: {eligibilityCriteria}

    Example A:
    Inclusion criteria:Subjects with advanced solid tumors harboring ROS1 or NTRK1 rearrangement will be included in this trial. 
    Output:
{{
    "or": [        
        {{
            "genomic": {{
                "hugo_symbol": "ROS1",
                "variant_category": "Structural Variation"
            }}
        }},
        {{
            "genomic": {{
                "hugo_symbol": "NTRK1",
                "variant_category": "Structural Variation"
            }}
        }}
    ]
}}

Example B:
Inclusion criteria:Patient should have mutation in geneA or geneB.
Exclusion criteria: Patient should not have a mutation in geneC or geneD

Output:
{{
    "and":[
        {{
            "or":[
                {{"genomic": {{"hugo_symbol": "geneA","variant_category": "Mutation"}}}},
                {{"genomic": {{"hugo_symbol": "geneB","variant_category": "Mutation"}}}}
            ]
        }},
        {{
            "and":[
                {{"genomic": {{"hugo_symbol": "geneC","variant_category": "!Mutation"}}}},
                {{"genomic": {{"hugo_symbol": "geneD","variant_category": "!Mutation"}}}}
            ]

        }}
    ]
}}

    """
    return None, prompt

In [19]:
# Trial identifier forwarded with the AI request; left empty for this ad-hoc run.
nct_id = ""

# Sample eligibility text used to exercise the pipeline. Written as adjacent
# string literals (one per logical line) that concatenate to a single string.
eligibilityCriteria = (
    "Phase 1\n\n"
    "Oral repotrectinib (TPX-0005):\n\n"
    "Phase 1a dose escalation, Phase 1b food-effect sub-study, and Phase 1c dose escalation with food, and Midazolam drug-drug interaction sub-study.\n\n"
    "Phase 2\n\n"
    "Oral repotrectinib (TPX-0005): 6 distinct expansion cohorts\n\n"
    "* EXP-1: ROS1 TKI-naïve ROS1+ NSCLC\n"
    "* EXP-2: 1 Prior ROS1 TKI and 1 Platinum based chemo ROS1+ NSCLC\n"
    "* EXP-3: 2 Prior ROS1 TKIs ROS1+ NSCLC (No Chemo or IO)\n"
    "* EXP-4: 1 Prior ROS1 TKI ROS1+ NSCLC (No Chemo or IO)\n"
    "* EXP-5: TRK TKI-naïve NTRK+ solid tumors\n"
    "* EXP-6: TRK TKI-pretreated NTRK+ solid tumors"
)


In [24]:

import os
import pprint
import sys

# Make the parent directory importable so that `utils.ai_helper` resolves.
# Guard the append: re-running this cell would otherwise keep adding the same
# entry to sys.path.
parent_dir = os.path.abspath('../')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from utils.ai_helper import send_ai_request, parse_ai_response

rag_system = GeneListRag()
# Despite the `_str` suffix, retrieve_relevant_genes returns a list of gene symbols.
relevant_genes_str = rag_system.retrieve_relevant_genes(eligibilityCriteria)
print(f'Retrieved gene subset: {relevant_genes_str}')

# json_schema is whatever the prompt builder returns alongside the prompt text.
json_schema, prompt = get_genomic_criteria_prompt(relevant_genes_str, eligibilityCriteria)
ai_response = send_ai_request(nct_id, prompt, json_schema)
genomic_criteria_dict = parse_ai_response(ai_response)

# pretty print the genomic criteria dict
pprint.pprint(genomic_criteria_dict)
        

[32m2026-01-02 20:29:41.830[0m | [34m[1mDEBUG   [0m | [36mutils.ai_helper[0m:[36msend_ai_request[0m:[36m112[0m - [34m[1mAI request | ID: | {"model": "hf.co/unsloth/gemma-3-27b-it-GGUF:Q4_K_M", "messages": [{"role": "system", "content": "You are a biomedical researcher specializing in cancer genomics and clinical trials."}, {"role": "user", "content": "\n    Task: Evaluate the clinical trial description against the provided gene list and variant categories to determine whether any mutations in the listed genes are included or excluded based on the eligibility criteria.\n\n    Instructions:\n    1. Identify if the clinical trial description mentions mutations in the given genes (inclusion) or specifies exclusions.\n    2. Use the variant categories:\n        Mutation\n        Copy Number Variation\n        Structural Variation\n        Any Variation\n        !Mutation\n        !Copy Number Variation\n        !Structural Variation\n\n    Logic:\n    1. If the genes are mentio

Gene list already indexed: 4 genes chunks found.
Retrieved gene subset: ['A1CF', 'ABI1', 'ABL1', 'ABL2', 'ACKR3', 'ACSL3', 'ACSL6', 'ACVR1', 'ACVR1B', 'ACVR2A', 'AFDN', 'AFF1', 'AFF3', 'AFF4', 'AKAP9', 'AKT1', 'AKT2', 'AKT3', 'ALDH2', 'ALK', 'AMER1', 'ANK1', 'APC', 'APOBEC3B', 'AR', 'ARAF', 'ARHGAP26', 'ARHGAP35', 'ARHGAP5', 'ARHGEF10', 'ARHGEF10L', 'ARHGEF12', 'ARID1A', 'ARID1B', 'ARID2', 'ARNT', 'ASPM', 'ASPSCR1', 'ASXL1', 'ASXL2', 'ATF1', 'ATIC', 'ATM', 'ATP1A1', 'ATP2B3', 'ATR', 'ATRX', 'AXIN1', 'AXIN2', 'B2M', 'BAP1', 'BARD1', 'BAX', 'BAZ1A', 'BCL10', 'BCL11A', 'BCL11B', 'BCL2', 'BCL2L12', 'BCL3', 'BCL6', 'BCL7A', 'BCL9', 'BCL9L', 'BCLAF1', 'BCOR', 'BCORL1', 'BCR', 'BIRC3', 'BIRC6', 'BLM', 'BMP5', 'BMPR1A', 'BRAF', 'BRCA1', 'BRCA2', 'BRD3', 'BRD4', 'BRIP1', 'BTG1', 'BTG2', 'BTK', 'BUB1B', 'C15orf65', 'CACNA1D', 'CALR', 'CAMTA1', 'CANT1', 'CARD11', 'CARS', 'CASP3', 'CASP8', 'CASP9', 'CBFA2T3', 'CBFB', 'CBL', 'CBLB', 'CBLC', 'CCDC6', 'CCNB1IP1', 'CCNC', 'CCND1', 'CCND2', 'CCND3', 'C

[32m2026-01-02 20:29:47.498[0m | [34m[1mDEBUG   [0m | [36mutils.ai_helper[0m:[36msend_ai_request[0m:[36m122[0m - [34m[1mAI response | ID: | {'model': 'hf.co/unsloth/gemma-3-27b-it-GGUF:Q4_K_M', 'created_at': '2026-01-02T12:29:47.496890941Z', 'message': {'role': 'assistant', 'content': '{"or": [\n    {\n        "genomic": {\n            "hugo_symbol": "ROS1",\n            "variant_category": "Structural Variation"\n        }\n    },\n    {\n        "genomic": {\n            "hugo_symbol": "NTRK1",\n            "variant_category": "Structural Variation"\n        }\n    }\n]}'}, 'done': True, 'done_reason': 'stop', 'total_duration': 5663094117, 'load_duration': 311912864, 'prompt_eval_count': 4096, 'prompt_eval_duration': 2878226827, 'eval_count': 87, 'eval_duration': 2199265013}[0m


<Response [200]>
200
{'or': [{'genomic': {'hugo_symbol': 'ROS1',
                     'variant_category': 'Structural Variation'}},
        {'genomic': {'hugo_symbol': 'NTRK1',
                     'variant_category': 'Structural Variation'}}]}
