In [None]:
!pip install llama-index-core llama-index-embeddings-huggingface llama-index-vector-stores-chroma chromadb

In [122]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.schema import TextNode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
from llama_index.core import Settings

class LlamaIndexGeneRAG:
    """Retrieve gene symbols relevant to a free-text query from a Chroma-backed LlamaIndex vector index.

    Gene records are embedded with the ``all-MiniLM-L6-v2`` HuggingFace model and
    stored in a persistent Chroma collection. No LLM is used: the query engine is
    configured for retrieval only and returns the top 10 most similar nodes.
    """

    # Name of the Chroma collection that holds the gene embeddings.
    COLLECTION_NAME = 'gene_collection'

    def __init__(self, reset_db=False, db_path='./gene_db_5', device="cuda"):
        """Set up the embedding model, the Chroma client and the query engine.

        Args:
            reset_db: When True, rebuild the collection from the gene CSV
                (re-embedding every gene). When False, open the existing collection.
            db_path: Directory of the persistent Chroma database.
            device: Device string handed to the HuggingFace embedding model.

        Raises:
            RuntimeError: If ``reset_db`` is False and the existing index cannot be loaded.
        """
        self.embed_model = HuggingFaceEmbedding(
            model_name="all-MiniLM-L6-v2",
            device=device,
        )

        # If no LLM is set, LlamaIndex looks for an OpenAI API key; disable it explicitly.
        Settings.llm = None

        # Initialize Chroma vector store
        self.chroma_client = chromadb.PersistentClient(path=db_path)

        if reset_db:
            # FRESH START: create new nodes and index
            self.nodes = self._create_gene_nodes_from_gene_csv()
            self._create_fresh_index()
        else:
            # USE EXISTING: load index without creating nodes
            self._load_existing_index()

        self.query_engine = self.index.as_query_engine(
            similarity_top_k=10,
            response_mode="no_text",  # Just retrieval
        )

        print("LlamaIndex RAG initialized")

    def _create_fresh_index(self):
        """Drop any existing collection and build a new index from ``self.nodes``.

        Building the index generates an embedding for every node.
        """
        print("Creating fresh database...")

        # Delete old collection if it exists. Only Exception is caught so that
        # KeyboardInterrupt/SystemExit still propagate; if the delete failed for
        # another reason, create_collection below will surface the problem.
        try:
            self.chroma_client.delete_collection(self.COLLECTION_NAME)
            print("Deleted old collection")
        except Exception as exc:
            print(f"No old collection deleted ({exc})")

        # Create new collection
        chroma_collection = self.chroma_client.create_collection(self.COLLECTION_NAME)

        # Create vector store
        vector_store = ChromaVectorStore(
            chroma_client=self.chroma_client,
            chroma_collection=chroma_collection,
        )
        storage_context = StorageContext.from_defaults(vector_store=vector_store)

        # Create index with nodes (this will generate embeddings)
        self.index = VectorStoreIndex(
            nodes=self.nodes,
            storage_context=storage_context,
            embed_model=self.embed_model,
            show_progress=True,
        )

        print(f"Created fresh index with {len(self.nodes)} gene embeddings")

    def _load_existing_index(self):
        """Open the existing collection and wrap it in an index without re-embedding.

        Raises:
            RuntimeError: If the collection cannot be opened or the index cannot be
                built. Raising here (rather than only printing) prevents the
                constructor from failing later with a confusing AttributeError
                on ``self.index``.
        """
        print("Loading existing database...")

        try:
            # Get existing collection
            chroma_collection = self.chroma_client.get_collection(self.COLLECTION_NAME)
            count = chroma_collection.count()
            print(f"Found existing collection with {count} embeddings")

            # Create vector store from existing collection
            vector_store = ChromaVectorStore(
                chroma_client=self.chroma_client,
                chroma_collection=chroma_collection,
            )

            # Load index from vector store (NO new embeddings)
            self.index = VectorStoreIndex.from_vector_store(
                vector_store,
                embed_model=self.embed_model,
            )
        except Exception as e:
            raise RuntimeError(
                f"Error loading existing index: {e}. "
                "Build the database first with LlamaIndexGeneRAG(reset_db=True)."
            ) from e

        print("Successfully loaded existing index")

    # Create nodes with just gene name
    def _create_gene_nodes_from_gene_list(self, gene_list_path = '../../ref/genes.txt'):
        """Create one node per non-blank line of a plain-text gene list.

        Args:
            gene_list_path: Path to a text file with one gene name per line.

        Returns:
            List of TextNode objects whose text is ``"Gene: <name>"``, with the
            name stored under the ``gene`` metadata key and ``gene_<name>`` as id.
        """
        with open(gene_list_path, 'r') as f:
            genes = [gene.strip() for gene in f if gene.strip()]

        # Single gene per node (like chunk_size=1)
        nodes = [
            TextNode(
                text=f"Gene: {gene}",  # Text that gets embedded
                metadata={"gene": gene},  # Metadata for retrieval
                id_=f"gene_{gene}",  # Unique ID
            )
            for gene in genes
        ]

        print(f'Created {len(nodes)} nodes for gene list.')
        return nodes

    # Create nodes with gene name and extra information
    def _create_gene_nodes_from_gene_csv(self, gene_csv_path='../../ref/Census_gene_list.csv'):
        """Create one enriched node per row of the gene CSV.

        The CSV must provide the columns ``Gene Symbol``, ``Synonyms``,
        ``Tumour Types(Somatic)`` and ``Role in Cancer``. Missing values are
        rendered as ``'None'`` in the node text and as ``""`` in the metadata,
        so a literal ``nan`` is never embedded or stored.

        Args:
            gene_csv_path: Path to the gene CSV file.

        Returns:
            List of TextNode objects with ``gene_<symbol>`` as id.
        """
        import pandas as pd

        def value_or(value, default):
            # pandas represents empty CSV cells as NaN; substitute a readable default.
            return value if pd.notna(value) else default

        gene_df = pd.read_csv(gene_csv_path)

        nodes = []

        for _, row in gene_df.iterrows():
            gene_symbol = row['Gene Symbol']
            synonyms = row['Synonyms']
            tumour_types = row['Tumour Types(Somatic)']

            # The actual content that gets embedded (turned into vector)
            # Used for semantic similarity search
            text_content = f""" 
            Gene: {gene_symbol}
            Synonyms: {value_or(synonyms, 'None')}
            Associated Cancers: {value_or(tumour_types, 'None')}
            """

            node = TextNode(
                text=text_content,
                # Structured data used for filtering and display.
                # NOTE(review): LlamaIndex appears to include node metadata in the
                # embedded text by default unless keys are listed in
                # excluded_embed_metadata_keys -- confirm whether that is intended.
                metadata={
                    "gene": gene_symbol,
                    "synonyms": value_or(synonyms, ""),
                    "tumour_types": value_or(tumour_types, ""),
                    "role": value_or(row['Role in Cancer'], ""),
                },
                # The keyword is `id_` (as in the gene-list builder); `id` is not
                # the node-id field, so the deterministic id was not being applied.
                id_=f"gene_{gene_symbol}",
            )
            nodes.append(node)

        print(f'Created {len(nodes)} nodes with enriched gene information.')
        if nodes:  # an empty CSV has no example to show
            print(f'Example node text:\n{nodes[0].text}')
        return nodes

    def retrieve_genes(self, query):
        """Return the gene symbols of the nodes retrieved for ``query``.

        Args:
            query: Free-text query, e.g. a trial's eligibility criteria.

        Returns:
            List of the ``gene`` metadata values of the retrieved source nodes,
            in the order the query engine returned them.
        """
        response = self.query_engine.query(query)

        # Extract genes from node metadata
        return [node.metadata['gene'] for node in response.source_nodes]

In [None]:
import os
import shutil

# Delete a previously persisted database directory.
# NOTE(review): this removes './gene_db_4', while LlamaIndexGeneRAG persists to
# './gene_db_5' -- confirm which directory is meant to be cleaned up.
db_path = './gene_db_4'
if os.path.isdir(db_path):
    shutil.rmtree(db_path)
else:
    # rmtree raises FileNotFoundError on a missing path, which would stop a full re-run.
    print(f"Nothing to delete: no directory at {db_path}")

In [108]:
# Example trial 1: eligibility text whose inclusion criteria require a documented KRAS p.G12C mutation.
# Both names are reassigned by the next cell, so whichever of the two cells runs last
# determines the input used for retrieval further down.
nct_id = "NCT05410145"
# eligibilityCriteria = "Subject must have documented KRAS p.G12C mutation"
eligibilityCriteria = "Inclusion:\n\n* Subject must have a histologically or cytologically confirmed metastatic or locally advanced solid tumor which is progressing.\n* Subject must have documented KRAS p.G12C mutation identified within the last 5 years by a local test on tumor tissue or blood.\n* Subject must have measurable disease per RECIST v1.1.\n* Subject must have Eastern Cooperative Oncology Group (ECOG) performance status of 0 or 1.\n* Subject must have adequate organ and marrow function within the screening period.\n\nExclusion:\n\n* Subject has any prior treatment with other treatments without adequate washout periods as defined in the protocol.\n* Subject has uncontrolled intercurrent illness, including but not limited to, ongoing or active infection, uncontrolled or significant cardiovascular disease, serious chronic gastrointestinal conditions associated with diarrhea, or psychiatric illness/social situations that would limit compliance with study requirements, substantially increase risk of incurring AEs, or compromise the ability of the subject to give written informed consent.\n* Subject has unresolved treatment-related toxicities from previous anticancer therapy of NCI CTCAE Grade \u22652 (with exception of vitiligo or alopecia).\n* Subject has active gastrointestinal disease or other that could interfere significantly with the absorption, distribution, metabolism, or excretion of oral therapy.\n* Concurrent participation in any clinical research study involving treatment with any investigational drug, radiotherapy, or surgery, except for the nontreatment phases of these studies (e.g., follow-up phase).\n\nOther protocol inclusion/exclusion criteria may apply"

In [120]:
# Example trial 2: eligibility text that names ALK, ROS1, NTRK1, NTRK2 and NTRK3 gene rearrangements.
# Running this cell overwrites the nct_id / eligibilityCriteria set by the previous cell.
nct_id = "NCT03093116"
eligibilityCriteria = "Histologically or cytologically confirmed diagnosis of locally advanced, or metastatic solid tumor (including primary CNS tumors) (Stage IV, American Joint Committee on Cancer v.7) that harbors an ALK, ROS1, NTRK1, NTRK2, or NTRK3 gene rearrangement by protocol specified tests."

In [123]:
# Open the persisted gene index (reset_db=False loads the existing collection instead of
# rebuilding it), then fetch the genes most similar to the current eligibility text.
rag_system = LlamaIndexGeneRAG(
    reset_db=False,
)
relevant_genes_list = rag_system.retrieve_genes(
    eligibilityCriteria,
)
print('Retrieved gene subset: {}'.format(relevant_genes_list))

LLM is explicitly disabled. Using MockLLM.
Loading existing database...
Found existing collection with 753 embeddings
Successfully loaded existing index
LlamaIndex RAG initialized
Retrieved gene subset: ['ALK', 'RNF213', 'NTRK2', 'SH3GL1', 'IKBKB', 'NTRK3', 'CDK4', 'MSN', 'NTHL1', 'PIK3CB']


In [None]:

import os
import sys

import yaml

# Make the project root (two levels up) importable so that `utils` resolves.
# The membership check keeps re-runs of this cell from appending duplicate entries.
PROJECT_ROOT = os.path.abspath('../../')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from utils.ai_helper import send_ai_request, parse_ai_response, get_genomic_criteria_prompt

# Build the prompt and JSON schema from the retrieved gene subset and the eligibility text,
# send the request for the current trial, and parse the reply.
json_schema, prompt = get_genomic_criteria_prompt(relevant_genes_list, eligibilityCriteria)
ai_response = send_ai_request(nct_id, prompt, json_schema)
genomic_criteria_dict = parse_ai_response(ai_response)

# sort_keys=False keeps the keys in the order the parsed result provides them.
yaml_data = yaml.dump(genomic_criteria_dict, sort_keys=False)
print(yaml_data)
