In [None]:
!pip install llama-index
!pip install llama-index-embeddings-huggingface
!pip install llama-index-llms-groq
!pip install networkx
!pip install "transformers[torch]" "huggingface_hub[inference]"


Collecting llama-index-embeddings-huggingface
  Downloading llama_index_embeddings_huggingface-0.5.1-py3-none-any.whl.metadata (767 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=2.6.1->llama-index-embeddings-huggingface)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=2.6.1->llama-index-embeddings-huggingface)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=2.6.1->llama-index-embeddings-huggingface)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=2.6.1->llama-index-embeddings-huggingface)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3

In [None]:
pip install llama-index --upgrade --no-cache-dir --force-reinstall


Collecting llama-index
  Downloading llama_index-0.12.15-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-agent-openai<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_agent_openai-0.4.3-py3-none-any.whl.metadata (727 bytes)
Collecting llama-index-cli<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_cli-0.4.0-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.13.0,>=0.12.15 (from llama-index)
  Downloading llama_index_core-0.12.15-py3-none-any.whl.metadata (2.5 kB)
Collecting llama-index-embeddings-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.6.4-py3-none-any.whl.metadata (3.6 kB)
Collecting llama-index-llms-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_llms_openai-0.3.17-py3-none-any.whl.metadata (3.3 kB)
Collec

In [None]:
import os

import pandas as pd
import networkx as nx

# LlamaIndex components
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.groq import Groq
from llama_index.core import VectorStoreIndex, Document
from llama_index.core import Settings

# -------------------------------
# Step 1. Load and Prepare the Data
# -------------------------------
# Pipe-delimited input files:
#   • graphb.csv:  columns: node_1, node_2, edge, chunk_id
#   • colorsb.csv: columns: node, color, group
#   • chunksb.csv: columns: chunk_id, text

edges_df = pd.read_csv("graphb.csv", delimiter='|')
nodes_df = pd.read_csv("colorsb.csv", delimiter='|')
text_df = pd.read_csv("chunksb.csv", delimiter='|')

# Chunk ids are normalised to strings everywhere (mapping keys, edge attributes,
# document metadata) so lookups between the three structures always agree.
chunk_ids = text_df["chunk_id"].astype(str)

# Mapping from chunk_id to text (for quick lookup later).
chunk_id_to_text = dict(zip(chunk_ids, text_df["text"]))

# -------------------------------
# Step 2. Build the Knowledge Graph
# -------------------------------
# Undirected graph: one edge per row, carrying the relation label and the id of
# the text chunk the relation was extracted from.
G = nx.Graph()

for node_1, node_2, relation, chunk_id in zip(
    edges_df["node_1"],
    edges_df["node_2"],
    edges_df["edge"],
    edges_df["chunk_id"].astype(str),
):
    G.add_edge(node_1, node_2, relation=relation, chunk_id=chunk_id)

# Enrich nodes with colour/group attributes. `add_node` creates the node when it
# is missing and updates the attributes when it already exists, so no membership
# check is needed.
for node, color, group in zip(nodes_df["node"], nodes_df["color"], nodes_df["group"]):
    G.add_node(node, color=color, group=group)

# -------------------------------
# Step 3. Build the LlamaIndex Knowledge Base
# -------------------------------
# In a graph RAG system the textual “chunks” (from chunksb.csv)
# provide the grounding context for generation.
documents = [
    Document(text=text, metadata={"chunk_id": chunk_id})
    for chunk_id, text in zip(chunk_ids, text_df["text"])
]

# Hugging Face embedding model used to embed both the chunks and the queries.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Build an in-memory vector index from the documents.
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

# -------------------------------
# Step 4. Initialize the Groq LLM
# -------------------------------
# The API key is read from the environment so it never ends up in the notebook.
if not os.environ.get("GROQ_API_KEY"):
    raise RuntimeError(
        "Set the GROQ_API_KEY environment variable before running this cell."
    )
llm = Groq(model="llama-3.3-70b-versatile", api_key=os.environ["GROQ_API_KEY"])

Settings.llm = llm  # Ensure any LlamaIndex calls use this LLM by default

In [None]:
# -------------------------------
# Step 5. Build a Graph-Enhanced RAG Query Engine (Modified)
# -------------------------------

def get_graph_context(query: str, graph: nx.Graph, chunk_mapping: dict) -> str:
    """
    Build a text block describing the graph neighbourhood of every node named in the query.

    A node counts as "named" when its lower-cased label is a substring of the
    lower-cased query. For each such node, every incident edge that carries
    attributes is reported, and the text chunk referenced by the edge's
    ``chunk_id`` is collected (once per distinct text).

    Args:
        query: The user's question.
        graph: Graph whose edges carry ``relation`` and ``chunk_id`` attributes.
        chunk_mapping: Maps chunk id -> chunk text.

    Returns:
        A string made of a "📊 Graph Relationships Found:" section followed by a
        "📄 Contextual Text:" section. Texts appear in the order their edges were
        first encountered, so the result is stable between runs.
    """
    query_lower = query.lower()
    relationships = []
    # A dict is used as an insertion-ordered set: deduplicates the texts while
    # keeping a deterministic order (a plain set would reorder them per run).
    relevant_texts = {}

    for node in graph.nodes():
        # Skip NaN labels (NaN != NaN) such as missing CSV cells.
        if node != node:
            continue
        # Labels are coerced to str so numeric node ids do not crash `.lower()`.
        node_label = str(node).lower()
        # A blank label is a substring of every query and would match everything.
        if not node_label.strip() or node_label not in query_lower:
            continue

        for neighbor in graph.neighbors(node):
            edge_data = graph.get_edge_data(node, neighbor)
            if not edge_data:
                continue

            relationships.append({
                'source': node,
                'target': neighbor,
                'relation': edge_data.get('relation', 'unknown'),
                'chunk_id': edge_data.get('chunk_id', '')
            })

            # Collect the text chunk this relation was extracted from.
            chunk_id = edge_data.get("chunk_id")
            if chunk_id and chunk_id in chunk_mapping:
                text = chunk_mapping[chunk_id]
                # Non-string entries (e.g. NaN for an empty cell) cannot be joined.
                if isinstance(text, str):
                    relevant_texts.setdefault(text, None)

    # Render one tree-style entry per relationship.
    parts = ["📊 Graph Relationships Found:\n"]
    for rel in relationships:
        # str() guards against non-string relation labels read from the CSV.
        relation_label = str(rel['relation']).title().replace('_', ' ')
        parts.append(
            f"├─ {rel['source']} \n"
            f"│  ├─ Relation: {relation_label}\n"
            f"│  ├─ Connected to: {rel['target']}\n"
            f"│  └─ Chunk ID: {rel['chunk_id']}\n"
            f"╰───────────────────────────────\n"
        )
    relationship_str = "".join(parts)

    return relationship_str + "\n📄 Contextual Text:\n" + "\n".join(relevant_texts)


class GraphRAGQueryEngine:
    """
    Question-answering helper that merges knowledge-graph context with vector retrieval.

    For each question it prints the graph relationships and the vector engine's
    result, then asks the LLM for a final answer grounded in both.
    """
    def __init__(self, index, graph, chunk_mapping, llm):
        """Store the collaborators and derive a vector query engine from the index."""
        self.index, self.graph = index, graph
        self.chunk_mapping, self.llm = chunk_mapping, llm
        # The retrieval engine is built once and reused for every query.
        self.vector_engine = self.index.as_query_engine(llm=self.llm)

    def query(self, user_query: str) -> str:
        """
        Answer ``user_query`` and return the text of the LLM completion.

        Side effects: prints the relationship section of the graph context, the
        vector engine's result, and a "Final Answer" heading (the answer itself
        is returned, not printed).
        """
        context_from_graph = get_graph_context(user_query, self.graph, self.chunk_mapping)

        # Display only what precedes the contextual-text marker, i.e. the relationships.
        relationship_section, _, _ = context_from_graph.partition("📄 Contextual Text:")
        print("\n🔍 Graph Relationships Discovered:")
        print(relationship_section)

        # Ask the vector engine the same question and show what it produced.
        vector_answer = self.vector_engine.query(user_query)
        print("\n🔖 Retrieved Document Context:")
        print(vector_answer)

        # Final prompt: full graph context, then the retrieved answer, then the question.
        prompt = (
            "Use the following structured information to answer the question.\n\n"
            f"{context_from_graph}\n\nRetrieved Answer: {vector_answer}"
            f"\n\nQuestion: {user_query}\nAnswer:"
        )
        completion = self.llm.complete(prompt)

        print("\n💡 Final Answer:")
        return completion.text


In [None]:
!pip install instructor groq

Collecting instructor
  Downloading instructor-1.7.2-py3-none-any.whl.metadata (18 kB)
Collecting groq
  Downloading groq-0.17.0-py3-none-any.whl.metadata (14 kB)
Downloading instructor-1.7.2-py3-none-any.whl (71 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.4/71.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading groq-0.17.0-py3-none-any.whl (109 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.8/109.8 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq, instructor
Successfully installed groq-0.17.0 instructor-1.7.2


In [None]:
from typing import List, Dict
from pydantic import BaseModel
import instructor
from groq import Groq

def concept_segmentation(user_query: str) -> List[Dict[str, str]]:
    """
    Ask a Groq-hosted LLM to break a question into its core concepts.

    The model's reply is validated against a Pydantic schema via `instructor`.

    Args:
        user_query: The question to analyse.

    Returns:
        One dict per concept with the keys ``"concept"``, ``"summary"`` and
        ``"textbook"``.

    Raises:
        RuntimeError: If the ``GROQ_API_KEY`` environment variable is not set.
    """
    # Local imports: `os` is not imported by this cell, and the bare name `Groq`
    # is also bound to the LlamaIndex wrapper elsewhere in the notebook, so which
    # class it refers to would otherwise depend on cell execution order.
    import os
    from groq import Groq as GroqClient

    class ConceptDetail(BaseModel):
        concept: str
        summary: str  # one-sentence relevance summary
        textbook_reference: str  # suggested textbook chapter/section

    class ConceptList(BaseModel):
        concepts: List[ConceptDetail]

    # Never hardcode credentials in a notebook; read the key from the environment.
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the GROQ_API_KEY environment variable before calling concept_segmentation()."
        )

    client = instructor.from_groq(
        GroqClient(api_key=api_key),
        mode=instructor.Mode.JSON
    )

    system_prompt = """Analyze the question and identify core concepts with concise summaries. For each concept:
    1. Provide a short name (2-5 words)
    2. Give a 1-sentence summary explaining its relevance to the query
    3. Reference relevant textbook chapters/sections (e.g.: "cf. Goodfellow et al. Chap 6")
    Focus on fundamental concepts from authoritative sources in the field."""

    response = client.chat.completions.create(
        model="llama3-70b-8192",
        response_model=ConceptList,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_query}
        ],
        temperature=0.4,  # Lower temperature for more factual responses
        max_tokens=600,    # Leaves room for the per-concept summaries
    )

    return [{"concept": c.concept, "summary": c.summary, "textbook": c.textbook_reference}
            for c in response.concepts]

In [None]:
from typing import List, Dict

def format_concepts(concepts: List[Dict[str, str]]) -> str:
    """
    Render concepts as an ANSI-coloured bullet list.

    Each entry shows the concept name in bold cyan followed, on the next line,
    by its summary in white. The ``textbook`` key is not rendered. A falsy
    ``concepts`` argument yields a fixed "nothing found" message.
    """
    if not concepts:
        return "No concepts identified in the query."

    # One two-line entry per concept, separated by a blank line.
    entries = "\n\n".join(
        f"• \033[1;36m{item['concept']}\033[0m\n  \033[0;37m{item['summary']}\033[0m"
        for item in concepts
    )
    return "📚 Identified Concepts:\n\n" + entries

def print_concepts(concepts: List[Dict[str, str]]):
    """Write the output of ``format_concepts(concepts)`` to standard output."""
    rendered = format_concepts(concepts)
    print(rendered)

In [None]:
user_query = "explain human Respiratory system?"

# First show the concepts the LLM extracts from the question.
print_concepts(concept_segmentation(user_query))

# Then answer the question with the graph-enhanced engine built from the
# vector index, the knowledge graph, the chunk lookup table and the LLM.
query_engine = GraphRAGQueryEngine(index, G, chunk_id_to_text, llm)
response = query_engine.query(user_query)

# Two blank lines separate the engine's own printed trace from the answer.
print("\n\nResponse:", response)

📚 Identified Concepts:

• [1;36mNose and Mouth[0m
  [0;37mThe primary entrance points for air into the lungs, responsible for warming, humidifying, and filtering the air.[0m

• [1;36mPharynx[0m
  [0;37mA muscular tube that serves as a common passage for both food and air, directing air into the larynx.[0m

• [1;36mLarynx[0m
  [0;37mA cartilaginous structure containing the vocal cords, responsible for producing sound and preventing food from entering the lungs.[0m

• [1;36mTrachea[0m
  [0;37mA tube that divides into two primary bronchi, one for each lung, responsible for conducting air into the lungs.[0m

• [1;36mBronchi and Bronchioles[0m
  [0;37mA network of tubes that branch into smaller airways, eventually leading to the alveoli where gas exchange occurs.[0m

• [1;36mAlveoli[0m
  [0;37mTiny sacs where oxygen diffuses into the blood and carbon dioxide is removed, facilitating gas exchange.[0m

• [1;36mDiaphragm[0m
  [0;37mA dome-shaped muscle that separates