# Custom Knowledge Graph Pipeline Notebook
This notebook walks through building and querying a custom knowledge graph using Cognee. It mirrors the pipeline from `build_graph.py` but in an interactive Jupyter format.

Taken from: https://docs.cognee.ai/how-to-guides/cognee-sdk/define-custom-entities-low-level-api


In [None]:
# Notebook bootstrap: enable module auto-reload, clear the interactive
# namespace, then load environment variables before anything else runs.
%load_ext autoreload
%autoreload 2
%reset -f

import sys

from devtools import debug  # noqa: F401, F811
from dotenv import load_dotenv
from rich import print  # noqa: F401  (rebinds `print` to rich's version for the cells below)

# Fail fast when load_dotenv reports failure (falsy return value).
assert load_dotenv(verbose=True)

# Make modules located in the current working directory importable.
sys.path.append(".")

## 1. Install and Import Dependencies

In [None]:
import asyncio
import os
import uuid
from pathlib import Path

import cognee
from cognee import SearchType, config, prune, search, visualize_graph
from cognee.low_level import DataPoint, setup
from cognee.modules.users.methods import get_default_user
from cognee.pipelines import Task, run_tasks
from cognee.tasks.storage import add_data_points
from cognee.tasks.storage.index_graph_edges import index_graph_edges

# from genai_tk.extra.cognee_utils import set_embeddings_config, set_llm_config

## 2. Define DataPoint Classes

In [None]:
class Person(DataPoint):
    """Represents an individual employee.

    A DataPoint that carries only the employee's name. Instances are attached
    to a department through ``Department.employees``.
    """

    # Employee's full name, e.g. "Alice Johnson".
    name: str
    # presumably tells Cognee which fields to index for search — TODO confirm
    # against the DataPoint documentation
    metadata: dict = {"index_fields": ["name"]}


class Department(DataPoint):
    """Represents a company department with employees.

    Holds the department's name and the ``Person`` instances that belong to it.
    """

    # Department name, e.g. "Engineering".
    name: str
    # People assigned to this department; may be empty.
    employees: list[Person]
    # presumably tells Cognee which fields to index for search — TODO confirm
    # against the DataPoint documentation
    metadata: dict = {"index_fields": ["name"]}


class CompanyType(DataPoint):
    """Represents the type/category of companies.

    Referenced from ``Company.is_type``. Unlike the other models in this
    notebook it declares no ``metadata`` override.
    """

    # Category label; defaults to "Company" when no name is supplied.
    name: str = "Company"


class Company(DataPoint):
    """Represents a company with departments and type classification.

    Top-level model of this notebook's graph: it references its ``Department``
    instances and a ``CompanyType``.
    """

    # Company name, e.g. "TechCorp Solutions".
    name: str
    # Departments belonging to this company.
    departments: list[Department]
    # Category this company is classified under.
    is_type: CompanyType
    # presumably tells Cognee which fields to index for search — TODO confirm
    # against the DataPoint documentation
    metadata: dict = {"index_fields": ["name"]}

## 3. Ingest Data Function

In [None]:
def ingest_files(data=None):
    """Build the sample company graph as a list of ``Company`` DataPoints.

    Despite the name, nothing is read from disk: the company and employee
    tables are hard-coded in the function body.

    Departments are keyed by name only. A department name listed by several
    companies (e.g. "Engineering") therefore resolves to one shared
    ``Department`` instance holding every employee with that department name.
    All companies also share a single ``CompanyType`` instance.

    Args:
        data: Ignored. Presumably kept so the function can be used as the
            first pipeline task, which is handed the pipeline input — verify
            against the ``run_tasks`` call site.

    Returns:
        list[Company]: One ``Company`` per entry of the company table, in
        declaration order, each linked to its ``Department`` objects (which
        in turn hold their ``Person`` objects).
    """
    # Company data
    companies = [
        {"name": "TechCorp Solutions", "departments": ["Engineering", "Marketing", "Sales"]},
        {"name": "GreenFuture Solutions", "departments": ["Research", "Engineering", "Operations"]},
        {"name": "DataFlow Analytics", "departments": ["Data Science", "Engineering", "Customer Success"]},
    ]

    # People data
    people = [
        {"name": "Alice Johnson", "department": "Engineering"},
        {"name": "Bob Smith", "department": "Engineering"},
        {"name": "Carol Davis", "department": "Marketing"},
        {"name": "David Wilson", "department": "Sales"},
        {"name": "Eve Brown", "department": "Research"},
        {"name": "Frank Miller", "department": "Operations"},
        {"name": "Grace Lee", "department": "Data Science"},
        {"name": "Henry Chen", "department": "Customer Success"},
        {"name": "Ivy Rodriguez", "department": "Engineering"},
        {"name": "Jack Thompson", "department": "Marketing"},
    ]

    # Registry of Department DataPoints, keyed by department name.
    departments_data_points = {}

    def department_for(name):
        """Return the Department registered under *name*, creating an empty one on first use."""
        if name not in departments_data_points:
            departments_data_points[name] = Department(name=name, employees=[])
        return departments_data_points[name]

    print("🔄 Processing employee data...")
    for person in people:
        new_person = Person(name=person["name"])
        # Group employees by department
        department_for(person["department"]).employees.append(new_person)

    # Create a single CompanyType node for all companies
    print("🏢 Creating company type classification...")
    company_type = CompanyType()

    print("🔄 Processing company data...")
    companies_data_points = {}
    for company in companies:
        new_company = Company(name=company["name"], departments=[], is_type=company_type)
        companies_data_points[company["name"]] = new_company

        # Link departments to companies; departments nobody works in yet are
        # created empty so the company still references them.
        for department_name in company["departments"]:
            new_company.departments.append(department_for(department_name))

    print(f"✅ Created {len(companies_data_points)} companies with {len(departments_data_points)} departments")
    return list(companies_data_points.values())

## 4. Setup Cognee System

In [None]:
# Setup Cognee system directory
from genai_tk.extra.cognee_utils import set_cognee_config

# Point Cognee's system root at ./.cognee_system, resolved to an absolute
# path against the current working directory.
cognee_directory_path = str(Path(".cognee_system").resolve())
config.system_root_directory(cognee_directory_path)

# Configure LLM

# Identifiers handed to set_cognee_config below; presumably keys of the
# genai_tk model registry — TODO confirm.
LLM_ID = "gpt_4omini_openai"
EMBEDDINGS_ID = "ada_002_openai"


set_cognee_config(llm_id=LLM_ID, embeddings_id=EMBEDDINGS_ID)


# NOTE(review): presumably wipes whatever earlier runs left behind, metadata
# included — run this cell only when that is intended.
print("🧹 Cleaning up previous runs...")
await prune.prune_system(metadata=True)

# setup() is called after the prune, so it starts from the cleaned state.
print("⚙️ Setting up Cognee system...")
await setup()

In [None]:
# NOTE(review): looks like a smoke test of the configured LLM/embeddings.
# It adds the literal text "hello world" before the custom graph is built,
# so that text is processed alongside the pipeline data.
await cognee.add("hello world")
await cognee.cognify()  # Respects rate limits automatically

## 5. Create Dataset

In [None]:
# Relational-store access used only by this cell.
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.models import Dataset

# A new random ID on every execution, so each run of this cell targets a new dataset.
dataset_id = uuid.uuid4()
user = await get_default_user()
if user is None:
    raise RuntimeError("Failed to load default user. Make sure the database is seeded.")

db_engine = get_relational_engine()
async with db_engine.get_async_session() as session:
    # Get-or-create: with the freshly generated ID above, the lookup is
    # expected to miss, so the insert branch normally runs. The guard only
    # matters if dataset_id is ever pinned to a fixed value.
    dataset = await session.get(Dataset, dataset_id)
    if dataset is None:
        dataset = Dataset(
            id=dataset_id,
            name="custom_graph_pipeline",
            owner_id=user.id,
        )
        session.add(dataset)
        await session.commit()
print("✅ Dataset created with ID:", dataset_id)

## 6. Run Pipeline

In [None]:
print("🚀 Running custom data pipeline...")
pipeline = run_tasks(
    [
        Task(ingest_files),  # Load and process data
        Task(add_data_points),  # Add to Cognee storage
    ],
    dataset_id,
    None,
    user,
    "custom_graph_pipeline",
)

async for status in pipeline:
    print(f"📊 Pipeline status: {status}")

## 7. Index Graph Edges

In [None]:
# Runs after the pipeline cell, once the DataPoints have been handed to storage.
print("🔗 Indexing graph relationships...")
await index_graph_edges()

## 8. Visualize Graph

In [None]:
import os

graph_file_path = str(os.path.join(os.getcwd(), "graph_visualization.html"))
await visualize_graph(graph_file_path)
print(f"📈 Graph visualization saved to: {graph_file_path}")

In [None]:
# from cognee.shared.utils import render_graph

# from cognee.utils import render_graph
# import graphistry

# # Setting an environment variable
# os.environ["GRAPHISTRY_USERNAME"] = "<YOUR GRAPHISTRY USERNAME>"
# os.environ["GRAPHISTRY_PASSWORD"] = "<YOUR GRAPHISTRY PASSWORD>"

# graphistry.login(api=3, username=os.getenv("GRAPHISTRY_USERNAME"), password=os.getenv("GRAPHISTRY_PASSWORD"))

# graph_url = await render_graph(graph)
# print(graph_url)
# url = await render_graph()
# print(f"Graphistry URL: {url}")

## 9. Query the Graph

In [None]:
queries = [
    "Who works for GreenFuture Solutions?",
    "Which departments does TechCorp Solutions have?",
    "List all employees in the Engineering department",
    "What companies have Research departments?",
]

for query in queries:
    print(f"\n🤔 Query: {query}")
    completion = await search(
        query_text=query,
        query_type=SearchType.GRAPH_COMPLETION,
    )
    print(f"💡 Answer: {completion}")
print("✅ Custom knowledge graph pipeline completed successfully!")