## Cognee GraphRAG with LlamaIndex Documents

In [None]:
%pip install llama-index-core


## Load Data

We will use a sample dataset of news articles retrieved from Diffbot, which Tomaz (GitHub user `tomasonjo`) has made available on GitHub.

The dataset contains 2,500 samples; for ease of experimentation, we will use only the first 5. Each sample includes the `title` and `text` of a news article.

In [None]:
import pandas as pd
from llama_index.core import Document

# Source CSV of news articles and the number of rows the demo uses.
NEWS_URL = "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/news_articles.csv"
NEWS_SAMPLE_SIZE = 5

# Only a handful of articles are needed, so let pandas stop parsing after
# NEWS_SAMPLE_SIZE rows instead of parsing every row and slicing afterwards.
news = pd.read_csv(NEWS_URL, nrows=NEWS_SAMPLE_SIZE)

news.head()

## Prepare documents as required by LlamaIndex

In [3]:
# One LlamaIndex Document per article; its text is "<title>: <text>".
documents = [
    Document(text=f"{article['title']}: {article['text']}")
    for _row_label, article in news.iterrows()
]

## Set environment variables

In [4]:
import os

# Credentials: a value that is already present in the environment is kept;
# otherwise an empty string is stored as a placeholder.
os.environ.setdefault("GRAPHISTRY_USERNAME", "")
os.environ.setdefault("GRAPHISTRY_PASSWORD", "")
os.environ.setdefault("LLM_API_KEY", "")

# Storage settings: these are always overwritten with the values below.
os.environ.update(
    {
        # "neo4j" or "networkx"
        "GRAPH_DATABASE_PROVIDER": "networkx",
        # "pgvector", "qdrant", "weaviate" or "lancedb"
        "VECTOR_DB_PROVIDER": "lancedb",
        # Relational Database provider "sqlite" or "postgres"
        "DB_PROVIDER": "sqlite",
        # Database name
        "DB_NAME": "cognee_db",
    }
)

# Graph database connection -- not needed if using networkx
# os.environ["GRAPH_DATABASE_URL"]=""
# os.environ["GRAPH_DATABASE_USERNAME"]=""
# os.environ["GRAPH_DATABASE_PASSWORD"]=""

# Vector database connection -- not needed if using "lancedb" or "pgvector"
# os.environ["VECTOR_DB_URL"]=""
# os.environ["VECTOR_DB_KEY"]=""

# Postgres specific parameters (Only if Postgres or PGVector is used)
# os.environ["DB_HOST"]="127.0.0.1"
# os.environ["DB_PORT"]="5432"
# os.environ["DB_USERNAME"]="cognee"
# os.environ["DB_PASSWORD"]="cognee"

## Run Cognee with LlamaIndex Documents

In [None]:
from typing import Union, BinaryIO

from cognee.infrastructure.databases.vector.pgvector import (
    create_db_and_tables as create_pgvector_db_and_tables,
)
from cognee.infrastructure.databases.relational import (
    create_db_and_tables as create_relational_db_and_tables,
)
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_default_user
from cognee.tasks.ingestion.ingest_data import ingest_data
import cognee

# Create a clean slate for cognee -- reset data and system state.
# Two separate calls: prune_data() first, then prune_system(), which is
# invoked with metadata=True.
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)


# Add the LlamaIndex documents, and make it available for cognify
async def add(
    data: Union[BinaryIO, list[BinaryIO], str, list[str]],
    dataset_name: str = "main_dataset",
    user: User = None,
):
    """Ingest ``data`` into a cognee dataset.

    Calls ``create_relational_db_and_tables`` and
    ``create_pgvector_db_and_tables`` first, then hands the data to
    ``ingest_data``.

    Args:
        data: The content to ingest. It is forwarded unchanged to
            ``ingest_data``.
            NOTE(review): the annotation lists binary streams and strings,
            while this notebook passes LlamaIndex ``Document`` objects --
            presumably ``ingest_data`` accepts them; confirm.
        dataset_name: Name of the target dataset, forwarded to ``ingest_data``.
        user: The user forwarded to ``ingest_data``. When ``None``, the result
            of ``get_default_user()`` is used instead.
    """
    # Table creation runs before the user lookup and the ingestion.
    await create_relational_db_and_tables()
    await create_pgvector_db_and_tables()

    owner = await get_default_user() if user is None else user

    await ingest_data(data, dataset_name, owner)


# Ingest the LlamaIndex documents; dataset_name and user are left at the
# defaults of add() ("main_dataset" and the default user).
await add(documents)

# Use LLMs and cognee to create knowledge graph
# (runs after add() so the ingested documents are available for cognify)
await cognee.cognify()

## Query Cognee for summaries related to data

In [None]:
from cognee import SearchType

# Query cognee for summaries
search_results = await cognee.search(
    query_type=SearchType.SUMMARIES, query_text="What are the main news discussed in the document?"
)
# Display search results
print("\n Summary of main news discussed:\n")
print(search_results[0]["text"])

## Render Knowledge Graph generated from provided data

In [None]:
import graphistry

from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.shared.utils import render_graph

# Get graph
graphistry.login(
    username=os.getenv("GRAPHISTRY_USERNAME"), password=os.getenv("GRAPHISTRY_PASSWORD")
)
graph_engine = await get_graph_engine()

graph_url = await render_graph(graph_engine.graph)
print(graph_url)