In [1]:
%load_ext dotenv
%dotenv

In [2]:
from utils import neo4j_driver, num_tokens_from_string, chunk_text, chat, embed
import ch07_tools

import json
import requests

from tqdm import tqdm
from typing import List, Dict

  from pandas.core import (


In [3]:
url = "https://www.gutenberg.org/cache/epub/1727/pg1727.txt"
response = requests.get(url)

In [4]:
def chunk_into_books(text: str) -> List[str]:
    return (
        text.split("PREFACE TO FIRST EDITION")[2]
        .split("FOOTNOTES")[0]
        .strip()
        .split("\nBOOK")[1:]
    )

books = chunk_into_books(response.text)

In [5]:
token_count = [num_tokens_from_string(el) for el in books]
print(
    f"""There are {len(token_count)} books with token sizes:
- avg {sum(token_count) / len(token_count)}
- min {min(token_count)}
- max {max(token_count)}
"""
)

There are 24 books with token sizes:
- avg 6515.208333333333
- min 4459
- max 10760



In [6]:
chunked_books = [chunk_text(book, 1000, 40) for book in books]

In [7]:
ENTITY_TYPES = [
    "PERSON",
    "ORGANIZATION",
    "LOCATION",
    "GOD",
    "EVENT",
    "CREATURE",
    "WEAPON_OR_TOOL",
]
def extract_entities(text: str) -> List[Dict]:
    # Construct prompt
    messages = [
        {"role": "user", "content": ch07_tools.create_extraction_prompt(ENTITY_TYPES, text)},
    ]
    # Make the LLM call
    output = chat(messages, model = "gpt-4o")
    # Construct JSON from output
    return ch07_tools.parse_extraction_output(output)

In [8]:
number_of_books = 1
for book_i, book in enumerate(
    tqdm(chunked_books[:number_of_books], desc="Processing Books")
):
    for chunk_i, chunk in enumerate(tqdm(book, desc=f"Book {book_i}", leave=False)):
        nodes, relationships = extract_entities(chunk)
        neo4j_driver.execute_query(
            ch07_tools.import_nodes_query,
            data=nodes,
            book_id=book_i,
            text=chunk,
            chunk_id=chunk_i,
        )
        neo4j_driver.execute_query(
            ch07_tools.import_relationships_query, data=relationships
        )

Processing Books:   0%|          | 0/1 [00:00<?, ?it/s]
Book 0:   0%|          | 0/22 [00:00<?, ?it/s][A
Book 0:   5%|▍         | 1/22 [00:08<02:49,  8.09s/it][A
Book 0:   9%|▉         | 2/22 [00:18<03:07,  9.40s/it][A
Book 0:  14%|█▎        | 3/22 [00:31<03:34, 11.27s/it][A
Book 0:  18%|█▊        | 4/22 [00:41<03:08, 10.48s/it][A
Book 0:  23%|██▎       | 5/22 [00:56<03:27, 12.19s/it][A
Book 0:  27%|██▋       | 6/22 [01:06<03:03, 11.45s/it][A
Book 0:  32%|███▏      | 7/22 [01:11<02:18,  9.24s/it][A
Book 0:  36%|███▋      | 8/22 [01:18<02:02,  8.73s/it][A
Book 0:  41%|████      | 9/22 [01:25<01:46,  8.18s/it][A
Book 0:  45%|████▌     | 10/22 [01:31<01:27,  7.30s/it][A
Book 0:  50%|█████     | 11/22 [01:41<01:31,  8.36s/it][A
Book 0:  55%|█████▍    | 12/22 [01:51<01:28,  8.83s/it][A
Book 0:  59%|█████▉    | 13/22 [02:04<01:31, 10.13s/it][A
Book 0:  64%|██████▎   | 14/22 [02:10<01:10,  8.85s/it][A
Book 0:  68%|██████▊   | 15/22 [02:17<00:57,  8.28s/it][A
Book 0:  73%|█████

In [9]:
data, _, _ = neo4j_driver.execute_query(
    """MATCH (:`__Entity__`)
    RETURN 'entity' AS type, count(*) AS count
    UNION
    MATCH ()-[:RELATIONSHIP]->()
    RETURN 'relationship' AS type, count(*) AS count
    """
)
print([el.data() for el in data])

[{'type': 'entity', 'count': 62}, {'type': 'relationship', 'count': 91}]


In [10]:
data, _, _ = neo4j_driver.execute_query(
    """MATCH (n:PERSON)
WHERE n.name = "ORESTES"
RETURN n.description AS description"""
)
print([el.data()['description'] for el in data])

[["Orestes is Agamemnon's son who killed Aegisthus", 'Orestes is a character who was expected to take revenge for past wrongs', "Orestes is praised for avenging his father's murder by killing Aegisthus"]]


In [11]:
data, _, _ = neo4j_driver.execute_query(
    """MATCH (n:__Entity__)-[:RELATIONSHIP]-(m:__Entity__)
WITH n,m, count(*) AS countOfRels
ORDER BY countOfRels DESC LIMIT 1
MATCH (n)-[r:RELATIONSHIP]-(m)
RETURN n.name AS source, m.name AS target, countOfRels, collect(r.description) AS descriptions
"""
)
print([el.data() for el in data])

[{'source': 'TELEMACHUS', 'target': 'MINERVA', 'countOfRels': 7, 'descriptions': ['Telemachus spoke quietly to Minerva during the banquet', 'Minerva, a goddess, advises and encourages Telemachus, making him think more about his father', 'Minerva gave counsel to Telemachus regarding his intended voyage', 'Minerva visits Ithaca, where Telemachus challenges the suitors', 'Minerva plans to encourage Telemachus to confront the suitors and seek information about his father', 'Minerva, in disguise, is recognized by Telemachus and welcomed', 'Minerva is speaking to Telemachus, offering him guidance and reassurance']}]


In [12]:
candidates_to_summarize, _, _ = neo4j_driver.execute_query(
    """MATCH (e:__Entity__) WHERE size(e.description) > 1 
    RETURN e.name AS entity_name, e.description AS description_list"""
)
summaries = []
for candidate in tqdm(candidates_to_summarize, desc="Summarizing entities"):
    messages = [
        {
            "role": "user",
            "content": ch07_tools.get_summarize_prompt(
                candidate["entity_name"], candidate["description_list"]
            ),
        },
    ]
    summary = chat(messages, model="gpt-4o")
    summaries.append({"entity": candidate["entity_name"], "summary": summary})

ch07_tools.import_entity_summary(neo4j_driver, summaries)

Summarizing entities: 100%|██████████| 25/25 [01:05<00:00,  2.61s/it]


In [13]:
summary, _, _ = neo4j_driver.execute_query(
    """MATCH (n:PERSON)
WHERE n.name = "ORESTES"
RETURN n.summary AS summary""")
print(summary[0]['summary'])

Orestes is a character known for being the son of Agamemnon. He is recognized for fulfilling the expectation of avenging past wrongs, specifically by killing Aegisthus. Orestes is praised for avenging his father's murder, which was a significant act of retribution in his narrative.


In [14]:
rels_to_summarize, _, _ = neo4j_driver.execute_query(
    """MATCH (s:__Entity__)-[r:RELATIONSHIP]-(t:__Entity__)
    WHERE id(s) < id(t)
    WITH s.name AS source, t.name AS target, 
           collect(r.description) AS description_list,
           count(*) AS count
    WHERE count > 1
    RETURN source, target, description_list"""
)
rel_summaries = []
for candidate in tqdm(rels_to_summarize, desc="Summarizing relationships"):
    entity_name = f"{candidate['source']} relationship to {candidate['target']}"
    messages = [
        {
            "role": "user",
            "content": ch07_tools.get_summarize_prompt(
                entity_name, candidate["description_list"]
            ),
        },
    ]
    summary = chat(messages, model="gpt-4o")
    rel_summaries.append({"source": candidate["source"], "target": candidate["target"], "summary": summary})

ch07_tools.import_rels_summary(neo4j_driver, summaries)

Summarizing relationships: 100%|██████████| 14/14 [00:28<00:00,  2.06s/it]


In [15]:
data, _, _ = neo4j_driver.execute_query(
    """MATCH (n:__Entity__)-[r:SUMMARIZED_RELATIONSHIP]-(m:__Entity__)
WHERE n.name = 'TELEMACHUS' AND m.name = 'MINERVA'
RETURN r.summary AS description
"""
)
print(data[0]["description"])

Minerva is speaking to Telemachus, offering him guidance and reassurance


In [16]:
community_distribution = ch07_tools.calculate_communities(neo4j_driver)
print(f"There are {community_distribution['communityCount']} communities with distribution: {community_distribution['communityDistribution']}")

There are 9 communities with distribution: {'min': 2, 'p5': 2, 'max': 16, 'p999': 16, 'p99': 16, 'p1': 2, 'p10': 2, 'p90': 16, 'p50': 4, 'p25': 3, 'p75': 10, 'p95': 16, 'mean': 6.555555555555555}


In [17]:
community_info, _, _ = neo4j_driver.execute_query(ch07_tools.community_info_query)

communities = []
for community in tqdm(community_info, desc="Summarizing communities"):
    messages = [
        {
            "role": "user",
            "content": ch07_tools.get_summarize_community_prompt(
                community["nodes"], community["rels"]
            ),
        },
    ]
    summary = chat(messages, model="gpt-4o")
    communities.append(
        {
            "community": json.loads(ch07_tools.extract_json(summary)),
            "communityId": community["communityId"],
            "nodes": [el["id"] for el in community["nodes"]],
        }
    )

neo4j_driver.execute_query(ch07_tools.import_community_query, data=communities)

Summarizing communities: 100%|██████████| 9/9 [02:36<00:00, 17.38s/it]


EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x130437510>, keys=[])

In [18]:
data, _, _ = neo4j_driver.execute_query(
    """MATCH (c:__Community__)
WITH c, count {(c)<-[:IN_COMMUNITY]-()} AS size
ORDER BY size DESC LIMIT 1
RETURN c.title AS title, c.summary AS summary
"""
)
print(data[0]["title"])
print(data[0]["summary"])

Ulysses and the Suitors in Ithaca
The community centers around Ulysses, a legendary Greek hero, and the suitors who have taken over his estate in Ithaca. The suitors are causing significant disruption as they seek to marry Ulysses' wife. Ulysses' journey home is hindered by divine and mortal challenges, including the wrath of Neptune and the detainment by Calypso. The Achaean heroes and various Greek islands are also involved, highlighting the complex web of relationships and conflicts surrounding Ulysses' return.


In [19]:
def global_retriever(query: str, rating_threshold: float = 5) -> str:
    community_data, _, _ = neo4j_driver.execute_query(
        """
    MATCH (c:__Community__)
    WHERE c.rating >= $rating
    RETURN c.summary AS summary
    """,
        rating=rating_threshold,
    )
    print(f"Got {len(community_data)} community summaries")
    intermediate_results = []
    for community in tqdm(community_data, desc="Processing communities"):
        intermediate_messages = [
            {
                "role": "system",
                "content": ch07_tools.get_map_system_prompt(community["summary"]),
            },
            {
                "role": "user",
                "content": query,
            },
        ]
        intermediate_response = chat(intermediate_messages, model="gpt-4o")
        intermediate_results.append(intermediate_response)

    final_messages = [
        {
            "role": "system",
            "content": ch07_tools.get_reduce_system_prompt(intermediate_results),
        },
        {"role": "user", "content": query},
    ]
    summary = chat(final_messages, model="gpt-4o")
    return summary

In [20]:
print(global_retriever("What is this story about?"))

Got 5 community summaries


Processing communities: 100%|██████████| 5/5 [00:31<00:00,  6.27s/it]


The story revolves around Ulysses, a legendary Greek hero, and his arduous journey back to his homeland, Ithaca, after the Trojan War. Ulysses faces numerous challenges, both divine and mortal, including the wrath of Neptune and detainment by Calypso, which significantly delay his return [Data: Reports (1)]. Meanwhile, his estate in Ithaca is besieged by suitors vying for his wife, causing considerable disruption [Data: Reports (1)].

The narrative also explores the complex web of relationships and conflicts involving Achaean heroes and various Greek islands, highlighting the broader impact of Ulysses' journey on Greek society [Data: Reports (1)]. Divine influence plays a crucial role, with Heaven depicted as a powerful force determining leadership among people [Data: Reports (1)]. This divine intervention is further exemplified by the involvement of Olympian Jove, who presides over a divine assembly reflecting on the actions of Aegisthus, infamous for murdering Agamemnon, and the subs

In [21]:
entities, _, _ = neo4j_driver.execute_query(
    """
MATCH (e:__Entity__)
RETURN e.summary AS summary, e.name AS name
"""
)
data = [{"name": el["name"], "embedding": embed(el["summary"])[0]} for el in entities]
neo4j_driver.execute_query(
    """
UNWIND $data AS row
MATCH (e:__Entity__ {name: row.name})
CALL db.create.setNodeVectorProperty(e, 'embedding', row.embedding)
""",
    data=data,
)

neo4j_driver.execute_query(
    """
CREATE VECTOR INDEX entities IF NOT EXISTS
FOR (n:__Entity__)
ON (n.embedding)
""",
    data=data,
)


EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1303d2e10>, keys=[])

In [22]:
local_search_query = """
CALL db.index.vector.queryNodes('entities', $k, $embedding)
YIELD node, score
WITH collect(node) as nodes
WITH collect {
    UNWIND nodes as n
    MATCH (n)<-[:HAS_ENTITY]->(c:__Chunk__)
    WITH c, count(distinct n) as freq
    RETURN c.text AS chunkText
    ORDER BY freq DESC
    LIMIT $topChunks
} AS text_mapping,
collect {
    UNWIND nodes as n
    MATCH (n)-[:IN_COMMUNITY]->(c:__Community__)
    WITH c, c.rank as rank, c.weight AS weight
    RETURN c.summary 
    ORDER BY rank, weight DESC
    LIMIT $topCommunities
} AS report_mapping,
collect {
    UNWIND nodes as n
    MATCH (n)-[r:SUMMARIZED_RELATIONSHIP]-(m) 
    WHERE m IN nodes
    RETURN r.summary AS descriptionText
    ORDER BY r.rank, r.weight DESC 
    LIMIT $topInsideRels
} as insideRels,
collect {
    UNWIND nodes as n
    RETURN n.summary AS descriptionText
} as entities
RETURN {Chunks: text_mapping, Reports: report_mapping, 
       Relationships: insideRels, 
       Entities: entities} AS text
"""

In [23]:
k_entities = 5

topChunks = 3
topCommunities = 3
topInsideRels = 3


def local_search(query: str) -> str:
    context, _, _ = neo4j_driver.execute_query(
        local_search_query,
        embedding=embed(query)[0],
        topChunks=topChunks,
        topCommunities=topCommunities,
        topInsideRels=topInsideRels,
        k=k_entities,
    )
    context_str = str(context[0]["text"])
    local_messages = [
        {
            "role": "system",
            "content": ch07_tools.get_local_system_prompt(context_str),
        },
        {
            "role": "user",
            "content": query,
        },
    ]
    final_answer = chat(local_messages, model="gpt-4o")
    return final_answer


In [24]:
print(local_search("Who is Ulysses?"))

## Ulysses: The Legendary Greek Hero

Ulysses, also known as Odysseus, is a legendary figure in Greek mythology renowned for his intelligence, resourcefulness, and leadership during the Trojan War. He is celebrated as the ingenious hero who, after the fall of Troy, embarked on a long and arduous journey to return to his homeland, Ithaca. This journey is famously chronicled in Homer's epic, "The Odyssey," where Ulysses faces numerous challenges and adventures as he longs to reunite with his wife, Penelope, and his son, Telemachus [Data: Entities (1)].

## The Journey and Challenges

Ulysses' journey home is fraught with divine and mortal challenges. He is detained by the nymph Calypso on a remote island, and his return is further complicated by the wrath of Neptune, the god of the sea. Despite these obstacles, the gods are ultimately facilitating his return, as he is needed in Ithaca to deal with the suitors who have overrun his estate and are vying for his wife's hand in marriage [Data