In [1]:
%load_ext dotenv
%dotenv

In [17]:
from utils import neo4j_driver, num_tokens_from_string, chunk_text, chat, embed
import ch07_tools

import json
import requests

from tqdm import tqdm
from typing import List, Dict

In [3]:
url = "https://www.gutenberg.org/cache/epub/1727/pg1727.txt"
response = requests.get(url)

In [4]:
def chunk_into_books(text: str) -> List[str]:
    return (
        text.split("PREFACE TO FIRST EDITION")[2]
        .split("FOOTNOTES")[0]
        .strip()
        .split("\nBOOK")[1:]
    )

books = chunk_into_books(response.text)

In [5]:
token_count = [num_tokens_from_string(el) for el in books]
print(
    f"""There are {len(token_count)} books with token sizes:
- avg {sum(token_count) / len(token_count)}
- min {min(token_count)}
- max {max(token_count)}
"""
)

There are 24 books with token sizes:
- avg 6515.208333333333
- min 4459
- max 10760



In [6]:
chunked_books = [chunk_text(book, 1000, 40) for book in books]

In [7]:
ENTITY_TYPES = [
    "PERSON",
    "ORGANIZATION",
    "LOCATION",
    "GOD",
    "EVENT",
    "CREATURE",
    "WEAPON_OR_TOOL",
]
def extract_entities(text: str) -> List[Dict]:
    # Construct prompt
    messages = [
        {"role": "user", "content": ch07_tools.create_extraction_prompt(ENTITY_TYPES, text)},
    ]
    # Make the LLM call
    output = chat(messages, model = "gpt-4o")
    # Construct JSON from output
    return ch07_tools.parse_extraction_output(output)

In [8]:
number_of_books = 1
for book_i, book in enumerate(
    tqdm(chunked_books[:number_of_books], desc="Processing Books")
):
    for chunk_i, chunk in enumerate(tqdm(book, desc=f"Book {book_i}", leave=False)):
        nodes, relationships = extract_entities(chunk)
        neo4j_driver.execute_query(
            ch07_tools.import_nodes_query,
            data=nodes,
            book_id=book_i,
            text=chunk,
            chunk_id=chunk_i,
        )
        neo4j_driver.execute_query(
            ch07_tools.import_relationships_query, data=relationships
        )

Processing Books:   0%|          | 0/1 [00:00<?, ?it/s]
Book 0:   0%|          | 0/22 [00:00<?, ?it/s][A
Book 0:   5%|▍         | 1/22 [00:09<03:11,  9.11s/it][A
Book 0:   9%|▉         | 2/22 [00:21<03:36, 10.84s/it][A
Book 0:  14%|█▎        | 3/22 [00:33<03:35, 11.34s/it][A
Book 0:  18%|█▊        | 4/22 [00:42<03:12, 10.70s/it][A
Book 0:  23%|██▎       | 5/22 [01:00<03:44, 13.18s/it][A
Book 0:  27%|██▋       | 6/22 [01:10<03:12, 12.00s/it][A
Book 0:  32%|███▏      | 7/22 [01:16<02:31, 10.12s/it][A
Book 0:  36%|███▋      | 8/22 [01:21<02:00,  8.57s/it][A
Book 0:  41%|████      | 9/22 [01:28<01:43,  7.99s/it][A
Book 0:  45%|████▌     | 10/22 [01:33<01:24,  7.06s/it][A
Book 0:  50%|█████     | 11/22 [01:44<01:31,  8.34s/it][A
Book 0:  55%|█████▍    | 12/22 [01:51<01:18,  7.86s/it][A
Book 0:  59%|█████▉    | 13/22 [02:06<01:30, 10.01s/it][A
Book 0:  64%|██████▎   | 14/22 [02:12<01:11,  8.91s/it][A
Book 0:  68%|██████▊   | 15/22 [02:20<01:00,  8.64s/it][A
Book 0:  73%|█████

In [9]:
data, _, _ = neo4j_driver.execute_query(
    """MATCH (:`__Entity__`)
    RETURN 'entity' AS type, count(*) AS count
    UNION
    MATCH ()-[:RELATIONSHIP]->()
    RETURN 'relationship' AS type, count(*) AS count
    """
)
print([el.data() for el in data])

[{'type': 'entity', 'count': 64}, {'type': 'relationship', 'count': 94}]


In [10]:
data, _, _ = neo4j_driver.execute_query(
    """MATCH (n:PERSON)
WHERE n.name = "ORESTES"
RETURN n.description AS description"""
)
print([el.data()['description'] for el in data])

[["Orestes is Agamemnon's son who killed Aegisthus", 'Orestes is a person who was expected to take revenge on Aegisthus', "Orestes is praised for avenging his father's murder by killing Aegisthus"]]


In [11]:
data, _, _ = neo4j_driver.execute_query(
    """MATCH (n:__Entity__)-[:RELATIONSHIP]-(m:__Entity__)
WITH n,m, count(*) AS countOfRels
ORDER BY countOfRels DESC LIMIT 1
MATCH (n)-[r:RELATIONSHIP]-(m)
RETURN n.name AS source, m.name AS target, countOfRels, collect(r.description) AS descriptions
"""
)
print([el.data() for el in data])

[{'source': 'TELEMACHUS', 'target': 'MINERVA', 'countOfRels': 6, 'descriptions': ['Telemachus spoke quietly to Minerva during the banquet to avoid being overheard', 'Minerva, in disguise, advises and encourages Telemachus, giving him courage and making him think of his father', 'Minerva gave counsel to Telemachus regarding his intended voyage', 'Minerva plans to encourage Telemachus to seek news of his father', 'Minerva, in disguise, is noticed by Telemachus who welcomes her', 'Minerva is speaking to Telemachus, offering him guidance and reassurance']}]


In [12]:
candidates_to_summarize, _, _ = neo4j_driver.execute_query(
    """MATCH (e:__Entity__) WHERE size(e.description) > 1 
    RETURN e.name AS entity_name, e.description AS description_list"""
)
summaries = []
for candidate in tqdm(candidates_to_summarize, desc="Summarizing entities"):
    messages = [
        {
            "role": "user",
            "content": ch07_tools.get_summarize_prompt(
                candidate["entity_name"], candidate["description_list"]
            ),
        },
    ]
    summary = chat(messages, model="gpt-4o")
    summaries.append({"entity": candidate["entity_name"], "summary": summary})

ch07_tools.import_entity_summary(neo4j_driver, summaries)

Summarizing entities: 100%|██████████| 25/25 [01:06<00:00,  2.67s/it]


In [13]:
summary, _, _ = neo4j_driver.execute_query(
    """MATCH (n:PERSON)
WHERE n.name = "ORESTES"
RETURN n.summary AS summary""")
print(summary[0]['summary'])

Orestes is the son of Agamemnon, known for avenging his father's murder by killing Aegisthus. He was expected to take revenge on Aegisthus, and his actions in fulfilling this expectation have been praised.


In [14]:
rels_to_summarize, _, _ = neo4j_driver.execute_query(
    """MATCH (s:__Entity__)-[r:RELATIONSHIP]-(t:__Entity__)
    WHERE id(s) < id(t)
    WITH s.name AS source, t.name AS target, 
           collect(r.description) AS description_list,
           count(*) AS count
    WHERE count > 1
    RETURN source, target, description_list"""
)
rel_summaries = []
for candidate in tqdm(rels_to_summarize, desc="Summarizing relationships"):
    entity_name = f"{candidate['source']} relationship to {candidate['target']}"
    messages = [
        {
            "role": "user",
            "content": ch07_tools.get_summarize_prompt(
                entity_name, candidate["description_list"]
            ),
        },
    ]
    summary = chat(messages, model="gpt-4o")
    rel_summaries.append({"source": candidate["source"], "target": candidate["target"], "summary": summary})

ch07_tools.import_rels_summary(neo4j_driver, summaries)

Summarizing relationships: 100%|██████████| 12/12 [00:28<00:00,  2.37s/it]


In [15]:
data, _, _ = neo4j_driver.execute_query(
    """MATCH (n:__Entity__)-[r:SUMMARIZED_RELATIONSHIP]-(m:__Entity__)
WHERE n.name = 'TELEMACHUS' AND m.name = 'MINERVA'
RETURN r.summary AS description
"""
)
print(data[0]["description"])

Minerva is speaking to Telemachus, offering him guidance and reassurance


In [16]:
community_distribution = ch07_tools.calculate_communities(neo4j_driver)
print(f"There are {community_distribution['communityCount']} communities with distribution: {community_distribution['communityDistribution']}")

There are 9 communities with distribution: {'min': 2, 'p5': 2, 'max': 13, 'p999': 13, 'p99': 13, 'p1': 2, 'p10': 2, 'p90': 13, 'p50': 5, 'p25': 2, 'p75': 9, 'p95': 13, 'mean': 6.333333333333333}


In [17]:
community_info, _, _ = neo4j_driver.execute_query(ch07_tools.community_info_query)

communities = []
for community in tqdm(community_info, desc="Summarizing communities"):
    # Construct prompt
    messages = [
        {
            "role": "user",
            "content": ch07_tools.get_summarize_community_prompt(
                community["nodes"], community["rels"]
            ),
        },
    ]
    # Make the LLM call
    summary = chat(messages, model="gpt-4o")
    communities.append(
        {
            "community": json.loads(ch07_tools.extract_json(summary)),
            "communityId": community["communityId"],
            "nodes": [el["id"] for el in community["nodes"]],
        }
    )

neo4j_driver.execute_query(ch07_tools.import_community_query, data=communities)

Summarizing communities: 100%|██████████| 9/9 [02:29<00:00, 16.61s/it]


EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x13fa44c10>, keys=[])

In [18]:
data, _, _ = neo4j_driver.execute_query(
    """MATCH (c:__Community__)
WITH c, count {(c)<-[:IN_COMMUNITY]-()} AS size
ORDER BY size DESC LIMIT 1
RETURN c.title AS title, c.summary AS summary
"""
)
print(data[0]["title"])
print(data[0]["summary"])

Minerva, Telemachus, and the Ithacan Household
The community centers around Minerva, Telemachus, and the household of Ulysses, with significant interactions involving divine guidance, familial loyalty, and the challenges posed by suitors. Minerva plays a pivotal role in advising Telemachus, who is determined to find his father and restore order to his home. The relationships among these entities highlight themes of wisdom, courage, and resilience.


In [14]:
def global_retriever(query: str, rating: float = 5) -> str:
    community_data, _, _ = neo4j_driver.execute_query(
        """
    MATCH (c:__Community__)
    WHERE c.rating >= $rating
    RETURN c.summary AS summary
    """,
        rating=rating,
    )
    print(f"Got {len(community_data)} community summaries")
    intermediate_results = []
    for community in tqdm(community_data, desc="Processing communities"):
        intermediate_messages = [
            {
                "role": "system",
                "content": ch07_tools.get_map_system_prompt(community["summary"]),
            },
            {
                "role": "user",
                "content": query,
            },
        ]
        intermediate_response = chat(intermediate_messages, model="gpt-4o")
        intermediate_results.append(intermediate_response)

    final_messages = [
        {
            "role": "system",
            "content": ch07_tools.get_reduce_system_prompt(intermediate_results),
        },
        {"role": "user", "content": query},
    ]
    summary = chat(final_messages, model="gpt-4o")
    return summary

In [16]:
print(global_retriever("What is this story about?"))

Got 7 community summaries


Processing communities: 100%|██████████| 7/7 [00:47<00:00,  6.80s/it]


The story revolves around the intricate dynamics of a community involving key figures such as Minerva, Telemachus, and the household of Ulysses. Central themes include divine guidance, familial loyalty, and the challenges posed by suitors. Minerva plays a crucial role in advising Telemachus, who is determined to find his father, Ulysses, and restore order to his home. The relationships among the characters emphasize themes of wisdom, courage, and resilience.

Additionally, the narrative highlights the role of Mentes, the chief of the Taphians, who is recognized as the son of Anchialus. Mentes is involved in a voyage to Temesa, known for its iron cargo, and claims kingship over the Taphians [Data: Reports (1)]. The story also centers around Odysseus, a key figure in Greek mythology, and his connections with other significant entities such as the Achaeans, Laertes, and the gods. The relationships in the story underscore the impact of divine intervention on human affairs, showcasing how t

In [22]:
entities, _, _ = neo4j_driver.execute_query(
    """
MATCH (e:__Entity__)
RETURN e.summary AS summary, e.name AS name
"""
)
data = [{"name": el["name"], "embedding": embed(el["summary"])[0]} for el in entities]
neo4j_driver.execute_query(
    """
UNWIND $data AS row
MATCH (e:__Entity__ {name: row.name})
CALL db.create.setNodeVectorProperty(e, 'embedding', row.embedding)
""",
    data=data,
)

neo4j_driver.execute_query(
    """
CREATE VECTOR INDEX entities IF NOT EXISTS
FOR (n:__Entity__)
ON (n.embedding)
""",
    data=data,
)


EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x17b2c9bd0>, keys=[])