In [1]:
# Load the dotenv IPython extension, then run its magic to read the local .env file.
# NOTE(review): presumably this supplies the credentials used by the `utils` helpers — confirm.
%load_ext dotenv
%dotenv

In [2]:
import json
from typing import List, Dict, Tuple

import requests
from tqdm import tqdm

import ch07_tools
from utils import neo4j_driver, num_tokens_from_string, chunk_text, chat

  from pandas.core import (


In [3]:
# Download the plain-text file (Project Gutenberg ebook #1727) that the rest of the notebook parses.
url = "https://www.gutenberg.org/cache/epub/1727/pg1727.txt"
# Without a timeout, requests waits indefinitely if the server stalls.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error rather than handing an error page to the parsing cell,
# where it would surface as an opaque IndexError.
response.raise_for_status()

In [4]:
def chunk_into_books(text: str) -> List[str]:
    """Split the full downloaded text into one string per book.

    The body is everything after the second occurrence of
    "PREFACE TO FIRST EDITION" and before the first "FOOTNOTES" that follows.
    That body is stripped and split on a newline followed by "BOOK"; the piece
    preceding the first book heading is discarded.

    Args:
        text: The complete raw text.

    Returns:
        The text of each book, in order, without the leading "BOOK" keyword.

    Raises:
        IndexError: If "PREFACE TO FIRST EDITION" occurs fewer than twice.
    """
    after_preface = text.split("PREFACE TO FIRST EDITION")[2]
    body = after_preface.split("FOOTNOTES")[0].strip()
    # The first piece is whatever precedes the first book heading; keep only the rest.
    _front_matter, *book_texts = body.split("\nBOOK")
    return book_texts

# Split the downloaded text into one string per book.
books = chunk_into_books(response.text)

In [5]:
# Count the tokens in every book and print the average, smallest and largest size.
token_count = list(map(num_tokens_from_string, books))
print(f"""There are {len(token_count)} books with token sizes:
- avg {sum(token_count) / len(token_count)}
- min {min(token_count)}
- max {max(token_count)}
""")

There are 24 books with token sizes:
- avg 6515.208333333333
- min 4459
- max 10760



In [6]:
# Run chunk_text over every book; the result has one entry per book.
# NOTE(review): 1000 and 40 are presumably the chunk size and overlap — confirm against utils.chunk_text.
chunked_books = [chunk_text(book, 1000, 40) for book in books]

In [7]:
# Entity type names passed to the extraction prompt.
ENTITY_TYPES = ["PERSON", "ORGANIZATION", "LOCATION", "GOD", "EVENT", "CREATURE", "WEAPON_OR_TOOL"]


def extract_entities(text: str) -> Tuple[List[Dict], List[Dict]]:
    """Extract entities and relationships from one text chunk with an LLM.

    Args:
        text: The chunk of source text to analyse.

    Returns:
        The ``(nodes, relationships)`` pair produced by
        ``ch07_tools.parse_extraction_output``. The import loop unpacks it
        into two values and passes each as the ``data`` parameter of a query.
        NOTE(review): both elements are assumed to be lists of dicts —
        confirm against ch07_tools.
    """
    # Construct prompt
    prompt = ch07_tools.create_extraction_prompt(ENTITY_TYPES, text)
    messages = [{"role": "user", "content": prompt}]
    # Make the LLM call
    output = chat(messages, model="gpt-4o")
    # Parse the raw completion into structured nodes and relationships
    return ch07_tools.parse_extraction_output(output)

In [8]:
# Process only the first x books
x = 1
books_to_process = chunked_books[:x]
for book_i, book in enumerate(tqdm(books_to_process, desc="Processing Books")):
    for chunk_i, chunk in enumerate(tqdm(book, desc=f"Book {book_i}", leave=False)):
        # One LLM extraction call per chunk
        nodes, relationships = extract_entities(chunk)
        # Import nodes; the query also receives the chunk text and its book/chunk indices
        neo4j_driver.execute_query(
            ch07_tools.import_nodes_query,
            data=nodes,
            book_id=book_i,
            text=chunk,
            chunk_id=chunk_i,
        )
        # Import relationships
        neo4j_driver.execute_query(
            ch07_tools.import_relationships_query,
            data=relationships,
        )

Processing Books:   0%|          | 0/1 [00:00<?, ?it/s]
Book 0:   0%|          | 0/22 [00:00<?, ?it/s][A
Book 0:   5%|▍         | 1/22 [00:04<01:36,  4.59s/it][A
Book 0:   9%|▉         | 2/22 [00:14<02:30,  7.51s/it][A
Book 0:  14%|█▎        | 3/22 [00:20<02:11,  6.93s/it][A
Book 0:  18%|█▊        | 4/22 [00:25<01:52,  6.27s/it][A
Book 0:  23%|██▎       | 5/22 [00:35<02:07,  7.53s/it][A
Book 0:  27%|██▋       | 6/22 [00:41<01:52,  7.00s/it][A
Book 0:  32%|███▏      | 7/22 [00:45<01:30,  6.05s/it][A
Book 0:  36%|███▋      | 8/22 [00:48<01:09,  4.94s/it][A
Book 0:  41%|████      | 9/22 [00:51<00:59,  4.61s/it][A
Book 0:  45%|████▌     | 10/22 [00:54<00:48,  4.07s/it][A
Book 0:  50%|█████     | 11/22 [00:59<00:47,  4.30s/it][A
Book 0:  55%|█████▍    | 12/22 [01:05<00:48,  4.86s/it][A
Book 0:  59%|█████▉    | 13/22 [01:15<00:57,  6.40s/it][A
Book 0:  64%|██████▎   | 14/22 [01:19<00:45,  5.66s/it][A
Book 0:  68%|██████▊   | 15/22 [01:23<00:36,  5.25s/it][A
Book 0:  73%|█████

In [9]:
# Fetch every entity whose description list holds more than one entry.
candidates_to_summarize, _, _ = neo4j_driver.execute_query(
    "MATCH (e:__Entity__) WHERE size(e.description) > 1 RETURN e.name AS entity_name, e.description AS description_list"
)

summaries = []
for candidate in tqdm(candidates_to_summarize, desc="Summarizing entities"):
    entity_name = candidate["entity_name"]
    # Build the summarization prompt from the entity's name and its descriptions
    prompt = ch07_tools.get_summarize_prompt(entity_name, candidate["description_list"])
    messages = [{"role": "user", "content": prompt}]
    # One LLM call per entity
    summary = chat(messages, model="gpt-4o")
    summaries.append({"entity": entity_name, "summary": summary})

Summarizing entities: 100%|██████████| 61/61 [01:21<00:00,  1.33s/it]


In [10]:
# Write each generated summary onto the entity with the matching name.
neo4j_driver.execute_query(
    """
UNWIND $data AS row
MATCH (e:__Entity__ {name: row.entity})
SET e.summary = row.summary
""",
    data=summaries,
)

# If there was only 1 description use that
neo4j_driver.execute_query(
    """
MATCH (e:__Entity__)
WHERE size(e.description) = 1
SET e.summary = e.description[0]
"""
)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3109f48d0>, keys=[])

In [11]:
# Run community detection through the ch07_tools helper and print the count and size distribution it returns.
# NOTE(review): presumably this call also writes the `louvain` property read by the following query — confirm in ch07_tools.
community_distribution = ch07_tools.calculate_communities(neo4j_driver)
print(f"There are {community_distribution['communityCount']} communities with distribution: {community_distribution['communityDistribution']}")

There are 8 communities with distribution: {'min': 2, 'p5': 2, 'max': 19, 'p999': 19, 'p99': 19, 'p1': 2, 'p10': 2, 'p90': 19, 'p50': 5, 'p25': 2, 'p75': 9, 'p95': 19, 'mean': 8.125}


In [12]:
# Group entities by their `louvain` value and keep only groups with more than one member.
# For each group, return its member nodes (name, summary, first label other than __Entity__)
# and the relationships reached by expanding from the first member while restricted
# to the group's own nodes.
community_info, _, _ = neo4j_driver.execute_query("""
MATCH (e:__Entity__)
WHERE e.louvain IS NOT NULL
WITH e.louvain AS louvain, collect(e) AS nodes
WHERE size(nodes) > 1
CALL apoc.path.subgraphAll(nodes[0], {
	whitelistNodes:nodes
})
YIELD relationships
RETURN louvain AS communityId,
       [n in nodes | {id: n.name, description: n.summary, type: [el in labels(n) WHERE el <> '__Entity__'][0]}] AS nodes,
       [r in relationships | {start: startNode(r).name, type: type(r), end: endNode(r).name, description: r.description}] AS rels
""")

In [14]:
communities = []
for community in tqdm(community_info, desc="Summarizing communities"):
    member_nodes = community["nodes"]
    # Build the prompt from the community's nodes and relationships
    messages = [
        {"role": "user", "content": ch07_tools.get_summarize_community_prompt(member_nodes, community["rels"])},
    ]
    # One LLM call per community
    summary = chat(messages, model="gpt-4o")
    # Pull the JSON payload out of the reply and decode it
    report = json.loads(ch07_tools.extract_json(summary))
    communities.append(
        {
            "community": report,
            "communityId": community["communityId"],
            "nodes": [el["id"] for el in member_nodes],
        }
    )

# Import all community summaries in a single query
neo4j_driver.execute_query(ch07_tools.import_community_query, data=communities)

Summarizing communities: 100%|██████████| 8/8 [01:02<00:00,  7.80s/it]


EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3125508d0>, keys=[])