In [None]:
from langchain_community.graphs import Neo4jGraph
import os
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.prompts import ChatPromptTemplate

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
# Neo4j password; os.getenv returns None when NEO_PASSWORD is not set.
NEO_PASSWORD = os.getenv("NEO_PASSWORD")

In [2]:
# Initialize knowledge graph database
# Connects to a hosted Neo4j instance with the password read from the environment.
# NOTE(review): refresh_schema = False presumably skips schema introspection at
# connect time — confirm against the Neo4jGraph documentation.
graph = Neo4jGraph(
    url = "neo4j+s://841b6bc1.databases.neo4j.io",
    username = "neo4j",
    password = NEO_PASSWORD,
    refresh_schema = False
)

  graph = Neo4jGraph(


In [3]:
import re

def clean_text(text: str) -> str:
    """Collapse each run of consecutive newlines into one and trim both ends."""
    squeezed = re.sub(r'\n{2,}', '\n', text)
    trimmed = squeezed.strip()
    return trimmed

def clean_link(text):
    """Strip leading "." and "/" characters from a link.

    Args:
        text: An href such as "../../chapter/notes.html".

    Returns:
        The link starting at its first character that is neither "." nor "/"
        (up to the end of that line), or "" when no such character exists.
    """
    match = re.search(r"[^./].*", text)
    # Inputs such as "", "./" or "../" leave nothing after the prefix is dropped;
    # re.search returns None for them, so .group() must not be called blindly.
    return match.group(0) if match else ""

In [4]:
import requests
from bs4 import BeautifulSoup
from langchain_core.documents import Document

def scrape():
    """Web scrape Stat 20 lecture notes by following each page's "next page" link.

    Starts at the first notes page and, for every page visited, stores the
    cleaned text of the main content element together with the page URL and
    title.

    Returns:
        A list of Document objects, one per page that had main content, in
        crawl order. Metadata keys are "url" and "title"; the title is "" when
        the page has no matching title element.

    Raises:
        requests.RequestException: If a page cannot be fetched or the server
            answers with an error status.
    """
    rel_url = "https://stat20.berkeley.edu/summer-2025/"
    url = "https://stat20.berkeley.edu/summer-2025/1-questions-and-data/01-understanding-the-world/notes.html"
    documents = []
    # Guards against a cycle of "next" links turning the crawl into an endless loop.
    visited = set()

    # Iterate until there is no next page
    while url not in visited:
        visited.add(url)
        # A timeout keeps a stalled server from hanging the notebook; an error
        # status is raised instead of parsing an error page as lecture content.
        response = requests.get(url, timeout = 30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        main = soup.find("main", id = "quarto-document-content")
        header = soup.find("header", id ="quarto-header")

        # Recomputed for every page so that a page without a main element or
        # title never inherits values from the previously visited page.
        content = clean_text(main.get_text()) if main else ""
        title = ""
        if header:
            title_tag = header.find("h1", class_ = "quarto-secondary-nav-title no-breadcrumbs")
            if title_tag:
                title = title_tag.get_text()

        if content:
            documents.append(
                Document(
                    page_content = content,
                    metadata = {
                        "url": url,
                        "title": title
                    }
                )
            )

        next_page = soup.find("div", class_ = "nav-page nav-page-next")
        next_link = next_page.find("a", href = True) if next_page else None
        if next_link is None:
            break
        # The href is relative to the current page; clean_link drops the leading
        # "../" segments so the remainder can be appended to the site root.
        url = rel_url + clean_link(next_link["href"])

    return documents

In [5]:
# Define system prompt
# The adjacent string literals below are concatenated into one prompt, so every
# bullet or heading needs its own trailing "\n" and every wrapped sentence needs
# a trailing space to avoid running into the next fragment.
system_prompt = (
    "# Knowledge Graph Instructions for GPT-4\n"
    "## 1. Overview\n"
    "You are a top-tier algorithm designed for extracting information in structured "
    "formats to build a knowledge graph for an educational learning platform.\n"
    "Try to capture as much information from the text as possible without "
    "sacrificing accuracy. Do not add any information that is not explicitly "
    "mentioned in the text.\n"
    "- **Nodes** represent entities and concepts.\n"
    "- The aim is to achieve simplicity and clarity in the knowledge graph, making it\n"
    "helpful for a student who is reviewing for an exam.\n"
    "## 2. Labeling Nodes\n"
    "- **Consistency**: Ensure you use available types for node labels.\n"
    "Ensure you use basic or elementary types for node labels.\n"
    "- For example, when you identify an entity representing a person, "
    "always label it as **'person'**. Avoid using more specific terms "
    "like 'mathematician' or 'scientist'.\n"
    "- **Node IDs**: Never utilize integers as node IDs. Node IDs should be "
    "names or human-readable identifiers found in the text.\n"
    "- **Relationships** represent connections between entities or concepts.\n"
    "Ensure consistency and generality in relationship types when constructing "
    "knowledge graphs. Instead of using specific and momentary types "
    "such as 'BECAME_PROFESSOR', use more general and timeless relationship types "
    "like 'PROFESSOR'. Make sure to use general and timeless relationship types!\n"
    "## 3. Coreference Resolution\n"
    "- **Maintain Entity Consistency**: When extracting entities, it's vital to "
    "ensure consistency.\n"
    'If an entity, such as "John Doe", is mentioned multiple times in the text '
    'but is referred to by different names or pronouns (e.g., "Joe", "he"), '
    "always use the most complete identifier for that entity throughout the "
    'knowledge graph. In this example, use "John Doe" as the entity ID.\n'
    "Remember, the knowledge graph should be coherent and easily understandable, "
    "so maintaining consistency in entity references is crucial.\n"
    "## 4. Strict Compliance\n"
    "Adhere to the rules strictly. Non-compliance will result in termination."
)

In [6]:
def get_prompt(subject: str = "") -> ChatPromptTemplate:
    """Build the extraction prompt for a given subject.

    Args:
        subject: Subject area the extracted entities must be relevant to. It is
            inserted verbatim into the human message.

    Returns:
        A chat prompt made of the shared system prompt and a human message
        whose only template variable is ``{input}``.
    """
    # The subject becomes part of the template text, so literal braces are
    # doubled; otherwise they would be parsed as template placeholders.
    safe_subject = subject.replace("{", "{{").replace("}", "}}")
    return ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            (
                "human",
                "Ensure that the entities and concepts extracted are relevant material with respect to the following subject: " + safe_subject + ". They should not be teachers, course names, syllabus, or miscellaneous references. "
                "Tip: Make sure to answer in the correct format and do "
                "not include any explanations. "
                "Use the given format to extract information from the "
                "following input: {input}"
            )
        ]
    )

In [7]:
# Initialize the chat model and the transformer that turns documents into graph documents
# The recorded output of the conversion cell shows node types as 'Mathconcept' and
# relationship types as 'INCLUSION', i.e. not in the exact casing listed here.
llm = init_chat_model("gpt-4o-mini", model_provider = "openai")
llm_transformer = LLMGraphTransformer(
    llm = llm, 
    allowed_nodes = ["MathTheorem", "MathConcept"],
    allowed_relationships = ["Prerequisite", "Inclusion"],
    node_properties = ["definition"],
    prompt = get_prompt("statistics")
)

In [8]:
# Crawl the lecture notes (one HTTP request per page visited by scrape).
documents = scrape()

In [9]:
# Retrieve nodes and relationships from documents
# Only the first scraped document is converted here. The bare top-level `await`
# works in a notebook cell but is not valid in a plain Python script.
graph_documents = await llm_transformer.aconvert_to_graph_documents([documents[0]])
print(f"Nodes:{graph_documents[0].nodes}")
print(f"Relationships:{graph_documents[0].relationships}")

Nodes:[Node(id='Types Of Claims', type='Mathconcept', properties={}), Node(id='Summary', type='Mathconcept', properties={}), Node(id='Generalization', type='Mathconcept', properties={}), Node(id='Causal Claim', type='Mathconcept', properties={}), Node(id='Prediction', type='Mathconcept', properties={})]
Relationships:[Relationship(source=Node(id='Types Of Claims', type='Mathconcept', properties={}), target=Node(id='Summary', type='Mathconcept', properties={}), type='INCLUSION', properties={}), Relationship(source=Node(id='Types Of Claims', type='Mathconcept', properties={}), target=Node(id='Generalization', type='Mathconcept', properties={}), type='INCLUSION', properties={}), Relationship(source=Node(id='Types Of Claims', type='Mathconcept', properties={}), target=Node(id='Causal Claim', type='Mathconcept', properties={}), type='INCLUSION', properties={}), Relationship(source=Node(id='Types Of Claims', type='Mathconcept', properties={}), target=Node(id='Prediction', type='Mathconcept',

In [10]:
# Keep track of node sources
# Maps each node id to the sources of every graph document that contains it.
node_to_docs = {}

for doc in graph_documents:
    for node in doc.nodes:
        # setdefault creates the list on first sight of an id, then appends.
        node_to_docs.setdefault(node.id, []).append(doc.source)

In [11]:
# Create embeddings
from langchain.embeddings import OpenAIEmbeddings
# Shared embedding client used by get_embedding.
embedder = OpenAIEmbeddings(model="text-embedding-3-small")

def get_embedding(text: str):
    """Return the embedding that the shared `embedder` produces for `text`."""
    vector = embedder.embed_query(text)
    return vector

  embedder = OpenAIEmbeddings(model="text-embedding-3-small")


In [12]:
# Delete existing nodes and relationships
# WARNING: destructive — removes every node in the connected database, and
# DETACH also removes all relationships attached to them.
graph.query("MATCH (n) DETACH DELETE n")

[]

In [13]:
# Add documents to graph database, including document source
# NOTE(review): include_source presumably stores each source document as a node
# linked to its entities via MENTIONS (the relationship get_paths traverses) — confirm.
graph.add_graph_documents(graph_documents, include_source = True, baseEntityLabel = False)

In [14]:
# Add embeddings to each node
# A node id can appear in several graph documents. Each id is embedded and
# written once, so the embedding API and the database are not hit repeatedly
# for the same node.
embedded_ids = set()
for doc in graph_documents:
    for node in doc.nodes:
        if node.id in embedded_ids:
            continue
        embedded_ids.add(node.id)
        embedding = get_embedding(node.id)
        # The MATCH has no label, so every node carrying this id receives the embedding.
        graph.query(
            """
            MATCH (n {id: $id})
            SET n.embedding = $embedding
            """,
            params={"id": node.id, "embedding": embedding}
        )

In [26]:
# Add vector indices on nodes
# One cosine-similarity index per node label over the `embedding` property.
# The labels use the casing seen in the extracted graph output ('Mathconcept').
# NOTE(review): 1536 is presumably the vector size returned by the
# "text-embedding-3-small" embedder — confirm if the embedding model changes.
# IF NOT EXISTS makes re-running this cell safe.
graph.query(
    """
    CREATE VECTOR INDEX $name IF NOT EXISTS
    FOR (m:Mathconcept)
    ON m.embedding
    OPTIONS { indexConfig: {
    `vector.dimensions`: 1536,
    `vector.similarity_function`: 'cosine'
    }}
    """, params = {"name": "mathconcept-embedding"}
)

# Same index definition for theorem nodes; get_node selects between the two by name.
graph.query(
    """
    CREATE VECTOR INDEX $name IF NOT EXISTS
    FOR (m:Maththeorem)
    ON m.embedding
    OPTIONS { indexConfig: {
    `vector.dimensions`: 1536,
    `vector.similarity_function`: 'cosine'
    }}
    """, params = {"name": "maththeorem-embedding"}
)

[#F170]  _: <CONNECTION> error: Failed to read from defunct connection ResolvedIPv4Address(('34.28.184.63', 7687)) (ResolvedIPv4Address(('34.28.184.63', 7687))): ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)
Unable to retrieve routing information
Transaction failed and will be retried in 0.941912047534422s (Unable to retrieve routing information)
[#F172]  _: <CONNECTION> error: Failed to read from defunct connection IPv4Address(('si-841b6bc1-3008.production-orch-0696.neo4j.io', 7687)) (ResolvedIPv4Address(('34.28.184.63', 7687))): ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)
Transaction failed and will be retried in 1.673323521287367s (Failed to read from defunct connection IPv4Address(('si-841b6bc1-3008.production-orch-0696.neo4j.io', 7687)) (ResolvedIPv4Address(('34.28.184.63', 7687))))


[]

### Workflow: retrieve a concept, find all of its prerequisites, and create a roadmap for learning it

In [None]:
def get_node(object_type: str, name: str):
    """Find the graph nodes whose embeddings are closest to a concept or theorem name.

    Args:
        object_type: "concept" searches the "mathconcept-embedding" index; any
            other value searches the "maththeorem-embedding" index.
        name: Text to embed and look up.

    Returns:
        The rows returned by graph.query, each holding `node.id` and `score`,
        for up to five nearest nodes.
    """
    # Embed the caller-supplied name (the original referenced an undefined `concept`).
    query_embedding = get_embedding(name)
    index_name = "mathconcept-embedding" if object_type == "concept" else "maththeorem-embedding"
    return graph.query(
        """
        WITH $embedding AS embedding
        CALL db.index.vector.queryNodes($name, 5, embedding)
        RETURN node.id, score
        """, 
        params={"embedding": query_embedding, "name": index_name}
    )

In [131]:
# Make sure to exclude cycles!
def get_paths(node_id: str) -> list:
    """Get all paths that end at a given node.

    Follows PREREQUISITE, INCLUSION and MENTIONS relationships of any length
    towards the node whose `id` property equals `node_id`.

    Args:
        node_id: Value of the target node's `id` property.

    Returns:
        The rows returned by graph.query, one per matching path, under the key "path".
    """
    # The id is passed as a query parameter rather than interpolated into the
    # Cypher text, so ids containing quotes cannot break or alter the query.
    return graph.query(
        """
        MATCH path = (start)-[:PREREQUISITE|INCLUSION|MENTIONS*]->(end {id: $node_id})
        RETURN path
        """,
        params={"node_id": node_id}
    )

In [None]:
def construct_explanation(node_id: str) -> str:
    """
    Construct an explanation for the concept associated
    with the given node using that node's source document

    Not implemented yet: the function currently only looks up the recorded
    source documents for the node and then raises.

    Raises:
        NotImplementedError: Always, until the explanation step is written.
    """
    # node_to_docs maps a node id to the sources of the documents containing it.
    source_docs = node_to_docs.get(node_id, [])
    # TODO: build the explanation text from source_docs.
    raise NotImplementedError(
        f"construct_explanation is not implemented yet "
        f"({len(source_docs)} source document(s) recorded for {node_id!r})"
    )

In [62]:
def normalize_path(path: list) -> tuple[tuple]:
    """
    Turn an alternating node/relationship sequence into a tuple of edges.

    Each edge has the form (source node id, target node id, relationship)
    e.g. [{"id" : "Distribution"}, "INCLUSION", {"id" : "Spread"}] -> (("Distribution", "Spread", "INCLUSION"),)
    """
    # Nodes sit at even positions and the relationship joining two nodes sits between them.
    return tuple(
        (path[pos]["id"], path[pos + 2]["id"], path[pos + 1])
        for pos in range(0, len(path) - 2, 2)
    )

In [104]:
def get_subpaths(path: tuple[tuple]) -> set[tuple[tuple]]:
    """Return every non-empty contiguous run of edges in `path`, including `path` itself."""
    edge_count = len(path)
    return {
        tuple(path[first:last])
        for first in range(edge_count)
        for last in range(first + 1, edge_count + 1)
    }

In [72]:
def is_subpath(path: tuple[tuple], sub_paths: set[tuple[tuple]]) -> bool:
    """Tell whether `path` is a member of the known `sub_paths` collection."""
    found = path in sub_paths
    return found

In [121]:
def remove_subpaths(paths: list[dict]) -> list[tuple]:
    """Drop every path that is a contiguous part of a longer (or identical, earlier) path.

    `paths` holds records with a "path" entry in alternating node/relationship
    form; the result is the list of surviving paths in normalized edge form,
    longest first.
    """
    normalized = [normalize_path(tuple(record["path"])) for record in paths]

    # Longest paths go first so each shorter path is checked against the
    # subpaths of everything that could contain it.
    longest_first = sorted(normalized, key = len, reverse = True)

    seen_subpaths = set()
    kept = []
    for candidate in longest_first:
        if is_subpath(candidate, seen_subpaths):
            continue
        kept.append(candidate)
        seen_subpaths.update(get_subpaths(candidate))

    return kept

In [122]:
# Sample input in the shape returned by get_paths: the first path is the tail of
# the second, so only the longer path is expected to remain.
paths = [{'path': [{'id': 'Distributioncharacteristics'},
   'INCLUSION',
   {'id': 'Spread'}]},
 {'path': [{'id': 'Summarycharacteristics'},
   'INCLUSION',
   {'id': 'Distributioncharacteristics'},
   'INCLUSION',
   {'id': 'Spread'}]}]
remove_subpaths(paths)

[(('Summarycharacteristics', 'Distributioncharacteristics', 'INCLUSION'),
  ('Distributioncharacteristics', 'Spread', 'INCLUSION'))]

In [33]:
# Example lookup against the live database: all paths ending at the "Spread" node.
get_paths("Spread")

[#D706]  _: <CONNECTION> error: Failed to read from defunct connection ResolvedIPv4Address(('34.28.184.63', 7687)) (ResolvedIPv4Address(('34.28.184.63', 7687))): ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)
Unable to retrieve routing information
Transaction failed and will be retried in 1.1643805523366217s (Unable to retrieve routing information)
[#D6AE]  _: <CONNECTION> error: Failed to read from defunct connection IPv4Address(('si-841b6bc1-3008.production-orch-0696.neo4j.io', 7687)) (ResolvedIPv4Address(('34.28.184.63', 7687))): ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)
Transaction failed and will be retried in 2.3144265490046787s (Failed to read from defunct connection IPv4Address(('si-841b6bc1-3008.production-orch-0696.neo4j.io', 7687)) (ResolvedIPv4Address(('34.28.184.63', 7687))))


[{'path': [{'id': 'Distributioncharacteristics'},
   'INCLUSION',
   {'id': 'Spread'}]},
 {'path': [{'id': 'Summarycharacteristics'},
   'INCLUSION',
   {'id': 'Distributioncharacteristics'},
   'INCLUSION',
   {'id': 'Spread'}]}]

In [None]:
# Ultimate goal: given any concept I need help with, create a study plan outlining what I need to do to learn that concept.
# Question 1: Given an arbitrary concept, how can I locate the corresponding node in the knowledge graph?
# Question 2: Given the node of a concept, how can I trace back through all of its prerequisites/neighbors?