In [2]:
from langchain_community.graphs import Neo4jGraph
import os
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.prompts import ChatPromptTemplate

load_dotenv()
NEO_PASSWORD = os.getenv("NEO_PASSWORD")

In [3]:
# Initialize knowledge graph database
graph = Neo4jGraph(
    url = "neo4j+s://841b6bc1.databases.neo4j.io",
    username = "neo4j",
    password = NEO_PASSWORD,
    refresh_schema = False
)

  graph = Neo4jGraph(


In [None]:
import re

def clean_text(text: str) -> str:
    """Remove extra new lines and whitespace"""
    text = re.sub(r'\n{2,}', '\n', text)
    return text.strip()

def clean_link(text):
    """Get relative link"""
    text = re.search(r"[^./].*", text).group(0)
    return text

In [28]:
import requests
from bs4 import BeautifulSoup
from langchain_core.documents import Document

def scrape():
    """Web scrape Stat 20 lecture notes by retrieving content and next page links"""
    rel_url = "https://stat20.berkeley.edu/summer-2025/"
    url = "https://stat20.berkeley.edu/summer-2025/1-questions-and-data/01-understanding-the-world/notes.html" 
    content = "lecture content"
    documents = []

    # Iterate until there is no next page
    while True:
        response = requests.get(url)
        html_content = response.content
        soup = BeautifulSoup(html_content, "html.parser")
        main = soup.find("main", id = "quarto-document-content")
        header = soup.find("header", id ="quarto-header")

        if main:  
            content = clean_text(main.get_text())

        if header:
            title = header.find("h1", class_ = "quarto-secondary-nav-title no-breadcrumbs").get_text()

        if content:
            documents.append(
                Document(
                    page_content = content,
                    metadata = {
                        "url": url,
                        "title": title
                    }
                )
            )

        next_page = soup.find("div", class_ = "nav-page nav-page-next")
        if next_page:
            url = rel_url + clean_link(next_page.find("a", href = True)["href"])
        else:
            break
        
    return documents

In [7]:
# Define system prompt
system_prompt = (
    "# Knowledge Graph Instructions for GPT-4\n"
    "## 1. Overview\n"
    "You are a top-tier algorithm designed for extracting information in structured "
    "formats to build a knowledge graph for an educational learning platform.\n"
    "Try to capture as much information from the text as possible without "
    "sacrificing accuracy. Do not add any information that is not explicitly "
    "mentioned in the text.\n"
    "- **Nodes** represent entities and concepts.\n"
    "- The aim is to achieve simplicity and clarity in the knowledge graph, making it\n"
    "helpful for a student who is reviewing for an exam.\n"
    "## 2. Labeling Nodes\n"
    "- **Consistency**: Ensure you use available types for node labels.\n"
    "Ensure you use basic or elementary types for node labels.\n"
    "- For example, when you identify an entity representing a person, "
    "always label it as **'person'**. Avoid using more specific terms "
    "like 'mathematician' or 'scientist'."
    "- **Node IDs**: Never utilize integers as node IDs. Node IDs should be "
    "names or human-readable identifiers found in the text.\n"
    "- **Relationships** represent connections between entities or concepts.\n"
    "Ensure consistency and generality in relationship types when constructing "
    "knowledge graphs. Instead of using specific and momentary types "
    "such as 'BECAME_PROFESSOR', use more general and timeless relationship types "
    "like 'PROFESSOR'. Make sure to use general and timeless relationship types!\n"
    "## 3. Coreference Resolution\n"
    "- **Maintain Entity Consistency**: When extracting entities, it's vital to "
    "ensure consistency.\n"
    'If an entity, such as "John Doe", is mentioned multiple times in the text '
    'but is referred to by different names or pronouns (e.g., "Joe", "he"),'
    "always use the most complete identifier for that entity throughout the "
    'knowledge graph. In this example, use "John Doe" as the entity ID.\n'
    "Remember, the knowledge graph should be coherent and easily understandable, "
    "so maintaining consistency in entity references is crucial.\n"
    "## 4. Strict Compliance\n"
    "Adhere to the rules strictly. Non-compliance will result in termination."
)

In [None]:
def get_prompt(subject: str = "") -> ChatPromptTemplate:
    """Define prompt template based on subject and user input"""
    return ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            (
                "human",
                "Ensure that the entities and concepts extracted are relevant material with respect to the following subject: " + subject + ". They should not be teachers, course names, or syllabus"
                "Tip: Make sure to answer in the correct format and do "
                "not include any explanations. "
                "Use the given format to extract information from the "
                "following input: {input}"
            )
        ]
    )

In [15]:
# Initialize llm and knowledge graph database
llm = init_chat_model("gpt-4o-mini", model_provider = "openai")
llm_transformer = LLMGraphTransformer(
    llm = llm, 
    allowed_nodes = ["MathObject, MathTheorem", "MathConcept"],
    allowed_relationships = ["Prerequisite", "Inclusion"],
    node_properties = ["definition"],
    prompt = get_prompt("statistics")
)

In [31]:
# Retrieve nodes and relationships from documents
documents = scrape()
graph_documents = await llm_transformer.aconvert_to_graph_documents(documents)
print(f"Nodes:{graph_documents[0].nodes}")
print(f"Relationships:{graph_documents[0].relationships}")

Nodes:[Node(id='Stat 20', type='Mathconcept', properties={}), Node(id='Summary', type='Mathconcept', properties={}), Node(id='Generalization', type='Mathconcept', properties={}), Node(id='Causal Claim', type='Mathconcept', properties={}), Node(id='Prediction', type='Mathconcept', properties={}), Node(id='Proportion Of Respondents', type='Mathobject, maththeorem', properties={'definition': "Prediction about Uber's stock price going up 1.2% tomorrow."})]
Relationships:[Relationship(source=Node(id='Stat 20', type='Mathconcept', properties={}), target=Node(id='Summary', type='Mathconcept', properties={}), type='INCLUSION', properties={}), Relationship(source=Node(id='Stat 20', type='Mathconcept', properties={}), target=Node(id='Generalization', type='Mathconcept', properties={}), type='INCLUSION', properties={}), Relationship(source=Node(id='Stat 20', type='Mathconcept', properties={}), target=Node(id='Causal Claim', type='Mathconcept', properties={}), type='INCLUSION', properties={}), Rel

In [None]:
# Delete existing nodes and relationships
graph.query("MATCH (n) DETACH DELETE n")

In [None]:
# Add documents to graph database, including document source
graph.add_graph_documents(graph_documents, include_source = True, baseEntityLabel = False)

In [None]:
# Question 1: How to capture relationships between different documents?
# Question 2: How to define schema to fit educational context?