In [1]:
import os
import requests
import re
from dotenv import load_dotenv
from tutor import KnowledgeGraph
from langchain.embeddings import OpenAIEmbeddings
from bs4 import BeautifulSoup
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI

In [2]:
# Populate the process environment (python-dotenv), then read the three
# Neo4j connection settings. Any variable that is not set comes back as None.
load_dotenv()
NEO_URI = os.getenv("NEO_URI")
NEO_USERNAME = os.getenv("NEO_USERNAME")
NEO_PASSWORD = os.getenv("NEO_PASSWORD")

In [3]:
# Build the project's KnowledgeGraph wrapper from the NEO_* connection settings.
# NOTE(review): any of the three values may be None if the environment variable
# is missing — confirm how KnowledgeGraph handles that.
knowledge_graph = KnowledgeGraph(NEO_URI, NEO_USERNAME, NEO_PASSWORD)

  self.graph = Neo4jGraph(


In [4]:
def clean_text(text: str) -> str:
    """Collapse each run of two or more newlines into one and trim both ends.

    Whitespace inside a line (spaces, tabs) is left as-is; only leading and
    trailing whitespace of the whole string is stripped.
    """
    return re.sub(r'\n{2,}', '\n', text).strip()

def clean_link(text: str) -> str:
    """Strip the leading ``.`` and ``/`` characters from a relative link.

    For example ``"../../unit/notes.html"`` becomes ``"unit/notes.html"``.

    Args:
        text: The raw ``href`` value.

    Returns:
        The link starting at its first character that is neither ``.`` nor
        ``/``, up to the end of that line. An empty string is returned when
        no such character exists (e.g. ``""`` or ``"../"``) instead of
        failing on a ``None`` match.
    """
    match = re.search(r"[^./].*", text)
    # re.search yields None when the link is empty or made only of dots and
    # slashes; treat that as "nothing left after the prefix".
    return match.group(0) if match else ""

In [5]:
def scrape(
    start_url: str = "https://stat20.berkeley.edu/summer-2025/1-questions-and-data/01-understanding-the-world/notes.html",
    timeout: float = 30.0,
) -> "list[Document]":
    """Web scrape Stat 20 lecture notes by following each page's "next" link.

    Starting at ``start_url``, every page is downloaded, the text of its
    ``<main id="quarto-document-content">`` element is cleaned with
    ``clean_text`` and stored as a ``Document`` whose metadata holds the page
    URL and title. Crawling stops when a page has no next-page link or when
    the next link points to a page that was already visited.

    Args:
        start_url: First page to fetch. Defaults to the first lecture note.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        One ``Document`` per page that had non-empty main content, in crawl
        order.

    Raises:
        requests.RequestException: If a request times out, fails, or returns
            an HTTP error status.
    """
    # Function-scope import so the cell does not rely on a new top-level import.
    from urllib.parse import urljoin

    documents = []
    visited = set()
    url = start_url

    # The visited set guards against navigation links that form a cycle.
    while url not in visited:
        visited.add(url)

        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        main = soup.find("main", id="quarto-document-content")
        header = soup.find("header", id="quarto-header")

        # Recompute content and title for every page so that a page missing
        # either element never inherits values from the previous page.
        content = clean_text(main.get_text()) if main else ""

        title = ""
        if header:
            title_tag = header.find("h1", class_="quarto-secondary-nav-title no-breadcrumbs")
            if title_tag:
                title = title_tag.get_text()

        if content:
            documents.append(
                Document(
                    page_content=content,
                    metadata={
                        "url": url,
                        "title": title,
                    },
                )
            )

        next_page = soup.find("div", class_="nav-page nav-page-next")
        next_link = next_page.find("a", href=True) if next_page else None
        if not next_link:
            break

        # Resolve the href against the page it came from; this handles
        # "../.."-style, sibling-relative, root-relative and absolute links.
        url = urljoin(response.url, next_link["href"])

    return documents

In [6]:
# Run the crawl; this issues one HTTP request per lecture-notes page.
documents = scrape()

In [7]:
# Chat model shared by graph construction and question answering.
# No token limit or request timeout is set; failed calls are retried up to
# twice.
# NOTE(review): confirm the selected model accepts temperature=0 — some
# OpenAI models only allow their default temperature.
llm = ChatOpenAI(
    model="gpt-5",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [None]:
# Only the first five scraped documents are passed to build_graph here.
# NOTE(review): presumably this limits LLM cost while iterating — widen the
# slice to index the full set of notes.
knowledge_graph.build_graph(llm, documents[:5])
knowledge_graph.create_fulltext_index()

In [8]:
# Assemble the pieces used to answer questions: an embeddings client, a vector
# index built from it, an entity chain, and the final query chain that takes
# both the vector index and the entity chain.
# NOTE(review): the captured cell output points at the OpenAIEmbeddings() line,
# which looks like a warning (possibly a deprecation of the
# langchain.embeddings import path) — confirm and consider importing
# OpenAIEmbeddings from langchain_openai instead.
embeddings = OpenAIEmbeddings()
vector_index = knowledge_graph.create_vector_index(embeddings)
entity_chain = knowledge_graph.create_entity_chain(llm)
chain = knowledge_graph.query_chain(llm, vector_index, entity_chain)

  embeddings = OpenAIEmbeddings()


In [None]:
def conversation():
    """Run an interactive question/answer loop until the user types 'q'.

    Each question is echoed, sent to ``knowledge_graph.ask_query`` together
    with the module-level ``llm`` and ``chain``, and the response is printed.
    The quit check ignores case and surrounding whitespace.

    Implemented as a loop rather than self-recursion so that a long session
    cannot exhaust the interpreter's recursion limit.
    """
    while True:
        question = input("Ask a question about Stat 20 (type 'q' to exit): ")
        print(f"Question: {question}")
        if question.strip().lower() == "q":
            break
        response = knowledge_graph.ask_query(llm, chain, question)
        print(f"Response: {response}")

In [None]:
# Start the interactive session; this blocks on input() until 'q' is entered.
conversation()