In [4]:
from dotenv import load_dotenv
import os
from create_kb import create_kb_from_file
from langchain.chat_models import init_chat_model
from langgraph.graph import START, StateGraph
from typing_extensions import TypedDict
from dsrag.knowledge_base import KnowledgeBase

# Load the .env file *before* building the model: the OpenAI client expects its
# API key to be available in the environment at construction time, and on a
# fresh kernel the separate load_dotenv() cell has not run yet at this point.
# Calling load_dotenv() more than once is harmless.
load_dotenv()
llm = init_chat_model("gpt-4o-mini", model_provider="openai")

In [5]:
# Load environment variables from a local .env file into the process environment
# (presumably the OpenAI API key used by the chat model; verify against your .env).
load_dotenv()

True

In [6]:
# prompt
# Pull the "rlm/rag-prompt" prompt from the LangChain Hub. `generate` invokes it
# with a "question" and a "context" input.
from langchain import hub
prompt = hub.pull("rlm/rag-prompt")

In [7]:
# Define state for application
class State(TypedDict):
    """State dictionary passed between the graph steps `retrieve` and `generate`."""
    # Knowledge base that `retrieve` queries.
    kb: KnowledgeBase
    # User question; used as the single search query and passed to the prompt.
    question: str
    # Results returned by `kb.query`; `generate` reads each item's "content" key.
    context: list[dict]
    # Content of the LLM response, written by `generate`.
    answer: str

In [8]:
# Define application steps
def retrieve(state: State):
    """Query the state's knowledge base with the question.

    The question is wrapped in a one-element list and handed to
    ``state["kb"].query``; whatever that call returns is stored under
    the "context" key of the returned state update.
    """
    question = state["question"]
    hits = state["kb"].query([question])
    return {"context": hits}

def generate(state: State):
    """Answer the question from the retrieved context.

    Joins the "content" of every context item with blank lines, fills the
    module-level ``prompt`` with the question and that text, sends the
    resulting messages to the module-level ``llm`` and returns the reply's
    content under the "answer" key.
    """
    passages = [doc["content"] for doc in state["context"]]
    prompt_inputs = {
        "question": state["question"],
        "context": "\n\n".join(passages),
    }
    reply = llm.invoke(prompt.invoke(prompt_inputs))
    return {"answer": reply.content}

In [9]:
# Compile application and test
# Build the two-step pipeline: START -> retrieve -> generate.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])  # chains retrieve -> generate
graph_builder.add_edge(START, "retrieve")         # entry point of the graph
graph = graph_builder.compile()

### Stress test

In [10]:
# Transcript files used for the stress test, one entry per file.
transcripts = [
    "cs168.txt",
    "econ136.txt",
    "mcb130.txt",
    "music139.txt",
    "philosophy25b.txt",
    "physics7a.txt",
    "stat20.txt",
]
# Filled in later: transcript filename -> list of generated queries.
transcript_to_queries = dict()

In [11]:
# System prompt for query generation. The {max_queries} placeholder is filled in
# by gen_queries via prompt_template. The prompt asks for a Python-style list of
# strings because gen_queries parses the model's reply with ast.literal_eval.
SYSTEM_MESSAGE = """
You are a query generation system. Please generate one or more search queries (up to a maximum of {max_queries}) that can be answered based on the provided section title and content. DO NOT generate the answer, just queries.

Each of the queries you generate will be used to search a knowledge base for information that can be used to respond to the user input. Make sure each query is specific enough to return relevant information. If multiple pieces of information would be useful, you should generate multiple queries, one for each specific piece of information needed.

Return a list of queries formatted as follows: [query1, query2]

Example output: ["Who is Napoleon?", "Where did Napoleon conquer?"]
""".strip()

from langchain_core.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)

# Chat prompt for query generation: the system message carries the instructions
# (with the {max_queries} placeholder) and the human message carries the
# {section_title} and {section_content} of the section to generate queries for.
prompt_template = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(
        SYSTEM_MESSAGE
    ),
    HumanMessagePromptTemplate.from_template(
        """
        Section title: {section_title}
        Section content: {section_content}
        """
    ),
])

from dsrag.dsparse.sectioning_and_chunking.semantic_sectioning import get_sections_from_str
import ast
# generate important queries for a section
def gen_queries(title: str, content: str, max_queries: int = 3) -> list[str]:
    """Ask the LLM for search queries that one section can answer.

    Args:
        title: Section title, substituted into the human message.
        content: Section text, substituted into the human message.
        max_queries: Upper bound on the number of queries. It is passed to the
            system prompt and also enforced on the parsed result.

    Returns:
        A list of at most ``max_queries`` query strings.

    Raises:
        ValueError: If the model's reply does not contain a list of strings
            that ``ast.literal_eval`` can parse. The raw reply is included in
            the message so a failure in a long run can be diagnosed.
    """
    # a) Fill in the template and turn it into LangChain messages
    messages = prompt_template.format_prompt(
        section_title=title,
        section_content=content,
        max_queries=max_queries,
    ).to_messages()
    # b) Ask the model. Use .invoke(), as generate() does, instead of calling
    #    the model object directly.
    text = llm.invoke(messages).content
    # c) Keep only the outermost [...] so that code fences or extra prose
    #    around the list do not break parsing.
    start = text.find("[")
    end = text.rfind("]")
    if start == -1 or end < start:
        raise ValueError(f"No list found in model output: {text!r}")
    try:
        parsed = ast.literal_eval(text[start:end + 1])
    except (ValueError, SyntaxError) as exc:
        raise ValueError(f"Could not parse model output as a list: {text!r}") from exc
    # d) Validate the shape the prompt asked for: a list of strings
    if not isinstance(parsed, list) or not all(isinstance(q, str) for q in parsed):
        raise ValueError(f"Model output is not a list of strings: {text!r}")
    # e) The prompt only *asks* for at most max_queries; enforce it here
    return parsed[:max_queries]

In [None]:
# For every transcript: read the file, split it into sections, generate
# queries for each section and store them under the transcript's filename.
for transcript in transcripts:
    with open(transcript, encoding="utf-8-sig") as f:
        raw = f.read()

    # Arguments for the sectioning call, built fresh for each transcript.
    sectioning_kwargs = {
        "document": raw,
        "max_characters_per_window": 20000,  # how large each LLM window is
        "semantic_sectioning_config": {
            "use_semantic_sectioning": True,
            "llm_provider": "openai",
            "model": "gpt-4o-mini",
            "language": "en",
        },
        "chunking_config": {},
        "kb_id": "my_kb",
        "doc_id": "my_doc",
    }
    sections, lines = get_sections_from_str(**sectioning_kwargs)

    # Collect the queries of all sections of this transcript in one flat list.
    queries = []
    for section in sections:
        queries += gen_queries(section["title"], section["content"])
    transcript_to_queries[transcript] = queries

In [39]:
# generate a response for each query
def gen_responses(transcript_index: int, sample: list[str]):
    """Run every query in ``sample`` through the compiled graph.

    Each query is answered against ``kbs[transcript_index]``. The result is
    a dict mapping each query to the "answer" field of the graph's output.
    """
    return {
        query: graph.invoke({
            "question": query,
            "kb": kbs[transcript_index],
        })["answer"]
        for query in sample
    }

In [41]:
import random
def gen_sample(transcript: str, n: int = 10):
    """Randomly pick up to ``n`` of the queries generated for ``transcript``.

    Args:
        transcript: Key into ``transcript_to_queries``.
        n: Desired sample size.

    Returns:
        A list of queries sampled without replacement. If fewer than ``n``
        queries are available, all of them are returned (in random order).

    Raises:
        KeyError: If no queries were generated for ``transcript``.
    """
    pool = transcript_to_queries[transcript]
    # random.sample raises ValueError when asked for more items than the
    # population holds, so clamp the size for transcripts with few queries.
    return random.sample(pool, min(n, len(pool)))

In [43]:
# create knowledge base from transcript
kbs = []
samples = {}
for transcript in transcripts:
    # Use the filename without its extension as the KB id ("cs168.txt" -> "cs168").
    # splitext works for an extension of any length, unlike slicing off the
    # last four characters.
    kb_id = os.path.splitext(transcript)[0]
    kbs.append(create_kb_from_file(kb_id, transcript))
    samples[transcript] = gen_sample(transcript)

# kbs[i] was built from transcripts[i], so responses[i] holds the answers for
# transcripts[i] (gen_responses looks the knowledge base up by that index).
responses = []
for i, transcript in enumerate(transcripts):
    res = gen_responses(i, samples[transcript])
    responses.append(res)

Creating KB with id cs168...
Creating KB with id econ136...
Creating KB with id mcb130...
Creating KB with id music139...
Creating KB with id philosophy25b...
Creating KB with id physics7a...
Creating KB with id stat20...


In [44]:
# Inspect the results: one {query: answer} dict per transcript, in `transcripts` order.
responses

[{'How are optical fibers laid underwater?': 'Optical fibers are laid underwater using specialized cable-laying ships that carefully deploy the cables along the seabed. The process involves surveying the seabed to select the optimal route and then laying the fiber optic cables in controlled sections. Additionally, the cables are often buried in the seabed to protect them from environmental conditions and potential damage.',
  'What are loops in network routing?': 'Loops in network routing refer to situations where a packet continuously cycles through the same set of nodes without reaching its intended destination. This occurs when routers are unable to forward the packet to its destination due to a lack of appropriate paths, leading to the packet repeatedly returning to the same routers. To maintain valid routing, it is essential to eliminate such loops, as they prevent successful delivery of packets.',
  'How do participants determine their magic number in the experiment?': 'Participa

In [50]:
# Flatten the per-transcript {query: answer} dicts into two parallel lists.
questions, answers = [], []
for response in responses:
    questions += response.keys()
    answers += response.values()

In [53]:
import pandas as pd
# One row per question/answer pair; the two lists are parallel.
data = {"Question": questions, "LLM Answer": answers}
df = pd.DataFrame(data=data)

In [54]:
# Preview the first rows of the question/answer table.
df.head()

Unnamed: 0,Question,LLM Answer
0,How are optical fibers laid underwater?,Optical fibers are laid underwater using speci...
1,What are loops in network routing?,Loops in network routing refer to situations w...
2,How do participants determine their magic numb...,Participants determine their magic number by s...
3,What is the role of routers in managing networ...,Routers manage network bandwidth by acting as ...
4,What is the role of a router?,The role of a router is to act as an intermedi...


In [56]:
# Save the question/answer table to an Excel file in the working directory.
# Note: with the default index=True the row index is written as the first column.
df.to_excel("results.xlsx")