In [1]:
import os
import pandas as pd
import random
from dotenv import load_dotenv
from create_kb import create_kb_from_file
from langchain.chat_models import init_chat_model
from langgraph.graph import START, StateGraph
from typing_extensions import TypedDict
from dsrag.knowledge_base import KnowledgeBase
from langchain import hub
from langchain_core.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [None]:
# Chat model shared by generate() (answering) and gen_queries() (query generation).
llm = init_chat_model("gpt-4o-mini", model_provider="openai")
# RAG prompt pulled from the LangChain hub; generate() invokes it with "question" and "context".
prompt = hub.pull("rlm/rag-prompt")

In [7]:
# Define state for application
class State(TypedDict):
    """State dictionary passed between the `retrieve` and `generate` graph steps."""
    # Knowledge base that `retrieve` queries.
    kb: KnowledgeBase
    # Question to answer; supplied when the graph is invoked.
    question: str
    # Results written by `retrieve`; `generate` reads each item's "content" key.
    context: list[dict]
    # Answer text written by `generate`.
    answer: str

In [8]:
# Define application steps
def retrieve(state: State):
    """Query the state's knowledge base with the state's question.

    Returns a partial state update whose "context" entry holds whatever
    `kb.query` returns for the single-element query list.
    """
    question = state["question"]
    hits = state["kb"].query([question])
    return {"context": hits}

def generate(state: State):
    """Answer the question from the retrieved context.

    Joins the "content" of every context item with blank lines, fills the
    RAG prompt with that text and the question, and asks the chat model.
    Returns a partial state update containing the reply text under "answer".
    """
    context_text = "\n\n".join([item["content"] for item in state["context"]])
    prompt_input = {"question": state["question"], "context": context_text}
    reply = llm.invoke(prompt.invoke(prompt_input))
    return {"answer": reply.content}

In [9]:
# Build and compile the two-step application graph: retrieve -> generate.
# The steps are registered as a sequence, then START is wired to the node
# named "retrieve" so execution begins with retrieval.
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

### Stress test

In [10]:
# Transcript files used for the stress test. They are opened by these relative
# paths later on, so they must be reachable from the working directory.
transcripts = [
    "cs168.txt", "econ136.txt", "mcb130.txt", "music139.txt", 
    "philosophy25b.txt", "physics7a.txt", "stat20.txt"
]
# Filled in by the sectioning loop: transcript filename -> list of generated queries.
transcript_to_queries = {}

In [None]:
# System instructions for query generation. {max_queries} is a template
# placeholder filled in by gen_queries(); the reply is expected to be a
# Python-style list literal because gen_queries() parses it with ast.literal_eval.
SYSTEM_MESSAGE = """
You are a query generation system. Please generate one or more search queries (up to a maximum of {max_queries}) that can be answered based on the provided section title and content. DO NOT generate the answer, just queries.

Each of the queries you generate will be used to search a knowledge base for information that can be used to respond to the user input. Make sure each query is specific enough to return relevant information. If multiple pieces of information would be useful, you should generate multiple queries, one for each specific piece of information needed.

Return a list of queries formatted as follows: [query1, query2]

Example output: ["Who is Napoleon?", "Where did Napoleon conquer?"]
""".strip()

# Two-message chat prompt: the system instructions above plus a human message
# carrying the section's title and content ({section_title}, {section_content}).
prompt_template = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(
        SYSTEM_MESSAGE
    ),
    HumanMessagePromptTemplate.from_template(
        """
        Section title: {section_title}
        Section content: {section_content}
        """
    ),
])

from dsrag.dsparse.sectioning_and_chunking.semantic_sectioning import get_sections_from_str
import ast
# generate important queries for a section
def gen_queries(title: str, content: str, max_queries: int = 3) -> list[str]:
    """Ask the LLM for search queries that one transcript section can answer.

    Args:
        title: Section title inserted into the prompt.
        content: Section text inserted into the prompt.
        max_queries: Upper bound on the number of queries; it is passed to the
            prompt and also enforced on the parsed result.

    Returns:
        A list of at most `max_queries` non-empty query strings. The list is
        empty (and a warning is issued) when the model's reply does not
        contain a parsable Python-style list.
    """
    import warnings

    # Fill in the template and convert it to LangChain messages.
    messages = prompt_template.format_prompt(
        section_title=title,
        section_content=content,
        max_queries=max_queries,
    ).to_messages()

    # Use .invoke(), as generate() does; calling the model object directly is
    # the deprecated LangChain entry point.
    reply = llm.invoke(messages).content

    # The prompt asks for a bare list literal, but replies can arrive wrapped
    # in a code fence or surrounded by prose, so isolate the outermost [...].
    start, end = reply.find("["), reply.rfind("]")
    parsed = None
    if start != -1 and end > start:
        try:
            parsed = ast.literal_eval(reply[start:end + 1])
        except (ValueError, SyntaxError):
            parsed = None

    if not isinstance(parsed, (list, tuple)):
        # Skip this section instead of aborting the caller's whole loop.
        warnings.warn(f"gen_queries: could not parse model reply for section {title!r}")
        return []

    queries = [q.strip() for q in parsed if isinstance(q, str) and q.strip()]
    # The model is only asked to respect the limit; enforce it here.
    return queries[:max_queries]

In [None]:
# For every transcript: read the file, split it into semantic sections, and
# collect the LLM-generated queries for all of its sections.
for transcript in transcripts:
    # "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark if present.
    with open(transcript, encoding = "utf-8-sig") as f:
        raw = f.read()

    # `lines` is returned alongside the sections but is not used below.
    # NOTE(review): kb_id/doc_id look like placeholder values shared by every
    # transcript — confirm they do not need to be unique per file.
    sections, lines = get_sections_from_str(
        document=raw,
        max_characters_per_window=20000,   # how large each LLM window is
        semantic_sectioning_config={
            "use_semantic_sectioning": True,
            "llm_provider": "openai",
            "model": "gpt-4o-mini",
            "language": "en",
        },
        chunking_config={},
        kb_id="my_kb",
        doc_id="my_doc",
    )

    # One gen_queries() call (i.e. one LLM request) per section.
    queries = []
    for section in sections:
        queries.extend(gen_queries(section["title"], section["content"]))
    transcript_to_queries[transcript] = queries

In [39]:
# generate a response for each query
def gen_responses(transcript_index: int, sample: list[str]):
    """Run every query in `sample` through the RAG graph.

    `transcript_index` selects the knowledge base from the module-level `kbs`
    list. Returns a dict mapping each query to the graph's "answer"; a query
    that appears more than once in `sample` keeps only its last answer.
    """
    return {
        query: graph.invoke({
            "question": query,
            "kb": kbs[transcript_index]
        })["answer"]
        for query in sample
    }

In [None]:
def gen_sample(transcript: str, n: int = 10):
    """Draw a random sample of generated queries for one transcript.

    Args:
        transcript: Key into `transcript_to_queries` (the transcript filename).
        n: Desired sample size.

    Returns:
        A list of `n` distinct positions' queries chosen without replacement,
        or every available query (in random order) when fewer than `n` exist.
    """
    pool = transcript_to_queries[transcript]
    # random.sample raises ValueError when asked for more items than the
    # population holds, so clamp the request to what is available.
    return random.sample(pool, min(n, len(pool)))

In [None]:
# Create one knowledge base per transcript and draw that transcript's query sample.
# kbs[i] corresponds to transcripts[i]; gen_responses() relies on this index alignment.
kbs = []
samples = {}
for transcript in transcripts:
    # transcript[:-4] drops the last four characters (".txt" for the files listed above).
    kbs.append(create_kb_from_file(transcript[:-4], transcript))
    samples[transcript] = gen_sample(transcript)

# Answer every sampled query; responses[i] is the {query: answer} dict for transcripts[i].
responses = []
for i in range(len(samples)):
    res = gen_responses(i, samples[transcripts[i]])
    responses.append(res)

In [50]:
# Flatten the per-transcript {query: answer} dicts into two parallel lists.
questions = []
answers = []
for response in responses:
    questions.extend(response)            # iterating a dict yields its keys
    answers.extend(response.values())

In [None]:
# Export the question/answer pairs to a spreadsheet.
data = {
    "Question": questions,
    "LLM Answer": answers
}
df = pd.DataFrame(data)
# index=False: the default RangeIndex carries no information, and writing it
# makes it come back as an "Unnamed: 0" column when the file is read again.
df.to_excel("results.xlsx", index=False)

In [2]:
# Reload the spreadsheet. The rest of the notebook reads a "Human Answer" column
# that the export cell never writes — presumably added to the file by hand; verify.
results = pd.read_excel("results.xlsx")

In [3]:
# Preview the first rows of the loaded results.
results.head()

Unnamed: 0,Question,LLM Answer,Human Answer
0,How are optical fibers laid underwater?,Optical fibers are laid underwater using speci...,Installing optical fibers is the process of ru...
1,What are loops in network routing?,Loops in network routing refer to situations w...,When a packet cycles around the same set of no...
2,How do participants determine their magic numb...,Participants determine their magic number by s...,"Initially, participants’ magic number is infin..."
3,What is the role of routers in managing networ...,Routers manage network bandwidth by acting as ...,Routers enable a more efficient use of network...
4,What is the role of a router?,The role of a router is to act as an intermedi...,A router is an intermediary node between diffe...


In [18]:
# Check column dtypes and non-null counts (shows how many rows have a human answer).
results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Question      70 non-null     object
 1   LLM Answer    70 non-null     object
 2   Human Answer  40 non-null     object
dtypes: object(3)
memory usage: 1.8+ KB


In [20]:
# Keep only rows with no missing values (i.e. rows that have a human answer).
# .copy() makes the filtered frame an independent object, so adding a column
# to it later does not raise pandas' SettingWithCopyWarning.
results = results.dropna().copy()

In [4]:
# Sentence-embedding model used to compare LLM answers with human answers.
# NOTE(review): the first run appears to download the model weights — confirm
# network access is available where this notebook is executed.
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('stsb-roberta-large')

  backends.update(_get_backends("networkx.backends"))
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [22]:
# Encode both answer columns the same way. Requesting tensors for both keeps
# the two sides on the same device; mixing a tensor with a NumPy array only
# works when the model happens to run on CPU.
embeddings_llm = model.encode(list(results["LLM Answer"]), convert_to_tensor=True)
embeddings_human = model.encode(list(results["Human Answer"]), convert_to_tensor=True)

# Row-wise cosine similarity between each LLM answer and its human answer.
similarity = [
    util.pytorch_cos_sim(llm_vec, human_vec).item()
    for llm_vec, human_vec in zip(embeddings_llm, embeddings_human)
]

In [27]:
# Summary statistics of the per-row similarities.
# np.std uses its default settings here (population standard deviation, ddof=0).
import numpy as np
print(f"mean of response similarities: {np.mean(similarity)}")
print(f"standard deviation of response similarities: {np.std(similarity)}")

mean of response similarities: 0.6674860008060932
standard deviation of response similarities: 0.14756425367320591


In [28]:
# Attach the similarity scores as a new column. `similarity` was computed from
# the rows of `results` in order, so positions line up one-to-one.
results["response similarity"] = similarity

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results["response similarity"] = similarity


In [29]:
# Preview the results with the new "response similarity" column.
results.head()

Unnamed: 0,Question,LLM Answer,Human Answer,response similarity
0,How are optical fibers laid underwater?,Optical fibers are laid underwater using speci...,Installing optical fibers is the process of ru...,0.625332
1,What are loops in network routing?,Loops in network routing refer to situations w...,When a packet cycles around the same set of no...,0.531271
2,How do participants determine their magic numb...,Participants determine their magic number by s...,"Initially, participants’ magic number is infin...",0.785601
3,What is the role of routers in managing networ...,Routers manage network bandwidth by acting as ...,Routers enable a more efficient use of network...,0.855478
4,What is the role of a router?,The role of a router is to act as an intermedi...,A router is an intermediary node between diffe...,0.779942


In [30]:
# Save the scored rows. This overwrites the same file the results were loaded
# from, and only the rows remaining in `results` are written.
# index=False: writing the index would add one more "Unnamed: N" column every
# time the file is read back and saved again.
results.to_excel("results.xlsx", index=False)