In [1]:
import dotenv

# Load variables from a local .env file into the process environment
# (presumably the OpenAI API key needed by later cells — TODO confirm).
dotenv.load_dotenv()

True

In [2]:
# Metadata extractors and the node parser used by the ingestion pipeline.
#
# EntityExtractor is imported but intentionally not instantiated because it
# seems to have some issues, see:
# https://github.com/run-llama/llama_index/issues/13774
from llama_index.core.extractors import (
    TitleExtractor,
    QuestionsAnsweredExtractor,
    SummaryExtractor,
)
from llama_index.extractors.entity import EntityExtractor
from llama_index.core.node_parser import SentenceSplitter

# One extractor instance per kind of metadata, all with default settings.
title_extractor = TitleExtractor()
# entity_extractor = EntityExtractor(label_entities=True, device="cpu")
questions_answered_extractor = QuestionsAnsweredExtractor()
summary_extractor = SummaryExtractor()

# Split documents into chunks of size 1000 with an overlap of 100.
node_parser = SentenceSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)


  from .autonotebook import tqdm as notebook_tqdm



In [19]:
# LOAD TRANSCRIPT DATA

from llama_index.core import SimpleDirectoryReader
import re

# This function receives the filename and allows you to return a dictionary with any custom metadata.
def filename_fn(filename):
    """Build custom metadata for a transcript file from its name.

    Intended for use as a ``file_metadata`` callback of
    ``SimpleDirectoryReader``, which hands over the file's path.

    The week number is taken from the *base name* of the path only, so digits
    in parent directories (e.g. ``/home/user42/data/...``) are never mistaken
    for the week. Digits directly following the word "week" are preferred;
    otherwise the first run of digits in the base name is used.

    Args:
        filename: Path (or bare name) of the transcript file.

    Returns:
        A dict with a single ``"source"`` key describing the lecture week.
        The week is reported as 0 when no number can be found.
    """
    import os  # local import: the surrounding cell only imports `re`

    base_name = os.path.basename(str(filename))

    # Prefer "week<N>" / "week_<N>" / "week-<N>" / "week <N>", case-insensitively.
    match = re.search(r'week[\s_-]*(\d+)', base_name, flags=re.IGNORECASE)
    if match is None:
        # Fall back to the first number that appears in the file name itself.
        match = re.search(r'(\d+)', base_name)

    week_number = int(match.group(1)) if match else 0

    return {
        "source": f"This was mentioned by Tim Lee during Week {week_number} of the LLM Bootcamp",
    }

# Paths of the four weekly transcripts, in lecture order (weeks 1 through 4).
transcript_paths = [
    f"./data/llm_bootcamp_week{week}_transcript.txt" for week in range(1, 5)
]

# A single reader handles every transcript; the resulting documents follow the
# order of `input_files`. The custom metadata hook is left disabled, as before.
documents = SimpleDirectoryReader(
    input_files=transcript_paths,
    # file_metadata=filename_fn,
).load_data()


In [20]:
import nest_asyncio

from llama_index.core.ingestion import IngestionPipeline

# Patch asyncio so event loops can be nested
# (presumably required because the notebook kernel already runs one — TODO confirm).
nest_asyncio.apply()

# Chunk the documents first, then attach a summary to every resulting node.
pipeline = IngestionPipeline(
    transformations=[
        node_parser,
        summary_extractor,
    ],
)
nodes = pipeline.run(documents=documents)

100%|██████████| 101/101 [00:35<00:00,  2.81it/s]


In [22]:
# Debug output: inspect what the ingested nodes look like.

from llama_index.core.schema import MetadataMode

print(len(nodes))

# Metadata of the first three nodes, the middle node and the last node.
for node_index in (0, 1, 2, len(nodes) // 2, -1):
    print(nodes[node_index].metadata)

# Other things worth inspecting on a node:
#   type(nodes[3])  -> <class 'llama_index.core.schema.TextNode'>
#   nodes[3].text   -> the original snippet of text
llm_view = nodes[3].get_content(metadata_mode=MetadataMode.LLM)
print("LLM will see the following when querying:\n", llm_view)

101
{'file_path': 'data/llm_bootcamp_week1_transcript.txt', 'file_name': 'llm_bootcamp_week1_transcript.txt', 'file_type': 'text/plain', 'file_size': 101098, 'creation_date': '2024-10-06', 'last_modified_date': '2024-10-06', 'section_summary': 'The key topics discussed in the section include the statelessness of llms, the parallelizability of llm serving, performance testing of llms, streaming from llms, the process of llm prediction (inference), and the use of open AI for web server standardization. The entities mentioned include llms, web servers, tokens per second rate, AI solutions, inference, and open AI rest API.'}
{'file_path': 'data/llm_bootcamp_week1_transcript.txt', 'file_name': 'llm_bootcamp_week1_transcript.txt', 'file_type': 'text/plain', 'file_size': 101098, 'creation_date': '2024-10-06', 'last_modified_date': '2024-10-06', 'section_summary': "The section discusses the use of OpenAI for web servers, the standardization around OpenAI's REST API and streaming API, the organ

In [25]:
# Sample questions used to exercise the query engines in the cells below.
queries = [
    "What did Tim talk about during week 2?",
    "What was the thing about Sunday midnight?",
    "When was the thing about Sunday midnight discussed?",
    (
        "Tim Lee mentioned something about HR policy in the LLM Bootcamp. "
        "Can you find the week he mentioned it and explain what he was talking about?"
    ),
    "What were some topics from Week 4?",
    "How many lecture weeks have happened so far?",
]

In [26]:
# Metadata extraction for better document indexing +
# sub-question query engine for all QnA pipelines.
# ref: https://docs.llamaindex.ai/en/stable/examples/metadata_extraction/MetadataExtractionSEC/
from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.question_gen import LLMQuestionGenerator
from llama_index.core.question_gen.prompts import (
    DEFAULT_SUB_QUESTION_PROMPT_TMPL,
)
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.llms.openai import OpenAI

llm = OpenAI(temperature=0.1, model="gpt-4o-mini", max_tokens=512)

# Sub-question generator: the default prompt, preceded by an instruction that
# makes every generated sub-question ask for the lecture week and a timestamp.
question_gen = LLMQuestionGenerator.from_defaults(
    llm=llm,
    prompt_template_str="""
        Follow the example, but instead of giving a question, always prefix the question 
        with: 'By first identifying and quoting from the most relevant lecture week and pointing to a timestamp when something was mentioned, '. 
        """
    + DEFAULT_SUB_QUESTION_PROMPT_TMPL,
)

# The index constructor has no `llm` parameter (an `llm=` keyword is silently
# ignored there), so the LLM is handed to the query engine instead; otherwise
# answers would be produced by the global default `Settings.llm`.
index = VectorStoreIndex(nodes=nodes)
engine = index.as_query_engine(llm=llm, similarity_top_k=10)

# `llm` is also passed here so the final answer synthesis uses the same model
# as the sub-question generation and the per-question retrieval answers.
final_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=[
        QueryEngineTool(
            query_engine=engine,
            metadata=ToolMetadata(
                name="llm_bootcamp_transcripts",
                description="Transcripts from lecture video recordings.",
            ),
        )
    ],
    llm=llm,
    question_gen=question_gen,
    use_async=True,
)

# Run every sample query through the sub-question engine and show the answer.
for query in queries:
    response = final_engine.query(query)
    print(f"Answer to '{query}':")
    print(response)

Generated 3 sub questions.
[1;3;38;2;237;90;200m[llm_bootcamp_transcripts] Q: By first identifying and quoting from the most relevant lecture week and pointing to a timestamp when something was mentioned, what topics did Tim cover during week 2?
[0m[1;3;38;2;90;149;237m[llm_bootcamp_transcripts] Q: By first identifying and quoting from the most relevant lecture week and pointing to a timestamp when something was mentioned, what key points did Tim emphasize in week 2?
[0m[1;3;38;2;11;159;203m[llm_bootcamp_transcripts] Q: By first identifying and quoting from the most relevant lecture week and pointing to a timestamp when something was mentioned, were there any specific examples or case studies discussed by Tim in week 2?
[0m[1;3;38;2;11;159;203m[llm_bootcamp_transcripts] A: In week 2, Tim Lee discussed the concept of summarization and provided a specific example related to document compression. He mentioned, "imagine a world where you take that document. You generate that cliff n

In [27]:
# Basic use of the query engine, without any sub-questions
# (see the sub-question query engine cell for that variant).

from llama_index.core import VectorStoreIndex
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

llm = OpenAI(model="gpt-4o-mini", temperature=0.2)

# The index constructor has no `llm` parameter (an `llm=` keyword is silently
# ignored there); the LLM must be given to the query engine, otherwise the
# global default `Settings.llm` answers the queries instead of the model above.
index = VectorStoreIndex(nodes=nodes)

query_engine = index.as_query_engine(llm=llm)

# Run every sample query through the plain query engine and show the answer.
for query in queries:
    response = query_engine.query(query)
    print(f"Answer to '{query}':")
    print(response)


Answer to 'What did Tim talk about during week 2?':
During week 2, Tim Lee led a breakout room session where participants were tasked with answering questions and explaining concepts to each other. He emphasized the importance of understanding by encouraging participants to articulate their thoughts and come up with additional questions. Tim also demonstrated a project involving a data folder and a file with tracing callback handlers, which was related to a five-year strategic plan document that was 55 pages long. Participants were encouraged to follow along with the demonstration and engage in discussions for better comprehension.
Answer to 'What was the thing about Sunday midnight?':
The mention of Sunday at midnight refers to the deadline for turning in one of the weekly assignments. If the assignment is not submitted by that time, it could result in being dropped from the course.
Answer to 'When was the thing about Sunday midnight discussed?':
The provided excerpts do not mention a