In [1]:
import os
import dotenv

# Read key/value pairs from a local .env file into the process environment
# (presumably the API keys needed by the LLM providers below — confirm).
dotenv.load_dotenv()

True

In [2]:
import langchain as lc
from langchain.llms import HuggingFaceHub
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import openai

# Select the LLM backend used by the chains below. The alternative backends
# are kept commented out so that one can be swapped in by uncommenting it.
# llm = HuggingFaceHub(
#     repo_id="google/flan-t5-xxl", 
#     model_kwargs={"temperature": 1.0, "max_length":4096},
# )
# llm = Ollama(
#     model="mistral", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])
# )
# llm = OpenAI()
llm = ChatOpenAI(model="gpt-3.5-turbo-0613")

In [3]:
from langchain.globals import set_debug
from langchain.globals import set_verbose

# Single switch for LangChain's global verbose and debug output.
DEBUG = False

# Both global flags simply mirror the switch above.
set_verbose(bool(DEBUG))
set_debug(bool(DEBUG))

## Data Loading

In [4]:
import glob
import os
from pathlib import Path
from typing import Any, List

from langchain.docstore.document import Document
from langchain.document_loaders import SRTLoader
from langchain.document_loaders.base import BaseLoader


class LectureLoader(BaseLoader):
    """Load lecture transcripts (``.srt`` files) from a folder tree as Documents.

    Args:
        lecture_file: Root folder that is searched recursively for transcripts.
        add_lecture_info: When true, the metadata of every loaded document is
            extended with ``lecture_name`` (name of the file's parent folder),
            ``source`` (file name without extension) and ``type``
            (always ``"transcript"``).
    """

    def __init__(self,
                 lecture_file: str,
                 add_lecture_info: bool = False
                ):
        self.add_lecture_info = add_lecture_info
        self.lecture_file = lecture_file

    @classmethod
    def from_folder(cls, folder_name: str, **kwargs: Any) -> 'LectureLoader':
        """Build a loader for ``folder_name``, forwarding ``kwargs`` to ``__init__``."""
        # The keyword arguments must be unpacked: passing the dict positionally
        # would bind the whole dict to ``add_lecture_info``.
        return cls(folder_name, **kwargs)

    def load(self) -> List[Document]:
        """Recursively load every ``.srt`` file found under ``self.lecture_file``.

        Returns:
            The documents produced by ``SRTLoader`` for each transcript, with the
            lecture metadata merged in when ``add_lecture_info`` is set.
        """
        documents = []

        for file_path in Path(self.lecture_file).rglob('*'):
            # Only .srt transcripts are handled for now
            # (".sbv", ".vtt" and ".txt" are not supported yet).
            if not file_path.is_file() or file_path.suffix != ".srt":
                continue

            metadata = {}
            if self.add_lecture_info:
                metadata["lecture_name"] = file_path.parent.name
                metadata["source"] = file_path.stem
                metadata["type"] = "transcript"

            # SRTLoader receives the path and reads the file itself, so no file
            # handle has to be opened (and left dangling) here.
            for doc in SRTLoader(str(file_path)).load():
                doc.metadata.update(metadata)
                documents.append(doc)

        return documents

In [5]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.vectorstores.base import VectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.embeddings import Embeddings

from typing import Type, Iterable, Optional, List


class LectureIndex(FAISS):
    """FAISS vector store that splits documents into chunks before indexing them."""

    @classmethod
    def from_documents(cls, documents: List[Document], embedding: Embeddings, **kwargs):
        """Split ``documents`` into overlapping chunks and build the index from them.

        Args:
            documents: Documents to index (e.g. the loaded lecture transcripts).
            embedding: Embedding model used to vectorise the chunks.
            **kwargs: Extra keyword arguments forwarded to the parent
                ``from_documents`` implementation.

        Returns:
            The vector store built from the chunked documents.
        """
        text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 100)
        docs_split = text_splitter.split_documents(documents)
        # Delegate through super() instead of naming FAISS directly, so the
        # parent constructor runs with `cls` and subclasses are respected.
        return super().from_documents(docs_split, embedding, **kwargs)

In [6]:
# Embedding model used to vectorise the lecture chunks.
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Read the transcripts found under ./data.
lecture_loader = LectureLoader.from_folder("./data", add_lecture_info=True)
lecture_docs = lecture_loader.load()

# Build the vector index queried by the retrieval cells below.
lecture_index = LectureIndex.from_documents(lecture_docs, embeddings)

In [7]:
# Sanity-check the index: run a similarity search and print each returned chunk.
results = lecture_index.similarity_search("Who is Plato?")

for doc in results:
    print("- ", doc.page_content)

-  for positions of public leadership and high political
responsibilities. It is always worthwhile to
remember that Plato was, above all, a teacher.
He was the founder of the first university, the Academy,
the Platonic Academy, where we will find out later
Aristotle came to study, among many others--Aristotle
being but the most famous. Plato was the founder of this
school. This, in turn,
spawned other philosophical schools throughout the Greek
world and later, the Roman world.
-  Athens he called the Academy, for the training of
philosophers, statesmen, and legislators.
Plato lived a long time. He lived until the age of 80.
Except for two expeditions to Sicily, where he went at the
request of Dionysius to help try to establish a philosophical
kingship in Syracuse, he remained in Athens teaching
and writing. The Republic belongs to
that period of Plato's work after his return to Athens,
after the execution of Socrates.
The dominant feature of Plato's political theory,
-  perplexed and d

## Simple Question Answering 

In [8]:
from langchain.schema.runnable import RunnablePassthrough
from langchain import hub

# Retrieval Augmented Generation (RAG)
# The number of chunks to retrieve has to be passed through `search_kwargs`;
# a bare `k=10` keyword is not a search setting and leaves the default in place.
retriever = lecture_index.as_retriever(search_kwargs={"k": 10})
rag_prompt = hub.pull("rlm/rag-prompt")

# The retrieved chunks fill {context}; the raw query string fills {question}.
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | rag_prompt | llm

query = "Did Plato go to Italy?"
answer = rag_chain.invoke(query)
print(answer)

content='Yes, Plato went to Sicily twice but there is no mention of him going to Italy.'


## Question Answering using Embedding Index and SQL Database

In [9]:
import pysrt
import sqlite3

class SubtitleIndex:
    """
    Can load timestamps from an .srt file and retrieve subtitles between a given time range.

    Subtitles are stored in the ``subtitles`` table of a SQLite database with
    their start/end times as ``HH:MM:SS,mmm`` strings. Range queries compare
    these strings, so query bounds should use the same format.
    """

    def __init__(self, db_path: str = 'subtitles.db'):
        """Open (or create) the database at ``db_path`` and make sure the table exists.

        Args:
            db_path: SQLite database location; defaults to ``subtitles.db`` in
                the working directory. ``":memory:"`` gives a throwaway database.
        """
        self.connection = sqlite3.connect(db_path)
        self.cursor = self.connection.cursor()

        # Create the table; the UNIQUE constraint is what rejects re-inserted rows.
        self.cursor.execute('''
        CREATE TABLE IF NOT EXISTS subtitles (
            video_id TEXT,
            start_time TEXT,
            end_time TEXT,
            subtitle_text TEXT,
            UNIQUE(video_id, start_time, end_time, subtitle_text)
            )
        ''')

    def get_subtitles(self, time_start: str, time_end: str, video_id='%') -> List[tuple]:
        """Return the rows whose start time lies in ``[time_start, time_end]``.

        Args:
            time_start: Lower bound, formatted as ``HH:MM:SS,mmm``.
            time_end: Upper bound, formatted as ``HH:MM:SS,mmm``.
            video_id: SQL ``LIKE`` pattern for the video id; the default ``'%'``
                matches every video.

        Returns:
            A list of ``(video_id, start_time, end_time, subtitle_text)`` tuples.
        """
        self.cursor.execute('''
                SELECT * FROM subtitles
                WHERE start_time BETWEEN ? AND ?
                AND video_id LIKE ?
            ''', (time_start, time_end, video_id))

        return self.cursor.fetchall()

    def add_subtitle_file(self, subtitle_filename, video_id):
        """Insert every subtitle of ``subtitle_filename`` under ``video_id``.

        Rows that already exist are skipped and reported on stdout instead of
        raising, so the same file can be added more than once.

        Args:
            subtitle_filename: Path of the ``.srt`` file to read.
            video_id: Identifier stored with each subtitle row.
        """
        def to_timestamp(srt_time):
            # Format the time to a string as HH:MM:SS,SSS
            return '{:02}:{:02}:{:02},{:03}'.format(srt_time.hours,
                                                    srt_time.minutes,
                                                    srt_time.seconds,
                                                    srt_time.milliseconds)

        subs = pysrt.open(subtitle_filename)

        for sub in subs:
            start_time = to_timestamp(sub.start)
            end_time = to_timestamp(sub.end)
            subtitle_text = sub.text.replace('\n', ' ')  # Remove newline characters

            try:
                # Store the row under the caller-supplied video_id, not under
                # whatever global happens to exist in the notebook.
                self.cursor.execute('''
                    INSERT INTO subtitles (video_id, start_time, end_time, subtitle_text)
                    VALUES (?, ?, ?, ?)
                    ''', (video_id, start_time, end_time, subtitle_text))

            except sqlite3.IntegrityError:
                print("Duplicate subtitle: ", sub)

        self.connection.commit()

    def __del__(self):
        # `connection` is missing when __init__ failed before assigning it.
        connection = getattr(self, "connection", None)
        if connection is not None:
            connection.close()

In [10]:
# Setup the Database

# Transcript to index and the id under which its rows are stored.
sub_file = "./data/lecture1/4. Philosophers and Kings： Plato's Republic, I-II [nVQKbQVc2_w].en.srt"
lecture_id = 'lecture1'

subtitle_index = SubtitleIndex()
# Rows already present in the database are reported as "Duplicate subtitle"
# (see the IntegrityError handler in add_subtitle_file), e.g. when this cell is re-run.
subtitle_index.add_subtitle_file(sub_file, lecture_id)

Duplicate subtitle:  1
00:00:02,060 --> 00:00:07,014
Professor Steven Smith:
There is one person in here,

Duplicate subtitle:  2
00:00:07,014 --> 00:00:11,627
I don't know who it is,
and you will not know who it is

Duplicate subtitle:  3
00:00:11,627 --> 00:00:16,668
yet, but there is one person in
here for whom the reading of

Duplicate subtitle:  4
00:00:16,668 --> 00:00:21,964
Plato's Republic will be
the most important intellectual

Duplicate subtitle:  5
00:00:21,964 --> 00:00:25,040
experience you have at Yale.

Duplicate subtitle:  6
00:00:28,090 --> 00:00:33,345
It is a book that one of you
will go back to time and time

Duplicate subtitle:  7
00:00:33,345 --> 00:00:37,310
again and it will stick with you
forever.

Duplicate subtitle:  8
00:00:37,310 --> 00:00:41,906
What I would like you to do is
to remember this and four years

Duplicate subtitle:  9
00:00:41,906 --> 00:00:45,750
from now, when most of you are
ready to graduate,

Duplicate subtitle:  10
00:00:45,750 --> 00:

In [11]:
# subtitle_index.get_subtitles('00:00:00.000', '00:01:01.000', 'lecture1')

from langchain.utilities import SQLDatabase
from langchain.chains import create_sql_query_chain
from langchain.prompts import ChatPromptTemplate
from langchain.chains.router import MultiRetrievalQAChain

# Expose the subtitles database to LangChain and build a chain that writes
# SQL for a natural-language question.
# NOTE(review): k=-1 presumably lifts the row limit in the generated query — confirm.
subtitle_db = SQLDatabase.from_uri("sqlite:///subtitles.db")
query_chain = create_sql_query_chain(llm, subtitle_db, k=-1)

# System prompt with two slots: SQL results (source1) and retrieved chunks (source2).
system_message = """Use the information from the below sources to answer any questions.

Source 1: Relevant timestamped snippets from the lecture. Use if the questions mentions
a specific part of the lecture.
<source1>
{source1}
</source1>

Source 2: Content from the lecture. Use if relevant to the question.
<source2>
{source2}
</source2>
"""

prompt = ChatPromptTemplate.from_messages(
    [("system", system_message), ("human", "{question}")]
)


# Fan the question out to both sources, then answer with the combined prompt.
full_chain = (
    {
        # question -> SQL query -> query result from the subtitles database
        "source1": {"question": lambda x: x["question"]} | query_chain | subtitle_db.run,
        # question -> chunks from the vector-store retriever
        "source2": (lambda x: x["question"]) | retriever,
        "question": lambda x: x["question"],
    }
    | prompt
    | llm
)

response = full_chain.invoke({"question": "Please the first 5 seconds of the lecture"})
print(response)

content='The first 5 seconds of the lecture are not available. The available timestamped snippets start at 00:00:02,060.'


## Agent Experiment

In [12]:
from langchain.agents import tool
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.tools.render import format_tool_to_openai_function
from langchain.agents.format_scratchpad.openai_functions import format_to_openai_functions
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain.agents import OpenAIFunctionsAgent, AgentType, initialize_agent
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.agents import create_sql_agent


# NOTE(review): `@tool` presumably exposes the docstring as the tool description
# shown to the agent, so rewording it changes runtime behavior — confirm.
# This tool is defined here but is not added to the `tools` list in this cell.
@tool
def get_word_length(word: str) -> int:
    """Returns the length of a word."""
    return len(word)

sql_toolkit = SQLDatabaseToolkit(db=subtitle_db, llm=llm)

# Collect the tools before anything is derived from the list: binding the
# functions first and extending `tools` afterwards left `llm_with_tools`
# with an empty function list.
tools = []
tools.extend(sql_toolkit.get_tools())

# prompt = ChatPromptTemplate.from_messages(
#     [
#         (
#             "system",
#             "You are very powerful assistant, but bad at calculating lengths of words.",
#         ),
#         ("user", "{input}"),
#         MessagesPlaceholder(variable_name="agent_scratchpad"),
#     ]
# )

# LLM with the tool schemas attached as OpenAI functions; only used by the
# hand-built agent that is commented out below.
llm_with_tools = llm.bind(functions=[format_tool_to_openai_function(t) for t in tools])

# agent = (
#     {
#         "input": lambda x: x["input"],
#         "agent_scratchpad": lambda x: format_to_openai_functions(
#             x["intermediate_steps"]
#         ),
#     }
#     | prompt
#     | llm_with_tools
#     | OpenAIFunctionsAgentOutputParser()
# )

from langchain.agents import AgentExecutor

# ReAct-style chat agent over the SQL tools; parsing errors are fed back to
# the model instead of aborting the run.
agent_executor = initialize_agent(tools, 
                                  llm, 
                                  agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, 
                                  handle_parsing_errors=True, 
                                  verbose=True)

result = agent_executor.invoke({"input": "Can you summarize the lecture between minutes 5 and 10??"})
print(result)

# agent_executor = create_sql_agent(
#     llm=OpenAI(temperature=0),
#     toolkit=SQLDatabaseToolkit(db=subtitle_db, llm=llm),
#     verbose=True,
#     agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
# )

# agent_executor.invoke({"input": "When does the professor mention Yale?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mCould not parse LLM output: Sure, I can help with that. However, I need to know the name of the table that contains the lecture data. Could you please provide the table name?[0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3mCould not parse LLM output: I need to know the name of the table that contains the lecture data in order to summarize the lecture between minutes 5 and 10.[0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3mCould not parse LLM output: I need to know the name of the table that contains the lecture data in order to summarize the lecture between minutes 5 and 10.[0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3mCould not parse LLM output: I need to ask the user for the name of the table that contains the lecture data in order to summarize the lecture between minutes 5 and 10.[0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3mCould no

In [13]:
# Experimenting with MultiRetrievalQAChain (currently unable to retrieve content from SQL Database)
# This code DOES NOT work
# NOTE(review): `query_chain` is the object returned by create_sql_query_chain,
# i.e. a chain and presumably not a retriever, which may be why routing to the
# SQL source fails — confirm against what MultiRetrievalQAChain expects.
retriever_infos = [
    {
        "name": "Lecture Timestamps",
        "description": "Good for answering questions that refer to specific time points in the lecture.",
        "retriever": query_chain
    },
    {
        "name": "Lecture Content",
        "description": "Good for answering questions about the lecture generally",
        "retriever": retriever
    }
]

# Disabled the below snippet
# (`if False:` keeps the code visible while preventing it from running.)
if False:
    chain = MultiRetrievalQAChain.from_retrievers(llm, retriever_infos, verbose=True)
    print(chain.run("Can you tell me what the professor in lecture1 says right after 5 minutes?"))

## Making Tests

In [14]:
from langchain.schema import HumanMessage
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

# Fields the model must return for every generated multiple-choice question.
response_schemas = [
    ResponseSchema(name="question", description="A multiple choice question from input text snippet."),
    ResponseSchema(name="options", description="Possible choices for the multiple choice question."),
    ResponseSchema(name="answer", description="Correct answer for the question."),
]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

# Text block telling the model how to format its answer for the parser.
format_instructions = output_parser.get_format_instructions()

question = "Where did Plato go?"
results = lecture_index.similarity_search(question)
# Separate the retrieved snippets with a blank line so that the words at the
# chunk boundaries do not run together in the prompt.
text = "\n\n".join([result.page_content for result in results])

# The instruction sentence previously ended with a stray double quote that was
# sent to the model as part of the prompt; it has been removed.
prompt = ChatPromptTemplate(
    messages = [
        HumanMessagePromptTemplate.from_template("""
        Given a text input, generate multiple choice questions along with the correct answer.
        
        {format_instructions}
        
        {user_prompt}""")
    ],
    input_variables=["user_prompt"],
    partial_variables={"format_instructions": format_instructions}
)

user_query = prompt.format_prompt(user_prompt = text)
# output = llm(user_query.to_messages())
# print(output.content)

## Student Tracking

We want to keep track of how a student interacts with the teacher.
This will allow us to do things like:

- Fit the responses to the student's knowledge gaps
- Generate tests depending on the student's level
- Recommend learning resources based on their level/interests

In [15]:
class Student:
    """
    Record of what is known about a student.

    Intended as a compact alternative to replaying the full conversation
    (Langchain Memory): the model can be reminded of the student's level,
    interests and mood at each prompt.
    """
    def __init__(self):
        # Every list starts empty; the mood starts out as "frustrated".
        self.level_history, self.interests = [], []
        self.confused_topics, self.understood_topics = [], []
        self.mood = "frustrated"

    def set_mood(self, mood: str):
        """Replace the stored mood with ``mood``."""
        setattr(self, "mood", mood)
from langchain.schema.runnable import RunnableBranch
from langchain.prompts import PromptTemplate

# Prompt that rewrites a factual answer in a more supportive tone.
frustrated_template = """
You are a sympathetic tutor. You have come up with an answer to one of the students questions.
The answer only includes facts. 
Please transform the answer in order to be more supportive, affirmative of the student.
Reassure them that you will continue to help them understand the material.

Your Answer: {answer}
"""
frustrated_prompt = PromptTemplate(
    input_variables=["answer"],
    template=frustrated_template
)

frustrated_chain = (
    frustrated_prompt
)

# NOTE(review): the neutral and general branches look like placeholders
# (an "ERROR" prompt and an empty prompt) — confirm before relying on them.
neutral_chain = (
    PromptTemplate.from_template("""
    Just respond with ERROR
    """)
)

general_chain = (
    PromptTemplate.from_template("""""")
)

# Pick the prompt by mood: "frustrated" first, then "neutral", otherwise the general one.
branch = RunnableBranch(
    (lambda x: "frustrated" in x["mood"].lower(), frustrated_chain),
    (lambda x: "neutral" in x["mood"].lower(), neutral_chain),
    general_chain,
)
student = Student()
# Note: this rebinds `full_chain`, which an earlier cell used for the two-source QA chain.
full_chain = { "mood": lambda x: x["mood"], "answer": lambda x: x["answer"] } | branch | llm

# Prompt asking the model to label a question as FRUSTRATED, CONFIDENT or NEUTRAL.
# NOTE(review): `output_variables` does not look like a PromptTemplate setting —
# confirm it has any effect.
mood_prompt = PromptTemplate(
    input_variables=["question"],
    output_variables=["mood"],
    template="""
You are a tutor. You will be given a student's question about a subject. 
Based on the question, you will understand whether the student is FRUSTRATED or CONFIDENT.
You will base your assesment on how basic the question is and the wording the student gives.
Please bear in mind that sometimes a student's state might be subtle, so pay attention to phrasing.
You will respond only with a single word: FRUSTRATED or CONFIDENT.
When there really doesn't seem to be any emotion in the given question, ouput NEUTRAL.

Student's Question: {question}
    """
)

# question -> mood label produced by the LLM
mood_chain = (
    mood_prompt | llm
)

# Example call, left disabled:
# mood = mood_chain.invoke({"question": "I undertand this! Thanks"})
# print(mood)


# It might just be better to just keep track of the question-answer pairs in the past 
# and just have that be the topics the student had trouble with.
# Prompt asking the model to name the topic a student struggled with, given a
# question/answer pair.
understanding_prompt = PromptTemplate(
    input_variables=["question", "answer"],
    template="""
You are a tutor. You will be given a question from a student and the corresponding answer.
You will reply with the topic the student had trouble with.
Bear in mind that a student might have just been asking a clarifying question and that doens't 
mean that they had trouble with the subject.
Respond ONLY with the subject the student had trouble with.

Student Question: {question}

Teacher Answer: {answer}
"""
)

# (question, answer) -> topic label produced by the LLM
understanding_chain = (
    understanding_prompt | llm
)

# Smoke test with a deliberately rude question and a wrong answer.
topic = understanding_chain.invoke({"question": "What the fuck was Plato the sus about?", "answer": "Plato was a video game character in 1981."})
print(topic)
# Disabled example for the mood-conditioned chain:
# full_chain.invoke({ "mood": student.mood, "answer": """The lecture between minutes 5 and 10 covers various topics related to Plato\'s book 
# "The Republic," including its literary approach, utopian elements, the concept of a harmonious city, 
# the role of philosophers in ruling, and the use of censorship and propaganda."""})

content='Philosophy'


## Memory

In [26]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import LLMChain

# Buffer memory whose message history is exposed under the "questions" key.
questions_memory = ConversationBufferMemory(memory_key="questions", return_messages=True)

# `query` is the question string defined in the "Simple Question Answering"
# cell, so that cell has to be run before this one.
questions_memory.chat_memory.add_user_message(query)
questions_memory.chat_memory.add_ai_message("What a dumb question")

# Show the memory object itself, then the variables it would inject into a prompt.
print(questions_memory)
print(questions_memory.load_memory_variables({}))

chat_memory=ChatMessageHistory(messages=[HumanMessage(content='Did Plato go to Italy?'), AIMessage(content='What a dumb question')]) return_messages=True memory_key='questions'
{'questions': [HumanMessage(content='Did Plato go to Italy?'), AIMessage(content='What a dumb question')]}
