In [337]:
import os
import dotenv

# Load settings from a local .env file via python-dotenv before any client is created.
# NOTE(review): presumably this supplies the API tokens for the LLM providers used below — confirm.
dotenv.load_dotenv()

True

In [338]:
import langchain as lc
from langchain.llms import HuggingFaceHub
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

# Language model shared by every chain in this notebook: FLAN-T5 XXL accessed
# through the HuggingFaceHub wrapper. `model_kwargs` is forwarded as-is to the model.
# NOTE(review): presumably needs a Hugging Face API token in the environment — confirm.
llm = HuggingFaceHub(
    repo_id="google/flan-t5-xxl", 
    model_kwargs={"temperature": 1.0, "max_length":4096},
)

# Alternative OpenAI-backed models; uncomment one to replace the Hub model above.
# llm = OpenAI()
# llm = ChatOpenAI(model="gpt-3.5-turbo-0301")



In [339]:
from langchain.globals import set_debug
from langchain.globals import set_verbose

# Single switch for LangChain's global verbose and debug output.
DEBUG = False

# Both global flags simply follow DEBUG.
set_verbose(bool(DEBUG))
set_debug(bool(DEBUG))

In [340]:
import glob
import os
from pathlib import Path
from typing import Any, List

from langchain.docstore.document import Document
from langchain.document_loaders import SRTLoader
from langchain.document_loaders.base import BaseLoader


class LectureLoader(BaseLoader):
    """Load every ``.srt`` transcript found (recursively) under a folder.

    Each transcript is read with ``SRTLoader``. When ``add_lecture_info`` is
    true, the resulting documents are tagged with:

    * ``lecture_name`` - name of the directory that contains the file
    * ``source``       - file name without its extension
    * ``type``         - the constant ``"transcript"``
    """

    def __init__(self, 
                 lecture_file: str,
                 add_lecture_info: bool = False
                ):
        """
        Args:
            lecture_file: Root folder that is searched recursively for transcripts.
            add_lecture_info: Whether to attach the lecture metadata described
                on the class to every loaded document.
        """
        self.add_lecture_info = add_lecture_info
        self.lecture_file = lecture_file

    @classmethod
    def from_folder(cls, folder_name: str, **kwargs: Any) -> 'LectureLoader':
        """Build a loader for ``folder_name``.

        Args:
            folder_name: Root folder that is searched recursively for transcripts.
            **kwargs: Forwarded to ``__init__`` (e.g. ``add_lecture_info=True``).

        Returns:
            A new ``LectureLoader``.
        """
        # The keyword arguments must be unpacked. Passing the dict itself
        # positionally would bind it to `add_lecture_info`, where any non-empty
        # dict is truthy - so `add_lecture_info=False` would enable the option.
        return cls(folder_name, **kwargs)

    def load(self) -> List[Document]:
        """Load all ``.srt`` files below ``self.lecture_file``.

        Returns:
            The documents produced by ``SRTLoader`` for every transcript, in
            the order the files are yielded by ``Path.rglob``.
        """
        documents = []

        for file_path in Path(self.lecture_file).rglob('*'):
            # Only .srt transcripts are handled. Other subtitle/text formats
            # (.sbv, .vtt, .txt) were considered but are not enabled.
            if not file_path.is_file() or file_path.suffix != ".srt":
                continue

            metadata = {}
            if self.add_lecture_info:
                metadata["lecture_name"] = file_path.parent.name
                metadata["source"] = file_path.stem
                metadata["type"] = "transcript"

            # SRTLoader opens the file itself, so no file handle is opened here.
            srt_loader = SRTLoader(str(file_path))
            for doc in srt_loader.load():
                doc.metadata.update(metadata)
                documents.append(doc)

        return documents

In [341]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.vectorstores.base import VectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.embeddings import Embeddings

from typing import Type, Iterable, Optional, List


class LectureIndex(FAISS):
    """FAISS vector store that splits documents into overlapping chunks before indexing."""

    @classmethod
    def from_documents(cls, documents: List[Document], embedding: Embeddings,
                       *, chunk_size: int = 500, chunk_overlap: int = 100, **kwargs):
        """Split ``documents`` into chunks and index them.

        Args:
            documents: Documents to index.
            embedding: Embedding model used to vectorize the chunks.
            chunk_size: Maximum chunk length handed to
                ``RecursiveCharacterTextSplitter`` (default 500).
            chunk_overlap: Overlap between consecutive chunks (default 100).
            **kwargs: Forwarded unchanged to the parent ``from_documents``.

        Returns:
            An index built from the chunked documents.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        docs_split = text_splitter.split_documents(documents)
        # Delegate through super() so that construction is bound to `cls`
        # instead of hard-coding the FAISS base class.
        return super().from_documents(docs_split, embedding, **kwargs)

In [342]:
# Sentence-transformers model used to embed the transcript chunks.
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Collect the transcripts found under ./data, requesting lecture metadata on each document.
lecture_loader = LectureLoader.from_folder("./data", add_lecture_info=True)
lecture_docs = lecture_loader.load()

# Chunk the documents and build the vector index that the retriever cells query.
lecture_index = LectureIndex.from_documents(lecture_docs, embeddings)

In [343]:
# Smoke test of the vector index: plain similarity search with the default settings.
results = lecture_index.similarity_search("Can you tell me what the professor in lecture1 says right after 5 minutes?")

# The loop is currently a no-op; uncomment the print to inspect the retrieved chunks.
for doc in results:
    pass
    # print("- ", doc.page_content)

In [344]:
from langchain.schema.runnable import RunnablePassthrough
from langchain import hub

# Retrieval Augmented Generation (RAG)
# The number of retrieved documents has to be passed through `search_kwargs`.
# A bare `k=1` keyword is not a retriever setting, so the retriever would keep
# its default number of results.
retriever = lecture_index.as_retriever(search_kwargs={"k": 1})
rag_prompt = hub.pull("rlm/rag-prompt")

# The retriever fills "context"; the raw query string is passed through as "question".
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | rag_prompt | llm

query = "Where did Socrates go?"
answer = rag_chain.invoke(query)
print(answer)

Athens


In [345]:
import pysrt

class SubtitleIndex:
    """
    Can load timestamps from an .srt file and retrieve subtitles between a given time range.

    Cues are stored in a SQLite table ``subtitles`` with the columns
    ``video_id``, ``start_time``, ``end_time`` and ``subtitle_text``.
    Timestamps are stored as zero-padded ``HH:MM:SS,mmm`` strings, so range
    queries compare them as text; pass bounds in that same format.
    """

    def __init__(self, db_path: str = 'subtitles.db'):
        """Open (or create) the database and make sure the table exists.

        Args:
            db_path: SQLite database file. Defaults to ``subtitles.db`` in the
                working directory; ``':memory:'`` gives a throwaway database.
        """
        # Imported locally so the class does not depend on another cell
        # having imported sqlite3 first.
        import sqlite3

        self.connection = sqlite3.connect(db_path)
        self.cursor = self.connection.cursor()

        # Create the table
        self.cursor.execute('''
        CREATE TABLE IF NOT EXISTS subtitles (
            video_id TEXT,
            start_time TEXT,
            end_time TEXT,
            subtitle_text TEXT
            )
        ''')
        self.connection.commit()

    def get_subtitles(self, time_start: str, time_end: str, video_id='%') -> List[tuple]:
        """Return the cues whose start time lies in ``[time_start, time_end]``.

        Args:
            time_start: Lower bound, formatted ``HH:MM:SS,mmm``.
            time_end: Upper bound, formatted ``HH:MM:SS,mmm``.
            video_id: SQL ``LIKE`` pattern for the video id; the default ``'%'``
                matches every video.

        Returns:
            A list of ``(video_id, start_time, end_time, subtitle_text)`` rows.
        """
        self.cursor.execute('''
                SELECT * FROM subtitles
                WHERE start_time BETWEEN ? AND ?
                AND video_id LIKE ?
            ''', (time_start, time_end, video_id))

        return self.cursor.fetchall()

    def add_subtitle_file(self, subtitle_filename, video_id):
        """Parse an .srt file and store all of its cues under ``video_id``.

        Rows are appended; loading the same file twice stores its cues twice.

        Args:
            subtitle_filename: Path of the .srt file to read with ``pysrt``.
            video_id: Identifier stored with every cue of this file.
        """
        def to_timestamp(srt_time):
            # Format the time to a string as HH:MM:SS,SSS
            return '{:02}:{:02}:{:02},{:03}'.format(srt_time.hours,
                                                    srt_time.minutes,
                                                    srt_time.seconds,
                                                    srt_time.milliseconds)

        subs = pysrt.open(subtitle_filename)

        # Use the `video_id` argument (not a notebook-level global) for every row.
        rows = [
            (
                video_id,
                to_timestamp(sub.start),
                to_timestamp(sub.end),
                sub.text.replace('\n', ' '),  # Remove newline characters
            )
            for sub in subs
        ]

        # Insert everything in one batch and commit once instead of once per cue.
        self.cursor.executemany('''
            INSERT INTO subtitles (video_id, start_time, end_time, subtitle_text)
            VALUES (?, ?, ?, ?)
            ''', rows)
        self.connection.commit()

    def __del__(self):
        # `connection` is missing when __init__ failed before it was assigned.
        connection = getattr(self, 'connection', None)
        if connection is not None:
            connection.close()

In [346]:
# Transcript to ingest and the id its cues are stored under.
sub_file = "./data/lecture1/4. Philosophers and Kings： Plato's Republic, I-II [nVQKbQVc2_w].en.srt"
lecture_id = 'lecture1'

# NOTE(review): the index writes to the on-disk subtitles.db and only appends rows,
# so re-running this cell stores the same cues again. Remove the database file
# (or clear the table) before re-running for a reproducible result.
subtitle_index = SubtitleIndex()
subtitle_index.add_subtitle_file(sub_file, lecture_id)

In [347]:
# subtitle_index.get_subtitles('00:00:00.000', '00:01:01.000', 'lecture1')

from langchain.utilities import SQLDatabase
from langchain.chains import create_sql_query_chain
from langchain.prompts import ChatPromptTemplate
from langchain.chains.router import MultiRetrievalQAChain

# Expose the subtitles SQLite file to LangChain and build a chain in which `llm`
# writes a SQL query for a natural-language question.
# NOTE(review): k=20 is presumably the row limit suggested to the model — confirm
# against the create_sql_query_chain documentation.
subtitle_db = SQLDatabase.from_uri("sqlite:///subtitles.db")
query_chain = create_sql_query_chain(llm, subtitle_db, k=20)

# System prompt with two context slots: {source2} receives the retriever output and
# {source1} receives the result of running the generated SQL query.
system_message = """Use the information from the below sources to answer any questions.

Source 2: Content from the lecture. Use if relevant to the question.
<source2>
{source2}
</source2>

Source 1: Relevant timestamped snippets from the lecture. Use if the questions mentions
a specific part of the lecture.
<source1>
{source1}
</source1>
"""

prompt = ChatPromptTemplate.from_messages(
    [("system", system_message), ("human", "{question}")]
)


# Fan the question out to both sources, then feed everything into the prompt:
#   source1: question -> query_chain (SQL text) -> subtitle_db.run (executes the SQL)
#   source2: question -> vector-store retriever
#   question: passed through unchanged
# NOTE(review): the generated SQL is executed without any validation. In the recorded
# run the model produced `SELECT subtitle_text FROM subtitles WHERE start_time >= 30 AND end`,
# which failed with sqlite3.OperationalError ("no such column: end"). The time columns
# are TEXT, so comparing start_time to a bare number would not select a time range
# even if the query were complete.
full_chain = (
    {
        "source1": {"question": lambda x: x["question"]} | query_chain | subtitle_db.run,
        "source2": (lambda x: x["question"]) | retriever,
        "question": lambda x: x["question"],
    }
    | prompt
    | llm
)

response = full_chain.invoke({"question": "Can you summarize the content between minute 30 and 35?"})
print(response)

# Experimenting with MultiRetrievalQAChain (currently unable to retrieve content from SQL Database)

# NOTE(review): the first entry's "retriever" is `query_chain` (the SQL-writing chain),
# not a retriever object like the second entry. This list is only referenced by the
# commented-out MultiRetrievalQAChain code below.
retriever_infos = [
    {
        "name": "Lecture Timestamps",
        "description": "Good for answering questions that refer to specific time points in the lecture.",
        "retriever": query_chain
    },
    {
        "name": "Lecture Content",
        "description": "Good for answering questions about the lecture generally",
        "retriever": retriever
    }
]

# chain = MultiRetrievalQAChain.from_retrievers(llm, retriever_infos, verbose=True)
# print(chain.run("Can you tell me what the professor in lecture1 says right after 5 minutes?"))

OperationalError: (sqlite3.OperationalError) no such column: end
[SQL: SELECT subtitle_text FROM subtitles WHERE start_time >= 30 AND end]
(Background on this error at: https://sqlalche.me/e/14/e3q8)