In [55]:
import os
import dotenv

# Load settings from a .env file into the environment via python-dotenv.
# NOTE(review): presumably this is where the API tokens for the LLM clients
# created in the next cell come from — confirm.
dotenv.load_dotenv()

'123'

In [52]:
import langchain as lc
from langchain.llms import HuggingFaceHub
from langchain.llms import OpenAI

# NOTE(review): this OpenAI instance is discarded right away — `llm` is rebound
# by the next statement, so the binding that remains is the HuggingFaceHub model.
llm = OpenAI()

# flan-t5-xxl accessed through HuggingFaceHub; the generation settings
# (temperature, max_length) are passed via model_kwargs.
llm = HuggingFaceHub(
    repo_id="google/flan-t5-xxl", 
    model_kwargs={"temperature": 0.6, "max_length":4096}
)



In [46]:
import glob
import os
from pathlib import Path
from typing import Any, List

from langchain.docstore.document import Document
from langchain.document_loaders import SRTLoader
from langchain.document_loaders.base import BaseLoader


class LectureLoader(BaseLoader):
    """Load lecture transcripts (``.srt`` files) from a folder tree as Documents.

    Every ``.srt`` file found recursively below ``lecture_file`` is parsed with
    ``SRTLoader``. When ``add_lecture_info`` is true, the metadata of each
    resulting document is additionally tagged with:

    * ``lecture_name`` - name of the directory that contains the file,
    * ``source`` - file name without its final suffix,
    * ``type`` - the constant ``"transcript"``.
    """

    def __init__(self,
                 lecture_file: str,
                 add_lecture_info: bool = False
                ):
        """Store the loader configuration.

        Args:
            lecture_file: Root folder that is searched recursively for transcripts.
            add_lecture_info: Whether to attach the lecture metadata described
                in the class docstring to every loaded document.
        """
        self.add_lecture_info = add_lecture_info
        self.lecture_file = lecture_file

    @classmethod
    def from_folder(cls, folder_name: str, **kwargs: Any) -> 'LectureLoader':
        """Create a loader for ``folder_name``.

        Args:
            folder_name: Root folder that contains the lecture transcripts.
            **kwargs: Keyword options forwarded to ``__init__``
                (e.g. ``add_lecture_info=True``).

        Returns:
            A new ``LectureLoader`` instance.
        """
        # The options must be unpacked: passing the dict positionally would bind
        # it to ``add_lecture_info``, making the flag truthy whenever any keyword
        # was supplied (even ``add_lecture_info=False``).
        return cls(folder_name, **kwargs)

    def load(self) -> List[Document]:
        """Parse every ``.srt`` file below the lecture folder.

        Returns:
            The documents produced by ``SRTLoader`` for each transcript, with
            the lecture metadata merged in when ``add_lecture_info`` is set.
            Directories and files with any other suffix are skipped.
        """
        documents = []

        for file_path in Path(self.lecture_file).rglob('*'):
            # Only regular files with the .srt suffix are transcripts
            # (other subtitle/text formats are not handled).
            if not file_path.is_file() or file_path.suffix != ".srt":
                continue

            metadata = {}
            if self.add_lecture_info:
                metadata["lecture_name"] = file_path.parent.name
                metadata["source"] = file_path.stem
                metadata["type"] = "transcript"

            # SRTLoader opens the file itself, so no file handle is opened here;
            # it is given a plain string path, as its signature expects.
            for doc in SRTLoader(str(file_path)).load():
                doc.metadata.update(metadata)
                documents.append(doc)

        return documents

In [4]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.vectorstores.base import VectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.embeddings import Embeddings

from typing import Type, Iterable, Optional, List


class LectureIndex(FAISS):
    """FAISS vector store that splits documents into chunks before indexing."""

    @classmethod
    def from_documents(cls, documents: List[Document], embedding: Embeddings,
                       chunk_size: int = 700, chunk_overlap: int = 0,
                       **kwargs) -> "LectureIndex":
        """Split ``documents`` into chunks and build the index from the chunks.

        Args:
            documents: Documents to index.
            embedding: Embedding model used to vectorise the chunks.
            chunk_size: Chunk size passed to ``RecursiveCharacterTextSplitter``.
            chunk_overlap: Chunk overlap passed to ``RecursiveCharacterTextSplitter``.
            **kwargs: Extra options forwarded to the parent ``from_documents``.

        Returns:
            The vector store built by the parent class from the split documents.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        docs_split = text_splitter.split_documents(documents)
        # Delegate through super() so this stays a real classmethod override:
        # it can be called on the class or on an instance, and the parent
        # builds the store from ``cls`` rather than from a hard-coded FAISS.
        return super().from_documents(docs_split, embedding, **kwargs)

In [47]:
# Embedding model used to vectorise the transcript chunks.
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Load every .srt transcript found under ./data, tagging each document with
# lecture metadata (lecture_name / source / type).
lecture_loader = LectureLoader.from_folder("./data", add_lecture_info=True)
lecture_docs = lecture_loader.load()
# NOTE(review): this prints every loaded document in full.
print(lecture_docs)

# Split the documents into chunks and build the FAISS index from them.
lecture_index = LectureIndex.from_documents(lecture_docs, embeddings)



In [50]:
# Query the index for the chunks most similar to the text "Student?".
results = lecture_index.similarity_search("Student?")

# Show only the metadata of each hit (which transcript/lecture it came from).
for doc in results:
    print("- ", doc.metadata)

-  {'source': "4. Philosophers and Kings： Plato's Republic, I-II [nVQKbQVc2_w].en", 'lecture_name': 'lecture1', 'type': 'transcript'}
-  {'source': "4. Philosophers and Kings： Plato's Republic, I-II [nVQKbQVc2_w].en", 'lecture_name': 'lecture1', 'type': 'transcript'}
-  {'source': "4. Philosophers and Kings： Plato's Republic, I-II [nVQKbQVc2_w].en", 'lecture_name': 'lecture1', 'type': 'transcript'}
-  {'source': "4. Philosophers and Kings： Plato's Republic, I-II [nVQKbQVc2_w].en", 'lecture_name': 'lecture1', 'type': 'transcript'}
