In [1]:
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

In [2]:
from langchain.document_loaders import SRTLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


def load_srt_files_and_split():
    # 要讀取的字母清單
    srt_files = [
        "srt_files/How to Start a Service Business _ The Journey - English (United States).srt",
        "srt_files/5 Small Habits That Will Change Your Life Forever - YouTube - English.srt",
        "srt_files/5 Tips on How to Film a Travel Video with your iPhone - YouTube - English.srt",
        "srt_files/8 Habits to Help You Live Your Best Life - English.srt",
        "srt_files/10 Mistakes You Should Avoid in Your Life - YouTube - English.srt",
        "srt_files/Generative AI for business - YouTube - English.srt",
        "srt_files/How (And What) To Pack For a Weekend Getaway - English.srt",
        "srt_files/How to Discover Business Opportunities No One is Seeing - YouTube - English.srt",
        "srt_files/How to Start a Service Business _ The Journey - English (United States).srt",
        "srt_files/Learn Your Partner&#39;s Language Fast - Language Learning Tips - YouTube - English.srt",
        "srt_files/ROAD TRIP SNACKS_ BREAKFAST & LUNCH IDEAS _ Easy & Essential _ Road tripping during a pandemic!! - YouTube - English.srt",
        "srt_files/SHOPPING IN KOREA🇰🇷 SEOUL travel vlog 2022 - YouTube - English.srt",
        "srt_files/When learning a new language_ what&#39;s more important_ Grammar or vocabulary_ _ #DailyMIKE 043 - YouTube - English (United States).srt",
        "srt_files/World Stories to Help You Learn _ practice English with Spotlight - English.srt",
        "srt_files/🧠The Amazing Language Learning Machine Inside Your Head💗Ep 579 - YouTube - English (United Kingdom).srt",
    ]

    # 所以取得出來的 document 的集合
    docs = []

    for srt_file in srt_files:
        loader = SRTLoader(srt_file)
        docs += loader.load()

    # 切分文本，我們不用單純的 line 分行，因為意義太小了，語義比對應也不會有太大的幫助
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 300,
        chunk_overlap = 0
    )

    # 切分文本
    splitted_documents = text_splitter.split_documents(docs)
    print(f'documents:{len(splitted_documents)}')

    return splitted_documents

def load_document_from_srt_file(srt_file):
    loader = SRTLoader(srt_file)

    document_loaded = loader.load()
    content_lines = document_loaded[0].page_content.splitlines()

    return content_lines

In [3]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# usage scenario vector database creation
# 試著從 data/chroma_db 讀取 vector database，失敗的話，就讀取 srt 檔案來重建 vector database
db_vector = None

db_vector = Chroma(persist_directory="data/chroma_db", embedding_function=OpenAIEmbeddings())

doc_count = db_vector._collection.count()
if doc_count == 0:
    splitted_documents = load_srt_files_and_split()
    db_vector.add_documents(splitted_documents)
else:    
    print(f'croma loaded, doc count: {doc_count}')

documents:606


In [4]:
db_vector.similarity_search('travel')

[Document(page_content='about', metadata={'source': 'srt_files/5 Tips on How to Film a Travel Video with your iPhone - YouTube - English.srt'}),
 Document(page_content='meal, just the two of us, walking around maybe doing some shopping, dinner with a couple friends\nat really nice restaurant, an early brunch, and then finally, heading home. So let\'s get packing. (soft music) I\'m Brian Sacawa. This is "He Spoke Style," advice and inspiration', metadata={'source': 'srt_files/How (And What) To Pack For a Weekend Getaway - English.srt'}),
 Document(page_content="specific outfit for your trip. And planning ahead is what\nthis all really boils down to. Knowing when you're\ngoing, where you're going, how long you're going\nfor, how you're traveling, and what you're gonna be doing, makes it so much easier", metadata={'source': 'srt_files/How (And What) To Pack For a Weekend Getaway - English.srt'}),
 Document(page_content='hey guys welcome back to my channel\nso 2020 has been quite different