In [1]:
from dotenv import load_dotenv, find_dotenv
# Locate a .env file with find_dotenv() and load it; the result is bound to `_`
# so the cell does not display a return value.
# NOTE(review): presumably this supplies the API key used by OpenAIEmbeddings later on - confirm.
_ = load_dotenv(find_dotenv()) # read local .env file

In [2]:
from langchain.document_loaders import SRTLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


def load_srt_files_and_split(srt_files=None, chunk_size=300, chunk_overlap=0):
    """Load SRT subtitle files and split their text into chunks.

    Args:
        srt_files: Optional iterable of paths to ``.srt`` files. When omitted,
            the built-in list of transcripts under ``srt_files/`` is used.
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter`` (default 300).
        chunk_overlap: ``chunk_overlap`` forwarded to
            ``RecursiveCharacterTextSplitter`` (default 0).

    Returns:
        The list of documents returned by ``split_documents``. The number of
        chunks is also printed.
    """
    if srt_files is None:
        # Subtitle files to read.
        # NOTE(review): "&#39;" in two names looks like an HTML-escaped
        # apostrophe; it is kept as-is on the assumption that it matches the
        # file names on disk - confirm.
        srt_files = [
            "srt_files/How to Start a Service Business _ The Journey - English (United States).srt",
            "srt_files/5 Small Habits That Will Change Your Life Forever - YouTube - English.srt",
            "srt_files/5 Tips on How to Film a Travel Video with your iPhone - YouTube - English.srt",
            "srt_files/8 Habits to Help You Live Your Best Life - English.srt",
            "srt_files/10 Mistakes You Should Avoid in Your Life - YouTube - English.srt",
            "srt_files/Generative AI for business - YouTube - English.srt",
            "srt_files/How (And What) To Pack For a Weekend Getaway - English.srt",
            "srt_files/How to Discover Business Opportunities No One is Seeing - YouTube - English.srt",
            "srt_files/Learn Your Partner&#39;s Language Fast - Language Learning Tips - YouTube - English.srt",
            "srt_files/ROAD TRIP SNACKS_ BREAKFAST & LUNCH IDEAS _ Easy & Essential _ Road tripping during a pandemic!! - YouTube - English.srt",
            "srt_files/SHOPPING IN KOREA🇰🇷 SEOUL travel vlog 2022 - YouTube - English.srt",
            "srt_files/When learning a new language_ what&#39;s more important_ Grammar or vocabulary_ _ #DailyMIKE 043 - YouTube - English (United States).srt",
            "srt_files/World Stories to Help You Learn _ practice English with Spotlight - English.srt",
            "srt_files/🧠The Amazing Language Learning Machine Inside Your Head💗Ep 579 - YouTube - English (United Kingdom).srt",
        ]

    # Drop repeated paths while keeping first-seen order, so the same
    # transcript is never loaded (and later embedded) more than once.
    unique_srt_files = list(dict.fromkeys(srt_files))

    # Accumulates every document returned by the loaders.
    docs = []
    for srt_file in unique_srt_files:
        docs.extend(SRTLoader(srt_file).load())

    # Split into larger chunks rather than single subtitle lines: one line
    # carries too little meaning to be useful for semantic matching.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    splitted_documents = text_splitter.split_documents(docs)
    print(f'documents:{len(splitted_documents)}')

    return splitted_documents

def load_document_from_srt_file(srt_file):
    """Return the text of one SRT file as a list of lines.

    Loads ``srt_file`` with ``SRTLoader``, takes the first document the
    loader returns, and splits its ``page_content`` on line boundaries.
    """
    first_document = SRTLoader(srt_file).load()[0]
    return first_document.page_content.splitlines()

In [3]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# Point Chroma at the vector database stored under data/chroma_db.
db_vector = Chroma(
    persist_directory="data/chroma_db",
    embedding_function=OpenAIEmbeddings(),
)

# Only build the index when the collection is still empty.
doc_count = db_vector._collection.count()
if doc_count != 0:
    print(f'chroma loaded, doc count: {doc_count}')
else:
    # Nothing stored yet: load the SRT files, split them, and add the chunks.
    splitted_documents = load_srt_files_and_split()
    db_vector.add_documents(splitted_documents)

chroma loaded, doc count: 606


In [4]:
# Run a similarity search against the vector store for this query; as the
# cell's last expression, the returned list of Documents is displayed below.
db_vector.similarity_search('what to do while travel')

[Document(page_content="videos more interesting and you can really focus on a subject that is doing something and build a story out of it. Now, there could be moments where you're traveling alone, so it's a good idea to\nmaybe ask the local people if they would like to", metadata={'source': 'srt_files/5 Tips on How to Film a Travel Video with your iPhone - YouTube - English.srt'}),
 Document(page_content="wear when you travel. And finally, and probably\nmost importantly, what are you going to be doing\nwhile you're on that trip? Knowing your itinerary,\nwhere you're going, where you're eating, the dress code or vibe\nof each of those things is essential to planning each", metadata={'source': 'srt_files/How (And What) To Pack For a Weekend Getaway - English.srt'}),
 Document(page_content="out and about road tripping and hiking and stuff for long periods of\ntime it is so important so of course we are going to have lots\nand lots of water with us especially when we're in the park when", 