In [20]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document

In [21]:
loader_docx = Docx2txtLoader("Introduction_to_Data_and_Data_Science_2.docx")
pages = loader_docx.load()

md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "course title"),
                            ("##", "lecture list")]
)

pages_md_split = md_splitter.split_text(pages[0].page_content)

for i in range(len(pages_md_split)):
    pages_md_split[i].page_content = ' '.join(pages_md_split[i].page_content.split())

char_splitter = CharacterTextSplitter(
    separator = ".",
    chunk_size = 500,
    chunk_overlap  = 50
)

pages_char_split = char_splitter.split_documents(pages_md_split)

embedding = OpenAIEmbeddings(model = "text-embedding-ada-002")

In [22]:
len(pages_char_split)

20

In [23]:
vectorstore = Chroma.from_documents(
    documents=pages_char_split,
    embedding=embedding,
    persist_directory="./intro-to-ds-lectures",
)

In [24]:
vectorstore_from_directory = Chroma(
    persist_directory="./intro-to-ds-lectures",
    embedding_function=embedding
)

In [25]:
vectorstore_from_directory.get()

{'ids': ['55f99d4e-a229-4823-8192-f03afd95d954',
  '75a3925a-0128-4fe3-bf44-c99bd7936787',
  '3193ec4d-21ab-4bc7-b3d4-c08ce115e009',
  'a407fe6b-d783-4ad2-a960-bc0626ad7ba1',
  '0382c8b1-87e3-45d5-9503-f0927106c81f',
  'ded4a843-bc65-4c36-b7ed-e0aafe04f1af',
  '06cef932-8ad9-423a-9949-fa765053f283',
  '7ff26ffe-cd35-4434-9e04-c0bbf69ac45f',
  'e9ec352e-95d6-49cb-a40a-aa0330e87247',
  '17dcd632-c3bf-45cd-a2c0-4e5b6dd24021',
  'f577846b-2362-4517-9b94-89b53901d75d',
  '043d7e53-980f-4670-a953-09e048fe6108',
  '6c117b77-6496-42a7-ad32-f629e42064e1',
  'bb5d502a-941a-46cd-88c7-e86038845191',
  '2a16f23a-b9d2-49d9-b64a-f3c7272aee15',
  '9b2b861b-4156-4cb8-a639-63819ae435c1',
  'd3b4a07c-4416-4c39-aeb3-82758031a891',
  'fff59e94-0304-436d-8038-fc6a5434e035',
  '658cbe8a-0543-4e0a-ac67-2f2fea02bb0b',
  '2b5ca7a1-3484-4792-aff6-930d2a6c9e2d',
  'd3880c99-5f06-4fa3-bf0e-0402d7f35018',
  '05df4834-bc24-404c-93c0-95b2d1cb1eb1',
  '834f48cb-6620-402e-bbc2-5c532b465dc0',
  'e15d9f69-505f-4bdc-a711-

In [26]:
vectorstore_from_directory.get(ids = '75a3925a-0128-4fe3-bf44-c99bd7936787',
                               include = ['embeddings'])

{'ids': ['75a3925a-0128-4fe3-bf44-c99bd7936787'],
 'embeddings': array([[-0.00145079,  0.00294724,  0.04136246, ...,  0.00858565,
         -0.02052466, -0.00128198]]),
 'documents': None,
 'uris': None,
 'included': ['embeddings'],
 'data': None,
 'metadatas': None}

In [27]:
added_document = Document(page_content="This is a new document to add to the vector store.",
                           metadata={"Course Title": "Test Course", 
                                     "Lecture Title": "Test Lecture"} )

In [29]:
vectorstore_from_directory.add_documents([added_document])

['ff331c0c-d815-4929-8de1-17819ae4596d']

In [31]:
vectorstore.get('ff331c0c-d815-4929-8de1-17819ae4596d')

{'ids': ['ff331c0c-d815-4929-8de1-17819ae4596d'],
 'embeddings': None,
 'documents': ['This is a new document to add to the vector store.'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'Lecture Title': 'Test Lecture',
   'Course Title': 'Test Course'}]}

In [32]:
updated_document = Document(page_content="This is an updated document content.",
                            metadata={"Course Title": "Test Course"})

In [33]:
vectorstore_from_directory.update_document(document_id = 'ff331c0c-d815-4929-8de1-17819ae4596d',
                                           document = updated_document)

In [None]:
vectorstore.get('ff331c0c-d815-4929-8de1-17819ae4596d')

{'ids': ['ff331c0c-d815-4929-8de1-17819ae4596d'],
 'embeddings': None,
 'documents': ['This is an updated document content.'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'Course Title': 'Test Course',
   'Lecture Title': 'Test Lecture'}]}

In [36]:
vectorstore_from_directory.delete("ff331c0c-d815-4929-8de1-17819ae4596d")

In [37]:
vectorstore.get('ff331c0c-d815-4929-8de1-17819ae4596d')

{'ids': [],
 'embeddings': None,
 'documents': [],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': []}