# Indexing: Inspecting and Managing Documents in a Vectorstore

In [1]:
# The `pip show langchain` cell further below reports the version of langchain installed in the current environment.
# Substitute "langchain" with any other package name to check its version.

In [2]:
!pip install chromadb



In [3]:
!pip show langchain

Name: langchain
Version: 0.3.27
Summary: Building applications with LLMs through composability
Home-page: 
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.12/dist-packages
Requires: langchain-core, langchain-text-splitters, langsmith, pydantic, PyYAML, requests, SQLAlchemy
Required-by: 


In [1]:
import os

from google.colab import userdata

# Read the key through Colab's `userdata` helper and publish it as an
# environment variable, so it is never hard-coded in the notebook.
os.environ.update(OPENAI_API_KEY=userdata.get('OPENAI_API_KEY'))

In [5]:
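# Alternative for environments other than Colab (presumably loads OPENAI_API_KEY from a local .env file):
# %load_ext dotenv
# %dotenv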

In [3]:
import os

# Uninstall the old langchain metapackage (if present) to prevent conflicts
# with newer modular packages like langchain-community.
# The -y flag confirms the uninstallation without prompting.
!pip uninstall -y langchain

# Install/Upgrade langchain-openai and langchain-community.
# langchain-core will be installed as a dependency automatically.
!pip install --upgrade langchain-openai langchain-community

from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document



In [17]:
!pip install langchain-community docx2txt

Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Downloading docx2txt-0.9-py3-none-any.whl (4.0 kB)
Installing collected packages: docx2txt
Successfully installed docx2txt-0.9


In [21]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

In [39]:
# Step 1: read the .docx file into LangChain documents.
loader_docx = Docx2txtLoader("Introduction_to_Data_and_Data_Science_2.docx")
pages = loader_docx.load()

# Step 2: split on markdown headers; the header text is stored in each
# chunk's metadata under the given key ("Course Title" / "Lecture Title").
md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ("#", "Course Title"),
        ("##", "Lecture Title"),
    ]
)
pages_md_split = md_splitter.split_text(pages[0].page_content)

# Step 3: collapse every run of whitespace (including newlines) into a
# single space, updating each chunk in place.
for md_chunk in pages_md_split:
    md_chunk.page_content = ' '.join(md_chunk.page_content.split())

# Step 4: split further at sentence boundaries (".") with a target chunk
# size of 500 and an overlap of 50 between neighbouring chunks.
char_splitter = CharacterTextSplitter(
    separator=".",
    chunk_size=500,
    chunk_overlap=50,
)
pages_char_split = char_splitter.split_documents(pages_md_split)

In [38]:
# Embedding model that is handed to the Chroma vectorstore as its
# `embedding_function`.
embedding = OpenAIEmbeddings(
    model='text-embedding-ada-002',
)

In [40]:
# Point Chroma at the on-disk store under ./intro-to-ds-lectures and give it
# the embedding model defined earlier in the notebook.
vectorstore_from_directory = Chroma(
    persist_directory="./intro-to-ds-lectures",
    embedding_function=embedding,
)

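Note: `get()` below returns only a single ID rather than the IDs of all the chunks. Presumably the `./intro-to-ds-lectures` directory in this environment does not hold the full persisted collection; none of the cells shown here add `pages_char_split` to the vectorstore.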

In [41]:
# Show what the store currently holds (IDs, documents, metadatas).
# Per the output, embeddings are not among the fields included by default.
vectorstore_from_directory.get()

{'ids': ['ebda7571-0e6b-4725-b022-492c2715b2c2'],
 'embeddings': None,
 'documents': ['Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'Lecture Title': 'Analysis vs Analytics',
   'Course Title': 'Introduction to Data and Data Science'}]}

In [42]:
# Fetch the same entry by ID, asking only for its embedding vector; the
# fields that are not requested come back as None.
vectorstore_from_directory.get(
    ids=["ebda7571-0e6b-4725-b022-492c2715b2c2"],
    include=["embeddings"],
)

{'ids': ['ebda7571-0e6b-4725-b022-492c2715b2c2'],
 'embeddings': array([[ 0.00478017, -0.01535145,  0.02508651, ...,  0.02121745,
         -0.01364157, -0.00687695]]),
 'documents': None,
 'uris': None,
 'included': ['embeddings'],
 'data': None,
 'metadatas': None}

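The document below reuses the text and metadata of the entry retrieved above; we use it to test adding, updating, and deleting a document in the vectorstore.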

In [26]:
# Test document: a lecture excerpt together with its course and lecture
# titles as metadata.
added_document = Document(
    page_content='Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis',
    metadata={
        'Course Title': 'Introduction to Data and Data Science',
        'Lecture Title': 'Analysis vs Analytics',
    },
)

In [27]:
# Add the document under a fixed ID instead of letting the store generate a
# random one. The later cells look up, update, and delete this exact ID, so
# an auto-generated ID would differ on every run and break "Restart & Run All".
# NOTE(review): re-running with the same ID is expected to overwrite the entry
# rather than duplicate it — confirm against the installed Chroma wrapper.
ADDED_DOCUMENT_ID = "4f7a5c89-da72-4768-91b0-82000bb18932"

# Returns the list of IDs under which the documents were stored.
vectorstore_from_directory.add_documents([added_document],
                                         ids=[ADDED_DOCUMENT_ID])

['4f7a5c89-da72-4768-91b0-82000bb18932']

In [29]:
# Look up the newly added document by the ID that add_documents returned.
vectorstore_from_directory.get(
    ids=["4f7a5c89-da72-4768-91b0-82000bb18932"],
)

{'ids': ['4f7a5c89-da72-4768-91b0-82000bb18932'],
 'embeddings': None,
 'documents': ['Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'Lecture Title': 'Analysis vs Analytics',
   'Course Title': 'Introduction to Data and Data Science'}]}

In [30]:
# Replacement content: an excerpt from a different lecture, with matching
# metadata, used to overwrite the test entry.
updated_document = Document(
    page_content='Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!',
    metadata={
        'Course Title': 'Introduction to Data and Data Science',
        'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need',
    },
)

In [31]:
# Overwrite the entry stored under this ID with the replacement document.
vectorstore_from_directory.update_document(
    document_id="4f7a5c89-da72-4768-91b0-82000bb18932",
    document=updated_document,
)

In [33]:
# Re-fetch the same ID to inspect the entry after update_document.
vectorstore_from_directory.get(
    ids=["4f7a5c89-da72-4768-91b0-82000bb18932"],
)

{'ids': ['4f7a5c89-da72-4768-91b0-82000bb18932'],
 'embeddings': None,
 'documents': ['Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need',
   'Course Title': 'Introduction to Data and Data Science'}]}

In [35]:
# Remove the test entry from the vectorstore by its ID.
vectorstore_from_directory.delete(
    ids=["4f7a5c89-da72-4768-91b0-82000bb18932"],
)

In [36]:
# Confirm the deletion: looking up the removed ID yields empty result lists.
vectorstore_from_directory.get(
    ids=["4f7a5c89-da72-4768-91b0-82000bb18932"],
)

{'ids': [],
 'embeddings': None,
 'documents': [],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': []}