<a href="https://colab.research.google.com/github/vektor8891/llm/blob/main/projects/33_langchain_vector_store/33_langchain_vector_store.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# ! pip install -qq langchain_community
# ! pip install -qq ibm_watsonx_ai
# ! pip install -qq langchain_ibm
# ! pip install -qq chromadb
# ! pip install -qq faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
[?25h

# Create and Configure a Vector Database to Store Document Embeddings

In [3]:
from langchain_community.document_loaders import TextLoader

!wget "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/BYlUHaillwM8EUItaIytHQ/companypolicies.txt"
loader = TextLoader("companypolicies.txt")
data = loader.load()
data

--2025-07-14 18:08:25--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/BYlUHaillwM8EUItaIytHQ/companypolicies.txt
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15660 (15K) [text/plain]
Saving to: ‘companypolicies.txt’


2025-07-14 18:08:25 (227 MB/s) - ‘companypolicies.txt’ saved [15660/15660]





In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)

chunks = text_splitter.split_documents(data)

len(chunks)

215

### Embedding model

In [7]:
from ibm_watsonx_ai.metanames import EmbedTextParamsMetaNames
from langchain_ibm import WatsonxEmbeddings
from google.colab import userdata

embed_params = {
    EmbedTextParamsMetaNames.TRUNCATE_INPUT_TOKENS: 3,
    EmbedTextParamsMetaNames.RETURN_OPTIONS: {"input_text": True},
}

watsonx_embedding = WatsonxEmbeddings(
    model_id="ibm/slate-125m-english-rtrvr",
    url=userdata.get("WATSONX_URL"),
    project_id=userdata.get("WATSONX_PROJECT_ID"),
    params=embed_params,
    apikey=userdata.get('IBM_CLOUD_API_KEY')
)

## Vector store

In [10]:
from langchain.vectorstores import Chroma

ids = [str(i) for i in range(0, len(chunks))]

vectordb = Chroma.from_documents(chunks, watsonx_embedding, ids=ids)

for i in range(3):
    print(vectordb._collection.get(ids=str(i)))

vectordb._collection.count()

{'ids': ['0'], 'embeddings': None, 'documents': ['1.\tCode of Conduct'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'source': 'companypolicies.txt'}]}
{'ids': ['1'], 'embeddings': None, 'documents': ['Our Code of Conduct outlines the fundamental principles and ethical standards that guide every'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'source': 'companypolicies.txt'}]}
{'ids': ['2'], 'embeddings': None, 'documents': ['that guide every member of our organization. We are committed to maintaining a workplace that is'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'source': 'companypolicies.txt'}]}


215

#### Similarity search

In [11]:
query = "Email policy"
docs = vectordb.similarity_search(query)
docs

[Document(metadata={'source': 'companypolicies.txt'}, page_content='internet and email usage, including those related to copyright and data protection.'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='to this policy. Non-compliance may lead to appropriate disciplinary action, which could include'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='This policy serves as a framework for handling discipline and termination. The organization'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Policy Purpose: The Smoking Policy has been established to provide clear guidance and expectations')]

In [12]:
vectordb.similarity_search(query, k = 1)

[Document(metadata={'source': 'companypolicies.txt'}, page_content='internet and email usage, including those related to copyright and data protection.')]

### FIASS DB

In [16]:
from langchain_community.vectorstores import FAISS

faissdb = FAISS.from_documents(chunks, watsonx_embedding, ids=ids)

for i in range(3):
    print(faissdb.docstore.search(str(i)))

query = "Email policy"
docs = faissdb.similarity_search(query)
docs

page_content='1.	Code of Conduct' metadata={'source': 'companypolicies.txt'}
page_content='Our Code of Conduct outlines the fundamental principles and ethical standards that guide every' metadata={'source': 'companypolicies.txt'}
page_content='that guide every member of our organization. We are committed to maintaining a workplace that is' metadata={'source': 'companypolicies.txt'}


[Document(id='69', metadata={'source': 'companypolicies.txt'}, page_content='internet and email usage, including those related to copyright and data protection.'),
 Document(id='118', metadata={'source': 'companypolicies.txt'}, page_content='to this policy. Non-compliance may lead to appropriate disciplinary action, which could include'),
 Document(id='209', metadata={'source': 'companypolicies.txt'}, page_content='This policy serves as a framework for handling discipline and termination. The organization'),
 Document(id='102', metadata={'source': 'companypolicies.txt'}, page_content='Policy Purpose: The Smoking Policy has been established to provide clear guidance and expectations')]

### Managing vector store: Adding, updating, and deleting entries

#### Add

In [17]:
from langchain_core.documents import Document

text = "Instructlab is the best open source tool for fine-tuning a LLM."

new_chunk =  Document(
    page_content=text,
    metadata={
        "source": "ibm.com",
        "page": 1
    }
)

new_chunks = [new_chunk]

print(vectordb._collection.get(ids=['215']))

vectordb.add_documents(
    new_chunks,
    ids=["215"]
)

{'ids': [], 'embeddings': None, 'documents': [], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': []}


['215']

In [18]:
vectordb._collection.count()

216

In [19]:
print(vectordb._collection.get(ids=['215']))

{'ids': ['215'], 'embeddings': None, 'documents': ['Instructlab is the best open source tool for fine-tuning a LLM.'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'source': 'ibm.com', 'page': 1}]}


#### Update


In [20]:
update_chunk =  Document(
    page_content="Instructlab is a perfect open source tool for fine-tuning a LLM.",
    metadata={
        "source": "ibm.com",
        "page": 1
    }
)

vectordb.update_document(
    '215',
    update_chunk,
)

print(vectordb._collection.get(ids=['215']))

{'ids': ['215'], 'embeddings': None, 'documents': ['Instructlab is a perfect open source tool for fine-tuning a LLM.'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'source': 'ibm.com', 'page': 1}]}


#### Delete


In [21]:
vectordb._collection.delete(ids=['215'])

print(vectordb._collection.get(ids=['215']))

{'ids': [], 'embeddings': None, 'documents': [], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': []}


# Exercises


### Exercise 1 - Use another query to conduct similarity search.

In [22]:
query = "Smoking policy"
docs = vectordb.similarity_search(query)
docs

[Document(metadata={'source': 'companypolicies.txt'}, page_content='Smoking Restrictions: Smoking inside company buildings, offices, meeting rooms, and other enclosed'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Designated Smoking Areas: Smoking is only permitted in designated smoking areas, as marked by'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='No Smoking in Company Vehicles: Smoking is not permitted in company vehicles, whether they are'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Policy Purpose: The Smoking Policy has been established to provide clear guidance and expectations')]

In [23]:
! pip freeze > requirements.txt