## Load text

In [1]:
from langchain_community.document_loaders import TextLoader

# Read the policy text file from the working directory into LangChain documents.
loader = TextLoader(file_path="companypolicies.txt")
data = loader.load()
data



## Split data

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunk length is measured with len(); chunks are capped at 100 with an
# overlap setting of 20 between neighbouring chunks.
splitter_options = {
    "chunk_size": 100,
    "chunk_overlap": 20,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
chunks = text_splitter.split_documents(data)
len(chunks)

215

## Embedding model

In [3]:
import torch
from langchain_huggingface import HuggingFaceEmbeddings

# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so the notebook still runs on machines without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model_kwargs = {"device": device}
embd_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs=model_kwargs
)

## Vector store

### ChromaDB
#### Build the database

In [4]:
# Import from langchain_community directly; `langchain.vectorstores` is only a
# deprecated re-export of the same class.
from langchain_community.vectorstores import Chroma

In [5]:
# One string id per chunk, numbered in chunk order.
ids = list(map(str, range(len(chunks))))

# Embed every chunk and store it in Chroma under its id.
vectordb = Chroma.from_documents(documents=chunks, embedding=embd_model, ids=ids)

In [6]:
# Spot-check the first three stored chunks through the store's public `get`
# method (ids are passed as a list, matching the other lookups in this notebook).
for i in range(3):
    print(vectordb.get(ids=[str(i)]))

{'ids': ['0'], 'embeddings': None, 'documents': ['1.\tCode of Conduct'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'source': 'companypolicies.txt'}]}
{'ids': ['1'], 'embeddings': None, 'documents': ['Our Code of Conduct outlines the fundamental principles and ethical standards that guide every'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'source': 'companypolicies.txt'}]}
{'ids': ['2'], 'embeddings': None, 'documents': ['that guide every member of our organization. We are committed to maintaining a workplace that is'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'source': 'companypolicies.txt'}]}


In [7]:
# Number of entries currently held by the underlying Chroma collection.
stored_count = vectordb._collection.count()
stored_count

215

#### Similarity search

In [8]:
# Retrieve the chunks closest to the query, using the store's default result count.
query = "Email policy"
docs = vectordb.similarity_search(query=query)
docs

[Document(metadata={'source': 'companypolicies.txt'}, page_content='3.\tInternet and Email Policy'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Our Internet and Email Policy aims to promote safe, responsible usage of digital communication'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Our Internet and Email Policy is established to guide the responsible and secure use of these'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Confidentiality: Reserve email for the transmission of confidential information, trade secrets, and')]

In [9]:
# Same query, keeping only the single best match.
vectordb.similarity_search(query=query, k=1)

[Document(metadata={'source': 'companypolicies.txt'}, page_content='3.\tInternet and Email Policy')]

### FAISS DB

In [10]:
from langchain_community.vectorstores import FAISS

In [11]:
# Index the same chunks in FAISS, reusing the embedding model and the ids.
faissdb = FAISS.from_documents(documents=chunks, embedding=embd_model, ids=ids)

In [14]:
# Look up the first three entries in the FAISS docstore by their string ids.
for doc_id in map(str, range(3)):
    print(faissdb.docstore.search(doc_id))

page_content='1.	Code of Conduct' metadata={'source': 'companypolicies.txt'}
page_content='Our Code of Conduct outlines the fundamental principles and ethical standards that guide every' metadata={'source': 'companypolicies.txt'}
page_content='that guide every member of our organization. We are committed to maintaining a workplace that is' metadata={'source': 'companypolicies.txt'}


#### Similarity search

In [15]:
# Run the same query against the FAISS index.
query = "Email policy"
docs = faissdb.similarity_search(query=query)
docs

[Document(id='51', metadata={'source': 'companypolicies.txt'}, page_content='3.\tInternet and Email Policy'),
 Document(id='73', metadata={'source': 'companypolicies.txt'}, page_content='Our Internet and Email Policy aims to promote safe, responsible usage of digital communication'),
 Document(id='52', metadata={'source': 'companypolicies.txt'}, page_content='Our Internet and Email Policy is established to guide the responsible and secure use of these'),
 Document(id='62', metadata={'source': 'companypolicies.txt'}, page_content='Confidentiality: Reserve email for the transmission of confidential information, trade secrets, and')]

In [16]:
# Same query, keeping only the single best match.
faissdb.similarity_search(query=query, k=1)

[Document(id='51', metadata={'source': 'companypolicies.txt'}, page_content='3.\tInternet and Email Policy')]

## Managing vector store: Adding, updating and deleting entries

### Add

In [17]:
# Sample sentence that the following cells wrap in a Document and add to the Chroma store.
text = "Instructlab is the best open source tool for fine-tuning a LLM."

In [18]:
from langchain_core.documents import Document

In [19]:
# Wrap the sample sentence in a Document together with its source metadata.
new_chunk_metadata = {"source": "tuntun.com", "page": 1}
new_chunk = Document(page_content=text, metadata=new_chunk_metadata)

In [20]:
# The add step below passes a list of documents, so wrap the single document.
new_chunks = [new_chunk]

In [21]:
# Confirm that id "215" is not in the store yet (expect empty results),
# using the store's public `get` method instead of the private collection.
print(vectordb.get(ids=["215"]))

{'ids': [], 'embeddings': None, 'documents': [], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': []}


In [22]:
# Insert the new document under an explicit id; the call returns the ids it stored.
vectordb.add_documents(documents=new_chunks, ids=["215"])

['215']

In [23]:
# Read the entry back through the store's public `get` method to confirm it
# is now stored under id "215".
print(vectordb.get(ids=["215"]))

{'ids': ['215'], 'embeddings': None, 'documents': ['Instructlab is the best open source tool for fine-tuning a LLM.'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'page': 1, 'source': 'tuntun.com'}]}


### Update

In [27]:
# Replacement document: revised sentence and a different source in the metadata.
update_metadata = {"source": "tuananh.com", "page": 1}
update_chunk = Document(
    page_content="Instructlab is a perfect open source tool for fine-tuning a LLM.",
    metadata=update_metadata,
)

In [28]:
# Overwrite the entry stored under id "215" with the replacement document.
vectordb.update_document("215", update_chunk)

In [29]:
# Read the entry back through the store's public `get` method to confirm the
# content and metadata under id "215" were replaced.
print(vectordb.get(ids=["215"]))

{'ids': ['215'], 'embeddings': None, 'documents': ['Instructlab is a perfect open source tool for fine-tuning a LLM.'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'source': 'tuananh.com', 'page': 1}]}


### Delete

In [30]:
# Remove the entry through the vector store's public `delete` method rather
# than reaching into the private underlying collection.
vectordb.delete(ids=["215"])

In [31]:
# Confirm that id "215" is no longer in the store (expect empty results),
# using the store's public `get` method instead of the private collection.
print(vectordb.get(ids=["215"]))

{'ids': [], 'embeddings': None, 'documents': [], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': []}


## Test

In [32]:
# Test 1: the five chunks closest to a drug-policy query.
query = "Drug policy"
vectordb.similarity_search(query, k=5)

[Document(metadata={'source': 'companypolicies.txt'}, page_content='6.\tDrug and Alcohol Policy'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Policy Objective: The Drug and Alcohol Policy is established to establish clear expectations and'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Your adherence to this policy is appreciated as it helps to maintain a safe and drug-free workplace'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='expectations and guidelines for the responsible use of drugs and alcohol within the organization.'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='drug-free workplace for all.')]