### FAISS

FAISS (Facebook AI Similarity Search) is a library for efficient similarity search and clustering of dense vectors.

In [2]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the raw speech and split it into small overlapping chunks for embedding.
loader = TextLoader("speech.txt")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
# Split once and keep the result.
docs = text_splitter.split_documents(documents)

# Embed every chunk with the Ollama-served `gemma:2b` model and index the vectors in FAISS.
embeddings = OllamaEmbeddings(model="gemma:2b")
db = FAISS.from_documents(docs, embeddings)
db

<langchain_community.vectorstores.faiss.FAISS at 0x1187dd400>

In [4]:
query = "What is the main idea of the speech?"
# Keep the hits under their own name so the chunk list in `docs` is not overwritten.
similar_docs = db.similarity_search(query)
similar_docs


[Document(id='09a46416-c9d6-489d-864d-8c316c3782c8', metadata={'source': 'speech.txt'}, page_content='is more equitable, more sustainable, and more just?'),
 Document(id='ec085b49-e8b4-40e7-b9c8-ee87a04bfd63', metadata={'source': 'speech.txt'}, page_content='and strengthen our resolve.'),
 Document(id='5b81f9d6-d9b2-4589-bfb4-81e84266981f', metadata={'source': 'speech.txt'}, page_content='us write that story with courage, compassion, and conviction.'),
 Document(id='7983853e-c42a-4a26-9aaf-7c45e2b10f9b', metadata={'source': 'speech.txt'}, page_content='together, we can achieve extraordinary things. Thank you.')]

### As a retriever

We can also convert the vector store into a retriever. This makes it easy to use with other LangChain components, which largely work with retrievers.

In [5]:
# Wrap the FAISS store in a retriever and run the same query through it.
retriever = db.as_retriever()
retriever.invoke(query)

[Document(id='09a46416-c9d6-489d-864d-8c316c3782c8', metadata={'source': 'speech.txt'}, page_content='is more equitable, more sustainable, and more just?'),
 Document(id='ec085b49-e8b4-40e7-b9c8-ee87a04bfd63', metadata={'source': 'speech.txt'}, page_content='and strengthen our resolve.'),
 Document(id='5b81f9d6-d9b2-4589-bfb4-81e84266981f', metadata={'source': 'speech.txt'}, page_content='us write that story with courage, compassion, and conviction.'),
 Document(id='7983853e-c42a-4a26-9aaf-7c45e2b10f9b', metadata={'source': 'speech.txt'}, page_content='together, we can achieve extraordinary things. Thank you.')]

### Similarity search with score

There are some FAISS-specific methods. One of them is `similarity_search_with_score`, which returns not only the documents but also the distance between the query and each document. The returned score is an L2 distance, so a lower score is better.

In [6]:
# Each result is a (Document, score) tuple; a lower score means a closer match.
docs_and_score = db.similarity_search_with_score(query)
docs_and_score



[(Document(id='09a46416-c9d6-489d-864d-8c316c3782c8', metadata={'source': 'speech.txt'}, page_content='is more equitable, more sustainable, and more just?'),
  np.float32(2298.5415)),
 (Document(id='ec085b49-e8b4-40e7-b9c8-ee87a04bfd63', metadata={'source': 'speech.txt'}, page_content='and strengthen our resolve.'),
  np.float32(2338.145)),
 (Document(id='5b81f9d6-d9b2-4589-bfb4-81e84266981f', metadata={'source': 'speech.txt'}, page_content='us write that story with courage, compassion, and conviction.'),
  np.float32(2420.672)),
 (Document(id='7983853e-c42a-4a26-9aaf-7c45e2b10f9b', metadata={'source': 'speech.txt'}, page_content='together, we can achieve extraordinary things. Thank you.'),
  np.float32(2431.5718))]

In [7]:
# Embed the query ourselves, then search the index with the raw vector.
embedding_vector = embeddings.embed_query(query)
# NOTE: despite the name, this holds plain Documents -- no scores are returned.
docs_score = db.similarity_search_by_vector(embedding_vector)
docs_score

[Document(id='09a46416-c9d6-489d-864d-8c316c3782c8', metadata={'source': 'speech.txt'}, page_content='is more equitable, more sustainable, and more just?'),
 Document(id='ec085b49-e8b4-40e7-b9c8-ee87a04bfd63', metadata={'source': 'speech.txt'}, page_content='and strengthen our resolve.'),
 Document(id='5b81f9d6-d9b2-4589-bfb4-81e84266981f', metadata={'source': 'speech.txt'}, page_content='us write that story with courage, compassion, and conviction.'),
 Document(id='7983853e-c42a-4a26-9aaf-7c45e2b10f9b', metadata={'source': 'speech.txt'}, page_content='together, we can achieve extraordinary things. Thank you.')]

In [8]:
# Persist the index under "faiss_index" so it can be reloaded without re-embedding.
db.save_local("faiss_index")

In [11]:
# Reload the saved index. allow_dangerous_deserialization=True opts in to
# unpickling the stored docstore, so only load index folders you created yourself.
new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

In [14]:
# Sanity check: the reloaded index should return the same hits as the in-memory one.
new_db.similarity_search_by_vector(embedding_vector)

[Document(id='09a46416-c9d6-489d-864d-8c316c3782c8', metadata={'source': 'speech.txt'}, page_content='is more equitable, more sustainable, and more just?'),
 Document(id='ec085b49-e8b4-40e7-b9c8-ee87a04bfd63', metadata={'source': 'speech.txt'}, page_content='and strengthen our resolve.'),
 Document(id='5b81f9d6-d9b2-4589-bfb4-81e84266981f', metadata={'source': 'speech.txt'}, page_content='us write that story with courage, compassion, and conviction.'),
 Document(id='7983853e-c42a-4a26-9aaf-7c45e2b10f9b', metadata={'source': 'speech.txt'}, page_content='together, we can achieve extraordinary things. Thank you.')]

### Chroma

Chroma is an AI-native open-source vector database focused on developer productivity.

In [17]:
## Building a sample vectordb

from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings

# Load the speech and split it into small overlapping chunks.
loader = TextLoader("speech.txt")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
# Split once and keep the result.
docs = text_splitter.split_documents(documents)
# Embedding model served by Ollama; used to vectorise the chunks for Chroma.
embeddings = OllamaEmbeddings(model="gemma:2b")

In [18]:
# Embed the chunks and index them in a Chroma store (no persist directory is passed here).
vectordb = Chroma.from_documents(documents=docs, embedding=embeddings)

In [20]:
query = "What is the main idea of the speech?"
# Store the hits under their own name: rebinding `docs` here would leave only
# these few results for any later cell that builds an index from `docs`.
similar_docs = vectordb.similarity_search(query)

# Same search, but each hit is paired with its score.
vectordb.similarity_search_with_score(query)


[(Document(id='f4fc077a-1800-41d0-87a6-7c0b14a4f3f3', metadata={'source': 'speech.txt'}, page_content='is more equitable, more sustainable, and more just?'),
  2298.54150390625),
 (Document(id='50c26385-7223-4bb2-9a52-0c2a4c2660eb', metadata={'source': 'speech.txt'}, page_content='and strengthen our resolve.'),
  2338.145263671875),
 (Document(id='230e0522-cc64-4df2-a3f5-226641d73118', metadata={'source': 'speech.txt'}, page_content='us write that story with courage, compassion, and conviction.'),
  2420.672119140625),
 (Document(id='d8e0090e-62ea-4e1c-aa99-8f2efe96c568', metadata={'source': 'speech.txt'}, page_content='together, we can achieve extraordinary things. Thank you.'),
  2431.57177734375)]

In [21]:
# Expose the Chroma store through the retriever interface and run the same query.
retriever = vectordb.as_retriever()
retriever.invoke(query)

[Document(id='f4fc077a-1800-41d0-87a6-7c0b14a4f3f3', metadata={'source': 'speech.txt'}, page_content='is more equitable, more sustainable, and more just?'),
 Document(id='50c26385-7223-4bb2-9a52-0c2a4c2660eb', metadata={'source': 'speech.txt'}, page_content='and strengthen our resolve.'),
 Document(id='230e0522-cc64-4df2-a3f5-226641d73118', metadata={'source': 'speech.txt'}, page_content='us write that story with courage, compassion, and conviction.'),
 Document(id='d8e0090e-62ea-4e1c-aa99-8f2efe96c568', metadata={'source': 'speech.txt'}, page_content='together, we can achieve extraordinary things. Thank you.')]

In [23]:
# Rebuild the store with a persist directory so the collection is written to disk.
# Chunks are re-derived from `documents` instead of reading `docs`, so the
# persisted collection always holds the full set of chunks even if `docs` has
# been rebound to a handful of search results.
vectordb = Chroma.from_documents(
    documents=text_splitter.split_documents(documents),
    embedding=embeddings,
    persist_directory="chroma_db2",
)