# LangChain + FAISS vector store

In [19]:
# https://python.langchain.com/docs/integrations/vectorstores/faiss

In [20]:
pip install faiss-cpu

Note: you may need to restart the kernel to use updated packages.


In [28]:
import getpass
import os

#os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

# Uncomment the following line if you need to initialize FAISS with no AVX2 optimization
# os.environ['FAISS_NO_AVX2'] = '1'

from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

# Input file and chunking settings for the demo corpus.
SOURCE_PATH = "../data/state_of_the_union.txt"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0

# Load the text file as LangChain documents.
loader = TextLoader(SOURCE_PATH)
documents = loader.load()

# Split the loaded documents into chunks; `docs` is what gets indexed further down.
text_splitter = CharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
docs = text_splitter.split_documents(documents)

# Embedding object reused when building, loading and deserializing the FAISS stores in this section.
embeddings = OpenAIEmbeddings()

In [13]:
# Inspect the split result: container type, number of chunks, then the chunks themselves.
print(type(docs), len(docs), docs, sep="\n")

<class 'list'>
42
[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.', 

In [15]:
# Question reused by every similarity search in this section.
query = "What did the president say about Ketanji Brown Jackson"

# Build a FAISS vector store from the chunks, using the embeddings object.
db = FAISS.from_documents(docs, embeddings)

# NOTE(review): this rebinds `docs` from the chunk list to the search hits, so
# re-running this cell without re-running the split cell would index the hits instead.
docs = db.similarity_search(query)

In [16]:
# Summarise the search hits and show the text of the first one.
print(type(docs), len(docs), docs[0].page_content, sep="\n")

<class 'list'>
4
Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.


In [21]:
# Same query, but each hit comes back paired with a numeric score.
docs_and_scores = db.similarity_search_with_score(query)

In [22]:
# Display the first (Document, score) pair.
docs_and_scores[0]

(Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../data/state_of_the_union.txt'}),
 0.36881018)

In [23]:
# Persist the store under the local "faiss_index" path.
db.save_local("faiss_index")

# Load it back, passing the same embeddings object that was used to build it.
# NOTE(review): presumably this deserializes files from disk — only load indexes
# you created yourself; confirm against the LangChain FAISS documentation.
new_db = FAISS.load_local("faiss_index", embeddings)

# Repeat the search against the reloaded store (rebinds `docs` once more).
docs = new_db.similarity_search(query)

In [24]:
# Display the first hit returned by the reloaded store.
docs[0]

Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../data/state_of_the_union.txt'})

In [29]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Round-trip the store through a serialized payload held in a variable rather than files on disk.
pkl = db.serialize_to_bytes()  # serialize the FAISS store
# Optional alternative embedding backend, left disabled:
#embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Rebuild the store from the payload; note that this rebinds the global `db`.
db = FAISS.deserialize_from_bytes(
    embeddings=embeddings, serialized=pkl
)  # load the index

In [31]:
# Re-run the scored search on the deserialized store and show the first (Document, score) pair.
print(query)
docs_and_scores = db.similarity_search_with_score(query)
docs_and_scores[0]

What did the president say about Ketanji Brown Jackson


(Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../data/state_of_the_union.txt'}),
 0.36881018)

# FAISS directly: similarity search over sentence embeddings

In [18]:
# https://medium.com/loopio-tech/how-to-use-faiss-to-build-your-first-similarity-search-bf0f708aa772
import pandas as pd
# Toy dataset: one [text, category] pair per row.
data = [
    ['Where are your headquarters located?', 'location'],
    ['Throw my cellphone in the water', 'random'],
    ['Network Access Control?', 'networking'],
    ['Address', 'location'],
]

# Tabulate the pairs with named columns; the row index is the default 0..n-1.
df = pd.DataFrame.from_records(data, columns=['text', 'category'])

In [19]:
from sentence_transformers import SentenceTransformer
# Encode every text with the "paraphrase-mpnet-base-v2" SentenceTransformer model.
text = df['text']
encoder = SentenceTransformer("paraphrase-mpnet-base-v2")
# One embedding row per input text (the shape printed further down is (4, 768)).
vectors = encoder.encode(text)

In [11]:
# Show the embedding matrix.
print(vectors)

[[-0.00128639 -0.01877659 -0.0341595  ...  0.02724511 -0.01405937
  -0.0161202 ]
 [-0.00368726 -0.07847735 -0.01199681 ...  0.04308362  0.04113805
   0.03189649]
 [-0.00447406  0.01662988  0.01022722 ...  0.02680059 -0.04437997
  -0.00651698]
 [-0.00118615  0.03910449 -0.01067313 ...  0.00961001 -0.03332149
   0.00778077]]


In [20]:
import faiss

# The index dimensionality is taken from the embedding width (second axis of `vectors`).
vector_dimension = vectors.shape[1]
index = faiss.IndexFlatL2(vector_dimension)
# normalize_L2 is called for its effect on `vectors` (no return value is used),
# so the vectors are normalised before being added to the index.
faiss.normalize_L2(vectors)
index.add(vectors)

In [22]:
# Report the matrix shape, the index dimension, and the matrix itself.
print(vectors.shape, vector_dimension, vectors, sep="\n")

(4, 768)
768
[[-0.00128639 -0.01877659 -0.0341595  ...  0.02724511 -0.01405937
  -0.0161202 ]
 [-0.00368726 -0.07847735 -0.01199681 ...  0.04308362  0.04113805
   0.03189649]
 [-0.00447406  0.01662988  0.01022722 ...  0.02680059 -0.04437997
  -0.00651698]
 [-0.00118615  0.03910449 -0.01067313 ...  0.00961001 -0.03332149
   0.00778077]]


In [4]:
import numpy as np

# Encode the query text with the same encoder used for the dataset.
search_text = 'where is your office?'
search_vector = encoder.encode(search_text)
# Wrap the single embedding in a list to get a 2-D batch of one (printed shape: (1, 768)).
_vector = np.array([search_vector])
# Normalise the query the same way the indexed vectors were normalised.
faiss.normalize_L2(_vector)

In [24]:
# Confirm the query batch shape.
print(_vector.shape)

(1, 768)


In [5]:
# Request as many neighbours as there are indexed vectors, i.e. rank the whole dataset.
k = index.ntotal
# `distances` and `ann` each have one row per query; `ann` holds the ids of the matched vectors.
distances, ann = index.search(_vector, k=k)

In [29]:
# Show k, the types of the two result arrays, and then the arrays themselves.
print(k, type(distances), type(ann), distances, ann, sep="\n")

4
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[[0.58487254 1.1759499  1.6442652  1.9197674 ]]
[[0 3 2 1]]


In [6]:
# Tabulate the results of the single query: row 0 of each result array becomes a column.
results = pd.DataFrame({'distances': distances[0], 'ann': ann[0]})

In [30]:
# Show the distance / neighbour-id table.
print(results)

   distances  ann
0   0.584873    0
1   1.175950    3
2   1.644265    2
3   1.919767    1


In [7]:
# Join each hit to its source row: the `ann` column is matched against df's row index.
merge = pd.merge(results, df, left_on='ann', right_index=True)

In [9]:
# Show the hits together with their original text and category.
print(merge)

   distances  ann                                  text    category
0   0.584873    0  Where are your headquarters located?    location
1   1.175950    3                               Address    location
2   1.644265    2               Network Access Control?  networking
3   1.919767    1       Throw my cellphone in the water      random


In [8]:
# Take the category of the first returned neighbour for the single query.
labels  = df['category']
# ann[0][0] is the first neighbour id of query 0; it is looked up against df's default index.
category = labels[ann[0][0]]

In [31]:
# Show the category chosen for the query.
print(category)

location


# FAISS tutorial: exact L2 search on synthetic vectors

In [8]:
import numpy as np
d = 64          # dimensionality of every vector
nb = 100_000    # number of database vectors
nq = 10_000     # number of query vectors


def _random_vectors(count, dim):
    """Return a (count, dim) float32 array of uniform random samples.

    Row i has i / 1000 added to its first column.
    """
    vecs = np.random.random((count, dim)).astype('float32')
    vecs[:, 0] += np.arange(count) / 1000.
    return vecs


np.random.seed(1234)          # fixed seed so the draws are reproducible
xb = _random_vectors(nb, d)   # database vectors (drawn first)
xq = _random_vectors(nq, d)   # query vectors (drawn second, same order as before)

In [11]:
# The integer ramp that, divided by 1000, is added to the first column of xb.
np.arange(nb)

array([    0,     1,     2, ..., 99997, 99998, 99999])

In [13]:
# The integer ramp that, divided by 1000, is added to the first column of xq.
np.arange(nq)

array([   0,    1,    2, ..., 9997, 9998, 9999])

In [14]:
# Show the shape of the database matrix followed by its contents.
print(xb.shape, xb, sep="\n")

(100000, 64)
[[1.91519454e-01 6.22108757e-01 4.37727749e-01 ... 6.24916732e-01
  4.78093803e-01 1.95675179e-01]
 [3.83317441e-01 5.38736843e-02 4.51648414e-01 ... 1.51395261e-01
  3.35174650e-01 6.57551765e-01]
 [7.53425434e-02 5.50063960e-02 3.23194802e-01 ... 3.44416976e-01
  6.40880406e-01 1.26205325e-01]
 ...
 [1.00811470e+02 5.90245306e-01 7.98893511e-01 ... 3.39859009e-01
  3.01949501e-01 8.53854537e-01]
 [1.00669464e+02 9.16068792e-01 9.55078781e-01 ... 5.95364332e-01
  3.84918079e-02 1.05637990e-01]
 [1.00855637e+02 5.91134131e-01 6.78907931e-01 ... 2.18976989e-01
  6.53015897e-02 2.17538327e-01]]


In [15]:
import faiss                   # make faiss available
# NOTE: this rebinds `index`, replacing the index built from sentence embeddings in the previous section.
index = faiss.IndexFlatL2(d)   # build a flat L2 index of dimension d
print(index.is_trained)        # recorded output: True
index.add(xb)                  # add all database vectors to the index
print(index.ntotal)            # number of indexed vectors (recorded output: 100000)

True
100000


In [16]:
k = 4                          # number of nearest neighbours to return per query
# Sanity check: query with the first five database vectors themselves. In the
# recorded output each row's first neighbour is the vector itself at distance 0.
D, I = index.search(xb[:5], k)
print(I)                       # neighbour ids
print(D)                       # matching distances
D, I = index.search(xq, k)     # actual search over all query vectors (rebinds D and I)
print(I[:5])                   # neighbours of the first 5 queries
print(I[-5:])                  # neighbours of the last 5 queries

[[  0 393 363  78]
 [  1 555 277 364]
 [  2 304 101  13]
 [  3 173  18 182]
 [  4 288 370 531]]
[[0.        7.1751738 7.20763   7.2511625]
 [0.        6.3235645 6.684581  6.799946 ]
 [0.        5.7964087 6.391736  7.2815123]
 [0.        7.2779055 7.5279875 7.662846 ]
 [0.        6.7638035 7.2951202 7.3688145]]
[[ 381  207  210  477]
 [ 526  911  142   72]
 [ 838  527 1290  425]
 [ 196  184  164  359]
 [ 526  377  120  425]]
[[ 9900 10500  9309  9831]
 [11055 10895 10812 11321]
 [11353 11103 10164  9787]
 [10571 10664 10632  9638]
 [ 9628  9554 10036  9582]]


In [18]:
# Show the five database vectors used in the sanity check.
print(xb[:5])

[[0.19151945 0.62210876 0.43772775 0.7853586  0.77997583 0.2725926
  0.27646425 0.8018722  0.95813936 0.87593263 0.35781726 0.5009951
  0.6834629  0.71270204 0.37025076 0.5611962  0.50308317 0.01376845
  0.7728266  0.8826412  0.364886   0.6153962  0.07538124 0.368824
  0.9331401  0.65137815 0.39720258 0.78873014 0.31683612 0.56809866
  0.8691274  0.4361734  0.8021476  0.14376682 0.70426095 0.7045813
  0.21879211 0.92486763 0.44214076 0.90931594 0.05980922 0.18428709
  0.04735528 0.6748809  0.59462476 0.5333102  0.04332406 0.5614331
  0.32966843 0.5029668  0.11189432 0.6071937  0.5659447  0.00676406
  0.6174417  0.9121229  0.7905241  0.99208146 0.95880175 0.7919641
  0.28525096 0.62491673 0.4780938  0.19567518]
 [0.38331744 0.05387368 0.4516484  0.98200476 0.1239427  0.1193809
  0.73852307 0.58730364 0.47163254 0.10712682 0.22921857 0.89996517
  0.41675353 0.53585166 0.00620852 0.30064172 0.43689317 0.612149
  0.91819805 0.62573665 0.7059976  0.14983371 0.7460634  0.831007
  0.63372576 

# RAG (retrieval-augmented generation) with LangChain and FAISS

In [1]:
pip install langchain openai faiss-cpu tiktoken

Note: you may need to restart the kernel to use updated packages.


In [2]:
from operator import itemgetter

from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.vectorstores import FAISS

In [3]:
# Single-fact knowledge base: one text, embedded with OpenAI embeddings and stored in FAISS.
vectorstore = FAISS.from_texts(
    ["harrison worked at kensho"],
    embedding=OpenAIEmbeddings(),
)
retriever = vectorstore.as_retriever()

# Prompt text with two placeholders: {context} and {question}.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

# Chat model with default settings.
model = ChatOpenAI()

In [4]:
# Pipeline: build the prompt inputs -> fill the prompt -> call the model -> parse the output with StrOutputParser.
chain = (
    # "context" comes from the retriever; "question" presumably receives the
    # chain input unchanged via RunnablePassthrough — confirm against the LangChain docs.
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [5]:
# Ask a question that the single indexed text can answer.
chain.invoke("where did harrison work?")

'Harrison worked at Kensho.'

In [6]:
# Same prompt as before, plus a {language} placeholder selecting the answer language.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "\n"
    "Answer in the following language: {language}\n"
)
prompt = ChatPromptTemplate.from_template(template)

# The chain input is now a dict: its "question" entry feeds both the retriever
# and the prompt, and its "language" entry is passed straight to the prompt.
chain = (
    {
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question"),
        "language": itemgetter("language"),
    }
    | prompt
    | model
    | StrOutputParser()
)

In [7]:
# Ask the same question but request the answer in Italian.
# NOTE(review): the recorded output is the question translated into Italian,
# not an answer to it — the prompt may need tightening.
chain.invoke({"question": "where did harrison work", "language": "italian"})

'Dove ha lavorato Harrison?'