# Faiss
> **Facebook AI Similarity Search**

## synchronous 


**[ASYNC版本](https://python.langchain.com/docs/integrations/vectorstores/async_faiss)**

In [2]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader

In [3]:
from langchain.document_loaders import TextLoader

In [4]:
loader = TextLoader('./input/state_of_the_union.txt')

In [5]:
documents = loader.load()

In [6]:
documents.__len__()

1

In [7]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

In [8]:
docs = text_splitter.split_documents(documents)

In [9]:
embeddings = OpenAIEmbeddings()

In [10]:
db = FAISS.from_documents(docs, embeddings)

### 1. 字符串查

In [11]:
query = "What did the president say about Ketanji Brown Jackson"

In [12]:
docs = db.similarity_search(query)

In [13]:
docs.__len__()

4

In [15]:
print(docs[0].page_content)

Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.


### 2. 字符串查,且返回分数

`similarity_search_with_score`
1. L2 distance : a lower score is better
2. 不仅返回文档,还返回score

In [16]:
docs_and_scores = db.similarity_search_with_score(query)

In [18]:
for doc, score in docs_and_scores:
    print(doc, score)

page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.' metadata={'source': './input/state_of_the_union.txt'} 0.36921751
page_content='A former top litigator in private practice. A former federal public defender. And from a family of public school

### 3. 根据向量查

In [19]:
embedding_vector = embeddings.embed_query(query)

In [20]:
for doc, score in db.similarity_search_with_score_by_vector(embedding_vector):
    print(doc, score)

page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.' metadata={'source': './input/state_of_the_union.txt'} 0.36912858
page_content='A former top litigator in private practice. A former federal public defender. And from a family of public school

### 4. Saving and loading

In [21]:
db.save_local('./faiss_index')

In [22]:
new_db = FAISS.load_local('./faiss_index', embeddings)

### 5. Serializing and De-Serializing to bytes

1. save_local 空间很大, 
2. 序列化接口很小, 如果要将向量库持久化到sql数据库, 这是个不错的选择. 

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

**序列化**

In [23]:
pkl = db.serialize_to_bytes()

**反序列化**

In [24]:
db = FAISS.deserialize_from_bytes(embeddings=embeddings, serialized=pkl)

### 6. Merging, 合并索引

In [25]:
db1 = FAISS.from_texts(["foo"], embeddings)
db2 = FAISS.from_texts(['bar'], embeddings)

In [26]:
db1.docstore._dict

{'1c3c6c3d-d931-44dc-b147-96cddd259275': Document(page_content='foo')}

In [27]:
db2.docstore._dict

{'3d91c136-36bf-462a-9248-14adec957622': Document(page_content='bar')}

In [28]:
db1.merge_from(db2)

In [29]:
db1.docstore._dict

{'1c3c6c3d-d931-44dc-b147-96cddd259275': Document(page_content='foo'),
 '3d91c136-36bf-462a-9248-14adec957622': Document(page_content='bar')}

### 7. 带过滤器的相似度查找

> 1. fetch_k : 是过滤之前获取的文档数 <br>

In [30]:
from langchain.schema import Document

In [31]:
list_of_documents = [
    Document(page_content="foo", metadata=dict(page=1)),
    Document(page_content="bar", metadata=dict(page=1)),
    Document(page_content="foo", metadata=dict(page=2)),
    Document(page_content="barbar", metadata=dict(page=2)),
    Document(page_content="foo", metadata=dict(page=3)),
    Document(page_content="bar burr", metadata=dict(page=3)),
    Document(page_content="foo", metadata=dict(page=4)),
    Document(page_content="bar bruh", metadata=dict(page=4)),
]

In [52]:
db = FAISS.from_documents(list_of_documents, embeddings)

In [33]:
results_with_scores = db.similarity_search_with_score("foo")

In [34]:
for doc, score in results_with_scores:
    print(f"Content: {doc.page_content}, Metadata: {doc.metadata}, Score: {score}")

Content: foo, Metadata: {'page': 1}, Score: 0.0
Content: foo, Metadata: {'page': 2}, Score: 0.0
Content: foo, Metadata: {'page': 3}, Score: 0.0
Content: foo, Metadata: {'page': 4}, Score: 0.0


> Now we make the same query call but we filter for only `page = 1`

In [35]:
results_with_scores = db.similarity_search_with_score("foo", filter=dict(page=1))
for doc, score in results_with_scores:
    print(f"Content: {doc.page_content}, Metadata: {doc.metadata}, Score: {score}")

Content: foo, Metadata: {'page': 1}, Score: 1.4206954801920801e-05
Content: bar, Metadata: {'page': 1}, Score: 0.3131061792373657


> Same thing can be done with the `max_marginal_relevance_search` as well.

In [36]:
results = db.max_marginal_relevance_search("foo", filter=dict(page=1))
for doc in results:
    print(f"Content: {doc.page_content}, Metadata: {doc.metadata}")

Content: foo, Metadata: {'page': 1}
Content: bar, Metadata: {'page': 1}


> `fetch_k` parameter is the number of documents that will be fetched before filtering<br>
> `fetch_k` parameter 是在过滤之前抓取的文档数量

In [37]:
results = db.similarity_search("foo", filter=dict(page=1), k=1, fetch_k=4)
for doc in results:
    print(f"Content: {doc.page_content}, Metadata: {doc.metadata}")

Content: foo, Metadata: {'page': 1}


In [53]:
db.index_to_docstore_id

{0: 'a992967c-761b-4fe6-9f64-5aa7ed88d545',
 1: 'ba1094b9-b7c8-437c-a1db-33c1a39fdfe1',
 2: 'b31d8eac-3d9c-4d26-8244-2272a14b9122',
 3: 'a8dc35f4-6334-42ae-b76d-87fb5a123674',
 4: '7a95ca75-8b31-4744-8a7b-f41915b09c6e',
 5: 'a3f06aaf-096b-48fe-ba16-36a3da942f65',
 6: '9aa83411-5832-41c1-8fff-091e13f2cd5e',
 7: '3d8263fd-7c18-444c-b5e4-b3d73b5abf91'}

In [54]:
db.index_to_docstore_id[0]

'a992967c-761b-4fe6-9f64-5aa7ed88d545'

In [55]:
db.delete([db.index_to_docstore_id[0]])

True

In [56]:
0 in db.index_to_docstore_id

True

In [57]:
db.index_to_docstore_id

{0: 'ba1094b9-b7c8-437c-a1db-33c1a39fdfe1',
 1: 'b31d8eac-3d9c-4d26-8244-2272a14b9122',
 2: 'a8dc35f4-6334-42ae-b76d-87fb5a123674',
 3: '7a95ca75-8b31-4744-8a7b-f41915b09c6e',
 4: 'a3f06aaf-096b-48fe-ba16-36a3da942f65',
 5: '9aa83411-5832-41c1-8fff-091e13f2cd5e',
 6: '3d8263fd-7c18-444c-b5e4-b3d73b5abf91'}

> 少了一个 `a992967c-761b-4fe6-9f64-5aa7ed88d545`

In [58]:
db.add_documents?

Signature: db.add_documents(documents: 'List[Document]', **kwargs: 'Any') -> 'List[str]'
Docstring:
Run more documents through the embeddings and add to the vectorstore.

Args:
    documents (List[Document]: Documents to add to the vectorstore.

Returns:
    List[str]: List of IDs of the added texts.
File:      /opt/conda/envs/preventloss/lib/python3.9/site-packages/langchain/schema/vectorstore.py
Type:      method

In [59]:
db.add_texts?

Signature:
db.add_texts(
    texts: 'Iterable[str]',
    metadatas: 'Optional[List[dict]]' = None,
    ids: 'Optional[List[str]]' = None,
    **kwargs: 'Any',
) -> 'List[str]'
Docstring:
Run more texts through the embeddings and add to the vectorstore.

Args:
    texts: Iterable of strings to add to the vectorstore.
    metadatas: Optional list of metadatas associated with the texts.
    ids: Optional list of unique IDs.

Returns:
    List of ids from adding the texts into the vectorstore.
File:      /opt/conda/envs/preventloss/lib/pyt…

# FAISS 源码解析

> 1. langchain 里默认 faiss的索引是IndexFlatL2(欧式距离) , 也支持IndexFlatIP(内积)<br><br>
    ```python
        FAISS.from_documents(documents=documents, embedding=embedding, distance_strategy="MAX_INNER_PRODUCT")
    ```
    
> 2. Uncomment the following line if you need to initialize FAISS with no AVX2 optimization
    ```python
        # 如果您需要在没有 AVX2 优化的情况下初始化 FAISS，请取消以下注释
        # os.environ['FAISS_NO_AVX2'] = '1'
    ```

## 数据结构

![](./imgs/FAISS_data_structure.png)

### index:
> <font color=blue>这个就是facebook的faiss的向量库</font>

### Docstore:
> 存储Document唯一id与Document对应关系的字典<br>
> <font color=blue>key: id,  str</font><br>
> <font color=blue>value: Document</font>

### index_to_docstore_id
> Dict[int, str] <br>
> faiss的自增序列-> Document唯一id<br>  

    
 
---
### 源码分析过程

> 1. search： 输入str, 如果存在返回 page summary + 一个 Document对象
> 2. search:  输入str, 如果不存在, 返回相似实体(<font color=blue>似乎并没有实现，InMemoryDocstore，仅仅是返回找不到</font>)
> 3. delete:  <font color=blue>从内存字典中， 删除IDs </font>
> 4. add: <font color=blue>增加documents Dict[str->Document]</font>

```python
class Docstore(ABC):
    """Interface to access to place that stores documents.

    Subclasses must implement ``search``. ``delete`` is optional: the base
    implementation below raises ``NotImplementedError``.
    """

    @abstractmethod
    def search(self, search: str) -> Union[str, Document]:
        """Search for document.

        If page exists, return the page summary, and a Document object.
        If page does not exist, return similar entries.

        Args:
            search: The string to look up.

        Returns:
            Either a ``Document`` or a ``str``, as described above.
        """

    def delete(self, ids: List) -> None:
        """Deleting IDs from in memory dictionary.

        Args:
            ids: The IDs to delete.

        Raises:
            NotImplementedError: Always, in this base implementation;
                subclasses that support deletion override this method.
        """
        raise NotImplementedError
        
class AddableMixin(ABC):
     """Mixin class that supports adding texts.

     A class that inherits from this mixin must implement ``add``.
     """

     @abstractmethod
     def add(self, texts: Dict[str, Document]) -> None:
         """Add more documents.

         Args:
             texts: Dictionary whose keys are strings and whose values are
                 the ``Document`` objects to add.
         """
```

```python
class InMemoryDocstore(Docstore, AddableMixin):
    """Simple in memory docstore in the form of a dict (id -> Document)."""

    def __init__(self, _dict: Optional[Dict[str, Document]] = None):
        """Initialize with dict.

        Args:
            _dict: Optional initial mapping of id -> document. A fresh empty
                dict is used when it is omitted.
        """
        self._dict = _dict if _dict is not None else {}

    def add(self, texts: Dict[str, Document]) -> None:
        """Add texts to in memory dictionary.

        Args:
            texts: dictionary of id -> document.

        Raises:
            ValueError: If any id in ``texts`` is already present.
        """
        overlapping = set(texts).intersection(self._dict)
        if overlapping:
            raise ValueError(f"Tried to add ids that already exist: {overlapping}")
        # Rebind to a new dict instead of mutating in place, so a dict that
        # was passed to __init__ is not modified by later additions.
        self._dict = {**self._dict, **texts}

    def delete(self, ids: List) -> None:
        """Delete the given ids from the keys of the in memory dictionary.

        Every id is validated before anything is removed, so a failing call
        leaves the store untouched. Ids repeated in ``ids`` are deleted once.

        Args:
            ids: Ids to delete.

        Raises:
            ValueError: If none of the ids exist in the store.
            KeyError: If some, but not all, of the ids are missing. The
                argument is the first missing id, which matches what
                ``dict.pop`` reported previously -- but it is now raised
                before any entry has been removed.
        """
        # dict.fromkeys de-duplicates while keeping the caller's order.
        unique_ids = list(dict.fromkeys(ids))
        missing = [_id for _id in unique_ids if _id not in self._dict]
        if len(missing) == len(unique_ids):
            raise ValueError(f"Tried to delete ids that does not  exist: {ids}")
        if missing:
            raise KeyError(missing[0])
        for _id in unique_ids:
            del self._dict[_id]

    def search(self, search: str) -> Union[str, Document]:
        """Search via direct lookup.

        Args:
            search: id of a document to search for.

        Returns:
            Document if found, else error message.
        """
        if search not in self._dict:
            return f"ID {search} not found."
        return self._dict[search]
```

> 1. <font color=blue>from_documents 其实是调用的子类的 from_texts 方法 </font>
> 2. <font color=blue>ids 默认是 None, 否则应是一个 List\<str\></font>

```python
class VectorStore(ABC):
    """Interface for vector store."""

    @classmethod
    def from_documents(
        cls: Type[VST],
        documents: List[Document],
        embedding: Embeddings,
        **kwargs: Any,
    ) -> VST:
        """Return VectorStore initialized from documents and embeddings.

        Each document is split into its ``page_content`` and ``metadata`` and
        the work is delegated to ``cls.from_texts``, i.e. to the concrete
        subclass this classmethod is invoked on.

        Args:
            documents: Documents to index.
            embedding: Embeddings object, forwarded to ``from_texts``.
            **kwargs: Extra keyword arguments forwarded to ``from_texts``.

        Returns:
            Whatever ``cls.from_texts`` returns.
        """
        texts = [d.page_content for d in documents]
        metadatas = [d.metadata for d in documents]
        return cls.from_texts(texts, embedding, metadatas=metadatas, **kwargs)


class FAISS(VectorStore):
    """Vector store pairing a faiss index with a docstore.

    The methods below use three pieces of instance state: ``self.index`` (the
    faiss index), ``self.docstore`` (id -> Document storage) and
    ``self.index_to_docstore_id`` (faiss row position -> docstore id).
    ``__init__`` is not part of this excerpt.
    """

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> FAISS:
        """Construct FAISS wrapper from raw documents.

        This is a user friendly interface that:
            1. Embeds documents.
            2. Creates an in memory docstore
            3. Initializes the FAISS database

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from langchain.vectorstores import FAISS
                from langchain.embeddings import OpenAIEmbeddings

                embeddings = OpenAIEmbeddings()
                faiss = FAISS.from_texts(texts, embeddings)
        """
        embeddings = embedding.embed_documents(texts)
        return cls.__from(
            texts,
            embeddings,
            embedding,
            metadatas=metadatas,
            ids=ids,
            **kwargs,
        )

    @classmethod
    def __from(
        cls,
        texts: Iterable[str],
        embeddings: List[List[float]],
        embedding: Embeddings,
        metadatas: Optional[Iterable[dict]] = None,
        ids: Optional[List[str]] = None,
        normalize_L2: bool = False,
        distance_strategy: DistanceStrategy = DistanceStrategy.EUCLIDEAN_DISTANCE,
        **kwargs: Any,
    ) -> FAISS:
        """Build an empty index, wrap it in ``cls`` and add the given texts.

        Args:
            texts: Raw texts to store.
            embeddings: One precomputed vector per text.
            embedding: Embeddings object; its ``embed_query`` is handed to the
                constructor.
            metadatas: Optional metadata, one entry per text.
            ids: Optional docstore ids, one per text.
            normalize_L2: Forwarded to the constructor.
            distance_strategy: Chooses the faiss index type and is forwarded
                to the constructor.
            **kwargs: Extra keyword arguments for the constructor.

        Returns:
            The populated vector store.
        """
        faiss = dependable_faiss_import()

        # 1. The index type is decided here: Euclidean (L2) distance by
        #    default, inner product as the alternative.
        # 1. The index is created with the vector width, len(embeddings[0]).
        if distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT:
            index = faiss.IndexFlatIP(len(embeddings[0]))
        else:
            # Default to L2, currently other metric types not initialized.
            index = faiss.IndexFlatL2(len(embeddings[0]))

        # 2. Instantiate FAISS.
        # 2. The default document store is InMemoryDocstore.
        # 2. The freshly created (still empty) faiss index is handed to the
        #    FAISS wrapper here.
        vecstore = cls(
            embedding.embed_query,
            index,
            InMemoryDocstore(),
            {},
            normalize_L2=normalize_L2,
            # Forward the strategy: similarity_search_with_score_by_vector
            # reads self.distance_strategy to pick the score_threshold
            # comparison, so dropping it here would leave an inner-product
            # index being thresholded as if it were L2.
            # NOTE(review): assumes __init__ (not shown) accepts this keyword.
            distance_strategy=distance_strategy,
            **kwargs,
        )
        # 3. Add the documents to both stores: vectors go into the faiss
        #    index, Documents go into the in-memory docstore.
        vecstore.__add(texts, embeddings, metadatas=metadatas, ids=ids)
        return vecstore

    def __add(
        self,
        texts: Iterable[str],
        embeddings: Iterable[List[float]],
        metadatas: Optional[Iterable[dict]] = None,
        ids: Optional[List[str]] = None,
    ) -> List[str]:
        """Add texts and their vectors to the index and the docstore.

        Args:
            texts: Raw texts to store.
            embeddings: One vector per text.
            metadatas: Optional metadata per text; an empty dict is used for
                every text when omitted.
            ids: Optional docstore ids; uuid4 strings are generated when
                omitted.

        Returns:
            The docstore ids of the added documents.

        Raises:
            ValueError: If the docstore is not an ``AddableMixin``.
        """
        faiss = dependable_faiss_import()

        if not isinstance(self.docstore, AddableMixin):
            raise ValueError(
                "If trying to add texts, the underlying docstore should support "
                f"adding items, which {self.docstore} does not"
            )

        _len_check_if_sized(texts, metadatas, "texts", "metadatas")
        if metadatas:
            documents = [
                Document(page_content=t, metadata=m) for t, m in zip(texts, metadatas)
            ]
        else:
            # Iterate texts exactly once. Zipping texts against a generator
            # that itself walks texts skips items when texts is a one-shot
            # iterable.
            documents = [Document(page_content=t, metadata={}) for t in texts]

        _len_check_if_sized(documents, embeddings, "documents", "embeddings")
        _len_check_if_sized(documents, ids, "documents", "ids")

        # Add to the index.
        vector = np.array(embeddings, dtype=np.float32)
        if self._normalize_L2:
            faiss.normalize_L2(vector)
        # self.index is the actual faiss index. It arrives empty; this is
        # where the document vectors are really added.
        self.index.add(vector)

        # When ids is None, uuids are generated here, so the docstore is
        # essentially a mapping of unique id -> Document.
        # Add information to docstore and index.
        # Count the materialised documents rather than walking texts again.
        ids = ids or [str(uuid.uuid4()) for _ in documents]
        self.docstore.add({id_: doc for id_, doc in zip(ids, documents)})
        starting_len = len(self.index_to_docstore_id)
        # Link the faiss row positions with the newly added ids.
        index_to_id = {starting_len + j: id_ for j, id_ in enumerate(ids)}
        self.index_to_docstore_id.update(index_to_id)
        return ids

    def similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Dict[str, Any]] = None,
        fetch_k: int = 20,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query.

        Args:
            embedding: Embedding vector to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter (Optional[Dict[str, Any]]): Filter by metadata. Defaults to None.
            fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                      Defaults to 20.
            **kwargs: kwargs to be passed to similarity search. Can include:
                score_threshold: Optional, a floating point value between 0 to 1 to
                    filter the resulting set of retrieved docs

        Returns:
            List of documents most similar to the query text and L2 distance
            in float for each. Lower score represents more similarity.

        Raises:
            ValueError: If the docstore returns something other than a
                Document for an id taken from ``index_to_docstore_id``.
        """
        faiss = dependable_faiss_import()
        vector = np.array([embedding], dtype=np.float32)
        if self._normalize_L2:
            faiss.normalize_L2(vector)
        # Without a filter only k rows are needed; with one, fetch_k rows are
        # fetched so that enough candidates survive the filtering.
        scores, indices = self.index.search(vector, k if filter is None else fetch_k)

        # Normalise the filter once, outside the loop: every value becomes a
        # list of accepted metadata values.
        allowed = None
        if filter is not None:
            allowed = {
                key: value if isinstance(value, list) else [value]
                for key, value in filter.items()
            }

        docs = []
        for j, i in enumerate(indices[0]):
            if i == -1:
                # This happens when not enough docs are returned.
                continue
            # This confirms that the positions returned by the index
            # (``indices``) are used as keys into index_to_docstore_id ...
            _id = self.index_to_docstore_id[i]
            # ... and the unique id obtained is then used to fetch the
            # Document from the docstore.
            doc = self.docstore.search(_id)
            if not isinstance(doc, Document):
                raise ValueError(f"Could not find document for id {_id}, got {doc}")
            if allowed is None or all(
                doc.metadata.get(key) in values for key, values in allowed.items()
            ):
                docs.append((doc, scores[0][j]))

        score_threshold = kwargs.get("score_threshold")
        if score_threshold is not None:
            # Inner product / Jaccard: larger is more similar, so keep scores
            # >= threshold. Otherwise the score is a distance: keep <=.
            cmp = (
                operator.ge
                if self.distance_strategy
                in (DistanceStrategy.MAX_INNER_PRODUCT, DistanceStrategy.JACCARD)
                else operator.le
            )
            docs = [
                (doc, similarity)
                for doc, similarity in docs
                if cmp(similarity, score_threshold)
            ]
        return docs[:k]


```