In [6]:
import os
from getpass import getpass

# Prompt for the key only when the environment does not already provide one,
# so re-running this cell (or a non-interactive "Run All") does not block on
# input. getpass keeps the typed key out of the notebook source and output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass(prompt="Your api key")

Your api key········


## 1. embed_documents

In [3]:
from langchain_openai import OpenAIEmbeddings

embeddings_model = OpenAIEmbeddings()

# embed_documents takes a batch of texts and returns one vector per text.
embeddings = embeddings_model.embed_documents(
    texts=[
        "Hi there!",
        "Oh, hello!",
        "What's your name?",
        "My friends call me World",
        "Hello World!",
    ]
)
# OpenAI embedding vectors have 1536 dimensions by default.
# Displayed as (number of texts, vector length).
(len(embeddings), len(embeddings[0]))

(5, 1536)

## 2. embed_query

In [5]:
# embed_query embeds a single string (the search query) rather than a batch.
embedded_query = embeddings_model.embed_query(
    text="What was the name mentioned in the conversation?"
)
# Preview only the first five components of the query vector.
embedded_query[0:5]

[0.0053772740534241935,
 -0.0006527779663918577,
 0.038980290283414216,
 -0.002967397499514861,
 -0.008834564037682272]

## 3. Caching
+ 使用 CacheBackedEmbeddings 缓存嵌入结果，避免对相同文本重复计算嵌入
+ 初始化 CacheBackedEmbeddings 的方法是 from_bytes_store，主要参数：
    + underlying_embeddings: The embedder to use for embedding.
    + document_embedding_cache: Any ByteStore for caching document embeddings.
   

In [8]:
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# The embedder wrapped by the cache.
underlying_embeddings = OpenAIEmbeddings()

# File-backed byte store rooted at ./cache/.
store = LocalFileStore("./cache/")

# Wrap the embedder with the cache; the model name is passed as the namespace
# for the cache keys.
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=underlying_embeddings,
    document_embedding_cache=store,
    namespace=underlying_embeddings.model,
)

嵌入之前没有缓存

In [9]:
# Materialize every key currently held in the store.
[*store.yield_keys()]

[]

In [11]:
# Splitter configured for 1000-character chunks with no overlap.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)

# Load the sample text file, then split it into chunks for embedding.
raw_documents = TextLoader(
    file_path="./example data/HuggingFaceOverview.txt"
).load()
documents = text_splitter.split_documents(documents=raw_documents)

第一次创建向量存储

In [12]:
%%time
db = FAISS.from_documents(documents, cached_embedder)

CPU times: user 72.8 ms, sys: 32.3 ms, total: 105 ms
Wall time: 1.68 s


第二次创建向量存储会快很多，因为嵌入向量已被缓存，不需要重新计算

In [13]:
%%time
db2 = FAISS.from_documents(documents, cached_embedder)

CPU times: user 3.74 ms, sys: 2.21 ms, total: 5.95 ms
Wall time: 6.05 ms


In [14]:
# Show at most the first five cache keys now present in the store.
[*store.yield_keys()][:5]

['text-embedding-ada-002e0431e9f-fc84-5090-94ef-1fbd28a73ea3',
 'text-embedding-ada-00263711f6a-a725-592f-8dec-662c2a2407eb']

更换 ByteStore：例如改用 InMemoryByteStore 作为缓存后端

In [15]:
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import InMemoryByteStore

# Swap the file-backed store for an in-memory byte store.
# Note: this rebinds `store` and `cached_embedder` from the earlier cells.
store = InMemoryByteStore()

cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=underlying_embeddings,
    document_embedding_cache=store,
    namespace=underlying_embeddings.model,
)