In [1]:
%pip install pypdf

Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
Note: you may need to restart the kernel to use updated packages.



# Loading a PDF

In [2]:
# PyPDFLoader is imported from langchain_community, the same package the
# Chroma vector store is imported from later in this notebook; the
# `langchain.document_loaders` path is a deprecated alias for it.
from langchain_community.document_loaders import PyPDFLoader

# Loader for ./data/Redis.pdf; the pages are read in the next cell via loader.load().
loader = PyPDFLoader("./data/Redis.pdf")


In [3]:
# Read the PDF into a list of Document objects. For this file the list has
# 10 entries; entries carry metadata such as the source path and a page index
# (see documents[9] below).
documents = loader.load()


In [4]:
# Number of Document objects returned by the loader.
len(documents)

10

In [5]:
# Inspect the last entry (index 9 of 10): its text plus source/page metadata.
# NOTE(review): the extracted text shows \x01 where spaces would be expected;
# consider normalising it before splitting and embedding.
documents[9]

Document(page_content='\x01\x01Redis\x01does\x01not\x01guarantee\x01consistency\x01and\x01thus\x01cannot\x01be\x01used\x01as\x01a\x01DB.\x01The\x01DBA\x01team\x01does\x01not\x01\npromise\x01that\x01Redis\x01can\x01recover\x01data\x01according\x01to\x01a\x01specified\x01point\x01in\x01time;\x01The\x01architecture\x01\ndesign\x01for\x01Redis\x01must\x01take\x01into\x01account\x01recovery\x01solutions\x01for\x01data\x01misdeletion\x01and\x01other\x01\nsimilar\x01scenarios.\x01\x01', metadata={'source': './data/Redis.pdf', 'page': 9})

# Splitting the text

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters, named so they can be tuned in one place.
# Lengths are measured with len(), i.e. in characters of the chunk text.
CHUNK_SIZE = 100     # maximum length of a chunk
CHUNK_OVERLAP = 20   # target overlap between neighbouring chunks

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

In [7]:
# Split the pages that are already held in `documents` rather than calling
# loader.load_and_split(), which would read and parse the PDF a second time
# before handing the pages to this same splitter.
chunks = splitter.split_documents(documents)

In [8]:
# Number of chunks produced from the loaded pages.
len(chunks)

162

In [9]:
# Spot-check one chunk; its metadata keeps the source path and page index.
chunks[100]

Document(page_content='staggering\x01of\x01the\x01expiration\x01time\x01to\x01prevent\x01centralized\x01expiration)\x01to\x01ensure\x01that\x01unused\x01keys\x01', metadata={'source': './data/Redis.pdf', 'page': 6})

# Embedding the text

In [10]:
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import os

# Load settings from a .env file into the process environment.
# NOTE(review): OpenAIEmbeddings() is created below with no arguments, so it
# presumably picks up its API key from the environment populated here — confirm.
# NOTE(review): `os` is not used in any visible cell.
load_dotenv()

True

In [11]:
# Embedding client built with library defaults (no model name or key passed here).
# The vectors it returns later in this notebook have 1536 components.
embeddings_model = OpenAIEmbeddings()

In [20]:
%pip install tiktoken

Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
Note: you may need to restart the kernel to use updated packages.


In [12]:
# Embed a single sample chunk. embed_documents takes a list of texts, so the
# one string is wrapped in a list and the result is a list with one vector.
sample_text = chunks[4].page_content
embeddings = embeddings_model.embed_documents([sample_text])

In [13]:
# (number of embedded texts, number of components in each vector)
(len(embeddings), len(embeddings[0]))

(1, 1536)

In [23]:
# Show the raw vector computed for the sample chunk.
# NOTE(review): this prints every component of the vector; a slice such as
# embeddings[0][:5] would keep the saved output short.
embeddings[0]

[-0.02101807136819113,
 0.030045505996304532,
 0.012665972275157675,
 -0.026076190506215166,
 -0.021927707176936078,
 -0.001352392270715061,
 -0.017793003696646752,
 -0.02344376437465076,
 0.021307500351041057,
 -0.017145233447481856,
 0.022671953132835893,
 0.0066051875152238,
 -0.0070289947057724875,
 -0.003566181099823731,
 -0.020797554668806242,
 0.013272395526772583,
 -0.00019639836932969018,
 0.0054233517258177695,
 0.04220153072658222,
 -0.015353528978552184,
 0.006846378644706269,
 0.02822623624145981,
 -0.02524925055521537,
 -0.016332075208117,
 -0.009220386972905826,
 0.01312768103534762,
 0.026296709068245227,
 -0.01899206662559646,
 -0.012776231556172706,
 -0.007104797379393704,
 0.005857495200092662,
 -0.005654205640251448,
 -0.01165296990386034,
 -0.019433101887011407,
 -0.01856481493846162,
 0.015698088533232214,
 -0.00026186451184214077,
 -0.019102325906611487,
 0.02713743073087996,
 0.005771355777083948,
 0.03423533632313361,
 -0.005922961589987675,
 -0.008441684875273

# Storing the vectors

In [24]:
%pip install chromadb

Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
Note: you may need to restart the kernel to use updated packages.


In [25]:
from langchain_community.vectorstores.chroma import Chroma

# Embed every chunk with the OpenAI embedding model and index the results in
# a Chroma vector store.
# NOTE(review): no persist_directory is given, so the index presumably lives
# only in this kernel's memory — confirm.
db = Chroma.from_documents(documents=chunks, embedding=embeddings_model)

This `db` object is the vector database itself, which we can query directly.

# Similarity search

In [35]:
# Retrieve the chunks closest to the query text ("capacity management").
# k=4 spells out the library default, which is why four documents come back.
query = "容量管理"
docs = db.similarity_search(query, k=4)

In [36]:
# Number of documents returned by the similarity search.
len(docs)

4

In [39]:
# Inspect the last of the four hits; its metadata points at page index 2 of the PDF.
docs[3]

Document(page_content='内存限额\x01 3.\nRedis单分⽚使⽤内存不要超过10G，超过10G限额有性能⻛险，如超过限额请提前清理数据或者\n找DBA申请分⽚扩容。\x01\n四、客⼾端使⽤\x01', metadata={'page': 2, 'source': './data/Redis.pdf'})