# 数据检索（综述） | 🦜️🔗 LangChain

https://techdiylife.github.io/blog/topic.html?category2=t07&blogid=0044

https://python.langchain.com/v0.1/docs/modules/data_connection/

## 文档加载器

In [None]:


from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/txt/odyssey.txt")
documents = loader.load()

text = documents[0].page_content
text 

## 文本分割器 

https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb

In [None]:
# pip install langchain-text-splitters 

# level 1 - Character Splitting
# from langchain.text_splitter import CharacterTextSplitter

# text_splitter = CharacterTextSplitter(
#     separator="\n",
#     chunk_size=100,
#     chunk_overlap=10
# )

# text_splitter.create_documents([text])



# level 2 - Recursive Character Text Splitting
# from langchain.text_splitter import RecursiveCharacterTextSplitter

# text_splitter = RecursiveCharacterTextSplitter(chunk_size = 150, chunk_overlap=10)
# text_splitter.create_documents([text])



# level 3 - Document Specific Splitting
# from langchain.text_splitter import MarkdownTextSplitter
# md_splitter = MarkdownTextSplitter(chunk_size = 40, chunk_overlap=0)

# from langchain.text_splitter import PythonCodeTextSplitter
# python_splitter = PythonCodeTextSplitter(chunk_size=100, chunk_overlap=0)

# from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
# js_splitter = RecursiveCharacterTextSplitter.from_language(
#     language=Language.JS, chunk_size=65, chunk_overlap=0
# )


In [None]:
## PDF elements extraction

# Method 1 : basic use of unstructured 
# !pip3 install "unstructured[all-docs]"
# import os
# from unstructured.partition.pdf import partition_pdf
# from unstructured.staging.base import elements_to_json


# pdf_path = "../data/pdf/Attention_is_all_you_need_2017.pdf"

# # Extracts the elements from the PDF
# elements = partition_pdf(
#     filename=pdf_path,

#     # Unstructured Helpers
#     strategy="hi_res", 
#     infer_table_structure=True, 
#     model_name="yolox" # get bounding boxs (for tables) and find tables
# )


# for element in elements:
#     print(element)
    
    
# Method 2 : use vLLM
# from typing import Any

# from pydantic import BaseModel
# from unstructured.partition.pdf import partition_pdf

# # Get elements
# raw_pdf_elements = partition_pdf(
#     filename=pdf_path,
    
#     # Using pdf format to find embedded image blocks
#     extract_images_in_pdf=True,
    
#     # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
#     # Titles are any sub-section of the document
#     infer_table_structure=True,
    
#     # Post processing to aggregate text once we have the title
#     chunking_strategy="by_title",
#     # Chunking params to aggregate text blocks
#     # Attempt to create a new chunk 3800 chars
#     # Attempt to keep chunks > 2000 chars
#     # Hard max on chunks
#     max_characters=4000,
#     new_after_n_chars=3800,
#     combine_text_under_n_chars=2000,
#     image_output_dir_path="static/pdfImages/",
# )


# Method 3 : translate image into semantic text 
from langchain.chat_models import ChatOpenAI
from langchain.schema.messages import HumanMessage
import os
from dotenv import load_dotenv
from PIL import Image
import base64
import io

load_dotenv()


llm = ChatOpenAI(model="gpt-4-vision-preview")

# Function to convert image to base64
def image_to_base64(image_path):
    with Image.open(image_path) as image:
        buffered = io.BytesIO()
        image.save(buffered, format=image.format)
        img_str = base64.b64encode(buffered.getvalue())
        return img_str.decode('utf-8')

image_str = image_to_base64("static/pdfImages/figure-15-6.jpg")

chat = ChatOpenAI(model="gpt-4-vision-preview",
                  max_tokens=1024)

msg = chat.invoke(
    [
        HumanMessage(
            content=[
                {"type": "text", "text" : "Please give a summary of the image provided. Be descriptive"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_str}"
                    },
                },
            ]
        )
    ]
)

msg.content

# Method 4 : https://zhuanlan.zhihu.com/p/706065120 使用 vLLM 来解析PDF 排版，数学公式，表格，图片，图表 来获得 Markdown 文件
# from gptpdf import parse_pdf # pip install gptpdf
# api_key = 'Your OpenAI API Key'
# content, image_paths = parse_pdf(pdf_path, api_key=api_key)
# print(content)

In [13]:
# level 4 : Semantic Chunking  
# 基本想法是语义相似的片段应该在一起 。下面是在连续句子中寻找断点的方法，#1 句子 递归的和 #2 #3 比较，寻找嵌入距离较大的断点。
# 超过阈值的认为是新语义的开始。为了避免噪声，选择每组3个句子为一个窗口，get embedding but abandoned first sentence. 实际测文章可能更加复杂

import chardet 

with open("../data/txt/三体全集.txt", "rb") as f:
    raw_text = f.read()
    result = chardet.detect(raw_text)
    charenc = result['encoding']

try:  # try opening with the detected encoding
    with open("../data/txt/三体全集.txt", "r", encoding=charenc) as f:
        text = f.read()
except UnicodeDecodeError:  # if error occurs, try with 'gb18030'
    with open("../data/txt/三体全集.txt", "r", encoding='gb18030') as f:
        text = f.read()

#print(text[:2000])  # print a portion to verify if it's correct, you can adjust the range as needed


import re

# Splitting the essay on '.', '?', and '!'
single_sentences_list = re.split(r'(?<=[.?!])\s+', text[:2000])
print (f"{len(single_sentences_list)} senteneces were found")


sentences = [{'sentence': x, 'index' : i} for i, x in enumerate(single_sentences_list)]
sentences[:3]

1 senteneces were found


In [None]:
# 文本嵌入

from langchain_openai import OpenAIEmbeddings
embeddings_model = OpenAIEmbeddings()


embeddings = embeddings_model.embed_documents([
    "Hi there!",
    "Oh, hello!",
    "What's your name?",
    "My friends call me World",
    "Hello World!"
])

print(len(embeddings), len(embeddings[0]))


embedded_query = embeddings_model.embed_query("What was the name mentioned in the conversation?")
print(embedded_query[:5])




In [None]:
# 向量存储  https://python.langchain.com/v0.1/docs/modules/data_connection/vectorstores/

# pip install faiss-cpu

import os
import getpass

os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')


from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS

# Load the document, split it into chunks, embed each chunk and load it into the vector store.
raw_documents = TextLoader('../../../state_of_the_union.txt').load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)
db = FAISS.from_documents(documents, OpenAIEmbeddings())



query = "What did the president say about Ketanji Brown Jackson"
docs = db.similarity_search(query)
print(docs[0].page_content)


embedding_vector = OpenAIEmbeddings().embed_query(query)
docs = db.similarity_search_by_vector(embedding_vector)
print(docs[0].page_content)




#pip install qdrant-client
from langchain_community.vectorstores import Qdrant
db = await Qdrant.afrom_documents(documents, embeddings, "http://localhost:6333")

# 异步检索
query = "What did the president say about Ketanji Brown Jackson"
docs = await db.asimilarity_search(query)
print(docs[0].page_content)

# 基于Vector的异步检索
embedding_vector = embeddings.embed_query(query)
docs = await db.asimilarity_search_by_vector(embedding_vector)


In [None]:
# 检索器 - https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/


In [None]:
# 索引



In [None]:
from langchain.indexes import SQLRecordManager, index
from langchain_core.documents import Document
from langchain_elasticsearch import ElasticsearchStore
from langchain_openai import OpenAIEmbeddings

collection_name = "test_index"
embedding = OpenAIEmbeddings()
vectorstore = ElasticsearchStore(
    es_url="http://localhost:9200", index_name="test_index", embedding=embedding
)

namespace = f"elasticsearch/{collection_name}"
record_manager = SQLRecordManager(
    namespace, db_url="sqlite:///record_manager_cache.sql"
)
record_manager.create_schema()

# 示例文档
doc1 = Document(page_content="kitty", metadata={"source": "kitty.txt"})
doc2 = Document(page_content="doggy", metadata={"source": "doggy.txt"})

# 索引操作
def _clear():
    """为清理内容提供辅助方法。请参阅`full`模式部分了解其工作原理。"""
    index([], record_manager, vectorstore, cleanup="full", source_id_key="source")

index(
    [doc1, doc1, doc1, doc1, doc1],
    record_manager,
    vectorstore,
    cleanup=None,
    source_id_key="source",
)