In [1]:
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter



In [2]:
# Step 1: Build the Hugging Face embedding model.
# The same object is handed to FAISS when the index is created and again when
# it is reloaded. "all-MiniLM-L6-v2" is only the example choice; put any other
# model name here.
embedding_model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)


In [3]:
# Step 2: Load the PDF and split it into chunks.
# The path is relative to the notebook's working directory.
loader = PyPDFLoader("../../00-example_data/layout-parser-paper.pdf")
pdf_docs = loader.load()
print(f"Loaded {len(pdf_docs)} documents from the file.")

# Chunk length is measured with len(), i.e. in characters: at most 1000
# characters per chunk, with 20 characters shared between neighbouring chunks.
# Separators are matched literally, not as regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

# `documents` holds the chunked Document objects used by the later cells.
documents = text_splitter.split_documents(pdf_docs)
print(len(documents))


Loaded 16 documents from the file.
53


In [4]:
# Plain-text view of every chunk: only page_content is kept, in chunk order.
texts = [chunk.page_content for chunk in documents]

In [5]:
# Preview only the first few chunks; echoing the whole list floods the
# notebook output with every chunk's full text.
texts[:3]

['LayoutParser: A Uniﬁed Toolkit for Deep\nLearning Based Document Image Analysis\nZejiang Shen1 (\x00 ), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\nLee4, Jacob Carlson3, and Weining Li5\n1 Allen Institute for AI\nshannons@allenai.org\n2 Brown University\nruochen zhang@brown.edu\n3 Harvard University\n{melissadell,jacob carlson}@fas.harvard.edu\n4 University of Washington\nbcgl@cs.washington.edu\n5 University of Waterloo\nw422li@uwaterloo.ca\nAbstract. Recent advances in document image analysis (DIA) have been\nprimarily driven by the application of neural networks. Ideally, research\noutcomes could be easily deployed in production and extended for further\ninvestigation. However, various factors like loosely organized codebases\nand sophisticated model conﬁgurations complicate the easy reuse of im-\nportant innovations by a wide audience. Though there have been on-going\neﬀorts to improve reusability and simplify deep learning (DL) model',
 'development in disciplines li

In [6]:

# Step 3: Create FAISS Vector Store
# Index the Document chunks themselves rather than bare strings, so every
# stored entry keeps its metadata next to the text and search hits can be
# traced back to where they came from. The embedded text is the same
# page_content either way.
faiss_store = FAISS.from_documents(documents, embedding_model)

# Step 4: Save the Vector Store to Disk
index_dir = "faiss_pdf_index"  # folder name, relative to the working directory
faiss_store.save_local(index_dir)
print(f"FAISS vector store saved to '{index_dir}'.")


FAISS vector store saved to 'faiss_pdf_index'.


In [7]:


# Step 5: Reload the Vector Store
# The same embedding model that built the index is passed back in for querying.
# SECURITY: allow_dangerous_deserialization=True opts in to unpickling the saved
# store, which can execute arbitrary code. That is acceptable here only because
# the "faiss_pdf_index" folder was written by this notebook; never enable it for
# an index obtained from an untrusted source.
loaded_faiss_store = FAISS.load_local(
    folder_path="faiss_pdf_index",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)
print("FAISS vector store reloaded.")


FAISS vector store reloaded.


In [8]:

# Step 6: Perform a Query
# Edit `query` to ask something else; the search runs against the reloaded
# store and returns the 3 nearest chunks.
query = "What algorithm discussed in the document?"
results = loaded_faiss_store.similarity_search(query, k=3)

# Print the hits as a numbered list, starting at 1.
print("\nMost Similar Documents:")
for rank, hit in enumerate(results, start=1):
    print(f"{rank}. {hit.page_content}")



Most Similar Documents:
1. In: International conference on machine learning. pp. 1180–1189. PMLR (2015)
2. to develop, and is robust to outliers. The DL models also generate ﬁne-grained
results that enable creative approaches like page reorganization for OCR.
16 This measures the overlap between the detected and ground-truth characters, and
the maximum is 1.
17 This measures the number of edits from the ground-truth text to the predicted text,
and lower is better.
3. 16 Z. Shen et al.
[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z.,
Desmaison, A., Antiga, L., Lerer, A.: Automatic diﬀerentiation in pytorch (2017)
[24] Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen,
T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style,
high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019)
[25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth
elements) format