In [2]:
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter



In [3]:
# Step 1: Build the Hugging Face embedding model that turns text into vectors.
# Swap in a different model name here if desired.
embedding_model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)


In [None]:


# Step 2: Read the example PDF from disk with PyPDFLoader.
loader = PyPDFLoader("../../00-example_data/layout-parser-paper.pdf")
pdf_docs = loader.load()
print(f"Loaded {len(pdf_docs)} documents from the file.")



# Split the loaded PDF documents into overlapping chunks ready for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # upper bound on chunk length, measured by length_function
    chunk_overlap=20,  # amount of overlap shared by neighbouring chunks
    length_function=len,  # measure length in characters
    is_separator_regex=False,  # treat separators as literal strings
)
documents = text_splitter.split_documents(pdf_docs)

# Fail fast: an empty result (e.g. a PDF with no extractable text) leaves
# nothing to embed or index in the following steps.
assert documents, "No text chunks were produced from the PDF."

print(f"Split {len(pdf_docs)} loaded documents into {len(documents)} chunks.")


In [15]:
# Collect the raw text of every chunk, in the same order as `documents`.
texts = list(map(lambda chunk: chunk.page_content, documents))


In [None]:
# Preview only the first few chunks; displaying the whole list floods the
# notebook output and bloats the saved file.
texts[:3]

In [None]:

# Step 3: Create FAISS Vector Store
# Index the Document objects directly (rather than bare strings) so that each
# chunk's metadata is stored alongside its text and returned with search hits.
faiss_store = FAISS.from_documents(documents, embedding_model)

# Step 4: Save the Vector Store to Disk
faiss_store.save_local("faiss_pdf_index")
print("FAISS vector store saved to 'faiss_pdf_index'.")


In [None]:


# Step 5: Reload the Vector Store
# The same embedding model used to build the index is passed back in.
# NOTE: allow_dangerous_deserialization opts in to loading serialized data
# from disk; that is acceptable here only because this notebook wrote
# "faiss_pdf_index" itself. Do not enable it for index files from untrusted sources.
loaded_faiss_store = FAISS.load_local(
    folder_path="faiss_pdf_index",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)
print("FAISS vector store reloaded.")


In [None]:

# Step 6: Run a similarity search against the reloaded index.
# Edit `query` to ask something else; `k` sets how many hits come back.
query = "What algorithm discussed in the document?"
results = loaded_faiss_store.similarity_search(query, k=3)

# Print each hit's text, numbered from 1.
print("\nMost Similar Documents:")
for rank, hit in enumerate(results, 1):
    print(f"{rank}. {hit.page_content}")
