In [None]:
%pip install langchain langchain-huggingface langchain-pinecone pinecone-notebooks

In [None]:
%pip install pinecone

In [1]:
from pinecone import Pinecone, ServerlessSpec
import getpass
import os
import time

if not os.getenv("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter your Pinecone API key: ")

pinecone_api_key = os.environ.get("PINECONE_API_KEY")

pc = Pinecone(api_key=pinecone_api_key)

In [2]:
from langchain.vectorstores import Pinecone
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
# Step 1: Initialize Hugging Face Embedding Model
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # Replace with your desired model
# embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")


In [4]:
# Step 2: Load PDF and Split into Chunks
loader = PyPDFLoader("../../00-example_data/layout-parser-paper.pdf")
pdf_docs = loader.load()

print(f"Loaded {len(pdf_docs)} documents from the file.")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
documents = text_splitter.split_documents(pdf_docs)

Loaded 16 documents from the file.


In [5]:
len(documents)

53

In [6]:
import time

index_name = "langchain-test-index"  # change if desired

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

In [7]:
pc.describe_index(index_name)

{
    "name": "langchain-test-index",
    "dimension": 384,
    "metric": "cosine",
    "host": "langchain-test-index-zhmtpmp.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "deletion_protection": "disabled"
}

In [8]:
from langchain_pinecone import PineconeVectorStore

vector_store = PineconeVectorStore(index=index, embedding=embedding_model)

In [9]:
vector_store.add_documents(documents=documents)

['6a54fb4e-747c-4a29-91cc-3b8d55cacad3',
 'e13c3999-e72b-43b1-aa98-d94305f89bcb',
 'e73eb3e3-31d9-4574-9663-96d273dc1428',
 'cdfee237-da6a-4030-83bf-2b98bc0abd20',
 '378bb921-3cd7-4b01-a63d-6a004375a499',
 'aba94494-6d63-41ff-8867-122d725aef32',
 '62c60466-0137-47ef-b47c-e91fbd983e4a',
 '6afe1173-d853-44c3-9f77-53d4933dbab0',
 '08fea401-fe6c-469b-905a-33433e90ceab',
 '96b64907-b281-4adf-80cc-2bc5d396b965',
 'f0ebab84-5db2-4b47-bba1-73b42196e459',
 'f054aed6-9aa0-496c-994d-f02f7dfd4ae5',
 'b96c782c-1113-4697-a198-222167ee32e2',
 '7621a5e9-d2c1-4ef7-89bb-c5944b046e22',
 'b6aae1d2-e7ab-4d53-94c6-d53d1eac3fac',
 'f067b340-2e6b-4991-9910-14e64513131a',
 '6833ad11-96a1-430c-9352-990eb8ff9622',
 '1ee004ea-a9dd-47cd-a022-291777c0aad2',
 '31e932c4-6d8f-4e8e-bce3-d7d99c9d5f41',
 'b3e9889e-6ce6-4768-b0d5-902e63c86102',
 'fc953d80-27cd-47cb-b5a9-62db28fde515',
 'd6d91df3-7b17-46ae-85b5-a442d854bc21',
 '2a173659-194f-4fdc-b866-4e3491848083',
 '8ec40d87-a38c-43fb-908a-457a90c4bf17',
 '30ba69e4-c706-

In [10]:
pc.list_indexes()

[
    {
        "name": "quickstart",
        "dimension": 2,
        "metric": "cosine",
        "host": "quickstart-zhmtpmp.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "deletion_protection": "disabled"
    },
    {
        "name": "langchain-test-index",
        "dimension": 384,
        "metric": "cosine",
        "host": "langchain-test-index-zhmtpmp.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "deletion_protection": "disabled"
    }
]

In [11]:

# Step 5: Perform a Query
query = "what is the use case being discussed in the documents?"
results = vector_store.similarity_search(query, k=3)

print("\nMost Similar Documents:")
for idx, result in enumerate(results, start=1):
    print(f"{idx}. {result.page_content}")



Most Similar Documents:
1. ument digitization pipelines. For example, sometimes the pipeline requires the
combination of multiple DL models to achieve better accuracy. Currently, pipelines
are mainly described in academic papers and implementations are often not pub-
licly available. To this end, the LayoutParser community platform also enables
the sharing of layout pipelines to promote the discussion and reuse of techniques.
For each shared pipeline, it has a dedicated project page, with links to the source
code, documentation, and an outline of the approaches. A discussion panel is
provided for exchanging ideas. Combined with the core LayoutParser library,
users can easily build reusable components based on the shared pipelines and
apply them to solve their unique problems.
5 Use Cases
The core objective of LayoutParser is to make it easier to create both large-scale
and light-weight document digitization pipelines. Large-scale document processing
2. AllenNLP [8] and transformers [ 

In [12]:
results = vector_store.similarity_search_with_score(
    "what is use case being discussed in the documents?", k=3
)
for res, score in results:
    print(f"* [SIM={score:3f}] {res.page_content} [{res.metadata}]")

* [SIM=0.289667] ument digitization pipelines. For example, sometimes the pipeline requires the
combination of multiple DL models to achieve better accuracy. Currently, pipelines
are mainly described in academic papers and implementations are often not pub-
licly available. To this end, the LayoutParser community platform also enables
the sharing of layout pipelines to promote the discussion and reuse of techniques.
For each shared pipeline, it has a dedicated project page, with links to the source
code, documentation, and an outline of the approaches. A discussion panel is
provided for exchanging ideas. Combined with the core LayoutParser library,
users can easily build reusable components based on the shared pipelines and
apply them to solve their unique problems.
5 Use Cases
The core objective of LayoutParser is to make it easier to create both large-scale
and light-weight document digitization pipelines. Large-scale document processing [{'page': 9.0, 'source': '../../00-example_dat