In [None]:
%pip install langchain langchain-huggingface langchain-pinecone pinecone-notebooks

In [None]:
# Pinecone Python SDK: provides the `pinecone` module imported in the next cell.
%pip install pinecone

In [1]:
from pinecone import Pinecone, ServerlessSpec
import getpass
import os
import time

# Prompt for the Pinecone API key only when the environment does not already
# hold a non-empty value, so the secret never has to be typed into the notebook.
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter your Pinecone API key: ")

# The variable is always present here: either it was preset or the prompt set it.
pinecone_api_key = os.environ["PINECONE_API_KEY"]

# SDK client used by the later cells for index management.
# NOTE: the next cell's `from langchain.vectorstores import Pinecone` rebinds the
# name `Pinecone`; after that cell runs, use `pc` rather than `Pinecone(...)`.
pc = Pinecone(api_key=pinecone_api_key)

In [2]:
from langchain.vectorstores import Pinecone
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [10]:
# Step 1: Initialize Hugging Face Embedding Model
# Any sentence-embedding model can be substituted here; the Pinecone index
# dimension must match the length of the vectors this model produces.
embedding_model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)
# Alternative model:
# embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")



# Step 2: Load PDF and Split into Chunks
# Splitter configuration: chunks of up to 1000 characters (measured with len),
# with 20 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=20,
    chunk_size=1000,
)

# Read the example paper; the loader appears to return one Document per PDF page.
loader = PyPDFLoader("../../00-example_data/layout-parser-paper.pdf")
pdf_docs = loader.load()
print(f"Loaded {len(pdf_docs)} documents from the file.")

# `documents` holds the chunked pages that are later added to the vector store.
documents = text_splitter.split_documents(pdf_docs)



Loaded 16 documents from the file.


In [11]:
import time

index_name = "langchain-test-index"  # change if desired

# Derive the vector size from the embedding model instead of hard-coding 384,
# so that swapping in a different model cannot create an index whose dimension
# does not match the vectors that will be upserted into it.
embedding_dimension = len(embedding_model.embed_query("dimension probe"))

# Names of the indexes that already exist for this API key.
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Create the serverless index only on first run; later runs reuse it.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=embedding_dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll once per second until the new index reports ready before using it.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

# Handle to the index, passed to the LangChain vector store below.
index = pc.Index(index_name)

In [12]:
# Show the index description (name, dimension, metric, host, spec, status) as the cell output.
pc.describe_index(index_name)

{
    "name": "langchain-test-index",
    "dimension": 384,
    "metric": "cosine",
    "host": "langchain-test-index-zhmtpmp.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "deletion_protection": "disabled"
}

In [13]:
from langchain_pinecone import PineconeVectorStore

# Bind the Pinecone index to the embedding model through LangChain's vector
# store wrapper; the cells below add documents and run searches through it.
vector_store = PineconeVectorStore(
    embedding=embedding_model,
    index=index,
)

In [7]:
from langchain_core.documents import Document

# Two short, unrelated sample texts, each tagged with its source in the metadata.
document_1 = Document(
    metadata={"source": "tweet"},
    page_content="I had chocalate chip pancakes and scrambled eggs for breakfast this morning.",
)
document_2 = Document(
    metadata={"source": "news"},
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
)


In [8]:
# Batch of hand-written sample documents to add to the vector store.
documents1 = [document_1, document_2]

In [9]:
# Add the two sample documents; the cell output is the list of IDs assigned to them.
# NOTE(review): IDs look freshly generated per call, so re-running this cell
# presumably stores duplicates — confirm before re-executing.
vector_store.add_documents(documents=documents1)

['1c4a90fd-7a42-4d20-9388-88e42415e084',
 'f6c621c3-b2c4-4ba4-80f5-583237e61447']

In [14]:
# Add every PDF chunk produced by the text splitter; the returned IDs are
# echoed as the cell output.
# NOTE(review): re-running presumably inserts the chunks again under new IDs — confirm.
vector_store.add_documents(documents=documents)

['b2de453b-e753-4191-af4b-d18803ca90da',
 '119f5b6a-dd6a-4207-b34a-9fd2568be618',
 '3012b3d5-936e-4966-8387-44393ce39061',
 'b336b8ef-d33e-4a6c-8324-d909bea5c389',
 '6ec7c53c-f872-4e26-9d8a-536127b26f36',
 'd1a71ec9-9ac3-4a03-b307-f02563090e75',
 'f330f22d-4c8f-42ca-b2e5-6439e31268fd',
 '11c5e34c-164c-41e7-a275-024116654e3d',
 '95b24b26-36da-4a5e-86ce-50fb0103ac2d',
 'e02cac47-90ff-45fc-9b68-189277b05b17',
 '3667e352-7d2b-471f-835b-d05d375dd01b',
 'ce2ba2fd-08f0-46a5-ac51-c54a6ea004ca',
 'f6356cd9-5c6f-480b-bfaf-b000ca227afb',
 '7412b7f5-be5b-4af5-a43a-cfa17e38fa79',
 '53868df4-0788-43f1-a275-028d07e92387',
 'c0d21ac3-a74c-4ca5-9698-3a5288dce857',
 '84a20d28-ed06-4b60-859a-3210852e66e1',
 '04c30a73-ecb3-4168-8c26-ad897d0db8a8',
 'df6bc58c-de2a-4e12-bac4-7d119b24709c',
 'd81ee478-3cb6-4164-a025-2820c7dafb8f',
 'b2db9a6e-2ba0-4967-8246-464e3eb45f70',
 'af3b69b2-944c-4785-aa4e-0f926f07c7dd',
 'd6a517d4-ce06-4e17-b0de-85df01851e28',
 'c127f191-9363-46a4-a6bf-7502a8ce1fca',
 '4c834b3a-bdea-

In [15]:
# List the indexes visible to this client, to confirm the index created above exists.
pc.list_indexes()

[
    {
        "name": "quickstart",
        "dimension": 2,
        "metric": "cosine",
        "host": "quickstart-zhmtpmp.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "deletion_protection": "disabled"
    },
    {
        "name": "langchain-test-index",
        "dimension": 384,
        "metric": "cosine",
        "host": "langchain-test-index-zhmtpmp.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "deletion_protection": "disabled"
    }
]

In [17]:

# Step 5: Perform a Query
# Fetch the three stored chunks most similar to the query text.
query = "logic in the document?"
results = vector_store.similarity_search(query, k=3)

# Print the hits, numbered from 1.
print("\nMost Similar Documents:")
rank = 1
for hit in results:
    print(f"{rank}. {hit.page_content}")
    rank += 1



Most Similar Documents:
1. to develop, and is robust to outliers. The DL models also generate ﬁne-grained
results that enable creative approaches like page reorganization for OCR.
16 This measures the overlap between the detected and ground-truth characters, and
the maximum is 1.
17 This measures the number of edits from the ground-truth text to the predicted text,
and lower is better.
2. rated by white spaces of variable size,
and the vertical positions of objects
can be an indicator of their layout
type.
15 A document page consists of eight rows like this. For simplicity we skip the row
segmentation discussion and refer readers to the source code when available.
3. 16 Z. Shen et al.
[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z.,
Desmaison, A., Antiga, L., Lerer, A.: Automatic diﬀerentiation in pytorch (2017)
[24] Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen,
T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytor

In [20]:
# Same query as the previous search, but each hit comes back paired with its
# similarity score.
results = vector_store.similarity_search_with_score(
    "logic in the document?", k=3
)
for res, score in results:
    # `.3f` prints three decimal places. The earlier `3f` spec only set a minimum
    # field width of 3, so scores were shown with the default six decimals.
    print(f"* [SIM={score:.3f}] {res.page_content} [{res.metadata}]")

* [SIM=0.312871] to develop, and is robust to outliers. The DL models also generate ﬁne-grained
results that enable creative approaches like page reorganization for OCR.
16 This measures the overlap between the detected and ground-truth characters, and
the maximum is 1.
17 This measures the number of edits from the ground-truth text to the predicted text,
and lower is better. [{'page': 11.0, 'source': '../../00-example_data/layout-parser-paper.pdf'}]
* [SIM=0.307854] rated by white spaces of variable size,
and the vertical positions of objects
can be an indicator of their layout
type.
15 A document page consists of eight rows like this. For simplicity we skip the row
segmentation discussion and refer readers to the source code when available. [{'page': 10.0, 'source': '../../00-example_data/layout-parser-paper.pdf'}]
* [SIM=0.281716] 16 Z. Shen et al.
[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z.,
Desmaison, A., Antiga, L., Lerer, A.: Automatic diﬀ