In [1]:
%pip install langchain-qdrant langchain langchain-huggingface

Note: you may need to restart the kernel to use updated packages.


In [2]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
# Step 1: Initialize Hugging Face Embedding Model
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # Replace with your desired model

# Step 2: Load PDF and Split into Chunks
loader = PyPDFLoader("../../00-example_data/layout-parser-paper.pdf")
pdf_docs = loader.load()

print(f"Loaded {len(pdf_docs)} documents from the file.")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
documents = text_splitter.split_documents(pdf_docs)

Loaded 16 documents from the file.


In [4]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

client = QdrantClient(":memory:")

client.create_collection(
    collection_name="demo_collection",
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
)

True

In [5]:
vector_store = QdrantVectorStore(
    client=client,
    collection_name="demo_collection",
    embedding=embedding_model,
)

In [6]:
vector_store.add_documents(documents=documents)

['173fc7d3408946289985e942e01f2328',
 'c9d4fad099dc47d8a14fc9f7ad107748',
 '66036b6d59284a20a1eb2c0f197b4e4f',
 '9bf57317c3514b708f1db7acb92d3667',
 '01c8ad40e2b040e58e44032c895c737e',
 '353062d038044ef0a7fc7cf390709e5b',
 'b59e1def7e3a4f54b8c40357636db1aa',
 '34bb2b7458264b6fae1f69c14110286d',
 '0d12939096574784953618d375aec300',
 'e7d676e25c8b42d5bec634868fd555ae',
 '5e9d6899fe004377ba26deb950c48d0d',
 '3a0bdd0bb27c43f48fa0be23e54b1e3b',
 '1b37b31c995a47878d07accccf944955',
 '3c107977942c482f9ef73c3d0c906456',
 'f4efe23dcc6d40d7bc94271682744ca0',
 'a940f47391934c33a2c2eab5f7d98584',
 '3a15da8e677a41e2bf7d5831a33e7af8',
 '8c210bc1192b40cf871fce04d8a7eb87',
 'f4eb4eb6596c4950b2a6ac933fc3daeb',
 'b00e69c2ad59422285d617c9626d903e',
 'ae2bbdf7a64a4740aa15c5aac3fa581d',
 '42708cfaea154cbebfb58a2c88ace9e8',
 'd2e52491d3c64f3188b2c9f7ca0afa40',
 '4a34a8e519ba4b7d8398ad7ecfdcea35',
 'f33d5e66b7f14363b197e69c59b35ee5',
 'cd8201e9e5ca407b94da7d386c7582e3',
 '6da143a796fd4f1bbdc85ec63ba5570f',
 

In [7]:
# Step 5: Perform a Query
query = "What algorithm is discussed in the document?"
results = vector_store.similarity_search(query, k=3)

print("\nMost Similar Documents:")
for idx, result in enumerate(results, start=1):
    print(f"{idx}. {result.page_content}")


Most Similar Documents:
1. In: International conference on machine learning. pp. 1180–1189. PMLR (2015)
2. 16 Z. Shen et al.
[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z.,
Desmaison, A., Antiga, L., Lerer, A.: Automatic diﬀerentiation in pytorch (2017)
[24] Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen,
T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style,
high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019)
[25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth
elements) format framework. In: 2010 20th International Conference on Pattern
Recognition. pp. 257–260. IEEE (2010)
[26] Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet:
An approach for end to end table detection and structure recognition from image-
based documents. In: Proceedings of the IEEE/CVF Conference on Computer
Vision and Pattern Recognit