In [1]:
from dotenv import load_dotenv
# Load environment variables from the local `.env` file.
# NOTE(review): presumably this supplies the OpenAI API key used by the embeddings below — confirm.
load_dotenv('.env')

True

In [19]:
import pandas as pd
from uuid import uuid4

from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import RetrievalMode
from langchain_qdrant import QdrantVectorStore

from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, PointStruct, Distance
from qdrant_client.models import Filter, FieldCondition, MatchValue


## qdrant

In [8]:
# Initialize the QdrantClient
client = QdrantClient(":memory:")  # in-memory database, for demonstration only

# Create a collection holding 4-dimensional vectors, configured with Distance.DOT.
client.create_collection(
    collection_name="test_collection",
    vectors_config=VectorParams(size=4, distance=Distance.DOT),
)


True

### insert

In [9]:
# Seed the collection with six city points; ids are assigned 1..6 in listing order.
operation_info = client.upsert(
    collection_name="test_collection",
    wait=True,
    points=[
        PointStruct(id=point_id, vector=embedding, payload={"city": city_name})
        for point_id, (city_name, embedding) in enumerate(
            [
                ("Berlin", [0.05, 0.61, 0.76, 0.74]),
                ("London", [0.19, 0.81, 0.75, 0.11]),
                ("Moscow", [0.36, 0.55, 0.47, 0.94]),
                ("New York", [0.18, 0.01, 0.85, 0.80]),
                ("Beijing", [0.24, 0.18, 0.22, 0.44]),
                ("Mumbai", [0.35, 0.08, 0.11, 0.44]),
            ],
            start=1,
        )
    ],
)

print(operation_info)

operation_id=0 status=<UpdateStatus.COMPLETED: 'completed'>


### delete


In [21]:
# Delete the point with id 4 from the collection.
client.delete(collection_name="test_collection", points_selector=[4])

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

### search

In [26]:
# Score the stored points against a query vector (at most 6 hits) and show
# the hits as a table.
search_result = client.search(
    collection_name="test_collection",
    query_vector=[0.2, 0.1, 0.9, 0.7],
    limit=6
)

# The hit objects are pydantic v2 models; `model_dump()` replaces the
# deprecated `.dict()` and returns the same field dictionary without a warning.
pd.DataFrame([hit.model_dump() for hit in search_result])

Unnamed: 0,id,version,score,payload,vector,shard_key,order_value
0,1,0,1.273,{'city': 'Berlin'},,,
1,3,0,1.208,{'city': 'shenzhen'},,,
2,2,0,0.871,{'city': 'London'},,,
3,5,0,0.572,{'city': 'Beijing'},,,
4,6,0,0.485,{'city': 'Mumbai'},,,


In [11]:


# Same query vector, restricted to points whose payload field "city" matches "London".
search_result = client.search(
    collection_name="test_collection",
    query_vector=[0.2, 0.1, 0.9, 0.7],
    query_filter=Filter(
        must=[FieldCondition(key="city", match=MatchValue(value="London"))]
    ),
    with_payload=True,
    limit=3,
)

print(search_result)

[ScoredPoint(id=2, version=0, score=0.8709999993443489, payload={'city': 'London'}, vector=None, shard_key=None, order_value=None)]


### modify

In [25]:
def modify(client, collection_name, uuid, *args, **kwargs):
    """
    Modify the payload of a single point in a Qdrant collection.

    The point's current payload is fetched, the requested changes are applied
    on top of a copy of it, and the merged payload is written back with
    ``set_payload``.

    Args:
    - client: QdrantClient instance (any object exposing ``retrieve`` and
      ``set_payload`` with the keyword arguments used below works).
    - collection_name: Name of the collection in Qdrant.
    - uuid: ID of the point to modify.
    - *args: ``(key, value)`` pairs to set in the payload.
    - **kwargs: Further key-value pairs to set; applied after ``*args``, so
      they win on a key clash.

    Returns:
    - updated_payload: The merged payload (a new dict) after modification.

    Raises:
    - ValueError: If no point with the given ID exists in the collection.
    """

    # Retrieve the existing point
    existing_points = client.retrieve(collection_name=collection_name, ids=[uuid])

    if not existing_points:
        raise ValueError(f"No vector found with ID {uuid} in collection {collection_name}.")

    # The payload may be None (e.g. a point stored without one), so fall back
    # to an empty dict. Copy it so the retrieved record is not mutated.
    updated_payload = dict(existing_points[0].payload or {})

    # Apply the positional (key, value) pairs first ...
    for key, value in args:
        updated_payload[key] = value

    # ... then the keyword updates, which take precedence.
    updated_payload.update(kwargs)

    # Write the merged payload back to the database
    client.set_payload(
        collection_name=collection_name,
        payload=updated_payload,
        points=[uuid]
    )

    return updated_payload

# Set the payload field `city` of point 3 to 'shenzhen'.
modify(client, "test_collection", uuid=3, city='shenzhen')



{'city': 'shenzhen'}

## langchain

In [27]:
def search_results_to_dataframe(results):
    """
    Convert scored vector-store search results into a pandas DataFrame.

    Args:
    results (iterable): ``(document, score)`` pairs, such as the value returned
        by ``similarity_search_with_score``. Each document must expose
        ``metadata`` (a mapping) and ``page_content``.

    Returns:
    pd.DataFrame: One row per result. Columns are the metadata keys plus
        'content' (the document's page content) and 'score'. A metadata key
        named 'content' or 'score' is overwritten by those values. When there
        are no results the frame is empty but still has the 'content' and
        'score' columns, so downstream column access does not fail.
    """
    rows = [
        {**res.metadata, 'content': res.page_content, 'score': score}
        for res, score in results
    ]

    if not rows:
        # No rows means no inferred columns; provide the fixed ones explicitly.
        return pd.DataFrame(columns=['content', 'score'])

    # Build the frame once, directly from the row dictionaries.
    return pd.DataFrame(rows)

In [4]:

# Fresh in-memory Qdrant instance for the LangChain demo (rebinds `client`).
client = QdrantClient(":memory:")
# Alternative: construct the client with a local storage path instead.
# client = QdrantClient(path="./langchain_qdrant")
# NOTE(review): `size` presumably has to equal the embedding model's output
# length (3072 for text-embedding-3-large) — confirm against the OpenAI docs.
client.create_collection(
    collection_name="demo_collection",
    vectors_config=VectorParams(size=3072, distance=Distance.COSINE),
)

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Bind the collection created above to the embedding model through LangChain's
# vector-store wrapper.
vector_store = QdrantVectorStore(
    client = client,
    collection_name = "demo_collection",
    embedding = embeddings,
    # retrieval_mode = RetrievalMode.DENSE,
)

In [6]:
def create_fack_docs():
    """
    Build the ten sample documents used in the demo, plus one fresh id each.

    Returns:
        tuple: ``(documents, uuids)`` — a list of ``Document`` objects whose
        metadata holds a single "source" key, and a same-length list of
        random UUID4 strings.
    """
    # (page content, source) for every sample, in insertion order.
    samples = (
        ("I had chocalate chip pancakes and scrambled eggs for breakfast this morning.", "tweet"),
        ("The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.", "news"),
        ("Building an exciting new project with LangChain - come check it out!", "tweet"),
        ("Robbers broke into the city bank and stole $1 million in cash.", "news"),
        ("Wow! That was an amazing movie. I can't wait to see it again.", "tweet"),
        ("Is the new iPhone worth the price? Read this review to find out.", "website"),
        ("The top 10 soccer players in the world right now.", "website"),
        ("LangGraph is the best framework for building stateful, agentic applications!", "tweet"),
        ("The stock market is down 500 points today due to fears of a recession.", "news"),
        ("I have a bad feeling I am going to get deleted :(", "tweet"),
    )

    documents = [
        Document(page_content=text, metadata={"source": origin})
        for text, origin in samples
    ]
    # One random id per document, in the same order.
    uuids = [str(uuid4()) for _ in documents]

    return documents, uuids

# Build the sample documents and add them to the vector store under the generated ids.
documents, uuids = create_fack_docs()
vector_store.add_documents(documents=documents, ids=uuids)

['1800a356-d433-488d-a348-6ad47fa429ec',
 '4db8e03b-d0bd-4f33-b8e6-7a195d98e4e4',
 'ecbac5fb-25c0-4fac-908c-b34f207e1809',
 '2664c3b5-46f7-4afc-bb7b-5eda7ead83ea',
 '182d7db3-a542-4580-8617-8f201263e382',
 '021fd64d-1746-45d8-ae7c-a12403c2176e',
 '19aaf6bb-909c-4014-a9bd-f84271f57f8c',
 '1e1b704a-66a5-4121-9868-ed5d14357f6a',
 '70c74784-988c-4dcd-81c0-1829928e270f',
 '6bab51f8-9d57-4af9-87e6-6e96e06d9f26']

In [7]:
# Display the vector store object (exploratory; shows only its repr).
vector_store

<langchain_qdrant.qdrant.QdrantVectorStore at 0x7f7bc9eb0190>

In [9]:
# Delete the last sample document by its id.

vector_store.delete(ids=[uuids[-1]])

True

In [29]:

# Ask the vector store for the top 3 matches to the query, together with their scores.
query = "LangChain provides abstractions to make working with LLMs easy"
results = vector_store.similarity_search_with_score(query=query, k=3)


In [17]:

# Print one line per hit: score (3 decimals), page content, and metadata.
for res, score in results:
    print(f"* {score:.3f}, {res.page_content} [{res.metadata}]")

* 0.478, Building an exciting new project with LangChain - come check it out! [{'source': 'tweet', '_id': 'ecbac5fb-25c0-4fac-908c-b34f207e1809', '_collection_name': 'demo_collection'}]
* 0.429, LangGraph is the best framework for building stateful, agentic applications! [{'source': 'tweet', '_id': '1e1b704a-66a5-4121-9868-ed5d14357f6a', '_collection_name': 'demo_collection'}]
* 0.085, I had chocalate chip pancakes and scrambled eggs for breakfast this morning. [{'source': 'tweet', '_id': '1800a356-d433-488d-a348-6ad47fa429ec', '_collection_name': 'demo_collection'}]


In [31]:
# Show the same hits as a DataFrame.
search_results_to_dataframe(results)

Unnamed: 0,source,_id,_collection_name,content,score
0,tweet,ecbac5fb-25c0-4fac-908c-b34f207e1809,demo_collection,Building an exciting new project with LangChai...,0.47786
1,tweet,1e1b704a-66a5-4121-9868-ed5d14357f6a,demo_collection,LangGraph is the best framework for building s...,0.428878
2,tweet,1800a356-d433-488d-a348-6ad47fa429ec,demo_collection,I had chocalate chip pancakes and scrambled eg...,0.085175


In [33]:
# Insert a single vector together with an initial payload
vector_id = 1
initial_payload = {"category": "text", "content": "Initial content"}
vector = [0.1] * 3072  # example vector

client.upsert(
    collection_name="demo_collection",
    points=[PointStruct(id=vector_id, vector=vector, payload=initial_payload)]
)



UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [34]:
# Modify the payload of the point inserted above
new_payload = {"category": "text", "content": "Updated content"}
client.set_payload(
    collection_name="demo_collection",
    payload=new_payload,
    points=[vector_id]
)


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [35]:

# Retrieve and print the point after the payload update
updated_point = client.retrieve(
    collection_name="demo_collection",
    ids=[vector_id]
)
print(updated_point)

[Record(id=1, payload={'category': 'text', 'content': 'Updated content'}, vector=None, shard_key=None, order_value=None)]


In [41]:
# Fetch the first two sample documents by id, this time including their stored vectors.
updated_point = client.retrieve(
    collection_name="demo_collection",
    ids=uuids[:2],
    with_vectors=True,
)
print(updated_point)

[Record(id='1800a356-d433-488d-a348-6ad47fa429ec', payload={'page_content': 'I had chocalate chip pancakes and scrambled eggs for breakfast this morning.', 'metadata': {'source': 'tweet'}}, vector=[-0.011683207005262375, -0.03141341358423233, -0.026787463575601578, 0.007245524786412716, -0.04501154646277428, 0.018116502091288567, 0.05658717080950737, -0.03283347189426422, 0.030940057709813118, 0.03993375971913338, 0.04544186592102051, 0.0337156280875206, -0.0009453607490286231, 0.018493032082915306, 0.019870057702064514, 0.0072293877601623535, -0.021214811131358147, 0.005241843871772289, -0.012941895052790642, 0.02498011663556099, 0.010080262087285519, -0.03184373304247856, -0.017739972099661827, -0.018331661820411682, 0.024764956906437874, -0.0001771710958564654, -0.006513979285955429, 0.011349708773195744, 0.01959034986793995, -0.0011181614827364683, -0.015609882771968842, 0.006234270986169577, -0.021526793017983437, -0.017514053732156754, 0.017116006463766098, -0.0050078569911420345

In [46]:
# Exploratory: list the attributes available on a retrieved record.
dir(updated_point[0])

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__class_vars__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__fields__',
 '__fields_set__',
 '__format__',
 '__ge__',
 '__get_pydantic_core_schema__',
 '__get_pydantic_json_schema__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__pretty__',
 '__private_attributes__',
 '__pydantic_complete__',
 '__pydantic_core_schema__',
 '__pydantic_custom_init__',
 '__pydantic_decorators__',
 '__pydantic_extra__',
 '__pydantic_fields_set__',
 '__pydantic_generic_metadata__',
 '__pydantic_init_subclass__',
 '__pydantic_parent_namespace__',
 '__pydantic_post_init__',
 '__pydantic_private__',
 '__pydantic_root_model__',
 '__pydantic_serializer__',
 '__pydantic_validator__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__repr_a

In [51]:
# Tabulate the retrieved records. They are pydantic v2 models, so use
# `model_dump()` rather than the deprecated `.dict()` (same fields, no warning).
pd.DataFrame([record.model_dump() for record in updated_point])

Unnamed: 0,id,payload,vector,shard_key,order_value
0,1800a356-d433-488d-a348-6ad47fa429ec,{'page_content': 'I had chocalate chip pancake...,"[-0.011683207005262375, -0.03141341358423233, ...",,
1,4db8e03b-d0bd-4f33-b8e6-7a195d98e4e4,{'page_content': 'The weather forecast for tom...,"[-0.016323087736964226, -0.02772817201912403, ...",,
