From 4a63ae2da2616a6b2fe54e5fa9ca7b21587a6572 Mon Sep 17 00:00:00 2001 From: Matvey Arye Date: Wed, 24 Jul 2024 15:02:57 -0400 Subject: [PATCH] Fix library to work with new names The index was renamed from Timescale Vector to diskann within vectorscale. --- README.md | 135 +++++++++------ nbs/00_vector.ipynb | 117 +++++++------ nbs/01_pgvectorizer.ipynb | 2 +- nbs/index.ipynb | 159 ++++++++++++------ nbs/tsv_python_getting_started_tutorial.ipynb | 12 +- timescale_vector/_modidx.py | 19 +-- timescale_vector/client.py | 51 +++--- 7 files changed, 307 insertions(+), 188 deletions(-) diff --git a/README.md b/README.md index bb019c7..5f04e04 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,5 @@ # Timescale Vector - PostgreSQL++ for AI Applications. @@ -111,7 +110,7 @@ vec.upsert([\ You can now create a vector index to speed up similarity search: ``` python -vec.create_embedding_index(client.TimescaleVectorIndex()) +vec.create_embedding_index(client.DiskAnnIndex()) ``` Now, you can query for similar items: @@ -120,12 +119,12 @@ Now, you can query for similar items: vec.search([1.0, 9.0]) ``` - [[UUID('45ecb666-0f15-11ef-8d89-e666703872d0'), + [[UUID('4494c186-4a0d-11ef-94a3-6ee10b77fd09'), {'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), 0.00016793422934946456], - [UUID('45ecb350-0f15-11ef-8d89-e666703872d0'), + [UUID('4494c12c-4a0d-11ef-94a3-6ee10b77fd09'), {'animal': 'fox'}, 'the brown fox', array([1. , 1.3], dtype=float32), @@ -141,7 +140,7 @@ constrained by a metadata filter. vec.search([1.0, 9.0], limit=1, filter={"action": "jump"}) ``` - [[UUID('45ecb666-0f15-11ef-8d89-e666703872d0'), + [[UUID('4494c186-4a0d-11ef-94a3-6ee10b77fd09'), {'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. 
, 10.8], dtype=float32), @@ -165,7 +164,7 @@ records = vec.search([1.0, 9.0], limit=1, filter={"action": "jump"}) (records[0]["id"],records[0]["metadata"], records[0]["contents"], records[0]["embedding"], records[0]["distance"]) ``` - (UUID('45ecb666-0f15-11ef-8d89-e666703872d0'), + (UUID('4494c186-4a0d-11ef-94a3-6ee10b77fd09'), {'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), @@ -228,12 +227,12 @@ The basic query looks like: vec.search([1.0, 9.0]) ``` - [[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'), + [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), 0.00016793422934946456], - [UUID('4d629a50-0f15-11ef-8d89-e666703872d0'), + [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'), {'times': 1, 'action': 'sit', 'animal': 'fox'}, 'the brown fox', array([1. , 1.3], dtype=float32), @@ -245,7 +244,7 @@ You could provide a limit for the number of items returned: vec.search([1.0, 9.0], limit=1) ``` - [[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'), + [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), @@ -270,7 +269,7 @@ unconstrained): vec.search([1.0, 9.0], limit=1, filter={"action": "sit"}) ``` - [[UUID('4d629a50-0f15-11ef-8d89-e666703872d0'), + [[UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'), {'times': 1, 'action': 'sit', 'animal': 'fox'}, 'the brown fox', array([1. , 1.3], dtype=float32), @@ -283,12 +282,12 @@ returned if it matches any dict: vec.search([1.0, 9.0], limit=2, filter=[{"action": "jump"}, {"animal": "fox"}]) ``` - [[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'), + [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. 
, 10.8], dtype=float32), 0.00016793422934946456], - [UUID('4d629a50-0f15-11ef-8d89-e666703872d0'), + [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'), {'times': 1, 'action': 'sit', 'animal': 'fox'}, 'the brown fox', array([1. , 1.3], dtype=float32), @@ -303,7 +302,7 @@ could use greater than and less than conditions on numeric values. vec.search([1.0, 9.0], limit=2, predicates=client.Predicates("times", ">", 1)) ``` - [[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'), + [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), @@ -327,7 +326,7 @@ use the right type. Supported Python types are: `str`, `int`, and vec.search([1.0, 9.0], limit=2, predicates=client.Predicates("action", "==", "jump")) ``` - [[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'), + [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), @@ -341,7 +340,7 @@ combining using OR semantic). So you can do: vec.search([1.0, 9.0], limit=2, predicates=client.Predicates("action", "==", "jump") & client.Predicates("times", ">", 1)) ``` - [[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'), + [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), @@ -364,7 +363,7 @@ my_predicates = client.Predicates("action", "==", "jump") & (client.Predicates(" vec.search([1.0, 9.0], limit=2, predicates=my_predicates) ``` - [[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'), + [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), @@ -378,7 +377,7 @@ semantics. 
You can pass in multiple 3-tuples to vec.search([1.0, 9.0], limit=2, predicates=client.Predicates(("action", "==", "jump"), ("times", ">", 10))) ``` - [[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'), + [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), @@ -410,7 +409,7 @@ Then, you can filter using the timestamps by specifing a tpvec.search([1.0, 9.0], limit=4, uuid_time_filter=client.UUIDTimeRange(specific_datetime, specific_datetime+timedelta(days=1))) ``` - [[UUID('95899000-ef1d-11e7-990e-7d2f7e013038'), + [[UUID('33c52800-ef15-11e7-8a12-ea51d07b6447'), {'times': 1, 'action': 'sit', 'animal': 'fox'}, 'the brown fox', array([1. , 1.3], dtype=float32), @@ -426,12 +425,12 @@ unconstrained. tpvec.search([1.0, 9.0], limit=4, uuid_time_filter=client.UUIDTimeRange(start_date=specific_datetime)) ``` - [[UUID('0e505000-0def-11e9-8732-a154fea6fb50'), + [[UUID('ac8be800-0de6-11e9-a5fd-5a100e653c25'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), 0.00016793422934946456], - [UUID('95899000-ef1d-11e7-990e-7d2f7e013038'), + [UUID('33c52800-ef15-11e7-8a12-ea51d07b6447'), {'times': 1, 'action': 'sit', 'animal': 'fox'}, 'the brown fox', array([1. , 1.3], dtype=float32), @@ -448,7 +447,7 @@ One example: tpvec.search([1.0, 9.0], limit=4, uuid_time_filter=client.UUIDTimeRange(start_date=specific_datetime, start_inclusive=False)) ``` - [[UUID('0e505000-0def-11e9-8732-a154fea6fb50'), + [[UUID('ac8be800-0de6-11e9-a5fd-5a100e653c25'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), @@ -470,7 +469,7 @@ filters and `__uuid_timestamp` for predicates. 
Some examples below: tpvec.search([1.0, 9.0], limit=4, filter={ "__start_date": specific_datetime, "__end_date": specific_datetime+timedelta(days=1)}) ``` - [[UUID('95899000-ef1d-11e7-990e-7d2f7e013038'), + [[UUID('33c52800-ef15-11e7-8a12-ea51d07b6447'), {'times': 1, 'action': 'sit', 'animal': 'fox'}, 'the brown fox', array([1. , 1.3], dtype=float32), @@ -478,10 +477,10 @@ tpvec.search([1.0, 9.0], limit=4, filter={ "__start_date": specific_datetime, "_ ``` python tpvec.search([1.0, 9.0], limit=4, - predicates=client.Predicates("__uuid_timestamp", ">", specific_datetime) & client.Predicates("__uuid_timestamp", "<", specific_datetime+timedelta(days=1))) + predicates=client.Predicates("__uuid_timestamp", ">=", specific_datetime) & client.Predicates("__uuid_timestamp", "<", specific_datetime+timedelta(days=1))) ``` - [[UUID('95899000-ef1d-11e7-990e-7d2f7e013038'), + [[UUID('33c52800-ef15-11e7-8a12-ea51d07b6447'), {'times': 1, 'action': 'sit', 'animal': 'fox'}, 'the brown fox', array([1. , 1.3], dtype=float32), @@ -508,7 +507,7 @@ the trade-offs between these algorithms: | Algorithm | Build speed | Query speed | Need to rebuild after updates | |------------------|-------------|-------------|-------------------------------| -| timescale vector | Slow | Fastest | No | +| StreamingDiskANN | Fast | Fastest | No | | pgvector hnsw | Slowest | Faster | No | | pgvector ivfflat | Fastest | Slowest | Yes | @@ -520,7 +519,7 @@ We recommend using the Timescale Vector index for most use cases. This can be created with: ``` python -vec.create_embedding_index(client.TimescaleVectorIndex()) +vec.create_embedding_index(client.DiskAnnIndex()) ``` Indexes are created for a particular distance metric type. So it is @@ -534,18 +533,18 @@ query-time option for controlling accuracy during a particular query. We have smart defaults for all of these options but will also describe the details below so that you can adjust these options manually. 
-#### Timescale Vector index +#### StreamingDiskANN index -The Timescale Vector index is a graph-based algorithm that uses the -[DiskANN](https://github.com/microsoft/DiskANN) algorithm. You can read -more about it on our -[blog](https://www.timescale.com/blog/how-we-made-postgresql-the-best-vector-database/) +The StreamingDiskANN index from pgvectorscale is a graph-based algorithm +that uses the [DiskANN](https://github.com/microsoft/DiskANN) algorithm. +You can read more about it on our +[blog](https://www.timescale.com/blog/how-we-made-postgresql-as-fast-as-pinecone-for-vector-data/) announcing its release. To create this index, run: ``` python -vec.create_embedding_index(client.TimescaleVectorIndex()) +vec.create_embedding_index(client.DiskAnnIndex()) ``` The above command will create the index using smart defaults. There are @@ -554,31 +553,47 @@ trade-off. The parameters you can set at index build time are: -| Parameter name | Description | Default value | -|------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------| -| num_neighbors | Sets the maximum number of neighbors per node. Higher values increase accuracy but make the graph traversal slower. | 50 | -| search_list_size | This is the S parameter used in the greedy search algorithm used during construction. Higher values improve graph quality at the cost of slower index builds. | 100 | -| max_alpha | Is the alpha parameter in the algorithm. Higher values improve graph quality at the cost of slower index builds. 
| 1.0 | +| Parameter name | Description | Default value | +|--------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------| +| `storage_layout` | `memory_optimized` which uses SBQ to compress vector data or `plain` which stores data uncompressed | memory_optimized | +| `num_neighbors` | Sets the maximum number of neighbors per node. Higher values increase accuracy but make the graph traversal slower. | 50 | +| `search_list_size` | This is the S parameter used in the greedy search algorithm used during construction. Higher values improve graph quality at the cost of slower index builds. | 100 | +| `max_alpha` | Is the alpha parameter in the algorithm. Higher values improve graph quality at the cost of slower index builds. | 1.2 | +| `num_dimensions` | The number of dimensions to index. By default, all dimensions are indexed. But you can also index less dimensions to make use of [Matryoshka embeddings](https://huggingface.co/blog/matryoshka) | 0 (all dimensions) | +| `num_bits_per_dimension` | Number of bits used to encode each dimension when using SBQ | 2 for less than 900 dimensions, 1 otherwise | To set these parameters, you could run: ``` python -vec.create_embedding_index(client.TimescaleVectorIndex(num_neighbors=50, search_list_size=100, max_alpha=1.0)) +vec.create_embedding_index(client.DiskAnnIndex(num_neighbors=50, search_list_size=100, max_alpha=1.0, storage_layout="memory_optimized", num_dimensions=0, num_bits_per_dimension=1)) ``` You can also set a parameter to control the accuracy vs. query speed trade-off at query time. The parameter is set in the `search()` function -using the `query_params` argment. You can set the -`search_list_size`(default: 100). This is the number of additional -candidates considered during the graph search at query time. 
Higher -values improve query accuracy while making the query slower. +using the `query_params` argument. -You can specify this value during search as follows: +| Parameter name | Description | Default value | +|--------------------|-------------------------------------------------------------------------|---------------| +| `search_list_size` | The number of additional candidates considered during the graph search. | 100 | +| `rescore` | The number of elements rescored (0 to disable rescoring) | 50 | + +We suggest using the `rescore` parameter to fine-tune accuracy. ``` python -# vec.search([1.0, 9.0], limit=4, query_params=TimescaleVectorIndexParams(search_list_size=10)) +vec.search([1.0, 9.0], limit=4, query_params=client.DiskAnnIndexParams(rescore=400, search_list_size=10)) ``` + [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), + {'times': 100, 'action': 'jump', 'animal': 'fox'}, + 'jumped over the', + array([ 1. , 10.8], dtype=float32), + 0.00016793422934946456], + [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'), + {'times': 1, 'action': 'sit', 'animal': 'fox'}, + 'the brown fox', + array([1. , 1.3], dtype=float32), + 0.14489260377438218]] + To drop the index, run: ``` python @@ -623,9 +638,20 @@ the query slower. You can specify this value during search as follows: ``` python -# vec.search([1.0, 9.0], limit=4, query_params=HNSWIndexParams(ef_search=10)) +vec.search([1.0, 9.0], limit=4, query_params=client.HNSWIndexParams(ef_search=10)) ``` + [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), + {'times': 100, 'action': 'jump', 'animal': 'fox'}, + 'jumped over the', + array([ 1. , 10.8], dtype=float32), + 0.00016793422934946456], + [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'), + {'times': 1, 'action': 'sit', 'animal': 'fox'}, + 'the brown fox', + array([1. , 1.3], dtype=float32), + 0.14489260377438218]] + To drop the index run: ``` python @@ -679,9 +705,20 @@ improve query accuracy while making the query slower. 
You can specify this value during search as follows: ``` python -# vec.search([1.0, 9.0], limit=4, query_params=IvfflatIndexParams(probes=10)) +vec.search([1.0, 9.0], limit=4, query_params=client.IvfflatIndexParams(probes=10)) ``` + [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), + {'times': 100, 'action': 'jump', 'animal': 'fox'}, + 'jumped over the', + array([ 1. , 10.8], dtype=float32), + 0.00016793422934946456], + [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'), + {'times': 1, 'action': 'sit', 'animal': 'fox'}, + 'the brown fox', + array([1. , 1.3], dtype=float32), + 0.14489260377438218]] + To drop the index, run: ``` python @@ -851,7 +888,7 @@ from langchain.docstore.document import Document from langchain.text_splitter import CharacterTextSplitter from timescale_vector import client, pgvectorizer from langchain_openai import OpenAIEmbeddings -from langchain.vectorstores.timescalevector import TimescaleVector +from langchain_community.vectorstores.timescalevector import TimescaleVector from datetime import timedelta ``` @@ -963,8 +1000,8 @@ res = vector_store.similarity_search_with_score("Blogs about cats") res ``` - [(Document(page_content='Author Matvey Arye, title: First Post, contents:some super interesting content about cats.', metadata={'id': '4a784000-4bc4-11eb-979c-e8748f6439f2', 'author': 'Matvey Arye', 'blog_id': 1, 'category': 'AI', 'published_time': '2021-01-01T00:00:00+00:00'}), - 0.12657619616729976)] + [(Document(metadata={'id': '334e4800-4bee-11eb-a52a-57b3c4a96ccb', 'author': 'Matvey Arye', 'blog_id': 1, 'category': 'AI', 'published_time': '2021-01-01T00:00:00-05:00'}, page_content='Author Matvey Arye, title: First Post, contents:some super interesting content about cats.'), + 0.12680577303752072)] ## Development diff --git a/nbs/00_vector.ipynb b/nbs/00_vector.ipynb index c9b7bb1..abbf7be 100644 --- a/nbs/00_vector.ipynb +++ b/nbs/00_vector.ipynb @@ -226,44 +226,48 @@ " return \"CREATE INDEX {index_name} ON {table_name} USING hnsw 
({column_name} {index_method}) {with_clause};\"\\\n", " .format(index_name=index_name_quoted, table_name=table_name_quoted, column_name=column_name_quoted, index_method=index_method, with_clause=with_clause)\n", "\n", - "class TimescaleVectorIndex(BaseIndex):\n", + "class DiskAnnIndex(BaseIndex):\n", " def __init__(self, \n", - " use_pq: Optional[bool] = None, \n", - " num_neighbors: Optional[int] = None, \n", " search_list_size: Optional[int] = None, \n", + " num_neighbors: Optional[int] = None, \n", " max_alpha: Optional[float] = None,\n", - " pq_vector_length: Optional[int] = None,\n", + " storage_layout: Optional[str] = None,\n", + " num_dimensions: Optional[int] = None,\n", + " num_bits_per_dimension: Optional[int] = None,\n", " ) -> None:\n", " \"\"\"\n", " Timescale's vector index.\n", " \"\"\"\n", - " self.use_pq = use_pq\n", - " self.num_neighbors = num_neighbors\n", " self.search_list_size = search_list_size\n", + " self.num_neighbors = num_neighbors\n", " self.max_alpha = max_alpha\n", - " self.pq_vector_length = pq_vector_length\n", + " self.storage_layout = storage_layout\n", + " self.num_dimensions = num_dimensions\n", + " self.num_bits_per_dimension = num_bits_per_dimension\n", "\n", " def create_index_query(self, table_name_quoted:str, column_name_quoted: str, index_name_quoted: str, distance_type: str, num_records_callback: Callable[[], int]) -> str:\n", " if distance_type != \"<=>\":\n", " raise ValueError(f\"Timescale's vector index only supports cosine distance, but distance_type was {distance_type}\")\n", "\n", " with_clauses = []\n", - " if self.use_pq is not None:\n", - " with_clauses.append(f\"use_pq = {self.use_pq}\")\n", - " if self.num_neighbors is not None:\n", - " with_clauses.append(f\"num_neighbors = {self.num_neighbors}\")\n", " if self.search_list_size is not None:\n", " with_clauses.append(f\"search_list_size = {self.search_list_size}\")\n", + " if self.num_neighbors is not None:\n", + " with_clauses.append(f\"num_neighbors = 
{self.num_neighbors}\")\n", " if self.max_alpha is not None:\n", " with_clauses.append(f\"max_alpha = {self.max_alpha}\")\n", - " if self.pq_vector_length is not None:\n", - " with_clauses.append(f\"pq_vector_length = {self.pq_vector_length}\")\n", + " if self.storage_layout is not None:\n", + " with_clauses.append(f\"storage_layout = {self.storage_layout}\")\n", + " if self.num_dimensions is not None:\n", + " with_clauses.append(f\"num_dimensions = {self.num_dimensions}\")\n", + " if self.num_bits_per_dimension is not None:\n", + " with_clauses.append(f\"num_bits_per_dimension = {self.num_bits_per_dimension}\")\n", " \n", " with_clause = \"\"\n", " if len(with_clauses) > 0:\n", " with_clause = \"WITH (\" + \", \".join(with_clauses) + \")\"\n", "\n", - " return \"CREATE INDEX {index_name} ON {table_name} USING tsv ({column_name}) {with_clause};\"\\\n", + " return \"CREATE INDEX {index_name} ON {table_name} USING diskann ({column_name}) {with_clause};\"\\\n", " .format(index_name=index_name_quoted, table_name=table_name_quoted, column_name=column_name_quoted, with_clause=with_clause)\n" ] }, @@ -290,9 +294,14 @@ " def get_statements(self) -> List[str]:\n", " return [\"SET LOCAL \" + key + \" = \" + str(value) for key, value in self.params.items()]\n", "\n", - "class TimescaleVectorIndexParams(QueryParams):\n", - " def __init__(self, search_list_size: int) -> None:\n", - " super().__init__({\"tsv.query_search_list_size\": search_list_size})\n", + "class DiskAnnIndexParams(QueryParams):\n", + " def __init__(self, search_list_size: Optional[int] = None, rescore: Optional[int] = None) -> None:\n", + " params = {}\n", + " if search_list_size is not None:\n", + " params[\"diskann.query_search_list_size\"] = search_list_size\n", + " if rescore is not None:\n", + " params[\"diskann.query_rescore\"] = rescore\n", + " super().__init__(params)\n", "\n", "class IvfflatIndexParams(QueryParams):\n", " def __init__(self, probes: int) -> None:\n", @@ -728,7 +737,7 @@ " )\n", " 
return '''\n", "CREATE EXTENSION IF NOT EXISTS vector;\n", - "CREATE EXTENSION IF NOT EXISTS timescale_vector;\n", + "CREATE EXTENSION IF NOT EXISTS vectorscale;\n", "\n", "\n", "CREATE TABLE IF NOT EXISTS {table_name} (\n", @@ -903,24 +912,24 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L546){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L562){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### QueryBuilder.get_create_query\n", "\n", "> QueryBuilder.get_create_query ()\n", "\n", - "*Generates a query to create the tables, indexes, and extensions needed to store the vector data.*" + "Generates a query to create the tables, indexes, and extensions needed to store the vector data." ], "text/plain": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L546){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L562){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### QueryBuilder.get_create_query\n", "\n", "> QueryBuilder.get_create_query ()\n", "\n", - "*Generates a query to create the tables, indexes, and extensions needed to store the vector data.*" + "Generates a query to create the tables, indexes, and extensions needed to store the vector data." 
] }, "execution_count": null, @@ -1223,24 +1232,24 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L884){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L900){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Async.create_tables\n", "\n", "> Async.create_tables ()\n", "\n", - "*Creates necessary tables.*" + "Creates necessary tables." ], "text/plain": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L884){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L900){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Async.create_tables\n", "\n", "> Async.create_tables ()\n", "\n", - "*Creates necessary tables.*" + "Creates necessary tables." ] }, "execution_count": null, @@ -1262,24 +1271,24 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L884){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L900){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Async.create_tables\n", "\n", "> Async.create_tables ()\n", "\n", - "*Creates necessary tables.*" + "Creates necessary tables." 
], "text/plain": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L884){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L900){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Async.create_tables\n", "\n", "> Async.create_tables ()\n", "\n", - "*Creates necessary tables.*" + "Creates necessary tables." ] }, "execution_count": null, @@ -1301,7 +1310,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L985){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1001){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Async.search\n", "\n", @@ -1311,7 +1320,7 @@ "> uuid_time_filter:Optional[__main__.UUIDTimeRange]=None,\n", "> query_params:Optional[__main__.QueryParams]=None)\n", "\n", - "*Retrieves similar records using a similarity query.*\n", + "Retrieves similar records using a similarity query.\n", "\n", "| | **Type** | **Default** | **Details** |\n", "| -- | -------- | ----------- | ----------- |\n", @@ -1326,7 +1335,7 @@ "text/plain": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L985){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1001){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Async.search\n", "\n", @@ -1336,7 +1345,7 @@ "> uuid_time_filter:Optional[__main__.UUIDTimeRange]=None,\n", "> query_params:Optional[__main__.QueryParams]=None)\n", "\n", - "*Retrieves similar records using a similarity query.*\n", + "Retrieves similar records using a similarity query.\n", 
"\n", "| | **Type** | **Default** | **Details** |\n", "| -- | -------- | ----------- | ----------- |\n", @@ -1424,9 +1433,9 @@ "await vec.drop_embedding_index()\n", "await vec.create_embedding_index(HNSWIndex(20, 125))\n", "await vec.drop_embedding_index()\n", - "await vec.create_embedding_index(TimescaleVectorIndex())\n", + "await vec.create_embedding_index(DiskAnnIndex())\n", "await vec.drop_embedding_index()\n", - "await vec.create_embedding_index(TimescaleVectorIndex(False, 50, 50, 1.5))\n", + "await vec.create_embedding_index(DiskAnnIndex(50, 50, 1.5, \"memory_optimized\", 2, 1))\n", "\n", "rec = await vec.search([1.0, 2.0])\n", "assert len(rec) == 10\n", @@ -1652,9 +1661,9 @@ "assert len(rec) == 0\n", "rec = await vec.search([1.0, 2.0], limit=4, uuid_time_filter=UUIDTimeRange(end_date=specific_datetime+timedelta(seconds=1), time_delta=timedelta(days=7)))\n", "assert len(rec) == 1\n", - "rec = await vec.search([1.0, 2.0], limit=4, query_params=TimescaleVectorIndexParams(10))\n", + "rec = await vec.search([1.0, 2.0], limit=4, query_params=DiskAnnIndexParams(10, 5))\n", "assert len(rec) == 2\n", - "rec = await vec.search([1.0, 2.0], limit=4, query_params=TimescaleVectorIndexParams(100))\n", + "rec = await vec.search([1.0, 2.0], limit=4, query_params=DiskAnnIndexParams(100))\n", "assert len(rec) == 2\n", "await vec.drop_table()\n", "await vec.close()" @@ -2024,24 +2033,24 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1198){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1217){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Sync.create_tables\n", "\n", "> Sync.create_tables ()\n", "\n", - "*Creates necessary tables.*" + "Creates necessary tables." 
], "text/plain": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1198){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1217){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Sync.create_tables\n", "\n", "> Sync.create_tables ()\n", "\n", - "*Creates necessary tables.*" + "Creates necessary tables." ] }, "execution_count": null, @@ -2063,13 +2072,13 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1178){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1197){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Sync.upsert\n", "\n", "> Sync.upsert (records)\n", "\n", - "*Performs upsert operation for multiple records.*\n", + "Performs upsert operation for multiple records.\n", "\n", "| | **Type** | **Details** |\n", "| -- | -------- | ----------- |\n", @@ -2079,13 +2088,13 @@ "text/plain": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1178){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1197){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Sync.upsert\n", "\n", "> Sync.upsert (records)\n", "\n", - "*Performs upsert operation for multiple records.*\n", + "Performs upsert operation for multiple records.\n", "\n", "| | **Type** | **Details** |\n", "| -- | -------- | ----------- |\n", @@ -2112,7 +2121,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1313){target=\"_blank\" style=\"float:right; 
font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1332){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Sync.search\n", "\n", @@ -2122,7 +2131,7 @@ "> uuid_time_filter:Optional[__main__.UUIDTimeRange]=None,\n", "> query_params:Optional[__main__.QueryParams]=None)\n", "\n", - "*Retrieves similar records using a similarity query.*\n", + "Retrieves similar records using a similarity query.\n", "\n", "| | **Type** | **Default** | **Details** |\n", "| -- | -------- | ----------- | ----------- |\n", @@ -2137,7 +2146,7 @@ "text/plain": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1313){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1332){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Sync.search\n", "\n", @@ -2147,7 +2156,7 @@ "> uuid_time_filter:Optional[__main__.UUIDTimeRange]=None,\n", "> query_params:Optional[__main__.QueryParams]=None)\n", "\n", - "*Retrieves similar records using a similarity query.*\n", + "Retrieves similar records using a similarity query.\n", "\n", "| | **Type** | **Default** | **Details** |\n", "| -- | -------- | ----------- | ----------- |\n", @@ -2232,9 +2241,9 @@ "vec.drop_embedding_index()\n", "vec.create_embedding_index(HNSWIndex(20, 125))\n", "vec.drop_embedding_index()\n", - "vec.create_embedding_index(TimescaleVectorIndex())\n", + "vec.create_embedding_index(DiskAnnIndex())\n", "vec.drop_embedding_index()\n", - "vec.create_embedding_index(TimescaleVectorIndex(False, 50, 50, 1.5))\n", + "vec.create_embedding_index(DiskAnnIndex(50, 50, 1.5))\n", "\n", "rec = vec.search([1.0, 2.0])\n", "assert len(rec) == 10\n", @@ -2418,9 +2427,9 @@ "assert len(rec) == 0\n", "rec = vec.search([1.0, 2.0], limit=4, 
uuid_time_filter=UUIDTimeRange(end_date=specific_datetime+timedelta(seconds=1), time_delta=timedelta(days=7)))\n", "assert len(rec) == 1\n", - "rec = vec.search([1.0, 2.0], limit=4, query_params=TimescaleVectorIndexParams(10))\n", + "rec = vec.search([1.0, 2.0], limit=4, query_params=DiskAnnIndexParams(10, 5))\n", "assert len(rec) == 2\n", - "rec = vec.search([1.0, 2.0], limit=4, query_params=TimescaleVectorIndexParams(100))\n", + "rec = vec.search([1.0, 2.0], limit=4, query_params=DiskAnnIndexParams(100, rescore=2))\n", "assert len(rec) == 2\n", "vec.drop_table()\n", "vec.close()" diff --git a/nbs/01_pgvectorizer.ipynb b/nbs/01_pgvectorizer.ipynb index 373f9c6..265ddb1 100644 --- a/nbs/01_pgvectorizer.ipynb +++ b/nbs/01_pgvectorizer.ipynb @@ -226,7 +226,7 @@ "from langchain.text_splitter import CharacterTextSplitter\n", "from timescale_vector import client\n", "from langchain_openai import OpenAIEmbeddings\n", - "from langchain.vectorstores.timescalevector import TimescaleVector\n", + "from langchain_community.vectorstores.timescalevector import TimescaleVector\n", "from datetime import timedelta" ] }, diff --git a/nbs/index.ipynb b/nbs/index.ipynb index dc2d4d1..31e080b 100644 --- a/nbs/index.ipynb +++ b/nbs/index.ipynb @@ -201,7 +201,7 @@ "metadata": {}, "outputs": [], "source": [ - "vec.create_embedding_index(client.TimescaleVectorIndex())" + "vec.create_embedding_index(client.DiskAnnIndex())" ] }, { @@ -220,12 +220,12 @@ { "data": { "text/plain": [ - "[[UUID('45ecb666-0f15-11ef-8d89-e666703872d0'),\n", + "[[UUID('4494c186-4a0d-11ef-94a3-6ee10b77fd09'),\n", " {'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", " 0.00016793422934946456],\n", - " [UUID('45ecb350-0f15-11ef-8d89-e666703872d0'),\n", + " [UUID('4494c12c-4a0d-11ef-94a3-6ee10b77fd09'),\n", " {'animal': 'fox'},\n", " 'the brown fox',\n", " array([1. 
, 1.3], dtype=float32),\n", @@ -259,7 +259,7 @@ { "data": { "text/plain": [ - "[[UUID('45ecb666-0f15-11ef-8d89-e666703872d0'),\n", + "[[UUID('4494c186-4a0d-11ef-94a3-6ee10b77fd09'),\n", " {'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -301,7 +301,7 @@ { "data": { "text/plain": [ - "(UUID('45ecb666-0f15-11ef-8d89-e666703872d0'),\n", + "(UUID('4494c186-4a0d-11ef-94a3-6ee10b77fd09'),\n", " {'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -430,12 +430,12 @@ { "data": { "text/plain": [ - "[[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'),\n", + "[[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", " 0.00016793422934946456],\n", - " [UUID('4d629a50-0f15-11ef-8d89-e666703872d0'),\n", + " [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'),\n", " {'times': 1, 'action': 'sit', 'animal': 'fox'},\n", " 'the brown fox',\n", " array([1. , 1.3], dtype=float32),\n", @@ -467,7 +467,7 @@ { "data": { "text/plain": [ - "[[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'),\n", + "[[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -509,7 +509,7 @@ { "data": { "text/plain": [ - "[[UUID('4d629a50-0f15-11ef-8d89-e666703872d0'),\n", + "[[UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'),\n", " {'times': 1, 'action': 'sit', 'animal': 'fox'},\n", " 'the brown fox',\n", " array([1. , 1.3], dtype=float32),\n", @@ -541,12 +541,12 @@ { "data": { "text/plain": [ - "[[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'),\n", + "[[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. 
, 10.8], dtype=float32),\n", " 0.00016793422934946456],\n", - " [UUID('4d629a50-0f15-11ef-8d89-e666703872d0'),\n", + " [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'),\n", " {'times': 1, 'action': 'sit', 'animal': 'fox'},\n", " 'the brown fox',\n", " array([1. , 1.3], dtype=float32),\n", @@ -580,7 +580,7 @@ { "data": { "text/plain": [ - "[[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'),\n", + "[[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -616,7 +616,7 @@ { "data": { "text/plain": [ - "[[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'),\n", + "[[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -648,7 +648,7 @@ { "data": { "text/plain": [ - "[[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'),\n", + "[[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -708,7 +708,7 @@ { "data": { "text/plain": [ - "[[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'),\n", + "[[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -741,7 +741,7 @@ { "data": { "text/plain": [ - "[[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'),\n", + "[[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -799,7 +799,7 @@ { "data": { "text/plain": [ - "[[UUID('95899000-ef1d-11e7-990e-7d2f7e013038'),\n", + "[[UUID('33c52800-ef15-11e7-8a12-ea51d07b6447'),\n", " {'times': 1, 'action': 'sit', 'animal': 'fox'},\n", " 'the brown fox',\n", " array([1. 
, 1.3], dtype=float32),\n", @@ -831,12 +831,12 @@ { "data": { "text/plain": [ - "[[UUID('0e505000-0def-11e9-8732-a154fea6fb50'),\n", + "[[UUID('ac8be800-0de6-11e9-a5fd-5a100e653c25'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", " 0.00016793422934946456],\n", - " [UUID('95899000-ef1d-11e7-990e-7d2f7e013038'),\n", + " [UUID('33c52800-ef15-11e7-8a12-ea51d07b6447'),\n", " {'times': 1, 'action': 'sit', 'animal': 'fox'},\n", " 'the brown fox',\n", " array([1. , 1.3], dtype=float32),\n", @@ -868,7 +868,7 @@ { "data": { "text/plain": [ - "[[UUID('0e505000-0def-11e9-8732-a154fea6fb50'),\n", + "[[UUID('ac8be800-0de6-11e9-a5fd-5a100e653c25'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -902,7 +902,7 @@ { "data": { "text/plain": [ - "[[UUID('95899000-ef1d-11e7-990e-7d2f7e013038'),\n", + "[[UUID('33c52800-ef15-11e7-8a12-ea51d07b6447'),\n", " {'times': 1, 'action': 'sit', 'animal': 'fox'},\n", " 'the brown fox',\n", " array([1. , 1.3], dtype=float32),\n", @@ -926,7 +926,7 @@ { "data": { "text/plain": [ - "[[UUID('95899000-ef1d-11e7-990e-7d2f7e013038'),\n", + "[[UUID('33c52800-ef15-11e7-8a12-ea51d07b6447'),\n", " {'times': 1, 'action': 'sit', 'animal': 'fox'},\n", " 'the brown fox',\n", " array([1. 
, 1.3], dtype=float32),\n", @@ -940,7 +940,7 @@ ], "source": [ "tpvec.search([1.0, 9.0], limit=4, \n", - " predicates=client.Predicates(\"__uuid_timestamp\", \">\", specific_datetime) & client.Predicates(\"__uuid_timestamp\", \"<\", specific_datetime+timedelta(days=1)))\n", + " predicates=client.Predicates(\"__uuid_timestamp\", \">=\", specific_datetime) & client.Predicates(\"__uuid_timestamp\", \"<\", specific_datetime+timedelta(days=1)))\n", " " ] }, @@ -961,7 +961,7 @@ "\n", "| Algorithm | Build speed | Query speed | Need to rebuild after updates |\n", "|------------------|-------------|-------------|-------------------------------|\n", - "| timescale vector | Slow | Fastest | No |\n", + "| StreamingDiskANN | Fast | Fastest | No |\n", "| pgvector hnsw | Slowest | Faster | No |\n", "| pgvector ivfflat | Fastest | Slowest | Yes |\n", "\n", @@ -978,7 +978,7 @@ "metadata": {}, "outputs": [], "source": [ - "vec.create_embedding_index(client.TimescaleVectorIndex())" + "vec.create_embedding_index(client.DiskAnnIndex())" ] }, { @@ -1006,9 +1006,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Timescale Vector index\n", + "#### StreamingDiskANN index\n", "\n", - "The Timescale Vector index is a graph-based algorithm that uses the [DiskANN](https://github.com/microsoft/DiskANN) algorithm. You can read more about it on our [blog](https://www.timescale.com/blog/how-we-made-postgresql-the-best-vector-database/) announcing its release.\n", + "The StreamingDiskANN index from pgvectorscale is a graph-based algorithm that uses the [DiskANN](https://github.com/microsoft/DiskANN) algorithm. 
You can read more about it on our [blog](https://www.timescale.com/blog/how-we-made-postgresql-as-fast-as-pinecone-for-vector-data/) announcing its release.\n", "\n", "\n", "To create this index, run:" @@ -1020,7 +1020,7 @@ "metadata": {}, "outputs": [], "source": [ - "vec.create_embedding_index(client.TimescaleVectorIndex())" + "vec.create_embedding_index(client.DiskAnnIndex())" ] }, { @@ -1034,9 +1034,13 @@ "\n", "| Parameter name | Description | Default value |\n", "|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|\n", - "| num_neighbors | Sets the maximum number of neighbors per node. Higher values increase accuracy but make the graph traversal slower. | 50 |\n", - "| search_list_size | This is the S parameter used in the greedy search algorithm used during construction. Higher values improve graph quality at the cost of slower index builds. | 100 |\n", - "| max_alpha | Is the alpha parameter in the algorithm. Higher values improve graph quality at the cost of slower index builds. | 1.0 |\n", + "| `storage_layout` | `memory_optimized` which uses SBQ to compress vector data or `plain` which stores data uncompressed | memory_optimized\n", + "| `num_neighbors` | Sets the maximum number of neighbors per node. Higher values increase accuracy but make the graph traversal slower. | 50 |\n", + "| `search_list_size` | This is the S parameter used in the greedy search algorithm used during construction. Higher values improve graph quality at the cost of slower index builds. | 100 |\n", + "| `max_alpha` | Is the alpha parameter in the algorithm. Higher values improve graph quality at the cost of slower index builds. | 1.2 |\n", + "| `num_dimensions` | The number of dimensions to index. By default, all dimensions are indexed. 
But you can also index fewer dimensions to make use of [Matryoshka embeddings](https://huggingface.co/blog/matryoshka) | 0 (all dimensions)\n", + "| `num_bits_per_dimension` | Number of bits used to encode each dimension when using SBQ | 2 for fewer than 900 dimensions, 1 otherwise\n", + "\n", "\n", "To set these parameters, you could run:" ] @@ -1057,7 +1061,7 @@ "metadata": {}, "outputs": [], "source": [ - "vec.create_embedding_index(client.TimescaleVectorIndex(num_neighbors=50, search_list_size=100, max_alpha=1.0))" + "vec.create_embedding_index(client.DiskAnnIndex(num_neighbors=50, search_list_size=100, max_alpha=1.0, storage_layout=\"memory_optimized\", num_dimensions=0, num_bits_per_dimension=1))" ] }, { @@ -1065,18 +1069,43 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can also set a parameter to control the accuracy vs. query speed trade-off at query time. The parameter is set in the `search()` function using the `query_params` argment. You can set the `search_list_size`(default: 100). This is the number of additional candidates considered during the graph search at query time. Higher values improve query accuracy while making the query slower.\n", + "You can also set a parameter to control the accuracy vs. query speed trade-off at query time. The parameter is set in the `search()` function using the `query_params` argument. \n", "\n", - "You can specify this value during search as follows:" + "| Parameter name | Description | Default value |\n", + "|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|\n", + "| `search_list_size` | The number of additional candidates considered during the graph search. | 100\n", + "| `rescore` | The number of elements rescored (0 to disable rescoring) | 50\n", + "\n", + "We suggest using the `rescore` parameter to fine-tune accuracy."
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'),\n", + " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", + " 'jumped over the',\n", + " array([ 1. , 10.8], dtype=float32),\n", + " 0.00016793422934946456],\n", + " [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'),\n", + " {'times': 1, 'action': 'sit', 'animal': 'fox'},\n", + " 'the brown fox',\n", + " array([1. , 1.3], dtype=float32),\n", + " 0.14489260377438218]]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# vec.search([1.0, 9.0], limit=4, query_params=TimescaleVectorIndexParams(search_list_size=10))" + "vec.search([1.0, 9.0], limit=4, query_params=client.DiskAnnIndexParams(rescore=400, search_list_size=10))" ] }, { @@ -1167,9 +1196,29 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'),\n", + " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", + " 'jumped over the',\n", + " array([ 1. , 10.8], dtype=float32),\n", + " 0.00016793422934946456],\n", + " [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'),\n", + " {'times': 1, 'action': 'sit', 'animal': 'fox'},\n", + " 'the brown fox',\n", + " array([1. 
, 1.3], dtype=float32),\n", + " 0.14489260377438218]]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# vec.search([1.0, 9.0], limit=4, query_params=HNSWIndexParams(ef_search=10))" + "vec.search([1.0, 9.0], limit=4, query_params=client.HNSWIndexParams(ef_search=10))" ] }, { @@ -1280,9 +1329,29 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'),\n", + " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", + " 'jumped over the',\n", + " array([ 1. , 10.8], dtype=float32),\n", + " 0.00016793422934946456],\n", + " [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'),\n", + " {'times': 1, 'action': 'sit', 'animal': 'fox'},\n", + " 'the brown fox',\n", + " array([1. , 1.3], dtype=float32),\n", + " 0.14489260377438218]]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# vec.search([1.0, 9.0], limit=4, query_params=IvfflatIndexParams(probes=10))" + "vec.search([1.0, 9.0], limit=4, query_params=client.IvfflatIndexParams(probes=10))" ] }, { @@ -1482,7 +1551,7 @@ "from langchain.text_splitter import CharacterTextSplitter\n", "from timescale_vector import client, pgvectorizer\n", "from langchain_openai import OpenAIEmbeddings\n", - "from langchain.vectorstores.timescalevector import TimescaleVector\n", + "from langchain_community.vectorstores.timescalevector import TimescaleVector\n", "from datetime import timedelta" ] }, @@ -1639,8 +1708,8 @@ { "data": { "text/plain": [ - "[(Document(page_content='Author Matvey Arye, title: First Post, contents:some super interesting content about cats.', metadata={'id': '4a784000-4bc4-11eb-979c-e8748f6439f2', 'author': 'Matvey Arye', 'blog_id': 1, 'category': 'AI', 'published_time': '2021-01-01T00:00:00+00:00'}),\n", - " 0.12657619616729976)]" + "[(Document(metadata={'id': 
'334e4800-4bee-11eb-a52a-57b3c4a96ccb', 'author': 'Matvey Arye', 'blog_id': 1, 'category': 'AI', 'published_time': '2021-01-01T00:00:00-05:00'}, page_content='Author Matvey Arye, title: First Post, contents:some super interesting content about cats.'),\n", + " 0.12680577303752072)]" ] }, "execution_count": null, @@ -1677,10 +1746,6 @@ "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.10.11" } }, "nbformat": 4, diff --git a/nbs/tsv_python_getting_started_tutorial.ipynb b/nbs/tsv_python_getting_started_tutorial.ipynb index 4a0d121..04cfd02 100644 --- a/nbs/tsv_python_getting_started_tutorial.ipynb +++ b/nbs/tsv_python_getting_started_tutorial.ipynb @@ -13,7 +13,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This notebook shows how to use the PostgreSQL vector database `Timescale Vector` via the [Timescale Vector python client library](https://github.com/timescale/python-vector). You'll learn how to use TimescaleVector for (1) semantic search, (2) time-based vector search, (3) and how to create indexes to speed up queries.\n", + "This notebook shows how to use PostgreSQL as a vector database via the [Python Vector client library](https://github.com/timescale/python-vector). You'll learn how to use the client for (1) semantic search, (2) time-based vector search, and (3) how to create indexes to speed up queries.\n", "\n", "Follow along by downloading the [Jupyter notebook version of this tutorial here](https://github.com/timescale/python-vector/blob/main/nbs/tsv_python_getting_started_tutorial.ipynb).\n", "\n", @@ -914,7 +914,7 @@ "\n", "Important note: In PostgreSQL, each table can only have one index on a particular column.
So if you'd like to test the performance of different index types, you can do so either by (1) creating multiple tables with different indexes, (2) creating multiple vector columns in the same table and creating different indexes on each column, or (3) by dropping and recreating the index on the same column and comparing results.\n", "\n", - "Let's look at how to create each type of index in Timescale Vector, starting with the TimescaleVector (DiskANN) index." + "Let's look at how to create each type of index, starting with the StreamingDiskANN index." ] }, { @@ -924,7 +924,7 @@ "outputs": [], "source": [ "# Create a timescale vector (DiskANN) search index on the embedding column\n", - "await vec.create_embedding_index(client.TimescaleVectorIndex())" + "await vec.create_embedding_index(client.DiskAnnIndex())" ] }, { @@ -974,7 +974,7 @@ "outputs": [], "source": [ "await vec.drop_embedding_index()\n", - "await vec.create_embedding_index(client.TimescaleVectorIndex())" + "await vec.create_embedding_index(client.DiskAnnIndex())" ] }, { @@ -993,13 +993,13 @@ "- Finding the most recent embeddings that are similar to a query vector (e.g recent news).\n", "- Constraining similarity search to a relevant time range (e.g asking time-based questions about a knowledge base)\n", "\n", - "Let's look at how to run similarity searches with time range filters using the TimescaleVector client.\n", + "Let's look at how to run similarity searches with time range filters using the client.\n", "\n", "- The first step to using time filtering with Timescale Vector is to create a table with the `time_partition_interval` argument set to the desired time interval. This will automatically partition the table into time-based chunks to speed up queries. We completed this step in Part 1 above.\n", "\n", "- Next, we ensure the `id` of our row is a `uuid` with a datetime portion that reflects the date and time we want to associated with the embedding. 
We completed this step in Part 2 above, where we used the `uuid_from_time()` method provided by the Timescale Vector library.\n", "\n", - "- Finally, we can run similarity searches with time range filters using the TimescaleVector client. We'll illustrate this below." + "- Finally, we can run similarity searches with time range filters using the client. We'll illustrate this below." ] }, { diff --git a/timescale_vector/_modidx.py b/timescale_vector/_modidx.py index 6fa0877..9f639cf 100644 --- a/timescale_vector/_modidx.py +++ b/timescale_vector/_modidx.py @@ -42,6 +42,15 @@ 'timescale_vector/client.py'), 'timescale_vector.client.BaseIndex.get_index_method': ( 'vector.html#baseindex.get_index_method', 'timescale_vector/client.py'), + 'timescale_vector.client.DiskAnnIndex': ('vector.html#diskannindex', 'timescale_vector/client.py'), + 'timescale_vector.client.DiskAnnIndex.__init__': ( 'vector.html#diskannindex.__init__', + 'timescale_vector/client.py'), + 'timescale_vector.client.DiskAnnIndex.create_index_query': ( 'vector.html#diskannindex.create_index_query', + 'timescale_vector/client.py'), + 'timescale_vector.client.DiskAnnIndexParams': ( 'vector.html#diskannindexparams', + 'timescale_vector/client.py'), + 'timescale_vector.client.DiskAnnIndexParams.__init__': ( 'vector.html#diskannindexparams.__init__', + 'timescale_vector/client.py'), 'timescale_vector.client.HNSWIndex': ('vector.html#hnswindex', 'timescale_vector/client.py'), 'timescale_vector.client.HNSWIndex.__init__': ( 'vector.html#hnswindex.__init__', 'timescale_vector/client.py'), @@ -152,16 +161,6 @@ 'timescale_vector.client.Sync.table_is_empty': ( 'vector.html#sync.table_is_empty', 'timescale_vector/client.py'), 'timescale_vector.client.Sync.upsert': ('vector.html#sync.upsert', 'timescale_vector/client.py'), - 'timescale_vector.client.TimescaleVectorIndex': ( 'vector.html#timescalevectorindex', - 'timescale_vector/client.py'), - 'timescale_vector.client.TimescaleVectorIndex.__init__': ( 
'vector.html#timescalevectorindex.__init__', - 'timescale_vector/client.py'), - 'timescale_vector.client.TimescaleVectorIndex.create_index_query': ( 'vector.html#timescalevectorindex.create_index_query', - 'timescale_vector/client.py'), - 'timescale_vector.client.TimescaleVectorIndexParams': ( 'vector.html#timescalevectorindexparams', - 'timescale_vector/client.py'), - 'timescale_vector.client.TimescaleVectorIndexParams.__init__': ( 'vector.html#timescalevectorindexparams.__init__', - 'timescale_vector/client.py'), 'timescale_vector.client.UUIDTimeRange': ( 'vector.html#uuidtimerange', 'timescale_vector/client.py'), 'timescale_vector.client.UUIDTimeRange.__init__': ( 'vector.html#uuidtimerange.__init__', diff --git a/timescale_vector/client.py b/timescale_vector/client.py index ccab029..ad6c87b 100644 --- a/timescale_vector/client.py +++ b/timescale_vector/client.py @@ -2,9 +2,9 @@ # %% auto 0 __all__ = ['SEARCH_RESULT_ID_IDX', 'SEARCH_RESULT_METADATA_IDX', 'SEARCH_RESULT_CONTENTS_IDX', 'SEARCH_RESULT_EMBEDDING_IDX', - 'SEARCH_RESULT_DISTANCE_IDX', 'uuid_from_time', 'BaseIndex', 'IvfflatIndex', 'HNSWIndex', - 'TimescaleVectorIndex', 'QueryParams', 'TimescaleVectorIndexParams', 'IvfflatIndexParams', 'HNSWIndexParams', - 'UUIDTimeRange', 'Predicates', 'QueryBuilder', 'Async', 'Sync'] + 'SEARCH_RESULT_DISTANCE_IDX', 'uuid_from_time', 'BaseIndex', 'IvfflatIndex', 'HNSWIndex', 'DiskAnnIndex', + 'QueryParams', 'DiskAnnIndexParams', 'IvfflatIndexParams', 'HNSWIndexParams', 'UUIDTimeRange', 'Predicates', + 'QueryBuilder', 'Async', 'Sync'] # %% ../nbs/00_vector.ipynb 5 import asyncpg @@ -153,44 +153,48 @@ def create_index_query(self, table_name_quoted:str, column_name_quoted: str, ind return "CREATE INDEX {index_name} ON {table_name} USING hnsw ({column_name} {index_method}) {with_clause};"\ .format(index_name=index_name_quoted, table_name=table_name_quoted, column_name=column_name_quoted, index_method=index_method, with_clause=with_clause) -class 
TimescaleVectorIndex(BaseIndex): +class DiskAnnIndex(BaseIndex): def __init__(self, - use_pq: Optional[bool] = None, - num_neighbors: Optional[int] = None, search_list_size: Optional[int] = None, + num_neighbors: Optional[int] = None, max_alpha: Optional[float] = None, - pq_vector_length: Optional[int] = None, + storage_layout: Optional[str] = None, + num_dimensions: Optional[int] = None, + num_bits_per_dimension: Optional[int] = None, ) -> None: """ Timescale's vector index. """ - self.use_pq = use_pq - self.num_neighbors = num_neighbors self.search_list_size = search_list_size + self.num_neighbors = num_neighbors self.max_alpha = max_alpha - self.pq_vector_length = pq_vector_length + self.storage_layout = storage_layout + self.num_dimensions = num_dimensions + self.num_bits_per_dimension = num_bits_per_dimension def create_index_query(self, table_name_quoted:str, column_name_quoted: str, index_name_quoted: str, distance_type: str, num_records_callback: Callable[[], int]) -> str: if distance_type != "<=>": raise ValueError(f"Timescale's vector index only supports cosine distance, but distance_type was {distance_type}") with_clauses = [] - if self.use_pq is not None: - with_clauses.append(f"use_pq = {self.use_pq}") - if self.num_neighbors is not None: - with_clauses.append(f"num_neighbors = {self.num_neighbors}") if self.search_list_size is not None: with_clauses.append(f"search_list_size = {self.search_list_size}") + if self.num_neighbors is not None: + with_clauses.append(f"num_neighbors = {self.num_neighbors}") if self.max_alpha is not None: with_clauses.append(f"max_alpha = {self.max_alpha}") - if self.pq_vector_length is not None: - with_clauses.append(f"pq_vector_length = {self.pq_vector_length}") + if self.storage_layout is not None: + with_clauses.append(f"storage_layout = {self.storage_layout}") + if self.num_dimensions is not None: + with_clauses.append(f"num_dimensions = {self.num_dimensions}") + if self.num_bits_per_dimension is not None: + 
with_clauses.append(f"num_bits_per_dimension = {self.num_bits_per_dimension}") with_clause = "" if len(with_clauses) > 0: with_clause = "WITH (" + ", ".join(with_clauses) + ")" - return "CREATE INDEX {index_name} ON {table_name} USING tsv ({column_name}) {with_clause};"\ + return "CREATE INDEX {index_name} ON {table_name} USING diskann ({column_name}) {with_clause};"\ .format(index_name=index_name_quoted, table_name=table_name_quoted, column_name=column_name_quoted, with_clause=with_clause) @@ -202,9 +206,14 @@ def __init__(self, params: dict[str, Any]) -> None: def get_statements(self) -> List[str]: return ["SET LOCAL " + key + " = " + str(value) for key, value in self.params.items()] -class TimescaleVectorIndexParams(QueryParams): - def __init__(self, search_list_size: int) -> None: - super().__init__({"tsv.query_search_list_size": search_list_size}) +class DiskAnnIndexParams(QueryParams): + def __init__(self, search_list_size: Optional[int] = None, rescore: Optional[int] = None) -> None: + params = {} + if search_list_size is not None: + params["diskann.query_search_list_size"] = search_list_size + if rescore is not None: + params["diskann.query_rescore"] = rescore + super().__init__(params) class IvfflatIndexParams(QueryParams): def __init__(self, probes: int) -> None: @@ -602,7 +611,7 @@ def get_create_query(self): ) return ''' CREATE EXTENSION IF NOT EXISTS vector; -CREATE EXTENSION IF NOT EXISTS timescale_vector; +CREATE EXTENSION IF NOT EXISTS vectorscale; CREATE TABLE IF NOT EXISTS {table_name} (