feat: unify settings for vector and nodestore connections to PostgreS…

…QL (#1730) * Unify pgvector and postgres connection settings * Remove local changes * Update file pgvector->postgres
zylon-ai · Mar 15, 2024 · 63de7e4 · 63de7e4
1 parent 68b3a34
commit 63de7e4
Show file tree

Hide file tree

Showing 5 changed files with 39 additions and 45 deletions.
diff --git a/fern/docs/pages/manual/vectordb.mdx b/fern/docs/pages/manual/vectordb.mdx
@@ -1,7 +1,7 @@
 ## Vectorstores
 PrivateGPT supports [Qdrant](https://qdrant.tech/), [Chroma](https://www.trychroma.com/) and [PGVector](https://github.com/pgvector/pgvector) as vectorstore providers. Qdrant being the default.
 
-In order to select one or the other, set the `vectorstore.database` property in the `settings.yaml` file to `qdrant`, `chroma` or `pgvector`.
+In order to select one or the other, set the `vectorstore.database` property in the `settings.yaml` file to `qdrant`, `chroma` or `postgres`.
 
 ```yaml
 vectorstore:
@@ -50,14 +50,15 @@ poetry install --extras chroma
 By default `chroma` will use a disk-based database stored in local_data_path / "chroma_db" (being local_data_path defined in settings.yaml)
 
 ### PGVector
+To use the PGVector store, a [PostgreSQL](https://www.postgresql.org/) database with the PGVector extension installed is required.
 
-To enable PGVector, set the `vectorstore.database` property in the `settings.yaml` file to `pgvector` and install the `vector-stores-postgres` extra.
+To enable PGVector, set the `vectorstore.database` property in the `settings.yaml` file to `postgres` and install the `vector-stores-postgres` extra.
 
 ```bash
 poetry install --extras vector-stores-postgres
 ```
 
-PGVector settings can be configured by setting values to the `pgvector` property in the `settings.yaml` file.
+PGVector settings can be configured by setting values to the `postgres` property in the `settings.yaml` file.
 
 The available configuration options are:
 | Field         | Description                                               |
@@ -67,19 +68,36 @@ The available configuration options are:
 | **database**  | The specific database to connect to. Default is `postgres` |
 | **user**      | The username for database access. Default is `postgres` |
 | **password**  | The password for database access. (Required)            |
-| **embed_dim** | The dimensionality of the embedding model (Required)    |
 | **schema_name** | The database schema to use. Default is `private_gpt`       |
-| **table_name** | The database table to use. Default is `embeddings`    |
 
 For example:
 ```yaml
-pgvector:
+vectorstore:
+  database: postgres
+
+postgres:
   host: localhost
   port: 5432
   database: postgres
   user: postgres
   password: <PASSWORD>
-  embed_dim: 384 # 384 is for BAAI/bge-small-en-v1.5
   schema_name: private_gpt
-  table_name: embeddings
 ```
+
+The following table will be created in the database:
+```
+postgres=# \d private_gpt.data_embeddings
+                                      Table "private_gpt.data_embeddings"
+  Column   |       Type        | Collation | Nullable |                         Default
+-----------+-------------------+-----------+----------+---------------------------------------------------------
+ id        | bigint            |           | not null | nextval('private_gpt.data_embeddings_id_seq'::regclass)
+ text      | character varying |           | not null |
+ metadata_ | json              |           |          |
+ node_id   | character varying |           |          |
+ embedding | vector(768)       |           |          |
+Indexes:
+    "data_embeddings_pkey" PRIMARY KEY, btree (id)
+
+postgres=# 
+```
+The dimension of the `embedding` column is set from the `embedding.embed_dim` value. If the embedding model changes, this table may need to be dropped and recreated to avoid a dimension mismatch.
diff --git a/private_gpt/components/vector_store/vector_store_component.py b/private_gpt/components/vector_store/vector_store_component.py
@@ -38,7 +38,7 @@ class VectorStoreComponent:
     def __init__(self, settings: Settings) -> None:
         self.settings = settings
         match settings.vectorstore.database:
-            case "pgvector":
+            case "postgres":
                 try:
                     from llama_index.vector_stores.postgres import (  # type: ignore
                         PGVectorStore,
@@ -48,15 +48,17 @@ def __init__(self, settings: Settings) -> None:
                         "Postgres dependencies not found, install with `poetry install --extras vector-stores-postgres`"
                     ) from e
 
-                if settings.pgvector is None:
+                if settings.postgres is None:
                     raise ValueError(
-                        "PGVectorStore settings not found. Please provide settings."
+                        "Postgres settings not found. Please provide settings."
                     )
 
                 self.vector_store = typing.cast(
                     VectorStore,
                     PGVectorStore.from_params(
-                        **settings.pgvector.model_dump(exclude_none=True)
+                        **settings.postgres.model_dump(exclude_none=True),
+                        table_name="embeddings",
+                        embed_dim=settings.embedding.embed_dim,
                     ),
                 )
 

diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
@@ -105,7 +105,7 @@ class LLMSettings(BaseModel):
 
 
 class VectorstoreSettings(BaseModel):
-    database: Literal["chroma", "qdrant", "pgvector"]
+    database: Literal["chroma", "qdrant", "postgres"]
 
 
 class NodeStoreSettings(BaseModel):
@@ -177,6 +177,10 @@ class EmbeddingSettings(BaseModel):
             "Do not set it higher than your number of threads of your CPU."
         ),
     )
+    embed_dim: int = Field(
+        384,
+        description="The dimension of the embeddings stored in the Postgres database",
+    )
 
 
 class SagemakerSettings(BaseModel):
@@ -280,17 +284,6 @@ class PostgresSettings(BaseModel):
     )
 
 
-class PGVectorSettings(PostgresSettings):
-    embed_dim: int = Field(
-        384,
-        description="The dimension of the embeddings stored in the Postgres database",
-    )
-    table_name: str = Field(
-        "embeddings",
-        description="The name of the table in the Postgres database where the embeddings are stored",
-    )
-
-
 class QdrantSettings(BaseModel):
     location: str | None = Field(
         None,
@@ -360,7 +353,6 @@ class Settings(BaseModel):
     nodestore: NodeStoreSettings
     qdrant: QdrantSettings | None = None
     postgres: PostgresSettings | None = None
-    pgvector: PGVectorSettings | None = None
 
 
 """

diff --git a/settings-ollama-pg.yaml b/settings-ollama-pg.yaml
@@ -11,6 +11,7 @@ llm:
 
 embedding:
   mode: ollama
+  embed_dim: 768
 
 ollama:
   llm_model: mistral
@@ -21,17 +22,7 @@ nodestore:
   database: postgres
 
 vectorstore:
-  database: pgvector
-
-pgvector:
-  host: localhost
-  port: 5432
   database: postgres
-  user: postgres
-  password: admin
-  embed_dim: 768
-  schema_name: private_gpt
-  table_name: embeddings
 
 postgres:
   host: localhost

diff --git a/settings.yaml b/settings.yaml
@@ -55,6 +55,7 @@ embedding:
   # Should be matching the value above in most cases
   mode: huggingface
   ingest_mode: simple
+  embed_dim: 384 # 384 is for BAAI/bge-small-en-v1.5
 
 huggingface:
   embedding_hf_model_name: BAAI/bge-small-en-v1.5
@@ -68,16 +69,6 @@ nodestore:
 qdrant:
   path: local_data/private_gpt/qdrant
 
-pgvector:
-  host: localhost
-  port: 5432
-  database: postgres
-  user: postgres
-  password: postgres
-  embed_dim: 384 # 384 is for BAAI/bge-small-en-v1.5
-  schema_name: private_gpt
-  table_name: embeddings
-
 postgres:
   host: localhost
   port: 5432