Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 33 additions & 33 deletions docs/catalog/s3vectors.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ You can also manually create the foreign table like below if you did not use `im
```sql
create foreign table s3_vectors.embeddings (
key text not null,
data embd not null,
data s3vec not null,
metadata jsonb
)
server s3_vectors_server
Expand All @@ -160,51 +160,51 @@ create foreign table s3_vectors.embeddings (
```
### Custom Data Types

#### embd
#### s3vec

The `embd` type is a custom PostgreSQL data type designed to store and work with high-dimensional vectors for machine learning and AI applications.
The `s3vec` type is a custom PostgreSQL data type designed to store and work with high-dimensional vectors for machine learning and AI applications.

**Structure:**

The `embd` type internally contains:
The `s3vec` type internally contains:

- Vector data as an array of 32-bit floating point numbers (Float32)
- Additional metadata fields for internal use

**Input Formats:**

The `embd` type accepts input in JSON array format:
The `s3vec` type accepts input in JSON array format:

```sql
-- Simple array format (most common)
'[0.1, 0.2, 0.3, 0.4, 0.5]'::embd
'[0.1, 0.2, 0.3, 0.4, 0.5]'::s3vec

-- Full JSON object format (advanced)
'{"data": [0.1, 0.2, 0.3], "key": "vector_001"}'::embd
'{"data": [0.1, 0.2, 0.3], "key": "vector_001"}'::s3vec
```

**Output Format:**

When displayed, the `embd` type shows a summary format:
When displayed, the `s3vec` type shows a summary format:

```
embd:5 -- indicates an embedding with 5 dimensions
s3vec:5 -- indicates an embedding with 5 dimensions
```

**Usage Examples:**

See the following sections for complete examples:

- [Inserting Vectors](#inserting-vectors) - Examples of inserting data with `embd` type
- [Inserting Vectors](#inserting-vectors) - Examples of inserting data with `s3vec` type
- [Querying Vectors](#querying-vectors) - Basic queries and vector similarity search
- [Vector Similarity Search with Filtering](#vector-similarity-search-with-filtering) - Advanced search with metadata filtering
- [Advanced Example: Semantic Search](#advanced-example-semantic-search) - Complete semantic search implementation

**Operations:**

- **Vector similarity search**: Use the `<==>` operator for approximate nearest neighbor search
- **Distance calculation**: Use `embd_distance()` function to get similarity scores
- **Type casting**: Convert JSON arrays to `embd` type using `::embd` cast
- **Distance calculation**: Use `s3vec_distance()` function to get similarity scores
- **Type casting**: Convert JSON arrays to `s3vec` type using `::s3vec` cast

**Constraints:**

Expand All @@ -214,19 +214,19 @@ See the following sections for complete examples:

### Functions

#### embd_distance(embd)
#### s3vec_distance(s3vec)

Returns the distance score from the most recent vector similarity search operation.

**Syntax:**

```sql
embd_distance(vector_data) -> real
s3vec_distance(vector_data) -> real
```

**Parameters:**

- `vector_data` - An `embd` type column containing vector data
- `vector_data` - An `s3vec` type column containing vector data

**Returns:**

Expand All @@ -236,9 +236,9 @@ embd_distance(vector_data) -> real

```sql
-- Get similarity search results with distance scores
select embd_distance(data) as distance, key, metadata
select s3vec_distance(data) as distance, key, metadata
from s3_vectors.embeddings
where data <==> '[0.1, 0.2, 0.3, 0.4, 0.5]'::embd
where data <==> '[0.1, 0.2, 0.3, 0.4, 0.5]'::s3vec
order by 1
limit 5;
```
Expand Down Expand Up @@ -312,18 +312,18 @@ For exact key lookups:

3. **Vector similarity search**:
```sql
select embd_distance(data) as distance, *
select s3vec_distance(data) as distance, *
from s3_vectors.embeddings
where data <==> '[0.1, 0.2, 0.3, ...]'::embd
where data <==> '[0.1, 0.2, 0.3, ...]'::s3vec
order by 1
limit 10;
```

4. **Vector search with metadata filtering**:
```sql
select embd_distance(data) as distance, *
select s3vec_distance(data) as distance, *
from s3_vectors.embeddings
where data <==> '[0.1, 0.2, 0.3, ...]'::embd
where data <==> '[0.1, 0.2, 0.3, ...]'::s3vec
and metadata <==> '{"category": "product"}'::jsonb
order by 1
limit 5;
Expand All @@ -338,7 +338,7 @@ For exact key lookups:
| Postgres Type | S3 Vectors Type |
| ---------------- | -------------------------------------- |
| text | String (for vector key) |
| embd | Float32 vector data |
| s3vec | Float32 vector data |
| jsonb | Document metadata |

## Limitations
Expand Down Expand Up @@ -380,7 +380,7 @@ import foreign schema s3_vectors
-- or, create the foreign table manually
create foreign table if not exists s3_vectors.embeddings (
key text not null,
data embd not null,
data s3vec not null,
metadata jsonb
)
server s3_vectors_server
Expand All @@ -401,9 +401,9 @@ select * from s3_vectors.embeddings;
select * from s3_vectors.embeddings where key = 'product_001';

-- Vector similarity search (top 5 similar vectors)
select embd_distance(data) as distance, key, metadata
select s3vec_distance(data) as distance, key, metadata
from s3_vectors.embeddings
where data <==> '[0.1, 0.2, 0.3, 0.4, 0.5]'::embd
where data <==> '[0.1, 0.2, 0.3, 0.4, 0.5]'::s3vec
order by 1
limit 5;
```
Expand All @@ -415,15 +415,15 @@ limit 5;
insert into s3_vectors.embeddings (key, data, metadata)
values (
'product_001',
'[0.1, 0.2, 0.3, 0.4, 0.5]'::embd,
'[0.1, 0.2, 0.3, 0.4, 0.5]'::s3vec,
'{"category": "electronics", "price": 299.99}'::jsonb
);

-- Insert multiple vectors
insert into s3_vectors.embeddings (key, data, metadata)
values
('product_002', '[0.2, 0.3, 0.4, 0.5, 0.6]'::embd, '{"category": "books"}'::jsonb),
('product_003', '[0.3, 0.4, 0.5, 0.6, 0.7]'::embd, '{"category": "clothing"}'::jsonb);
('product_002', '[0.2, 0.3, 0.4, 0.5, 0.6]'::s3vec, '{"category": "books"}'::jsonb),
('product_003', '[0.3, 0.4, 0.5, 0.6, 0.7]'::s3vec, '{"category": "clothing"}'::jsonb);
```

### Deleting Vectors
Expand All @@ -440,9 +440,9 @@ delete from s3_vectors.embeddings;

```sql
-- Find similar vectors with metadata filtering
select embd_distance(data) as distance, key, metadata
select s3vec_distance(data) as distance, key, metadata
from s3_vectors.embeddings
where data <==> '[0.1, 0.2, 0.3, 0.4, 0.5]'::embd
where data <==> '[0.1, 0.2, 0.3, 0.4, 0.5]'::s3vec
and metadata <==> '{"category": "electronics"}'::jsonb
order by 1
limit 3;
Expand All @@ -454,15 +454,15 @@ limit 3;
-- Create a function to convert text to embeddings (pseudo-code)
-- This would typically use an external embedding service
create or replace function text_to_embedding(input_text text)
returns embd
returns s3vec
language sql
as $$
-- This is a placeholder - you would implement actual text embedding logic
select '[0.1, 0.2, 0.3, 0.4, 0.5]'::embd;
select '[0.1, 0.2, 0.3, 0.4, 0.5]'::s3vec;
$$;

-- Semantic search example
select embd_distance(data) as distance, key, metadata
select s3vec_distance(data) as distance, key, metadata
from s3_vectors.embeddings
where data <==> text_to_embedding('Find similar products')
and metadata <==> '{"status": "active"}'::jsonb
Expand Down
1 change: 1 addition & 0 deletions wrappers/src/fdw/s3vectors_fdw/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ This is a foreign data wrapper for [AWS S3 Vectors](https://aws.amazon.com/s3/fe

| Version | Date | Notes |
| ------- | ---------- | ---------------------------------------------------- |
| 0.1.1 | 2025-11-17 | Changed 'embd' type name to 's3vec' |
| 0.1.0 | 2025-09-14 | Initial version |
5 changes: 4 additions & 1 deletion wrappers/src/fdw/s3vectors_fdw/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#![allow(clippy::module_inception)]
mod conv;
mod embd;
mod s3vec;
mod s3vectors_fdw;
mod tests;

Expand All @@ -23,6 +23,9 @@ enum S3VectorsFdwError {
#[error("query filter is not supported, check S3 Vectors wrapper documents for more details")]
QueryNotSupported,

#[error("invalid s3vec value: {0}")]
InvalidS3Vec(String),

#[error("invalid insert value {0}")]
InvalidInsertValue(String),

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use super::conv::document_to_json_value;
use super::S3VectorsFdwError;
use aws_sdk_s3vectors::types::{GetOutputVector, ListOutputVector, QueryOutputVector, VectorData};
use pgrx::{pg_sys::bytea, prelude::*, stringinfo::StringInfo, JsonB};
use serde::{Deserialize, Serialize};
Expand All @@ -7,38 +8,38 @@ use std::ffi::CStr;

/// Custom Postgres type carrying one S3 Vectors record.
///
/// Diff view: the removed `Embd` declaration line is immediately followed by
/// its replacement `S3Vec`; the field list is shared by both.
#[derive(Debug, Default, PostgresType, Serialize, Deserialize)]
#[inoutfuncs]
pub(super) struct Embd {
pub(super) struct S3Vec {
// Vector key; left at its default (empty string) when the value is built
// from a bare JSON array in `InOutFuncs::input`.
pub key: String,
// Vector components as 32-bit floats; `output` prints only its length.
pub data: Vec<f32>,
// Optional JSON metadata attached to the vector.
pub metadata: Option<JsonValue>,
// Distance score returned by the `*_distance` SQL function.
// NOTE(review): presumably populated from similarity-query results — the
// code that sets it is outside this view; confirm.
pub distance: f32,
}

// Text input/output functions for the custom type.
// Diff view: each removed `embd` line is followed by its `s3vec` replacement.
impl InOutFuncs for Embd {
const NULL_ERROR_MESSAGE: Option<&'static str> = Some("cannot insert NULL to embd column");
impl InOutFuncs for S3Vec {
const NULL_ERROR_MESSAGE: Option<&'static str> = Some("cannot insert NULL to s3vec column");

// Parses the textual form of the type from a JSON string.
//
// Accepts either a JSON array (taken as `data`, all other fields default) or
// any other JSON value, which is deserialized as the whole struct.
//
// Panics (via `expect`) when the text is not valid JSON or does not match
// the expected shape.
fn input(input: &CStr) -> Self {
// Input that is not valid UTF-8 falls back to "" here, so it is reported
// by the JSON-parse `expect` below rather than as an encoding error.
let value: JsonValue = serde_json::from_str(input.to_str().unwrap_or_default())
.expect("embd input should be a valid JSON string");
.expect("s3vec input should be a valid JSON string");

if value.is_array() {
// Bare array form: only `data` is set; key/metadata/distance keep their defaults.
Self {
data: serde_json::from_value(value).expect("embd data should be a float32 array"),
data: serde_json::from_value(value).expect("s3vec data should be a float32 array"),
..Default::default()
}
} else {
// Object form: deserialized through the derived `Deserialize` impl.
// NOTE(review): no `#[serde(default)]` is visible on the struct, so an
// object presumably must supply `key`, `data` and `distance` — confirm
// against the documented `{"data": [...], "key": "..."}` example.
let ret: Self =
serde_json::from_value(value).expect("embd should be in valid JSON format");
serde_json::from_value(value).expect("s3vec should be in valid JSON format");
ret
}
}

// Writes a summary of the form `<type name>:<number of elements in data>`;
// the vector values themselves are not printed.
fn output(&self, buffer: &mut StringInfo) {
buffer.push_str(&format!("embd:{}", self.data.len()));
buffer.push_str(&format!("s3vec:{}", self.data.len()));
}
}

impl From<&ListOutputVector> for Embd {
impl From<&ListOutputVector> for S3Vec {
fn from(v: &ListOutputVector) -> Self {
let data = if let Some(VectorData::Float32(vector_data)) = &v.data {
vector_data.clone()
Expand All @@ -56,7 +57,7 @@ impl From<&ListOutputVector> for Embd {
}
}

impl From<&GetOutputVector> for Embd {
impl From<&GetOutputVector> for S3Vec {
fn from(v: &GetOutputVector) -> Self {
let data = if let Some(VectorData::Float32(vector_data)) = &v.data {
vector_data.clone()
Expand All @@ -74,7 +75,7 @@ impl From<&GetOutputVector> for Embd {
}
}

impl From<&QueryOutputVector> for Embd {
impl From<&QueryOutputVector> for S3Vec {
fn from(v: &QueryOutputVector) -> Self {
let data = if let Some(VectorData::Float32(vector_data)) = &v.data {
vector_data.clone()
Expand All @@ -92,16 +93,23 @@ impl From<&QueryOutputVector> for Embd {
}
}

// Conversion from a raw Postgres `bytea` pointer into the vector type.
// Diff view: the removed infallible `From` header lines are followed by the
// `TryFrom` replacement, which rejects null pointers with `InvalidS3Vec`
// instead of decoding them.
impl From<*mut bytea> for Embd {
fn from(v: *mut bytea) -> Self {
impl TryFrom<*mut bytea> for S3Vec {
type Error = S3VectorsFdwError;

// Returns `Err(S3VectorsFdwError::InvalidS3Vec)` when `v` is null; otherwise
// decodes the pointed-to value with `pgrx::datum::cbor_decode`.
fn try_from(v: *mut bytea) -> Result<Self, Self::Error> {
if v.is_null() {
return Err(S3VectorsFdwError::InvalidS3Vec(
"input bytea pointer is null".to_string(),
));
}
// SAFETY: `v` was checked to be non-null above.
// NOTE(review): this assumes `v` points to a valid varlena holding a
// CBOR-encoded value of this type — not verified here; confirm at call
// sites. Presumably `cbor_decode` panics on a malformed payload rather
// than returning an error — confirm.
let ret: Self = unsafe { pgrx::datum::cbor_decode(v) };
ret
Ok(ret)
}
}

// Declares the `<==>` operator between two vector values.
// Both operands are ignored and the function is a constant-true placeholder.
// Diff view: the removed `embd_knn` signature is followed by `s3vec_knn`.
#[pg_operator(immutable, parallel_safe)]
#[opname(<==>)]
fn embd_knn(_left: Embd, _right: Embd) -> bool {
fn s3vec_knn(_left: S3Vec, _right: S3Vec) -> bool {
// always return true here, actual calculation will be done in the wrapper
true
}
Expand All @@ -114,6 +122,6 @@ fn metadata_filter(_left: JsonB, _right: JsonB) -> bool {
}

// SQL-callable accessor that returns the `distance` field stored in the given
// vector value; it performs no distance computation itself.
// Diff view: the removed `embd_distance` lines are followed by `s3vec_distance`.
#[pg_extern]
fn embd_distance(embd: Embd) -> f32 {
embd.distance
fn s3vec_distance(s3vec: S3Vec) -> f32 {
s3vec.distance
}
Loading
Loading