Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions python/prollytree/prollytree.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -565,3 +565,33 @@ class VersionedKvStore:
True if the proof is valid, False otherwise
"""
...

def get_keys_at_ref(self, reference: str) -> List[Tuple[bytes, bytes]]:
    """
    Get all key-value pairs at a specific reference (commit, branch, or tag).

    This method provides historical access to the complete state of the store
    at any point in its history. Despite the name, it returns (key, value)
    pairs, not just keys.

    Args:
        reference: A git reference - can be a branch name (e.g., "main", "feature/xyz"),
            commit hash (full or abbreviated), tag name, or relative reference
            (e.g., "HEAD", "HEAD~1", "main^")

    Returns:
        List of (key, value) tuples representing the key-value pairs at that reference

    Raises:
        ValueError: If the reference cannot be resolved or accessed

    Note:
        - The native binding returns at most 1024 pairs per call. When the
          snapshot holds more, a warning is printed to stderr and the result
          is truncated.
        - The order of the returned pairs is not specified here; presumably it
          follows the underlying map's iteration order, so do not rely on it
          (TODO confirm).
        - Relative references such as "HEAD~1" might not be resolvable by the
          underlying git library; verify before relying on them and prefer an
          explicit commit hash.

    Example:
        # Get all keys at a specific commit
        pairs = store.get_keys_at_ref("abc123def")

        # Get all keys from the main branch
        pairs = store.get_keys_at_ref("main")

        # Get all keys from the previous commit
        # (relative syntax may not be supported - see Note)
        pairs = store.get_keys_at_ref("HEAD~1")
    """
    ...
114 changes: 114 additions & 0 deletions src/git/versioned_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -709,6 +709,120 @@ mod proof_tests {
// Restore original directory
std::env::set_current_dir(original_dir).expect("Failed to restore directory");
}

#[test]
fn test_get_keys_at_ref() {
    // Restores the process working directory when dropped.
    //
    // The test must change into the dataset directory. Without this guard a
    // failing assertion would leave the process inside a temp dir that is
    // about to be deleted, which can break unrelated tests that run afterwards.
    struct CwdGuard(std::path::PathBuf);

    impl Drop for CwdGuard {
        fn drop(&mut self) {
            if let Err(err) = std::env::set_current_dir(&self.0) {
                // A second panic while unwinding would abort the whole test binary,
                // so only report the failure when the test is otherwise passing.
                if !std::thread::panicking() {
                    panic!("Failed to restore directory: {}", err);
                }
            }
        }
    }

    // Create a temporary directory for the test
    let temp_dir = TempDir::new().expect("Failed to create temp dir");
    let repo_path = temp_dir.path();

    // Runs `git <args>` in the repository root and fails the test when git
    // cannot be spawned OR exits with a non-zero status (`output()` alone
    // reports success even when the command itself failed).
    let run_git = |args: &[&str], failure: &str| {
        let output = std::process::Command::new("git")
            .args(args)
            .current_dir(repo_path)
            .output()
            .expect(failure);
        assert!(
            output.status.success(),
            "{}: {}",
            failure,
            String::from_utf8_lossy(&output.stderr)
        );
    };

    // Initialize git repo and set the identity used for commits
    run_git(&["init"], "Failed to initialize git repo");
    run_git(
        &["config", "user.name", "Test User"],
        "Failed to set git user name",
    );
    run_git(
        &["config", "user.email", "test@example.com"],
        "Failed to set git user email",
    );

    // Create a subdirectory for the dataset (git-prolly requires this)
    let dataset_path = repo_path.join("dataset");
    std::fs::create_dir(&dataset_path).expect("Failed to create dataset directory");

    // Change to the dataset subdirectory; the guard puts the original
    // directory back on every exit path, including panics.
    let original_dir = std::env::current_dir().expect("Failed to get current dir");
    let cwd_guard = CwdGuard(original_dir);
    std::env::set_current_dir(&dataset_path).expect("Failed to change directory");

    // Initialize the versioned store from the dataset subdirectory
    let mut store =
        GitVersionedKvStore::<32>::init(&dataset_path).expect("Failed to initialize store");

    // Add initial data and commit
    store
        .insert(b"key1".to_vec(), b"value1".to_vec())
        .expect("Failed to insert key1");
    store
        .insert(b"key2".to_vec(), b"value2".to_vec())
        .expect("Failed to insert key2");
    let commit1 = store.commit("Initial commit").expect("Failed to commit");

    // Get keys at HEAD (should have key1 and key2)
    let keys_at_head = store
        .get_keys_at_ref("HEAD")
        .expect("Failed to get keys at HEAD");
    assert_eq!(keys_at_head.len(), 2);
    assert_eq!(
        keys_at_head.get(&b"key1".to_vec()),
        Some(&b"value1".to_vec())
    );
    assert_eq!(
        keys_at_head.get(&b"key2".to_vec()),
        Some(&b"value2".to_vec())
    );

    // Add more data and commit
    store
        .insert(b"key3".to_vec(), b"value3".to_vec())
        .expect("Failed to insert key3");
    store
        .update(b"key1".to_vec(), b"updated1".to_vec())
        .expect("Failed to update key1");
    let _commit2 = store.commit("Second commit").expect("Failed to commit");

    // Get keys at the first commit: the later insert/update must not leak in
    let keys_at_commit1 = store
        .get_keys_at_ref(&commit1.to_hex().to_string())
        .expect("Failed to get keys at commit1");
    assert_eq!(keys_at_commit1.len(), 2);
    assert_eq!(
        keys_at_commit1.get(&b"key1".to_vec()),
        Some(&b"value1".to_vec())
    );
    assert_eq!(
        keys_at_commit1.get(&b"key2".to_vec()),
        Some(&b"value2".to_vec())
    );
    assert!(!keys_at_commit1.contains_key(&b"key3".to_vec()));

    // Relative references such as HEAD~1 are deliberately not exercised here:
    // that syntax might not work with the gix library, so the explicit commit
    // hash above covers the "previous commit" case instead.

    // Get keys at current HEAD (should have all three keys with updated key1)
    let keys_at_current_head = store
        .get_keys_at_ref("HEAD")
        .expect("Failed to get keys at current HEAD");
    assert_eq!(keys_at_current_head.len(), 3);
    assert_eq!(
        keys_at_current_head.get(&b"key1".to_vec()),
        Some(&b"updated1".to_vec())
    );
    assert_eq!(
        keys_at_current_head.get(&b"key2".to_vec()),
        Some(&b"value2".to_vec())
    );
    assert_eq!(
        keys_at_current_head.get(&b"key3".to_vec()),
        Some(&b"value3".to_vec())
    );

    // Restore original directory explicitly on the success path, before the
    // store and the temp dir are dropped.
    drop(cwd_guard);
}
}

// Generic diff functionality for all storage types
Expand Down
65 changes: 63 additions & 2 deletions src/python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@ use std::sync::{Arc, Mutex};
use crate::{
agent::{AgentMemorySystem, MemoryType},
config::TreeConfig,
git::{types::StorageBackend, versioned_store::HistoricalCommitAccess, GitVersionedKvStore},
git::{
types::StorageBackend,
versioned_store::{HistoricalAccess, HistoricalCommitAccess},
GitVersionedKvStore,
},
proof::Proof,
storage::{FileNodeStorage, InMemoryNodeStorage},
tree::{ProllyTree, Tree},
Expand All @@ -33,6 +37,9 @@ use crate::sql::ProllyStorage;
#[cfg(feature = "sql")]
use gluesql_core::{data::Value as SqlValue, executor::Payload, prelude::Glue};

// Maximum number of keys that can be retrieved in a single operation.
// The Python-facing listing methods in this file (`list_keys`,
// `get_keys_at_ref`) truncate larger results to this many entries and print a
// warning to stderr instead of returning an error.
const MAX_KEYS_LIMIT: usize = 1024;

#[pyclass(name = "TreeConfig")]
struct PyTreeConfig {
base: u64,
Expand Down Expand Up @@ -880,8 +887,18 @@ impl PyVersionedKvStore {
let store = self.inner.lock().unwrap();
let keys = store.list_keys();

let total_keys = keys.len();
if total_keys > MAX_KEYS_LIMIT {
eprintln!(
"Warning: Tree contains {} keys, but only returning first {} keys due to limit. \
Consider using more specific queries or implementing pagination.",
total_keys, MAX_KEYS_LIMIT
);
}

let py_keys: Vec<Py<PyBytes>> = keys
.iter()
.take(MAX_KEYS_LIMIT)
.map(|key| PyBytes::new_bound(py, key).into())
.collect();

Expand Down Expand Up @@ -1159,6 +1176,39 @@ impl PyVersionedKvStore {
Ok(store.verify(proof, &key_vec, value_option.as_deref()))
})
}

// Returns the key-value pairs stored at `reference` as Python `bytes` tuples.
//
// The reference string is resolved through `HistoricalAccess::get_keys_at_ref`;
// any failure is surfaced to Python as a `ValueError`. At most
// `MAX_KEYS_LIMIT` pairs are returned; when the snapshot holds more, a warning
// is written to stderr and the remainder is dropped.
fn get_keys_at_ref(
    &self,
    py: Python,
    reference: String,
) -> PyResult<Vec<(Py<PyBytes>, Py<PyBytes>)>> {
    let guard = self.inner.lock().unwrap();

    let snapshot = match HistoricalAccess::get_keys_at_ref(&*guard, &reference) {
        Ok(entries) => entries,
        Err(e) => {
            return Err(PyValueError::new_err(format!(
                "Failed to get keys at ref: {}",
                e
            )));
        }
    };

    // Warn (but do not fail) when the result has to be truncated.
    let available = snapshot.len();
    if available > MAX_KEYS_LIMIT {
        eprintln!(
            "Warning: Tree contains {} keys, but only returning first {} keys due to limit. \
            Consider using more specific queries or implementing pagination.",
            available, MAX_KEYS_LIMIT
        );
    }

    // Convert each retained entry into a pair of Python `bytes` objects.
    let mut converted: Vec<(Py<PyBytes>, Py<PyBytes>)> = Vec::new();
    for entry in snapshot.into_iter().take(MAX_KEYS_LIMIT) {
        let (raw_key, raw_value): (Vec<u8>, Vec<u8>) = entry;
        let py_key = PyBytes::new_bound(py, &raw_key).into();
        let py_value = PyBytes::new_bound(py, &raw_value).into();
        converted.push((py_key, py_value));
    }

    Ok(converted)
}
}

#[cfg(feature = "git")]
Expand Down Expand Up @@ -1392,7 +1442,18 @@ impl PyWorktreeVersionedKvStore {

// Lists the keys of the wrapped worktree store, capped at `MAX_KEYS_LIMIT`.
//
// When the store holds more keys than the cap, a warning is written to stderr
// and only the first `MAX_KEYS_LIMIT` keys produced by the underlying
// `list_keys()` call are returned. Panics if locking `inner` fails (the lock
// result is unwrapped).
fn list_keys(&self) -> PyResult<Vec<Vec<u8>>> {
    let store = self.inner.lock().unwrap();
    // The stale `Ok(store.store().list_keys())` line from before the limit was
    // introduced is removed: it preceded the statements below, which is not
    // valid Rust and would have bypassed the limit.
    let keys = store.store().list_keys();

    let total_keys = keys.len();
    if total_keys > MAX_KEYS_LIMIT {
        eprintln!(
            "Warning: Tree contains {} keys, but only returning first {} keys due to limit. \
            Consider using more specific queries or implementing pagination.",
            total_keys, MAX_KEYS_LIMIT
        );
    }

    Ok(keys.into_iter().take(MAX_KEYS_LIMIT).collect())
}
}

Expand Down