diff --git a/python/prollytree/prollytree.pyi b/python/prollytree/prollytree.pyi
index 66c3eab..e8ebeef 100644
--- a/python/prollytree/prollytree.pyi
+++ b/python/prollytree/prollytree.pyi
@@ -565,3 +565,38 @@ class VersionedKvStore:
             True if the proof is valid, False otherwise
         """
         ...
+
+    def get_keys_at_ref(self, reference: str) -> List[Tuple[bytes, bytes]]:
+        """
+        Get key-value pairs at a specific reference (commit, branch, or tag).
+
+        This method provides historical access to the state of the store
+        at any point in its history.
+
+        Note:
+            At most 1024 pairs are returned per call. If the reference holds more,
+            the result is truncated and a warning is printed to stderr.
+
+        Args:
+            reference: A git reference - can be a branch name (e.g., "main", "feature/xyz"),
+                commit hash (full or abbreviated), or tag name. Relative references
+                (e.g., "HEAD~1", "main^") may not be resolvable by the underlying
+                git library; prefer an explicit commit hash.
+
+        Returns:
+            List of (key, value) tuples for the key-value pairs at that reference
+
+        Raises:
+            ValueError: If the reference cannot be resolved or accessed
+
+        Example:
+            # Get the pairs at a specific commit
+            pairs = store.get_keys_at_ref("abc123def")
+
+            # Get the pairs from the main branch
+            pairs = store.get_keys_at_ref("main")
+
+            # Get the pairs at the current HEAD
+            pairs = store.get_keys_at_ref("HEAD")
+        """
+        ...
diff --git a/src/git/versioned_store.rs b/src/git/versioned_store.rs
index 6ad07d8..bf89913 100644
--- a/src/git/versioned_store.rs
+++ b/src/git/versioned_store.rs
@@ -709,6 +709,128 @@ mod proof_tests {
         // Restore original directory
         std::env::set_current_dir(original_dir).expect("Failed to restore directory");
     }
+
+    #[test]
+    fn test_get_keys_at_ref() {
+        // Restores the process working directory when dropped, so a failing
+        // assertion cannot leave later tests running inside a deleted temp dir.
+        struct RestoreDir(std::path::PathBuf);
+
+        impl Drop for RestoreDir {
+            fn drop(&mut self) {
+                // Best effort: panicking here while already unwinding would abort.
+                let _ = std::env::set_current_dir(&self.0);
+            }
+        }
+
+        // Create a temporary directory for the test
+        let temp_dir = TempDir::new().expect("Failed to create temp dir");
+        let repo_path = temp_dir.path();
+
+        // `output()` only errors when git cannot be spawned, so the exit status is
+        // asserted as well.
+        let run_git = |args: &[&str], failure: &str| {
+            let output = std::process::Command::new("git")
+                .args(args)
+                .current_dir(repo_path)
+                .output()
+                .expect(failure);
+            assert!(output.status.success(), "{}", failure);
+        };
+
+        // Initialize git repo and set git config
+        run_git(&["init"], "Failed to initialize git repo");
+        run_git(
+            &["config", "user.name", "Test User"],
+            "Failed to set git user name",
+        );
+        run_git(
+            &["config", "user.email", "test@example.com"],
+            "Failed to set git user email",
+        );
+
+        // Create a subdirectory for the dataset (git-prolly requires this)
+        let dataset_path = repo_path.join("dataset");
+        std::fs::create_dir(&dataset_path).expect("Failed to create dataset directory");
+
+        // Change to the dataset subdirectory; the guard switches back when the test ends,
+        // including when an assertion panics.
+        let _restore_dir =
+            RestoreDir(std::env::current_dir().expect("Failed to get current dir"));
+        std::env::set_current_dir(&dataset_path).expect("Failed to change directory");
+
+        // Initialize the versioned store from the dataset subdirectory
+        let mut store =
+            GitVersionedKvStore::<32>::init(&dataset_path).expect("Failed to initialize store");
+
+        // Add initial data and commit
+        store
+            .insert(b"key1".to_vec(), b"value1".to_vec())
+            .expect("Failed to insert key1");
+        store
+            .insert(b"key2".to_vec(), b"value2".to_vec())
+            .expect("Failed to insert key2");
+        let commit1 = store.commit("Initial commit").expect("Failed to commit");
+
+        // Get keys at HEAD (should have key1 and key2)
+        let keys_at_head = store
+            .get_keys_at_ref("HEAD")
+            .expect("Failed to get keys at HEAD");
+        assert_eq!(keys_at_head.len(), 2);
+        assert_eq!(
+            keys_at_head.get(&b"key1".to_vec()),
+            Some(&b"value1".to_vec())
+        );
+        assert_eq!(
+            keys_at_head.get(&b"key2".to_vec()),
+            Some(&b"value2".to_vec())
+        );
+
+        // Add more data and commit
+        store
+            .insert(b"key3".to_vec(), b"value3".to_vec())
+            .expect("Failed to insert key3");
+        store
+            .update(b"key1".to_vec(), b"updated1".to_vec())
+            .expect("Failed to update key1");
+        let _commit2 = store.commit("Second commit").expect("Failed to commit");
+
+        // Get keys at the first commit
+        let keys_at_commit1 = store
+            .get_keys_at_ref(&commit1.to_hex().to_string())
+            .expect("Failed to get keys at commit1");
+        assert_eq!(keys_at_commit1.len(), 2);
+        assert_eq!(
+            keys_at_commit1.get(&b"key1".to_vec()),
+            Some(&b"value1".to_vec())
+        );
+        assert_eq!(
+            keys_at_commit1.get(&b"key2".to_vec()),
+            Some(&b"value2".to_vec())
+        );
+        assert!(!keys_at_commit1.contains_key(&b"key3".to_vec()));
+
+        // Relative references such as HEAD~1 are deliberately not exercised: that
+        // syntax might not work with the gix library, so the commit hash is used above.
+
+        // Get keys at current HEAD (should have all three keys with updated key1)
+        let keys_at_current_head = store
+            .get_keys_at_ref("HEAD")
+            .expect("Failed to get keys at current HEAD");
+        assert_eq!(keys_at_current_head.len(), 3);
+        assert_eq!(
+            keys_at_current_head.get(&b"key1".to_vec()),
+            Some(&b"updated1".to_vec())
+        );
+        assert_eq!(
+            keys_at_current_head.get(&b"key2".to_vec()),
+            Some(&b"value2".to_vec())
+        );
+        assert_eq!(
+            keys_at_current_head.get(&b"key3".to_vec()),
+            Some(&b"value3".to_vec())
+        );
+    }
 }
 
 // Generic diff functionality
for all storage types
diff --git a/src/python.rs b/src/python.rs
index 07f85d6..5c63d49 100644
--- a/src/python.rs
+++ b/src/python.rs
@@ -22,7 +22,11 @@ use std::sync::{Arc, Mutex};
 use crate::{
     agent::{AgentMemorySystem, MemoryType},
     config::TreeConfig,
-    git::{types::StorageBackend, versioned_store::HistoricalCommitAccess, GitVersionedKvStore},
+    git::{
+        types::StorageBackend,
+        versioned_store::{HistoricalAccess, HistoricalCommitAccess},
+        GitVersionedKvStore,
+    },
     proof::Proof,
     storage::{FileNodeStorage, InMemoryNodeStorage},
     tree::{ProllyTree, Tree},
@@ -33,6 +37,20 @@ use crate::sql::ProllyStorage;
 #[cfg(feature = "sql")]
 use gluesql_core::{data::Value as SqlValue, executor::Payload, prelude::Glue};
 
+/// Maximum number of keys that can be retrieved in a single operation.
+/// Longer listings are truncated to this many entries after a stderr warning.
+const MAX_KEYS_LIMIT: usize = 1024;
+
+/// Prints a warning to stderr when a listing of `total_keys` entries exceeds `MAX_KEYS_LIMIT`.
+fn warn_if_truncated(total_keys: usize) {
+    if total_keys > MAX_KEYS_LIMIT {
+        eprintln!(
+            "Warning: Tree contains {} keys, but only returning first {} keys due to limit. \
+             Consider using more specific queries or implementing pagination.",
+            total_keys, MAX_KEYS_LIMIT
+        );
+    }
+}
+
 #[pyclass(name = "TreeConfig")]
 struct PyTreeConfig {
     base: u64,
@@ -880,8 +898,11 @@ impl PyVersionedKvStore {
         let store = self.inner.lock().unwrap();
         let keys = store.list_keys();
 
+        warn_if_truncated(keys.len());
+
         let py_keys: Vec<Py<PyBytes>> = keys
             .iter()
+            .take(MAX_KEYS_LIMIT)
             .map(|key| PyBytes::new_bound(py, key).into())
             .collect();
 
@@ -1159,6 +1180,40 @@ impl PyVersionedKvStore {
             Ok(store.verify(proof, &key_vec, value_option.as_deref()))
         })
     }
+
+    /// Returns the key-value pairs stored at `reference` (commit, branch, or tag).
+    ///
+    /// At most `MAX_KEYS_LIMIT` pairs are returned, sorted by key. A reference
+    /// that cannot be read is reported to Python as a `ValueError`.
+    fn get_keys_at_ref(
+        &self,
+        py: Python,
+        reference: String,
+    ) -> PyResult<Vec<(Py<PyBytes>, Py<PyBytes>)>> {
+        let store = self.inner.lock().unwrap();
+
+        let keys_map = HistoricalAccess::get_keys_at_ref(&*store, &reference)
+            .map_err(|e| PyValueError::new_err(format!("Failed to get keys at ref: {}", e)))?;
+
+        warn_if_truncated(keys_map.len());
+
+        // Sort before truncating so "first N keys" does not depend on the map's
+        // iteration order (NOTE(review): presumably a HashMap - confirm).
+        let mut pairs: Vec<(Vec<u8>, Vec<u8>)> = keys_map.into_iter().collect();
+        pairs.sort_unstable_by(|a, b| a.0.cmp(&b.0));
+        pairs.truncate(MAX_KEYS_LIMIT);
+
+        let py_pairs: Vec<(Py<PyBytes>, Py<PyBytes>)> = pairs
+            .iter()
+            .map(|(key, value)| {
+                (
+                    PyBytes::new_bound(py, key).into(),
+                    PyBytes::new_bound(py, value).into(),
+                )
+            })
+            .collect();
+
+        Ok(py_pairs)
+    }
 }
 
 #[cfg(feature = "git")]
@@ -1392,7 +1447,10 @@ impl PyWorktreeVersionedKvStore {
 
     fn list_keys(&self) -> PyResult<Vec<Vec<u8>>> {
         let store = self.inner.lock().unwrap();
-        Ok(store.store().list_keys())
+        let keys = store.store().list_keys();
+        warn_if_truncated(keys.len());
+
+        Ok(keys.into_iter().take(MAX_KEYS_LIMIT).collect())
     }
 }
 