From 631a3a23334064361d6ff39402bb3b4010663d67 Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Wed, 16 Jul 2025 14:09:20 -0700 Subject: [PATCH 1/9] Fix the staging issue with git-prolly Problem Identified The staging area (HashMap<Vec<u8>, Option<Vec<u8>>>) was being created fresh on each command invocation, causing staged changes to be lost between commands. Solution Implemented 1. Added staging area persistence: Created save_staging_area() and load_staging_area() methods that serialize/deserialize the staging area to/from .git/PROLLY_STAGING file 2. Fixed HEAD reference updates: Implemented proper update_head() method that writes branch references and HEAD file 3. Updated all staging operations: Modified insert(), update(), delete(), and checkout() to persist staging area changes 4. Fixed commit flow: Ensured staging area is cleared after successful commits --- src/git/versioned_store.rs | 75 +++++++++++++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 10 deletions(-) diff --git a/src/git/versioned_store.rs b/src/git/versioned_store.rs index 43a516e..b1f55bf 100644 --- a/src/git/versioned_store.rs +++ b/src/git/versioned_store.rs @@ -17,6 +17,7 @@ use crate::git::storage::GitNodeStorage; use crate::git::types::*; use crate::tree::{ProllyTree, Tree}; use gix::prelude::*; +use std::collections::HashMap; use std::path::Path; /// A versioned key-value store backed by Git and ProllyTree @@ -27,7 +28,7 @@ use std::path::Path; pub struct VersionedKvStore { tree: ProllyTree>, git_repo: gix::Repository, - staging_area: std::collections::HashMap, Option>>, // None = deleted + staging_area: HashMap, Option>>, // None = deleted current_branch: String, } @@ -49,7 +50,7 @@ impl VersionedKvStore { let mut store = VersionedKvStore { tree, git_repo, - staging_area: std::collections::HashMap::new(), + staging_area: HashMap::new(), current_branch: "main".to_string(), }; @@ -80,17 +81,23 @@ impl VersionedKvStore { .map(|r| r.name().shorten().to_string()) .unwrap_or_else(|| 
"main".to_string()); - Ok(VersionedKvStore { + let mut store = VersionedKvStore { tree, git_repo, - staging_area: std::collections::HashMap::new(), + staging_area: HashMap::new(), current_branch, - }) + }; + + // Load staging area from file if it exists + store.load_staging_area()?; + + Ok(store) } /// Insert a key-value pair (stages the change) pub fn insert(&mut self, key: Vec, value: Vec) -> Result<(), GitKvError> { self.staging_area.insert(key, Some(value)); + self.save_staging_area()?; Ok(()) } @@ -99,6 +106,7 @@ impl VersionedKvStore { let exists = self.get(&key).is_some(); if exists { self.staging_area.insert(key, Some(value)); + self.save_staging_area()?; } Ok(exists) } @@ -108,6 +116,7 @@ impl VersionedKvStore { let exists = self.get(key).is_some(); if exists { self.staging_area.insert(key.to_vec(), None); + self.save_staging_area()?; } Ok(exists) } @@ -193,6 +202,9 @@ impl VersionedKvStore { // Update HEAD self.update_head(commit_id)?; + // Clear staging area file since we've committed + self.save_staging_area()?; + Ok(commit_id) } @@ -220,6 +232,7 @@ impl VersionedKvStore { pub fn checkout(&mut self, branch_or_commit: &str) -> Result<(), GitKvError> { // Clear staging area self.staging_area.clear(); + self.save_staging_area()?; // Update HEAD to point to the new branch/commit let target_ref = if branch_or_commit.starts_with("refs/") { @@ -379,13 +392,26 @@ impl VersionedKvStore { } /// Update HEAD to point to the new commit - fn update_head(&mut self, _commit_id: gix::ObjectId) -> Result<(), GitKvError> { + fn update_head(&mut self, commit_id: gix::ObjectId) -> Result<(), GitKvError> { // Update the current branch reference to point to the new commit - let _branch_ref = format!("refs/heads/{}", self.current_branch); + let branch_ref = format!("refs/heads/{}", self.current_branch); - // Note: This is a simplified implementation - // A full implementation would use gix reference transactions to properly update - // the branch reference to point to the new 
commit + // For now, use a simple implementation that writes directly to the file + let refs_dir = self.git_repo.path().join("refs").join("heads"); + std::fs::create_dir_all(&refs_dir).map_err(|e| { + GitKvError::GitObjectError(format!("Failed to create refs directory: {e}")) + })?; + + let branch_file = refs_dir.join(&self.current_branch); + std::fs::write(&branch_file, commit_id.to_hex().to_string()).map_err(|e| { + GitKvError::GitObjectError(format!("Failed to write branch reference: {e}")) + })?; + + // Update HEAD to point to the branch + let head_file = self.git_repo.path().join("HEAD"); + std::fs::write(&head_file, format!("ref: {}\n", branch_ref)).map_err(|e| { + GitKvError::GitObjectError(format!("Failed to write HEAD reference: {e}")) + })?; Ok(()) } @@ -396,6 +422,35 @@ impl VersionedKvStore { // In reality, we'd need to reconstruct the ProllyTree from Git objects Ok(()) } + + /// Save the staging area to a file + fn save_staging_area(&self) -> Result<(), GitKvError> { + let staging_file = self.git_repo.path().join("PROLLY_STAGING"); + + // Serialize the staging area + let serialized = bincode::serialize(&self.staging_area) + .map_err(|e| GitKvError::SerializationError(e))?; + + std::fs::write(staging_file, serialized) + .map_err(|e| GitKvError::GitObjectError(format!("Failed to write staging area: {e}")))?; + + Ok(()) + } + + /// Load the staging area from a file + fn load_staging_area(&mut self) -> Result<(), GitKvError> { + let staging_file = self.git_repo.path().join("PROLLY_STAGING"); + + if staging_file.exists() { + let data = std::fs::read(staging_file) + .map_err(|e| GitKvError::GitObjectError(format!("Failed to read staging area: {e}")))?; + + self.staging_area = bincode::deserialize(&data) + .map_err(|e| GitKvError::SerializationError(e))?; + } + + Ok(()) + } } #[cfg(test)] From 57adad9a93228defa2d8617d7fc5762a8c855106 Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Wed, 16 Jul 2025 14:59:11 -0700 Subject: [PATCH 2/9] fix save and load of the 
storage --- src/git/storage.rs | 11 +++++++ src/git/versioned_store.rs | 60 ++++++++++++++++++++++++---------- src/tree.rs | 67 +++++++++++++++++++++++++++++++------- 3 files changed, 110 insertions(+), 28 deletions(-) diff --git a/src/git/storage.rs b/src/git/storage.rs index 004d874..0a25a1e 100644 --- a/src/git/storage.rs +++ b/src/git/storage.rs @@ -35,6 +35,17 @@ pub struct GitNodeStorage { hash_to_object_id: Mutex, gix::ObjectId>>, } +impl Clone for GitNodeStorage { + fn clone(&self) -> Self { + Self { + _repository: self._repository.clone(), + cache: Mutex::new(LruCache::new(NonZeroUsize::new(1000).unwrap())), + configs: Mutex::new(HashMap::new()), + hash_to_object_id: Mutex::new(HashMap::new()), + } + } +} + impl GitNodeStorage { /// Create a new GitNodeStorage instance pub fn new(repository: gix::Repository) -> Result { diff --git a/src/git/versioned_store.rs b/src/git/versioned_store.rs index b1f55bf..1eb53b5 100644 --- a/src/git/versioned_store.rs +++ b/src/git/versioned_store.rs @@ -54,6 +54,9 @@ impl VersionedKvStore { current_branch: "main".to_string(), }; + // Save initial configuration + let _ = store.tree.save_config(); + // Create initial commit store.commit("Initial commit")?; @@ -70,9 +73,15 @@ impl VersionedKvStore { // Create GitNodeStorage let storage = GitNodeStorage::new(git_repo.clone())?; - // Load tree configuration (using default for now) - let config: TreeConfig = TreeConfig::default(); - let tree = ProllyTree::new(storage, config); + // Load tree configuration from storage + let config: TreeConfig = match ProllyTree::load_config(&storage) { + Ok(config) => config, + Err(_) => TreeConfig::default(), + }; + + // Try to load existing tree from storage, or create new one + let tree = ProllyTree::load_from_storage(storage.clone(), config.clone()) + .unwrap_or_else(|| ProllyTree::new(storage, config)); // Get current branch let current_branch = git_repo @@ -91,6 +100,9 @@ impl VersionedKvStore { // Load staging area from file if it exists 
store.load_staging_area()?; + // Reload the tree from the current HEAD + store.reload_tree_from_head()?; + Ok(store) } @@ -193,6 +205,9 @@ impl VersionedKvStore { } } + // Persist the tree state + self.tree.persist_root(); + // Create tree object in Git let tree_id = self.create_git_tree()?; @@ -418,37 +433,48 @@ impl VersionedKvStore { /// Reload the ProllyTree from the current HEAD fn reload_tree_from_head(&mut self) -> Result<(), GitKvError> { - // This is a simplified implementation - // In reality, we'd need to reconstruct the ProllyTree from Git objects + // Load tree configuration from storage + let config: TreeConfig = match ProllyTree::load_config(&self.tree.storage) { + Ok(config) => config, + Err(_) => TreeConfig::default(), + }; + + // Try to load existing tree from storage, or create new one + let storage = self.tree.storage.clone(); + self.tree = ProllyTree::load_from_storage(storage.clone(), config.clone()) + .unwrap_or_else(|| ProllyTree::new(storage, config)); + Ok(()) } /// Save the staging area to a file fn save_staging_area(&self) -> Result<(), GitKvError> { let staging_file = self.git_repo.path().join("PROLLY_STAGING"); - + // Serialize the staging area let serialized = bincode::serialize(&self.staging_area) .map_err(|e| GitKvError::SerializationError(e))?; - - std::fs::write(staging_file, serialized) - .map_err(|e| GitKvError::GitObjectError(format!("Failed to write staging area: {e}")))?; - + + std::fs::write(staging_file, serialized).map_err(|e| { + GitKvError::GitObjectError(format!("Failed to write staging area: {e}")) + })?; + Ok(()) } /// Load the staging area from a file fn load_staging_area(&mut self) -> Result<(), GitKvError> { let staging_file = self.git_repo.path().join("PROLLY_STAGING"); - + if staging_file.exists() { - let data = std::fs::read(staging_file) - .map_err(|e| GitKvError::GitObjectError(format!("Failed to read staging area: {e}")))?; - - self.staging_area = bincode::deserialize(&data) - .map_err(|e| 
GitKvError::SerializationError(e))?; + let data = std::fs::read(staging_file).map_err(|e| { + GitKvError::GitObjectError(format!("Failed to read staging area: {e}")) + })?; + + self.staging_area = + bincode::deserialize(&data).map_err(|e| GitKvError::SerializationError(e))?; } - + Ok(()) } } diff --git a/src/tree.rs b/src/tree.rs index 51c3ccd..3136102 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -220,9 +220,9 @@ impl Default for TreeStats { } pub struct ProllyTree> { - root: ProllyNode, - storage: S, - config: TreeConfig, + pub root: ProllyNode, + pub storage: S, + pub config: TreeConfig, } impl> Tree for ProllyTree { @@ -525,14 +525,6 @@ impl> Tree for ProllyTree { } } -impl, const N: usize> ProllyTree { - fn persist_root(&mut self) { - // Save the updated child node back to the storage - self.storage - .insert_node(self.root.get_hash(), self.root.clone()); - } -} - impl> ProllyTree { /// Recursively computes the differences between two Prolly Nodes. /// @@ -589,6 +581,36 @@ impl> ProllyTree { diffs.push(DiffResult::Added(new_key.clone(), new_value.clone())); } } + + /// Persist the root node to storage and save configuration + pub fn persist_root(&mut self) { + // Store the root node in the storage + let root_hash = self.root.get_hash(); + if let Some(_) = self + .storage + .insert_node(root_hash.clone(), self.root.clone()) + { + // Update the config with the new root hash + self.config.root_hash = Some(root_hash); + + // Save the configuration + let _ = self.save_config(); + } + } + + /// Load a ProllyTree from an existing root hash in storage + pub fn load_from_storage(storage: S, config: TreeConfig) -> Option { + if let Some(ref root_hash) = config.root_hash { + if let Some(root_node) = storage.get_node_by_hash(root_hash) { + return Some(ProllyTree { + root: root_node, + storage, + config, + }); + } + } + None + } } #[cfg(test)] @@ -735,6 +757,29 @@ mod tests { assert!(tree.find(b"key3").is_none()); } + #[test] + fn test_persist_and_load() { + let storage = 
InMemoryNodeStorage::<32>::default(); + let config = TreeConfig::default(); + + // Create tree and add data + let mut tree = ProllyTree::new(storage.clone(), config.clone()); + tree.insert(b"key1".to_vec(), b"value1".to_vec()); + tree.insert(b"key2".to_vec(), b"value2".to_vec()); + + // Persist the tree + tree.persist_root(); + + // Load the tree from storage + let loaded_tree = ProllyTree::load_from_storage(storage, tree.config) + .expect("Should be able to load tree from storage"); + + // Verify data is preserved + assert!(loaded_tree.find(b"key1").is_some()); + assert!(loaded_tree.find(b"key2").is_some()); + assert!(loaded_tree.find(b"key3").is_none()); + } + #[test] fn test_insert_batch_and_find() { let storage = InMemoryNodeStorage::<32>::default(); From da72631257a446325e700ee73b362280ca1e0c45 Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Wed, 16 Jul 2025 15:16:39 -0700 Subject: [PATCH 3/9] fix preserving prolly tree root issue The issue was that the original implementation was creating a Git tree with a null ObjectId placeholder, which resulted in the prolly_tree_root file being deleted during staging. Now it properly creates a real Git blob containing the serialized ProllyTree root and reconstructs the tree correctly when loading from HEAD. --- src/git/versioned_store.rs | 105 +++++++++++++++++++++++++++++-------- src/tree.rs | 4 +- 2 files changed, 86 insertions(+), 23 deletions(-) diff --git a/src/git/versioned_store.rs b/src/git/versioned_store.rs index 1eb53b5..2391c67 100644 --- a/src/git/versioned_store.rs +++ b/src/git/versioned_store.rs @@ -15,6 +15,7 @@ limitations under the License. 
use crate::config::TreeConfig; use crate::git::storage::GitNodeStorage; use crate::git::types::*; +use crate::node::ProllyNode; use crate::tree::{ProllyTree, Tree}; use gix::prelude::*; use std::collections::HashMap; @@ -74,10 +75,7 @@ impl VersionedKvStore { let storage = GitNodeStorage::new(git_repo.clone())?; // Load tree configuration from storage - let config: TreeConfig = match ProllyTree::load_config(&storage) { - Ok(config) => config, - Err(_) => TreeConfig::default(), - }; + let config: TreeConfig = ProllyTree::load_config(&storage).unwrap_or_default(); // Try to load existing tree from storage, or create new one let tree = ProllyTree::load_from_storage(storage.clone(), config.clone()) @@ -334,14 +332,23 @@ impl VersionedKvStore { /// Create a Git tree object from the current ProllyTree state fn create_git_tree(&self) -> Result { - // For now, create a simple tree with a placeholder entry - // In a real implementation, this would serialize the ProllyTree root - // and create a proper Git tree structure + // Serialize the ProllyTree root node + let root_node = &self.tree.root; + let serialized = bincode::serialize(root_node).map_err(GitKvError::SerializationError)?; + // Create a Git blob for the serialized root + let blob = gix::objs::Blob { data: serialized }; + let blob_id = self + .git_repo + .objects + .write(&blob) + .map_err(|e| GitKvError::GitObjectError(format!("Failed to write blob: {e}")))?; + + // Create a tree with the root blob let tree_entries = vec![gix::objs::tree::Entry { mode: gix::objs::tree::EntryMode(0o100644), filename: "prolly_tree_root".into(), - oid: gix::ObjectId::null(gix::hash::Kind::Sha1), // Placeholder + oid: blob_id, }]; let tree = gix::objs::Tree { @@ -424,7 +431,7 @@ impl VersionedKvStore { // Update HEAD to point to the branch let head_file = self.git_repo.path().join("HEAD"); - std::fs::write(&head_file, format!("ref: {}\n", branch_ref)).map_err(|e| { + std::fs::write(&head_file, format!("ref: 
{branch_ref}\n")).map_err(|e| { GitKvError::GitObjectError(format!("Failed to write HEAD reference: {e}")) })?; @@ -433,17 +440,73 @@ impl VersionedKvStore { /// Reload the ProllyTree from the current HEAD fn reload_tree_from_head(&mut self) -> Result<(), GitKvError> { - // Load tree configuration from storage - let config: TreeConfig = match ProllyTree::load_config(&self.tree.storage) { - Ok(config) => config, - Err(_) => TreeConfig::default(), - }; - - // Try to load existing tree from storage, or create new one + // Get the current HEAD commit + let head_ref = self.git_repo.head_ref().map_err(|e| { + GitKvError::GitObjectError(format!("Failed to get HEAD reference: {e}")) + })?; + + if let Some(head_ref) = head_ref { + if let Some(commit_id) = head_ref.target().try_id() { + // Load the commit object + let mut buffer = Vec::new(); + let commit_obj = self.git_repo.objects.find(commit_id, &mut buffer).map_err(|e| { + GitKvError::GitObjectError(format!("Failed to find commit {commit_id}: {e}")) + })?; + + // Parse the commit object + let commit = match commit_obj.decode() { + Ok(gix::objs::ObjectRef::Commit(commit)) => commit, + _ => return Err(GitKvError::GitObjectError("Object is not a commit".to_string())), + }; + + // Get the tree ID from the commit + let tree_id = commit.tree(); + + // Load the tree object with a fresh buffer + let mut tree_buffer = Vec::new(); + let tree_obj = self.git_repo.objects.find(&tree_id, &mut tree_buffer).map_err(|e| { + GitKvError::GitObjectError(format!("Failed to find tree {tree_id}: {e}")) + })?; + + // Parse the tree object + let tree = match tree_obj.decode() { + Ok(gix::objs::ObjectRef::Tree(tree)) => tree, + _ => return Err(GitKvError::GitObjectError("Object is not a tree".to_string())), + }; + + // Find the prolly_tree_root entry in the tree + for entry in tree.entries { + if entry.filename == "prolly_tree_root" { + // Load the blob containing the serialized root with a fresh buffer + let mut blob_buffer = Vec::new(); + let 
blob_obj = self.git_repo.objects.find(&entry.oid, &mut blob_buffer).map_err(|e| { + GitKvError::GitObjectError(format!("Failed to find blob {}: {e}", entry.oid)) + })?; + + // Deserialize the root node + let root_node: ProllyNode = bincode::deserialize(blob_obj.data) + .map_err(GitKvError::SerializationError)?; + + // Create a new tree with the loaded root + let storage = self.tree.storage.clone(); + let config = TreeConfig::default(); + self.tree = ProllyTree { + root: root_node, + storage, + config, + }; + + return Ok(()); + } + } + } + } + + // If we can't load from HEAD, create a new tree let storage = self.tree.storage.clone(); - self.tree = ProllyTree::load_from_storage(storage.clone(), config.clone()) - .unwrap_or_else(|| ProllyTree::new(storage, config)); - + let config = TreeConfig::default(); + self.tree = ProllyTree::new(storage, config); + Ok(()) } @@ -453,7 +516,7 @@ impl VersionedKvStore { // Serialize the staging area let serialized = bincode::serialize(&self.staging_area) - .map_err(|e| GitKvError::SerializationError(e))?; + .map_err(GitKvError::SerializationError)?; std::fs::write(staging_file, serialized).map_err(|e| { GitKvError::GitObjectError(format!("Failed to write staging area: {e}")) @@ -472,7 +535,7 @@ impl VersionedKvStore { })?; self.staging_area = - bincode::deserialize(&data).map_err(|e| GitKvError::SerializationError(e))?; + bincode::deserialize(&data).map_err(GitKvError::SerializationError)?; } Ok(()) diff --git a/src/tree.rs b/src/tree.rs index 3136102..5125c88 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -586,9 +586,9 @@ impl> ProllyTree { pub fn persist_root(&mut self) { // Store the root node in the storage let root_hash = self.root.get_hash(); - if let Some(_) = self + if self .storage - .insert_node(root_hash.clone(), self.root.clone()) + .insert_node(root_hash.clone(), self.root.clone()).is_some() { // Update the config with the new root hash self.config.root_hash = Some(root_hash); From 
aace667ede110a743ae725156fd9dbcd7c693d85 Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Wed, 16 Jul 2025 16:23:53 -0700 Subject: [PATCH 4/9] fixed the git-prolly issue where prolly list was showing "No keys found" after committing data The problem had three root causes: 1. Missing config save after commit - Issue: The commit() function wasn't calling save_config() after persisting the tree - Fix: Added self.tree.save_config() call in /Users/feng/github/prollytree/src/git/versioned_store.rs:216 2. Hash mappings not loaded when opening store - Issue: GitNodeStorage::new() wasn't calling load_hash_mappings() - Fix: Added storage.load_hash_mappings() call in /Users/feng/github/prollytree/src/git/storage.rs:67 3. ValueDigest deserialization incompatible with JSON - Issue: ValueDigest::deserialize() expected binary bytes but JSON config contained array format [188,78,66,...] - Fix: Updated deserialization in /Users/feng/github/prollytree/src/digest.rs:134 to handle Vec<u8> from JSON --- src/digest.rs | 7 ++- src/git/storage.rs | 117 ++++++++++++++++++++++++++++++++---- src/git/versioned_store.rs | 118 +++++++++---------------------------- src/storage.rs | 20 +++++-- src/tree.rs | 25 ++++++-- 5 files changed, 175 insertions(+), 112 deletions(-) diff --git a/src/digest.rs b/src/digest.rs index aa5bca4..17df8f4 100644 --- a/src/digest.rs +++ b/src/digest.rs @@ -130,9 +130,10 @@ impl<'de, const N: usize> Deserialize<'de> for ValueDigest { where D: serde::Deserializer<'de>, { - let bytes: &[u8] = serde::de::Deserialize::deserialize(deserializer)?; - let array = <[u8; N]>::try_from(bytes) - .map_err(|_| serde::de::Error::invalid_length(bytes.len(), &stringify!(N)))?; + // Try to deserialize as a sequence of bytes (for JSON format) + let bytes: Vec = serde::de::Deserialize::deserialize(deserializer)?; + let array = <[u8; N]>::try_from(bytes.as_slice()) + .map_err(|_| serde::de::Error::invalid_length(bytes.len(), &format!("array of length {}", N).as_str()))?; 
Ok(ValueDigest(array)) } } diff --git a/src/git/storage.rs b/src/git/storage.rs index 0a25a1e..98b79cf 100644 --- a/src/git/storage.rs +++ b/src/git/storage.rs @@ -37,12 +37,17 @@ pub struct GitNodeStorage { impl Clone for GitNodeStorage { fn clone(&self) -> Self { - Self { + let cloned = Self { _repository: self._repository.clone(), cache: Mutex::new(LruCache::new(NonZeroUsize::new(1000).unwrap())), configs: Mutex::new(HashMap::new()), hash_to_object_id: Mutex::new(HashMap::new()), - } + }; + + // Load the hash mappings for the cloned instance + cloned.load_hash_mappings(); + + cloned } } @@ -51,12 +56,17 @@ impl GitNodeStorage { pub fn new(repository: gix::Repository) -> Result { let cache_size = NonZeroUsize::new(1000).unwrap(); // Default cache size - Ok(GitNodeStorage { + let storage = GitNodeStorage { _repository: Arc::new(Mutex::new(repository)), cache: Mutex::new(LruCache::new(cache_size)), configs: Mutex::new(HashMap::new()), hash_to_object_id: Mutex::new(HashMap::new()), - }) + }; + + // Load existing hash mappings + storage.load_hash_mappings(); + + Ok(storage) } /// Create GitNodeStorage with custom cache size @@ -66,12 +76,17 @@ impl GitNodeStorage { ) -> Result { let cache_size = NonZeroUsize::new(cache_size).unwrap_or(NonZeroUsize::new(1000).unwrap()); - Ok(GitNodeStorage { + let storage = GitNodeStorage { _repository: Arc::new(Mutex::new(repository)), cache: Mutex::new(LruCache::new(cache_size)), configs: Mutex::new(HashMap::new()), hash_to_object_id: Mutex::new(HashMap::new()), - }) + }; + + // Load existing hash mappings + storage.load_hash_mappings(); + + Ok(storage) } /// Store a node as a Git blob @@ -129,7 +144,11 @@ impl NodeStorage for GitNodeStorage { match self.store_node_as_blob(&node) { Ok(blob_id) => { // Store the mapping between ProllyTree hash and Git object ID - self.hash_to_object_id.lock().unwrap().insert(hash, blob_id); + self.hash_to_object_id.lock().unwrap().insert(hash.clone(), blob_id); + + // Persist the mapping to 
filesystem + self.save_hash_mapping(&hash, &blob_id); + Some(()) } Err(_) => None, @@ -150,14 +169,92 @@ impl NodeStorage for GitNodeStorage { } fn save_config(&self, key: &str, config: &[u8]) { - // Store config in memory for now - // In a real implementation, we'd store this as a Git blob or in a config file + // Store config in memory let mut configs = self.configs.lock().unwrap(); configs.insert(key.to_string(), config.to_vec()); + + // Also persist to filesystem for durability + let repo = self._repository.lock().unwrap(); + let config_path = repo.path().join(format!("prolly_config_{}", key)); + let _ = std::fs::write(config_path, config); } fn get_config(&self, key: &str) -> Option> { - self.configs.lock().unwrap().get(key).cloned() + // First try to get from memory + if let Some(config) = self.configs.lock().unwrap().get(key).cloned() { + return Some(config); + } + + // If not in memory, try to load from filesystem + let repo = self._repository.lock().unwrap(); + let config_path = repo.path().join(format!("prolly_config_{}", key)); + if let Ok(config) = std::fs::read(config_path) { + // Cache in memory for future use + drop(repo); + self.configs.lock().unwrap().insert(key.to_string(), config.clone()); + return Some(config); + } + + None + } +} + +impl GitNodeStorage { + /// Save hash mapping to filesystem + fn save_hash_mapping(&self, hash: &ValueDigest, object_id: &gix::ObjectId) { + let repo = self._repository.lock().unwrap(); + let mapping_path = repo.path().join("prolly_hash_mappings"); + + // Read existing mappings + let mut mappings = if mapping_path.exists() { + std::fs::read_to_string(&mapping_path).unwrap_or_default() + } else { + String::new() + }; + + // Add new mapping - use simple format for now without hex dependency + let hash_bytes: Vec = hash.0.iter().map(|b| format!("{:02x}", b)).collect(); + let hash_hex = hash_bytes.join(""); + let object_hex = object_id.to_hex().to_string(); + mappings.push_str(&format!("{}:{}\n", hash_hex, object_hex)); 
+ + // Write back + let _ = std::fs::write(mapping_path, mappings); + } + + /// Load hash mappings from filesystem + fn load_hash_mappings(&self) { + let repo = self._repository.lock().unwrap(); + let mapping_path = repo.path().join("prolly_hash_mappings"); + + if let Ok(mappings) = std::fs::read_to_string(mapping_path) { + let mut hash_map = self.hash_to_object_id.lock().unwrap(); + + for line in mappings.lines() { + if let Some((hash_hex, object_hex)) = line.split_once(':') { + // Parse hex string manually + if hash_hex.len() == N * 2 { + let mut hash_bytes = Vec::new(); + for i in 0..N { + if let Ok(byte) = u8::from_str_radix(&hash_hex[i*2..i*2+2], 16) { + hash_bytes.push(byte); + } else { + break; + } + } + + if hash_bytes.len() == N { + if let Ok(object_id) = gix::ObjectId::from_hex(object_hex.as_bytes()) { + let mut hash_array = [0u8; N]; + hash_array.copy_from_slice(&hash_bytes); + let hash = ValueDigest(hash_array); + hash_map.insert(hash, object_id); + } + } + } + } + } + } } } diff --git a/src/git/versioned_store.rs b/src/git/versioned_store.rs index 2391c67..31f91bd 100644 --- a/src/git/versioned_store.rs +++ b/src/git/versioned_store.rs @@ -15,7 +15,6 @@ limitations under the License. 
use crate::config::TreeConfig; use crate::git::storage::GitNodeStorage; use crate::git::types::*; -use crate::node::ProllyNode; use crate::tree::{ProllyTree, Tree}; use gix::prelude::*; use std::collections::HashMap; @@ -58,6 +57,12 @@ impl VersionedKvStore { // Save initial configuration let _ = store.tree.save_config(); + // Create .gitignore to ignore prolly files + let gitignore_path = path.join(".gitignore"); + std::fs::write(&gitignore_path, "prolly_tree_root\nprolly_config_*\nprolly_hash_mappings\n").map_err(|e| { + GitKvError::GitObjectError(format!("Failed to create .gitignore: {e}")) + })?; + // Create initial commit store.commit("Initial commit")?; @@ -152,7 +157,12 @@ impl VersionedKvStore { pub fn list_keys(&self) -> Vec> { let mut keys = std::collections::HashSet::new(); - // Add keys from staging area + // Add keys from the committed ProllyTree + for key in self.tree.collect_keys() { + keys.insert(key); + } + + // Add keys from staging area (overrides committed data) for (key, value) in &self.staging_area { if value.is_some() { keys.insert(key.clone()); @@ -161,10 +171,6 @@ impl VersionedKvStore { } } - // Add keys from committed data (if not in staging) - // This is a simplified implementation - // In reality, we'd need to traverse the ProllyTree properly - keys.into_iter().collect() } @@ -205,6 +211,10 @@ impl VersionedKvStore { // Persist the tree state self.tree.persist_root(); + + // Save the updated configuration with the new root hash + self.tree.save_config() + .map_err(|e| GitKvError::GitObjectError(format!("Failed to save config: {e}")))?; // Create tree object in Git let tree_id = self.create_git_tree()?; @@ -332,24 +342,10 @@ impl VersionedKvStore { /// Create a Git tree object from the current ProllyTree state fn create_git_tree(&self) -> Result { - // Serialize the ProllyTree root node - let root_node = &self.tree.root; - let serialized = bincode::serialize(root_node).map_err(GitKvError::SerializationError)?; - - // Create a Git blob 
for the serialized root - let blob = gix::objs::Blob { data: serialized }; - let blob_id = self - .git_repo - .objects - .write(&blob) - .map_err(|e| GitKvError::GitObjectError(format!("Failed to write blob: {e}")))?; - - // Create a tree with the root blob - let tree_entries = vec![gix::objs::tree::Entry { - mode: gix::objs::tree::EntryMode(0o100644), - filename: "prolly_tree_root".into(), - oid: blob_id, - }]; + // Create an empty tree - the ProllyTree state is managed through GitNodeStorage + // We don't need to create a prolly_tree_root file since the tree structure + // is stored in Git blobs and managed through the NodeStorage interface + let tree_entries = vec![]; let tree = gix::objs::Tree { entries: tree_entries, @@ -440,73 +436,17 @@ impl VersionedKvStore { /// Reload the ProllyTree from the current HEAD fn reload_tree_from_head(&mut self) -> Result<(), GitKvError> { - // Get the current HEAD commit - let head_ref = self.git_repo.head_ref().map_err(|e| { - GitKvError::GitObjectError(format!("Failed to get HEAD reference: {e}")) - })?; + // Since we're no longer storing prolly_tree_root in the Git tree, + // we need to reload the tree state from the GitNodeStorage - if let Some(head_ref) = head_ref { - if let Some(commit_id) = head_ref.target().try_id() { - // Load the commit object - let mut buffer = Vec::new(); - let commit_obj = self.git_repo.objects.find(commit_id, &mut buffer).map_err(|e| { - GitKvError::GitObjectError(format!("Failed to find commit {commit_id}: {e}")) - })?; - - // Parse the commit object - let commit = match commit_obj.decode() { - Ok(gix::objs::ObjectRef::Commit(commit)) => commit, - _ => return Err(GitKvError::GitObjectError("Object is not a commit".to_string())), - }; - - // Get the tree ID from the commit - let tree_id = commit.tree(); - - // Load the tree object with a fresh buffer - let mut tree_buffer = Vec::new(); - let tree_obj = self.git_repo.objects.find(&tree_id, &mut tree_buffer).map_err(|e| { - 
GitKvError::GitObjectError(format!("Failed to find tree {tree_id}: {e}")) - })?; - - // Parse the tree object - let tree = match tree_obj.decode() { - Ok(gix::objs::ObjectRef::Tree(tree)) => tree, - _ => return Err(GitKvError::GitObjectError("Object is not a tree".to_string())), - }; - - // Find the prolly_tree_root entry in the tree - for entry in tree.entries { - if entry.filename == "prolly_tree_root" { - // Load the blob containing the serialized root with a fresh buffer - let mut blob_buffer = Vec::new(); - let blob_obj = self.git_repo.objects.find(&entry.oid, &mut blob_buffer).map_err(|e| { - GitKvError::GitObjectError(format!("Failed to find blob {}: {e}", entry.oid)) - })?; - - // Deserialize the root node - let root_node: ProllyNode = bincode::deserialize(blob_obj.data) - .map_err(GitKvError::SerializationError)?; - - // Create a new tree with the loaded root - let storage = self.tree.storage.clone(); - let config = TreeConfig::default(); - self.tree = ProllyTree { - root: root_node, - storage, - config, - }; - - return Ok(()); - } - } - } - } - - // If we can't load from HEAD, create a new tree + // Load tree configuration from storage + let config: TreeConfig = ProllyTree::load_config(&self.tree.storage).unwrap_or_default(); + + // Try to load existing tree from storage, or create new one let storage = self.tree.storage.clone(); - let config = TreeConfig::default(); - self.tree = ProllyTree::new(storage, config); - + self.tree = ProllyTree::load_from_storage(storage.clone(), config.clone()) + .unwrap_or_else(|| ProllyTree::new(storage, config)); + Ok(()) } diff --git a/src/storage.rs b/src/storage.rs index 31fddba..faf617e 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -19,6 +19,7 @@ use std::fmt::{Display, Formatter, LowerHex}; use std::fs::{self, File}; use std::io::{Read, Write}; use std::path::PathBuf; +use std::sync::RwLock; /// A trait for storage of nodes in the ProllyTree. 
/// @@ -66,10 +67,18 @@ pub trait NodeStorage: Send + Sync { /// # Type Parameters /// /// - `N`: The size of the value digest. -#[derive(Clone)] pub struct InMemoryNodeStorage { map: HashMap, ProllyNode>, - configs: HashMap>, + configs: RwLock>>, +} + +impl Clone for InMemoryNodeStorage { + fn clone(&self) -> Self { + InMemoryNodeStorage { + map: self.map.clone(), + configs: RwLock::new(self.configs.read().unwrap().clone()), + } + } } impl Default for InMemoryNodeStorage { @@ -82,7 +91,7 @@ impl InMemoryNodeStorage { pub fn new() -> Self { InMemoryNodeStorage { map: HashMap::new(), - configs: HashMap::new(), + configs: RwLock::new(HashMap::new()), } } } @@ -107,12 +116,11 @@ impl NodeStorage for InMemoryNodeStorage { } fn save_config(&self, key: &str, config: &[u8]) { - let mut configs = self.configs.clone(); - configs.insert(key.to_string(), config.to_vec()); + self.configs.write().unwrap().insert(key.to_string(), config.to_vec()); } fn get_config(&self, key: &str) -> Option> { - self.configs.get(key).cloned() + self.configs.read().unwrap().get(key).cloned() } } diff --git a/src/tree.rs b/src/tree.rs index 5125c88..9993db6 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -586,10 +586,7 @@ impl> ProllyTree { pub fn persist_root(&mut self) { // Store the root node in the storage let root_hash = self.root.get_hash(); - if self - .storage - .insert_node(root_hash.clone(), self.root.clone()).is_some() - { + if self.storage.insert_node(root_hash.clone(), self.root.clone()).is_some() { // Update the config with the new root hash self.config.root_hash = Some(root_hash); @@ -611,6 +608,26 @@ impl> ProllyTree { } None } + + /// Collect all keys from the tree + pub fn collect_keys(&self) -> Vec> { + let mut keys = Vec::new(); + self.collect_keys_recursive(&self.root, &mut keys); + keys + } + + /// Recursively collect keys from a node and its children + fn collect_keys_recursive(&self, node: &ProllyNode, keys: &mut Vec>) { + // Add all keys from this node + for key in 
&node.keys { + keys.push(key.clone()); + } + + // Recursively collect keys from child nodes + for child_node in node.children(&self.storage) { + self.collect_keys_recursive(&child_node, keys); + } + } } #[cfg(test)] From ba2199a7b5e6eba16d62569a0124fd7f98027e37 Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Wed, 16 Jul 2025 16:55:06 -0700 Subject: [PATCH 5/9] fix test_persist_and_load --- src/digest.rs | 8 ++++-- src/git/storage.rs | 54 +++++++++++++++++++++----------------- src/git/versioned_store.rs | 19 ++++++++------ src/storage.rs | 5 +++- src/tree.rs | 8 ++++-- 5 files changed, 57 insertions(+), 37 deletions(-) diff --git a/src/digest.rs b/src/digest.rs index 17df8f4..5456a0e 100644 --- a/src/digest.rs +++ b/src/digest.rs @@ -132,8 +132,12 @@ impl<'de, const N: usize> Deserialize<'de> for ValueDigest { { // Try to deserialize as a sequence of bytes (for JSON format) let bytes: Vec = serde::de::Deserialize::deserialize(deserializer)?; - let array = <[u8; N]>::try_from(bytes.as_slice()) - .map_err(|_| serde::de::Error::invalid_length(bytes.len(), &format!("array of length {}", N).as_str()))?; + let array = <[u8; N]>::try_from(bytes.as_slice()).map_err(|_| { + serde::de::Error::invalid_length( + bytes.len(), + &format!("array of length {N}").as_str(), + ) + })?; Ok(ValueDigest(array)) } } diff --git a/src/git/storage.rs b/src/git/storage.rs index 98b79cf..bf724c7 100644 --- a/src/git/storage.rs +++ b/src/git/storage.rs @@ -43,10 +43,10 @@ impl Clone for GitNodeStorage { configs: Mutex::new(HashMap::new()), hash_to_object_id: Mutex::new(HashMap::new()), }; - + // Load the hash mappings for the cloned instance cloned.load_hash_mappings(); - + cloned } } @@ -62,10 +62,10 @@ impl GitNodeStorage { configs: Mutex::new(HashMap::new()), hash_to_object_id: Mutex::new(HashMap::new()), }; - + // Load existing hash mappings storage.load_hash_mappings(); - + Ok(storage) } @@ -82,10 +82,10 @@ impl GitNodeStorage { configs: Mutex::new(HashMap::new()), hash_to_object_id: 
Mutex::new(HashMap::new()), }; - + // Load existing hash mappings storage.load_hash_mappings(); - + Ok(storage) } @@ -144,11 +144,14 @@ impl NodeStorage for GitNodeStorage { match self.store_node_as_blob(&node) { Ok(blob_id) => { // Store the mapping between ProllyTree hash and Git object ID - self.hash_to_object_id.lock().unwrap().insert(hash.clone(), blob_id); - + self.hash_to_object_id + .lock() + .unwrap() + .insert(hash.clone(), blob_id); + // Persist the mapping to filesystem self.save_hash_mapping(&hash, &blob_id); - + Some(()) } Err(_) => None, @@ -172,10 +175,10 @@ impl NodeStorage for GitNodeStorage { // Store config in memory let mut configs = self.configs.lock().unwrap(); configs.insert(key.to_string(), config.to_vec()); - + // Also persist to filesystem for durability let repo = self._repository.lock().unwrap(); - let config_path = repo.path().join(format!("prolly_config_{}", key)); + let config_path = repo.path().join(format!("prolly_config_{key}")); let _ = std::fs::write(config_path, config); } @@ -184,17 +187,20 @@ impl NodeStorage for GitNodeStorage { if let Some(config) = self.configs.lock().unwrap().get(key).cloned() { return Some(config); } - + // If not in memory, try to load from filesystem let repo = self._repository.lock().unwrap(); - let config_path = repo.path().join(format!("prolly_config_{}", key)); + let config_path = repo.path().join(format!("prolly_config_{key}")); if let Ok(config) = std::fs::read(config_path) { // Cache in memory for future use drop(repo); - self.configs.lock().unwrap().insert(key.to_string(), config.clone()); + self.configs + .lock() + .unwrap() + .insert(key.to_string(), config.clone()); return Some(config); } - + None } } @@ -204,20 +210,20 @@ impl GitNodeStorage { fn save_hash_mapping(&self, hash: &ValueDigest, object_id: &gix::ObjectId) { let repo = self._repository.lock().unwrap(); let mapping_path = repo.path().join("prolly_hash_mappings"); - + // Read existing mappings let mut mappings = if 
mapping_path.exists() { std::fs::read_to_string(&mapping_path).unwrap_or_default() } else { String::new() }; - + // Add new mapping - use simple format for now without hex dependency - let hash_bytes: Vec = hash.0.iter().map(|b| format!("{:02x}", b)).collect(); + let hash_bytes: Vec = hash.0.iter().map(|b| format!("{b:02x}")).collect(); let hash_hex = hash_bytes.join(""); let object_hex = object_id.to_hex().to_string(); - mappings.push_str(&format!("{}:{}\n", hash_hex, object_hex)); - + mappings.push_str(&format!("{hash_hex}:{object_hex}\n")); + // Write back let _ = std::fs::write(mapping_path, mappings); } @@ -226,23 +232,23 @@ impl GitNodeStorage { fn load_hash_mappings(&self) { let repo = self._repository.lock().unwrap(); let mapping_path = repo.path().join("prolly_hash_mappings"); - + if let Ok(mappings) = std::fs::read_to_string(mapping_path) { let mut hash_map = self.hash_to_object_id.lock().unwrap(); - + for line in mappings.lines() { if let Some((hash_hex, object_hex)) = line.split_once(':') { // Parse hex string manually if hash_hex.len() == N * 2 { let mut hash_bytes = Vec::new(); for i in 0..N { - if let Ok(byte) = u8::from_str_radix(&hash_hex[i*2..i*2+2], 16) { + if let Ok(byte) = u8::from_str_radix(&hash_hex[i * 2..i * 2 + 2], 16) { hash_bytes.push(byte); } else { break; } } - + if hash_bytes.len() == N { if let Ok(object_id) = gix::ObjectId::from_hex(object_hex.as_bytes()) { let mut hash_array = [0u8; N]; diff --git a/src/git/versioned_store.rs b/src/git/versioned_store.rs index 31f91bd..f7c56d2 100644 --- a/src/git/versioned_store.rs +++ b/src/git/versioned_store.rs @@ -59,9 +59,11 @@ impl VersionedKvStore { // Create .gitignore to ignore prolly files let gitignore_path = path.join(".gitignore"); - std::fs::write(&gitignore_path, "prolly_tree_root\nprolly_config_*\nprolly_hash_mappings\n").map_err(|e| { - GitKvError::GitObjectError(format!("Failed to create .gitignore: {e}")) - })?; + std::fs::write( + &gitignore_path, + 
"prolly_tree_root\nprolly_config_*\nprolly_hash_mappings\n", + ) + .map_err(|e| GitKvError::GitObjectError(format!("Failed to create .gitignore: {e}")))?; // Create initial commit store.commit("Initial commit")?; @@ -211,9 +213,10 @@ impl VersionedKvStore { // Persist the tree state self.tree.persist_root(); - + // Save the updated configuration with the new root hash - self.tree.save_config() + self.tree + .save_config() .map_err(|e| GitKvError::GitObjectError(format!("Failed to save config: {e}")))?; // Create tree object in Git @@ -438,7 +441,7 @@ impl VersionedKvStore { fn reload_tree_from_head(&mut self) -> Result<(), GitKvError> { // Since we're no longer storing prolly_tree_root in the Git tree, // we need to reload the tree state from the GitNodeStorage - + // Load tree configuration from storage let config: TreeConfig = ProllyTree::load_config(&self.tree.storage).unwrap_or_default(); @@ -455,8 +458,8 @@ impl VersionedKvStore { let staging_file = self.git_repo.path().join("PROLLY_STAGING"); // Serialize the staging area - let serialized = bincode::serialize(&self.staging_area) - .map_err(GitKvError::SerializationError)?; + let serialized = + bincode::serialize(&self.staging_area).map_err(GitKvError::SerializationError)?; std::fs::write(staging_file, serialized).map_err(|e| { GitKvError::GitObjectError(format!("Failed to write staging area: {e}")) diff --git a/src/storage.rs b/src/storage.rs index faf617e..5a6db82 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -116,7 +116,10 @@ impl NodeStorage for InMemoryNodeStorage { } fn save_config(&self, key: &str, config: &[u8]) { - self.configs.write().unwrap().insert(key.to_string(), config.to_vec()); + self.configs + .write() + .unwrap() + .insert(key.to_string(), config.to_vec()); } fn get_config(&self, key: &str) -> Option> { diff --git a/src/tree.rs b/src/tree.rs index 9993db6..004525d 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -586,7 +586,11 @@ impl> ProllyTree { pub fn persist_root(&mut self) { // 
Store the root node in the storage let root_hash = self.root.get_hash(); - if self.storage.insert_node(root_hash.clone(), self.root.clone()).is_some() { + if self + .storage + .insert_node(root_hash.clone(), self.root.clone()) + .is_some() + { // Update the config with the new root hash self.config.root_hash = Some(root_hash); @@ -788,7 +792,7 @@ mod tests { tree.persist_root(); // Load the tree from storage - let loaded_tree = ProllyTree::load_from_storage(storage, tree.config) + let loaded_tree = ProllyTree::load_from_storage(tree.storage, tree.config) .expect("Should be able to load tree from storage"); // Verify data is preserved From 74028411c8984c37d341a09a98479b31d9c999f0 Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Wed, 16 Jul 2025 17:21:42 -0700 Subject: [PATCH 6/9] remove .gitignore logic --- src/digest.rs | 5 +---- src/git/versioned_store.rs | 8 -------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/src/digest.rs b/src/digest.rs index 5456a0e..f81e300 100644 --- a/src/digest.rs +++ b/src/digest.rs @@ -133,10 +133,7 @@ impl<'de, const N: usize> Deserialize<'de> for ValueDigest { // Try to deserialize as a sequence of bytes (for JSON format) let bytes: Vec = serde::de::Deserialize::deserialize(deserializer)?; let array = <[u8; N]>::try_from(bytes.as_slice()).map_err(|_| { - serde::de::Error::invalid_length( - bytes.len(), - &format!("array of length {N}").as_str(), - ) + serde::de::Error::invalid_length(bytes.len(), &format!("array of length {N}").as_str()) })?; Ok(ValueDigest(array)) } diff --git a/src/git/versioned_store.rs b/src/git/versioned_store.rs index f7c56d2..2364959 100644 --- a/src/git/versioned_store.rs +++ b/src/git/versioned_store.rs @@ -57,14 +57,6 @@ impl VersionedKvStore { // Save initial configuration let _ = store.tree.save_config(); - // Create .gitignore to ignore prolly files - let gitignore_path = path.join(".gitignore"); - std::fs::write( - &gitignore_path, - 
"prolly_tree_root\nprolly_config_*\nprolly_hash_mappings\n", - ) - .map_err(|e| GitKvError::GitObjectError(format!("Failed to create .gitignore: {e}")))?; - // Create initial commit store.commit("Initial commit")?; From b6234532764ec35b96acf7c7467dee9382cee5bf Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Wed, 16 Jul 2025 19:52:33 -0700 Subject: [PATCH 7/9] Auto commit after git-prolly init and commit to make sure prollytree metadata are kept --- src/git/operations.rs | 14 ++- src/git/storage.rs | 33 ++++--- src/git/versioned_store.rs | 190 ++++++++++++++++++++++++++++++++++--- 3 files changed, 208 insertions(+), 29 deletions(-) diff --git a/src/git/operations.rs b/src/git/operations.rs index b73e902..d182263 100644 --- a/src/git/operations.rs +++ b/src/git/operations.rs @@ -397,14 +397,24 @@ mod tests { #[test] fn test_git_operations_creation() { let temp_dir = TempDir::new().unwrap(); - let store = VersionedKvStore::<32>::init(temp_dir.path()).unwrap(); + // Initialize git repository (regular, not bare) + gix::init(temp_dir.path()).unwrap(); + // Create subdirectory for dataset + let dataset_dir = temp_dir.path().join("dataset"); + std::fs::create_dir_all(&dataset_dir).unwrap(); + let store = VersionedKvStore::<32>::init(&dataset_dir).unwrap(); let _ops = GitOperations::new(store); } #[test] fn test_parse_commit_id() { let temp_dir = TempDir::new().unwrap(); - let store = VersionedKvStore::<32>::init(temp_dir.path()).unwrap(); + // Initialize git repository (regular, not bare) + gix::init(temp_dir.path()).unwrap(); + // Create subdirectory for dataset + let dataset_dir = temp_dir.path().join("dataset"); + std::fs::create_dir_all(&dataset_dir).unwrap(); + let store = VersionedKvStore::<32>::init(&dataset_dir).unwrap(); let ops = GitOperations::new(store); // Test HEAD parsing diff --git a/src/git/storage.rs b/src/git/storage.rs index bf724c7..c1eca86 100644 --- a/src/git/storage.rs +++ b/src/git/storage.rs @@ -33,6 +33,8 @@ pub struct GitNodeStorage { configs: 
Mutex>>, // Maps ProllyTree hashes to Git object IDs hash_to_object_id: Mutex, gix::ObjectId>>, + // Directory where this dataset's config and mapping files are stored + dataset_dir: std::path::PathBuf, } impl Clone for GitNodeStorage { @@ -42,6 +44,7 @@ impl Clone for GitNodeStorage { cache: Mutex::new(LruCache::new(NonZeroUsize::new(1000).unwrap())), configs: Mutex::new(HashMap::new()), hash_to_object_id: Mutex::new(HashMap::new()), + dataset_dir: self.dataset_dir.clone(), }; // Load the hash mappings for the cloned instance @@ -53,7 +56,7 @@ impl Clone for GitNodeStorage { impl GitNodeStorage { /// Create a new GitNodeStorage instance - pub fn new(repository: gix::Repository) -> Result { + pub fn new(repository: gix::Repository, dataset_dir: std::path::PathBuf) -> Result { let cache_size = NonZeroUsize::new(1000).unwrap(); // Default cache size let storage = GitNodeStorage { @@ -61,6 +64,7 @@ impl GitNodeStorage { cache: Mutex::new(LruCache::new(cache_size)), configs: Mutex::new(HashMap::new()), hash_to_object_id: Mutex::new(HashMap::new()), + dataset_dir, }; // Load existing hash mappings @@ -72,6 +76,7 @@ impl GitNodeStorage { /// Create GitNodeStorage with custom cache size pub fn with_cache_size( repository: gix::Repository, + dataset_dir: std::path::PathBuf, cache_size: usize, ) -> Result { let cache_size = NonZeroUsize::new(cache_size).unwrap_or(NonZeroUsize::new(1000).unwrap()); @@ -81,6 +86,7 @@ impl GitNodeStorage { cache: Mutex::new(LruCache::new(cache_size)), configs: Mutex::new(HashMap::new()), hash_to_object_id: Mutex::new(HashMap::new()), + dataset_dir, }; // Load existing hash mappings @@ -176,9 +182,8 @@ impl NodeStorage for GitNodeStorage { let mut configs = self.configs.lock().unwrap(); configs.insert(key.to_string(), config.to_vec()); - // Also persist to filesystem for durability - let repo = self._repository.lock().unwrap(); - let config_path = repo.path().join(format!("prolly_config_{key}")); + // Also persist to filesystem for durability 
in the dataset directory + let config_path = self.dataset_dir.join(format!("prolly_config_{key}")); let _ = std::fs::write(config_path, config); } @@ -189,11 +194,9 @@ impl NodeStorage for GitNodeStorage { } // If not in memory, try to load from filesystem - let repo = self._repository.lock().unwrap(); - let config_path = repo.path().join(format!("prolly_config_{key}")); + let config_path = self.dataset_dir.join(format!("prolly_config_{key}")); if let Ok(config) = std::fs::read(config_path) { // Cache in memory for future use - drop(repo); self.configs .lock() .unwrap() @@ -208,8 +211,7 @@ impl NodeStorage for GitNodeStorage { impl GitNodeStorage { /// Save hash mapping to filesystem fn save_hash_mapping(&self, hash: &ValueDigest, object_id: &gix::ObjectId) { - let repo = self._repository.lock().unwrap(); - let mapping_path = repo.path().join("prolly_hash_mappings"); + let mapping_path = self.dataset_dir.join("prolly_hash_mappings"); // Read existing mappings let mut mappings = if mapping_path.exists() { @@ -230,8 +232,7 @@ impl GitNodeStorage { /// Load hash mappings from filesystem fn load_hash_mappings(&self) { - let repo = self._repository.lock().unwrap(); - let mapping_path = repo.path().join("prolly_hash_mappings"); + let mapping_path = self.dataset_dir.join("prolly_hash_mappings"); if let Ok(mappings) = std::fs::read_to_string(mapping_path) { let mut hash_map = self.hash_to_object_id.lock().unwrap(); @@ -300,8 +301,8 @@ mod tests { #[test] fn test_git_node_storage_basic_operations() { - let (_temp_dir, repo) = create_test_repo(); - let mut storage = GitNodeStorage::<32>::new(repo).unwrap(); + let (temp_dir, repo) = create_test_repo(); + let mut storage = GitNodeStorage::<32>::new(repo, temp_dir.path().to_path_buf()).unwrap(); let node = create_test_node(); let hash = node.get_hash(); @@ -324,8 +325,10 @@ mod tests { #[test] fn test_cache_functionality() { - let (_temp_dir, repo) = create_test_repo(); - let mut storage = 
GitNodeStorage::<32>::with_cache_size(repo, 2).unwrap(); + let (temp_dir, repo) = create_test_repo(); + let dataset_dir = temp_dir.path().join("dataset"); + std::fs::create_dir_all(&dataset_dir).unwrap(); + let mut storage = GitNodeStorage::<32>::with_cache_size(repo, dataset_dir, 2).unwrap(); let node1 = create_test_node(); let hash1 = node1.get_hash(); diff --git a/src/git/versioned_store.rs b/src/git/versioned_store.rs index 2364959..2addec9 100644 --- a/src/git/versioned_store.rs +++ b/src/git/versioned_store.rs @@ -33,15 +33,59 @@ pub struct VersionedKvStore { } impl VersionedKvStore { + /// Find the git repository root by walking up the directory tree + fn find_git_root>(start_path: P) -> Option { + let mut current = start_path.as_ref().to_path_buf(); + loop { + if current.join(".git").exists() { + return Some(current); + } + if !current.pop() { + break; + } + } + None + } + + /// Check if we're running in the git repository root directory + fn is_in_git_root>(path: P) -> Result { + let path = path.as_ref().canonicalize() + .map_err(|e| GitKvError::GitObjectError(format!("Failed to resolve path: {e}")))?; + + if let Some(git_root) = Self::find_git_root(&path) { + let git_root = git_root.canonicalize() + .map_err(|e| GitKvError::GitObjectError(format!("Failed to resolve git root: {e}")))?; + Ok(path == git_root) + } else { + Err(GitKvError::GitObjectError( + "Not inside a git repository. Please run from within a git repository.".to_string() + )) + } + } + /// Initialize a new versioned KV store at the given path pub fn init>(path: P) -> Result { let path = path.as_ref(); - // Initialize Git repository - let git_repo = gix::init(path).map_err(|e| GitKvError::GitInitError(Box::new(e)))?; + // Reject if trying to initialize in git root directory + if Self::is_in_git_root(path)? { + return Err(GitKvError::GitObjectError( + "Cannot initialize git-prolly in git root directory. 
Please run from a subdirectory to create a dataset.".to_string() + )); + } - // Create GitNodeStorage - let storage = GitNodeStorage::new(git_repo.clone())?; + // Find the git repository + let git_root = Self::find_git_root(path).ok_or_else(|| { + GitKvError::GitObjectError( + "Not inside a git repository. Please run from within a git repository.".to_string() + ) + })?; + + // Open the existing git repository instead of initializing a new one + let git_repo = gix::open(&git_root).map_err(|e| GitKvError::GitOpenError(Box::new(e)))?; + + // Create GitNodeStorage with the current directory as dataset directory + let storage = GitNodeStorage::new(git_repo.clone(), path.to_path_buf())?; // Create ProllyTree with default config let config: TreeConfig = TreeConfig::default(); @@ -60,6 +104,9 @@ impl VersionedKvStore { // Create initial commit store.commit("Initial commit")?; + // Auto-commit prolly metadata files after initialization + store.commit_prolly_metadata(" after init")?; + Ok(store) } @@ -67,11 +114,25 @@ impl VersionedKvStore { pub fn open>(path: P) -> Result { let path = path.as_ref(); + // Reject if trying to open in git root directory + if Self::is_in_git_root(path)? { + return Err(GitKvError::GitObjectError( + "Cannot run git-prolly in git root directory. Please run from a subdirectory containing a dataset.".to_string() + )); + } + + // Find the git repository + let git_root = Self::find_git_root(path).ok_or_else(|| { + GitKvError::GitObjectError( + "Not inside a git repository. 
Please run from within a git repository.".to_string() + ) + })?; + // Open existing Git repository - let git_repo = gix::open(path).map_err(|e| GitKvError::GitOpenError(Box::new(e)))?; + let git_repo = gix::open(&git_root).map_err(|e| GitKvError::GitOpenError(Box::new(e)))?; - // Create GitNodeStorage - let storage = GitNodeStorage::new(git_repo.clone())?; + // Create GitNodeStorage with the current directory as dataset directory + let storage = GitNodeStorage::new(git_repo.clone(), path.to_path_buf())?; // Load tree configuration from storage let config: TreeConfig = ProllyTree::load_config(&storage).unwrap_or_default(); @@ -223,6 +284,9 @@ impl VersionedKvStore { // Clear staging area file since we've committed self.save_staging_area()?; + // Auto-commit prolly metadata files to git + self.commit_prolly_metadata(&format!(" after commit: {}", message))?; + Ok(commit_id) } @@ -355,6 +419,23 @@ impl VersionedKvStore { Ok(tree_id) } + /// Get git user configuration (name and email) + fn get_git_user_config(&self) -> Result<(String, String), GitKvError> { + let config = self.git_repo.config_snapshot(); + + let name = config + .string("user.name") + .map(|n| n.to_string()) + .unwrap_or_else(|| "git-prolly".to_string()); + + let email = config + .string("user.email") + .map(|e| e.to_string()) + .unwrap_or_else(|| "git-prolly@example.com".to_string()); + + Ok((name, email)) + } + /// Create a Git commit object fn create_git_commit( &self, @@ -367,10 +448,13 @@ impl VersionedKvStore { .unwrap() .as_secs() as i64; + // Get git user configuration + let (name, email) = self.get_git_user_config()?; + // Create author and committer signatures let signature = gix::actor::Signature { - name: "git-prolly".into(), - email: "git-prolly@example.com".into(), + name: name.into(), + email: email.into(), time: gix::date::Time { seconds: now, offset: 0, @@ -460,6 +544,73 @@ impl VersionedKvStore { Ok(()) } + /// Stage and commit prolly metadata files to git + fn 
commit_prolly_metadata(&self, additional_message: &str) -> Result<(), GitKvError> { + // Get relative paths to the prolly files from git root + let git_root = Self::find_git_root(&self.git_repo.path().parent().unwrap()).unwrap(); + let current_dir = std::env::current_dir() + .map_err(|e| GitKvError::GitObjectError(format!("Failed to get current directory: {e}")))?; + + let config_file = "prolly_config_tree_config"; + let mapping_file = "prolly_hash_mappings"; + + // Check if files exist before trying to stage them + let config_path = current_dir.join(config_file); + let mapping_path = current_dir.join(mapping_file); + + let mut files_to_stage = Vec::new(); + + if config_path.exists() { + // Get relative path from git root + if let Ok(relative_path) = config_path.strip_prefix(&git_root) { + files_to_stage.push(relative_path.to_string_lossy().to_string()); + } + } + + if mapping_path.exists() { + // Get relative path from git root + if let Ok(relative_path) = mapping_path.strip_prefix(&git_root) { + files_to_stage.push(relative_path.to_string_lossy().to_string()); + } + } + + if files_to_stage.is_empty() { + return Ok(()); // Nothing to commit + } + + // Stage the files using git add + for file in &files_to_stage { + let add_cmd = std::process::Command::new("git") + .args(["add", file]) + .current_dir(&git_root) + .output() + .map_err(|e| GitKvError::GitObjectError(format!("Failed to run git add: {e}")))?; + + if !add_cmd.status.success() { + let stderr = String::from_utf8_lossy(&add_cmd.stderr); + return Err(GitKvError::GitObjectError(format!("git add failed: {stderr}"))); + } + } + + // Commit the staged files + let commit_message = format!("Update prolly metadata{}", additional_message); + let commit_cmd = std::process::Command::new("git") + .args(["commit", "-m", &commit_message]) + .current_dir(&git_root) + .output() + .map_err(|e| GitKvError::GitObjectError(format!("Failed to run git commit: {e}")))?; + + if !commit_cmd.status.success() { + let stderr = 
String::from_utf8_lossy(&commit_cmd.stderr); + // It's okay if there's nothing to commit + if !stderr.is_empty() && !stderr.contains("nothing to commit") { + return Err(GitKvError::GitObjectError(format!("git commit failed: {stderr}"))); + } + } + + Ok(()) + } + /// Load the staging area from a file fn load_staging_area(&mut self) -> Result<(), GitKvError> { let staging_file = self.git_repo.path().join("PROLLY_STAGING"); @@ -485,14 +636,24 @@ mod tests { #[test] fn test_versioned_store_init() { let temp_dir = TempDir::new().unwrap(); - let store = VersionedKvStore::<32>::init(temp_dir.path()); + // Initialize git repository (regular, not bare) + gix::init(temp_dir.path()).unwrap(); + // Create subdirectory for dataset + let dataset_dir = temp_dir.path().join("dataset"); + std::fs::create_dir_all(&dataset_dir).unwrap(); + let store = VersionedKvStore::<32>::init(&dataset_dir); assert!(store.is_ok()); } #[test] fn test_basic_kv_operations() { let temp_dir = TempDir::new().unwrap(); - let mut store = VersionedKvStore::<32>::init(temp_dir.path()).unwrap(); + // Initialize git repository (regular, not bare) + gix::init(temp_dir.path()).unwrap(); + // Create subdirectory for dataset + let dataset_dir = temp_dir.path().join("dataset"); + std::fs::create_dir_all(&dataset_dir).unwrap(); + let mut store = VersionedKvStore::<32>::init(&dataset_dir).unwrap(); // Test insert and get store.insert(b"key1".to_vec(), b"value1".to_vec()).unwrap(); @@ -512,7 +673,12 @@ mod tests { #[test] fn test_commit_workflow() { let temp_dir = TempDir::new().unwrap(); - let mut store = VersionedKvStore::<32>::init(temp_dir.path()).unwrap(); + // Initialize git repository (regular, not bare) + gix::init(temp_dir.path()).unwrap(); + // Create subdirectory for dataset + let dataset_dir = temp_dir.path().join("dataset"); + std::fs::create_dir_all(&dataset_dir).unwrap(); + let mut store = VersionedKvStore::<32>::init(&dataset_dir).unwrap(); // Stage changes store.insert(b"key1".to_vec(), 
b"value1".to_vec()).unwrap(); From 09480cd728d0b0a66b882928b667751e608a579d Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Wed, 16 Jul 2025 22:12:31 -0700 Subject: [PATCH 8/9] fix fmt --- src/git/storage.rs | 5 +++- src/git/versioned_store.rs | 58 ++++++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/src/git/storage.rs b/src/git/storage.rs index c1eca86..daed189 100644 --- a/src/git/storage.rs +++ b/src/git/storage.rs @@ -56,7 +56,10 @@ impl Clone for GitNodeStorage { impl GitNodeStorage { /// Create a new GitNodeStorage instance - pub fn new(repository: gix::Repository, dataset_dir: std::path::PathBuf) -> Result { + pub fn new( + repository: gix::Repository, + dataset_dir: std::path::PathBuf, + ) -> Result { let cache_size = NonZeroUsize::new(1000).unwrap(); // Default cache size let storage = GitNodeStorage { diff --git a/src/git/versioned_store.rs b/src/git/versioned_store.rs index 2addec9..69e5d34 100644 --- a/src/git/versioned_store.rs +++ b/src/git/versioned_store.rs @@ -49,16 +49,19 @@ impl VersionedKvStore { /// Check if we're running in the git repository root directory fn is_in_git_root>(path: P) -> Result { - let path = path.as_ref().canonicalize() + let path = path + .as_ref() + .canonicalize() .map_err(|e| GitKvError::GitObjectError(format!("Failed to resolve path: {e}")))?; - + if let Some(git_root) = Self::find_git_root(&path) { - let git_root = git_root.canonicalize() - .map_err(|e| GitKvError::GitObjectError(format!("Failed to resolve git root: {e}")))?; + let git_root = git_root.canonicalize().map_err(|e| { + GitKvError::GitObjectError(format!("Failed to resolve git root: {e}")) + })?; Ok(path == git_root) } else { Err(GitKvError::GitObjectError( - "Not inside a git repository. Please run from within a git repository.".to_string() + "Not inside a git repository. 
Please run from within a git repository.".to_string(), )) } } @@ -77,7 +80,7 @@ impl VersionedKvStore { // Find the git repository let git_root = Self::find_git_root(path).ok_or_else(|| { GitKvError::GitObjectError( - "Not inside a git repository. Please run from within a git repository.".to_string() + "Not inside a git repository. Please run from within a git repository.".to_string(), ) })?; @@ -124,7 +127,7 @@ impl VersionedKvStore { // Find the git repository let git_root = Self::find_git_root(path).ok_or_else(|| { GitKvError::GitObjectError( - "Not inside a git repository. Please run from within a git repository.".to_string() + "Not inside a git repository. Please run from within a git repository.".to_string(), ) })?; @@ -285,7 +288,7 @@ impl VersionedKvStore { self.save_staging_area()?; // Auto-commit prolly metadata files to git - self.commit_prolly_metadata(&format!(" after commit: {}", message))?; + self.commit_prolly_metadata(&format!(" after commit: {message}"))?; Ok(commit_id) } @@ -547,37 +550,38 @@ impl VersionedKvStore { /// Stage and commit prolly metadata files to git fn commit_prolly_metadata(&self, additional_message: &str) -> Result<(), GitKvError> { // Get relative paths to the prolly files from git root - let git_root = Self::find_git_root(&self.git_repo.path().parent().unwrap()).unwrap(); - let current_dir = std::env::current_dir() - .map_err(|e| GitKvError::GitObjectError(format!("Failed to get current directory: {e}")))?; - + let git_root = Self::find_git_root(self.git_repo.path().parent().unwrap()).unwrap(); + let current_dir = std::env::current_dir().map_err(|e| { + GitKvError::GitObjectError(format!("Failed to get current directory: {e}")) + })?; + let config_file = "prolly_config_tree_config"; let mapping_file = "prolly_hash_mappings"; - + // Check if files exist before trying to stage them let config_path = current_dir.join(config_file); let mapping_path = current_dir.join(mapping_file); - + let mut files_to_stage = Vec::new(); - + if 
config_path.exists() { // Get relative path from git root if let Ok(relative_path) = config_path.strip_prefix(&git_root) { files_to_stage.push(relative_path.to_string_lossy().to_string()); } } - + if mapping_path.exists() { // Get relative path from git root if let Ok(relative_path) = mapping_path.strip_prefix(&git_root) { files_to_stage.push(relative_path.to_string_lossy().to_string()); } } - + if files_to_stage.is_empty() { return Ok(()); // Nothing to commit } - + // Stage the files using git add for file in &files_to_stage { let add_cmd = std::process::Command::new("git") @@ -585,29 +589,33 @@ impl VersionedKvStore { .current_dir(&git_root) .output() .map_err(|e| GitKvError::GitObjectError(format!("Failed to run git add: {e}")))?; - + if !add_cmd.status.success() { let stderr = String::from_utf8_lossy(&add_cmd.stderr); - return Err(GitKvError::GitObjectError(format!("git add failed: {stderr}"))); + return Err(GitKvError::GitObjectError(format!( + "git add failed: {stderr}" + ))); } } - + // Commit the staged files - let commit_message = format!("Update prolly metadata{}", additional_message); + let commit_message = format!("Update prolly metadata{additional_message}"); let commit_cmd = std::process::Command::new("git") .args(["commit", "-m", &commit_message]) .current_dir(&git_root) .output() .map_err(|e| GitKvError::GitObjectError(format!("Failed to run git commit: {e}")))?; - + if !commit_cmd.status.success() { let stderr = String::from_utf8_lossy(&commit_cmd.stderr); // It's okay if there's nothing to commit if !stderr.is_empty() && !stderr.contains("nothing to commit") { - return Err(GitKvError::GitObjectError(format!("git commit failed: {stderr}"))); + return Err(GitKvError::GitObjectError(format!( + "git commit failed: {stderr}" + ))); } } - + Ok(()) } From 697a0d44cfd2378f6ff054ca19dc64d29e9384af Mon Sep 17 00:00:00 2001 From: zhangfengcdt Date: Wed, 16 Jul 2025 22:28:18 -0700 Subject: [PATCH 9/9] fix the multiple datasets issue --- 
src/git/versioned_store.rs | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/git/versioned_store.rs b/src/git/versioned_store.rs index 69e5d34..8608352 100644 --- a/src/git/versioned_store.rs +++ b/src/git/versioned_store.rs @@ -534,7 +534,7 @@ impl VersionedKvStore { /// Save the staging area to a file fn save_staging_area(&self) -> Result<(), GitKvError> { - let staging_file = self.git_repo.path().join("PROLLY_STAGING"); + let staging_file = self.get_staging_file_path()?; // Serialize the staging area let serialized = @@ -621,7 +621,7 @@ impl VersionedKvStore { /// Load the staging area from a file fn load_staging_area(&mut self) -> Result<(), GitKvError> { - let staging_file = self.git_repo.path().join("PROLLY_STAGING"); + let staging_file = self.get_staging_file_path()?; if staging_file.exists() { let data = std::fs::read(staging_file).map_err(|e| { @@ -634,6 +634,32 @@ impl VersionedKvStore { Ok(()) } + + /// Get the dataset-specific staging file path + fn get_staging_file_path(&self) -> Result { + // Get the current directory relative to git root + let current_dir = std::env::current_dir().map_err(|e| { + GitKvError::GitObjectError(format!("Failed to get current directory: {e}")) + })?; + + let git_root = Self::find_git_root(&current_dir) + .ok_or_else(|| GitKvError::GitObjectError("Not in a git repository".to_string()))?; + + // Create a dataset-specific identifier from the relative path + let relative_path = current_dir + .strip_prefix(&git_root) + .map_err(|e| GitKvError::GitObjectError(format!("Failed to get relative path: {e}")))?; + + // Use the relative path to create a unique staging file name + let path_str = relative_path.to_string_lossy().replace(['/', '\\'], "_"); + let staging_filename = if path_str.is_empty() { + "PROLLY_STAGING_root".to_string() + } else { + format!("PROLLY_STAGING_{path_str}") + }; + + Ok(self.git_repo.path().join(staging_filename)) + } } #[cfg(test)]