Read from branch with compaction data #7701

Merged: 29 commits, Jun 11, 2024
29 commits (diff shows changes from 21 commits)
deee833
Read from branch with compaction data
idanovo Apr 25, 2024
036a2b4
Remove unneeded test
idanovo Apr 25, 2024
82694ff
Fix comments
idanovo Apr 28, 2024
90059b0
Merge branch 'master' of https://github.com/treeverse/lakeFS into rea…
idanovo May 1, 2024
c554d87
WIP
idanovo May 2, 2024
0170d65
WIP
idanovo May 2, 2024
5f44d2b
WIP
idanovo May 2, 2024
644a04d
WIP
idanovo May 3, 2024
18742fa
WIP
idanovo May 3, 2024
6cf1337
PR reviews
idanovo May 6, 2024
11a448f
WIP
idanovo May 6, 2024
dfb54e6
Fix review
idanovo May 6, 2024
f609ec2
Added tests
idanovo May 6, 2024
eb9c4e4
Added GC tests
idanovo May 7, 2024
d51ead2
PR review
idanovo May 7, 2024
3adc263
Fix
idanovo May 7, 2024
70c49d0
Fix tests
idanovo May 7, 2024
ca9a9cb
Merge branch 'master' of https://github.com/treeverse/lakeFS into rea…
idanovo May 8, 2024
29e5085
WIP
idanovo May 8, 2024
c2cef8b
Added compaction to gc test
idanovo May 8, 2024
005f718
Rename function name
idanovo May 8, 2024
2717510
Merge branch 'master' of https://github.com/treeverse/lakeFS into rea…
idanovo May 9, 2024
0400e6c
Review comments
idanovo May 9, 2024
23221ae
Merge branch 'master' of https://github.com/treeverse/lakeFS into rea…
idanovo May 15, 2024
1c10a6e
Fix PR review
idanovo May 18, 2024
19d797c
Merge branch 'master' of https://github.com/treeverse/lakeFS into rea…
idanovo May 18, 2024
2a0f495
Lint
idanovo May 19, 2024
4c46b26
Merge branch 'master' of https://github.com/treeverse/lakeFS into rea…
idanovo Jun 10, 2024
3426e0b
PR review
idanovo Jun 11, 2024
95 changes: 88 additions & 7 deletions pkg/catalog/catalog_test.go
@@ -533,6 +533,7 @@ func TestCatalog_PrepareGCUncommitted(t *testing.T) {
numRecords int
expectedCalls int
expectedForUncommitted int
compactBranch bool
}{
{
name: "no branches",
@@ -546,23 +547,44 @@ func TestCatalog_PrepareGCUncommitted(t *testing.T) {
numRecords: 0,
expectedCalls: 1,
},
{
name: "no objects",
numBranch: 3,
numRecords: 0,
expectedCalls: 1,
compactBranch: true,
Contributor comment:
Nit: Rather than double each test scenario, I would probably just loop over both scenarios by saying for _, compactBranch := range{false, true} { t.Run(...) } on l. 585.
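A minimal sketch of that restructuring (the subtest naming via fmt.Sprintf is an assumption about how the two variants would be labeled; the placeholder stands for the existing subtest body, unchanged):

```go
// Run every table entry in both modes instead of duplicating entries.
for _, compactBranch := range []bool{false, true} {
	for _, tt := range tests {
		t.Run(fmt.Sprintf("%s/compact=%t", tt.name, compactBranch), func(t *testing.T) {
			// ...existing subtest body, reading compactBranch from the outer
			// loop instead of a tt.compactBranch field...
		})
	}
}
```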

},
{
name: "sanity",
numBranch: 5,
numRecords: 3,
expectedCalls: 1,
},
{
name: "compacted",
numBranch: 5,
numRecords: 3,
expectedCalls: 1,
compactBranch: true,
},
{
name: "tokenized",
numBranch: 500,
numRecords: 500,
expectedCalls: 2,
},
{
name: "tokenized and compacted",
numBranch: 500,
numRecords: 500,
expectedCalls: 2,
compactBranch: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
const repositoryID = "repo1"
g, expectedRecords := createPrepareUncommittedTestScenario(t, repositoryID, tt.numBranch, tt.numRecords, tt.expectedCalls)
g, expectedRecords := createPrepareUncommittedTestScenario(t, repositoryID, tt.numBranch, tt.numRecords, tt.expectedCalls, tt.compactBranch)
blockAdapter := testutil.NewBlockAdapterByType(t, block.BlockstoreTypeMem)
c := &catalog.Catalog{
Store: g.Sut,
@@ -614,19 +636,35 @@ func TestCatalog_PrepareGCUncommitted(t *testing.T) {
}
}

func createPrepareUncommittedTestScenario(t *testing.T, repositoryID string, numBranches, numRecords, expectedCalls int) (*gUtils.GravelerTest, []string) {
func createPrepareUncommittedTestScenario(t *testing.T, repositoryID string, numBranches, numRecords, expectedCalls int, compact bool) (*gUtils.GravelerTest, []string) {
t.Helper()

test := gUtils.InitGravelerTest(t)
records := make([][]*graveler.ValueRecord, numBranches)
diffs := make([][]graveler.Diff, numBranches)
var branches []*graveler.BranchRecord
var expectedRecords []string
if numBranches > 0 {
test.RefManager.EXPECT().GetCommit(gomock.Any(), gomock.Any(), gomock.Any()).MinTimes(1).Return(&graveler.Commit{}, nil)
test.CommittedManager.EXPECT().List(gomock.Any(), gomock.Any(), gomock.Any()).MinTimes(1).Return(cUtils.NewFakeValueIterator([]*graveler.ValueRecord{}), nil)
}
for i := 0; i < numBranches; i++ {
branchID := graveler.BranchID(fmt.Sprintf("branch%04d", i))
token := graveler.StagingToken(fmt.Sprintf("%s_st%04d", branchID, i))
branches = append(branches, &graveler.BranchRecord{BranchID: branchID, Branch: &graveler.Branch{StagingToken: token}})
branch := &graveler.BranchRecord{BranchID: branchID, Branch: &graveler.Branch{StagingToken: token}}
compactBaseMetaRangeID := graveler.MetaRangeID(fmt.Sprintf("base%04d", i))
commitID := graveler.CommitID(fmt.Sprintf("commit%04d", i))
if !compact {
test.RefManager.EXPECT().GetBranch(gomock.Any(), gomock.Any(), branchID).MinTimes(1).Return(&graveler.Branch{StagingToken: token}, nil)
} else {
branch.CompactedBaseMetaRangeID = compactBaseMetaRangeID
branch.Branch.CommitID = commitID
test.RefManager.EXPECT().GetBranch(gomock.Any(), gomock.Any(), branchID).MinTimes(1).Return(&graveler.Branch{StagingToken: token, CommitID: commitID, CompactedBaseMetaRangeID: compactBaseMetaRangeID}, nil)
}
branches = append(branches, branch)

records[i] = make([]*graveler.ValueRecord, 0, numRecords)
diffs[i] = make([]graveler.Diff, 0, numRecords)
for j := 0; j < numRecords; j++ {
var (
addressType catalog.Entry_AddressType
@@ -661,6 +699,30 @@ func createPrepareUncommittedTestScenario(t *testing.T, repositoryID string, num
})
// we always keep the relative path - as the prepared uncommitted will trim the storage namespace
expectedRecords = append(expectedRecords, fmt.Sprintf("%s_record%04d", branchID, j))

if compact {
diffs[i] = append(diffs[i], graveler.Diff{
Type: graveler.DiffTypeAdded,
Key: []byte(e.Address),
Value: &graveler.Value{
Identity: []byte("dont care"),
Data: v,
},
}) // record in compaction and in staging so no need to add it again to expected records
e.Address = fmt.Sprintf("%s_%s", e.Address, "compacted")
v, err = proto.Marshal(&e)
require.NoError(t, err)
diffs[i] = append(diffs[i], graveler.Diff{
Type: graveler.DiffTypeAdded,
Key: []byte(e.Address),
Value: &graveler.Value{
Identity: []byte("dont care"),
Data: v,
},
})
// record in compaction but not in staging
expectedRecords = append(expectedRecords, fmt.Sprintf("%s_record%04d_%s", branchID, j, "compacted"))
}
}

// Add tombstone
@@ -669,6 +731,11 @@ func createPrepareUncommittedTestScenario(t *testing.T, repositoryID string, num
Value: nil,
})

diffs[i] = append(diffs[i], graveler.Diff{
Type: graveler.DiffTypeRemoved,
Key: []byte(fmt.Sprintf("%s_tombstone", branchID)),
})

// Add external address
e := catalog.Entry{
Address: fmt.Sprintf("external/address/object_%s", branchID),
@@ -688,6 +755,15 @@ func createPrepareUncommittedTestScenario(t *testing.T, repositoryID string, num
Data: v,
},
})

diffs[i] = append(diffs[i], graveler.Diff{
Type: graveler.DiffTypeAdded,
Key: []byte(e.Address),
Value: &graveler.Value{
Identity: []byte("dont care"),
Data: v,
},
})
}

test.GarbageCollectionManager.EXPECT().NewID().Return("TestRunID")
@@ -699,21 +775,27 @@ func createPrepareUncommittedTestScenario(t *testing.T, repositoryID string, num
DefaultBranchID: "main",
},
}
test.RefManager.EXPECT().GetRepository(gomock.Any(), graveler.RepositoryID(repositoryID)).Times(expectedCalls).Return(repository, nil)
test.RefManager.EXPECT().GetRepository(gomock.Any(), graveler.RepositoryID(repositoryID)).AnyTimes().Return(repository, nil)

// expect tracked addresses does not list branches, so remove one and keep at least the first
test.RefManager.EXPECT().ListBranches(gomock.Any(), gomock.Any()).Times(expectedCalls).Return(gUtils.NewFakeBranchIterator(branches), nil)
test.RefManager.EXPECT().ListBranches(gomock.Any(), gomock.Any()).AnyTimes().Return(gUtils.NewFakeBranchIterator(branches), nil)
for i := 0; i < len(branches); i++ {
sort.Slice(records[i], func(ii, jj int) bool {
return bytes.Compare(records[i][ii].Key, records[i][jj].Key) < 0
})
test.StagingManager.EXPECT().List(gomock.Any(), branches[i].StagingToken, gomock.Any()).AnyTimes().Return(cUtils.NewFakeValueIterator(records[i]))
if compact {
sort.Slice(diffs[i], func(ii, jj int) bool {
return bytes.Compare(diffs[i][ii].Key, diffs[i][jj].Key) < 0
})
test.CommittedManager.EXPECT().Diff(gomock.Any(), repository.StorageNamespace, gomock.Any(), branches[i].CompactedBaseMetaRangeID).AnyTimes().Return(gUtils.NewDiffIter(diffs[i]), nil)
}
}

if numRecords > 0 {
test.GarbageCollectionManager.EXPECT().
GetUncommittedLocation(gomock.Any(), gomock.Any()).
Times(expectedCalls).
AnyTimes().
Contributor comment:
Why? Particularly worried that we might never call this. If no UGC occurs, will the test fail somewhere else?
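If the worry is that AnyTimes() permits zero calls, one possible middle ground (a sketch, not part of this change) is gomock's MinTimes(1), which leaves the upper bound open but still fails the test when the expectation is never exercised:

```go
test.GarbageCollectionManager.EXPECT().
	GetUncommittedLocation(gomock.Any(), gomock.Any()).
	MinTimes(1). // any positive call count passes; zero calls fails the test
	DoAndReturn(func(runID string, sn graveler.StorageNamespace) (string, error) {
		return fmt.Sprintf("%s/retention/gc/uncommitted/%s/uncommitted/", "_lakefs", runID), nil
	})
```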

DoAndReturn(func(runID string, sn graveler.StorageNamespace) (string, error) {
return fmt.Sprintf("%s/retention/gc/uncommitted/%s/uncommitted/", "_lakefs", runID), nil
})
@@ -736,7 +818,6 @@ func readPhysicalAddressesFromParquetObject(t *testing.T, repositoryID string, c
require.NoError(t, err)
bufferFile := buffer.NewBufferFileFromBytes(data)
defer func() { _ = bufferFile.Close() }()

pr, err := reader.NewParquetReader(bufferFile, new(catalog.UncommittedParquetObject), 4)
require.NoError(t, err)

8 changes: 0 additions & 8 deletions pkg/catalog/fake_graveler_test.go
@@ -14,7 +14,6 @@ type FakeGraveler struct {
KeyValue map[string]*graveler.Value
Err error
ListIteratorFactory func() graveler.ValueIterator
ListStagingIteratorFactory func(token graveler.StagingToken) graveler.ValueIterator
DiffIteratorFactory func() graveler.DiffIterator
RepositoryIteratorFactory func() graveler.RepositoryIterator
BranchIteratorFactory func() graveler.BranchIterator
@@ -125,13 +124,6 @@ func (g *FakeGraveler) DeleteBatch(ctx context.Context, repository *graveler.Rep
return nil
}

func (g *FakeGraveler) ListStaging(_ context.Context, b *graveler.Branch, _ int) (graveler.ValueIterator, error) {
if g.Err != nil {
return nil, g.Err
}
return g.ListStagingIteratorFactory(b.StagingToken), nil
}

func (g *FakeGraveler) List(_ context.Context, _ *graveler.RepositoryRecord, _ graveler.Ref, _ int) (graveler.ValueIterator, error) {
if g.Err != nil {
return nil, g.Err
109 changes: 75 additions & 34 deletions pkg/catalog/gc_write_uncommitted.go
@@ -17,74 +17,115 @@ func gcWriteUncommitted(ctx context.Context, store Store, repository *graveler.R
}
pw.CompressionType = parquet.CompressionCodec_GZIP

// write uncommitted data from branches
it, err := NewUncommittedIterator(ctx, store, repository)
branchIterator, err := store.ListBranches(ctx, repository)
if err != nil {
return nil, false, err
}
defer it.Close()
defer branchIterator.Close()

if mark != nil {
it.SeekGE(mark.BranchID, mark.Path)
}
normalizedStorageNamespace := normalizeStorageNamespace(string(repository.StorageNamespace))

normalizedStorageNamespace := string(repository.StorageNamespace)
if !strings.HasSuffix(normalizedStorageNamespace, DefaultPathDelimiter) {
normalizedStorageNamespace += DefaultPathDelimiter
if mark != nil {
branchIterator.SeekGE(mark.BranchID)
}

count := 0
var nextMark *GCUncommittedMark
hasData := false
startTime := time.Now()
for branchIterator.Next() {
nextMark, count, err = processBranch(ctx, store, repository, branchIterator.Value().BranchID, runID, pw, normalizedStorageNamespace, maxFileSize, prepareDuration, w, count, mark, startTime)
if err != nil {
return nil, false, err
}
if nextMark != nil {
break
}
}
if branchIterator.Err() != nil {
return nil, false, branchIterator.Err()
}

if err := pw.WriteStop(); err != nil {
return nil, false, err
}

if count > 0 {
hasData = true
}
return nextMark, hasData, err
}

func normalizeStorageNamespace(namespace string) string {
if !strings.HasSuffix(namespace, DefaultPathDelimiter) {
namespace += DefaultPathDelimiter
}
return namespace
}

func processBranch(ctx context.Context, store Store, repository *graveler.RepositoryRecord, branchID graveler.BranchID, runID string, parquetWriter *writer.ParquetWriter, normalizedStorageNamespace string, maxFileSize int64, prepareDuration time.Duration, writer *UncommittedWriter, count int, mark *GCUncommittedMark, startTime time.Time) (*GCUncommittedMark, int, error) {
diffIterator, err := store.DiffUncommitted(ctx, repository, branchID)
if err != nil {
return nil, 0, err
}
defer diffIterator.Close()

var nextMark *GCUncommittedMark
for it.Next() {
entry := it.Value()
// Skip if entry is tombstone
if entry.Entry == nil {

if mark != nil && mark.BranchID == branchID && mark.Path != "" {
diffIterator.SeekGE(graveler.Key(mark.Path))
}

for diffIterator.Next() {
diff := diffIterator.Value()

// Skip tombstones
if diff.Type == graveler.DiffTypeRemoved {
continue
}
// Skip non-relative that address outside the storage namespace

entry, err := ValueToEntry(diff.Value)
if err != nil {
return nil, 0, err
}

// Skip non-relative addresses outside the storage namespace
entryAddress := entry.Address
if entry.Entry.AddressType != Entry_RELATIVE {
if entry.AddressType != Entry_RELATIVE {
if !strings.HasPrefix(entry.Address, normalizedStorageNamespace) {
continue
}
entryAddress = entryAddress[len(normalizedStorageNamespace):]
}

count += 1
count++
if count%gcPeriodicCheckSize == 0 {
if err := pw.Flush(true); err != nil {
return nil, false, err
if err := parquetWriter.Flush(true); err != nil {
return nil, 0, err
}
}

// check if we need to stop - based on max file size or prepare duration.
// prepare duration is optional, if 0 it will be ignored.
// prepare duration is used to stop the process in cases we scan a lot of data, and we want to stop
// so the api call will not time out.
if w.Size() > maxFileSize || (prepareDuration > 0 && time.Since(startTime) > prepareDuration) {
if writer.Size() > maxFileSize || (prepareDuration > 0 && time.Since(startTime) > prepareDuration) {
nextMark = &GCUncommittedMark{
RunID: runID,
BranchID: entry.branchID,
Path: entry.Path,
BranchID: branchID,
Path: Path(diff.Key.String()),
}
break
}
if err = pw.Write(UncommittedParquetObject{

err = parquetWriter.Write(UncommittedParquetObject{
PhysicalAddress: entryAddress,
CreationDate: entry.LastModified.AsTime().Unix(),
}); err != nil {
return nil, false, err
})
if err != nil {
return nil, 0, err
}
}
if err := it.Err(); err != nil {
return nil, false, err
}
// stop writer before we return
if err := pw.WriteStop(); err != nil {
return nil, false, err
}

// Finished reading all staging area - return marker to switch processing tracked physical addresses
hasData := count > 0
return nextMark, hasData, nil
return nextMark, count, diffIterator.Err()
}
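For readers following the control flow: processBranch returns a non-nil *GCUncommittedMark when it stops early (max file size reached or prepare duration exceeded), and gcWriteUncommitted propagates that mark so a later call can resume where the previous one stopped. A rough sketch of the resumption pattern (the writeAllUncommitted helper and its run closure are hypothetical, shown only to illustrate the loop; the real call site lives elsewhere in lakeFS):

```go
// Illustrative only: drive a gcWriteUncommitted-style paginated writer until
// no resume mark is returned, i.e. all branches have been drained.
func writeAllUncommitted(run func(mark *GCUncommittedMark) (*GCUncommittedMark, bool, error)) error {
	var mark *GCUncommittedMark
	for {
		nextMark, hasData, err := run(mark)
		if err != nil {
			return err
		}
		if hasData {
			// persist the parquet output of this pass before resuming
		}
		if nextMark == nil {
			return nil // finished: every branch was processed to the end
		}
		mark = nextMark // resume from the branch/path where the last pass stopped
	}
}
```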