Skip to content

Commit

Permalink
bulk-checkin: replace fast-import based implementation
Browse files Browse the repository at this point in the history
This extends the earlier approach to stream a large file directly from the
filesystem to its own packfile, and allows "git add" to send large files
directly into a single pack. Older code used to spawn fast-import, but the
new bulk-checkin API replaces it.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
  • Loading branch information
gitster committed Dec 1, 2011
1 parent 6c52614 commit 568508e
Show file tree
Hide file tree
Showing 11 changed files with 403 additions and 78 deletions.
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,7 @@ LIB_H += argv-array.h
LIB_H += attr.h
LIB_H += blob.h
LIB_H += builtin.h
LIB_H += bulk-checkin.h
LIB_H += cache.h
LIB_H += cache-tree.h
LIB_H += color.h
Expand Down Expand Up @@ -591,6 +592,7 @@ LIB_OBJS += base85.o
LIB_OBJS += bisect.o
LIB_OBJS += blob.o
LIB_OBJS += branch.o
LIB_OBJS += bulk-checkin.o
LIB_OBJS += bundle.o
LIB_OBJS += cache-tree.o
LIB_OBJS += color.o
Expand Down
5 changes: 5 additions & 0 deletions builtin/add.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "diff.h"
#include "diffcore.h"
#include "revision.h"
#include "bulk-checkin.h"

static const char * const builtin_add_usage[] = {
"git add [options] [--] <filepattern>...",
Expand Down Expand Up @@ -458,11 +459,15 @@ int cmd_add(int argc, const char **argv, const char *prefix)
free(seen);
}

plug_bulk_checkin();

exit_status |= add_files_to_cache(prefix, pathspec, flags);

if (add_new_files)
exit_status |= add_files(&dir, flags);

unplug_bulk_checkin();

finish:
if (active_cache_changed) {
if (write_cache(newfd, active_cache, active_nr) ||
Expand Down
6 changes: 1 addition & 5 deletions builtin/pack-objects.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ static struct pack_idx_option pack_idx_opts;
static const char *base_name;
static int progress = 1;
static int window = 10;
static unsigned long pack_size_limit, pack_size_limit_cfg;
static unsigned long pack_size_limit;
static int depth = 50;
static int delta_search_threads;
static int pack_to_stdout;
Expand Down Expand Up @@ -2009,10 +2009,6 @@ static int git_pack_config(const char *k, const char *v, void *cb)
pack_idx_opts.version);
return 0;
}
if (!strcmp(k, "pack.packsizelimit")) {
pack_size_limit_cfg = git_config_ulong(k, v);
return 0;
}
return git_default_config(k, v, cb);
}

Expand Down
275 changes: 275 additions & 0 deletions bulk-checkin.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,275 @@
/*
* Copyright (c) 2011, Google Inc.
*/
#include "bulk-checkin.h"
#include "csum-file.h"
#include "pack.h"

/* zlib compression level handed to git_deflate_init() in stream_to_pack() */
static int pack_compression_level = Z_DEFAULT_COMPRESSION;

/*
 * Bookkeeping for the packfile that bulk checkin is currently filling.
 * A single file-scope instance, "state", is shared by all entry points.
 */
static struct bulk_checkin_state {
/* set by plug_bulk_checkin(), cleared by unplug_bulk_checkin() */
unsigned plugged:1;

/* name of the temporary packfile, filled in by create_tmp_packfile() */
char *pack_tmp_name;
/* output stream of the temporary packfile; NULL while no pack is open */
struct sha1file *f;
/*
 * Running byte position in the pack: starts at the value returned by
 * write_pack_header() and grows by every chunk handed to sha1write().
 */
off_t offset;
/* index-writing options, reset via reset_pack_idx_option() per pack */
struct pack_idx_option pack_idx_opts;

/* index entries of the objects written to the current pack */
struct pack_idx_entry **written;
/* allocated capacity of "written" (maintained by ALLOC_GROW) */
uint32_t alloc_written;
/* number of valid entries in "written" */
uint32_t nr_written;
} state;

/*
 * Close out the packfile currently held in "state", if there is one.
 *
 * - No object written: the temporary pack is closed and unlinked.
 * - Exactly one object: the header written by prepare_to_stream()
 *   already claims one object, so the file is simply closed (with
 *   fsync).
 * - More than one object: the header and trailer are fixed up to
 *   carry the real object count before the pack is finalized.
 *
 * Afterwards the per-pack bookkeeping is reset so that the next object
 * starts a fresh pack, and the packed-git list is re-read so that the
 * objects just written are visible to this process.
 *
 * The "plugged" bit is NOT part of the per-pack bookkeeping and is
 * carried over the reset.  This function is also called from the
 * middle of deflate_to_pack() when the pack size limit is hit; wiping
 * "plugged" there would silently turn off batching and put every later
 * object in a pack of its own.
 */
static void finish_bulk_checkin(struct bulk_checkin_state *state)
{
	unsigned char sha1[20];
	char packname[PATH_MAX];
	unsigned plugged = state->plugged;
	uint32_t i;

	if (!state->f)
		return;

	if (state->nr_written == 0) {
		/*
		 * NOTE(review): only the descriptor is closed here; whether
		 * state->f and state->pack_tmp_name need to be released is
		 * not visible from this file -- confirm against csum-file
		 * and create_tmp_packfile().
		 */
		close(state->f->fd);
		unlink(state->pack_tmp_name);
		goto clear_exit;
	} else if (state->nr_written == 1) {
		sha1close(state->f, sha1, CSUM_FSYNC);
	} else {
		int fd = sha1close(state->f, sha1, 0);
		fixup_pack_header_footer(fd, sha1, state->pack_tmp_name,
					 state->nr_written, sha1,
					 state->offset);
		close(fd);
	}

	/*
	 * NOTE(review): unbounded sprintf() into a PATH_MAX buffer that
	 * finish_tmp_packfile() presumably appends to; a very long object
	 * directory could overflow it -- consider a length check.
	 */
	sprintf(packname, "%s/pack/pack-", get_object_directory());
	finish_tmp_packfile(packname, state->pack_tmp_name,
			    state->written, state->nr_written,
			    &state->pack_idx_opts, sha1);
	for (i = 0; i < state->nr_written; i++)
		free(state->written[i]);

clear_exit:
	free(state->written);
	memset(state, 0, sizeof(*state));
	/* the plug belongs to the caller's session, not to this pack */
	state->plugged = plugged;

	/* Make objects we just wrote available to ourselves */
	reprepare_packed_git();
}

/*
 * Tell whether the object named by sha1 needs no (further) copy:
 * returns 1 when has_sha1_file() already finds it in the repository or
 * when it is among the objects recorded in state->written for the
 * current pack, 0 when it is new.
 */
static int already_written(struct bulk_checkin_state *state, unsigned char sha1[])
{
	/* same type as state->nr_written, avoiding a signed/unsigned compare */
	uint32_t i;

	/* The object may already exist in the repository */
	if (has_sha1_file(sha1))
		return 1;

	/* Might want to keep the list sorted */
	for (i = 0; i < state->nr_written; i++)
		if (!hashcmp(state->written[i]->sha1, sha1))
			return 1;

	/* This is a new object we need to keep */
	return 0;
}

/*
 * Read "size" bytes from fd and stream them, deflated, into the
 * packfile held in state, feeding the raw bytes to the hash in ctx
 * along the way.  Nothing is written to the pack unless flags has
 * HASH_WRITE_OBJECT set; the hash is updated either way.
 *
 * Returns 0 on success.  Returns a negative value when the resulting
 * pack would exceed pack_size_limit_cfg and this is not the first
 * object in the pack.  In that case the deflate stream has been
 * aborted, but earlier chunks of this object may already have been
 * written; the caller is expected to discard them by truncating the
 * pack, open a new one, rewind fd and call us again.
 *
 * *already_hashed_to records how many input bytes have been fed to ctx
 * so far.  The caller must leave it untouched between such retries so
 * that no byte is hashed twice; this spares the caller from having to
 * checkpoint the hash state before every call.
 *
 * Dies when the input cannot be read in full or when deflate reports
 * an unexpected status.  "path" is only used in the error message.
 */
static int stream_to_pack(struct bulk_checkin_state *state,
			  git_SHA_CTX *ctx, off_t *already_hashed_to,
			  int fd, size_t size, enum object_type type,
			  const char *path, unsigned flags)
{
	git_zstream s;
	/*
	 * s.next_in keeps pointing into ibuf across loop iterations
	 * whenever deflate fills the output buffer before consuming all
	 * input, so the buffer must live at function scope.
	 */
	unsigned char ibuf[16384];
	unsigned char obuf[16384];
	unsigned hdrlen;
	int status = Z_OK;
	int write_object = (flags & HASH_WRITE_OBJECT);
	off_t offset = 0;

	memset(&s, 0, sizeof(s));
	git_deflate_init(&s, pack_compression_level);

	/* the in-pack object header goes out ahead of the deflated data */
	hdrlen = encode_in_pack_object_header(type, size, obuf);
	s.next_out = obuf + hdrlen;
	s.avail_out = sizeof(obuf) - hdrlen;

	while (status != Z_STREAM_END) {
		if (size && !s.avail_in) {
			ssize_t rsize = size < sizeof(ibuf) ? size : sizeof(ibuf);
			ssize_t filled = 0;

			/*
			 * A single read may legitimately return fewer bytes
			 * than asked for; keep going until the chunk is
			 * complete, and give up only on error or EOF.
			 */
			while (filled < rsize) {
				ssize_t got = xread(fd, ibuf + filled,
						    rsize - filled);
				if (got <= 0)
					break;
				filled += got;
			}
			if (filled != rsize)
				die("failed to read %d bytes from '%s'",
				    (int)rsize, path);
			offset += rsize;
			if (*already_hashed_to < offset) {
				size_t hsize = offset - *already_hashed_to;
				if ((size_t)rsize < hsize)
					hsize = rsize;
				/*
				 * The bytes not yet hashed are the last
				 * hsize bytes of this chunk.
				 */
				if (hsize)
					git_SHA1_Update(ctx,
							ibuf + (rsize - hsize),
							hsize);
				*already_hashed_to = offset;
			}
			s.next_in = ibuf;
			s.avail_in = rsize;
			size -= rsize;
		}

		/* once all input has been read, ask deflate to finish */
		status = git_deflate(&s, size ? 0 : Z_FINISH);

		if (!s.avail_out || status == Z_STREAM_END) {
			if (write_object) {
				size_t written = s.next_out - obuf;

				/* would we bust the size limit? */
				if (state->nr_written &&
				    pack_size_limit_cfg &&
				    pack_size_limit_cfg < state->offset + written) {
					git_deflate_abort(&s);
					return -1;
				}

				sha1write(state->f, obuf, written);
				state->offset += written;
			}
			s.next_out = obuf;
			s.avail_out = sizeof(obuf);
		}

		switch (status) {
		case Z_OK:
		case Z_BUF_ERROR:
		case Z_STREAM_END:
			continue;
		default:
			die("unexpected deflate failure: %d", status);
		}
	}
	git_deflate_end(&s);
	return 0;
}

/*
 * Make sure state has a temporary packfile open before an object is
 * streamed into it.  Does nothing when the caller is not writing
 * objects (HASH_WRITE_OBJECT unset) or when a pack is already open.
 */
static void prepare_to_stream(struct bulk_checkin_state *state,
			      unsigned flags)
{
	int writing = (flags & HASH_WRITE_OBJECT) != 0;

	if (!writing)
		return;
	if (state->f)
		return;

	state->f = create_tmp_packfile(&state->pack_tmp_name);
	reset_pack_idx_option(&state->pack_idx_opts);

	/*
	 * The header claims a single object for now; the count is
	 * corrected when the pack is finished with more than one.
	 */
	state->offset = write_pack_header(state->f, 1);
	if (state->offset == 0)
		die_errno("unable to write pack header");
}

static int deflate_to_pack(struct bulk_checkin_state *state,
unsigned char result_sha1[],
int fd, size_t size,
enum object_type type, const char *path,
unsigned flags)
{
off_t seekback, already_hashed_to;
git_SHA_CTX ctx;
unsigned char obuf[16384];
unsigned header_len;
struct sha1file_checkpoint checkpoint;
struct pack_idx_entry *idx = NULL;

seekback = lseek(fd, 0, SEEK_CUR);
if (seekback == (off_t) -1)
return error("cannot find the current offset");

header_len = sprintf((char *)obuf, "%s %" PRIuMAX,
typename(type), (uintmax_t)size) + 1;
git_SHA1_Init(&ctx);
git_SHA1_Update(&ctx, obuf, header_len);

/* Note: idx is non-NULL when we are writing */
if ((flags & HASH_WRITE_OBJECT) != 0)
idx = xcalloc(1, sizeof(*idx));

already_hashed_to = 0;

while (1) {
prepare_to_stream(state, flags);
if (idx) {
sha1file_checkpoint(state->f, &checkpoint);
idx->offset = state->offset;
crc32_begin(state->f);
}
if (!stream_to_pack(state, &ctx, &already_hashed_to,
fd, size, type, path, flags))
break;
/*
* Writing this object to the current pack will make
* it too big; we need to truncate it, start a new
* pack, and write into it.
*/
if (!idx)
die("BUG: should not happen");
sha1file_truncate(state->f, &checkpoint);
state->offset = checkpoint.offset;
finish_bulk_checkin(state);
if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
return error("cannot seek back");
}
git_SHA1_Final(result_sha1, &ctx);
if (!idx)
return 0;

idx->crc32 = crc32_end(state->f);
if (already_written(state, result_sha1)) {
sha1file_truncate(state->f, &checkpoint);
state->offset = checkpoint.offset;
free(idx);
} else {
hashcpy(idx->sha1, result_sha1);
ALLOC_GROW(state->written,
state->nr_written + 1,
state->alloc_written);
state->written[state->nr_written++] = idx;
}
return 0;
}

int index_bulk_checkin(unsigned char *sha1,
int fd, size_t size, enum object_type type,
const char *path, unsigned flags)
{
int status = deflate_to_pack(&state, sha1, fd, size, type,
path, flags);
if (!state.plugged)
finish_bulk_checkin(&state);
return status;
}

/*
 * Mark bulk checkin as plugged: while the mark is set,
 * index_bulk_checkin() leaves the pack open after each object instead
 * of finishing it.
 */
void plug_bulk_checkin(void)
{
	state.plugged = 1;
}

/*
 * Clear the plugged mark and, if a pack is currently open, finish it.
 */
void unplug_bulk_checkin(void)
{
	state.plugged = 0;

	if (!state.f)
		return;
	finish_bulk_checkin(&state);
}
16 changes: 16 additions & 0 deletions bulk-checkin.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/*
* Copyright (c) 2011, Google Inc.
*/
#ifndef BULK_CHECKIN_H
#define BULK_CHECKIN_H

#include "cache.h"

/*
 * Hash "size" bytes read from fd as an object of the given type and
 * leave the object name in sha1.  When flags has HASH_WRITE_OBJECT
 * set, the deflated object is also streamed into a packfile.  "path"
 * is used in error messages.  Returns 0 on success, or the value of
 * error() when the position of fd cannot be queried or restored.
 */
extern int index_bulk_checkin(unsigned char sha1[],
int fd, size_t size, enum object_type type,
const char *path, unsigned flags);

/*
 * Between plug_bulk_checkin() and unplug_bulk_checkin(), calls to
 * index_bulk_checkin() leave the packfile open after each object;
 * unplugging finishes the open packfile, if any.
 */
extern void plug_bulk_checkin(void);
extern void unplug_bulk_checkin(void);

#endif
2 changes: 2 additions & 0 deletions cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ int git_inflate(git_zstream *, int flush);
void git_deflate_init(git_zstream *, int level);
void git_deflate_init_gzip(git_zstream *, int level);
void git_deflate_end(git_zstream *);
int git_deflate_abort(git_zstream *);
int git_deflate_end_gently(git_zstream *);
int git_deflate(git_zstream *, int flush);
unsigned long git_deflate_bound(git_zstream *, unsigned long);
Expand Down Expand Up @@ -598,6 +599,7 @@ extern size_t packed_git_window_size;
extern size_t packed_git_limit;
extern size_t delta_base_cache_limit;
extern unsigned long big_file_threshold;
extern unsigned long pack_size_limit_cfg;
extern int read_replace_refs;
extern int fsync_object_files;
extern int core_preload_index;
Expand Down
4 changes: 4 additions & 0 deletions config.c
Original file line number Diff line number Diff line change
Expand Up @@ -797,6 +797,10 @@ int git_default_config(const char *var, const char *value, void *dummy)
return 0;
}

if (!strcmp(var, "pack.packsizelimit")) {
pack_size_limit_cfg = git_config_ulong(var, value);
return 0;
}
/* Add other config variables here and to Documentation/config.txt. */
return 0;
}
Expand Down
1 change: 1 addition & 0 deletions environment.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ char *notes_ref_name;
int grafts_replace_parents = 1;
int core_apply_sparse_checkout;
struct startup_info *startup_info;
unsigned long pack_size_limit_cfg;

/* Parallel index stat data preload? */
int core_preload_index = 0;
Expand Down
Loading

0 comments on commit 568508e

Please sign in to comment.