Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Group.RotateFile need call Flush() before rename. #2428

Closed
goolAdapter opened this issue Sep 18, 2018 · 3 comments
Closed

Group.RotateFile need call Flush() before rename. #2428

goolAdapter opened this issue Sep 18, 2018 · 3 comments
Labels
C:libs Component: Library T:bug Type Bug (Confirmed)
Milestone

Comments

@goolAdapter
Copy link
Contributor

Group.RotateFile need call Flush() before rename, Group.headBuf may have residual content, which may cause DataCorruptionError(WALDecoder presume whole message),and can't recovery from a crash.

@goolAdapter
Copy link
Contributor Author

goolAdapter commented Sep 18, 2018

I create a test case to show this situation, 60 message will exceed the headBuf's size (which is 4096*10),and trigger the truncate, cause SearchForEndHeight failed to search height 50.

func MakeNBlocksWhitWAL(wal *baseWAL, numBlocks int) (err error) {
	config := getConfig()

	app := kvstore.NewPersistentKVStoreApplication(filepath.Join(config.DBDir(), "wal_generator"))

	logger := log.TestingLogger().With("wal_generator", "wal_generator")
	logger.Info("generating WAL (last height msg excluded)", "numBlocks", numBlocks)

	/////////////////////////////////////////////////////////////////////////////
	// COPY PASTE FROM node.go WITH A FEW MODIFICATIONS
	// NOTE: we can't import node package because of circular dependency
	privValidatorFile := config.PrivValidatorFile()
	privValidator := privval.LoadOrGenFilePV(privValidatorFile)
	genDoc, err := tmtypes.GenesisDocFromFile(config.GenesisFile())
	if err != nil {
		return errors.Wrap(err, "failed to read genesis file")
	}
	stateDB := db.NewMemDB()
	blockStoreDB := db.NewMemDB()
	state, err := sm.MakeGenesisState(genDoc)
	if err != nil {
		return errors.Wrap(err, "failed to make genesis state")
	}
	blockStore := bc.NewBlockStore(blockStoreDB)
	handshaker := NewHandshaker(stateDB, state, blockStore, genDoc)
	proxyApp := proxy.NewAppConns(proxy.NewLocalClientCreator(app), handshaker)
	proxyApp.SetLogger(logger.With("module", "proxy"))
	if err := proxyApp.Start(); err != nil {
		return errors.Wrap(err, "failed to start proxy app connections")
	}
	defer proxyApp.Stop()
	eventBus := tmtypes.NewEventBus()
	eventBus.SetLogger(logger.With("module", "events"))
	if err := eventBus.Start(); err != nil {
		return errors.Wrap(err, "failed to start event bus")
	}
	defer eventBus.Stop()
	mempool := sm.MockMempool{}
	evpool := sm.MockEvidencePool{}
	blockExec := sm.NewBlockExecutor(stateDB, log.TestingLogger(), proxyApp.Consensus(), mempool, evpool)
	consensusState := NewConsensusState(config.Consensus, state.Copy(), blockExec, blockStore, mempool, evpool)
	consensusState.SetLogger(logger)
	consensusState.SetEventBus(eventBus)
	if privValidator != nil {
		consensusState.SetPrivValidator(privValidator)
	}
	// END OF COPY PASTE
	/////////////////////////////////////////////////////////////////////////////

	// set consensus wal to buffered WAL, which will write all incoming msgs to buffer
	numBlocksWritten := make(chan struct{})
	consensusState.wal = newByteBufferWAL(logger, NewWALEncoder(wal.group), int64(numBlocks), numBlocksWritten)
	// see wal.go#103
	wal.Write(EndHeightMessage{0})

	if err := consensusState.Start(); err != nil {
		return errors.Wrap(err, "failed to start consensus state")
	}

	select {
	case <-numBlocksWritten:
		consensusState.Stop()
		wal.group.Flush()
		return nil
	case <-time.After(1 * time.Minute):
		consensusState.Stop()
		return fmt.Errorf("waited too long for tendermint to produce %d blocks (grep logs for `wal_generator`)", numBlocks)
	}
}

func processTicks(g *autofile.Group) {
	for {
		size, err := g.Head.Size()
		if err != nil {
			panic(err)
		}
		if size >= 4096 {
			g.RotateFile()
		}
	}
}

func TestWALResidual(t *testing.T) {
	walDir, err := ioutil.TempDir("", "wal")
	if err != nil {
		panic(fmt.Errorf("failed to create temp WAL file: %v", err))
	}

	walFile := filepath.Join(walDir, "wal")

	wal, err := NewWAL(walFile)
	if err != nil {
		t.Fatal(err)
	}

	go processTicks(wal.group)

	err = MakeNBlocksWhitWAL(wal, 60)
	if err != nil {
		t.Fatal(err)
	}

	h := int64(50)
	gr, found, err := wal.SearchForEndHeight(h, &WALSearchOptions{})
	assert.NoError(t, err, fmt.Sprintf("expected not to err on height %d", h))
	assert.True(t, found, fmt.Sprintf("expected to find end height for %d", h))
	assert.NotNil(t, gr, "expected group not to be nil")
	defer gr.Close()

	dec := NewWALDecoder(gr)
	msg, err := dec.Decode()
	assert.NoError(t, err, "expected to decode a message")
	rs, ok := msg.Msg.(tmtypes.EventDataRoundState)
	assert.True(t, ok, "expected message of type EventDataRoundState")
	assert.Equal(t, rs.Height, h+1, fmt.Sprintf("wrong height"))
}

The modified version RotateFile will pass it.

// RotateFile causes group to close the current head and assign it some index.
// Note it does not create a new head.
func (g *Group) RotateFile() {
	g.mtx.Lock()
	defer g.mtx.Unlock()

	headPath := g.Head.Path

	err := g.headBuf.Flush()
	if err = g.Head.closeFile(); err != nil {
		panic(err)
	}

	if err = g.Head.Sync(); err != nil {
		panic(err)
	}

	if err = g.Head.closeFile(); err != nil {
		panic(err)
	}

	indexPath := filePathForIndex(headPath, g.maxIndex, g.maxIndex+1)
	if err := os.Rename(headPath, indexPath); err != nil {
		panic(err)
	}

	g.maxIndex++
}

@xla
Copy link
Contributor

xla commented Sep 18, 2018

@goolAdapter Thanks for reporting and investigating the issue, would be able to move your code and tests into a PR? That'd be highly appreciated.

@xla xla added T:bug Type Bug (Confirmed) user C:libs Component: Library labels Sep 18, 2018
@xla xla modified the milestones: launch, v1.0 Sep 18, 2018
goolAdapter added a commit to goolAdapter/tendermint that referenced this issue Sep 19, 2018
goolAdapter added a commit to goolAdapter/tendermint that referenced this issue Sep 19, 2018
xla pushed a commit that referenced this issue Sep 25, 2018
* fix Group.RotateFile need call Flush() before rename. #2428
* fix some review issue. #2428
 refactor Group's config: replace  setting member with initial option
* fix a handwriting mistake
* fix a time window error between rename and write.
* fix a syntax mistake.
* change option name Get_ to With_
* fix review issue
* fix review issue
@melekes
Copy link
Contributor

melekes commented Sep 28, 2018

Merged to develop.

@melekes melekes closed this as completed Sep 28, 2018
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
C:libs Component: Library T:bug Type Bug (Confirmed)
Projects
None yet
Development

No branches or pull requests

3 participants