Skip to content

Commit

Permalink
Drop column family from write thread
Browse files Browse the repository at this point in the history
Summary: If we drop column family only from (single) write thread, we can be sure that nobody will drop the column family while we're writing (and our mutex is released). This greatly simplifies my patch that's getting rid of MakeRoomForWrite().

Test Plan: make check, but also running stress test

Reviewers: ljin, sdong

Reviewed By: sdong

Subscribers: leveldb

Differential Revision: https://reviews.facebook.net/D22965
  • Loading branch information
igorcanadi committed Sep 5, 2014
1 parent 8de151b commit 9f1c80b
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 22 deletions.
44 changes: 23 additions & 21 deletions db/db_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -77,20 +77,6 @@ const std::string kDefaultColumnFamilyName("default");

void DumpLeveldbBuildVersion(Logger * log);

// Information kept for every waiting writer
struct DBImpl::Writer {
Status status;
WriteBatch* batch;
bool sync;
bool disableWAL;
bool in_batch_group;
bool done;
uint64_t timeout_hint_us;
port::CondVar cv;

explicit Writer(port::Mutex* mu) : cv(mu) { }
};

struct DBImpl::WriteContext {
autovector<SuperVersion*> superversions_to_free_;
autovector<log::Writer*> logs_to_free_;
Expand Down Expand Up @@ -3627,14 +3613,26 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
edit.DropColumnFamily();
edit.SetColumnFamily(cfd->GetID());

Writer w(&mutex_);
w.batch = nullptr;
w.sync = false;
w.disableWAL = false;
w.in_batch_group = false;
w.done = false;
w.timeout_hint_us = kNoTimeOut;

Status s;
{
MutexLock l(&mutex_);
if (cfd->IsDropped()) {
s = Status::InvalidArgument("Column family already dropped!\n");
}
if (s.ok()) {
// we drop column family from a single write thread
s = BeginWrite(&w, 0);
assert(s.ok() && !w.done); // No timeout and nobody should do our job
s = versions_->LogAndApply(cfd, &edit, &mutex_);
EndWrite(&w, &w, s);
}
}

Expand Down Expand Up @@ -4173,15 +4171,19 @@ void DBImpl::BuildBatchGroup(Writer** last_writer,
break;
}

if (w->batch != nullptr) {
size += WriteBatchInternal::ByteSize(w->batch);
if (size > max_size) {
// Do not make batch too big
break;
}
if (w->batch == nullptr) {
// Do not include those writes with nullptr batch. Those are not writes,
// those are something else. They want to be alone
break;
}

write_batch_group->push_back(w->batch);
size += WriteBatchInternal::ByteSize(w->batch);
if (size > max_size) {
// Do not make batch too big
break;
}

write_batch_group->push_back(w->batch);
w->in_batch_group = true;
*last_writer = w;
}
Expand Down
27 changes: 26 additions & 1 deletion db/db_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,17 @@ class DBImpl : public DB {
SequenceNumber* sequence);

Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence);

void TEST_LockMutex();

void TEST_UnlockMutex();

// REQUIRES: mutex locked
void* TEST_BeginWrite();

// REQUIRES: mutex locked
// pass the pointer that you got from TEST_BeginWrite()
void TEST_EndWrite(void* w);
#endif // NDEBUG

// Structure to store information for candidate files to delete.
Expand Down Expand Up @@ -309,7 +320,7 @@ class DBImpl : public DB {
#endif
friend struct SuperVersion;
struct CompactionState;
struct Writer;

struct WriteContext;

Status NewDB();
Expand Down Expand Up @@ -349,6 +360,20 @@ class DBImpl : public DB {

uint64_t SlowdownAmount(int n, double bottom, double top);

// Information kept for every waiting writer
struct Writer {
Status status;
WriteBatch* batch;
bool sync;
bool disableWAL;
bool in_batch_group;
bool done;
uint64_t timeout_hint_us;
port::CondVar cv;

explicit Writer(port::Mutex* mu) : cv(mu) {}
};

// Before applying write operation (such as DBImpl::Write, DBImpl::Flush)
// thread should grab the mutex_ and be the first on writers queue.
// BeginWrite is used for it.
Expand Down
27 changes: 27 additions & 0 deletions db/db_impl_debug.cc
Original file line number Diff line number Diff line change
Expand Up @@ -130,5 +130,32 @@ Status DBImpl::TEST_ReadFirstLine(const std::string& fname,
SequenceNumber* sequence) {
return ReadFirstLine(fname, sequence);
}

void DBImpl::TEST_LockMutex() {
mutex_.Lock();
}

void DBImpl::TEST_UnlockMutex() {
mutex_.Unlock();
}

void* DBImpl::TEST_BeginWrite() {
auto w = new Writer(&mutex_);
w->batch = nullptr;
w->sync = false;
w->disableWAL = false;
w->in_batch_group = false;
w->done = false;
w->timeout_hint_us = kNoTimeOut;
Status s = BeginWrite(w, 0);
assert(s.ok() && !w->done); // No timeout and nobody should do our job
return reinterpret_cast<void*>(w);
}

void DBImpl::TEST_EndWrite(void* w) {
auto writer = reinterpret_cast<Writer*>(w);
EndWrite(writer, writer, Status::OK());
}

} // namespace rocksdb
#endif // ROCKSDB_LITE
21 changes: 21 additions & 0 deletions db/db_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <iostream>
#include <set>
#include <unistd.h>
#include <thread>
#include <unordered_set>
#include <utility>

Expand Down Expand Up @@ -7894,6 +7895,26 @@ TEST(DBTest, DBIteratorBoundTest) {
}
}

TEST(DBTest, WriteSingleThreadEntry) {
std::vector<std::thread> threads;
dbfull()->TEST_LockMutex();
auto w = dbfull()->TEST_BeginWrite();
threads.emplace_back([&] { Put("a", "b"); });
env_->SleepForMicroseconds(10000);
threads.emplace_back([&] { Flush(); });
env_->SleepForMicroseconds(10000);
dbfull()->TEST_UnlockMutex();
dbfull()->TEST_LockMutex();
dbfull()->TEST_EndWrite(w);
dbfull()->TEST_UnlockMutex();

for (auto& t : threads) {
t.join();
}
}



} // namespace rocksdb

int main(int argc, char** argv) {
Expand Down

0 comments on commit 9f1c80b

Please sign in to comment.