Skip to content

Commit

Permalink
mds: blocklist clients with "bloated" session metadata
Browse files Browse the repository at this point in the history
Buggy clients (or maybe a MDS bug) causes a huge buildup of
`completed_requests` metadata in its session information.
This could cause the MDS to go read-only when its flushing
session metadata to the journal since the bloated metadata
causes the ODSOp payload to exceed the maximum write size.

Blocklist such clients so as to allow the MDS to continue
servicing requests.

Fixes: http://tracker.ceph.com/issues/61947
Signed-off-by: Venky Shankar <vshankar@redhat.com>
  • Loading branch information
vshankar committed Aug 22, 2023
1 parent de99a52 commit bc6814d
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 16 deletions.
10 changes: 10 additions & 0 deletions src/common/options/mds.yaml.in
Expand Up @@ -1577,3 +1577,13 @@ options:
- mds
flags:
- runtime
- name: mds_session_metadata_threshold
type: size
level: advanced
desc: Evict non-advancing client-tid sessions exceeding the config size.
long_desc: Evict clients which are not advancing their request tids which causes a large buildup of session metadata (`completed_requests`) in the MDS causing the MDS to go read-only since the RADOS operation exceeds the size threashold. This config is the maximum size (in bytes) that a session metadata (encoded) can grow.
default: 16_M
services:
- mds
flags:
- runtime
1 change: 1 addition & 0 deletions src/mds/MDSRank.cc
Expand Up @@ -3867,6 +3867,7 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const
"mds_session_cap_acquisition_throttle",
"mds_session_max_caps_throttle_ratio",
"mds_symlink_recovery",
"mds_session_metadata_threshold",
NULL
};
return KEYS;
Expand Down
75 changes: 60 additions & 15 deletions src/mds/SessionMap.cc
Expand Up @@ -45,6 +45,11 @@ class SessionMapIOContext : public MDSIOContextBase
};
};

SessionMap::SessionMap(MDSRank *m)
: mds(m),
mds_session_metadata_threshold(g_conf().get_val<Option::size_t>("mds_session_metadata_threshold")) {
}

void SessionMap::register_perfcounters()
{
PerfCountersBuilder plb(g_ceph_context, "mds_sessions",
Expand Down Expand Up @@ -375,6 +380,11 @@ class C_IO_SM_Save : public SessionMapIOContext {
};
}

bool SessionMap::validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl) {
session->info.encode(bl, mds->mdsmap->get_up_features());
return bl.length() < mds_session_metadata_threshold;
}

void SessionMap::save(MDSContext *onsave, version_t needv)
{
dout(10) << __func__ << ": needv " << needv << ", v " << version << dendl;
Expand Down Expand Up @@ -410,6 +420,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv)

dout(20) << " updating keys:" << dendl;
map<string, bufferlist> to_set;
std::set<entity_name_t> to_blocklist;
for(std::set<entity_name_t>::iterator i = dirty_sessions.begin();
i != dirty_sessions.end(); ++i) {
const entity_name_t name = *i;
Expand All @@ -420,13 +431,19 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
session->is_stale() ||
session->is_killing()) {
dout(20) << " " << name << dendl;
// Serialize K
CachedStackStringStream css;
*css << name;

// Serialize V
bufferlist bl;
session->info.encode(bl, mds->mdsmap->get_up_features());
if (!validate_and_encode_session(mds, session, bl)) {
derr << __func__ << ": session (" << name << ") exceeds"
<< " sesion metadata threshold - blocklisting" << dendl;
to_blocklist.emplace(name);
continue;
}

// Serialize K
CachedStackStringStream css;
*css << name;

// Add to RADOS op
to_set[std::string(css->strv())] = bl;
Expand Down Expand Up @@ -461,6 +478,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
0,
new C_OnFinisher(new C_IO_SM_Save(this, version),
mds->finisher));
apply_blocklist(to_blocklist);
}

void SessionMap::_save_finish(version_t v)
Expand Down Expand Up @@ -826,7 +844,8 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
{
ceph_assert(gather_bld != NULL);

std::vector<entity_name_t> write_sessions;
std::set<entity_name_t> to_blocklist;
std::map<entity_name_t, bufferlist> write_sessions;

// Decide which sessions require a write
for (std::set<entity_name_t>::iterator i = tgt_sessions.begin();
Expand All @@ -851,35 +870,40 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
// need to pre-empt that.
continue;
}

// Serialize V
bufferlist bl;
if (!validate_and_encode_session(mds, session, bl)) {
derr << __func__ << ": session (" << session_id << ") exceeds"
<< " sesion metadata threshold - blocklisting" << dendl;
to_blocklist.emplace(session_id);
continue;
}

// Okay, passed all our checks, now we write
// this session out. The version we write
// into the OMAP may now be higher-versioned
// than the version in the header, but that's
// okay because it's never a problem to have
// an overly-fresh copy of a session.
write_sessions.push_back(*i);
write_sessions.emplace(session_id, std::move(bl));
session->clear_dirty_completed_requests();
}

dout(4) << __func__ << ": writing " << write_sessions.size() << dendl;

// Batch writes into mds_sessionmap_keys_per_op
const uint32_t kpo = g_conf()->mds_sessionmap_keys_per_op;
map<string, bufferlist> to_set;
for (uint32_t i = 0; i < write_sessions.size(); ++i) {
const entity_name_t &session_id = write_sessions[i];
Session *session = session_map[session_id];
session->clear_dirty_completed_requests();

uint32_t i = 0;
for (auto &[session_id, bl] : write_sessions) {
// Serialize K
CachedStackStringStream css;
*css << session_id;

// Serialize V
bufferlist bl;
session->info.encode(bl, mds->mdsmap->get_up_features());

// Add to RADOS op
to_set[css->str()] = bl;
to_set[css->str()] = std::move(bl);

// Complete this write transaction?
if (i == write_sessions.size() - 1
Expand All @@ -898,7 +922,10 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
new C_IO_SM_Save_One(this, on_safe),
mds->finisher));
}
++i;
}

apply_blocklist(to_blocklist);
}

// =================
Expand Down Expand Up @@ -1112,6 +1139,10 @@ void SessionMap::handle_conf_change(const std::set<std::string>& changed)
};
apply_to_open_sessions(mut);
}

if (changed.count("mds_session_metadata_threshold")) {
mds_session_metadata_threshold = g_conf().get_val<Option::size_t>("mds_session_metadata_threshold");
}
}

void SessionMap::update_average_session_age() {
Expand All @@ -1123,6 +1154,20 @@ void SessionMap::update_average_session_age() {
logger->set(l_mdssm_avg_session_uptime, (uint64_t)avg_uptime);
}

void SessionMap::apply_blocklist(const std::set<entity_name_t>& victims) {
if (victims.empty()) {
return;
}

C_GatherBuilder gather(g_ceph_context, new C_MDSInternalNoop);
for (auto &victim : victims) {
CachedStackStringStream css;
mds->evict_client(victim.num(), false, g_conf()->mds_session_blocklist_on_evict, *css,
gather.new_sub());
}
gather.activate();
}

int SessionFilter::parse(
const std::vector<std::string> &args,
std::ostream *ss)
Expand Down
7 changes: 6 additions & 1 deletion src/mds/SessionMap.h
Expand Up @@ -594,7 +594,7 @@ class SessionMapStore {
class SessionMap : public SessionMapStore {
public:
SessionMap() = delete;
explicit SessionMap(MDSRank *m) : mds(m) {}
explicit SessionMap(MDSRank *m);

~SessionMap() override
{
Expand Down Expand Up @@ -843,6 +843,11 @@ class SessionMap : public SessionMapStore {
}

time avg_birth_time = clock::zero();

size_t mds_session_metadata_threshold;

bool validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl);
void apply_blocklist(const std::set<entity_name_t>& victims);
};

std::ostream& operator<<(std::ostream &out, const Session &s);
Expand Down

0 comments on commit bc6814d

Please sign in to comment.