Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Deadlock between starting a slave and reading system variables
Starting the SQL thread might deadlock with reading the values of the
replication filtering options.

The deadlock is due to a lock order violation when the variables are
read or set. For example, reading replicate_ignore_table first
acquires LOCK_global_system_variables in sys_var::value_ptr and later
acquires LOCK_active_mi in Sys_var_rpl_filter::global_value_ptr. This
violates the order established when starting a SQL thread, where
LOCK_active_mi is acquired before start_slave, and ends up creating a
thread (handle_slave_sql) that allocates a THD handle whose
constructor acquires LOCK_global_system_variables in THD::init.

The solution is to unlock LOCK_global_system_variables before the
replication filtering options are set or read. This way the lock
order is preserved and the data being read/set is still protected
given that it acquires LOCK_active_mi.
  • Loading branch information
Davi Arnaut committed Jun 20, 2012
1 parent 5ef4072 commit be79b54
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 11 deletions.
31 changes: 31 additions & 0 deletions mysql-test/suite/rpl/r/rpl_start_slave_deadlock_sys_vars.result
@@ -0,0 +1,31 @@
include/master-slave.inc
[connection master]
# connection: slave
SET @save_slave_net_timeout = @@GLOBAL.slave_net_timeout;
STOP SLAVE;
include/wait_for_slave_to_stop.inc
# open an extra connection to the slave
# connection: slave2
# set debug synchronization point
SET DEBUG_SYNC='fix_slave_net_timeout SIGNAL parked WAIT_FOR go';
# attempt to set slave_net_timeout, will wait on sync point
SET @@GLOBAL.slave_net_timeout = 100;
# connection: slave
SET DEBUG_SYNC='now WAIT_FOR parked';
# connection: slave1
# attempt to start the SQL thread
START SLAVE SQL_THREAD;
# connection: slave
# wait until SQL thread has been started
# sleep a bit so that the SQL thread THD handle is initialized
# signal the set slave_net_timeout to continue
SET DEBUG_SYNC='now SIGNAL go';
# connection: slave2
# reap result of set slave_net_timeout
# connection: slave1
# reap result of starting the SQL thread
# disconnect: slave2
# connection: slave
# cleanup
SET @@GLOBAL.slave_net_timeout = @save_slave_net_timeout;
include/rpl_end.inc
57 changes: 57 additions & 0 deletions mysql-test/suite/rpl/t/rpl_start_slave_deadlock_sys_vars.test
@@ -0,0 +1,57 @@
source include/master-slave.inc;
source include/have_debug_sync.inc;

--echo # connection: slave
connection slave;
SET @save_slave_net_timeout = @@GLOBAL.slave_net_timeout;
STOP SLAVE;
source include/wait_for_slave_to_stop.inc;

--echo # open an extra connection to the slave
connect(slave2,127.0.0.1,root,,test,$SLAVE_MYPORT,);
--echo # connection: slave2
--echo # set debug synchronization point
SET DEBUG_SYNC='fix_slave_net_timeout SIGNAL parked WAIT_FOR go';
--echo # attempt to set slave_net_timeout, will wait on sync point
--send SET @@GLOBAL.slave_net_timeout = 100

--echo # connection: slave
connection slave;
SET DEBUG_SYNC='now WAIT_FOR parked';

--echo # connection: slave1
connection slave1;
--echo # attempt to start the SQL thread
--send START SLAVE SQL_THREAD

--echo # connection: slave
connection slave;
--echo # wait until SQL thread has been started
let $wait_condition=
select count(*) = 1 from information_schema.processlist
where state = "Waiting for slave thread to start" and info = "START SLAVE SQL_THREAD";
--source include/wait_condition.inc
--echo # sleep a bit so that the SQL thread THD handle is initialized
sleep 2;
--echo # signal the set slave_net_timeout to continue
SET DEBUG_SYNC='now SIGNAL go';

--echo # connection: slave2
connection slave2;
--echo # reap result of set slave_net_timeout
--reap

--echo # connection: slave1
connection slave1;
--echo # reap result of starting the SQL thread
--reap

--echo # disconnect: slave2
disconnect slave2;

--echo # connection: slave
connection slave;
--echo # cleanup
SET @@GLOBAL.slave_net_timeout = @save_slave_net_timeout;

source include/rpl_end.inc;
6 changes: 4 additions & 2 deletions sql/slave.cc
Expand Up @@ -2768,6 +2768,8 @@ pthread_handler_t handle_slave_io(void *arg)
mysql= NULL ;
retry_count= 0;

thd= new THD; // note that contructor of THD uses DBUG_ !

mysql_mutex_lock(&mi->run_lock);
/* Inform waiting threads that slave has started */
mi->slave_run_id++;
Expand All @@ -2776,7 +2778,6 @@ pthread_handler_t handle_slave_io(void *arg)
mi->events_till_disconnect = disconnect_slave_event_count;
#endif

thd= new THD; // note that contructor of THD uses DBUG_ !
THD_CHECK_SENTRY(thd);
mi->io_thd = thd;

Expand Down Expand Up @@ -3189,6 +3190,8 @@ pthread_handler_t handle_slave_sql(void *arg)
my_thread_init();
DBUG_ENTER("handle_slave_sql");

thd = new THD; // note that contructor of THD uses DBUG_ !

DBUG_ASSERT(rli->inited);
mysql_mutex_lock(&rli->run_lock);
DBUG_ASSERT(!rli->slave_running);
Expand All @@ -3197,7 +3200,6 @@ pthread_handler_t handle_slave_sql(void *arg)
rli->events_till_abort = abort_slave_event_count;
#endif

thd = new THD; // note that contructor of THD uses DBUG_ !
thd->thread_stack = (char*)&thd; // remember where our stack is
rli->sql_thd= thd;

Expand Down
46 changes: 37 additions & 9 deletions sql/sys_vars.cc
Expand Up @@ -44,6 +44,7 @@
// mysql_user_table_is_in_short_password_format
#include "derror.h" // read_texts
#include "sql_base.h" // close_cached_tables
#include "debug_sync.h" // DEBUG_SYNC

#ifdef WITH_PERFSCHEMA_STORAGE_ENGINE
#include "../storage/perfschema/pfs_server.h"
Expand Down Expand Up @@ -3128,6 +3129,8 @@ bool Sys_var_rpl_filter::do_check(THD *thd, set_var *var)
{
bool status;

mysql_mutex_assert_not_owner(&LOCK_global_system_variables);

mysql_mutex_lock(&LOCK_active_mi);
mysql_mutex_lock(&active_mi->rli.run_lock);

Expand All @@ -3144,22 +3147,43 @@ bool Sys_var_rpl_filter::do_check(THD *thd, set_var *var)
return status;
}

bool Sys_var_rpl_filter::global_update(THD *thd, set_var *var)
void Sys_var_rpl_filter::lock(void)
{
bool slave_running, status= false;
/*
Starting a slave thread causes the new thread to attempt to
acquire LOCK_global_system_variables (in THD::init) while
LOCK_active_mi is being held by the thread that initiated
the process. In order to not violate the lock order, unlock
LOCK_global_system_variables before grabbing LOCK_active_mi.
*/
mysql_mutex_unlock(&LOCK_global_system_variables);

mysql_mutex_lock(&LOCK_active_mi);
mysql_mutex_lock(&active_mi->rli.run_lock);
}

if (! (slave_running= active_mi->rli.slave_running))
status= set_filter_value(var->save_result.string_value.str);

void Sys_var_rpl_filter::unlock(void)
{
mysql_mutex_unlock(&active_mi->rli.run_lock);
mysql_mutex_unlock(&LOCK_active_mi);

mysql_mutex_lock(&LOCK_global_system_variables);
}

bool Sys_var_rpl_filter::global_update(THD *thd, set_var *var)
{
bool slave_running, status= false;

lock();

if (! (slave_running= active_mi->rli.slave_running))
status= set_filter_value(var->save_result.string_value.str);

if (slave_running)
my_error(ER_SLAVE_MUST_STOP, MYF(0));

unlock();

return slave_running || status;
}

Expand Down Expand Up @@ -3198,8 +3222,7 @@ uchar *Sys_var_rpl_filter::global_value_ptr(THD *thd, LEX_STRING *base)

tmp.length(0);

mysql_mutex_lock(&LOCK_active_mi);
mysql_mutex_lock(&active_mi->rli.run_lock);
lock();

switch (opt_id) {
case OPT_REPLICATE_DO_DB:
Expand All @@ -3222,8 +3245,7 @@ uchar *Sys_var_rpl_filter::global_value_ptr(THD *thd, LEX_STRING *base)
break;
}

mysql_mutex_unlock(&active_mi->rli.run_lock);
mysql_mutex_unlock(&LOCK_active_mi);
unlock();

return (uchar *) thd->strmake(tmp.ptr(), tmp.length());
}
Expand Down Expand Up @@ -3276,6 +3298,9 @@ static Sys_var_charptr Sys_slave_load_tmpdir(

static bool fix_slave_net_timeout(sys_var *self, THD *thd, enum_var_type type)
{
DEBUG_SYNC(thd, "fix_slave_net_timeout");

mysql_mutex_unlock(&LOCK_global_system_variables);
mysql_mutex_lock(&LOCK_active_mi);
DBUG_PRINT("info", ("slave_net_timeout=%u mi->heartbeat_period=%.3f",
slave_net_timeout,
Expand All @@ -3285,6 +3310,7 @@ static bool fix_slave_net_timeout(sys_var *self, THD *thd, enum_var_type type)
ER_SLAVE_HEARTBEAT_VALUE_OUT_OF_RANGE_MAX,
ER(ER_SLAVE_HEARTBEAT_VALUE_OUT_OF_RANGE_MAX));
mysql_mutex_unlock(&LOCK_active_mi);
mysql_mutex_lock(&LOCK_global_system_variables);
return false;
}
static Sys_var_uint Sys_slave_net_timeout(
Expand All @@ -3311,6 +3337,7 @@ static bool check_slave_skip_counter(sys_var *self, THD *thd, set_var *var)
}
static bool fix_slave_skip_counter(sys_var *self, THD *thd, enum_var_type type)
{
mysql_mutex_unlock(&LOCK_global_system_variables);
mysql_mutex_lock(&LOCK_active_mi);
mysql_mutex_lock(&active_mi->rli.run_lock);
/*
Expand All @@ -3326,6 +3353,7 @@ static bool fix_slave_skip_counter(sys_var *self, THD *thd, enum_var_type type)
}
mysql_mutex_unlock(&active_mi->rli.run_lock);
mysql_mutex_unlock(&LOCK_active_mi);
mysql_mutex_lock(&LOCK_global_system_variables);
return 0;
}
static Sys_var_uint Sys_slave_skip_counter(
Expand Down
2 changes: 2 additions & 0 deletions sql/sys_vars.h
Expand Up @@ -548,6 +548,8 @@ class Sys_var_rpl_filter: public sys_var
protected:
uchar *global_value_ptr(THD *thd, LEX_STRING *base);
bool set_filter_value(const char *value);
void lock(void);
void unlock(void);
};

/**
Expand Down

0 comments on commit be79b54

Please sign in to comment.