Skip to content

Commit

Permalink
* Bug fix: Force stricter SSH checking. Originally, MHA checked the
master's reachability by just connecting via SSH and exiting
with return code 0. This does not work in some cases,
especially when SSH works but the data files are not accessible.
With this fix, MHA checks the master's SSH reachability by
executing the save_binary_logs command (dry run).
Browse files Browse the repository at this point in the history
  • Loading branch information
yoshinorim committed Dec 7, 2011
1 parent 6b27d1d commit 4607f29
Show file tree
Hide file tree
Showing 7 changed files with 210 additions and 65 deletions.
1 change: 1 addition & 0 deletions MANIFEST
Expand Up @@ -98,6 +98,7 @@ tests/t/t_apply_many_logs2.sh
tests/t/t_apply_many_logs3.sh
tests/t/t_binary.sh
tests/t/t_conf.sh
tests/t/t_data_io_error.sh
tests/t/t_dual_master_error.sh
tests/t/t_filter_incorrect.sh
tests/t/t_ignore_nostart.sh
Expand Down
37 changes: 27 additions & 10 deletions lib/MHA/HealthCheck.pm
Expand Up @@ -31,6 +31,7 @@ use MHA::DBHelper;
use MHA::ManagerConst;
use MHA::FileStatus;
use MHA::SlaveUtil;
use MHA::NodeUtil;

sub new {
my $class = shift;
Expand All @@ -46,6 +47,7 @@ sub new {
ssh_host => undef,
ssh_ip => undef,
ssh_port => undef,
ssh_check_command => undef,
workdir => undef,
status_handler => undef,
secondary_check_script => undef,
Expand Down Expand Up @@ -266,16 +268,27 @@ sub ping_select($) {
return 0;
}

# Plain SSH probe: forwards its six positional arguments to ssh_check() and
# appends "exit 0" as the command argument.
#
# Arguments (positional): $ssh_user, $ssh_host, $ssh_ip, $ssh_port, $log,
# $num_secs_to_timeout.
# Returns whatever ssh_check() returns.
sub ssh_check_simple {
  my ( $ssh_user, $ssh_host, $ssh_ip, $ssh_port, $log, $num_secs_to_timeout )
    = @_;

  my $noop_command = "exit 0";
  return ssh_check(
    $ssh_user, $ssh_host,            $ssh_ip, $ssh_port,
    $log,      $num_secs_to_timeout, $noop_command
  );
}

sub ssh_check {
my $ssh_user = shift;
my $ssh_host = shift;
my $ssh_ip = shift;
my $ssh_port = shift;
my $log = shift;
my $num_secs_to_timeout = shift;
my $command = shift;
my $ssh_user_host = $ssh_user . '@' . $ssh_ip;
my $rc = 1;
my $command = "exit 0";
eval {
if ( my $pid = fork )
{
Expand All @@ -286,19 +299,19 @@ sub ssh_check {
alarm $num_secs_to_timeout;
waitpid( $pid, 0 );
alarm 0;
my $exit_code = $? >> 8;
if ( $exit_code == 0 ) {
$log->info("HealthCheck: SSH to $ssh_host is reachable.");
$rc = 0;
}
else {
my ( $high, $low ) = MHA::NodeUtil::system_rc($?);
if ( $high ne '0' || $low ne '0' ) {
$log->warning("HealthCheck: SSH to $ssh_host is NOT reachable.");
$rc = 1;
}
else {
$log->info("HealthCheck: SSH to $ssh_host is reachable.");
$rc = 0;
}
}
elsif ( defined $pid ) {
exec(
"ssh $MHA::ManagerConst::SSH_OPT_CHECK -p $ssh_port $ssh_user_host $command"
"ssh $MHA::ManagerConst::SSH_OPT_CHECK -p $ssh_port $ssh_user_host \"$command\""
);
}
else {
Expand Down Expand Up @@ -401,17 +414,21 @@ sub invoke_sec_check {

sub invoke_ssh_check {
my $self = shift;
my $log = $self->{logger};
if ( !$self->{_ssh_check_invoked} ) {
if ( $self->{_ssh_check_pid} = fork ) {
$self->{_ssh_check_invoked} = 1;
}
elsif ( defined $self->{_ssh_check_pid} ) {
$SIG{INT} = $SIG{HUP} = $SIG{QUIT} = $SIG{TERM} = "DEFAULT";
$log->info("Executing SSH check script: $self->{ssh_check_command}");

#child ssh check process
exit ssh_check(
$self->{ssh_user}, $self->{ssh_host}, $self->{ssh_ip},
$self->{ssh_port}, $self->{logger}, $self->{interval} * 3
$self->{ssh_user}, $self->{ssh_host},
$self->{ssh_ip}, $self->{ssh_port},
$self->{logger}, $self->{interval} * 3,
$self->{ssh_check_command}
);
}
else {
Expand Down
53 changes: 32 additions & 21 deletions lib/MHA/ManagerUtil.pm
Expand Up @@ -93,37 +93,48 @@ sub exec_ssh_cmd($$$$) {
);
}

sub check_node_version {
# Returns the MHA Node version ("<digits>.<digits>") parsed from the output of
# "apply_diff_relay_logs --version", or undef when no version string is found.
#
# Arguments (positional):
#   $log       logger object; its error() method is called on failure
#   $ssh_user  user name for the ssh connection
#   $ssh_host  host name of the target (used when $ssh_ip is not set)
#   $ssh_ip    IP address of the target (preferred over $ssh_host)
#   $ssh_port  ssh port of the target
#
# When neither $ssh_host nor $ssh_ip is set, the command is run locally
# instead of through ssh.
sub get_node_version {
  my ( $log, $ssh_user, $ssh_host, $ssh_ip, $ssh_port ) = @_;
  my $node_version;
  my $command = "apply_diff_relay_logs --version";

  # Prefer the IP address; fall back to the host name.
  my $ssh_target = $ssh_ip || $ssh_host;
  if ($ssh_target) {
    my $ssh_user_host = $ssh_user . '@' . $ssh_target;
    $command =
"ssh $MHA::ManagerConst::SSH_OPT_ALIVE $ssh_user_host -p $ssh_port \"$command\" 2>&1";
  }

  # Backticks return undef when the command cannot be started at all;
  # normalize that to an empty string so chomp and the match below are safe.
  my $v = `$command`;
  $v = '' unless defined $v;
  chomp($v);

  if ( $v =~ /version (\d+\.\d+)/ ) {
    $node_version = $1;
  }
  else {
    $log->error("Got error when getting node version. Error:");
    $log->error("\n$v") if ($v);
  }
  return $node_version;
}

sub check_node_version {
my $log = shift;
my $ssh_user = shift;
my $ssh_host = shift;
my $ssh_ip = shift;
my $ssh_port = shift;
my $node_version;
eval {
$node_version =
get_node_version( $log, $ssh_user, $ssh_host, $ssh_ip, $ssh_port );
my $host = $ssh_host ? $ssh_host : $ssh_ip;
croak
"node version on $host not found! Maybe MHA Node package is not installed?\n"
Expand Down
27 changes: 22 additions & 5 deletions lib/MHA/MasterFailover.pm
Expand Up @@ -455,17 +455,34 @@ sub force_shutdown($) {
# SSH reachability is unknown. Verify here.
if ( $_real_ssh_reachable >= 2 ) {
if (
MHA::HealthCheck::ssh_check(
$dead_master->{ssh_user}, $dead_master->{hostname},
$dead_master->{ip}, $dead_master->{logger},
5
MHA::HealthCheck::ssh_check_simple(
$dead_master->{ssh_user}, $dead_master->{ssh_host},
$dead_master->{ssh_ip}, $dead_master->{ssh_port},
$dead_master->{logger}, 5
)
)
{
$_real_ssh_reachable = 0;
}
else {
$_real_ssh_reachable = 1;

# additional check
if (
MHA::ManagerUtil::get_node_version(
$dead_master->{logger}, $dead_master->{ssh_user},
$dead_master->{ssh_host}, $dead_master->{ssh_ip},
$dead_master->{ssh_port}
)
)
{
$_real_ssh_reachable = 1;
}
else {
$log->warning(
"Failed to get MHA Node version from dead master. Guessing that SSH is NOT reachable."
);
$_real_ssh_reachable = 0;
}
}
}
force_shutdown_internal($dead_master);
Expand Down
123 changes: 95 additions & 28 deletions lib/MHA/MasterMonitor.pm
Expand Up @@ -34,6 +34,7 @@ use MHA::FileStatus;
use MHA::SSHCheck;
use MHA::ManagerConst;
use MHA::ManagerUtil;
use MHA::BinlogManager;
use File::Basename;

my $g_global_config_file = $MHA::ManagerConst::DEFAULT_GLOBAL_CONF;
Expand All @@ -47,6 +48,7 @@ my $g_interactive = 1;
my $g_logfile;
my $g_wait_on_monitor_error = 0;
my $g_skip_ssh_check;
my $_master_node_version;
my $_server_manager;
my $RETRY = 100;
my $_status_handler;
Expand All @@ -65,27 +67,82 @@ sub exit_by_signal {
exit 1;
}

sub check_master_env($) {
my $target = shift;
$log->info(
"Checking SSH publickey authentication and checking recovery script configurations on the current master.."
);
my $ssh_user_host = $target->{ssh_user} . '@' . $target->{ssh_host};

MHA::ManagerUtil::check_node_version( $log, $target->{ssh_user},
$target->{ssh_host}, $target->{ssh_ip}, $target->{ssh_port} );
# Builds the "save_binary_logs --command=test ..." command line that is used
# to verify binary log settings on $target.
#
# Arguments (positional):
#   $target      server hash; reads remote_workdir, master_binlog_dir, File,
#                handle_raw_binlog and log_level
#   $use_prefix  when true, pass --binlog_prefix (derived from the binlog file
#                name) instead of --start_file
#
# Returns the command string. Croaks when $target has no File and there is no
# alive slave to read the master's binlog file name from.
sub get_binlog_check_command {
  my $target     = shift;
  my $use_prefix = shift;

  # this file is not created. just checking directory path
  my $workfile = "$target->{remote_workdir}/save_binary_logs_test";
  my $command =
"save_binary_logs --command=test --start_pos=4 --binlog_dir=$target->{master_binlog_dir} --output_file=$workfile --manager_version=$MHA::ManagerConst::VERSION";

  my $file;
  if ( $target->{File} ) {
    $file = $target->{File};
  }
  else {

    # No binlog file recorded on the target itself: take the name from the
    # first alive slave's Master_Log_File.
    my @alive_slaves = $_server_manager->get_alive_slaves();
    my $slave        = $alive_slaves[0];
    croak
"No alive slave is available to determine the master's binary log file name.\n"
      unless ($slave);
    $slave->current_slave_position();
    $file = $slave->{Master_Log_File};
  }

  if ($use_prefix) {

    # Only the first returned element (the prefix) is needed here.
    my ($binlog_prefix) = MHA::BinlogManager::get_head_and_number($file);
    $command .= " --binlog_prefix=$binlog_prefix";
  }
  else {
    $command .= " --start_file=$file";
  }

  unless ( $target->{handle_raw_binlog} ) {
    my $oldest_version = $_server_manager->get_oldest_version();
    $command .= " --oldest_version=$oldest_version ";
  }
  if ( defined $target->{log_level} && $target->{log_level} eq "debug" ) {
    $command .= " --debug ";
  }
  return $command;
}

# Checks whether the current master can be reached over SSH and, if so,
# records its MHA Node version in $_master_node_version.
#
# Argument: $target, the server hash of the current master (reads ssh_user,
# ssh_host, ssh_ip, ssh_port and logger).
#
# Returns 1 when the SSH check succeeded and 0 otherwise. Croaks when SSH is
# reachable but the MHA Node version cannot be obtained.
sub check_master_ssh_env($) {
  my $target = shift;
  $log->info(
    "Checking SSH publickey authentication settings on the current master..");

  # A true return value from ssh_check_simple() is treated as "NOT reachable".
  my $ssh_check_failed = MHA::HealthCheck::ssh_check_simple(
    $target->{ssh_user}, $target->{ssh_host}, $target->{ssh_ip},
    $target->{ssh_port}, $target->{logger},   5
  );
  my $ssh_reachable = $ssh_check_failed ? 0 : 1;
  return $ssh_reachable unless ($ssh_reachable);

  $_master_node_version =
    MHA::ManagerUtil::get_node_version( $log, $target->{ssh_user},
    $target->{ssh_host}, $target->{ssh_ip}, $target->{ssh_port} );
  if ( !$_master_node_version ) {
    $log->error(
"Failed to get MHA node version on the current master even though current master is reachable via SSH!"
    );
    croak;
  }
  $log->info("Master MHA Node version is $_master_node_version.");
  return $ssh_reachable;
}

sub check_master_binlog($) {
my $target = shift;
$log->info("Checking recovery script configurations on the current master..");
my $ssh_user_host = $target->{ssh_user} . '@' . $target->{ssh_host};
my $command = get_binlog_check_command($target);
$log->info(" Executing command: $command ");
$log->info(" Connecting to $ssh_user_host($target->{ssh_host}).. ");
my ( $high, $low ) =
Expand Down Expand Up @@ -301,13 +358,13 @@ sub wait_until_master_is_unreachable() {
sprintf( "Identified master is %s.", $current_master->get_hostinfo() )
);
}
else {
if ( check_master_env($current_master) ) {
$_server_manager->validate_num_alive_servers( $current_master, 0 );
if ( check_master_ssh_env($current_master) ) {
if ( check_master_binlog($current_master) ) {
$log->error("Master configuration failed.");
croak;
}
}
$_server_manager->validate_num_alive_servers( $current_master, 0 );
$_status_handler->set_master_host( $current_master->{hostname} )
unless ($g_check_only);

Expand All @@ -333,22 +390,32 @@ sub wait_until_master_is_unreachable() {
$func_rc = 1;
my $master_ping;
eval {
my $ssh_check_command;
if ( $_master_node_version && $_master_node_version >= 0.53 ) {
$ssh_check_command = get_binlog_check_command( $current_master, 1 );
}
else {
$ssh_check_command = "exit 0";
}
$log->debug("SSH check command: $ssh_check_command");

$master_ping = new MHA::HealthCheck(
user => $current_master->{user},
password => $current_master->{password},
ip => $current_master->{ip},
hostname => $current_master->{hostname},
port => $current_master->{port},
interval => $current_master->{ping_interval},
ssh_user => $current_master->{ssh_user},
ssh_host => $current_master->{ssh_host},
ssh_ip => $current_master->{ssh_ip},
ssh_port => $current_master->{ssh_port},
status_handler => $_status_handler,
logger => $log,
logfile => $g_logfile,
workdir => $g_workdir,
ping_type => $current_master->{ping_type},
user => $current_master->{user},
password => $current_master->{password},
ip => $current_master->{ip},
hostname => $current_master->{hostname},
port => $current_master->{port},
interval => $current_master->{ping_interval},
ssh_user => $current_master->{ssh_user},
ssh_host => $current_master->{ssh_host},
ssh_ip => $current_master->{ssh_ip},
ssh_port => $current_master->{ssh_port},
ssh_check_command => $ssh_check_command,
status_handler => $_status_handler,
logger => $log,
logfile => $g_logfile,
workdir => $g_workdir,
ping_type => $current_master->{ping_type},
);
$log->info(
sprintf( "Set master ping interval %d seconds.",
Expand Down
2 changes: 1 addition & 1 deletion lib/MHA/Server.pm
Expand Up @@ -353,7 +353,7 @@ sub check_set_ssh_status {
my $set_dead = shift;
if ( !$self->{dead} ) {
if (
MHA::HealthCheck::ssh_check(
MHA::HealthCheck::ssh_check_simple(
$self->{ssh_user}, $self->{ssh_host}, $self->{ssh_ip},
$self->{ssh_port}, $self->{logger}, 5
)
Expand Down

0 comments on commit 4607f29

Please sign in to comment.