Skip to content

Commit 773c59d

Browse files
smit-gardhariya authored and LiliDeng committed
test: Add mshv crash test
Add a test case that verifies crash-dump functionality when the crash is generated from mshv. Separate out the utility functions used for kdump and reuse LISA's existing kdump setup to run the test and verify that a crash dump is generated. Signed-off-by: Smit Gardhariya <sgardhariya@microsoft.com>
1 parent b53d4ad commit 773c59d

File tree

4 files changed

+503
-303
lines changed

4 files changed

+503
-303
lines changed

lisa/tools/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,7 @@
5656
from .ip import Ip, IpInfo
5757
from .iperf3 import Iperf3
5858
from .journalctl import Journalctl
59-
from .kdump import KdumpBase
59+
from .kdump import KdumpBase, KdumpCheck
6060
from .kernel_config import KernelConfig
6161
from .kill import Kill
6262
from .lagscope import Lagscope
@@ -186,6 +186,7 @@
186186
"Iperf3",
187187
"Iptables",
188188
"Journalctl",
189+
"KdumpCheck",
189190
"KdumpBase",
190191
"KernelConfig",
191192
"Kill",

lisa/tools/kdump.py

Lines changed: 309 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,29 +1,38 @@
11
# Copyright (c) Microsoft Corporation.
22
# Licensed under the MIT license.
3+
from __future__ import annotations
34

45
import math
56
import re
6-
from pathlib import PurePath, PurePosixPath
7+
from pathlib import Path, PurePath, PurePosixPath
78
from time import sleep
8-
from typing import TYPE_CHECKING, Any, List, Type
9+
from typing import TYPE_CHECKING, Any, List, Type, cast
910

11+
from func_timeout import FunctionTimedOut, func_set_timeout # type: ignore
1012
from semver import VersionInfo
1113

1214
from lisa.base_tools import Cat, Sed, Service, Wget
1315
from lisa.executable import Tool
1416
from lisa.operating_system import CBLMariner, Debian, Oracle, Posix, Redhat, Suse
1517
from lisa.tools import Find, Gcc
18+
from lisa.tools.df import Df
19+
from lisa.tools.dmesg import Dmesg
20+
from lisa.tools.echo import Echo
21+
from lisa.tools.free import Free
1622
from lisa.tools.lsblk import Lsblk
1723
from lisa.tools.lscpu import Lscpu
1824
from lisa.tools.make import Make
25+
from lisa.tools.stat import Stat
1926
from lisa.tools.sysctl import Sysctl
2027
from lisa.tools.tar import Tar
2128
from lisa.util import LisaException, SkippedException, UnsupportedDistroException
29+
from lisa.util.perf_timer import create_timer
30+
from lisa.util.shell import try_connect
2231

2332
from .kernel_config import KernelConfig
2433

2534
if TYPE_CHECKING:
26-
from lisa.node import Node
35+
from lisa.node import Node, RemoteNode
2736

2837

2938
class Kexec(Tool):
@@ -729,3 +738,300 @@ def capture_info(self) -> None:
729738
result = cat.run("/etc/kdump.conf", force_run=True, sudo=True)
730739
self._log.info(f"Current kdump configuration: {result.stdout}")
731740
return
741+
742+
743+
class KdumpCheck(Tool):
    """Virtual tool wrapping ``KdumpBase`` with pre/post checks for crash tests.

    The tool has no command of its own and cannot be installed. It configures
    kdump through ``KdumpBase``, triggers a kernel crash, and then verifies
    that a dump file shows up under ``KdumpBase.dump_path``.
    ``kdump_test`` is the entry point exposed to trigger the kernel crash.
    """

    # With large system memory, the dump file can reach more than 7G. It costs
    # about 10min to copy the dump file to disk for some distros, such as
    # Ubuntu. So the timeout is set to 800s to make sure the dump completes.
    timeout_of_dump_crash = 800
    trigger_kdump_cmd = "echo c > /proc/sysrq-trigger"

    @property
    def command(self) -> str:
        """Return an empty command; this tool does not wrap an executable."""
        return ""

    @property
    def can_install(self) -> bool:
        """Return ``False``; there is nothing to install for this tool."""
        return False

    def kdump_test(
        self,
        log_path: Path,
        is_auto: bool = False,
        trigger_kdump_cmd: str = "echo c > /proc/sysrq-trigger",
    ) -> None:
        """Configure kdump, crash the kernel, and verify a dump file is produced.

        Args:
            log_path: Path handed to the serial console feature when console
                logs are saved during result checking.
            is_auto: When true, ``crashkernel`` is set to ``"auto"`` instead of
                the size calculated from total memory.
            trigger_kdump_cmd: Shell command executed with sudo to trigger the
                crash.

        Raises:
            SkippedException: If the support checks in ``_check_supported``
                fail, including when they raise ``UnsupportedDistroException``.
            LisaException: Raised by ``_check_kdump_result`` when no complete
                dump file is found in time.
        """
        try:
            self._check_supported(is_auto=is_auto)
        except UnsupportedDistroException as e:
            # Keep the original exception as the explicit cause of the skip.
            raise SkippedException(e) from e

        kdump = self.node.tools[KdumpBase]
        free = self.node.tools[Free]
        total_memory = free.get_total_memory()
        self.crash_kernel = kdump.calculate_crashkernel_size(total_memory)
        if is_auto:
            self.crash_kernel = "auto"

        if self._is_system_with_more_memory():
            # As system memory is more than free os disk size, need to
            # change the dump path and increase the timeout duration
            kdump.config_resource_disk_dump_path(self._get_resource_disk_dump_path())
            self.timeout_of_dump_crash = 1200
            if "T" in total_memory and float(total_memory.strip("T")) > 6:
                self.timeout_of_dump_crash = 2000

        kdump.config_crashkernel_memory(self.crash_kernel)
        kdump.enable_kdump_service()
        # Cleaning up any previous crash dump files
        self.node.execute(
            f"mkdir -p {kdump.dump_path} && rm -rf {kdump.dump_path}/*",
            shell=True,
            sudo=True,
        )

        # Reboot system to make kdump take effect
        self.node.reboot()

        # Confirm that the kernel dump mechanism is enabled
        kdump.check_crashkernel_loaded(self.crash_kernel)
        # Activate the magic SysRq option
        echo = self.node.tools[Echo]
        echo.write_to_file(
            value="1",
            file=self.node.get_pure_path("/proc/sys/kernel/sysrq"),
            sudo=True,
        )
        self.node.execute("sync", shell=True, sudo=True)

        kdump.capture_info()

        try:
            # Trigger kdump. After the trigger cmd executes, the VM will be
            # disconnected, so the command is started asynchronously and any
            # exception raised while starting it is only logged.
            self.node.execute_async(
                trigger_kdump_cmd,
                shell=True,
                sudo=True,
            )
        except Exception as e:
            self._log.debug(f"ignorable ssh exception: {e}")

        # Check if the vmcore file is generated after triggering a crash
        self._check_kdump_result(log_path, kdump)

        # We should clean up the vmcore file since the test is passed
        self.node.execute(f"rm -rf {kdump.dump_path}/*", shell=True, sudo=True)

    def trigger_kdump_on_specified_cpu(self, cpu_num: int, log_path: Path) -> None:
        """Run ``kdump_test`` with the crash trigger pinned to one CPU.

        Args:
            cpu_num: CPU index passed to ``taskset -c``.
            log_path: Forwarded to ``kdump_test``.

        Raises:
            SkippedException: If the core count reported by ``Lscpu`` is not
                greater than ``cpu_num``.
        """
        lscpu = self.node.tools[Lscpu]
        cpu_count = lscpu.get_core_count()
        if cpu_count > cpu_num:
            trigger_kdump_cmd = f"taskset -c {cpu_num} echo c > /proc/sysrq-trigger"
            self.kdump_test(
                log_path=log_path,
                trigger_kdump_cmd=trigger_kdump_cmd,
            )
        else:
            raise SkippedException(
                "The cpu count can't meet the test case's requirement. "
                f"Expected more than {cpu_num} cpus, actual {cpu_count}"
            )

    def _check_exists(self) -> bool:
        """Always report the tool as present."""
        return True

    # This method might get stuck after triggering the crash,
    # so use a timeout to recycle it faster.
    @func_set_timeout(10)  # type: ignore
    def _try_connect(self, remote_node: RemoteNode) -> Any:
        """Call ``try_connect`` with the node's connection info (10s limit)."""
        return try_connect(remote_node._connection_info)

    def _check_supported(self, is_auto: bool = False) -> None:
        """Verify kernel config, VMBus version and ``crashkernel=auto`` support.

        Args:
            is_auto: Whether the caller intends to use ``crashkernel=auto``.

        Raises:
            SkippedException: If the VMBus version is below 3.0.0, or if
                ``is_auto`` is set on a distro where the auto option is not
                available.
        """
        # Check the kernel config for kdump supported
        kdump = self.node.tools[KdumpBase]
        kdump.check_required_kernel_config()

        # Check the VMBus version for kdump supported
        dmesg = self.node.tools[Dmesg]
        vmbus_version = dmesg.get_vmbus_version()
        if vmbus_version < "3.0.0":
            raise SkippedException(
                f"No negotiated VMBus version {vmbus_version}. "
                "Kernel might be old or patches not included. "
                "Full support for kdump is not present."
            )

        # Below code aims to check the kernel config for "auto crashkernel" supported.
        # Redhat/Centos has this "auto crashkernel" feature. For version 7, it needs the
        # CONFIG_KEXEC_AUTO_RESERVE. For version 8, the ifdefine of that config is
        # removed. For these changes we can refer to Centos kernel, gotten according
        # to https://wiki.centos.org/action/show/Sources?action=show&redirect=sources
        # In addition, we didn't see upstream kernel has the auto crashkernel feature.
        # It may be a patch owned by Redhat/Centos.
        # Note that crashkernel=auto option in the boot command line is no longer
        # supported on RHEL 9 and later releases
        if not (
            isinstance(self.node.os, Redhat)
            and self.node.os.information.version >= "8.0.0-0"
            and self.node.os.information.version < "9.0.0-0"
        ):
            if is_auto and not self.node.tools[KernelConfig].is_built_in(
                "CONFIG_KEXEC_AUTO_RESERVE"
            ):
                raise SkippedException("crashkernel=auto doesn't work for the distro.")

    def _get_resource_disk_dump_path(self) -> str:
        """Return ``<resource disk mount point>/crash`` as the dump directory."""
        from lisa.features import Disk

        mount_point = self.node.features[Disk].get_resource_disk_mount_point()
        dump_path = mount_point + "/crash"
        return dump_path

    def _is_system_with_more_memory(self) -> bool:
        """Return whether total memory exceeds the space available on ``/``.

        Compares ``Free.get_total_memory_gb()`` against
        ``Df.get_filesystem_available_space("/", True)``.
        """
        free = self.node.tools[Free]
        total_memory_in_gb = free.get_total_memory_gb()

        df = self.node.tools[Df]
        available_space_in_os_disk = df.get_filesystem_available_space("/", True)

        return bool(total_memory_in_gb > available_space_in_os_disk)

    def _is_system_connected(self) -> bool:
        """Return ``True`` if an SSH connection attempt to the node succeeds.

        Any failure, including the ``_try_connect`` timeout, is logged at
        debug level and reported as ``False``.
        """
        from lisa.node import RemoteNode as RMNode

        remote_node = cast(RMNode, self.node)
        try:
            self._try_connect(remote_node)
        except FunctionTimedOut as e:
            # The FunctionTimedOut must be caught separated, or the process will exit.
            self._log.debug(f"ignorable timeout exception: {e}")
            return False
        except Exception as e:
            self._log.debug(
                "Fail to connect SSH "
                f"{remote_node._connection_info.address}:"
                f"{remote_node._connection_info.port}. "
                f"{e.__class__.__name__}: {e}. Retry..."
            )
            return False
        return True

    def _is_dump_file_generated(self, kdump: KdumpBase) -> bool:
        """Return whether a dump file larger than 10M exists in the dump path.

        Looks for regular files named ``vmcore``, ``dump.*`` or ``vmcore.*``
        under ``kdump.dump_path``.
        """
        # The glob patterns are quoted so the shell passes them to find
        # verbatim instead of expanding them against the working directory.
        result = self.node.execute(
            f"find {kdump.dump_path} -type f -size +10M "
            "\\( -name vmcore -o -name 'dump.*' -o -name 'vmcore.*' \\) "
            "-exec ls -lh {} \\;",
            shell=True,
            sudo=True,
        )
        return bool(result.stdout)

    def _check_incomplete_dump_file_generated(self, kdump: KdumpBase) -> str:
        """Return the ``find`` output for ``*incomplete*`` files in the dump path.

        The result is an empty string when nothing matches.
        """
        # Check if has dump incomplete file
        result = self.node.execute(
            f"find {kdump.dump_path} -name '*incomplete*'",
            shell=True,
            sudo=True,
        )
        return result.stdout

    def _check_kdump_result(self, log_path: Path, kdump: KdumpBase) -> None:
        """Wait for the dump file to appear after the crash was triggered.

        Args:
            log_path: Where the serial console feature saves console logs.
            kdump: The ``KdumpBase`` tool whose ``dump_path`` is inspected.

        Raises:
            LisaException: If the incomplete file stops growing, no dump file
                appears, or ``timeout_of_dump_crash`` seconds elapse.
        """
        # We use this function to check if the dump file is generated.
        # Steps:
        # 1. Try to connect the VM;
        # 2. If connected:
        #    1). Check if the dump file is generated. If so, then jump the loop.
        #       The test is passed.
        #    2). If there is no dump file, check the incomplete file (When dumping
        #        hasn't completed, the dump file is named as "*incomplete").
        #           a. If there is no incomplete file either, then raise an exception.
        #           b. If there is an incomplete file, then check if the file size
        #              is growing. If so, check it in a loop until the dump completes
        #              or incomplete file doesn't grow or timeout.
        # 3. The VM can be connected may just when the crash kernel boots up. When
        #    dumping or rebooting after dump completes, the VM might be disconnected.
        #    We need to catch the exception, and retry to connect the VM. Then follow
        #    the same steps to check.
        from lisa.features import SerialConsole

        timer = create_timer()
        has_checked_console_log = False
        serial_console = self.node.features[SerialConsole]
        while timer.elapsed(False) < self.timeout_of_dump_crash:
            if not self._is_system_connected():
                # Inspect the console log once, after 60s without a connection.
                if not has_checked_console_log and timer.elapsed(False) > 60:
                    serial_console.check_initramfs(
                        saved_path=log_path, stage="after_trigger_crash", force_run=True
                    )
                    has_checked_console_log = True
                continue

            # After trigger kdump, the VM will reboot. We need to close the node
            self.node.close()
            saved_dumpfile_size = 0
            max_tries = 20
            check_incomplete_file_tries = 0
            check_dump_file_tries = 0
            # Check in this loop until the dump file is generated or incomplete file
            # doesn't grow or timeout
            while True:
                try:
                    if self._is_dump_file_generated(kdump):
                        return
                    incomplete_file = self._check_incomplete_dump_file_generated(
                        kdump=kdump,
                    )
                    if incomplete_file:
                        check_dump_file_tries = 0
                        stat = self.node.tools[Stat]
                        incomplete_file_size = stat.get_total_size(incomplete_file)
                except Exception as e:
                    self._log.debug(
                        "Fail to execute command. It may be caused by the system kernel"
                        " reboot after dumping vmcore."
                        f"{e.__class__.__name__}: {e}. Retry..."
                    )
                    # Hit exception, break this loop and re-try to connect the system
                    break
                if incomplete_file:
                    # If the incomplete file doesn't grow in 100s
                    # (max_tries checks, 5s apart), then raise exception
                    if incomplete_file_size > saved_dumpfile_size:
                        saved_dumpfile_size = incomplete_file_size
                        check_incomplete_file_tries = 0
                    else:
                        check_incomplete_file_tries += 1
                        if check_incomplete_file_tries >= max_tries:
                            serial_console.get_console_log(
                                saved_path=log_path, force_run=True
                            )
                            self.node.execute("df -h")
                            raise LisaException(
                                "The vmcore file is incomplete with file size"
                                f" {round(incomplete_file_size/1024/1024, 2)}MB"
                            )
                else:
                    # If there is no any dump file in 100s, then raise exception
                    check_dump_file_tries += 1
                    if check_dump_file_tries >= max_tries:
                        serial_console.get_console_log(
                            saved_path=log_path, force_run=True
                        )
                        raise LisaException(
                            "No vmcore or vmcore-incomplete is found under "
                            f"{kdump.dump_path} with file size greater than 10M."
                        )
                if timer.elapsed(False) > self.timeout_of_dump_crash:
                    serial_console.get_console_log(saved_path=log_path, force_run=True)
                    raise LisaException("Timeout to dump vmcore file.")
                sleep(5)
        serial_console.get_console_log(saved_path=log_path, force_run=True)
        raise LisaException("Timeout to connect the VM after triggering kdump.")

0 commit comments

Comments
 (0)