|
1 | 1 | # Copyright (c) Microsoft Corporation.
|
2 | 2 | # Licensed under the MIT license.
|
| 3 | +from __future__ import annotations |
3 | 4 |
|
4 | 5 | import math
|
5 | 6 | import re
|
6 |
| -from pathlib import PurePath, PurePosixPath |
| 7 | +from pathlib import Path, PurePath, PurePosixPath |
7 | 8 | from time import sleep
|
8 |
| -from typing import TYPE_CHECKING, Any, List, Type |
| 9 | +from typing import TYPE_CHECKING, Any, List, Type, cast |
9 | 10 |
|
| 11 | +from func_timeout import FunctionTimedOut, func_set_timeout # type: ignore |
10 | 12 | from semver import VersionInfo
|
11 | 13 |
|
12 | 14 | from lisa.base_tools import Cat, Sed, Service, Wget
|
13 | 15 | from lisa.executable import Tool
|
14 | 16 | from lisa.operating_system import CBLMariner, Debian, Oracle, Posix, Redhat, Suse
|
15 | 17 | from lisa.tools import Find, Gcc
|
| 18 | +from lisa.tools.df import Df |
| 19 | +from lisa.tools.dmesg import Dmesg |
| 20 | +from lisa.tools.echo import Echo |
| 21 | +from lisa.tools.free import Free |
16 | 22 | from lisa.tools.lsblk import Lsblk
|
17 | 23 | from lisa.tools.lscpu import Lscpu
|
18 | 24 | from lisa.tools.make import Make
|
| 25 | +from lisa.tools.stat import Stat |
19 | 26 | from lisa.tools.sysctl import Sysctl
|
20 | 27 | from lisa.tools.tar import Tar
|
21 | 28 | from lisa.util import LisaException, SkippedException, UnsupportedDistroException
|
| 29 | +from lisa.util.perf_timer import create_timer |
| 30 | +from lisa.util.shell import try_connect |
22 | 31 |
|
23 | 32 | from .kernel_config import KernelConfig
|
24 | 33 |
|
25 | 34 | if TYPE_CHECKING:
|
26 |
| - from lisa.node import Node |
| 35 | + from lisa.node import Node, RemoteNode |
27 | 36 |
|
28 | 37 |
|
29 | 38 | class Kexec(Tool):
|
@@ -729,3 +738,300 @@ def capture_info(self) -> None:
|
729 | 738 | result = cat.run("/etc/kdump.conf", force_run=True, sudo=True)
|
730 | 739 | self._log.info(f"Current kdump configuration: {result.stdout}")
|
731 | 740 | return
|
| 741 | + |
| 742 | + |
class KdumpCheck(Tool):
    """Virtual tool that wraps KdumpBase with pre/post checks for a kernel crash.

    The tool has no command of its own and cannot be installed; it only
    orchestrates other tools on the node. `kdump_test` is the entry point that
    configures kdump, triggers the crash and verifies that a dump file appears.
    """

    # With large system memory the dump file can exceed 7G, and copying it to
    # disk can take about 10 minutes on some distros, such as Ubuntu. The
    # default timeout is therefore 800s so the dump has time to complete.
    timeout_of_dump_crash = 800
    # Default shell command used to crash the kernel through magic SysRq.
    trigger_kdump_cmd = "echo c > /proc/sysrq-trigger"

    @property
    def command(self) -> str:
        """Return an empty command; this tool only drives other tools."""
        return ""

    @property
    def can_install(self) -> bool:
        """Return False; there is nothing to install for this tool."""
        return False

    def kdump_test(
        self,
        log_path: Path,
        is_auto: bool = False,
        trigger_kdump_cmd: str = "echo c > /proc/sysrq-trigger",
    ) -> None:
        """Configure kdump, crash the kernel and verify that a vmcore is written.

        Args:
            log_path: directory handed to the serial console feature when
                console logs are saved for diagnosis.
            is_auto: when True, "auto" is used as the crashkernel value instead
                of the size calculated from total memory.
            trigger_kdump_cmd: shell command executed with sudo to crash the
                kernel.

        Raises:
            SkippedException: the distro or kernel does not support the test.
            LisaException: no complete dump file shows up before the timeout.
        """
        try:
            self._check_supported(is_auto=is_auto)
        except UnsupportedDistroException as e:
            # Keep the original exception as the cause for easier diagnosis.
            raise SkippedException(e) from e

        kdump = self.node.tools[KdumpBase]
        free = self.node.tools[Free]
        total_memory = free.get_total_memory()
        self.crash_kernel = kdump.calculate_crashkernel_size(total_memory)
        if is_auto:
            self.crash_kernel = "auto"

        if self._is_system_with_more_memory():
            # System memory is larger than the free OS disk space, so dump to
            # the resource disk instead and allow more time for the copy.
            kdump.config_resource_disk_dump_path(self._get_resource_disk_dump_path())
            self.timeout_of_dump_crash = 1200
            # total_memory is a string; a value such as "7T" is treated as
            # terabytes and gets an even longer timeout.
            if "T" in total_memory and float(total_memory.strip("T")) > 6:
                self.timeout_of_dump_crash = 2000

        kdump.config_crashkernel_memory(self.crash_kernel)
        kdump.enable_kdump_service()
        # Clean up any previous crash dump files.
        self.node.execute(
            f"mkdir -p {kdump.dump_path} && rm -rf {kdump.dump_path}/*",
            shell=True,
            sudo=True,
        )

        # Reboot the system to make the kdump configuration take effect.
        self.node.reboot()

        # Confirm that the kernel dump mechanism is enabled.
        kdump.check_crashkernel_loaded(self.crash_kernel)
        # Activate the magic SysRq option.
        echo = self.node.tools[Echo]
        echo.write_to_file(
            value="1",
            file=self.node.get_pure_path("/proc/sys/kernel/sysrq"),
            sudo=True,
        )
        self.node.execute("sync", shell=True, sudo=True)

        kdump.capture_info()

        try:
            # Trigger kdump. The VM is expected to disconnect right after the
            # command runs, so it is started asynchronously and any error
            # raised while starting it is only logged.
            self.node.execute_async(
                trigger_kdump_cmd,
                shell=True,
                sudo=True,
            )
        except Exception as e:
            self._log.debug(f"ignorable ssh exception: {e}")

        # Check if the vmcore file is generated after triggering a crash.
        self._check_kdump_result(log_path, kdump)

        # The test passed, so the vmcore file is no longer needed.
        self.node.execute(f"rm -rf {kdump.dump_path}/*", shell=True, sudo=True)

    def trigger_kdump_on_specified_cpu(self, cpu_num: int, log_path: Path) -> None:
        """Run the kdump test with the trigger command pinned to one CPU.

        Args:
            cpu_num: zero-based CPU index passed to `taskset -c`.
            log_path: forwarded to `kdump_test`.

        Raises:
            SkippedException: the node does not have more than `cpu_num` cores.
        """
        lscpu = self.node.tools[Lscpu]
        cpu_count = lscpu.get_core_count()
        if cpu_count <= cpu_num:
            raise SkippedException(
                "The cpu count can't meet the test case's requirement. "
                f"Expected more than {cpu_num} cpus, actual {cpu_count}"
            )

        # Reuse the class-level trigger command instead of repeating the literal.
        trigger_kdump_cmd = f"taskset -c {cpu_num} {self.trigger_kdump_cmd}"
        self.kdump_test(
            log_path=log_path,
            trigger_kdump_cmd=trigger_kdump_cmd,
        )

    def _check_exists(self) -> bool:
        """Always report the tool as present; it has no binary to look for."""
        return True

    # This method might get stuck after the crash is triggered,
    # so a timeout is used to give up on it faster.
    @func_set_timeout(10)  # type: ignore
    def _try_connect(self, remote_node: RemoteNode) -> Any:
        """Try to connect using the node's connection info, bounded to 10s."""
        return try_connect(remote_node._connection_info)

    def _check_supported(self, is_auto: bool = False) -> None:
        """Verify that the kernel and platform can run the kdump test.

        Args:
            is_auto: whether crashkernel=auto is going to be used.

        Raises:
            SkippedException: the VMBus version is below 3.0.0, or
                crashkernel=auto is requested but not supported.
        """
        # Check the kernel config for kdump support.
        kdump = self.node.tools[KdumpBase]
        kdump.check_required_kernel_config()

        # Check the VMBus version for kdump support.
        # NOTE(review): relies on the returned version object being comparable
        # with a plain version string - confirm against Dmesg.
        dmesg = self.node.tools[Dmesg]
        vmbus_version = dmesg.get_vmbus_version()
        if vmbus_version < "3.0.0":
            raise SkippedException(
                f"No negotiated VMBus version {vmbus_version}. "
                "Kernel might be old or patches not included. "
                "Full support for kdump is not present."
            )

        # The code below checks the kernel config for "auto crashkernel" support.
        # Redhat/Centos has this "auto crashkernel" feature. For version 7, it
        # needs CONFIG_KEXEC_AUTO_RESERVE. For version 8, the ifdef of that
        # config is removed. For these changes refer to the Centos kernel, see
        # https://wiki.centos.org/action/show/Sources?action=show&redirect=sources
        # In addition, the upstream kernel doesn't seem to have the auto
        # crashkernel feature. It may be a patch owned by Redhat/Centos.
        # Note that the crashkernel=auto option in the boot command line is no
        # longer supported on RHEL 9 and later releases.
        if not (
            isinstance(self.node.os, Redhat)
            and self.node.os.information.version >= "8.0.0-0"
            and self.node.os.information.version < "9.0.0-0"
        ):
            if is_auto and not self.node.tools[KernelConfig].is_built_in(
                "CONFIG_KEXEC_AUTO_RESERVE"
            ):
                raise SkippedException("crashkernel=auto doesn't work for the distro.")

    def _get_resource_disk_dump_path(self) -> str:
        """Return the "crash" directory path under the resource disk mount point."""
        # Imported here rather than at module level, as the original code does.
        from lisa.features import Disk

        mount_point = self.node.features[Disk].get_resource_disk_mount_point()
        dump_path = mount_point + "/crash"
        return dump_path

    def _is_system_with_more_memory(self) -> bool:
        """Return True when total memory exceeds the space available on "/"."""
        free = self.node.tools[Free]
        total_memory_in_gb = free.get_total_memory_gb()

        df = self.node.tools[Df]
        # NOTE(review): presumably the second argument selects GB so that both
        # values share a unit - confirm against Df.
        available_space_in_os_disk = df.get_filesystem_available_space("/", True)

        return bool(total_memory_in_gb > available_space_in_os_disk)

    def _is_system_connected(self) -> bool:
        """Return True when a connection attempt to the node succeeds."""
        from lisa.node import RemoteNode as RMNode

        remote_node = cast(RMNode, self.node)
        try:
            self._try_connect(remote_node)
        except FunctionTimedOut as e:
            # FunctionTimedOut must be caught separately, or the process will
            # exit.
            self._log.debug(f"ignorable timeout exception: {e}")
            return False
        except Exception as e:
            self._log.debug(
                "Fail to connect SSH "
                f"{remote_node._connection_info.address}:"
                f"{remote_node._connection_info.port}. "
                f"{e.__class__.__name__}: {e}. Retry..."
            )
            return False
        return True

    def _is_dump_file_generated(self, kdump: KdumpBase) -> bool:
        """Return True when a dump file larger than 10M exists in the dump path."""
        # The -name patterns are quoted so the shell passes them to find
        # verbatim instead of expanding them against the working directory.
        result = self.node.execute(
            f"find {kdump.dump_path} -type f -size +10M "
            "\\( -name vmcore -o -name 'dump.*' -o -name 'vmcore.*' \\) "
            "-exec ls -lh {} \\;",
            shell=True,
            sudo=True,
        )
        return bool(result.stdout)

    def _check_incomplete_dump_file_generated(self, kdump: KdumpBase) -> str:
        """Return the find output for "*incomplete*" files; empty if none exist."""
        result = self.node.execute(
            f"find {kdump.dump_path} -name '*incomplete*'",
            shell=True,
            sudo=True,
        )
        return result.stdout

    def _check_kdump_result(self, log_path: Path, kdump: KdumpBase) -> None:
        """Wait until the dump file is generated, or raise when it is not.

        Steps:
        1. Try to connect to the VM.
        2. If connected:
           1) Check if the dump file is generated. If so, return; the test
              passed.
           2) If there is no dump file, check for the incomplete file (while
              dumping hasn't completed, the dump file is named "*incomplete").
              a. If there is no incomplete file either, raise an exception
                 once the retries are used up.
              b. If there is an incomplete file, check whether its size is
                 growing. Keep checking in a loop until the dump completes,
                 the incomplete file stops growing, or the timeout is hit.
        3. The VM may be reachable only while the crash kernel is up. During
           dumping, or the reboot after the dump completes, the VM might
           disconnect. Such exceptions are caught and the connection is
           retried, then the same steps are followed again.

        Args:
            log_path: directory where serial console logs are saved.
            kdump: the kdump tool that provides `dump_path`.

        Raises:
            LisaException: the dump is missing or incomplete, or a timeout is
                hit while waiting for the dump or the connection.
        """
        from lisa.features import SerialConsole

        timer = create_timer()
        has_checked_console_log = False
        serial_console = self.node.features[SerialConsole]
        while timer.elapsed(False) < self.timeout_of_dump_crash:
            if not self._is_system_connected():
                # Inspect the console log once if the VM stays unreachable for
                # more than 60s.
                if not has_checked_console_log and timer.elapsed(False) > 60:
                    serial_console.check_initramfs(
                        saved_path=log_path, stage="after_trigger_crash", force_run=True
                    )
                    has_checked_console_log = True
                continue

            # After kdump is triggered the VM reboots, so close the node to
            # drop the stale connection.
            self.node.close()
            saved_dumpfile_size = 0
            max_tries = 20
            check_incomplete_file_tries = 0
            check_dump_file_tries = 0
            # Check in this loop until the dump file is generated, or the
            # incomplete file stops growing, or the timeout is hit.
            while True:
                try:
                    if self._is_dump_file_generated(kdump):
                        return
                    incomplete_file = self._check_incomplete_dump_file_generated(
                        kdump=kdump,
                    )
                    if incomplete_file:
                        check_dump_file_tries = 0
                        stat = self.node.tools[Stat]
                        # NOTE(review): assumes find reported a single path -
                        # confirm how Stat handles multi-line input.
                        incomplete_file_size = stat.get_total_size(incomplete_file)
                except Exception as e:
                    self._log.debug(
                        "Fail to execute command. It may be caused by the system kernel"
                        " reboot after dumping vmcore."
                        f"{e.__class__.__name__}: {e}. Retry..."
                    )
                    # Hit an exception; leave this loop and retry connecting.
                    break
                if incomplete_file:
                    # If the incomplete file doesn't grow in 100s
                    # (20 tries x 5s sleep), raise an exception.
                    if incomplete_file_size > saved_dumpfile_size:
                        saved_dumpfile_size = incomplete_file_size
                        check_incomplete_file_tries = 0
                    else:
                        check_incomplete_file_tries += 1
                        if check_incomplete_file_tries >= max_tries:
                            serial_console.get_console_log(
                                saved_path=log_path, force_run=True
                            )
                            # Record disk usage in the log for diagnosis.
                            self.node.execute("df -h")
                            raise LisaException(
                                "The vmcore file is incomplete with file size"
                                f" {round(incomplete_file_size/1024/1024, 2)}MB"
                            )
                else:
                    # If there is no dump file at all in 100s, raise an exception.
                    check_dump_file_tries += 1
                    if check_dump_file_tries >= max_tries:
                        serial_console.get_console_log(
                            saved_path=log_path, force_run=True
                        )
                        raise LisaException(
                            "No vmcore or vmcore-incomplete is found under "
                            f"{kdump.dump_path} with file size greater than 10M."
                        )
                if timer.elapsed(False) > self.timeout_of_dump_crash:
                    serial_console.get_console_log(saved_path=log_path, force_run=True)
                    raise LisaException("Timeout to dump vmcore file.")
                sleep(5)
        serial_console.get_console_log(saved_path=log_path, force_run=True)
        raise LisaException("Timeout to connect the VM after triggering kdump.")
0 commit comments