Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stabilize MPI tests for Azure Linux #3521

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Stabilize MPI tests for Azure Linux
  • Loading branch information
cyberbandya007 committed Nov 20, 2024
commit 81559cd0b8f1d62ee558979683b217ce0c36c8ea
53 changes: 51 additions & 2 deletions lisa/features/infiniband.py
Original file line number Diff line number Diff line change
@@ -9,10 +9,11 @@
from retry import retry

from lisa.base_tools import Cat, Sed, Uname, Wget
from lisa.tools.git import Git
from lisa.feature import Feature
from lisa.features import Disk
from lisa.operating_system import CBLMariner, Oracle, Redhat, Ubuntu
from lisa.tools import Firewall, Ls, Lspci, Make, Service
from lisa.tools import Chmod, Find, Firewall, Ls, Lspci, Make, Service
from lisa.tools.tar import Tar
from lisa.util import (
LisaException,
@@ -466,7 +467,6 @@ def install_intel_mpi(self) -> None:

def install_open_mpi(self) -> None:
node = self._node
# Install Open MPI
wget = node.tools[Wget]
tar_file = (
"https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz"
@@ -497,6 +497,55 @@ def install_open_mpi(self) -> None:
make.make("", cwd=openmpi_folder, sudo=True)
make.make_install(cwd=openmpi_folder, sudo=True)

def install_intel_mpi_benchmarking_tool(
    self, tool_names: List[str] = ["IMB-MPI1"]
) -> None:
    """Build the Intel MPI Benchmarks (IMB) from source on CBL-Mariner nodes.

    On any other distro this is a no-op, because the benchmark tools are
    included in that distro's packages. The method assumes an MPI package
    that provides ``mpicc`` and ``mpicxx`` is already built and installed
    on the node.

    Steps performed on CBL-Mariner:

    1. Clone https://github.com/intel/mpi-benchmarks.git into the node's
       working path.
    2. Locate ``mpicc`` and ``mpicxx`` by searching from ``/`` and use the
       first match of each.
    3. Set mode 755 on both compiler wrappers.
    4. Run ``make <tool> CC=<mpicc> CXX=<mpicxx>`` in the cloned folder
       for every requested tool, then set mode 755 on
       ``<clone folder>/<tool>``.

    Args:
        tool_names: make targets to build, e.g. ``IMB-MPI1``, ``IMB-RMA``
            or ``IMB-NBC``. NOTE(review): the default is a list literal
            shared between calls; it is only read here and must never be
            mutated.

    Raises:
        AssertionError: if ``mpicc`` or ``mpicxx`` cannot be found on the
            node.
    """
    node = self._node
    if not isinstance(node.os, CBLMariner):
        # These tools are included in other distro packages
        return

    # Clone Intel MPI Benchmarks into the node's working directory.
    git = node.tools[Git]
    git.clone(
        url="https://github.com/intel/mpi-benchmarks.git",
        cwd=node.working_path,
    )
    imb_src_folder = node.get_pure_path(f"{node.working_path}/mpi-benchmarks")

    # Locate the MPI compiler wrappers; the first match of each is used.
    find = node.tools[Find]
    wrapper_paths = []
    for wrapper in ("mpicc", "mpicxx"):
        not_found_message = f"Could not find location of {wrapper} from MPI package"
        find_results = find.find_files(node.get_pure_path("/"), wrapper, sudo=True)
        assert_that(len(find_results)).described_as(
            not_found_message
        ).is_greater_than(0)
        wrapper_path = find_results[0]
        assert_that(wrapper_path).described_as(not_found_message).is_not_empty()
        wrapper_paths.append(wrapper_path)
    mpicc_path, mpicxx_path = wrapper_paths

    # Presumably the wrappers are not always executable as installed;
    # TODO confirm why the mode change is required.
    chmod = node.tools[Chmod]
    chmod.chmod(mpicc_path, "755", sudo=True)
    chmod.chmod(mpicxx_path, "755", sudo=True)

    # Build each requested benchmark with the located wrappers. The command
    # is run without a shell and without the "yes ''" pipe.
    make = node.tools[Make]
    for tool in tool_names:
        make.make(
            f"{tool} CC={mpicc_path} CXX={mpicxx_path}",
            cwd=imb_src_folder,
            sudo=True,
            shell=False,
            sendYesCmd=False,
        )
        chmod.chmod(f"{imb_src_folder}/{tool}", "755", sudo=True)


def install_ibm_mpi(self, platform_mpi_url: str) -> None:
node = self._node
if isinstance(node.os, Redhat):
12 changes: 9 additions & 3 deletions lisa/tools/make.py
Original file line number Diff line number Diff line change
@@ -74,6 +74,8 @@ def make(
thread_count: int = 0,
update_envs: Optional[Dict[str, str]] = None,
ignore_error: bool = False,
shell: bool = True,
sendYesCmd: bool = True
) -> ExecutableResult:
expected_exit_code: Optional[int] = 0
if thread_count == 0:
@@ -95,13 +97,17 @@ def make(

if ignore_error:
expected_exit_code = None
# yes '' answers all questions with default value.
command = ""
if sendYesCmd:
# yes '' answers all questions with default value.
command = "yes '' | "

result = self.node.execute(
f"yes '' | make -j{thread_count} {arguments}",
f"{command} make -j{thread_count} {arguments}",
cwd=cwd,
timeout=timeout,
sudo=sudo,
shell=True,
shell=shell,
update_envs=update_envs,
expected_exit_code=expected_exit_code,
expected_exit_code_failure_message="Failed to make",
32 changes: 25 additions & 7 deletions microsoft/testsuites/hpc/infinibandsuite.py
Original file line number Diff line number Diff line change
@@ -15,7 +15,7 @@
simple_requirement,
)
from lisa.features import AvailabilitySetEnabled, Infiniband, Sriov
from lisa.operating_system import BSD, Windows
from lisa.operating_system import BSD, CBLMariner, Windows
from lisa.sut_orchestrator.azure.tools import Waagent
from lisa.tools import Find, KernelConfig, Ls, Modprobe, Ssh
from lisa.util import (
@@ -286,6 +286,9 @@ def verify_intel_mpi(self, environment: Environment, log: Logger) -> None:
client_ssh.enable_public_key(server_ssh.generate_key_pairs())
server_ssh.add_known_host(client_ip)
client_ssh.add_known_host(server_ip)
sudo=False
if isinstance(server_node.os, CBLMariner):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add comments above to explain why Mariner needs sudo.

sudo=True

# Note: Using bash because script is not supported by Dash
# sh points to dash on Ubuntu
@@ -295,6 +298,7 @@ def verify_intel_mpi(self, environment: Environment, log: Logger) -> None:
"-env I_MPI_FABRICS=shm:ofi -env SECS_PER_SAMPLE=600 "
"-env FI_PROVIDER=mlx -env I_MPI_DEBUG=5 -env I_MPI_PIN_DOMAIN=numa "
"/opt/intel/oneapi/mpi/2021.1.1/bin/IMB-MPI1 pingpong",
sudo=sudo,
expected_exit_code=0,
expected_exit_code_failure_message="Failed intra-node pingpong test "
"with intel mpi",
@@ -306,6 +310,7 @@ def verify_intel_mpi(self, environment: Environment, log: Logger) -> None:
"-env I_MPI_FABRICS=shm:ofi -env SECS_PER_SAMPLE=600 "
"-env FI_PROVIDER=mlx -env I_MPI_DEBUG=5 -env I_MPI_PIN_DOMAIN=numa "
"/opt/intel/oneapi/mpi/2021.1.1/bin/IMB-MPI1 pingpong",
sudo=sudo,
expected_exit_code=0,
expected_exit_code_failure_message="Failed inter-node pingpong test "
"with intel mpi",
@@ -319,6 +324,7 @@ def verify_intel_mpi(self, environment: Environment, log: Logger) -> None:
"-n 44 -env I_MPI_FABRICS=shm:ofi -env SECS_PER_SAMPLE=600 "
"-env FI_PROVIDER=mlx -env I_MPI_DEBUG=5 -env I_MPI_PIN_DOMAIN=numa "
f"/opt/intel/oneapi/mpi/2021.1.1/bin/{test}",
sudo=sudo,
expected_exit_code=0,
expected_exit_code_failure_message=f"Failed {test} test with intel mpi",
timeout=3000,
@@ -360,10 +366,13 @@ def verify_open_mpi(self, environment: Environment, log: Logger) -> None:
raise SkippedException(err)

run_in_parallel([server_ib.install_open_mpi, client_ib.install_open_mpi])

server_node.execute("ldconfig", sudo=True)
client_node.execute("ldconfig", sudo=True)

# Only for mariner, we need to build intel benchmarking tools
# as they are not included in our packages
server_ib.install_intel_mpi_benchmarking_tool()
Copy link
Member

@squirrelsc squirrelsc Nov 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please also add an `if` block here to skip other distros, so that the logic is easier to follow at the test-case level. The `if` block inside `install_intel_mpi_benchmarking_tool` then acts only as a safeguard.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure.


# Restart the ssh sessions for changes to /etc/security/limits.conf
# to take effect
server_node.close()
@@ -386,7 +395,7 @@ def verify_open_mpi(self, environment: Environment, log: Logger) -> None:
# Ping Pong test
find = server_node.tools[Find]
find_results = find.find_files(
server_node.get_pure_path("/usr"), "IMB-MPI1", sudo=True
server_node.get_pure_path("/"), "IMB-MPI1", sudo=True
)
assert_that(len(find_results)).described_as(
"Could not find location of IMB-MPI1 for Open MPI"
@@ -407,7 +416,7 @@ def verify_open_mpi(self, environment: Environment, log: Logger) -> None:

# IMB-MPI Tests
find_results = find.find_files(
server_node.get_pure_path("/usr"), "IMB-MPI1", sudo=True
server_node.get_pure_path("/"), "IMB-MPI1", sudo=True
)
assert_that(len(find_results)).described_as(
"Could not find location of Open MPI test: IMB-MPI1"
@@ -417,7 +426,7 @@ def verify_open_mpi(self, environment: Environment, log: Logger) -> None:
"Could not find location of Open MPI test: IMB-MPI1"
).is_not_empty()
server_node.execute(
f"/usr/local/bin/mpirun --host {server_ip},{client_ip} "
f"/usr/local/bin/mpirun -hosts {server_ip},{client_ip} "
"-n 2 --mca btl self,vader,openib --mca btl_openib_cq_size 4096 "
"--mca btl_openib_allow_ib 1 --mca "
f"btl_openib_warn_no_device_params_found 0 {test_path}",
@@ -571,6 +580,12 @@ def verify_mvapich_mpi(self, environment: Environment, log: Logger) -> None:
raise SkippedException(err)

run_in_parallel([server_ib.install_mvapich_mpi, client_ib.install_mvapich_mpi])
test_names = ["IMB-MPI1", "IMB-RMA", "IMB-NBC"]
# Only for mariner, we need to build intel benchmarking tools
# as they are not included in our packages
server_ib.install_intel_mpi_benchmarking_tool(tool_names=test_names)

server_node.execute("ldconfig", sudo=True)

# Restart the ssh sessions for changes to /etc/security/limits.conf
# to take effect
@@ -590,13 +605,15 @@ def verify_mvapich_mpi(self, environment: Environment, log: Logger) -> None:
client_ssh.enable_public_key(server_ssh.generate_key_pairs())
server_ssh.add_known_host(client_ip)
client_ssh.add_known_host(server_ip)
sudo=False
if isinstance(server_node.os, CBLMariner):
sudo=True

# Run MPI tests
find = server_node.tools[Find]
test_names = ["IMB-MPI1", "IMB-RMA", "IMB-NBC"]
for test in test_names:
find_results = find.find_files(
server_node.get_pure_path("/usr"), test, sudo=True
server_node.get_pure_path("/"), test, sudo=True
)
assert_that(len(find_results)).described_as(
f"Could not find location of MVAPICH MPI test: {test}"
@@ -611,6 +628,7 @@ def verify_mvapich_mpi(self, environment: Environment, log: Logger) -> None:
expected_exit_code=0,
expected_exit_code_failure_message=f"Failed {test} test "
"with MVAPICH MPI",
sudo=sudo
)

def _check_nd_enabled(self, node: Node) -> None:
Loading
Oops, something went wrong.