diff --git a/CMakeLists.txt b/CMakeLists.txt index 7eb87554..e9e3eea8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -102,6 +102,7 @@ find_package(StdFilesystem REQUIRED) find_package(Sensors) find_package(Veosinfo) find_package(PkgConfig) +find_package(BpfObject) if(PkgConfig_FOUND) pkg_check_modules(Audit audit) @@ -122,6 +123,9 @@ CMAKE_DEPENDENT_OPTION(USE_LIBAUDIT "Use libaudit for syscall name resolution" O add_feature_info("USE_LIBAUDIT" USE_LIBAUDIT "Use libaudit for syscall name resolution.") CMAKE_DEPENDENT_OPTION(USE_VEOSINFO "Use libveosinfo to sample NEC SX-Aurora Tsubasa cards." ON "Veosinfo_FOUND" OFF) add_feature_info("USE_VEOSINFO" USE_VEOSINFO "Use libveosinfo to sample NEC SX-Aurora Tsubasa cards.") +CMAKE_DEPENDENT_OPTION(USE_BPF "Use BPF for filename lookup" ON BpfObject_FOUND OFF) +add_feature_info("USE_BPF" USE_BPF "Use BPF for filename lookup") + # system configuration checks CHECK_INCLUDE_FILES(linux/hw_breakpoint.h HAVE_HW_BREAKPOINT_H) CHECK_STRUCT_HAS_MEMBER("struct perf_event_attr" clockid linux/perf_event.h HAVE_PERF_EVENT_ATTR_CLOCKID) @@ -349,6 +353,18 @@ target_include_directories(lo2s PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/include ) + +if (USE_BPF) + if (BpfObject_FOUND) + target_compile_definitions(lo2s PUBLIC HAVE_BPF) + bpf_object(open src/perf/posix_io/open.bpf.c) + add_dependencies(lo2s open_skel) + target_link_libraries(lo2s PRIVATE open_skel) + else() + message(SEND_ERROR "BPF not found but requested.") + endif() +endif() + add_subdirectory(man) message(STATUS "Linux kernel version: ${LINUX_VERSION}") diff --git a/cmake/FindBpfObject.cmake b/cmake/FindBpfObject.cmake new file mode 100644 index 00000000..3df222b1 --- /dev/null +++ b/cmake/FindBpfObject.cmake @@ -0,0 +1,189 @@ +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +#[=======================================================================[.rst: +FindBpfObject +-------- + +Find BpfObject + +This module finds if all the dependencies for eBPF 
Compile-Once-Run-Everywhere +programs are available and where all the components are located. + +The caller may set the following variables to disable automatic +search/processing for the associated component: + + ``BPFOBJECT_BPFTOOL_EXE`` + Path to ``bpftool`` binary + + ``BPFOBJECT_CLANG_EXE`` + Path to ``clang`` binary + + ``LIBBPF_INCLUDE_DIRS`` + Path to ``libbpf`` development headers + + ``LIBBPF_LIBRARIES`` + Path to ``libbpf`` library + + ``BPFOBJECT_VMLINUX_H`` + Path to ``vmlinux.h`` generated by ``bpftool``. If unset, this module will + attempt to automatically generate a copy. + +This module sets the following result variables: + +:: + + BpfObject_FOUND = TRUE if all components are found + + +This module also provides the ``bpf_object()`` macro. This macro generates a +cmake interface library for the BPF object's generated skeleton as well +as the associated dependencies. + +.. code-block:: cmake + + bpf_object(<name> <source>) + +Given an abstract ``<name>`` for a BPF object and the associated ``<source>`` +file, generates an interface library target, ``<name>_skel``, that may be +linked against by other cmake targets. 
+ +Example Usage: + +:: + + find_package(BpfObject REQUIRED) + bpf_object(myobject myobject.bpf.c) + add_executable(myapp myapp.c) + target_link_libraries(myapp myobject_skel) + +#]=======================================================================] + +if(NOT BPFOBJECT_BPFTOOL_EXE) + find_program(BPFOBJECT_BPFTOOL_EXE NAMES bpftool DOC "Path to bpftool executable") +endif() + +if(NOT BPFOBJECT_CLANG_EXE) + find_program(BPFOBJECT_CLANG_EXE NAMES clang DOC "Path to clang executable") + + execute_process(COMMAND ${BPFOBJECT_CLANG_EXE} --version + OUTPUT_VARIABLE CLANG_version_output + ERROR_VARIABLE CLANG_version_error + RESULT_VARIABLE CLANG_version_result + OUTPUT_STRIP_TRAILING_WHITESPACE) + + # Check that clang is new enough + if(${CLANG_version_result} EQUAL 0) + if("${CLANG_version_output}" MATCHES "clang version ([^\n]+)\n") + # Transform X.Y.Z into X;Y;Z which can then be interpreted as a list + set(CLANG_VERSION "${CMAKE_MATCH_1}") + string(REPLACE "." ";" CLANG_VERSION_LIST ${CLANG_VERSION}) + list(GET CLANG_VERSION_LIST 0 CLANG_VERSION_MAJOR) + + # Anything older than clang 10 doesn't really work + string(COMPARE LESS ${CLANG_VERSION_MAJOR} 10 CLANG_VERSION_MAJOR_LT10) + if(${CLANG_VERSION_MAJOR_LT10}) + message(FATAL_ERROR "clang ${CLANG_VERSION} is too old for BPF CO-RE") + endif() + + message(STATUS "Found clang version: ${CLANG_VERSION}") + else() + message(FATAL_ERROR "Failed to parse clang version string: ${CLANG_version_output}") + endif() + else() + message(FATAL_ERROR "Command \"${BPFOBJECT_CLANG_EXE} --version\" failed with output:\n${CLANG_version_error}") + endif() +endif() + +if(NOT LIBBPF_INCLUDE_DIRS OR NOT LIBBPF_LIBRARIES) + find_package(LibBpf) +endif() + +if(BPFOBJECT_VMLINUX_H) + get_filename_component(GENERATED_VMLINUX_DIR ${BPFOBJECT_VMLINUX_H} DIRECTORY) +elseif(BPFOBJECT_BPFTOOL_EXE) + # Generate vmlinux.h + set(GENERATED_VMLINUX_DIR ${CMAKE_CURRENT_BINARY_DIR}/include/lo2s) + set(BPFOBJECT_VMLINUX_H 
"${GENERATED_VMLINUX_DIR}/vmlinux.h") + +file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/include/lo2s") + execute_process(COMMAND ${BPFOBJECT_BPFTOOL_EXE} btf dump file /sys/kernel/btf/vmlinux format c + OUTPUT_FILE ${BPFOBJECT_VMLINUX_H} + ERROR_VARIABLE VMLINUX_error + RESULT_VARIABLE VMLINUX_result) + if(${VMLINUX_result} EQUAL 0) + set(VMLINUX ${BPFOBJECT_VMLINUX_H}) + else() + message(FATAL_ERROR "Failed to dump vmlinux.h from BTF: ${VMLINUX_error}") + endif() +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(BpfObject + REQUIRED_VARS + BPFOBJECT_BPFTOOL_EXE + BPFOBJECT_CLANG_EXE + LIBBPF_INCLUDE_DIRS + LIBBPF_LIBRARIES + GENERATED_VMLINUX_DIR) + +# Get clang bpf system includes +execute_process( + COMMAND bash -c "${BPFOBJECT_CLANG_EXE} -v -E - < /dev/null 2>&1 | + sed -n '/<...> search starts here:/,/End of search list./{ s| \\(/.*\\)|-idirafter \\1|p }'" + OUTPUT_VARIABLE CLANG_SYSTEM_INCLUDES_output + ERROR_VARIABLE CLANG_SYSTEM_INCLUDES_error + RESULT_VARIABLE CLANG_SYSTEM_INCLUDES_result + OUTPUT_STRIP_TRAILING_WHITESPACE) +if(${CLANG_SYSTEM_INCLUDES_result} EQUAL 0) + separate_arguments(CLANG_SYSTEM_INCLUDES UNIX_COMMAND ${CLANG_SYSTEM_INCLUDES_output}) + message(STATUS "BPF system include flags: ${CLANG_SYSTEM_INCLUDES}") +else() + message(FATAL_ERROR "Failed to determine BPF system includes: ${CLANG_SYSTEM_INCLUDES_error}") +endif() + +# Get target arch +execute_process(COMMAND uname -m + COMMAND sed -e "s/x86_64/x86/" -e "s/aarch64/arm64/" -e "s/ppc64le/powerpc/" -e "s/mips.*/mips/" + OUTPUT_VARIABLE ARCH_output + ERROR_VARIABLE ARCH_error + RESULT_VARIABLE ARCH_result + OUTPUT_STRIP_TRAILING_WHITESPACE) +if(${ARCH_result} EQUAL 0) + set(ARCH ${ARCH_output}) + message(STATUS "BPF target arch: ${ARCH}") +else() + message(FATAL_ERROR "Failed to determine target architecture: ${ARCH_error}") +endif() + +# Public macro +macro(bpf_object name input) + set(BPF_C_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${input}) + 
set(BPF_O_FILE ${CMAKE_CURRENT_BINARY_DIR}/${name}.bpf.o) + set(BPF_SKEL_FILE ${CMAKE_CURRENT_BINARY_DIR}/include/lo2s/${name}.skel.h) + message(STATUS ${CMAKE_CURRENT_BINARY_DIR}) + set(OUTPUT_TARGET ${name}_skel) + + # Build BPF object file + add_custom_command(OUTPUT ${BPF_O_FILE} + COMMAND ${BPFOBJECT_CLANG_EXE} -g -O2 -target bpf -D__TARGET_ARCH_${ARCH} + ${CLANG_SYSTEM_INCLUDES} -I${GENERATED_VMLINUX_DIR} + -I${CMAKE_SOURCE_DIR}/include + -isystem ${LIBBPF_INCLUDE_DIRS} -c ${BPF_C_FILE} -o ${BPF_O_FILE} + COMMAND_EXPAND_LISTS + VERBATIM + DEPENDS ${BPF_C_FILE} + COMMENT "[clang] Building BPF object: ${name}") + + # Build BPF skeleton header + add_custom_command(OUTPUT ${BPF_SKEL_FILE} + COMMAND bash -c "${BPFOBJECT_BPFTOOL_EXE} gen skeleton ${BPF_O_FILE} > ${BPF_SKEL_FILE}" + VERBATIM + DEPENDS ${BPF_O_FILE} + COMMENT "[skel] Building BPF skeleton: ${name}") + + add_library(${OUTPUT_TARGET} INTERFACE) + target_sources(${OUTPUT_TARGET} INTERFACE ${BPF_SKEL_FILE}) + target_include_directories(${OUTPUT_TARGET} INTERFACE ${CMAKE_CURRENT_BINARY_DIR}) + target_include_directories(${OUTPUT_TARGET} SYSTEM INTERFACE ${LIBBPF_INCLUDE_DIRS}) + target_link_libraries(${OUTPUT_TARGET} INTERFACE ${LIBBPF_LIBRARIES} -lelf -lz) +endmacro() diff --git a/cmake/FindLibBpf.cmake b/cmake/FindLibBpf.cmake new file mode 100644 index 00000000..cd558f55 --- /dev/null +++ b/cmake/FindLibBpf.cmake @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +find_path(LIBBPF_INCLUDE_DIRS + NAMES + bpf/bpf.h + bpf/btf.h + bpf/libbpf.h + PATHS + /usr/include + /usr/local/include + /opt/local/include + /sw/include + ENV CPATH) + +find_library(LIBBPF_LIBRARIES + NAMES + bpf + PATHS + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ENV LIBRARY_PATH + ENV LD_LIBRARY_PATH) + +include (FindPackageHandleStandardArgs) + +FIND_PACKAGE_HANDLE_STANDARD_ARGS(LibBpf "Please install the libbpf development package" + LIBBPF_LIBRARIES + LIBBPF_INCLUDE_DIRS) + 
+mark_as_advanced(LIBBPF_INCLUDE_DIRS LIBBPF_LIBRARIES) diff --git a/include/lo2s/config.hpp b/include/lo2s/config.hpp index 76427bf8..083b0458 100644 --- a/include/lo2s/config.hpp +++ b/include/lo2s/config.hpp @@ -91,6 +91,8 @@ struct Config bool use_x86_energy; // block I/O bool use_block_io; + // posix I/O + bool use_posix_io; // syscalls bool use_syscalls = false; std::vector syscall_filter; diff --git a/include/lo2s/measurement_scope.hpp b/include/lo2s/measurement_scope.hpp index d691f92a..250eca10 100644 --- a/include/lo2s/measurement_scope.hpp +++ b/include/lo2s/measurement_scope.hpp @@ -32,6 +32,7 @@ enum class MeasurementScopeType USERSPACE_METRIC, BIO, SYSCALL, + POSIX_IO, UNKNOWN }; @@ -73,6 +74,11 @@ struct MeasurementScope return { MeasurementScopeType::SYSCALL, s }; } + static MeasurementScope posix_io(ExecutionScope s) + { + return { MeasurementScopeType::POSIX_IO, s }; + } + friend bool operator==(const MeasurementScope& lhs, const MeasurementScope& rhs) { return (lhs.scope == rhs.scope) && lhs.type == rhs.type; @@ -103,6 +109,8 @@ struct MeasurementScope return fmt::format("block layer I/O events for {}", scope.name()); case MeasurementScopeType::SYSCALL: return fmt::format("syscall events for {}", scope.name()); + case MeasurementScopeType::POSIX_IO: + return fmt::format("POSIX I/O events for {}", scope.name()); default: throw new std::runtime_error("Unknown ExecutionScopeType!"); } diff --git a/include/lo2s/monitor/bio_monitor.hpp b/include/lo2s/monitor/bio_monitor.hpp new file mode 100644 index 00000000..e69de29b diff --git a/include/lo2s/monitor/poll_monitor.hpp b/include/lo2s/monitor/poll_monitor.hpp index 301d3e67..5d5e46e8 100644 --- a/include/lo2s/monitor/poll_monitor.hpp +++ b/include/lo2s/monitor/poll_monitor.hpp @@ -55,7 +55,7 @@ class PollMonitor : public ThreadedMonitor void add_fd(int fd); - virtual void monitor([[maybe_unused]] int fd){}; + virtual void monitor([[maybe_unused]] int fd) {}; struct pollfd& stop_pfd() { diff --git 
a/include/lo2s/monitor/posix_monitor.hpp b/include/lo2s/monitor/posix_monitor.hpp new file mode 100644 index 00000000..3ad1fd7a --- /dev/null +++ b/include/lo2s/monitor/posix_monitor.hpp @@ -0,0 +1,293 @@ +/* + * This file is part of the lo2s software. + * Linux OTF2 sampling + * + * Copyright (c) 2016, + * Technische Universitaet Dresden, Germany + * + * lo2s is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * lo2s is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with lo2s. If not, see . + */ + +#pragma once + +#include "otf2xx/common.hpp" +#include "otf2xx/event/io_create_handle.hpp" +#include "otf2xx/event/io_delete_file.hpp" +#include "otf2xx/event/io_destroy_handle.hpp" +#include +#include + +extern "C" +{ +#include +#include +#include +#include +} + +namespace lo2s +{ +namespace monitor +{ +class PosixMonitor : public ThreadedMonitor +{ +public: + struct RingBufferDeleter + { + void operator()(struct ring_buffer* rb) + { + ring_buffer__free(rb); + } + }; + + struct SkelDeleter + { + void operator()(struct open_bpf* skel) + { + open_bpf__destroy(skel); + } + }; + + PosixMonitor(trace::Trace& trace) + : ThreadedMonitor(trace, "open() monitor"), trace_(trace), + time_converter_(perf::time::Converter::instance()) + { + // Need to bump memlock rlimit to run anything but the most trivial BPF programs + + struct rlimit rlim_new; + rlim_new.rlim_cur = RLIM_INFINITY; + rlim_new.rlim_max = RLIM_INFINITY; + + if (setrlimit(RLIMIT_MEMLOCK, &rlim_new)) + { + throw_errno(); + } + + skel_ = 
std::unique_ptr(open_bpf__open_and_load()); + if (!skel_) + { + return; + } + + open_bpf__attach(skel_.get()); + + rb_ = std::unique_ptr( + ring_buffer__new(bpf_map__fd(skel_.get()->maps.rb), event_cb, this, NULL)); + + if (!rb_) + { + return; + } + } + + static int libbpf_print_fn(enum libbpf_print_level level, const char* format, va_list args) + { + if (level > LIBBPF_INFO) + return 0; + return vfprintf(stderr, format, args); + } + + void insert_thread(Thread thread [[maybe_unused]]) + { + char insert = 1; + pid_t pid = thread.as_pid_t(); + bpf_map__update_elem(skel_->maps.pids, &pid, sizeof(pid), &insert, sizeof(char), BPF_ANY); + + last_fd_[thread] = -1; + } + + void exit_thread(Thread thread [[maybe_unused]]) + { + pid_t pid = thread.as_pid_t(); + bpf_map__delete_elem(skel_->maps.pids, &pid, sizeof(pid), BPF_ANY); + } + + void initialize_thread() override + { + } + + void handle_event(void* data, size_t datasz [[maybe_unused]]) + { + struct posix_event_header* e = (struct posix_event_header*)data; + ThreadFd thread_fd = ThreadFd(e->fd, e->pid); + Thread thread = Thread(e->pid); + std::string filename; + if (e->fd == 0) + { + filename = "stdin"; + } + else if (e->fd == 1) + { + filename = "stdout"; + } + else if (e->fd == 2) + { + filename = "stderr"; + } + if (e->type == OPEN) + { + struct open_event* e = (struct open_event*)data; + + if (e->header.fd >= 3) + { + filename = std::string(e->filename); + } + if (!instance_.count(thread_fd)) + { + instance_.emplace(thread_fd, 0); + } + else + { + instance_[thread_fd] = instance_[thread_fd] + 1; + } + auto& handle = trace_.posix_io_handle(Thread(e->header.pid), e->header.fd, + instance_[thread_fd], filename); + otf2::writer::local& writer = trace_.posix_io_writer(thread); + + writer << otf2::event::io_create_handle(time_converter_(e->header.time), handle, + otf2::common::io_access_mode_type::read_write, + otf2::common::io_creation_flag_type::none, + otf2::common::io_status_flag_type::none); + } + if (e->type == 
CLOSE) + { + auto& handle = + trace_.posix_io_handle(Thread(e->pid), e->fd, instance_[thread_fd], filename); + + otf2::writer::local& writer = trace_.posix_io_writer(thread); + + writer << otf2::event::io_destroy_handle(time_converter_(e->time), handle); + } + + otf2::common::io_operation_mode_type mode = otf2::common::io_operation_mode_type::flush; + if (e->type == READ_ENTER || e->type == READ_EXIT) + { + mode = otf2::common::io_operation_mode_type::read; + } + else + { + mode = otf2::common::io_operation_mode_type::write; + } + + if (e->type == READ_ENTER || e->type == WRITE_ENTER) + { + struct read_write_event* event = (read_write_event*)data; + last_fd_[thread] = event->header.fd; + last_buf_[thread] = event->buf; + last_count_[thread] = event->count; + + otf2::writer::local& writer = trace_.posix_io_writer(thread); + otf2::definition::io_handle& handle = + trace_.posix_io_handle(thread, event->header.fd, instance_[thread_fd], filename); + writer << otf2::event::io_operation_begin( + time_converter_(event->header.time), handle, mode, + otf2::common::io_operation_flag_type::non_blocking, event->count, event->buf); + } + else if (e->type == READ_EXIT || e->type == WRITE_EXIT) + { + Thread thread(e->pid); + + if (last_fd_[thread] == -1) + { + return; + } + otf2::writer::local& writer = trace_.posix_io_writer(thread); + otf2::definition::io_handle& handle = trace_.posix_io_handle( + thread, last_fd_[thread], instance_[ThreadFd(last_fd_[thread], thread.as_pid_t())], + filename); + + writer << otf2::event::io_operation_complete(time_converter_(e->time), handle, + last_count_[thread], last_buf_[thread]); + last_fd_[thread] = -1; + } + } + + void run() override + { + while (!stop_) + { + ring_buffer__poll(rb_.get(), 100); + } + } + + void finalize_thread() override + { + } + + void stop() override + { + stop_ = true; + thread_.join(); + } + + static int event_cb(void* ctx, void* data, size_t data_sz) + { + ((PosixMonitor*)ctx)->handle_event(data, data_sz); + return 0; 
+ } + + void monitor() + { + } + + std::string group() const override + { + return "PosixMonitor"; + } + +private: + struct ThreadFd + { + ThreadFd() : fd(-1), thread(Thread::invalid()) + { + } + + ThreadFd(int fd, pid_t thread) : fd(fd), thread(Thread(thread)) + { + } + + friend bool operator<(const ThreadFd& lhs, const ThreadFd& rhs) + { + if (lhs.fd == rhs.fd) + { + return lhs.thread < rhs.thread; + } + + return lhs.fd < rhs.fd; + } + + friend bool operator==(const ThreadFd& lhs, const ThreadFd& rhs) + { + return lhs.fd == rhs.fd && lhs.thread == rhs.thread; + } + + private: + int fd; + Thread thread; + }; + + trace::Trace& trace_; + perf::time::Converter& time_converter_; + + std::map last_fd_; + std::map last_count_; + std::map last_buf_; + std::map instance_; + std::unique_ptr rb_; + std::unique_ptr skel_; + bool stop_ = false; +}; + +} // namespace monitor +} // namespace lo2s diff --git a/include/lo2s/monitor/process_monitor.hpp b/include/lo2s/monitor/process_monitor.hpp index d5463c63..ef955987 100644 --- a/include/lo2s/monitor/process_monitor.hpp +++ b/include/lo2s/monitor/process_monitor.hpp @@ -22,6 +22,7 @@ #pragma once #include #include +#include #include #include @@ -53,6 +54,7 @@ class ProcessMonitor : public AbstractProcessMonitor, public MainMonitor private: std::map threads_; + std::unique_ptr posix_monitor_; }; } // namespace monitor } // namespace lo2s diff --git a/include/lo2s/perf/posix_io/common.h b/include/lo2s/perf/posix_io/common.h new file mode 100644 index 00000000..7e4c37a2 --- /dev/null +++ b/include/lo2s/perf/posix_io/common.h @@ -0,0 +1,88 @@ +/* + * This file is part of the lo2s software. + * Linux OTF2 sampling + * + * Copyright (c) 2016, + * Technische Universitaet Dresden, Germany + * + * lo2s is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * lo2s is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with lo2s. If not, see . + */ + +#pragma once + +enum type + { + OPEN, + CLOSE, + READ_ENTER, + READ_EXIT, + WRITE_ENTER, + WRITE_EXIT + }; + + + +struct syscalls_sys_enter_openat +{ + unsigned long __syscall_nr; + char padding[4]; + unsigned long long dfd; + char* filename; + unsigned long long flags; + unsigned long long mode; +}; + +struct syscalls_sys_exit_openat +{ + long __syscall_nr; + char padding[4]; + long ret; +}; + +struct syscalls_sys_enter_rw +{ + long __syscall_nr; + char padding0 [4]; + unsigned long long fd; + unsigned long long buf; + unsigned long long count; +}; + +struct syscalls_sys_enter_close +{ + long __syscall_nr; + char padding0 [4]; + unsigned long long fd; +}; + +struct posix_event_header +{ + int type; + unsigned long long time; + int pid; + int fd; +}; + +struct open_event +{ + struct posix_event_header header; + char filename[256]; +}; + +struct read_write_event +{ + struct posix_event_header header; + uint64_t buf; + uint64_t count; +}; diff --git a/include/lo2s/trace/reg_keys.hpp b/include/lo2s/trace/reg_keys.hpp index ad269b2e..73a54cf9 100644 --- a/include/lo2s/trace/reg_keys.hpp +++ b/include/lo2s/trace/reg_keys.hpp @@ -151,6 +151,48 @@ struct ByNecDeviceTag using ByNecDevice = SimpleKeyType; +struct ThreadFdInstance +{ +public: + ThreadFdInstance() : thread(Thread::invalid()), fd(-1), instance(0) + { + } + + ThreadFdInstance(Thread thread, int fd, int instance) + : thread(thread), fd(fd), instance(instance) + { + } + + friend bool operator==(const ThreadFdInstance& lhs, const ThreadFdInstance& rhs) + { + return lhs.thread == rhs.thread && lhs.fd == rhs.fd && lhs.instance == rhs.instance; + } + + friend 
bool operator<(const ThreadFdInstance& lhs, const ThreadFdInstance& rhs) + { + if (lhs.thread == rhs.thread) + { + if (lhs.fd == rhs.fd) + { + return lhs.instance < rhs.instance; + } + return lhs.fd < rhs.fd; + } + return lhs.thread < rhs.thread; + } + +private: + Thread thread; + int fd; + int instance; +}; + +struct ByThreadFdInstanceTag +{ +}; + +using ByThreadFdInstance = SimpleKeyType; + template struct Holder { @@ -188,13 +230,15 @@ struct Holder template <> struct Holder { - using type = otf2::lookup_definition_holder; + using type = otf2::lookup_definition_holder; }; template <> struct Holder { - using type = otf2::lookup_definition_holder; + using type = + otf2::lookup_definition_holder; }; template <> @@ -242,7 +286,8 @@ struct Holder template <> struct Holder { - using type = otf2::lookup_definition_holder; + using type = otf2::lookup_definition_holder; }; template <> diff --git a/include/lo2s/trace/trace.hpp b/include/lo2s/trace/trace.hpp index c687ab6a..36d858c2 100644 --- a/include/lo2s/trace/trace.hpp +++ b/include/lo2s/trace/trace.hpp @@ -136,8 +136,11 @@ class Trace otf2::writer::local& bio_writer(BlockDevice dev); otf2::writer::local& create_metric_writer(const std::string& name); otf2::writer::local& nec_writer(NecDevice device, const Thread& nec_thread); + otf2::writer::local& posix_io_writer(Thread thread); otf2::definition::io_handle& block_io_handle(BlockDevice dev); + otf2::definition::io_handle& posix_io_handle(Thread thread, int fd, int instance, + std::string& name); otf2::definition::metric_member metric_member(const std::string& name, const std::string& description, @@ -358,6 +361,9 @@ class Trace otf2::definition::detail::weak_ref bio_paradigm_; otf2::definition::detail::weak_ref bio_comm_group_; + otf2::definition::detail::weak_ref posix_paradigm_; + otf2::definition::detail::weak_ref posix_comm_group_; + const otf2::definition::system_tree_node& system_tree_root_node_; ExecutionScopeGroup& groups_; diff --git a/include/lo2s/types.hpp 
b/include/lo2s/types.hpp index 5d8a33b9..b528bb7c 100644 --- a/include/lo2s/types.hpp +++ b/include/lo2s/types.hpp @@ -64,6 +64,11 @@ class Thread return lhs.tid_ < rhs.tid_; } + friend bool operator>(const Thread& lhs, const Thread& rhs) + { + return lhs.tid_ > rhs.tid_; + } + friend bool operator!(const Thread& thread) { return thread.tid_ == -1; diff --git a/src/config.cpp b/src/config.cpp index ca9c5320..13798dce 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -353,6 +353,9 @@ void parse_program_options(int argc, const char** argv) .metavar("MSEC") .default_value("100"); + io_options.toggle("posix-io", + "Enable recording of POSIX I/o events (requires access to debugfs)"); + nitro::options::arguments arguments; try { @@ -380,6 +383,7 @@ void parse_program_options(int argc, const char** argv) config.use_sensors = arguments.given("sensors"); config.use_block_io = arguments.given("block-io"); config.use_nec = arguments.given("nec"); + config.use_posix_io = arguments.given("posix-io"); config.command = arguments.positionals(); if (arguments.given("help")) diff --git a/src/monitor/process_monitor.cpp b/src/monitor/process_monitor.cpp index 3b7cbeee..963db60f 100644 --- a/src/monitor/process_monitor.cpp +++ b/src/monitor/process_monitor.cpp @@ -31,6 +31,11 @@ namespace monitor ProcessMonitor::ProcessMonitor() : MainMonitor() { + if (config().use_posix_io) + { + posix_monitor_ = std::make_unique(trace_); + posix_monitor_->start(); + } trace_.add_monitoring_thread(gettid(), "ProcessMonitor", "ProcessMonitor"); } @@ -43,6 +48,10 @@ void ProcessMonitor::insert_process(Process parent, Process process, std::string void ProcessMonitor::insert_thread(Process process, Thread thread, std::string name, bool spawn) { + if (posix_monitor_) + { + posix_monitor_->insert_thread(thread); + } trace_.add_thread(thread, name); if (config().sampling) @@ -79,6 +88,10 @@ void ProcessMonitor::update_process_name(Process process, const std::string& nam void 
ProcessMonitor::exit_thread(Thread thread) { + if (posix_monitor_) + { + posix_monitor_->exit_thread(thread); + } if (threads_.count(thread) != 0) { threads_.at(thread).stop(); @@ -92,6 +105,11 @@ ProcessMonitor::~ProcessMonitor() { thread.second.stop(); } + + if (posix_monitor_) + { + posix_monitor_->stop(); + } } } // namespace monitor } // namespace lo2s diff --git a/src/monitor/threaded_monitor.cpp b/src/monitor/threaded_monitor.cpp index a5d1b467..18686995 100644 --- a/src/monitor/threaded_monitor.cpp +++ b/src/monitor/threaded_monitor.cpp @@ -39,7 +39,6 @@ ThreadedMonitor::ThreadedMonitor(trace::Trace& trace, const std::string& name) ThreadedMonitor::~ThreadedMonitor() { summary().record_perf_wakeups(num_wakeups_); - assert(!thread_.joinable()); } void ThreadedMonitor::start() diff --git a/src/perf/event_provider.cpp b/src/perf/event_provider.cpp index 3dd434ac..766bfb02 100644 --- a/src/perf/event_provider.cpp +++ b/src/perf/event_provider.cpp @@ -49,10 +49,7 @@ extern "C" namespace { -#define PERF_EVENT(name, type, id) \ - { \ - (name), (type), (id) \ - } +#define PERF_EVENT(name, type, id) { (name), (type), (id) } #define PERF_EVENT_HW(name, id) PERF_EVENT(name, PERF_TYPE_HARDWARE, PERF_COUNT_HW_##id) #define PERF_EVENT_SW(name, id) PERF_EVENT(name, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_##id) diff --git a/src/perf/posix_io/open.bpf.c b/src/perf/posix_io/open.bpf.c new file mode 100644 index 00000000..87534426 --- /dev/null +++ b/src/perf/posix_io/open.bpf.c @@ -0,0 +1,225 @@ +/* + * This file is part of the lo2s software. + * Linux OTF2 sampling + * + * Copyright (c) 2016, + * Technische Universitaet Dresden, Germany + * + * lo2s is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * lo2s is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with lo2s. If not, see . + */ + +// The generated vmlinux.h headers has to go first +// clang-format off +#include +// clang-format on + +#include +#include +#include +#include + +char LICENSE[] SEC("license") = "GPL"; + +struct +{ + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 256 * 1024); +} rb SEC(".maps"); + +struct +{ + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 256 * 1024); + __type(key, u32); + __type(value, char[256]); +} open_cache SEC(".maps"); + +struct +{ + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 256 * 1024); + __type(key, u32); + __type(value, char); +} pids SEC(".maps"); + +SEC("kprobe/do_filp_open") + +int BPF_KPROBE(do_filp_open, int dfd, struct filename* fn, const struct open_flags* op) +{ + u32 pid = bpf_get_current_pid_tgid(); + + if (!bpf_map_lookup_elem(&pids, &pid)) + return 0; + + char name[256]; + char* name_ptr = BPF_CORE_READ(fn, name); + bpf_probe_read_kernel_str(name, 256, name_ptr); + bpf_map_update_elem(&open_cache, &pid, name, BPF_ANY); + return 0; +} + +SEC("tp/syscalls/sys_exit_openat") + +int handle_openat_ret(struct syscalls_sys_exit_openat* ctx) +{ + if (ctx->ret < 0) + return 0; + + u32 pid = bpf_get_current_pid_tgid(); + + if (!bpf_map_lookup_elem(&pids, &pid)) + return 0; + + char* filename = bpf_map_lookup_elem(&open_cache, &pid); + + if (filename == 0) + return 0; + + struct open_event* e; + + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) + { + return 0; + } + + __builtin_memcpy(e->filename, filename, 256); + e->header.type = OPEN; + e->header.pid = pid; + e->header.fd = ctx->ret; + e->header.time = bpf_ktime_get_ns(); + bpf_ringbuf_submit(e, 0); 
+ return 0; +} + +SEC("tp/syscalls/sys_enter_close") + +int handle_close(struct syscalls_sys_enter_close* ctx) +{ + u32 pid = bpf_get_current_pid_tgid(); + + if (!bpf_map_lookup_elem(&pids, &pid)) + return 0; + + struct posix_event_header* e; + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + + if (!e) + return 0; + + e->pid = pid; + e->time = bpf_ktime_get_ns(); + e->type = CLOSE; + e->fd = ctx->fd; + + bpf_ringbuf_submit(e, 0); + + return 0; +} + +SEC("tp/syscalls/sys_enter_read") + +int handle_enter_read(struct syscalls_sys_enter_rw* ctx) +{ + u32 pid = bpf_get_current_pid_tgid(); + + if (!bpf_map_lookup_elem(&pids, &pid)) + return 0; + + struct read_write_event* e; + + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + + if (!e) + return 0; + e->header.pid = pid; + e->header.time = bpf_ktime_get_ns(); + e->header.type = READ_ENTER; + e->header.fd = ctx->fd; + e->count = ctx->count; + e->buf = ctx->buf; + bpf_ringbuf_submit(e, 0); + return 0; +} + +SEC("tp/syscalls/sys_enter_write") + +int handle_enter_write(struct syscalls_sys_enter_rw* ctx) +{ + u32 pid = bpf_get_current_pid_tgid(); + + if (!bpf_map_lookup_elem(&pids, &pid)) + return 0; + struct read_write_event* e; + + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + + if (!e) + return 0; + e->header.pid = pid; + e->header.time = bpf_ktime_get_ns(); + e->header.type = WRITE_ENTER; + e->header.fd = ctx->fd; + + e->count = ctx->count; + e->buf = ctx->buf; + bpf_ringbuf_submit(e, 0); + return 0; +} + +SEC("tp/syscalls/sys_exit_read") + +int handle_exit_read(void* ctx) +{ + u32 pid = bpf_get_current_pid_tgid(); + + if (!bpf_map_lookup_elem(&pids, &pid)) + return 0; + + struct posix_event_header* e; + + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + + if (!e) + return 0; + e->pid = pid; + e->time = bpf_ktime_get_ns(); + e->type = READ_EXIT; + + bpf_ringbuf_submit(e, 0); + return 0; +} + +SEC("tp/syscalls/sys_exit_write") + +int handle_exit_write(void* ctx) +{ + u32 pid = bpf_get_current_pid_tgid(); + if 
(!bpf_map_lookup_elem(&pids, &pid)) + return 0; + + struct posix_event_header* e; + + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + + if (!e) + return 0; + + e->pid = pid; + e->time = bpf_ktime_get_ns(); + e->type = WRITE_EXIT; + + bpf_ringbuf_submit(e, 0); + return 0; +} diff --git a/src/trace/trace.cpp b/src/trace/trace.cpp index 46df1980..819be65d 100644 --- a/src/trace/trace.cpp +++ b/src/trace/trace.cpp @@ -201,6 +201,20 @@ Trace::Trace() intern("NEC sampling timer"), otf2::common::interrupt_generator_mode_type::count, otf2::common::base_type::decimal, 0, config().sampling_period); } + + if (config().use_posix_io) + { + + const std::vector properties; + const std::vector values; + posix_paradigm_ = registry_.create( + intern("POSIX"), intern("POSIX I/O"), otf2::common::io_paradigm_class_type::parallel, + otf2::common::io_paradigm_flag_type::os, properties, values); + + posix_comm_group_ = registry_.create( + intern("POSIX I/O files"), otf2::common::paradigm_type::hardware, + otf2::common::group_flag_type::none); + } } void Trace::begin_record() @@ -479,6 +493,19 @@ otf2::writer::local& Trace::bio_writer(BlockDevice dev) return archive_(intern_location); } +otf2::writer::local& Trace::posix_io_writer(Thread thread) +{ + MeasurementScope scope = MeasurementScope::posix_io(thread.as_scope()); + + const auto& intern_location = registry_.emplace( + ByMeasurementScope(scope), intern(scope.name()), + registry_.get( + ByExecutionScope(groups_.get_parent(thread.as_scope()))), + otf2::definition::location::location_type::cpu_thread); + + return archive_(intern_location); +} + otf2::writer::local& Trace::create_metric_writer(const std::string& name) { const auto& location = registry_.create( @@ -530,6 +557,42 @@ otf2::definition::io_handle& Trace::block_io_handle(BlockDevice dev) return handle; } +otf2::definition::io_handle& Trace::posix_io_handle(Thread thread, int fd, int instance, + std::string& name) +{ + ThreadFdInstance id(thread, fd, instance); + if 
(registry_.has(ByThreadFdInstance(id))) + { + return registry_.get(ByThreadFdInstance(id)); + } + const auto& filename = intern(name); + + const auto& file = registry_.emplace( + ByString(name), filename, system_tree_root_node_); + + auto& handle = registry_.emplace( + ByThreadFdInstance(id), filename, file, posix_paradigm_, + otf2::common::io_handle_flag_type::none, + registry_.get(ByProcess(groups_.get_process(thread)))); + + // Mark stdin, stdout and stderr as pre-created, because they are + if (fd < 3) + { + otf2::common::io_access_mode_type mode; + if (fd == 0) + { + mode = otf2::common::io_access_mode_type::read_only; + } + else + { + mode = otf2::common::io_access_mode_type::write_only; + } + registry_.create( + handle, mode, otf2::common::io_status_flag_type::none); + } + return handle; +} + otf2::definition::metric_member Trace::metric_member(const std::string& name, const std::string& description, otf2::common::metric_mode mode, otf2::common::type value_type,