test(bpf): Add eBPF Test Suite
This commit adds a more comprehensive eBPF test suite.
It currently tests the operation of a number of key functions
within the eBPF code - for example, the main sched_switch
tracepoint handler that we run. In addition, it runs a set of
micro-benchmarks so we can track the performance of these
key pieces of code.

Signed-off-by: Dave Tucker <dave@dtucker.co.uk>
dave-tucker committed Jul 12, 2024
1 parent d3c2906 commit 9b9b1f2
Showing 23 changed files with 2,958 additions and 349 deletions.
46 changes: 26 additions & 20 deletions Makefile
@@ -41,10 +41,10 @@ CTR_CMD ?= $(or $(shell podman info > /dev/null 2>&1 && which podman)
# E.g. --tls-verify=false for local develop when using podman
CTR_CMD_PUSH_OPTIONS ?=

GENERAL_TAGS := 'include_gcs include_oss containers_image_openpgp gssapi providerless netgo osusergo'
GPU_TAGS := ' gpu '
GENERAL_TAGS := include_gcs include_oss containers_image_openpgp gssapi providerless netgo osusergo
GPU_TAGS := gpu
ifeq ($(shell ldconfig -p | grep -q libhlml.so && echo exists),exists)
GPU_TAGS := $(GPU_TAGS)'habana '
GPU_TAGS := $(GPU_TAGS) habana
endif

# set GOENV
@@ -64,12 +64,12 @@ LDFLAGS := $(LDFLAGS) \

DOCKERFILE := $(SRC_ROOT)/build/Dockerfile
IMAGE_BUILD_TAG := $(GIT_VERSION)-linux-$(GOARCH)
GO_BUILD_TAGS := $(GENERAL_TAGS)$(GOOS)$(GPU_TAGS)
GO_TEST_TAGS := $(GENERAL_TAGS)$(GOOS)
GO_BUILD_TAGS := '$(GENERAL_TAGS) $(GOOS) $(GPU_TAGS)'
GO_TEST_TAGS := '$(GENERAL_TAGS) $(GOOS)'

# for testsuite
ENVTEST_ASSETS_DIR=$(SRC_ROOT)/test-bin
export PATH := $(PATH):$(SRC_ROOT)/test-bin
export PATH := $(PATH):$(ENVTEST_ASSETS_DIR)

ifndef GOPATH
GOPATH := $(HOME)/go
@@ -79,6 +79,13 @@ ifndef GOBIN
GOBIN := $(GOPATH)/bin
endif

VERBOSE ?= 0
ifeq ($(VERBOSE),1)
go_test_args=-v
else
go_test_args=
endif

# NOTE: project related tools get installed to tmp dir which is ignored by
PROJECT_DIR := $(shell dirname $(abspath $(firstword $(MAKEFILE_LIST))))
TOOLS_DIR=$(PROJECT_DIR)/tmp/bin
@@ -163,6 +170,7 @@ build: clean_build_local _build_local copy_build_local ## Build binary and copy
.PHONY: generate
generate: ## Generate BPF code locally.
+@$(GOENV) go generate ./pkg/bpf
+@$(GOENV) go generate ./pkg/bpftest

_build_local: generate ## Build Kepler binary locally.
@echo TAGS=$(GO_BUILD_TAGS)
@@ -239,7 +247,7 @@ ginkgo-set:
mkdir -p $(GOBIN)
mkdir -p $(ENVTEST_ASSETS_DIR)
@test -f $(ENVTEST_ASSETS_DIR)/ginkgo || \
(go install -mod=mod github.com/onsi/ginkgo/v2/ginkgo@v2.15.0 && \
(go install -mod=mod github.com/onsi/ginkgo/v2/ginkgo@v2.19.0 && \
cp $(GOBIN)/ginkgo $(ENVTEST_ASSETS_DIR)/ginkgo)

.PHONY: container_test
@@ -259,21 +267,18 @@ container_test:
cd - && git config --global --add safe.directory /kepler && \
make VERBOSE=1 unit-test bench'

VERBOSE ?= 0
TMPDIR := $(shell mktemp -d)
TEST_PKGS := $(shell go list ./... | grep -v pkg/bpf | grep -v e2e)
SUDO?=sudo
SUDO_TEST_PKGS := $(shell go list ./... | grep pkg/bpf)
SUDO_TEST_PKGS := $(shell go list ./... | grep pkg/bpftest)

.PHONY: test
test: unit-test bpf-test bench ## Run all tests.

.PHONY: unit-test
unit-test: generate ginkgo-set tidy-vendor ## Run unit tests.
@echo TAGS=$(GO_TEST_TAGS)
$(if $(VERBOSE),@echo GOENV=$(GOENV))
@$(GOENV) go test -tags $(GO_TEST_TAGS) \
$(if $(VERBOSE),-v) \
$(go_test_args) \
-cover -covermode=atomic -coverprofile=coverage.out \
--race --count=1 \
$(TEST_PKGS)
@@ -282,18 +287,19 @@ unit-test: generate ginkgo-set tidy-vendor ## Run unit tests.
bench: ## Run benchmarks.
@echo TAGS=$(GO_TEST_TAGS)
$(GOENV) go test -tags $(GO_TEST_TAGS) \
$(if $(VERBOSE),-v) \
$(go_test_args) \
-test.run=dontrunanytests \
-bench=. --count=1 $(TEST_PKGS)

.PHONY: bpf-test
bpf-test: generate ## Run BPF tests.
for pkg in $(SUDO_TEST_PKGS); do \
$(GOENV) go test -c $$pkg -tags $(GO_TEST_TAGS) -cover \
-covermode=atomic -coverprofile=coverage.bpf.out \
-o $(TMPDIR)/$$(basename $$pkg).test && \
$(SUDO) $(TMPDIR)/$$(basename $$pkg).test; \
done
bpf-test: generate ginkgo-set ## Run BPF tests.
$(GOENV) $(ENVTEST_ASSETS_DIR)/ginkgo build \
-tags $(GO_TEST_TAGS) \
-cover \
--covermode=atomic \
./pkg/bpftest
$(SUDO) $(ENVTEST_ASSETS_DIR)/ginkgo \
./pkg/bpftest/bpftest.test

escapes_detect: tidy-vendor
@$(GOENV) go build -tags $(GO_BUILD_TAGS) -gcflags="-m -l" ./... 2>&1 | grep "escapes to heap" || true
249 changes: 6 additions & 243 deletions bpf/kepler.bpf.c
@@ -2,248 +2,17 @@
// Copyright 2021.

#include "kepler.bpf.h"
struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__type(key, u32);
__type(value, process_metrics_t);
__uint(max_entries, MAP_SIZE);
} processes SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__type(key, u32);
__type(value, u64);
__uint(max_entries, MAP_SIZE);
} pid_time_map SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__type(key, int);
__type(value, u32);
__uint(max_entries, NUM_CPUS);
} cpu_cycles_event_reader SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, u64);
__uint(max_entries, NUM_CPUS);
} cpu_cycles SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__type(key, int);
__type(value, u32);
__uint(max_entries, NUM_CPUS);
} cpu_instructions_event_reader SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, u64);
__uint(max_entries, NUM_CPUS);
} cpu_instructions SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__type(key, int);
__type(value, u32);
__uint(max_entries, NUM_CPUS);
} cache_miss_event_reader SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, u64);
__uint(max_entries, NUM_CPUS);
} cache_miss SEC(".maps");

// The sampling rate should be disabled by default because its impact on the
// measurements is unknown.
SEC(".rodata.config")
__attribute__((
btf_decl_tag("Sample Rate"))) static volatile const int SAMPLE_RATE = 0;

int counter_sched_switch = 0;

static inline u64 calc_delta(u64 *prev_val, u64 val)
{
u64 delta = 0;
// Probably a clock issue where the recorded on-CPU event had a
// timestamp later than the recorded off-CPU event, or vice versa.
if (prev_val && val > *prev_val)
delta = val - *prev_val;

return delta;
}

static inline u64 get_on_cpu_elapsed_time_us(u32 prev_pid, u64 curr_ts)
{
u64 cpu_time = 0;
u64 *prev_ts;

prev_ts = bpf_map_lookup_elem(&pid_time_map, &prev_pid);
if (prev_ts) {
cpu_time = calc_delta(prev_ts, curr_ts) / 1000;
bpf_map_delete_elem(&pid_time_map, &prev_pid);
}

return cpu_time;
}

static inline u64 get_on_cpu_cycles(u32 *cpu_id)
{
u64 delta, val, *prev_val;
long error;
struct bpf_perf_event_value c = {};

error = bpf_perf_event_read_value(
&cpu_cycles_event_reader, *cpu_id, &c, sizeof(c));
if (error)
return 0;

val = c.counter;
prev_val = bpf_map_lookup_elem(&cpu_cycles, cpu_id);
delta = calc_delta(prev_val, val);
bpf_map_update_elem(&cpu_cycles, cpu_id, &val, BPF_ANY);

return delta;
}

static inline u64 get_on_cpu_instr(u32 *cpu_id)
{
u64 delta, val, *prev_val;
long error;
struct bpf_perf_event_value c = {};

error = bpf_perf_event_read_value(
&cpu_instructions_event_reader, *cpu_id, &c, sizeof(c));
if (error)
return 0;

val = c.counter;
prev_val = bpf_map_lookup_elem(&cpu_instructions, cpu_id);
delta = calc_delta(prev_val, val);
bpf_map_update_elem(&cpu_instructions, cpu_id, &val, BPF_ANY);

return delta;
}

static inline u64 get_on_cpu_cache_miss(u32 *cpu_id)
{
u64 delta, val, *prev_val;
long error;
struct bpf_perf_event_value c = {};

error = bpf_perf_event_read_value(
&cache_miss_event_reader, *cpu_id, &c, sizeof(c));
if (error)
return 0;
val = c.counter;
prev_val = bpf_map_lookup_elem(&cache_miss, cpu_id);
delta = calc_delta(prev_val, val);
bpf_map_update_elem(&cache_miss, cpu_id, &val, BPF_ANY);

return delta;
}

static inline void register_new_process_if_not_exist()
{
u64 cgroup_id, pid_tgid;
u32 curr_tgid;
struct process_metrics_t *curr_tgid_metrics;

pid_tgid = bpf_get_current_pid_tgid();
curr_tgid = pid_tgid >> 32;

// create new process metrics
curr_tgid_metrics = bpf_map_lookup_elem(&processes, &curr_tgid);
if (!curr_tgid_metrics) {
cgroup_id = bpf_get_current_cgroup_id();
// the Kernel tgid is the user-space PID, and the Kernel pid is the
// user-space TID
process_metrics_t new_process = {
.pid = curr_tgid,
.cgroup_id = cgroup_id,
};
bpf_get_current_comm(&new_process.comm, sizeof(new_process.comm));
bpf_map_update_elem(
&processes, &curr_tgid, &new_process, BPF_NOEXIST);
}
}

static inline void collect_metrics_and_reset_counters(
struct process_metrics_t *buf, u32 prev_pid, u64 curr_ts, u32 cpu_id)
{
buf->cpu_cycles = get_on_cpu_cycles(&cpu_id);
buf->cpu_instr = get_on_cpu_instr(&cpu_id);
buf->cache_miss = get_on_cpu_cache_miss(&cpu_id);
// Get current time to calculate the previous task on-CPU time
buf->process_run_time = get_on_cpu_elapsed_time_us(prev_pid, curr_ts);
}

struct task_struct {
int pid;
unsigned int tgid;
} __attribute__((preserve_access_index));

SEC("tp_btf/sched_switch")
int kepler_sched_switch_trace(u64 *ctx)
{
u32 prev_pid, next_pid, cpu_id;
u64 prev_tgid;
unsigned int prev_state;
u64 curr_ts = bpf_ktime_get_ns();
struct task_struct *prev_task, *next_task;

struct process_metrics_t *curr_tgid_metrics, *prev_tgid_metrics;
struct process_metrics_t buf = {};

prev_task = (struct task_struct *)ctx[1];
next_task = (struct task_struct *)ctx[2];

prev_pid = (u32)prev_task->pid;
next_pid = (u32)next_task->pid;
prev_tgid = prev_task->tgid;
cpu_id = bpf_get_smp_processor_id();

// Collect metrics
// Regardless of skipping the collection, we need to update the hardware
// counter events to keep the metrics map current.
collect_metrics_and_reset_counters(&buf, prev_pid, curr_ts, cpu_id);

// Skip some samples to minimize overhead
// Note that we can only skip samples after updating the metric maps to
// collect the right values
if (SAMPLE_RATE > 0) {
if (counter_sched_switch > 0) {
counter_sched_switch--;
return 0;
}
counter_sched_switch = SAMPLE_RATE;
}

// The process_run_time is 0 if we do not have the previous timestamp of
// the task or due to a clock issue. In either case, we skip collecting
// all metrics to avoid discrepancies between the hardware counter and CPU
// time.
if (buf.process_run_time > 0) {
prev_tgid_metrics = bpf_map_lookup_elem(&processes, &prev_tgid);
if (prev_tgid_metrics) {
prev_tgid_metrics->process_run_time += buf.process_run_time;
prev_tgid_metrics->cpu_cycles += buf.cpu_cycles;
prev_tgid_metrics->cpu_instr += buf.cpu_instr;
prev_tgid_metrics->cache_miss += buf.cache_miss;
}
}

// Add task on-cpu running start time
bpf_map_update_elem(&pid_time_map, &next_pid, &curr_ts, BPF_ANY);

// create new process metrics
register_new_process_if_not_exist();

return 0;
return do_kepler_sched_switch_trace(
prev_task->pid, next_task->pid, prev_task->tgid, next_task->tgid);
}

SEC("tp_btf/softirq_entry")
@@ -266,12 +266,9 @@ SEC("fexit/mark_page_accessed")
int kepler_read_page_trace(void *ctx)
{
u32 curr_tgid;
struct process_metrics_t *process_metrics;

curr_tgid = bpf_get_current_pid_tgid() >> 32;
process_metrics = bpf_map_lookup_elem(&processes, &curr_tgid);
if (process_metrics)
process_metrics->page_cache_hit++;
curr_tgid = bpf_get_current_pid_tgid();
do_page_cache_hit_increment(curr_tgid);
return 0;
}

@@ -280,12 +280,9 @@ SEC("tp/writeback_dirty_folio")
int kepler_write_page_trace(void *ctx)
{
u32 curr_tgid;
struct process_metrics_t *process_metrics;

curr_tgid = bpf_get_current_pid_tgid() >> 32;
process_metrics = bpf_map_lookup_elem(&processes, &curr_tgid);
if (process_metrics)
process_metrics->page_cache_hit++;
curr_tgid = bpf_get_current_pid_tgid();
do_page_cache_hit_increment(curr_tgid);
return 0;
}
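
Both page-cache tracepoints above now delegate to do_page_cache_hit_increment, whose definition (presumably also in bpf/kepler.bpf.h) is not part of this diff. A minimal sketch reconstructed from the removed lines follows; note that the callers now pass bpf_get_current_pid_tgid() without the >> 32 shift, i.e. the low 32 bits (the thread id) rather than the tgid, so whether the helper re-derives the tgid or keys the map differently is not visible here.

// Hypothetical sketch, not the actual kepler.bpf.h contents; assumes the
// processes map removed above still keys per-process metrics by a u32.
static inline void do_page_cache_hit_increment(u32 curr_pid)
{
	struct process_metrics_t *process_metrics =
		bpf_map_lookup_elem(&processes, &curr_pid);

	if (process_metrics)
		process_metrics->page_cache_hit++;
}
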
