test(bpf): Add eBPF Test Suite
This commit adds a more comprehensive eBPF test suite.
It currently tests the operation of a number of key functions
within the eBPF code - for example, the main sched_switch
tracepoint handler that we run. In addition, it runs a set of
micro-benchmarks so we can track the performance of these
key pieces of code.

Signed-off-by: Dave Tucker <dave@dtucker.co.uk>
dave-tucker committed Jul 12, 2024
1 parent d3c2906 commit 9b9b1f2
Showing 23 changed files with 2,958 additions and 349 deletions.
46 changes: 26 additions & 20 deletions Makefile
@@ -41,10 +41,10 @@ CTR_CMD ?= $(or $(shell podman info > /dev/null 2>&1 && which podman)
# E.g. --tls-verify=false for local develop when using podman
CTR_CMD_PUSH_OPTIONS ?=

GENERAL_TAGS := 'include_gcs include_oss containers_image_openpgp gssapi providerless netgo osusergo'
GPU_TAGS := ' gpu '
GENERAL_TAGS := include_gcs include_oss containers_image_openpgp gssapi providerless netgo osusergo
GPU_TAGS := gpu
ifeq ($(shell ldconfig -p | grep -q libhlml.so && echo exists),exists)
GPU_TAGS := $(GPU_TAGS)'habana '
GPU_TAGS := $(GPU_TAGS) habana
endif

# set GOENV
@@ -64,12 +64,12 @@ LDFLAGS := $(LDFLAGS) \

DOCKERFILE := $(SRC_ROOT)/build/Dockerfile
IMAGE_BUILD_TAG := $(GIT_VERSION)-linux-$(GOARCH)
GO_BUILD_TAGS := $(GENERAL_TAGS)$(GOOS)$(GPU_TAGS)
GO_TEST_TAGS := $(GENERAL_TAGS)$(GOOS)
GO_BUILD_TAGS := '$(GENERAL_TAGS) $(GOOS) $(GPU_TAGS)'
GO_TEST_TAGS := '$(GENERAL_TAGS) $(GOOS)'

# for testsuite
ENVTEST_ASSETS_DIR=$(SRC_ROOT)/test-bin
export PATH := $(PATH):$(SRC_ROOT)/test-bin
export PATH := $(PATH):$(ENVTEST_ASSETS_DIR)

ifndef GOPATH
GOPATH := $(HOME)/go
@@ -79,6 +79,13 @@ ifndef GOBIN
GOBIN := $(GOPATH)/bin
endif

VERBOSE ?= 0
ifeq ($(VERBOSE),1)
go_test_args=-v
else
go_test_args=
endif

# NOTE: project related tools get installed to tmp dir which is ignored by
PROJECT_DIR := $(shell dirname $(abspath $(firstword $(MAKEFILE_LIST))))
TOOLS_DIR=$(PROJECT_DIR)/tmp/bin
@@ -163,6 +170,7 @@ build: clean_build_local _build_local copy_build_local ## Build binary and copy
.PHONY: generate
generate: ## Generate BPF code locally.
+@$(GOENV) go generate ./pkg/bpf
+@$(GOENV) go generate ./pkg/bpftest

_build_local: generate ## Build Kepler binary locally.
@echo TAGS=$(GO_BUILD_TAGS)
@@ -239,7 +247,7 @@ ginkgo-set:
mkdir -p $(GOBIN)
mkdir -p $(ENVTEST_ASSETS_DIR)
@test -f $(ENVTEST_ASSETS_DIR)/ginkgo || \
(go install -mod=mod github.com/onsi/ginkgo/v2/ginkgo@v2.15.0 && \
(go install -mod=mod github.com/onsi/ginkgo/v2/ginkgo@v2.19.0 && \
cp $(GOBIN)/ginkgo $(ENVTEST_ASSETS_DIR)/ginkgo)

.PHONY: container_test
@@ -259,21 +267,18 @@ container_test:
cd - && git config --global --add safe.directory /kepler && \
make VERBOSE=1 unit-test bench'

VERBOSE ?= 0
TMPDIR := $(shell mktemp -d)
TEST_PKGS := $(shell go list ./... | grep -v pkg/bpf | grep -v e2e)
SUDO?=sudo
SUDO_TEST_PKGS := $(shell go list ./... | grep pkg/bpf)
SUDO_TEST_PKGS := $(shell go list ./... | grep pkg/bpftest)

.PHONY: test
test: unit-test bpf-test bench ## Run all tests.

.PHONY: unit-test
unit-test: generate ginkgo-set tidy-vendor ## Run unit tests.
@echo TAGS=$(GO_TEST_TAGS)
$(if $(VERBOSE),@echo GOENV=$(GOENV))
@$(GOENV) go test -tags $(GO_TEST_TAGS) \
$(if $(VERBOSE),-v) \
$(go_test_args) \
-cover -covermode=atomic -coverprofile=coverage.out \
--race --count=1 \
$(TEST_PKGS)
@@ -282,18 +287,19 @@ unit-test: generate ginkgo-set tidy-vendor ## Run unit tests.
bench: ## Run benchmarks.
@echo TAGS=$(GO_TEST_TAGS)
$(GOENV) go test -tags $(GO_TEST_TAGS) \
$(if $(VERBOSE),-v) \
$(go_test_args) \
-test.run=dontrunanytests \
-bench=. --count=1 $(TEST_PKGS)

.PHONY: bpf-test
bpf-test: generate ## Run BPF tests.
for pkg in $(SUDO_TEST_PKGS); do \
$(GOENV) go test -c $$pkg -tags $(GO_TEST_TAGS) -cover \
-covermode=atomic -coverprofile=coverage.bpf.out \
-o $(TMPDIR)/$$(basename $$pkg).test && \
$(SUDO) $(TMPDIR)/$$(basename $$pkg).test; \
done
bpf-test: generate ginkgo-set ## Run BPF tests.
$(GOENV) $(ENVTEST_ASSETS_DIR)/ginkgo build \
-tags $(GO_TEST_TAGS) \
-cover \
--covermode=atomic \
./pkg/bpftest
$(SUDO) $(ENVTEST_ASSETS_DIR)/ginkgo \
./pkg/bpftest/bpftest.test

escapes_detect: tidy-vendor
@$(GOENV) go build -tags $(GO_BUILD_TAGS) -gcflags="-m -l" ./... 2>&1 | grep "escapes to heap" || true
249 changes: 6 additions & 243 deletions bpf/kepler.bpf.c
@@ -2,248 +2,17 @@
// Copyright 2021.

#include "kepler.bpf.h"
struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__type(key, u32);
__type(value, process_metrics_t);
__uint(max_entries, MAP_SIZE);
} processes SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__type(key, u32);
__type(value, u64);
__uint(max_entries, MAP_SIZE);
} pid_time_map SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__type(key, int);
__type(value, u32);
__uint(max_entries, NUM_CPUS);
} cpu_cycles_event_reader SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, u64);
__uint(max_entries, NUM_CPUS);
} cpu_cycles SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__type(key, int);
__type(value, u32);
__uint(max_entries, NUM_CPUS);
} cpu_instructions_event_reader SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, u64);
__uint(max_entries, NUM_CPUS);
} cpu_instructions SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__type(key, int);
__type(value, u32);
__uint(max_entries, NUM_CPUS);
} cache_miss_event_reader SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, u64);
__uint(max_entries, NUM_CPUS);
} cache_miss SEC(".maps");

// The sampling rate should be disabled by default because its impact on the
// measurements is unknown.
SEC(".rodata.config")
__attribute__((
btf_decl_tag("Sample Rate"))) static volatile const int SAMPLE_RATE = 0;

int counter_sched_switch = 0;

static inline u64 calc_delta(u64 *prev_val, u64 val)
{
u64 delta = 0;
// Probably a clock issue where the recorded on-CPU event had a
// timestamp later than the recorded off-CPU event, or vice versa.
if (prev_val && val > *prev_val)
delta = val - *prev_val;

return delta;
}

static inline u64 get_on_cpu_elapsed_time_us(u32 prev_pid, u64 curr_ts)
{
u64 cpu_time = 0;
u64 *prev_ts;

prev_ts = bpf_map_lookup_elem(&pid_time_map, &prev_pid);
if (prev_ts) {
cpu_time = calc_delta(prev_ts, curr_ts) / 1000;
bpf_map_delete_elem(&pid_time_map, &prev_pid);
}

return cpu_time;
}

static inline u64 get_on_cpu_cycles(u32 *cpu_id)
{
u64 delta, val, *prev_val;
long error;
struct bpf_perf_event_value c = {};

error = bpf_perf_event_read_value(
&cpu_cycles_event_reader, *cpu_id, &c, sizeof(c));
if (error)
return 0;

val = c.counter;
prev_val = bpf_map_lookup_elem(&cpu_cycles, cpu_id);
delta = calc_delta(prev_val, val);
bpf_map_update_elem(&cpu_cycles, cpu_id, &val, BPF_ANY);

return delta;
}

static inline u64 get_on_cpu_instr(u32 *cpu_id)
{
u64 delta, val, *prev_val;
long error;
struct bpf_perf_event_value c = {};

error = bpf_perf_event_read_value(
&cpu_instructions_event_reader, *cpu_id, &c, sizeof(c));
if (error)
return 0;

val = c.counter;
prev_val = bpf_map_lookup_elem(&cpu_instructions, cpu_id);
delta = calc_delta(prev_val, val);
bpf_map_update_elem(&cpu_instructions, cpu_id, &val, BPF_ANY);

return delta;
}

static inline u64 get_on_cpu_cache_miss(u32 *cpu_id)
{
u64 delta, val, *prev_val;
long error;
struct bpf_perf_event_value c = {};

error = bpf_perf_event_read_value(
&cache_miss_event_reader, *cpu_id, &c, sizeof(c));
if (error)
return 0;
val = c.counter;
prev_val = bpf_map_lookup_elem(&cache_miss, cpu_id);
delta = calc_delta(prev_val, val);
bpf_map_update_elem(&cache_miss, cpu_id, &val, BPF_ANY);

return delta;
}

static inline void register_new_process_if_not_exist()
{
u64 cgroup_id, pid_tgid;
u32 curr_tgid;
struct process_metrics_t *curr_tgid_metrics;

pid_tgid = bpf_get_current_pid_tgid();
curr_tgid = pid_tgid >> 32;

// create new process metrics
curr_tgid_metrics = bpf_map_lookup_elem(&processes, &curr_tgid);
if (!curr_tgid_metrics) {
cgroup_id = bpf_get_current_cgroup_id();
// the Kernel tgid is the user-space PID, and the Kernel pid is the
// user-space TID
process_metrics_t new_process = {
.pid = curr_tgid,
.cgroup_id = cgroup_id,
};
bpf_get_current_comm(&new_process.comm, sizeof(new_process.comm));
bpf_map_update_elem(
&processes, &curr_tgid, &new_process, BPF_NOEXIST);
}
}

static inline void collect_metrics_and_reset_counters(
struct process_metrics_t *buf, u32 prev_pid, u64 curr_ts, u32 cpu_id)
{
buf->cpu_cycles = get_on_cpu_cycles(&cpu_id);
buf->cpu_instr = get_on_cpu_instr(&cpu_id);
buf->cache_miss = get_on_cpu_cache_miss(&cpu_id);
// Get current time to calculate the previous task on-CPU time
buf->process_run_time = get_on_cpu_elapsed_time_us(prev_pid, curr_ts);
}

struct task_struct {
int pid;
unsigned int tgid;
} __attribute__((preserve_access_index));

SEC("tp_btf/sched_switch")
int kepler_sched_switch_trace(u64 *ctx)
{
u32 prev_pid, next_pid, cpu_id;
u64 prev_tgid;
unsigned int prev_state;
u64 curr_ts = bpf_ktime_get_ns();
struct task_struct *prev_task, *next_task;

struct process_metrics_t *curr_tgid_metrics, *prev_tgid_metrics;
struct process_metrics_t buf = {};

prev_task = (struct task_struct *)ctx[1];
next_task = (struct task_struct *)ctx[2];

prev_pid = (u32)prev_task->pid;
next_pid = (u32)next_task->pid;
prev_tgid = prev_task->tgid;
cpu_id = bpf_get_smp_processor_id();

// Collect metrics
// Regardless of skipping the collection, we need to update the hardware
// counter events to keep the metrics map current.
collect_metrics_and_reset_counters(&buf, prev_pid, curr_ts, cpu_id);

// Skip some samples to minimize overhead
// Note that we can only skip samples after updating the metric maps to
// collect the right values
if (SAMPLE_RATE > 0) {
if (counter_sched_switch > 0) {
counter_sched_switch--;
return 0;
}
counter_sched_switch = SAMPLE_RATE;
}

// The process_run_time is 0 if we do not have the previous timestamp of
// the task or due to a clock issue. In either case, we skip collecting
// all metrics to avoid discrepancies between the hardware counter and CPU
// time.
if (buf.process_run_time > 0) {
prev_tgid_metrics = bpf_map_lookup_elem(&processes, &prev_tgid);
if (prev_tgid_metrics) {
prev_tgid_metrics->process_run_time += buf.process_run_time;
prev_tgid_metrics->cpu_cycles += buf.cpu_cycles;
prev_tgid_metrics->cpu_instr += buf.cpu_instr;
prev_tgid_metrics->cache_miss += buf.cache_miss;
}
}

// Add task on-cpu running start time
bpf_map_update_elem(&pid_time_map, &next_pid, &curr_ts, BPF_ANY);

// create new process metrics
register_new_process_if_not_exist();

return 0;
return do_kepler_sched_switch_trace(
prev_task->pid, next_task->pid, prev_task->tgid, next_task->tgid);
}

SEC("tp_btf/softirq_entry")
@@ -266,12 +266,9 @@ SEC("fexit/mark_page_accessed")
int kepler_read_page_trace(void *ctx)
{
u32 curr_tgid;
struct process_metrics_t *process_metrics;

curr_tgid = bpf_get_current_pid_tgid() >> 32;
process_metrics = bpf_map_lookup_elem(&processes, &curr_tgid);
if (process_metrics)
process_metrics->page_cache_hit++;
curr_tgid = bpf_get_current_pid_tgid();
do_page_cache_hit_increment(curr_tgid);
return 0;
}

@@ -280,12 +280,9 @@ SEC("tp/writeback_dirty_folio")
int kepler_write_page_trace(void *ctx)
{
u32 curr_tgid;
struct process_metrics_t *process_metrics;

curr_tgid = bpf_get_current_pid_tgid() >> 32;
process_metrics = bpf_map_lookup_elem(&processes, &curr_tgid);
if (process_metrics)
process_metrics->page_cache_hit++;
curr_tgid = bpf_get_current_pid_tgid();
do_page_cache_hit_increment(curr_tgid);
return 0;
}
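
Both page-cache tracepoints above now delegate to do_page_cache_hit_increment, whose definition (presumably also in bpf/kepler.bpf.h) is not part of this diff. A minimal sketch reconstructed from the removed lines follows; note that the callers now pass bpf_get_current_pid_tgid() without the >> 32 shift, i.e. the low 32 bits (the thread id) rather than the tgid, so whether the helper re-derives the tgid or keys the map differently is not visible here.

// Hypothetical sketch, not the actual kepler.bpf.h contents; assumes the
// processes map removed above still keys per-process metrics by a u32.
static inline void do_page_cache_hit_increment(u32 curr_pid)
{
	struct process_metrics_t *process_metrics =
		bpf_map_lookup_elem(&processes, &curr_pid);

	if (process_metrics)
		process_metrics->page_cache_hit++;
}
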
