Skip to content

Commit

Permalink
bpf: Add batch ops to all htab bpf map
Browse files Browse the repository at this point in the history
htab can't use generic batch support due some problematic behaviours
inherent to the data structre, i.e. while iterating the bpf map  a
concurrent program might delete the next entry that batch was about to
use, in that case there's no easy solution to retrieve the next entry,
the issue has been discussed multiple times (see [1] and [2]).

The only way hmap can be traversed without the problem previously
exposed is by making sure that the map is traversing entire buckets.
This commit implements those strict requirements for hmap, the
implementation follows the same interaction that generic support with
some exceptions:

 - If keys/values buffer are not big enough to traverse a bucket,
   ENOSPC will be returned.
 - out_batch contains the value of the next bucket in the iteration, not
   the next key, but this is transparent for the user since the user
   should never use out_batch for other than bpf batch syscalls.

This commits implements BPF_MAP_LOOKUP_BATCH and adds support for new
command BPF_MAP_LOOKUP_AND_DELETE_BATCH. Note that for update/delete
batch ops it is possible to use the generic implementations.

[1] https://lore.kernel.org/bpf/20190724165803.87470-1-brianvv@google.com/
[2] https://lore.kernel.org/bpf/20190906225434.3635421-1-yhs@fb.com/

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Brian Vazquez <brianvv@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200115184308.162644-6-brianvv@google.com
  • Loading branch information
yonghong-song authored and Alexei Starovoitov committed Jan 15, 2020
1 parent c60f2d2 commit 0579963
Show file tree
Hide file tree
Showing 4 changed files with 276 additions and 1 deletion.
3 changes: 3 additions & 0 deletions include/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ struct bpf_map_ops {
void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
union bpf_attr __user *uattr);
int (*map_lookup_and_delete_batch)(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr);
int (*map_update_batch)(struct bpf_map *map, const union bpf_attr *attr,
union bpf_attr __user *uattr);
int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr,
Expand Down
1 change: 1 addition & 0 deletions include/uapi/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ enum bpf_cmd {
BPF_MAP_FREEZE,
BPF_BTF_GET_NEXT_ID,
BPF_MAP_LOOKUP_BATCH,
BPF_MAP_LOOKUP_AND_DELETE_BATCH,
BPF_MAP_UPDATE_BATCH,
BPF_MAP_DELETE_BATCH,
};
Expand Down
264 changes: 264 additions & 0 deletions kernel/bpf/hashtab.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,16 @@
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)

#define BATCH_OPS(_name) \
.map_lookup_batch = \
_name##_map_lookup_batch, \
.map_lookup_and_delete_batch = \
_name##_map_lookup_and_delete_batch, \
.map_update_batch = \
generic_map_update_batch, \
.map_delete_batch = \
generic_map_delete_batch

struct bucket {
struct hlist_nulls_head head;
raw_spinlock_t lock;
Expand Down Expand Up @@ -1232,6 +1242,256 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}

static int
__htab_map_lookup_and_delete_batch(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr,
bool do_delete, bool is_lru_map,
bool is_percpu)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
void __user *uvalues = u64_to_user_ptr(attr->batch.values);
void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
u32 batch, max_count, size, bucket_size;
u64 elem_map_flags, map_flags;
struct hlist_nulls_head *head;
struct hlist_nulls_node *n;
unsigned long flags;
struct htab_elem *l;
struct bucket *b;
int ret = 0;

elem_map_flags = attr->batch.elem_flags;
if ((elem_map_flags & ~BPF_F_LOCK) ||
((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
return -EINVAL;

map_flags = attr->batch.flags;
if (map_flags)
return -EINVAL;

max_count = attr->batch.count;
if (!max_count)
return 0;

if (put_user(0, &uattr->batch.count))
return -EFAULT;

batch = 0;
if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
return -EFAULT;

if (batch >= htab->n_buckets)
return -ENOENT;

key_size = htab->map.key_size;
roundup_key_size = round_up(htab->map.key_size, 8);
value_size = htab->map.value_size;
size = round_up(value_size, 8);
if (is_percpu)
value_size = size * num_possible_cpus();
total = 0;
/* while experimenting with hash tables with sizes ranging from 10 to
* 1000, it was observed that a bucket can have upto 5 entries.
*/
bucket_size = 5;

alloc:
/* We cannot do copy_from_user or copy_to_user inside
* the rcu_read_lock. Allocate enough space here.
*/
keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
if (!keys || !values) {
ret = -ENOMEM;
goto after_loop;
}

again:
preempt_disable();
this_cpu_inc(bpf_prog_active);
rcu_read_lock();
again_nocopy:
dst_key = keys;
dst_val = values;
b = &htab->buckets[batch];
head = &b->head;
raw_spin_lock_irqsave(&b->lock, flags);

bucket_cnt = 0;
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
bucket_cnt++;

if (bucket_cnt > (max_count - total)) {
if (total == 0)
ret = -ENOSPC;
raw_spin_unlock_irqrestore(&b->lock, flags);
rcu_read_unlock();
this_cpu_dec(bpf_prog_active);
preempt_enable();
goto after_loop;
}

if (bucket_cnt > bucket_size) {
bucket_size = bucket_cnt;
raw_spin_unlock_irqrestore(&b->lock, flags);
rcu_read_unlock();
this_cpu_dec(bpf_prog_active);
preempt_enable();
kvfree(keys);
kvfree(values);
goto alloc;
}

hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
memcpy(dst_key, l->key, key_size);

if (is_percpu) {
int off = 0, cpu;
void __percpu *pptr;

pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
bpf_long_memcpy(dst_val + off,
per_cpu_ptr(pptr, cpu), size);
off += size;
}
} else {
value = l->key + roundup_key_size;
if (elem_map_flags & BPF_F_LOCK)
copy_map_value_locked(map, dst_val, value,
true);
else
copy_map_value(map, dst_val, value);
check_and_init_map_lock(map, dst_val);
}
if (do_delete) {
hlist_nulls_del_rcu(&l->hash_node);
if (is_lru_map)
bpf_lru_push_free(&htab->lru, &l->lru_node);
else
free_htab_elem(htab, l);
}
dst_key += key_size;
dst_val += value_size;
}

raw_spin_unlock_irqrestore(&b->lock, flags);
/* If we are not copying data, we can go to next bucket and avoid
* unlocking the rcu.
*/
if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
batch++;
goto again_nocopy;
}

rcu_read_unlock();
this_cpu_dec(bpf_prog_active);
preempt_enable();
if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
key_size * bucket_cnt) ||
copy_to_user(uvalues + total * value_size, values,
value_size * bucket_cnt))) {
ret = -EFAULT;
goto after_loop;
}

total += bucket_cnt;
batch++;
if (batch >= htab->n_buckets) {
ret = -ENOENT;
goto after_loop;
}
goto again;

after_loop:
if (ret == -EFAULT)
goto out;

/* copy # of entries and next batch */
ubatch = u64_to_user_ptr(attr->batch.out_batch);
if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
put_user(total, &uattr->batch.count))
ret = -EFAULT;

out:
kvfree(keys);
kvfree(values);
return ret;
}

static int
htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
false, true);
}

static int
htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
false, true);
}

static int
htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
false, false);
}

static int
htab_map_lookup_and_delete_batch(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
false, false);
}

static int
htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
true, true);
}

static int
htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
true, true);
}

static int
htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
true, false);
}

static int
htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
true, false);
}

const struct bpf_map_ops htab_map_ops = {
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
Expand All @@ -1242,6 +1502,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_delete_elem = htab_map_delete_elem,
.map_gen_lookup = htab_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
BATCH_OPS(htab),
};

const struct bpf_map_ops htab_lru_map_ops = {
Expand All @@ -1255,6 +1516,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
.map_gen_lookup = htab_lru_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
BATCH_OPS(htab_lru),
};

/* Called from eBPF program */
Expand Down Expand Up @@ -1368,6 +1630,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
BATCH_OPS(htab_percpu),
};

const struct bpf_map_ops htab_lru_percpu_map_ops = {
Expand All @@ -1379,6 +1642,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
BATCH_OPS(htab_lru_percpu),
};

static int fd_htab_map_alloc_check(union bpf_attr *attr)
Expand Down
9 changes: 8 additions & 1 deletion kernel/bpf/syscall.c
Original file line number Diff line number Diff line change
Expand Up @@ -3310,7 +3310,8 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
if (IS_ERR(map))
return PTR_ERR(map);

if (cmd == BPF_MAP_LOOKUP_BATCH &&
if ((cmd == BPF_MAP_LOOKUP_BATCH ||
cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
err = -EPERM;
goto err_put;
Expand All @@ -3324,6 +3325,8 @@ static int bpf_map_do_batch(const union bpf_attr *attr,

if (cmd == BPF_MAP_LOOKUP_BATCH)
BPF_DO_BATCH(map->ops->map_lookup_batch);
else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
else if (cmd == BPF_MAP_UPDATE_BATCH)
BPF_DO_BATCH(map->ops->map_update_batch);
else
Expand Down Expand Up @@ -3434,6 +3437,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_MAP_LOOKUP_BATCH:
err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
break;
case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
err = bpf_map_do_batch(&attr, uattr,
BPF_MAP_LOOKUP_AND_DELETE_BATCH);
break;
case BPF_MAP_UPDATE_BATCH:
err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
break;
Expand Down

0 comments on commit 0579963

Please sign in to comment.