diff --git a/src/syscall/net.c b/src/syscall/net.c
index 05b0c76..d4072e5 100644
--- a/src/syscall/net.c
+++ b/src/syscall/net.c
@@ -509,6 +509,9 @@ int64_t sys_getsockname(guest_t *g,
                         uint64_t addr_gva,
                         uint64_t addrlen_gva)
 {
+    if (fd_get_type(fd) == FD_NETLINK)
+        return netlink_getsockname(fd, g, addr_gva, addrlen_gva);
+
     host_fd_ref_t host_ref;
     if (host_fd_ref_open(fd, &host_ref) < 0)
         return -LINUX_EBADF;
@@ -639,6 +642,9 @@ int64_t sys_sendto(guest_t *g,
                    uint64_t dest_gva,
                    uint32_t addrlen)
 {
+    if (fd_get_type(fd) == FD_NETLINK)
+        return netlink_send(fd, g, buf_gva, len);
+
     host_fd_ref_t host_ref;
     if (host_fd_ref_open(fd, &host_ref) < 0)
         return -LINUX_EBADF;
@@ -706,6 +712,9 @@ int64_t sys_recvfrom(guest_t *g,
                      uint64_t src_gva,
                      uint64_t addrlen_gva)
 {
+    if (fd_get_type(fd) == FD_NETLINK)
+        return netlink_recv(fd, g, buf_gva, len, src_gva, addrlen_gva);
+
     host_fd_ref_t host_ref;
     if (host_fd_ref_open(fd, &host_ref) < 0)
         return -LINUX_EBADF;
diff --git a/src/syscall/net.h b/src/syscall/net.h
index 29ecbb4..b7413d6 100644
--- a/src/syscall/net.h
+++ b/src/syscall/net.h
@@ -190,12 +190,29 @@ int64_t netlink_sendmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags);
 /* Netlink recvmsg: return buffered response data. */
 int64_t netlink_recvmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags);
 
 /* Netlink read: return buffered response data without msghdr metadata. */
 int64_t netlink_read(int guest_fd,
                      guest_t *g,
                      uint64_t buf_gva,
                      uint64_t count);
 
+/* Netlink sendto: process one flat request buffer (no msghdr). */
+int64_t netlink_send(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t len);
+
+/* Netlink recvfrom: return buffered response data and the source address. */
+int64_t netlink_recv(int guest_fd,
+                     guest_t *g,
+                     uint64_t buf_gva,
+                     uint64_t len,
+                     uint64_t src_gva,
+                     uint64_t addrlen_gva);
+
+/* Netlink getsockname: report the socket's netlink port id (nl_pid). */
+int64_t netlink_getsockname(int guest_fd,
+                            guest_t *g,
+                            uint64_t addr_gva,
+                            uint64_t addrlen_gva);
+
 /* Clean up abstract socket filesystem entry for a fd being closed. */
 void absock_unregister_fd(int guest_fd);
 
diff --git a/src/syscall/netlink.c b/src/syscall/netlink.c
index 32c3ec3..874a46e 100644
--- a/src/syscall/netlink.c
+++ b/src/syscall/netlink.c
@@ -188,8 +188,12 @@ static size_t nl_put_attr(uint8_t *buf,
     return aligned;
 }
 
-/* Build RTM_GETLINK response from host getifaddrs(). */
-static int nl_build_getlink(netlink_state_t *ns)
+/* Build RTM_GETLINK response from host getifaddrs(). A non-empty name_filter
+ * or non-zero index_filter restricts the reply to one matching link.
+ */
+static int nl_build_getlink(netlink_state_t *ns,
+                            const char *name_filter,
+                            uint32_t index_filter)
 {
     struct ifaddrs *ifalist, *ifa;
     if (getifaddrs(&ifalist) < 0)
@@ -210,6 +214,11 @@ static int nl_build_getlink(netlink_state_t *ns)
         if (idx == 0)
             continue;
 
+        if (name_filter[0] && strcmp(ifa->ifa_name, name_filter) != 0)
+            continue;
+        if (index_filter != 0 && idx != index_filter)
+            continue;
+
         /* Check if already seen */
         bool found = false;
         for (int i = 0; i < nseen; i++) {
@@ -459,6 +468,96 @@ int64_t netlink_bind(int guest_fd,
     return 0;
 }
 
+/* Extract the LinkByName/LinkByIndex filter (ifi_index plus an optional
+ * IFLA_IFNAME) from a RTM_GETLINK request. Empty name / zero index = no filter.
+ */
+static void nl_parse_link_filter(const uint8_t *req,
+                                 size_t reqlen,
+                                 char *name_out,
+                                 size_t name_cap,
+                                 uint32_t *index_out)
+{
+    name_out[0] = '\0';
+    *index_out = 0;
+
+    if (reqlen < (size_t) NLMSG_HDRLEN + sizeof(ifinfomsg_t))
+        return;
+
+    ifinfomsg_t ifi;
+    memcpy(&ifi, req + NLMSG_HDRLEN, sizeof(ifi));
+    if (ifi.ifi_index > 0)
+        *index_out = (uint32_t) ifi.ifi_index;
+
+    uint32_t nlmsg_len;
+    memcpy(&nlmsg_len, req, sizeof(nlmsg_len));
+    size_t total = (nlmsg_len < reqlen) ? nlmsg_len : reqlen;
+
+    size_t off = NLMSG_HDRLEN + NLMSG_ALIGN(sizeof(ifinfomsg_t));
+    while (off + RTA_HDRLEN <= total) {
+        rtattr_t rta;
+        memcpy(&rta, req + off, sizeof(rta));
+        if (rta.rta_len < RTA_HDRLEN || off + rta.rta_len > total)
+            break;
+        if (rta.rta_type == IFLA_IFNAME) {
+            size_t dlen = rta.rta_len - RTA_HDRLEN;
+            size_t i = 0;
+            for (; i < dlen && i + 1 < name_cap && req[off + RTA_HDRLEN + i];
+                 i++)
+                name_out[i] = (char) req[off + RTA_HDRLEN + i];
+            name_out[i] = '\0';
+        }
+        off += RTA_ALIGN(rta.rta_len);
+    }
+}
+
+/* Build the reply for one rtnetlink request (already copied into req). Mutates
+ * ns->buf/seq. Returns 0 on success (including a built NLMSG_ERROR reply for
+ * unsupported types), or a negative LINUX_E* on a build failure. Caller holds
+ * nl_lock. req is guaranteed to be at least NLMSG_HDRLEN bytes.
+ */
+static int nl_process_request(netlink_state_t *ns,
+                              const uint8_t *req,
+                              size_t reqlen)
+{
+    nlmsghdr_t req_hdr;
+    memcpy(&req_hdr, req, sizeof(req_hdr));
+    ns->seq = req_hdr.nlmsg_seq;
+
+    int ret;
+    switch (req_hdr.nlmsg_type) {
+    case RTM_GETLINK: {
+        char name[64];
+        uint32_t index;
+        nl_parse_link_filter(req, reqlen, name, sizeof(name), &index);
+        ret = nl_build_getlink(ns, name, index);
+        break;
+    }
+    case RTM_GETADDR:
+        ret = nl_build_getaddr(ns);
+        break;
+    default:
+        /* Unsupported request: return NLMSG_ERROR with EOPNOTSUPP */
+        if ((size_t) NLMSG_HDRLEN + 4 <= NETLINK_BUF_SIZE) {
+            size_t off = 0;
+            nlmsghdr_t err_hdr = {
+                .nlmsg_len = NLMSG_HDRLEN + 4,
+                .nlmsg_type = NLMSG_ERROR,
+                .nlmsg_seq = ns->seq,
+                .nlmsg_pid = ns->pid,
+            };
+            memcpy(ns->buf + off, &err_hdr, sizeof(err_hdr));
+            off += NLMSG_HDRLEN;
+            int32_t errcode = -95; /* -EOPNOTSUPP */
+            memcpy(ns->buf + off, &errcode, 4);
+            ns->buf_len = off + 4;
+            ns->buf_pos = 0;
+        }
+        return 0;
+    }
+
+    return (ret < 0) ? -LINUX_EIO : 0;
+}
+
 int64_t netlink_sendmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags)
 {
     (void) flags;
@@ -491,56 +590,167 @@ int64_t netlink_sendmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags)
         goto out;
     }
 
-    if (iov.iov_len < NLMSG_HDRLEN) {
+    if (iov.iov_len < (uint64_t) NLMSG_HDRLEN) {
         result = -LINUX_EINVAL;
         goto out;
     }
 
-    nlmsghdr_t req_hdr;
-    if (guest_read_small(g, iov.iov_base, &req_hdr, sizeof(req_hdr)) < 0) {
+    /* Copy the whole request: the dispatcher inspects filter attributes past
+     * the fixed nlmsghdr.
+     */
+    uint8_t req[512];
+    size_t rlen = (iov.iov_len < sizeof(req)) ? iov.iov_len : sizeof(req);
+    if (guest_read(g, iov.iov_base, req, rlen) < 0) {
         result = -LINUX_EFAULT;
         goto out;
     }
 
-    ns->seq = req_hdr.nlmsg_seq;
+    int ret = nl_process_request(ns, req, rlen);
+    result = (ret < 0) ? ret : (int64_t) iov.iov_len;
 
-    /* Dispatch based on request type */
-    int ret;
-    switch (req_hdr.nlmsg_type) {
-    case RTM_GETLINK:
-        ret = nl_build_getlink(ns);
-        break;
-    case RTM_GETADDR:
-        ret = nl_build_getaddr(ns);
-        break;
-    default:
-        /* Unsupported request: return NLMSG_ERROR with EOPNOTSUPP */
-        if (ns->buf_len + NLMSG_HDRLEN + 4 <= NETLINK_BUF_SIZE) {
-            size_t off = 0;
-            nlmsghdr_t err_hdr = {
-                .nlmsg_len = NLMSG_HDRLEN + 4,
-                .nlmsg_type = NLMSG_ERROR,
-                .nlmsg_seq = ns->seq,
-                .nlmsg_pid = ns->pid,
-            };
-            memcpy(ns->buf + off, &err_hdr, sizeof(err_hdr));
-            off += NLMSG_HDRLEN;
-            int32_t errcode = -95; /* -EOPNOTSUPP */
-            memcpy(ns->buf + off, &errcode, 4);
-            ns->buf_len = off + 4;
-            ns->buf_pos = 0;
-        }
-        result = (int64_t) iov.iov_len;
+out:
+    pthread_mutex_unlock(&nl_lock);
+    return result;
+}
+
+/* sendto(2) on a netlink socket: a flat request buffer (no msghdr). */
+int64_t netlink_send(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t len)
+{
+    pthread_mutex_lock(&nl_lock);
+    netlink_state_t *ns = nl_find(guest_fd);
+    if (!ns) {
+        pthread_mutex_unlock(&nl_lock);
+        return -LINUX_EBADF;
+    }
+
+    int64_t result;
+    if (len < (uint64_t) NLMSG_HDRLEN) {
+        result = -LINUX_EINVAL;
+        goto out;
+    }
+
+    uint8_t req[512];
+    size_t rlen = (len < sizeof(req)) ? len : sizeof(req);
+    if (guest_read(g, buf_gva, req, rlen) < 0) {
+        result = -LINUX_EFAULT;
         goto out;
     }
 
-    result = (ret < 0) ? -LINUX_EIO : (int64_t) iov.iov_len;
+    int ret = nl_process_request(ns, req, rlen);
+    result = (ret < 0) ? ret : (int64_t) len;
 
 out:
     pthread_mutex_unlock(&nl_lock);
     return result;
 }
 
+/* recvfrom(2) on a netlink socket: drain whole messages; write back a kernel
+ * sockaddr_nl (nl_pid 0) when src is requested, truncated to the caller's
+ * *addrlen. Returns bytes copied, 0 when nothing is buffered, or -LINUX_E*.
+ */
+int64_t netlink_recv(int guest_fd,
+                     guest_t *g,
+                     uint64_t buf_gva,
+                     uint64_t len,
+                     uint64_t src_gva,
+                     uint64_t addrlen_gva)
+{
+    pthread_mutex_lock(&nl_lock);
+    netlink_state_t *ns = nl_find(guest_fd);
+    if (!ns) {
+        pthread_mutex_unlock(&nl_lock);
+        return -LINUX_EBADF;
+    }
+
+    if (ns->buf_pos >= ns->buf_len) {
+        pthread_mutex_unlock(&nl_lock);
+        return 0;
+    }
+
+    size_t avail = ns->buf_len - ns->buf_pos;
+    size_t to_copy = (avail < len) ? avail : len;
+
+    /* Return complete netlink messages only (same walk as netlink_recvmsg). */
+    size_t msg_end = 0, pos = ns->buf_pos;
+    while (pos < ns->buf_len && (pos - ns->buf_pos + NLMSG_HDRLEN) <= to_copy) {
+        nlmsghdr_t *hdr = (nlmsghdr_t *) (ns->buf + pos);
+        if (hdr->nlmsg_len < NLMSG_HDRLEN)
+            break;
+        size_t msg_bytes = pos - ns->buf_pos + NLMSG_ALIGN(hdr->nlmsg_len);
+        if (msg_bytes > to_copy)
+            break;
+        pos += NLMSG_ALIGN(hdr->nlmsg_len);
+        msg_end = pos - ns->buf_pos;
+    }
+    if (msg_end == 0)
+        msg_end = to_copy;
+
+    if (guest_write(g, buf_gva, ns->buf + ns->buf_pos, msg_end) < 0) {
+        pthread_mutex_unlock(&nl_lock);
+        return -LINUX_EFAULT;
+    }
+
+    if (src_gva && addrlen_gva) {
+        /* Copy at most *addrlen bytes of the address so a short guest
+         * buffer is never overrun, then report the full address length.
+         */
+        uint32_t cap = 0;
+        if (guest_read_small(g, addrlen_gva, &cap, sizeof(cap)) < 0) {
+            pthread_mutex_unlock(&nl_lock);
+            return -LINUX_EFAULT;
+        }
+        sockaddr_nl_t snl = {
+            .nl_family = LINUX_AF_NETLINK,
+            .nl_pid = 0, /* From kernel */
+        };
+        size_t n = (cap < sizeof(snl)) ? cap : sizeof(snl);
+        uint32_t namelen = sizeof(snl);
+        if ((n > 0 && guest_write(g, src_gva, &snl, n) < 0) ||
+            guest_write_small(g, addrlen_gva, &namelen, sizeof(namelen)) < 0) {
+            pthread_mutex_unlock(&nl_lock);
+            return -LINUX_EFAULT;
+        }
+    }
+
+    /* Consume the reply only after every guest write succeeded. */
+    ns->buf_pos += msg_end;
+    pthread_mutex_unlock(&nl_lock);
+    return (int64_t) msg_end;
+}
+
+/* getsockname(2) on a netlink socket: returns the bound/auto-assigned pid. */
+int64_t netlink_getsockname(int guest_fd,
+                            guest_t *g,
+                            uint64_t addr_gva,
+                            uint64_t addrlen_gva)
+{
+    pthread_mutex_lock(&nl_lock);
+    netlink_state_t *ns = nl_find(guest_fd);
+    if (!ns) {
+        pthread_mutex_unlock(&nl_lock);
+        return -LINUX_EBADF;
+    }
+    uint32_t pid = ns->pid;
+    pthread_mutex_unlock(&nl_lock);
+
+    uint32_t cap = 0;
+    if (guest_read_small(g, addrlen_gva, &cap, sizeof(cap)) < 0)
+        return -LINUX_EFAULT;
+
+    sockaddr_nl_t snl = {
+        .nl_family = LINUX_AF_NETLINK,
+        .nl_pid = pid,
+    };
+    size_t n = (cap < sizeof(snl)) ? cap : sizeof(snl);
+    if (n > 0 && guest_write(g, addr_gva, &snl, n) < 0)
+        return -LINUX_EFAULT;
+
+    uint32_t actual = sizeof(snl);
+    if (guest_write_small(g, addrlen_gva, &actual, sizeof(actual)) < 0)
+        return -LINUX_EFAULT;
+    return 0;
+}
+
 int64_t netlink_recvmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags)
 {
     (void) flags;
diff --git a/tests/manifest.txt b/tests/manifest.txt
index 19b1b27..77bffa5 100644
--- a/tests/manifest.txt
+++ b/tests/manifest.txt
@@ -71,6 +71,7 @@ test-sysfs-cpu
 [section] Network tests
 test-net
 test-netstat
+test-netlink
 
 [section] Threading tests
 test-thread # diff=skip
diff --git a/tests/test-netlink.c b/tests/test-netlink.c
new file mode 100644
index 0000000..fd3e34f
--- /dev/null
+++ b/tests/test-netlink.c
@@ -0,0 +1,136 @@
+/* Exercise the AF_NETLINK getsockname/sendto/recvfrom dispatch paths.
+ *
+ * Copyright 2026 elfuse contributors
+ * Copyright 2025 Moritz Angermann, zw3rk pte. ltd.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Regression guard for the netlink socket emulation. Before getsockname,
+ * sendto, and recvfrom were routed to the netlink handlers, these calls fell
+ * through to the host socket syscalls on the underlying pipe fd and failed
+ * with ENOTSOCK (errno 88), which in turn broke glibc getifaddrs(). The test
+ * drives each of the three syscalls directly against a NETLINK_ROUTE socket
+ * and then validates the end-to-end getifaddrs() path that originally
+ * regressed.
+ *
+ * The assertions hold for both the elfuse emulation and a real Linux kernel
+ * (the test matrix runs the same binary under qemu-aarch64), so only
+ * implementation-independent netlink semantics are checked.
+ */
+
+#include <errno.h>
+#include <ifaddrs.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+static int pass, fail;
+
+#define CHECK(cond, msg)                                                       \
+    do {                                                                       \
+        if (cond) {                                                            \
+            printf("PASS: %s\n", (msg));                                       \
+            pass++;                                                            \
+        } else {                                                               \
+            printf("FAIL: %s (errno=%d %s)\n", (msg), errno, strerror(errno)); \
+            fail++;                                                            \
+        }                                                                      \
+    } while (0)
+
+/* RTM_GETLINK dump request: nlmsghdr immediately followed by ifinfomsg. */
+struct getlink_req {
+    struct nlmsghdr nlh;
+    struct ifinfomsg ifi;
+};
+
+int main(void)
+{
+    int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+    if (fd < 0) {
+        printf("FAIL: socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE): %s\n",
+               strerror(errno));
+        /* Without a socket none of the dispatch paths can be reached. */
+        printf("\n%d passed, %d failed\n", pass, fail + 1);
+        return 1;
+    }
+    printf("PASS: socket(AF_NETLINK, NETLINK_ROUTE) = %d\n", fd);
+    pass++;
+
+    /* bind() with nl_pid=0 lets the kernel/emulation assign a port id. */
+    struct sockaddr_nl local = {.nl_family = AF_NETLINK};
+    CHECK(bind(fd, (struct sockaddr *) &local, sizeof(local)) == 0,
+          "bind(AF_NETLINK)");
+
+    /* 1. getsockname(): previously ENOTSOCK on the pipe fd. */
+    struct sockaddr_nl got = {0};
+    socklen_t gotlen = sizeof(got);
+    int rc = getsockname(fd, (struct sockaddr *) &got, &gotlen);
+    CHECK(rc == 0 && gotlen >= sizeof(struct sockaddr_nl) &&
+              got.nl_family == AF_NETLINK,
+          "getsockname() returns an AF_NETLINK address");
+    CHECK(rc == 0 && got.nl_pid != 0,
+          "getsockname() reports a non-zero port id");
+
+    /* 2. sendto(): flat request buffer, no msghdr. */
+    struct getlink_req req = {0};
+    req.nlh.nlmsg_len = sizeof(req);
+    req.nlh.nlmsg_type = RTM_GETLINK;
+    req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+    req.nlh.nlmsg_seq = 1;
+    req.ifi.ifi_family = AF_UNSPEC;
+
+    struct sockaddr_nl kernel = {.nl_family = AF_NETLINK};
+    ssize_t sent = sendto(fd, &req, req.nlh.nlmsg_len, 0,
+                          (struct sockaddr *) &kernel, sizeof(kernel));
+    CHECK(sent == (ssize_t) req.nlh.nlmsg_len,
+          "sendto(RTM_GETLINK) accepts the request");
+
+    /* 3. recvfrom(): drain the dump, expecting RTM_NEWLINK then NLMSG_DONE. */
+    int saw_newlink = 0, saw_done = 0, src_ok = 0;
+    for (int iter = 0; iter < 64 && !saw_done; iter++) {
+        char buf[8192];
+        struct sockaddr_nl src = {0};
+        socklen_t srclen = sizeof(src);
+        ssize_t n = recvfrom(fd, buf, sizeof(buf), 0, (struct sockaddr *) &src,
+                             &srclen);
+        if (n <= 0)
+            break;
+        if (srclen >= sizeof(struct sockaddr_nl) && src.nl_family == AF_NETLINK)
+            src_ok = 1;
+        for (struct nlmsghdr *nlh = (struct nlmsghdr *) buf;
+             NLMSG_OK(nlh, (unsigned) n); nlh = NLMSG_NEXT(nlh, n)) {
+            if (nlh->nlmsg_type == RTM_NEWLINK)
+                saw_newlink = 1;
+            else if (nlh->nlmsg_type == NLMSG_DONE)
+                saw_done = 1;
+            else if (nlh->nlmsg_type == NLMSG_ERROR)
+                saw_done = 1; /* stop draining on error terminator */
+        }
+    }
+    CHECK(saw_newlink, "recvfrom() returns at least one RTM_NEWLINK");
+    CHECK(src_ok, "recvfrom() fills an AF_NETLINK source address");
+
+    close(fd);
+
+    /* 4. End-to-end: glibc getifaddrs() drives getsockname + sendto + recv
+     * internally. This is the exact call that regressed with ENOTSOCK.
+     */
+    struct ifaddrs *ifa = NULL;
+    rc = getifaddrs(&ifa);
+    CHECK(rc == 0, "getifaddrs() succeeds");
+    int n_ifaces = 0;
+    for (struct ifaddrs *p = ifa; p; p = p->ifa_next)
+        if (p->ifa_name && p->ifa_name[0])
+            n_ifaces++;
+    CHECK(rc == 0 && n_ifaces > 0,
+          "getifaddrs() enumerates at least one interface");
+    if (ifa)
+        freeifaddrs(ifa);
+
+    printf("\n%d passed, %d failed\n", pass, fail);
+    return fail ? 1 : 0;
+}