From 7b503b4cdbe17522dd2ccb1ab3d316d56984313f Mon Sep 17 00:00:00 2001
From: Philip Patsch
Date: Tue, 8 Jan 2019 00:33:19 +0100
Subject: [PATCH] debug: Add python linking debugging tools
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`rules_haskell` development features a large amount of varying fun
with different dynamic linking problems. These python helpers shall
make debugging them less noisy, and more programmatic and to the
point.

We can’t put them in the `tools/` directory because of the problem
mentioned now in tools/README.md.
---
 debug/linking_utils/BUILD     |   5 +
 debug/linking_utils/README.md |  51 +++++++++
 debug/linking_utils/ldd.py    | 201 ++++++++++++++++++++++++++++++++++
 tools/README.md               |   3 +
 4 files changed, 260 insertions(+)
 create mode 100644 debug/linking_utils/BUILD
 create mode 100644 debug/linking_utils/README.md
 create mode 100644 debug/linking_utils/ldd.py
 create mode 100644 tools/README.md

diff --git a/debug/linking_utils/BUILD b/debug/linking_utils/BUILD
new file mode 100644
index 000000000..05e839c9e
--- /dev/null
+++ b/debug/linking_utils/BUILD
@@ -0,0 +1,5 @@
+py_library(
+    name = "linking_utils",
+    srcs = ["ldd.py"],
+    visibility = ["//visibility:public"],
+)
diff --git a/debug/linking_utils/README.md b/debug/linking_utils/README.md
new file mode 100644
index 000000000..9865626ee
--- /dev/null
+++ b/debug/linking_utils/README.md
@@ -0,0 +1,51 @@
+# Debugging linking errors
+
+The usual utilities like `nm`, `objdump` and of course `ldd` (see
+[here](https://linux-audit.com/elf-binaries-on-linux-understanding-and-analysis/#tools-for-binary-analysis)
+for a good overview of existing tools) go a long way, but when
+debugging non-trivial runtime linker failures one would often like to
+filter outputs programmatically with more than just simple `grep` and
+`sed` expressions.
+
+This library provides a small set of utility subroutines that can help
+debug complicated linker errors.
+ +The main function is `ldd(f, elf_path)`, which is in the same spirit +as `ldd(1)`, but returns a tree of shared dependencies instead of a +flat list. Additionally, it expects a function `f` which is applied to +each recursion level of dependencies. + +Functions that can be passed to `ldd`: + +- `identity`: pass through every info `ldd` can output +- `remove_uninteresting_dependencies`: remove entries that are mostly noise +- `was_runpath_used`: return a list of unused runpaths + +Helpers: +- `dict_remove_empty`: remove fields with empty lists/dicts from an output + +Example usage: + +```python +import pprint +from bazel_tools.tools.python.runfiles import runfiles +from debug.linking_utils.ldd import \ + ldd, \ + was_runpath_used, \ + dict_remove_empty, \ + remove_unnecessary_dependencies + +r = runfiles.Create() +pp = pprint.PrettyPrinter(indent=2) + +pp.pprint( + ldd(remove_uninteresting_dependencies, path) +) + +print("\nUnused RUNPATH entries:") +pp.pprint( + dict_remove_empty( + ldd(was_runpath_used, path)['others'] + ) +) +``` diff --git a/debug/linking_utils/ldd.py b/debug/linking_utils/ldd.py new file mode 100644 index 000000000..aa926de32 --- /dev/null +++ b/debug/linking_utils/ldd.py @@ -0,0 +1,201 @@ +import subprocess +import os + + +### helper functions + +def list_to_dict(f, l): + """dict with elements of list as keys & as values transformed by f""" + d = {} + for el in l: + d[el] = f(el) + return d + +def dict_remove_empty(d): + """remove keys that have [] or {} or as values""" + new = {} + for k, v in d.iteritems(): + if not (v == [] or v == {}): + new[k] = v + return new + +def identity(x): + """identity function""" + return x + +def const(x): + """(curried) constant function""" + def f(y): + return x + return f + + +### IO functions that find elf dependencies + +def get_runpath_dirs(elf): + """Find all runpath entries. 
+ + Returns: + { path: unmodified string from DT_RUNPATH + , absolute_path: fully normalized, absolute path to dir } + """ + origin = os.path.dirname(elf) + # TODO: way to get info with less execution overhead + # TODO: cache the results to prevent more than one call per elf binary + res = subprocess.check_output("""objdump -x {} | grep RUNPATH | sed 's/^ *RUNPATH *//'""".format(elf), shell = True).strip() + return [{ 'path': path, + 'absolute_path': os.path.normpath(path.replace("$ORIGIN", origin)) } + for path in res.strip(":").split(":") + if path != ""] + +def get_needed(elf): + """Returns the list of DT_NEEDED entries for elf""" + # TODO: way to get info with less execution overhead + # TODO: cache the results to prevent more than one call per elf binary + res = subprocess.check_output("""objdump -x {} | grep NEEDED | sed 's/^ *NEEDED *//'""".format(elf), shell = True).strip() + return res.strip("\n").split("\n") + + +### Main utility + +# cannot find dependency +LDD_MISSING = "MISSING" +# don't know how to search for dependency +LDD_UNKNOWN = "DUNNO" +LDD_ERRORS = [ LDD_MISSING, LDD_UNKNOWN ] + +def ldd(f, elf_path): + """follows DT_NEEDED ELF headers for elf by searching the through DT_RUNPATH. 
+ + DependencyInfo : + { needed : dict(string, union( + LDD_MISSING, LDD_UNKNOWN, + { + # the needed dependency + item : a, + # where the dependency was found in + found_in : RunpathDir + })) + # all runpath directories that were searched + , runpath_dirs : [ RunpathDir ] } + + Args: + f: DependencyInfo -> a + modifies the results of each level + elf_path: path to ELF file + + Returns: a + """ + def search(rdirs, elf_libname): + """search for elf_libname in rdirs and return either name or missing""" + res = LDD_MISSING + for rdir in rdirs: + potential_path = os.path.join(rdir['absolute_path'], elf_libname) + if os.path.exists(potential_path): + res = { + 'item': potential_path, + 'found_in': rdir, + } + break + return res + + def recurse(search_res): + if search_res == LDD_MISSING: + return LDD_MISSING + else: + # we keep all other fields the same, + # just item is the one that does the recursion. + # This is the part that would normally be done by fmap. + search_res['item'] = ldd(f, search_res['item']) + return search_res + + rdirs = get_runpath_dirs(elf_path) + # if there's no runpath dirs we don't know where to search + if rdirs == []: + needed = list_to_dict(const(LDD_UNKNOWN), get_needed(elf_path)) + else: + needed = list_to_dict( + lambda name: recurse(search(rdirs, name)), + get_needed(elf_path) + ) + + result = { + 'runpath_dirs': rdirs, + 'needed': needed + } + return f(result) + + +### Functions to pass to ldd + +def remove_uninteresting_dependencies(d): + """Filter that removes some uninteresting .sos and everything that points to the nix store. 
Can be abstracted later.""" + def bad_needed_p(k): + "predicate for unneeded .sos" + names = [ + 'libc.so.6', + 'ld-linux-x86-64.so.2', + 'libgmp.so.10', + 'libm.so.6', + ] + return (k in names) + def bad_runpath_p(p): + "predicate for unneeded paths" + prefixes = [ + "/nix/store/" + ] + return any(p.startswith(pref) for pref in prefixes) + + runpaths = [] + for dir in d['runpath_dirs']: + absp = dir['absolute_path'] + + # TODO: put in different test, this is interesting info! + # non-existing RUNPATHs + if not os.path.exists(absp): + print("ATTN path doesnt exist: {}".format(absp)) + + if not bad_runpath_p(absp): + runpaths.append(absp) + + needed = {} + for k, v in d['needed'].iteritems(): + # filter out some uninteresting deps + if not bad_needed_p(k): + needed[k] = v['item'] if not v in LDD_ERRORS else v + + return dict_remove_empty({ + 'runp': runpaths, + 'need': needed, + }) + + +def was_runpath_used(d): + """returns a dict of two fields; `others` contains a flat dict of all .sos with unused runpath entries and a list of them for each .so""" + used = set() + given = set(r['absolute_path'] for r in d['runpath_dirs']) + prev = {} + for k, v in d['needed'].iteritems(): + if not v in LDD_ERRORS: + used.add(v['found_in']['absolute_path']) + prev[k] = v['item'] + unused = [ + u for u in given.difference(used) + # leave out nix storepaths + if not u.startswith("/nix/store") + ] + + # Each layer doesn't know about their own name + # So we return a list of unused for this layer ('mine') + # and a dict of all previeous layers combined (name to list) + def combine_unused(deps): + res = {} + for name, dep in deps.iteritems(): + res.update(dep['others']) + res[name] = dep['mine'] + return res + + return { + 'mine': unused, + 'others': combine_unused(prev), + } diff --git a/tools/README.md b/tools/README.md new file mode 100644 index 000000000..1d3ef16f8 --- /dev/null +++ b/tools/README.md @@ -0,0 +1,3 @@ +Note: `py_library`s cannot be put into this folder, lest they 
produce +a namespace collision with `@bazel_tools/tools`. +See https://github.com/bazelbuild/bazel/issues/7051