Merge pull request #547 from tweag/python-linking-debugging-tools

debug: Add python linking debugging tools
tweag · Jan 18, 2019 · febef11 · febef11
2 parents cc100e9 + 7b503b4
commit febef11
Show file tree

Hide file tree

Showing 4 changed files with 260 additions and 0 deletions.
diff --git a/debug/linking_utils/BUILD b/debug/linking_utils/BUILD
@@ -0,0 +1,5 @@
+py_library(
+    name = "linking_utils",
+    srcs = ["ldd.py"],
+    visibility = ["//visibility:public"],
+)
diff --git a/debug/linking_utils/README.md b/debug/linking_utils/README.md
@@ -0,0 +1,51 @@
+# Debugging linking errors
+
+The usual utilties like `nm`, `objdump` and of course `ldd` (see
+[here](https://linux-audit.com/elf-binaries-on-linux-understanding-and-analysis/#tools-for-binary-analysis)
+for a good overview of existing tools) go a long way, but when
+debugging non-trivial runtime linker failures one would often like to
+filter outputs programmatically with more than just simple `grep` and
+`sed` expressions.
+
+This library provides a small set of utility subroutines that can help
+debug complicated linker errors.
+
+The main function is `ldd(f, elf_path)`, which is in the same spirit
+as `ldd(1)`, but returns a tree of shared dependencies instead of a
+flat list. Additionally, it expects a function `f` which is applied to
+each recursion level of dependencies.
+
+Functions that can be passed to `ldd`:
+
+- `identity`: pass through every info `ldd` can output
+- `remove_uninteresting_dependencies`: remove entries that are mostly noise
+- `was_runpath_used`: return a list of unused runpaths
+
+Helpers:
+- `dict_remove_empty`: remove fields with empty lists/dicts from an output
+
+Example usage:
+
+```python
+import pprint
+from bazel_tools.tools.python.runfiles import runfiles
+from debug.linking_utils.ldd import \
+  ldd, \
+  was_runpath_used, \
+  dict_remove_empty, \
+  remove_unnecessary_dependencies
+
+r = runfiles.Create()
+pp = pprint.PrettyPrinter(indent=2)
+
+pp.pprint(
+  ldd(remove_uninteresting_dependencies, path)
+)
+
+print("\nUnused RUNPATH entries:")
+pp.pprint(
+  dict_remove_empty(
+    ldd(was_runpath_used, path)['others']
+  )
+)
+```
diff --git a/debug/linking_utils/ldd.py b/debug/linking_utils/ldd.py
@@ -0,0 +1,201 @@
+import subprocess
+import os
+
+
+### helper functions
+
+def list_to_dict(f, l):
+    """dict with elements of list as keys & as values transformed by f"""
+    d = {}
+    for el in l:
+        d[el] = f(el)
+    return d
+
+def dict_remove_empty(d):
+    """remove keys that have [] or {} or as values"""
+    new = {}
+    for k, v in d.iteritems():
+        if not (v == [] or v == {}):
+             new[k] = v
+    return new
+
+def identity(x):
+    """identity function"""
+    return x
+
+def const(x):
+    """(curried) constant function"""
+    def f(y):
+        return x
+    return f
+
+
+### IO functions that find elf dependencies
+
+def get_runpath_dirs(elf):
+    """Find all runpath entries.
+
+    Returns:
+      { path: unmodified string from DT_RUNPATH
+      , absolute_path: fully normalized, absolute path to dir }
+    """
+    origin = os.path.dirname(elf)
+    # TODO: way to get info with less execution overhead
+    # TODO: cache the results to prevent more than one call per elf binary
+    res = subprocess.check_output("""objdump -x {} | grep RUNPATH | sed 's/^ *RUNPATH *//'""".format(elf), shell = True).strip()
+    return [{ 'path': path,
+              'absolute_path': os.path.normpath(path.replace("$ORIGIN", origin)) }
+            for path in res.strip(":").split(":")
+            if path != ""]
+
+def get_needed(elf):
+    """Returns the list of DT_NEEDED entries for elf"""
+    # TODO: way to get info with less execution overhead
+    # TODO: cache the results to prevent more than one call per elf binary
+    res = subprocess.check_output("""objdump -x {} | grep NEEDED | sed 's/^ *NEEDED *//'""".format(elf), shell = True).strip()
+    return res.strip("\n").split("\n")
+
+
+### Main utility
+
+# cannot find dependency
+LDD_MISSING = "MISSING"
+# don't know how to search for dependency
+LDD_UNKNOWN = "DUNNO"
+LDD_ERRORS = [ LDD_MISSING, LDD_UNKNOWN ]
+
+def ldd(f, elf_path):
+    """follows DT_NEEDED ELF headers for elf by searching the through DT_RUNPATH.
+
+    DependencyInfo :
+    { needed : dict(string, union(
+        LDD_MISSING, LDD_UNKNOWN,
+        {
+            # the needed dependency
+            item : a,
+            # where the dependency was found in
+            found_in : RunpathDir
+        }))
+    # all runpath directories that were searched
+    , runpath_dirs : [ RunpathDir ] }
+
+    Args:
+        f: DependencyInfo -> a
+        modifies the results of each level
+        elf_path: path to ELF file
+
+    Returns: a
+    """
+    def search(rdirs, elf_libname):
+        """search for elf_libname in rdirs and return either name or missing"""
+        res = LDD_MISSING
+        for rdir in rdirs:
+            potential_path = os.path.join(rdir['absolute_path'], elf_libname)
+            if os.path.exists(potential_path):
+                res = {
+                    'item': potential_path,
+                    'found_in': rdir,
+                }
+                break
+        return res
+
+    def recurse(search_res):
+        if search_res == LDD_MISSING:
+            return LDD_MISSING
+        else:
+            # we keep all other fields the same,
+            # just item is the one that does the recursion.
+            # This is the part that would normally be done by fmap.
+            search_res['item'] = ldd(f, search_res['item'])
+            return search_res
+
+    rdirs = get_runpath_dirs(elf_path)
+    # if there's no runpath dirs we don't know where to search
+    if rdirs == []:
+        needed = list_to_dict(const(LDD_UNKNOWN), get_needed(elf_path))
+    else:
+        needed = list_to_dict(
+            lambda name: recurse(search(rdirs, name)),
+            get_needed(elf_path)
+        )
+
+    result = {
+        'runpath_dirs': rdirs,
+        'needed': needed
+    }
+    return f(result)
+
+
+### Functions to pass to ldd
+
+def remove_uninteresting_dependencies(d):
+    """Filter that removes some uninteresting .sos and everything that points to the nix store. Can be abstracted later."""
+    def bad_needed_p(k):
+        "predicate for unneeded .sos"
+        names = [
+            'libc.so.6',
+            'ld-linux-x86-64.so.2',
+            'libgmp.so.10',
+            'libm.so.6',
+        ]
+        return (k in names)
+    def bad_runpath_p(p):
+        "predicate for unneeded paths"
+        prefixes = [
+            "/nix/store/"
+        ]
+        return any(p.startswith(pref) for pref in prefixes)
+
+    runpaths = []
+    for dir in d['runpath_dirs']:
+        absp = dir['absolute_path']
+
+        # TODO: put in different test, this is interesting info!
+        # non-existing RUNPATHs
+        if not os.path.exists(absp):
+            print("ATTN path doesnt exist: {}".format(absp))
+
+        if not bad_runpath_p(absp):
+            runpaths.append(absp)
+
+    needed = {}
+    for k, v in d['needed'].iteritems():
+        # filter out some uninteresting deps
+        if not bad_needed_p(k):
+            needed[k] = v['item'] if not v in LDD_ERRORS else v
+
+    return dict_remove_empty({
+        'runp': runpaths,
+        'need': needed,
+    })
+
+
+def was_runpath_used(d):
+    """returns a dict of two fields; `others` contains a flat dict of all .sos with unused runpath entries and a list of them for each .so"""
+    used = set()
+    given = set(r['absolute_path'] for r in d['runpath_dirs'])
+    prev = {}
+    for k, v in d['needed'].iteritems():
+        if not v in LDD_ERRORS:
+            used.add(v['found_in']['absolute_path'])
+            prev[k] = v['item']
+    unused = [
+        u for u in given.difference(used)
+        # leave out nix storepaths
+        if not u.startswith("/nix/store")
+    ]
+
+    # Each layer doesn't know about their own name
+    # So we return a list of unused for this layer ('mine')
+    # and a dict of all previeous layers combined (name to list)
+    def combine_unused(deps):
+        res = {}
+        for name, dep in deps.iteritems():
+            res.update(dep['others'])
+            res[name] = dep['mine']
+        return res
+
+    return {
+        'mine': unused,
+        'others': combine_unused(prev),
+    }
diff --git a/tools/README.md b/tools/README.md
@@ -0,0 +1,3 @@
+Note: `py_library`s cannot be put into this folder, lest they produce
+a namespace collision with `@bazel_tools/tools`.
+See https://github.com/bazelbuild/bazel/issues/7051