adding plots to compare requirements

Signed-off-by: vsoch <vsoch@users.noreply.github.com>
vsoch · Jan 18, 2021 · 044b65c · 044b65c
1 parent 87f3260
commit 044b65c
Show file tree

Hide file tree

Showing 8 changed files with 92,855 additions and 55 deletions.
diff --git a/.caliper/plots/pypi-tensorflow-module_sim-requirements-plot.png b/.caliper/plots/pypi-tensorflow-module_sim-requirements-plot.png
diff --git a/.caliper/plots/pypi-tensorflow-module_sim-requirements-plot.svg b/.caliper/plots/pypi-tensorflow-module_sim-requirements-plot.svg
diff --git a/.caliper/plots/pypi-tensorflow-module_version_sim-requirements-plot.png b/.caliper/plots/pypi-tensorflow-module_version_sim-requirements-plot.png
diff --git a/.caliper/plots/pypi-tensorflow-module_version_sim-requirements-plot.svg b/.caliper/plots/pypi-tensorflow-module_version_sim-requirements-plot.svg
diff --git a/.caliper/sims/pypi-tensorflow-requirements-sims.json b/.caliper/sims/pypi-tensorflow-requirements-sims.json
diff --git a/2.assess_change.py b/2.assess_change.py
@@ -32,7 +32,6 @@ def get_parser():
         "--funcdb",
         dest="funcdb",
         help="path to extracted function database (zip or json)",
-        default=".caliper",
     )
     return parser
 
@@ -89,18 +88,14 @@ def main():
         sys.exit("The data directory is missing from the caliper root folder.")
 
     # Create output directory
-    outdir = os.path.join(dirname, "changes")
+    outdir = os.path.join(dirname, "sims")
     if not os.path.exists(outdir):
         os.mkdir(outdir)
 
     ## Step 1: extract requirements to assess change
     extract_requirements(datadir, outdir)
 
-    outdir = os.path.join(dirname, "sims")
-    if not os.path.exists(outdir):
-        os.mkdir(outdir)
-
-    # Step 3: load in the function signatures to assess version changes
+    ## Step 2: load in the function signatures to assess version changes
     extract_function_changes(outdir, args.funcdb)
 
 
@@ -147,24 +142,26 @@ def extract_function_changes(outdir, funcdb):
     # We don't need a manager since we aren't extracting from a repository
     extractor = MetricsExtractor("pypi:tensorflow")
 
-    # Read in and load the function database metric
-    filename = os.path.abspath(funcdb)
-    if not os.path.exists(funcdb):
-        sys.exit("Function database file %s does not exist." % funcdb)
-    db = extractor.load_metric("functiondb", filename=filename)
+    if funcdb:
+        filename = os.path.abspath(funcdb)
+        if not os.path.exists(funcdb):
+            sys.exit("Function database file %s does not exist." % funcdb)
+        db = extractor.load_metric("functiondb", filename=filename)
+    else:
+        db = extractor.load_metric("functiondb")
 
     # Level 1 similarity: overall modules
     # Level 2 similarity: functions
     # Level 3 similarity: function arguments too
     sims = {}
 
     # First just compare functions that exist
-    for version1, db1 in db["by-file"].items():
-        for version2, db2 in db["by-file"].items():
+    for version1, db1 in db.items():
+        for version2, db2 in db.items():
 
             # Don't calculate it twice
             scores = {}
-            key = "-".join(sorted([version1, version2]))
+            key = "..".join(sorted([version1, version2]))
 
             # Keep the user updated
             if key in sims:
@@ -208,40 +205,71 @@ def extract_function_changes(outdir, funcdb):
 
 
 def extract_requirements(datadir, outdir):
-    """Given a known list of dependencies for a package, we want to extract all
-    requirements (to see change between version) that can then be used to assess
-    overall change in a package
+    """Create a lookup for requirements including (and not including) versions
+    to generate similarity matrices. An alternative is to extract all
+    requirements (to see change between version) for a package and have this
+    say something about the parent package, but this seems more complicated.
     """
     # Keep a lookup of requirements.txt to compare across
-    rxments = set()
     requirements = {}
 
     # Read in input files, organize by python version, tensorflow version
     for filename in iter_files(datadir):
 
-        # Derive the name and versions from the filename (also in inputs:name)
-        result = read_json(filename)
-        if "requirements.txt" not in result:
-            requirements[filename] = {}
+        # Skip release candidates and a/b for now
+        if re.search("(rc|b|a)", os.path.basename(filename)):
             continue
 
-        [
-            rxments.add(re.split("(=|@)", x)[0].strip())
-            for x in result["requirements.txt"]
-        ]
-
-    client = MetricsExtractor()
-    for library in rxments:
-
-        try:
-            manager = PypiManager(library)
-            extractor = MetricsExtractor(manager)
-            # extractor.extract_metric("changedlines")
-            extractor.extract_metric("totalcounts")
-            extractor.save_all(outdir)
-            extractor.cleanup(force=True)
-        except:
-            print("Issue with %s" % library)
+        # Only include those we have requirements for (meaning success install)
+        result = read_json(filename)
+        if "requirements.txt" in result:
+            requirements[filename] = [
+                x.strip().lower() for x in result["requirements.txt"]
+            ]
+
+    # Level 1 similarity: modules and version string
+    # Level 2 similarity: overall modules (versions ignored)
+    sims = {}
+
+    # Compare the requirements of every pair of files
+    for filename1, modules1 in requirements.items():
+        for filename2, modules2 in requirements.items():
+
+            uid1 = os.path.basename(filename1).rstrip(".json")
+            uid2 = os.path.basename(filename2).rstrip(".json")
+
+            # Don't calculate it twice
+            scores = {}
+            key = "..".join(sorted([uid1, uid2]))
+            if key in sims:
+                continue
+
+            # Diagonal is perfectly similar
+            if uid1 == uid2:
+                scores = {"module_sim": 1, "module_version_sim": 1}
+                sims[key] = scores
+                continue
+
+            # Level 1: Module and version similarity
+            modules1 = set(modules1)
+            modules2 = set(modules2)
+            scores["module_version_sim"] = information_coefficient(
+                len(modules1), len(modules2), len(modules1.intersection(modules2))
+            )
+
+            # Level 2: Don't include versions, ignore casing
+            funcs1 = [re.split("(==|@)", x)[0].strip().lower() for x in modules1]
+            funcs2 = [re.split("(==|@)", x)[0].strip().lower() for x in modules2]
+            scores["module_sim"] = information_coefficient(
+                len(set(funcs1)),
+                len(set(funcs2)),
+                len(set(funcs1).intersection(set(funcs2))),
+            )
+            sims[key] = scores
+
+    outfile = os.path.join(outdir, "pypi-tensorflow-requirements-sims.json")
+    write_json(sims, outfile)
+    return outfile
 
 
 if __name__ == "__main__":

diff --git a/3.plot_sims.py b/3.plot_sims.py
@@ -22,6 +22,18 @@ def get_parser():
         dest="filename",
         help="path to the file with similarity scores to plot.",
     )
+    parser.add_argument(
+        "--name",
+        dest="name",
+        help="name to distinguish output file.",
+    )
+    parser.add_argument(
+        "--dim",
+        dest="dim",
+        type=int,
+        help="dimension for svg figure (defaults to 20)",
+        default=20,
+    )
     parser.add_argument(
         "--outdir", dest="outdir", help="path to output directory.", default=".caliper"
     )
@@ -49,7 +61,7 @@ def main():
     labels = set()
     for key in sims:
         label1, label2 = key.split(
-            "-"
+            ".."
         )  # important, other libraries should use .. in case - is part of the version
         if re.search("(rc|a|b)", label1) or re.search("(rc|a|b)", label2):
             continue
@@ -59,20 +71,19 @@ def main():
     # Versions need to be sorted by version, not string
     # For now we will remove the release candidates
 
-    #lookup = {x.split('rc')[0]:x for x in labels}
-    #versions = [x.split('rc')[0] for x in labels]
     labels = list(labels)
-    labels.sort(key=StrictVersion)
-    # labels = [lookup[x] for x in versions]
-    # labels = list(labels).sort(key=StrictVersion)
+    try:
+        labels.sort(key=StrictVersion)
+    except:
+        labels.sort()
 
     # Next create a data frame for each
     dfs = {
         x: pandas.DataFrame(index=labels, columns=labels)
         for x in sims[list(sims.keys())[0]].keys()
     }
     for pair, values in sims.items():
-        label1, label2 = pair.split("-")
+        label1, label2 = pair.split("..")
         if re.search("(rc|a|b)", label1) or re.search("(rc|a|b)", label2):
             continue
         for key, value in values.items():
@@ -86,7 +97,7 @@ def main():
 
     # Finally, prepare plots!
     for name, df in dfs.items():
-        fig, ax = plt.subplots(figsize=(20, 20))
+        fig, ax = plt.subplots(figsize=(args.dim, args.dim))
         cax = ax.matshow(df.to_numpy(dtype=float), interpolation="nearest")
         ax.grid(True)
         plt.title("Tensorflow Version Similarity: %s" % name)
@@ -113,10 +124,13 @@ def main():
         )
         # plt.show()
         for extension in ["png", "svg"]:
-            outfile = os.path.join(outdir, "pypi-tensorflow-%s-plot.%s" % (name, extension))
+            outfile = os.path.join(
+                outdir, "pypi-tensorflow-%s-%s-plot.%s" % (name, args.name, extension)
+            )
             print("Saving %s" % outfile)
             plt.savefig(outfile, dpi=300)
 
+
 ## TODO: subtract matrices to see difference
 
 if __name__ == "__main__":

diff --git a/README.md b/README.md
@@ -68,19 +68,21 @@ can use function signatures [loaded from a repository](https://caliper-python.re
 $ caliper extract --metric functiondb pypi:tensorflow
 ```
 
-Since the file is too big for version control, we can then run our script to assess changes with
-a path to it.
+And this is added to the repository to be read by Caliper. We can then run the
+script:
 
 ```bash
-python 2.assess_change.py --funcdb ../caliper-metrics/pypi/tensorflow/functiondb/functiondb-results.zip
+python 2.assess_change.py
 ```
 
-This will save a json structure of the three kinds of changes above to the
+This will save two json structures of changes, the first for the function database, and
+the second for the requirements (modules and versions) changes. Both are saved to
 the [.caliper/sims](.caliper/sims) folder. We will want to plot these scores next,
 and compare the matrices. We can plot the similarities as follows:
 
 ```bash
 $ python 3.plot_sims.py --filename .caliper/sims/pypi-tensorflow-sims.json
+$ python 3.plot_sims.py --name requirements --filename .caliper/sims/pypi-tensorflow-requirements-sims.json --dim 35
 ```
 
 Note that to make the plot simpler, we don't show the release candidates (as we assume they are
@@ -90,8 +92,19 @@ of tensorflow based on the function names *and* args (the most detailed comariso
 
 ![.caliper/plots/pypi-tensorflow-func_args_sim-plot.png](.caliper/plots/pypi-tensorflow-func_args_sim-plot.png)
 
-You can [look at the svg](.caliper/plots/pypi-tensorflow-func_args_sim-plot.svg) if you want to see more detail,
-or [browse the folder](.caliper/plots/) for other plots to compare just functions (or one level up), modules.
+We can also look at similarity based on requirements, either considering versions:
+
+![.caliper/plots/pypi-tensorflow-module_version_sim-requirements-plot.png](.caliper/plots/pypi-tensorflow-module_version_sim-requirements-plot.png)
+
+Or just modules:
+
+![.caliper/plots/pypi-tensorflow-module_sim-requirements-plot.png](.caliper/plots/pypi-tensorflow-module_sim-requirements-plot.png)
+
+
+**Todo**: We should also look at how different the plots are.
+
+You can [browse the plots folder](.caliper/plots/) to see more detail, including
+plots that compare just functions or (one level up) modules.
 
 
 ### 3. Parse Data