Skip to content

Commit

Permalink
adding plots to compare requirements
Browse files Browse the repository at this point in the history
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
  • Loading branch information
vsoch committed Jan 18, 2021
1 parent 87f3260 commit 044b65c
Show file tree
Hide file tree
Showing 8 changed files with 92,855 additions and 55 deletions.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
18,625 changes: 18,625 additions & 0 deletions .caliper/plots/pypi-tensorflow-module_sim-requirements-plot.svg
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
18,674 changes: 18,674 additions & 0 deletions .caliper/plots/pypi-tensorflow-module_version_sim-requirements-plot.svg
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
55,446 changes: 55,446 additions & 0 deletions .caliper/sims/pypi-tensorflow-requirements-sims.json

Large diffs are not rendered by default.

108 changes: 68 additions & 40 deletions 2.assess_change.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ def get_parser():
"--funcdb",
dest="funcdb",
help="path to extracted function database (zip or json)",
default=".caliper",
)
return parser

Expand Down Expand Up @@ -89,18 +88,14 @@ def main():
sys.exit("The data directory is missing from the caliper root folder.")

# Create output directory
outdir = os.path.join(dirname, "changes")
outdir = os.path.join(dirname, "sims")
if not os.path.exists(outdir):
os.mkdir(outdir)

## Step 1: extract requirements to assess change
extract_requirements(datadir, outdir)

outdir = os.path.join(dirname, "sims")
if not os.path.exists(outdir):
os.mkdir(outdir)

# Step 3: load in the function signatures to assess version changes
## Step 2: load in the function signatures to assess version changes
extract_function_changes(outdir, args.funcdb)


Expand Down Expand Up @@ -147,24 +142,26 @@ def extract_function_changes(outdir, funcdb):
# We don't need a manager since we aren't extracting from a repository
extractor = MetricsExtractor("pypi:tensorflow")

# Read in and load the function database metric
filename = os.path.abspath(funcdb)
if not os.path.exists(funcdb):
sys.exit("Function database file %s does not exist." % funcdb)
db = extractor.load_metric("functiondb", filename=filename)
if funcdb:
filename = os.path.abspath(funcdb)
if not os.path.exists(funcdb):
sys.exit("Function database file %s does not exist." % funcdb)
db = extractor.load_metric("functiondb", filename=filename)
else:
db = extractor.load_metric("functiondb")

# Level 1 similarity: overall modules
# Level 2 similarity: functions
# Level 3 similarity: function arguments too
sims = {}

# First just compare functions that exist
for version1, db1 in db["by-file"].items():
for version2, db2 in db["by-file"].items():
for version1, db1 in db.items():
for version2, db2 in db.items():

# Don't calculate it twice
scores = {}
key = "-".join(sorted([version1, version2]))
key = "..".join(sorted([version1, version2]))

# Keep the user updated
if key in sims:
Expand Down Expand Up @@ -208,40 +205,71 @@ def extract_function_changes(outdir, funcdb):


def extract_requirements(datadir, outdir):
"""Given a known list of dependencies for a package, we want to extract all
requirements (to see change between version) that can then be used to assess
overall change in a package
"""Create a lookup for requirements including (and not including) versions
to generate similarity matrices. An alternative is to extract all
requirements (to see change between version) for a package and have this
say something about the parent package, but this seems more complicated.
"""
# Keep a lookup of requirements.txt to compare across
rxments = set()
requirements = {}

# Read in input files, organize by python version, tensorflow version
for filename in iter_files(datadir):

# Derive the name and versions from the filename (also in inputs:name)
result = read_json(filename)
if "requirements.txt" not in result:
requirements[filename] = {}
# Skip release candidates and a/b for now
if re.search("(rc|b|a)", os.path.basename(filename)):
continue

[
rxments.add(re.split("(=|@)", x)[0].strip())
for x in result["requirements.txt"]
]

client = MetricsExtractor()
for library in rxments:

try:
manager = PypiManager(library)
extractor = MetricsExtractor(manager)
# extractor.extract_metric("changedlines")
extractor.extract_metric("totalcounts")
extractor.save_all(outdir)
extractor.cleanup(force=True)
except:
print("Issue with %s" % library)
# Only include those we have requirements for (meaning success install)
result = read_json(filename)
if "requirements.txt" in result:
requirements[filename] = [
x.strip().lower() for x in result["requirements.txt"]
]

# Level 1 similarity: overall modules
# Level 2 similarity: modules and version string
sims = {}

# First just compare functions that exist
for filename1, modules1 in requirements.items():
for filename2, modules2 in requirements.items():

uid1 = os.path.basename(filename1).rstrip(".json")
uid2 = os.path.basename(filename2).rstrip(".json")

# Don't calculate it twice
scores = {}
key = "..".join(sorted([uid1, uid2]))
if key in sims:
continue

# Diagonal is perfectly similar
if uid1 == uid2:
scores = {"module_sim": 1, "module_version_sim": 1}
sims[key] = scores
continue

# Level 1: Module and version similarity
modules1 = set(modules1)
modules2 = set(modules2)
scores["module_version_sim"] = information_coefficient(
len(modules1), len(modules2), len(modules1.intersection(modules2))
)

# Level 2: Don't include versions, ignore casing
funcs1 = [re.split("(==|@)", x)[0].strip().lower() for x in modules1]
funcs2 = [re.split("(==|@)", x)[0].strip().lower() for x in modules2]
scores["module_sim"] = information_coefficient(
len(set(funcs1)),
len(set(funcs2)),
len(set(funcs1).intersection(set(funcs2))),
)
sims[key] = scores

outfile = os.path.join(outdir, "pypi-tensorflow-requirements-sims.json")
write_json(sims, outfile)
return outfile


if __name__ == "__main__":
Expand Down
32 changes: 23 additions & 9 deletions 3.plot_sims.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,18 @@ def get_parser():
dest="filename",
help="path to the file with similarity scores to plot.",
)
parser.add_argument(
"--name",
dest="name",
help="name to distinguish output file.",
)
parser.add_argument(
"--dim",
dest="dim",
type=int,
help="dimension for svg figure (defaults to 20)",
default=20,
)
parser.add_argument(
"--outdir", dest="outdir", help="path to output directory.", default=".caliper"
)
Expand Down Expand Up @@ -49,7 +61,7 @@ def main():
labels = set()
for key in sims:
label1, label2 = key.split(
"-"
".."
) # important, other libraries should use .. in case - is part of the version
if re.search("(rc|a|b)", label1) or re.search("(rc|a|b)", label2):
continue
Expand All @@ -59,20 +71,19 @@ def main():
# Versions need to be sorted by version, not string
# For now we will remove the release candidates

#lookup = {x.split('rc')[0]:x for x in labels}
#versions = [x.split('rc')[0] for x in labels]
labels = list(labels)
labels.sort(key=StrictVersion)
# labels = [lookup[x] for x in versions]
# labels = list(labels).sort(key=StrictVersion)
try:
labels.sort(key=StrictVersion)
except:
labels.sort()

# Next create a data frame for each
dfs = {
x: pandas.DataFrame(index=labels, columns=labels)
for x in sims[list(sims.keys())[0]].keys()
}
for pair, values in sims.items():
label1, label2 = pair.split("-")
label1, label2 = pair.split("..")
if re.search("(rc|a|b)", label1) or re.search("(rc|a|b)", label2):
continue
for key, value in values.items():
Expand All @@ -86,7 +97,7 @@ def main():

# Finally, prepare plots!
for name, df in dfs.items():
fig, ax = plt.subplots(figsize=(20, 20))
fig, ax = plt.subplots(figsize=(args.dim, args.dim))
cax = ax.matshow(df.to_numpy(dtype=float), interpolation="nearest")
ax.grid(True)
plt.title("Tensorflow Version Similarity: %s" % name)
Expand All @@ -113,10 +124,13 @@ def main():
)
# plt.show()
for extension in ["png", "svg"]:
outfile = os.path.join(outdir, "pypi-tensorflow-%s-plot.%s" % (name, extension))
outfile = os.path.join(
outdir, "pypi-tensorflow-%s-%s-plot.%s" % (name, args.name, extension)
)
print("Saving %s" % outfile)
plt.savefig(outfile, dpi=300)


## TODO: subtract matrices to see difference

if __name__ == "__main__":
Expand Down
25 changes: 19 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,19 +68,21 @@ can use function signatures [loaded from a repository](https://caliper-python.re
$ caliper extract --metric functiondb pypi:tensorflow
```

Since the file is too big for version control, we can then run our script to assess changes with
a path to it.
And this is added to the repository to be read by Caliper. We can then run the
script:

```bash
python 2.assess_change.py --funcdb ../caliper-metrics/pypi/tensorflow/functiondb/functiondb-results.zip
python 2.assess_change.py
```

This will save a json structure of the three kinds of changes above to the
This will save two JSON structures of changes: the first for the function database, and
the second for the requirements (modules and versions). Both are saved to
the [.caliper/sims](.caliper/sims) folder. Next, we plot these similarity scores
so that the resulting matrices can be compared.

```bash
$ python 3.plot_sims.py --filename .caliper/sims/pypi-tensorflow-sims.json
$ python 3.plot_sims.py --name requirements --filename .caliper/sims/pypi-tensorflow-requirements-sims.json --dim 35
```

Note that to make the plot simpler, we don't show the release candidates (as we assume they are
Expand All @@ -90,8 +92,19 @@ of tensorflow based on the function names *and* args (the most detailed comariso

![.caliper/plots/pypi-tensorflow-func_args_sim-plot.png](.caliper/plots/pypi-tensorflow-func_args_sim-plot.png)

You can [look at the svg](.caliper/plots/pypi-tensorflow-func_args_sim-plot.svg) if you want to see more detail,
or [browse the folder](.caliper/plots/) for other plots to compare just functions (or one level up), modules.
We can also look at similarity based on requirements, either considering versions:

![.caliper/plots/pypi-tensorflow-module_version_sim-requirements-plot.png](.caliper/plots/pypi-tensorflow-module_version_sim-requirements-plot.png)

Or just modules:

![.caliper/plots/pypi-tensorflow-module_sim-requirements-plot.png](.caliper/plots/pypi-tensorflow-module_sim-requirements-plot.png)


**Todo:** We should also look at how different the plots are.

You can [browse the plots folder](.caliper/plots/) to see more detail, and to find other
plots that compare just functions or (one level up) modules.


### 3. Parse Data
Expand Down

0 comments on commit 044b65c

Please sign in to comment.