In [1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import urllib2
import datetime
import sys

from moztelemetry.spark import get_pings, get_pings_properties
import moztelemetry.spark

%pylab inline

from operator import add

Unable to parse whitelist (/home/hadoop/anaconda2/lib/python2.7/site-packages/moztelemetry/histogram-whitelists.json). Assuming all histograms are acceptable.
Populating the interactive namespace from numpy and matplotlib


In [2]:
## Show the Spark context's default parallelism setting for this cluster.
sc.defaultParallelism

64

### Extract a working dataset

Collect nightly data from builds dated within a 2-week window.

In [3]:
def fmt_date(d):
    """ Render a date/datetime as a compact YYYYMMDD string. """
    compactFormat = "%Y%m%d"
    formatted = d.strftime(compactFormat)
    return formatted

## Dates bounding the time window to look at.
## Take a single timestamp so that both bounds are derived from the same instant;
## two separate now() calls could straddle midnight and skew the window by a day.
windowAnchor = datetime.datetime.now()
t1 = fmt_date(windowAnchor - datetime.timedelta(16)) # go back 16 days
t2 = fmt_date(windowAnchor - datetime.timedelta(2)) # go back 2 days
t1, t2

('20160830', '20160913')

In [4]:
## Collect only saved-session pings (which cover full browser sessions rather than the usual subsessions).
## NOTE(review): no doc_type argument is passed below, so this relies on get_pings
## defaulting to saved-session pings -- confirm against the installed moztelemetry version.
## The build_id range is the (t1, t2) window computed above; fraction=1.0 presumably
## means no sampling (all matching pings) -- verify.

pings = get_pings(sc, app="Firefox", channel="nightly", build_id=(t1, t2), fraction=1.0)

In [5]:
def parseAddons(addons):
    """ Build the list of (ID, version) pairs from a mapping of add-on ID -> add-on details.

        The version is None for any add-on whose details carry no "version" entry.
    """
    pairs = []
    for guid, details in addons.iteritems():
        pairs.append((guid, details.get("version")))
    return pairs

def extract(ping):
    """ Extract relevant fields from each payload.

        Returns a dict with the following entries:

        clientId: The ping's client ID, or None if the ping has none.
        os: Name of the operating system reported in the environment.
        e10s: Value of the environment's e10sEnabled setting.
        sessionLength: Length of the session (seconds).
        shims: Reason why add-on shims were used, keyed by add-on ID (enumerated count of reason codes)
        cpowTime: Contiguous time spent by an add-on blocking the main loop by performing a blocking
                  cross-process call (microseconds, keyed by add-on ID).
        cpowForbidden: Number of times an add-on used CPOWs when it was marked as e10s compatible
                       (count, keyed by add-on ID).
        addons: List of (ID, version) for each enabled add-on.

        The keyed histograms and the active add-ons default to empty when absent.
        The "payload", "environment", OS name, e10s setting and session length
        entries are indexed directly, so a ping lacking any of them raises KeyError.
    """
    payload = ping["payload"]
    environment = ping["environment"]
    keyed = payload.get("keyedHistograms", {})
    return {
        "clientId": ping.get("clientId", None),
        "os": environment["system"]["os"]["name"],
        "e10s": environment["settings"]["e10sEnabled"],
        "sessionLength": payload["info"]["sessionLength"], ## in seconds
        "shims": keyed.get("ADDON_SHIM_USAGE", {}),
        "cpowTime": keyed.get("PERF_MONITORING_SLOW_ADDON_CPOW_US", {}),
        "cpowForbidden": keyed.get("ADDON_FORBIDDEN_CPOW_USAGE", {}),
        "addons": parseAddons(environment.get("addons", {}).get("activeAddons", {}))
    }

## Pull the relevant fields out of every ping, keep only the sessions that
## report at least one add-on, and persist the result for reuse below.
bySession = (
    pings
    .map(extract)
    .filter(lambda session: session["addons"])
    .persist(StorageLevel.MEMORY_AND_DISK_SER)
)

The `bySession` dataset has one record per client session which had enabled add-ons.

How many session pings are in the dataset?

In [6]:
## This is the first action on bySession visible in the notebook, so it is
## presumably what runs the extraction and fills the persisted cache.
bySession.count()

916124

How many unique clients do these come from?

In [7]:
## Project each session onto its client ID and count the distinct values.
## NOTE(review): extract() falls back to None for pings without a clientId, so any
## such sessions would all be counted together as one extra "client".
bySession.map(lambda p: p["clientId"]).distinct().count()

54137

How many add-ons are represented in the dataset, and what are the top few?

In [8]:
## Count, for each add-on GUID, the number of distinct clients it was seen on
## (versions are ignored here). The result is a local dict of GUID -> # clients.
addonCounts = (
    bySession
    .flatMap(lambda session: [(addon[0], session["clientId"]) for addon in session["addons"]])
    .distinct()
    .map(lambda guidClient: guidClient[0])
    .countByValue()
)
len(addonCounts)

7268

In [9]:
## Top 20 add-ons by client count; ties are broken by GUID.
sorted(addonCounts.items(), key=lambda item: (-item[1], item[0]))[:20]

[(u'flyweb@mozilla.org', 53991),
 (u'webcompat@mozilla.org', 53953),
 (u'e10srollout@mozilla.org', 53885),
 (u'firefox@getpocket.com', 53720),
 (u'{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}', 10761),
 (u'uBlock0@raymondhill.net', 4520),
 (u'{b9db16a4-6edc-47ec-a1f4-b86292ed211d}', 2616),
 (u'{e4a8a97b-f2ed-450b-b12d-ee082ba24781}', 2451),
 (u'{DDC359D1-844A-42a7-9AA1-88A850A938A8}', 1710),
 (u'{73a6fe31-595d-460b-a920-fcc0f8843232}', 1602),
 (u'firebug@software.joehewitt.com', 1501),
 (u'{46551EC9-40F0-4e47-8E18-8E5CF550CFB8}', 1339),
 (u'firefox@mega.co.nz', 1302),
 (u'firefox@ghostery.com', 1222),
 (u'{b9bfaf1c-a63f-47cd-8b9a-29526ced9060}', 1155),
 (u'adbhelper@mozilla.org', 1139),
 (u'fxdevtools-adapters@mozilla.org', 1108),
 (u'loop@mozilla.org', 1080),
 (u'wrc@avast.com', 1037),
 (u'support@lastpass.com', 1019)]

### Shims

Count enabled add-on installs (ID and version), together with whether or not they were observed to use shims.

An add-on is counted as using shims if it has an entry in the `ADDON_SHIM_USAGE` keyed histogram for at least one client session (regardless of the values in the histogram). This histogram records shim usage occurrence by the [reason it was used](https://dxr.mozilla.org/mozilla-central/source/toolkit/components/addoncompat/CompatWarning.jsm#94).

In [10]:
## Summarize shim usage per add-on/client.
## Reduce multiple sessions observed for each client to
## a single entry of the form ((ID, version), clientID, usedShims).

def getShimData(d):
    """ List one (((GUID, version), clientID), <used shims?>) record per add-on in the session.

        An add-on counts as having used shims when its GUID is a key of the
        session's "shims" mapping, whatever the associated value.
    """
    clientKeyed = []
    for addonv in d["addons"]:
        usedShims = addonv[0] in d["shims"]
        clientKeyed.append(((addonv, d["clientId"]), usedShims))
    return clientKeyed

## OR together the per-session flags of each ((ID, version), clientID) key,
## then flatten every record to ((ID, version), clientID, usedShims).
shimUsageByClient = (
    bySession
    .flatMap(getShimData)
    .reduceByKey(lambda seen, current: seen or current)
    .map(lambda kv: (kv[0][0], kv[0][1], kv[1]))
)

In [11]:
## Count the clients behind each ((ID, version), usedShims) combination.
## Records come out as ((ID, version), usedShims, # clients).

shimUsageCounts = (
    shimUsageByClient
    .map(lambda rec: ((rec[0], rec[2]), 1))
    .reduceByKey(add)
    .map(lambda kv: (kv[0][0], kv[0][1], kv[1]))
)

In [12]:
## Collapse the (at most two) usedShims rows of each (add-on, version) pair into one:
## the flag becomes True if any client used shims, and the client counts are added up.
## Records come out as
##  ((ID, version), <used shims in at least one client session>, overall # installations).

shimUsageByAddon = (
    shimUsageCounts
    .map(lambda rec: (rec[0], (rec[1], rec[2])))
    .reduceByKey(lambda left, right: (left[0] or right[0], left[1] + right[1]))
    .map(lambda kv: (kv[0], kv[1][0], kv[1][1]))
)

Sanity check: how many add-on (GUID, version) pairs do we have?

In [13]:
## Number of distinct (GUID, version) keys, before any installation-count threshold is applied.
shimUsageByAddon.count()

9930

Our final shim usage dataset is restricted to add-ons that were installed/enabled in at least 50 profiles.

In [14]:
## Keep entries with an installation count (last element) of at least 50,
## sort them by decreasing count (ties broken by the remaining fields),
## and bring the result back to the driver as a list.
shimUsageFiltered = (
    shimUsageByAddon
    .filter(lambda entry: entry[-1] >= 50)
    .sortBy(lambda entry: (-entry[-1], entry[:-1]))
    .collect()
)

How many add-ons are in the final dataset?

In [15]:
## shimUsageFiltered is the result of collect(), so this is an ordinary len() rather than a Spark action.
len(shimUsageFiltered)

518

How many of these used shims?

In [16]:
## Keep the entries whose usedShims flag (second element) is set.
shimUsageShimmed = [entry for entry in shimUsageFiltered if entry[1]]
len(shimUsageShimmed)

270

Dump results to a JSON file that will be used in the HTML page.

The file is one big JSON array with elements of the form `[<numInstallations>, [<GUID>, <version>], <usedShims>]`.

In [17]:
def formatForJSON(d):
    """ Rotate a tuple so that its last element (the installation count) comes first. """
    count = d[-1]
    leading = d[:-1]
    return (count,) + leading

shimUsageOutput = [formatForJSON(entry) for entry in shimUsageFiltered]

## Write the same payload to both candidate locations. Either one may be
## unwritable (e.g. a missing output/ directory), so writing stays best-effort,
## but a failure is reported instead of being silently discarded, and the
## file handle is always closed.
for path in ('output/shim-data.json', 'shim-data.json'):
    try:
        with open(path, 'w') as output:
            json.dump(shimUsageOutput, output)
    except (IOError, OSError) as err:
        sys.stderr.write("Could not write %s: %s\n" % (path, err))

The shim usage data, ordered by decreasing installation count.

In [18]:
## Display the full collected list of ((ID, version), usedShims, # installations) entries.
shimUsageFiltered

[((u'flyweb@mozilla.org', u'1.0.0'), False, 53991),
 ((u'webcompat@mozilla.org', u'1.0'), False, 53953),
 ((u'firefox@getpocket.com', u'1.0.4'), True, 53718),
 ((u'e10srollout@mozilla.org', u'1.2'), False, 50207),
 ((u'e10srollout@mozilla.org', u'1.0'), False, 29169),
 ((u'{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}', u'2.7.3'), True, 10516),
 ((u'uBlock0@raymondhill.net', u'1.9.4'), True, 3710),
 ((u'uBlock0@raymondhill.net', u'1.9.6'), True, 3036),
 ((u'{b9db16a4-6edc-47ec-a1f4-b86292ed211d}', u'6.0.0'), True, 2499),
 ((u'{e4a8a97b-f2ed-450b-b12d-ee082ba24781}', u'3.9'), True, 2318),
 ((u'firebug@software.joehewitt.com', u'2.0.17'), False, 1365),
 ((u'{DDC359D1-844A-42a7-9AA1-88A850A938A8}', u'3.0.6'), False, 1340),
 ((u'{73a6fe31-595d-460b-a920-fcc0f8843232}', u'2.9.0.14'), True, 1299),
 ((u'{46551EC9-40F0-4e47-8E18-8E5CF550CFB8}', u'2.0.7'), False, 1268),
 ((u'firefox@ghostery.com', u'6.3.2'), True, 1179),
 ((u'adbhelper@mozilla.org', u'0.8.7'), False, 1131),
 ((u'fxdevtools-adapters@mo

### CPOWs

CPOW usage is recorded in the `PERF_MONITORING_SLOW_ADDON_CPOW_US` histogram as time in microseconds spent by an add-on blocking the main loop using a CPOW.

Summarize CPOW usage for each enabled add-on (ID and version) by:

- average number of microseconds per CPOW blocking occurrence
- average number of blocking occurrences per hour of session time

Note that in many cases an add-on has an entry in the histogram, but it only has observations with the value 0. It appears that the histogram is automatically recorded at the same time as `PERF_MONITORING_SLOW_ADDON_JANK_US` by the [AddonWatcher](https://dxr.mozilla.org/mozilla-central/source/toolkit/components/perfmonitoring/AddonWatcher.jsm#128), and so may not have any CPOW blocking to report at that time. This is handled by dropping observations in the histograms' '0' bucket.

In [19]:
## Summarize CPOW usage per add-on, returning entries of the form
## ((GUID, version), { 
##    "totalTime" : <total session time in seconds with this add-on>,
##    "numOccurrences": <total number of times add-on CPOW blocked main loop>,
##    "totalCPOWTime": <total blocking time for add-on CPOWs>
## })

def getCPOWData(d):
    """ Build one ((GUID, version), {totalTime, numOccurrences, totalCPOWTime})
        record for every add-on listed in the session.

        totalTime is always the session length. The two CPOW figures stay at 0
        unless the add-on has a CPOW histogram with a positive sum.
    """
    def summarize(addonv):
        occurrences = 0
        blockedTime = 0
        hist = d["cpowTime"].get(addonv[0])
        ## A histogram that is missing, or whose sum is 0, contributes nothing.
        if hist and hist["sum"] > 0:
            ## Observations in the "0" bucket are not counted as occurrences.
            ## A list (not a generator) is handed to sum() on purpose: %pylab may
            ## have rebound sum to numpy's version -- TODO confirm.
            nonZeroCounts = [count for bucket, count in hist["values"].items() if bucket != "0"]
            occurrences = sum(nonZeroCounts)
            blockedTime = hist["sum"]
        return {
            "totalTime": d["sessionLength"],
            "numOccurrences": occurrences,
            "totalCPOWTime": blockedTime
        }

    return [(addonv, summarize(addonv)) for addonv in d["addons"]]

def dictSum(a, b):
    """ Return a new dict holding a[k] + b[k] for every key k of a.

        Keys present only in b are ignored; a key of a that is missing
        from b raises KeyError.
    """
    return {key: a[key] + b[key] for key in a}

## Expand every session into per-add-on records, then add the records up per (GUID, version) key.
## Note that, despite the name, the result holds one entry per add-on/version, not per session.
cpowBySession = bySession.flatMap(getCPOWData).reduceByKey(dictSum)

Our final CPOW dataset is restricted to add-ons which were active during a total accumulated session time of at least 1 hour.

In [20]:
## Keep add-ons with at least 3600 seconds (1 hour) of accumulated session time.
cpowFiltered = cpowBySession.filter(lambda kv: kv[1]["totalTime"] >= 3600)

In [21]:
## Summarize add-on CPOW usage with:
## - hadCPOWBlocking: were there any CPOW blocking occurrences?
## - avgBlockingTime: the average blocking time spent per occurrence (truncated to the nearest microsecond)
## - occurrenceFreq: the average number of blocking occurrences per session hour.

def summaryCPOWTime(d):
    """ Turn the accumulated CPOW totals of an add-on into summary statistics.

        d: dict with "totalTime" (session seconds), "numOccurrences"
           (number of blocking occurrences) and "totalCPOWTime" (microseconds).

        Returns a dict with:
        hadCPOWBlocking: True if there was at least one blocking occurrence.
        avgBlockingTime: blocking microseconds per occurrence, rounded down
                         (0 if there were no occurrences).
        occurrenceFreq: occurrences per hour of session time
                        (0 if there were no occurrences).
    """
    sessionHours = float(d["totalTime"]) / 3600
    hadCPOWBlocking = d["numOccurrences"] > 0
    return {
        "hadCPOWBlocking": hadCPOWBlocking,
        ## Since these are microseconds anyway, truncate. Floor division is spelled
        ## out with // so the result does not depend on how the interpreter treats /.
        "avgBlockingTime": d["totalCPOWTime"] // d["numOccurrences"] if hadCPOWBlocking else 0,
        "occurrenceFreq": float(d["numOccurrences"]) / sessionHours if hadCPOWBlocking else 0
    }

## Keep each (GUID, version) key and replace its totals dict with the summary statistics.
cpowSummary = cpowFiltered.mapValues(summaryCPOWTime)

How many add-ons were active during at least 1 hour of overall session time?

In [22]:
## Number of (GUID, version) entries that passed the 1-hour session time threshold.
cpowFiltered.count()

8327

How many of these had blocking CPOWs?

In [23]:
## Bring back to the driver only the add-ons whose summary reports CPOW blocking.
cpowHadBlocking = (
    cpowSummary
    .filter(lambda kv: kv[1]["hadCPOWBlocking"])
    .collect()
)
len(cpowHadBlocking)

2044

Dump data for add-ons that had CPOW blocking to a JSON file that will be used in the HTML page.

The file is one big JSON array with elements of the form `[[<GUID>, <version>], <avgBlockingTime>, <occurrenceFreq>]`.

In [28]:
def formatForJSON(entry):
    """ Flatten an ((GUID, version), summary) pair into
        ((GUID, version), avgBlockingTime, occurrenceFreq).

        The pair is unpacked in the body rather than in the signature, since
        tuple parameters are not accepted by Python 3.

        NOTE: this rebinds the name formatForJSON, which an earlier cell
        defines for the shim data; re-run that cell before re-running the
        shim JSON dump.
    """
    addon, summary = entry
    return (addon, summary["avgBlockingTime"], summary["occurrenceFreq"])

## Order by decreasing average blocking time (second element of each row).
cpowBlockingOutput = sorted(
    [formatForJSON(entry) for entry in cpowHadBlocking],
    key=lambda row: -row[1]
)

## Write the same payload to both candidate locations. Either one may be
## unwritable (e.g. a missing output/ directory), so writing stays best-effort,
## but a failure is reported instead of being silently discarded, and the
## file handle is always closed.
for path in ('output/cpow-data.json', 'cpow-data.json'):
    try:
        with open(path, 'w') as output:
            json.dump(cpowBlockingOutput, output)
    except (IOError, OSError) as err:
        sys.stderr.write("Could not write %s: %s\n" % (path, err))

The blocking CPOW data, ordered by decreasing average blocking time.

In [25]:
## Largest average blocking time first.
sorted(cpowHadBlocking, key=lambda kv: kv[1]["avgBlockingTime"], reverse=True)

[((u'{e98b7313-167d-48c6-89be-bc514d6de8d9}', u'4.5.1-signed.1-signed'),
  {'avgBlockingTime': 31630591,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 0.013292155002937402}),
 ((u'{53A03D43-5363-4669-8190-99061B2DEBA5}', u'1.5.13'),
  {'avgBlockingTime': 17252625,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 0.030683369784040217}),
 ((u'{1BFCBDFC-41DB-11E1-9FC4-D3C94824019B}', u'3.0.1.1-signed.1-signed'),
  {'avgBlockingTime': 7405952,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 0.40710059171597635}),
 ((u'cacaoweb@cacaoweb.org', u'1.0.34'),
  {'avgBlockingTime': 6961773,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 0.1194925549499791}),
 ((u'mweb71@yahoo.com', u'2.0.1-signed.1-signed'),
  {'avgBlockingTime': 5977808,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 0.025503914850929617}),
 ((u'ascsurfingprotectionnew@iobit.com', u'2.1.2'),
  {'avgBlockingTime': 5043876,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 0.9085838396695611}),
 ((u'{53A03D43-5363-4669-8190

The blocking CPOW data, ordered by decreasing occurrence frequency.

In [26]:
## Most frequent blocking first.
sorted(cpowHadBlocking, key=lambda kv: kv[1]["occurrenceFreq"], reverse=True)

[((u'{0d4cca85-dc2d-45b2-bbaf-78d1b51629f0}', u'1.0.1'),
  {'avgBlockingTime': 304021L,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 1183.0276366340315}),
 ((u'puc@fantamondi.it', u'1.2.3.1-signed.1-signed'),
  {'avgBlockingTime': 206209,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 1018.0955657353927}),
 ((u'PardusCopilot@mozilla.doslash.org', u'1.1.5.1-signed.1-signed'),
  {'avgBlockingTime': 123276,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 974.8614967392006}),
 ((u'{632875c8-e432-4e5c-b398-1981a2c82534}', u'1.0.1'),
  {'avgBlockingTime': 217242,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 930.1233118027012}),
 ((u'{c33c5b47-69c8-45a4-a5e0-af85bbe628dd}', u'1.6.3.1-signed.1-signed'),
  {'avgBlockingTime': 153120L,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 760.8678947687704}),
 ((u'{13e8d46d-09b8-4fd6-b75a-25c04a0db747}', u'1.0.1'),
  {'avgBlockingTime': 299514L,
   'hadCPOWBlocking': True,
   'occurrenceFreq': 670.8649427031321}),
 ((u'{29b8df85-56af-474f-