In [1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import urllib2
import datetime
import sys

from moztelemetry.spark import get_pings, get_pings_properties
import moztelemetry.spark

%pylab inline

from operator import add



Unable to parse whitelist (/home/hadoop/anaconda2/lib/python2.7/site-packages/moztelemetry/histogram-whitelists.json). Assuming all histograms are acceptable.
Populating the interactive namespace from numpy and matplotlib


In [2]:
## Show the SparkContext's default parallelism setting.
## NOTE(review): `sc` is not defined in this notebook; presumably the Spark kernel supplies it.
sc.defaultParallelism

64

### Extract working dataset

Collect nightly data from builds dated within a 2-week window.

In [3]:
def fmt_date(d):
    """ Render a date/datetime as a compact YYYYMMDD string. """
    stamp = d.strftime("%Y%m%d")
    return stamp

## Dates bounding the time window to look at.
## Both bounds are derived from a single timestamp: calling now() once per bound
## could straddle midnight and silently produce a 15-day window instead of 14.
refTime = datetime.datetime.now()
t1 = fmt_date(refTime - datetime.timedelta(days=16)) # go back 16 days
t2 = fmt_date(refTime - datetime.timedelta(days=2)) # go back 2 days
t1, t2

('20160830', '20160913')

In [4]:
## Collect pings for Firefox nightly builds whose build ID falls between t1 and t2.
## NOTE(review): no doc_type is passed, so this relies on get_pings' default ping
## type -- confirm that it is saved-session. fraction=1.0 presumably requests the
## full set rather than a sample.
pings = get_pings(sc, app="Firefox", channel="nightly", build_id=(t1, t2), fraction=1.0)

In [10]:
def parseAddons(addons):
    """ Create a list of (ID, version) pairs from a mapping of add-on ID to details.

        addons: dict keyed by add-on ID whose values are dicts of add-on details.
                A missing or empty value (None or {}) yields an empty list.

        Returns a list of (ID, version) tuples; version is None when the details
        have no "version" entry.
    """
    if not addons:
        return []
    ## items() behaves the same on Python 2 and 3, unlike the Python-2-only iteritems().
    return [(guid, details.get("version")) for guid, details in addons.items()]

def extract(ping):
    """ Extract relevant fields from each payload.

        Returns a dict with the following entries:

        clientId: Client identifier taken from the ping, or None if it has none.
        os: Operating system name from the ping environment.
        e10s: Value of the e10sEnabled environment setting.
        sessionLength: Session length from the payload info (seconds).
        shims: Reason why add-on shims were used, keyed by add-on ID (enumerated count of reason codes)
        cpowTime: Contiguous time spent by an add-on blocking the main loop by performing a blocking
                  cross-process call (microseconds, keyed by add-on ID).
        cpowForbidden: Number of times an add-on used CPOWs when it was marked as e10s compatible
                       (count, keyed by add-on ID).
        addons: List of (ID, version) for each active add-on.

        Raises KeyError if the ping lacks "payload", "environment", or any of the
        nested os / settings / info fields read below.
    """
    payload = ping["payload"]
    environment = ping["environment"]
    ## A key that is present but null is treated the same as a missing key,
    ## so the chained lookups below never hit None.
    keyed = payload.get("keyedHistograms") or {}
    activeAddons = (environment.get("addons") or {}).get("activeAddons") or {}
    return {
        "clientId": ping.get("clientId", None),
        "os": environment["system"]["os"]["name"],
        "e10s": environment["settings"]["e10sEnabled"],
        "sessionLength": payload["info"]["sessionLength"], ## in seconds
        "shims": keyed.get("ADDON_SHIM_USAGE", {}),
        "cpowTime": keyed.get("PERF_MONITORING_SLOW_ADDON_CPOW_US", {}),
        "cpowForbidden": keyed.get("ADDON_FORBIDDEN_CPOW_USAGE", {}),
        "addons": parseAddons(activeAddons)
    }

## Reduce each ping to the fields of interest, keep only sessions that report
## at least one add-on, and persist the result for reuse by the later cells.
bySession = (
    pings.map(extract)
    .filter(lambda session: session["addons"])
    .persist(StorageLevel.MEMORY_AND_DISK_SER)
)

How many session pings are in the dataset?

In [11]:
## Number of session records left after dropping sessions without add-ons.
## NOTE(review): presumably the first Spark action on bySession, so the extraction
## and persist actually happen here.
bySession.count()

913874

How many unique clients do these come from?

In [14]:
## Distinct clientId values across all sessions. extract() stores None for pings
## without a clientId, so all such pings together count as a single "client" here.
bySession.map(lambda p: p["clientId"]).distinct().count()

53996

How many add-ons are represented in the dataset, and what are the top few?

In [15]:
## Pair every add-on ID with the client reporting it, de-duplicate so each client
## contributes at most once per add-on ID, then tally clients per add-on ID.
addonCounts = (
    bySession
    .flatMap(lambda session: [(addon[0], session["clientId"]) for addon in session["addons"]])
    .distinct()
    .map(lambda pair: pair[0])
    .countByValue()
)
len(addonCounts)

7260

In [16]:
## Top 20 add-on IDs by number of distinct clients; ties are ordered by add-on ID.
sorted(addonCounts.items(), key=lambda item: (-item[1], item[0]))[:20]

[(u'flyweb@mozilla.org', 53851),
 (u'webcompat@mozilla.org', 53813),
 (u'e10srollout@mozilla.org', 53746),
 (u'firefox@getpocket.com', 53582),
 (u'{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}', 10730),
 (u'uBlock0@raymondhill.net', 4513),
 (u'{b9db16a4-6edc-47ec-a1f4-b86292ed211d}', 2605),
 (u'{e4a8a97b-f2ed-450b-b12d-ee082ba24781}', 2448),
 (u'{DDC359D1-844A-42a7-9AA1-88A850A938A8}', 1706),
 (u'{73a6fe31-595d-460b-a920-fcc0f8843232}', 1597),
 (u'firebug@software.joehewitt.com', 1497),
 (u'{46551EC9-40F0-4e47-8E18-8E5CF550CFB8}', 1337),
 (u'firefox@mega.co.nz', 1301),
 (u'firefox@ghostery.com', 1218),
 (u'{b9bfaf1c-a63f-47cd-8b9a-29526ced9060}', 1151),
 (u'adbhelper@mozilla.org', 1137),
 (u'fxdevtools-adapters@mozilla.org', 1106),
 (u'loop@mozilla.org', 1078),
 (u'wrc@avast.com', 1035),
 (u'support@lastpass.com', 1015)]

### Shims

Count enabled add-on installs (ID and version), together with whether or not they were observed to use shims.

An add-on is counted as using shims if it has an entry in the `ADDON_SHIM_USAGE` keyed histogram for at least one client (regardless of the values in the histogram). This histogram records shim usage occurrence by the [reason it was used](https://dxr.mozilla.org/mozilla-central/source/toolkit/components/addoncompat/CompatWarning.jsm#94).

In [17]:
## Summarize shim usage per add-on/client.
## Reduce over multiple sessions observed for each client,
## and flatten to ((ID, version), clientID, usedShims).

def getShimData(d):
    """ Build one (((ID, version), clientID), <used shims?>) record per add-on in the session.

        An add-on is flagged as using shims when its ID is a key of the session's
        "shims" mapping.
    """
    records = []
    for addonv in d["addons"]:
        key = (addonv, d["clientId"])
        usedShims = addonv[0] in d["shims"]
        records.append((key, usedShims))
    return records

## Collapse all sessions of a client into one flag per ((ID, version), clientID):
## True if any session showed shim usage. Then flatten the key into the row.
shimUsageByClient = (
    bySession
    .flatMap(getShimData)
    .reduceByKey(lambda seen, current: seen or current)
    .map(lambda kv: (kv[0][0], kv[0][1], kv[1]))
)

In [18]:
## Count clients per ((ID, version), usedShims) combination.
## Rows come out as ((ID, version), usedShims, # clients).

shimUsageCounts = (
    shimUsageByClient
    .map(lambda row: ((row[0], row[2]), 1))
    .reduceByKey(add)
    .map(lambda kv: (kv[0][0], kv[0][1], kv[1]))
)

In [20]:
## Merge the shimmed / not-shimmed rows of every (add-on, version) pair:
## the pair is flagged if any client used shims, and the client counts are summed
## to give the overall installation count.

shimUsageByAddon = (
    shimUsageCounts
    .map(lambda row: (row[0], (row[1], row[2])))
    .reduceByKey(lambda left, right: (left[0] or right[0], left[1] + right[1]))
    .map(lambda kv: (kv[0], kv[1][0], kv[1][1]))
)

In [21]:
## Keep only (ID, version) pairs enabled in at least 50 profiles, ordered by
## decreasing installation count (ties broken by the rest of the row),
## and bring the result back to the driver.

shimUsageFiltered = (
    shimUsageByAddon
    .filter(lambda row: row[-1] >= 50)
    .sortBy(lambda row: (-row[-1], row[:-1]))
    .collect()
)

In [22]:
## Add-ons that were found to use shims.

shimUsageShimmed = [entry for entry in shimUsageFiltered if entry[1]]

How many add-ons had at least 50 enabled installations in our dataset?

In [23]:
## Rows are (ID, version) pairs, so an add-on with several versions above the
## threshold is counted once per version.
len(shimUsageFiltered)

515

How many of these used shims?

In [24]:
## Number of those (ID, version) pairs flagged as having used shims.
len(shimUsageShimmed)

268

In [25]:
def formatForJSON(d):
    """ Reorder a shim usage row for JSON output by moving its last element first.

        A row ((<GUID>, <version>), <usedShims>, <numInstallations>) becomes
        (<numInstallations>, (<GUID>, <version>), <usedShims>).
        Nothing is written here; the caller serializes the reordered rows.
    """
    count = d[-1]
    rest = d[:-1]
    return (count,) + rest

## Reorder every row to (<numInstallations>, (<GUID>, <version>), <usedShims>).
## A list comprehension keeps this a real list on both Python 2 and 3.
shimUsageOutput = [formatForJSON(row) for row in shimUsageFiltered]

## Best-effort write to both locations. A path that cannot be opened or written
## (e.g. the output/ directory does not exist) is reported and skipped instead of
## being silently ignored, and `with` guarantees the file is closed either way.
## Errors other than I/O failures (e.g. serialization problems) now propagate.
for path in ('output/shim-data.json', 'shim-data.json'):
    try:
        with open(path, 'w') as output:
            json.dump(shimUsageOutput, output)
    except (IOError, OSError) as err:
        sys.stderr.write("Could not write %s: %s\n" % (path, err))

In [26]:
## Full listing of ((ID, version), usedShims, # installations), in decreasing order of installations.
shimUsageFiltered

[((u'flyweb@mozilla.org', u'1.0.0'), False, 53851),
 ((u'webcompat@mozilla.org', u'1.0'), False, 53813),
 ((u'firefox@getpocket.com', u'1.0.4'), True, 53580),
 ((u'e10srollout@mozilla.org', u'1.2'), False, 50051),
 ((u'e10srollout@mozilla.org', u'1.0'), False, 29155),
 ((u'{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}', u'2.7.3'), True, 10486),
 ((u'uBlock0@raymondhill.net', u'1.9.4'), True, 3708),
 ((u'uBlock0@raymondhill.net', u'1.9.6'), True, 3015),
 ((u'{b9db16a4-6edc-47ec-a1f4-b86292ed211d}', u'6.0.0'), True, 2488),
 ((u'{e4a8a97b-f2ed-450b-b12d-ee082ba24781}', u'3.9'), True, 2315),
 ((u'firebug@software.joehewitt.com', u'2.0.17'), False, 1362),
 ((u'{DDC359D1-844A-42a7-9AA1-88A850A938A8}', u'3.0.6'), False, 1333),
 ((u'{73a6fe31-595d-460b-a920-fcc0f8843232}', u'2.9.0.14'), True, 1295),
 ((u'{46551EC9-40F0-4e47-8E18-8E5CF550CFB8}', u'2.0.7'), False, 1266),
 ((u'firefox@ghostery.com', u'6.3.2'), True, 1175),
 ((u'adbhelper@mozilla.org', u'0.8.7'), False, 1129),
 ((u'fxdevtools-adapters@mo

### CPOWs

Count CPOW usage, recorded in the `PERF_MONITORING_SLOW_ADDON_CPOW_US` histogram as time in microseconds spent by an add-on blocking the main loop using a CPOW.

The metric currently computed is the average time spent per CPOW usage occurrence, multiplied by the number of telemetry sessions that had a recorded histogram.

Note that in many cases an add-on has an entry in the histogram, but it only has observations with the value 0. It appears that the histogram is recorded at the same time as `PERF_MONITORING_SLOW_ADDON_JANK_US` by the [AddonWatcher](https://dxr.mozilla.org/mozilla-central/source/toolkit/components/perfmonitoring/AddonWatcher.jsm#128).

In [None]:
def processForHist(d):
    """ Return a list of ((ID, version), histogram) for each add-on with a CPOW histogram entry.

        d: session dict as produced by extract(); reads d['addons'] (list of
           (ID, version) entries) and d['cpowTime'] (histograms keyed by add-on ID).

        The version is None when the add-on ID is not among d['addons'], or when
        its entry there is not an (ID, version) pair.
    """
    ## Build the ID -> version lookup once rather than rescanning the add-on list
    ## for every histogram. setdefault keeps the first entry seen for an ID.
    versions = {}
    for addonv in d['addons']:
        versions.setdefault(addonv[0], addonv[1] if len(addonv) == 2 else None)
    ## extract() stores the CPOW histograms under "cpowTime" (there is no "cpow" key).
    return [ ((addon, versions.get(addon)), h) for (addon, h) in d['cpowTime'].items() ]

## bySession holds one plain dict per session (not (id, dict) pairs), so each
## record is passed to processForHist directly.
cpowHists = bySession.flatMap(processForHist)

In [None]:
## Peek at a single record to check its structure.
cpowHists.take(1)

In [None]:
def fixHist(entry):
    """ Summarize histogram info, together with a count of subsessions ('hits') it occurs in.

        entry: a (key, histogram) pair, where histogram has a 'values' mapping of
               bucket label -> count and a 'sum' field.

        Returns (key, summary) where summary holds:
          n: total number of recorded values,
          sum: the histogram's 'sum' (total microseconds),
          values: the bucket counts with bucket labels converted to int,
          hits: 1, so that merging summaries counts how many histograms were combined.
    """
    ## Unpack in the body: tuple parameters in a def are Python-2-only syntax,
    ## and the original parameter name shadowed the builtin `id`.
    key, h = entry
    ## Number of recorded values in a subsession.
    n = sum(h['values'].values())
    ## Total number of microseconds across the subsession.
    s = h['sum']
    vs = { int(k): v for (k, v) in h['values'].items() }
    return (key, {'n': n, 'sum': s, 'values': vs, 'hits': 1})

def combineHists(h1, h2):
    """ Merge two histogram summaries and their counts.

        h1, h2: summaries with 'n', 'sum', 'values' (bucket -> count) and 'hits'.

        Returns a new summary whose numeric fields are the sums of the inputs and
        whose 'values' adds the counts bucket by bucket. Neither input is modified.
    """
    ## Work on a copy: merging into h1['values'] directly would mutate the caller's
    ## summary and leave the result sharing a dict with it.
    vs = dict(h1['values'])
    for k, v in h2['values'].items():
        vs[k] = vs.get(k, 0) + v
    return {'n': h1['n'] + h2['n'],
            'sum': h1['sum'] + h2['sum'],
            'values': vs,
            'hits': h1['hits']+h2['hits']}

## Summarize each histogram, then merge the summaries that share an (ID, version) key.
cpows = cpowHists.map(fixHist).reduceByKey(combineHists)

In [None]:
## Keep only add-ons whose CPOW histogram was recorded in more than 50 subsessions.
cpowsFiltered = cpows.filter(lambda kv: kv[1]['hits'] > 50)

In [None]:
## Sort by decreasing (average CPOW time per recorded value) x (number of hits).
## Should it just be d['sum'] / d['n']?
def _cpowScore(d):
    """ Average microseconds per recorded value, scaled by the number of hits.

        Uses true division so the average is not truncated to an integer under
        Python 2, and returns 0.0 for a summary with no recorded values instead
        of raising ZeroDivisionError.
    """
    if not d['n']:
        return 0.0
    return float(d['sum']) / d['n'] * d['hits']

cpowsSorted = cpowsFiltered.map(lambda kv: (_cpowScore(kv[1]), kv)).sortByKey(False)

In [None]:
## Drop the full summary, keeping just ((ID, version), score) in sorted order.
cpowsSimple = cpowsSorted.map(lambda scored: (scored[1][0], scored[0]))

In [None]:
## Bring the ((ID, version), score) list back to the driver.
cpowTimes = cpowsSimple.collect()

In [None]:
## Best-effort write to both locations. A path that cannot be opened or written
## (e.g. the output/ directory does not exist) is reported and skipped instead of
## being silently ignored, and `with` guarantees the file is closed either way.
## Errors other than I/O failures (e.g. serialization problems) now propagate.
for path in ('output/cpow-data.json', 'cpow-data.json'):
    try:
        with open(path, 'w') as output:
            json.dump(cpowTimes, output)
    except (IOError, OSError) as err:
        sys.stderr.write("Could not write %s: %s\n" % (path, err))

In [None]:
## Full listing of ((ID, version), score), highest score first.
cpowTimes