In [None]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import operator
import json, time, sys
from __future__ import division
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client
def fmt_date(d):
    """Render a date-like object as a compact ``YYYYMMDD`` string.

    Args:
        d: Any object exposing ``strftime`` (e.g. ``datetime.date`` or
            ``datetime.datetime``).

    Returns:
        str: The year, month and day of ``d`` as eight digits, no separators.
    """
    compact_pattern = "%Y%m%d"
    return d.strftime(compact_pattern)
def repartition(pipeline):
    """Repartition ``pipeline`` to ``MaxPartitions`` partitions and cache it.

    Args:
        pipeline: Object exposing ``repartition(n)``; the value that call
            returns must expose ``cache()``.

    Returns:
        Whatever ``cache()`` returns for the repartitioned pipeline.
    """
    # NOTE(review): MaxPartitions is not defined in the visible part of this
    # notebook -- confirm it is assigned before this helper is called.
    resized = pipeline.repartition(MaxPartitions)
    return resized.cache()

%pylab inline

## !!! EDIT THESE BEFORE RUNNING !!!
##
## These represent the number of uniform random samples of raw
## sessions we'll take, from each channel, as a fraction.
##
## When testing on small clusters, it is important to keep the
## fraction low. Using a cluster of size 4, with about 100,000
## pings, the job below took me about 10 minutes. The fraction
## was f=0.001. For iterative development, it is best to use even
## smaller fractions.
##
## As of Nov 2015, 1% (fraction=0.01) of sessions results in
## about 1,000,000 samples for Beta, and about 1/5th that for
## Release. After FHR is removed, the Release population will
## become much, much larger.
# Sampling fraction handed to get_pings_for_channel() for the 'beta' channel.
BetaFraction = 0.0003
# Sampling fraction handed to get_pings_for_channel() for the 'release' channel.
ReleaseFraction = 0.00005

In [None]:
# Disable logging, since this eats a ton of diskspace.
def quiet_logs(sc):
    """Raise the log level of the "org" and "akka" loggers to ERROR.

    Args:
        sc: Object whose ``_jvm.org.apache.log4j`` attribute exposes
            ``LogManager`` and ``Level``.
    """
    log4j = sc._jvm.org.apache.log4j
    for logger_name in ("org", "akka"):
        log4j.LogManager.getLogger(logger_name).setLevel(log4j.Level.ERROR)
# NOTE(review): `sc` is not assigned anywhere in the visible notebook;
# presumably it is the SparkContext provided by the kernel -- confirm.
quiet_logs(sc)

In [None]:
# Constants.
# Multiplied by a week count to build the submission-date window, in days.
DaysPerWeek = 7
# Property paths requested from each ping; the same strings are then used
# as keys when reading values back out of a ping.
GfxAdaptersKey = 'environment/system/gfx/adapters'
SystemOsNameKey = 'environment/system/os/name'
# Field names read from a single adapter entry.
AdapterVendorKey = 'vendorID'
AdapterDeviceKey = 'deviceID'
AdapterDriverKey = 'driverVersion'

# Vendor IDs accepted by has_valid_adapter(), mapped to the display name
# used when results are printed.
MajorVendors = {
    u'0x8086': 'Intel',
    u'0x10de': 'NVIDIA',
    u'0x1002': 'AMD',
}

In [None]:
def get_pings_for_channel(channel, fraction, weeks=3):
    """Fetch a per-client sample of Windows pings that have a usable adapter.

    Args:
        channel: Channel name forwarded to ``get_pings`` (e.g. 'beta').
        fraction: Sampling fraction forwarded to ``get_pings``.
        weeks: Length of the submission-date window, in weeks, ending at the
            current time. Defaults to 3, the window this notebook has always
            used, so existing two-argument calls behave as before.

    Returns:
        The ping collection returned by ``get_pings``, narrowed to the
        requested properties, reduced to one ping per client, and filtered to
        pings whose OS name is 'Windows_NT' and that pass
        ``has_valid_adapter``.
    """
    # Imported locally so this function does not depend on `datetime` having
    # been injected into the notebook namespace by another cell or magic.
    import datetime

    end_date = datetime.datetime.now()
    start_date = end_date - datetime.timedelta(DaysPerWeek * weeks)
    date_range = (fmt_date(start_date), fmt_date(end_date))

    args = {
        'app': 'Firefox',
        'schema': 'v4',
        'submission_date': date_range,
        'channel': channel,
        'fraction': fraction,
    }

    pings = get_pings(sc, **args)
    pings = get_pings_properties(pings, [
        'clientId',
        GfxAdaptersKey,
        SystemOsNameKey,
    ])
    pings = get_one_ping_per_client(pings)

    # Only care about Windows for now.
    pings = pings.filter(lambda p: p.get(SystemOsNameKey, None) == 'Windows_NT')
    # has_valid_adapter is resolved when the filter runs, so it may be
    # defined later in the same cell.
    pings = pings.filter(has_valid_adapter)
    return pings
    
def has_valid_adapter(p):
    """Tell whether a ping's first graphics adapter is usable for this analysis.

    A ping qualifies only when the list stored under ``GfxAdaptersKey`` has a
    first entry whose vendor ID is a key of ``MajorVendors`` and whose driver
    version and device ID are both present and non-empty.

    Args:
        p: Ping property mapping, indexed by property path.

    Returns:
        bool: True if the first adapter passes every check, otherwise False.
    """
    try:
        adapter = p[GfxAdaptersKey][0]
    except (LookupError, TypeError):
        # Missing key, empty adapter list, or a value that cannot be
        # subscripted (such as None). Other exceptions -- notably
        # KeyboardInterrupt -- propagate instead of being read as
        # "no adapter".
        return False
    if not hasattr(adapter, 'get'):
        # A null or non-mapping adapter entry cannot carry the fields below.
        return False
    if adapter.get(AdapterVendorKey, 'unknown') not in MajorVendors:
        return False
    if not adapter.get(AdapterDriverKey, None):
        return False
    if not adapter.get(AdapterDeviceKey, None):
        return False
    return True

In [None]:
# Sample each channel using the fractions configured at the top of the notebook.
beta_pings = get_pings_for_channel('beta', BetaFraction)
release_pings = get_pings_for_channel('release', ReleaseFraction)

# Cache both result sets: they are read by the count() calls below and again
# by every later cell that uses beta_pings / release_pings.
beta_pings = beta_pings.cache()
release_pings = release_pings.cache()

# Report how many pings survived the filters in get_pings_for_channel().
print('Found {0} beta pings.'.format(beta_pings.count()))
print('Found {0} release pings.'.format(release_pings.count()))

In [None]:
# These two helpers take in an RDD and return a map of
#   (Vendor, Info) => Population%
#   
def extract_info_map(pings, subkey):
    """Build a ``(vendor, value) => population %`` map for one adapter field.

    Args:
        pings: Ping collection exposing ``map`` and ``count``; every ping
            must have a first adapter carrying the vendor field and
            ``subkey``.
        subkey: Adapter field whose values are tallied (for example
            ``AdapterDriverKey``).

    Returns:
        dict: ``(vendorID, adapter[subkey])`` mapped to the percentage of
        ``pings`` that reported that pair.
    """
    def keyed_row(ping):
        primary = ping[GfxAdaptersKey][0]
        # Emitted as a 1-tuple whose only element is the (vendor, value)
        # pair; countByKey() then tallies rows by that element.
        return ((primary[AdapterVendorKey], primary[subkey]),)

    tallies = pings.map(keyed_row).countByKey()
    return compute_map_percentages(pings, tallies)

def compute_map_percentages(pings, info_map):
    """Convert raw per-key counts into percentages of the ping population.

    Args:
        pings: Collection exposing ``count()``; its result is the denominator.
        info_map: Mapping of key => number of pings observed with that key.

    Returns:
        dict: The same keys, each mapped to ``count / pings.count() * 100``.
    """
    population = float(pings.count())
    return {
        key: (float(hits) / population) * 100.0
        for key, hits in info_map.items()
    }

In [None]:
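# Return a tuple (missing_sorted, release_map, beta_map), where
# missing_sorted is a list of tuples in the format:
#   [((vendor, subkey), population%), ...]
# 
# Each tuple in that list is a key found in the Release population but
# not Beta, sorted by the rate at which it appears in Release from
# highest to lowest. release_map and beta_map are the full
# (vendor, subkey) => population% maps for each channel.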
def compute_differences(subkey):
    """Compare Release against Beta for one adapter field.

    Reads the notebook-level ``beta_pings`` and ``release_pings``.

    Args:
        subkey: Adapter field to compare (for example ``AdapterDriverKey``).

    Returns:
        tuple: ``(missing_sorted, release_map, beta_map)`` where
        ``missing_sorted`` is a list of ``((vendor, value), population%)``
        pairs present in Release but absent from Beta, ordered from the
        highest Release percentage to the lowest, and the two maps are the
        full per-channel results of ``extract_info_map``.
    """
    beta_map = extract_info_map(beta_pings, subkey)
    release_map = extract_info_map(release_pings, subkey)

    # Keys seen in Release that Beta never reported.
    release_only = set(release_map) - set(beta_map)
    release_only_rates = {key: release_map[key] for key in release_only}

    ranked = sorted(
        release_only_rates.items(),
        key=lambda entry: entry[1],
        reverse=True)

    return (ranked, release_map, beta_map)

In [None]:
# Each call returns (release-only entries sorted by Release share,
# full Release map, full Beta map) for the given adapter field.
driver_diff, release_drivers, beta_drivers = compute_differences(AdapterDriverKey)
device_diff, release_devices, beta_devices = compute_differences(AdapterDeviceKey)

In [None]:
# Driver report: totals per channel, then every driver version seen on
# Release but not on Beta, highest Release share first.
print('-----------------------')
print('Unique drivers: {0} in release, {1} in beta.'.format(len(release_drivers), len(beta_drivers)))
print('{0} drivers that appear in release, but not beta:'.format(len(driver_diff)))
for (vendor, driver), perc in driver_diff:
    print('  {0} {1} ({2:.3f}%)'.format(
        MajorVendors[vendor],
        driver,
        perc))

print('-----------------------')

In [None]:
# Device report: totals per channel, then every device ID seen on Release
# but not on Beta, highest Release share first.
print('-----------------------')
print('Unique devices: {0} in release, {1} in beta.'.format(
    len(release_devices),
    len(beta_devices)))
# This cell lists devices; the label previously said "drivers" (copied from
# the driver report).
print('{0} devices that appear in release, but not beta:'.format(
    len(device_diff)))
for ((vendor, device), perc) in device_diff:
    print('  {0} {1} ({2:.3f}%)'.format(MajorVendors[vendor], device, perc))
print('-----------------------')