In [1]:
from datetime import date, timedelta, datetime

def int2date(n):
    """
    Turn <n>, a count of days elapsed since Jan 1st 1970, into the matching date.
    Negative values of <n> give dates before Jan 1st 1970.
    """
    epoch = date(1970, 1, 1)
    offset = timedelta(days=n)
    return epoch + offset

def date2int(d):
    """
    Turn the date <d> into the number of whole days elapsed since Jan 1st 1970.
    Dates before Jan 1st 1970 give a negative number.
    """
    epoch = date(1970, 1, 1)
    elapsed = d - epoch
    return elapsed.days

def str2date(s, f="%Y%m%d"):
    """
    Parse the string <s>, written in the strptime format <f>, and return it as a date.
    Any time-of-day information in <s> is discarded.
    """
    parsed = datetime.strptime(s, f)
    return parsed.date()

In [2]:
#TODAY_INT = date2int(date.today())                        # today (day notebook is run) as days since Jan 1st, 1970
# Profile-creation cutoff (2010-01-01) expressed as days since Jan 1st, 1970.
# NOTE(review): intended to drop profiles created before this date, but no cell
# shown in this notebook references it — confirm it is still needed.
PCD_CUTOUT_START_INT = date2int(date(2010, 1, 1))
# Submission-date window, as yyyymmdd strings (both bounds inclusive):
# pings submitted before SD_CUTOUT_START_STR are removed ...
SD_CUTOUT_START_STR = "20160801"
# ... and pings submitted after SD_CUTOUT_END_STR are removed.
SD_CUTOUT_END_STR = "20161231"

In [7]:
# Connect to the main_summary dataset (parquet files on S3); mergeSchema=True
# asks Spark to reconcile the schemas of the individual parquet files.
allPingsDF = sqlContext.read.load(
    "s3://telemetry-parquet/main_summary/v3",
    format="parquet",
    mergeSchema=True)

# Perform variable selection with column renaming.
# appversion keeps only the first two characters of app_version. The explicit
# Column.substr(startPos, length) call (1-based start) is used instead of Python
# slice syntax on the column, so the intent does not depend on how slices are
# translated to substr.
# NOTE(review): this assumes a two-digit major version; a one- or three-digit
# major version would be truncated incorrectly — confirm for the date range used.
allPingsDFSelect = allPingsDF.select(
    allPingsDF.client_id.alias("cid"),
    allPingsDF.sample_id.alias("sid"),
    allPingsDF.normalized_channel.alias("channel"),
    allPingsDF.submission_date_s3.alias("sd"),
    allPingsDF.app_name.alias("appname"),
    allPingsDF.app_version.substr(1, 2).alias("appversion"),
    allPingsDF.country,
    allPingsDF.city)

In [8]:
# Narrow the selected pings down to the population of interest, then keep only
# the columns needed by the aggregation:
# - 1% sample (sample_id is 42)
# - channel is release
# - application is Firefox
# - country is DE
# - submission date within [SD_CUTOUT_START_STR, SD_CUTOUT_END_STR], both inclusive
# No missing-value replacement and no caching is done in this cell.
filteredPingsDF = (
    allPingsDFSelect
    .filter(allPingsDFSelect["sid"] == "42")
    .filter(allPingsDFSelect["channel"] == "release")
    .filter(allPingsDFSelect["appname"] == "Firefox")
    .filter(allPingsDFSelect["country"] == "DE")
    .filter(allPingsDFSelect["sd"] >= SD_CUTOUT_START_STR)
    .filter(allPingsDFSelect["sd"] <= SD_CUTOUT_END_STR)
    .select(["cid", "city", "sd", "appversion"])
)

In [9]:
# Sanity check: fetch five of the filtered pings and display them.
filteredPingsDF.take(5)

[Row(cid=u'3ed8fd6b-d256-499b-a469-f6ead5da6a03', city=u'Heidelberg', sd=u'20161017', appversion=u'49'),
 Row(cid=u'077cdecd-cc71-412e-a8fe-fcd857aeea16', city=u'Meschede', sd=u'20161017', appversion=u'49'),
 Row(cid=u'bebcc4c4-43f0-4f44-abfe-1d0805154f9c', city=u'Andechs', sd=u'20161017', appversion=u'49'),
 Row(cid=u'b80fbb53-02c8-43e5-9bf4-875499f2f6a1', city=u'Dortmund', sd=u'20161017', appversion=u'49'),
 Row(cid=u'b80fbb53-02c8-43e5-9bf4-875499f2f6a1', city=u'Dortmund', sd=u'20161017', appversion=u'49')]

In [10]:
# Register the filtered pings under the table name "filteredPingsDF" so that
# the SQL query in the next cell can select from it.
sqlContext.registerDataFrameAsTable(filteredPingsDF, "filteredPingsDF")

In [16]:
# Count distinct clients per (city, submission date, app version).
# - sd is converted from its yyyyMMdd string form into a date
# - GROUP BY / ORDER BY 1, 2, 3 refer to the first three SELECT columns by position
# - the aggregate has no alias, so its column comes out as "count(DISTINCT cid)"
query = """
    SELECT city, to_date(CAST(UNIX_TIMESTAMP(sd, 'yyyyMMdd') AS TIMESTAMP)) AS sd, appversion, count(distinct cid)
    FROM filteredPingsDF
    GROUP BY 1, 2, 3
    ORDER BY 1, 2, 3
    """
# Cached because the following cells reuse the result (show, count, toPandas).
aggregateDF = sqlContext.sql(query).cache()

In [17]:
# Preview the first rows of the aggregate.
# NOTE(review): the rows printed below all have city "??" — presumably pings
# whose city could not be resolved; confirm how these should be treated.
aggregateDF.show()

+----+----------+----------+-------------------+
|city|        sd|appversion|count(DISTINCT cid)|
+----+----------+----------+-------------------+
|  ??|2016-08-01|        40|                  1|
|  ??|2016-08-01|        41|                 64|
|  ??|2016-08-01|        42|                143|
|  ??|2016-08-01|        43|                305|
|  ??|2016-08-01|        44|                165|
|  ??|2016-08-01|        45|                220|
|  ??|2016-08-01|        46|                278|
|  ??|2016-08-01|        47|               8938|
|  ??|2016-08-02|        40|                  2|
|  ??|2016-08-02|        41|                 63|
|  ??|2016-08-02|        42|                146|
|  ??|2016-08-02|        43|                291|
|  ??|2016-08-02|        44|                181|
|  ??|2016-08-02|        45|                226|
|  ??|2016-08-02|        46|                280|
|  ??|2016-08-02|        47|               8991|
|  ??|2016-08-02|        48|                 58|
|  ??|2016-08-03|   

In [15]:
# Number of (city, sd, appversion) rows in the aggregate.
aggregateDF.count() # 1,600,531 when this notebook was last run

1600531

In [18]:
# Bring the whole aggregate into a pandas DataFrame
# (about 1.6 million rows according to the count output in this notebook).
aggregatePD = aggregateDF.toPandas()

In [20]:
# Write the aggregate to FxDataViz.csv as UTF-8, without the pandas index column.
aggregatePD.to_csv("FxDataViz.csv", index=False, encoding='utf-8')