# Summary


This notebook shows how to use the utils functions to access LLD and Ve data.  
We then present a use case to generate a report comparing LLD and API data.  


**Note:** 
 - the download of the report should take around 11min
 - to hide debug-level logs (only messages at INFO level and above are kept):
    ```
    import logging
    logging.getLogger("analytics_utils").setLevel(logging.INFO)
    ```

In [1]:
import os

# Switch the working directory to the parent folder.
# NOTE: this is relative to the current directory, so re-running the cell
# moves one more level up each time; run it only once per kernel session.
os.chdir(os.pardir)

# Last expression of the cell: displays the directory we ended up in.
os.getcwd()

'/home/julien/Work/analytics_utils'

In [2]:
from analytics_utils.spark_utils import add_pyspark_path, init_spark_py3, init_spark_py2

# Root of the Spark installation to use (the cluster path is kept for reference).
# spark_home =  '/usr/hdp/current/spark-client/'
spark_home =  '/home/julien/apps/spark-1.6.2/'

# Register the Spark installation with the helper before creating a context.
add_pyspark_path(spark_home)

# Make sure that the .tar.gz is zipped in from the parent folder to pyspark3 and not /mnt/...
# archive = "/mnt/home/brayere/pyspark3.tar.gz#pyspark3"
archive = "/home/julien/pyspark3.tar.gz#pyspark3"

NOTEBOOK_NAME = "Analytics utils"

try:
    # Python 3 context. For Python 2, use init_spark_py2 instead
    # (presumably it takes the same arguments -- TODO confirm).
    sc, sql_context = init_spark_py3(NOTEBOOK_NAME, spark_home, archive)
except ValueError:
    # Presumably raised when a SparkContext is already running in this kernel
    # (e.g. the cell was executed twice) -- TODO confirm against init_spark_py3.
    # Stop the previous context, if this kernel still holds one, then retry
    # with the same initialiser. The original fallback called an undefined
    # `init_spark()` and therefore always failed with a NameError.
    if 'sc' in globals():
        sc.stop()
    sc, sql_context = init_spark_py3(NOTEBOOK_NAME, spark_home, archive)

Exception: Java gateway process exited before sending the driver its port number

# LLD

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
from analytics_utils.data_feeds import DataFeeds
from analytics_utils.feeds import VeCapture, AppNexus
import analytics_utils.reports as reports

**Let's look at the available data:**

In [5]:
# Collect the member names of each feed enumeration, then report them.
lld_feed_names = [feed.name for feed in AppNexus]
ve_feed_names = [feed.name for feed in VeCapture]

print('Available LLD feeds: %s' % lld_feed_names)
print('Available VeData feeds: %s' % ve_feed_names)

Available LLD feeds: ['standard', 'segment', 'pixel']
Available VeData feeds: ['category_1d', 'category_7d', 'category_30d', 'page_view', 'categorizer']


**We only want the standard feed**

In [7]:
# Load the AppNexus "standard" feed, restricted by start date.
standard_feed = DataFeeds.get_feed_parquet(
    sql_context,
    AppNexus.standard,
    from_date="2016-07-10",
)

**Note**: this is equivalent to
```
standard_feed = DataFeeds.get_feed_parquet(sql_context, AppNexus.standard)
standard_feed = standard_feed.filter(VeFuncs.filter_date(from_date="2016-07-10"))
```

In [8]:
# Print the feed's schema as a tree: column name, type and nullability.
standard_feed.printSchema()

root
 |-- auction_id_64: long (nullable = true)
 |-- datetime: timestamp (nullable = true)
 |-- user_tz_offset: integer (nullable = true)
 |-- width: integer (nullable = true)
 |-- height: integer (nullable = true)
 |-- media_type: integer (nullable = true)
 |-- fold_position: integer (nullable = true)
 |-- event_type: string (nullable = true)
 |-- imp_type: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- media_cost_dollars_cpm: double (nullable = true)
 |-- revenue_type: integer (nullable = true)
 |-- buyer_spend: double (nullable = true)
 |-- buyer_bid: double (nullable = true)
 |-- ecp: double (nullable = true)
 |-- eap: double (nullable = true)
 |-- is_imp: integer (nullable = true)
 |-- is_learn: integer (nullable = true)
 |-- predict_type_rev: integer (nullable = true)
 |-- othuser_id_64: long (nullable = true)
 |-- ip_address: string (nullable = true)
 |-- ip_address_trunc: string (nullable = true)
 |-- geo_country: string (nullable = true)
 |-- geo_r

**Or the VeCapture feed**

In [11]:
# Load the VeCapture "page_view" feed with a start date and no end date given.
pageView_feed = DataFeeds.get_feed_parquet(
    sql_context,
    VeCapture.page_view,
    from_date="2016-07-10",
    to_date=None,
)

In [12]:
# Print the feed's schema as a tree, including nested map/struct fields.
pageView_feed.printSchema()

root
 |-- session_id: string (nullable = true)
 |-- page_id: long (nullable = true)
 |-- url: string (nullable = true)
 |-- viewed_at: timestamp (nullable = true)
 |-- num_page_events: long (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- client_ip: string (nullable = true)
 |-- ve_cookie_id: string (nullable = true)
 |-- journey_code: string (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- client_language: string (nullable = true)
 |-- start_date: timestamp (nullable = true)
 |-- end_date: timestamp (nullable = true)
 |-- platform: string (nullable = true)
 |-- abandon_data_state: string (nullable = true)
 |-- email: string (nullable = true)
 |-- partner_cookies: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- geo_info: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- continent: string (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- country_code: string