# Summary


This notebook shows how to use the utility functions to access LLD and Ve data.  
We then present a use case to generate a report comparing LLD and API data.  


**Note:** 
 - the download of the report should take around 11min
 - to reduce log verbosity (hide messages below the INFO level):
    ```
    import logging
    logging.getLogger("analytics_utils").setLevel(logging.INFO)
    ```

In [1]:
from analytics_utils.spark_utils import add_pyspark_path, init_spark_py3, init_spark_py2

# Root of the Spark client installation used by this notebook.
spark_home = '/usr/hdp/current/spark-client/'

# Register that installation with the helper before creating a session
# (presumably puts its PySpark libraries on the import path -- confirm in
# analytics_utils.spark_utils).
add_pyspark_path(spark_home)

# Make sure that the .tar.gz is zipped in from the parent folder to pyspark3 and not /mnt/...
# NOTE(review): this is a user-specific absolute path; adjust it for your account.
archive = "/mnt/home/brayere/pyspark3.tar.gz#pyspark3"

# Name passed to the Spark initialiser for this notebook's session.
NOTEBOOK_NAME = "Analytics utils"

try:
    # Python 3 session. `init_spark_py2` is imported above for Python 2
    # kernels -- check its signature before swapping it in.
    sc, sql_context = init_spark_py3(NOTEBOOK_NAME, spark_home, archive)
except ValueError:
    # Presumably raised when this cell is re-run while the context created by
    # a previous run is still alive. If no such context exists in the
    # notebook namespace, the error has another cause: surface it as-is
    # rather than masking it with a NameError on `sc`.
    if 'sc' not in globals():
        raise
    sc.stop()
    # Retry with the same initialiser; the previous fallback called
    # `init_spark()`, which is neither imported nor defined.
    sc, sql_context = init_spark_py3(NOTEBOOK_NAME, spark_home, archive)

# LLD

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
from analytics_utils.data_feeds import DataFeeds
from analytics_utils.feeds import VeCapture, AppNexus, Events
import analytics_utils.reports as reports

ImportError: cannot import name 'Events'

**Let's look at the available data:**

In [None]:
# Show the member names of each feed enumeration, one line per family.
print('Available feeds:')
print('\tAppNexus: %s' % ([member.name for member in AppNexus],))
print('\tVeCapture: %s' % ([member.name for member in VeCapture],))
print('\tEvents: %s' % ([member.name for member in Events],))

**Loading parquet data**

In [None]:
# Load the AppNexus standard feed through the DataFeeds parquet helper,
# passing 2016-07-10 as the `from_date` argument.
standard_feed = DataFeeds.get_feed_parquet(
    sql_context,
    AppNexus.standard,
    from_date="2016-07-10",
)

**Note**: this is equivalent to
```
standard_feed = DataFeeds.get_feed_parquet(sql_context, AppNexus.standard)
standard_feed = standard_feed.filter(VeFuncs.filter_date(from_date="2016-07-10"))

```

In [7]:
# Print the column names, types and nullability of the loaded standard feed.
standard_feed.printSchema()

root
 |-- auction_id_64: long (nullable = true)
 |-- datetime: timestamp (nullable = true)
 |-- user_tz_offset: integer (nullable = true)
 |-- width: integer (nullable = true)
 |-- height: integer (nullable = true)
 |-- media_type: integer (nullable = true)
 |-- fold_position: integer (nullable = true)
 |-- event_type: string (nullable = true)
 |-- imp_type: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- media_cost_dollars_cpm: double (nullable = true)
 |-- revenue_type: integer (nullable = true)
 |-- buyer_spend: double (nullable = true)
 |-- buyer_bid: double (nullable = true)
 |-- ecp: double (nullable = true)
 |-- eap: double (nullable = true)
 |-- is_imp: integer (nullable = true)
 |-- is_learn: integer (nullable = true)
 |-- predict_type_rev: integer (nullable = true)
 |-- othuser_id_64: long (nullable = true)
 |-- ip_address: string (nullable = true)
 |-- ip_address_trunc: string (nullable = true)
 |-- geo_country: string (nullable = true)
 |-- geo_r

In [15]:
# Fetch one row and display it as a dict of column name -> value.
# Indexing with [0] raises IndexError if the feed returns no rows.
standard_feed.take(1)[0].asDict()

{'advertiser_currency': 'GBP',
 'advertiser_exchange_rate': 0.7612,
 'advertiser_frequency': 2,
 'advertiser_id': 976697,
 'advertiser_recency': 10,
 'age': 0,
 'application_id': '---',
 'auction_id_64': 18313597427956,
 'auction_service_deduction': 0.000235,
 'auction_service_fees': 0.0,
 'bid_priority': None,
 'billing_period_id': 91478,
 'booked_revenue_adv_curr': 0.0025,
 'booked_revenue_dollars': 0.003284,
 'brand_id': 110429,
 'browser': 18,
 'buyer_bid': 2.299002,
 'buyer_currency': 'USD',
 'buyer_member_id': 992,
 'buyer_spend': 2.132056,
 'cadence_modifier': 0.695919,
 'campaign_group_id': 2983392,
 'campaign_id': 12984486,
 'can_convert': 1,
 'carrier_id': 359,
 'clear_fees': 0.0,
 'commission_cpm': 0.0,
 'commission_revshare': 0.0,
 'control_creative_id': 0,
 'control_pct': 0.0,
 'creative_freq': 1,
 'creative_id': 47059529,
 'creative_overage_fees': 0.0,
 'creative_rec': 10,
 'custom_model_id': 0,
 'custom_model_last_modified': 0,
 'data_costs_cpm': '0.000000',
 'datetime':

**Loading json data**

In [8]:
# TODO: unfinished cell. The original statement,
#     df = sql_context.read.json(sql_context, AppNexus.)
# is not valid Python (dangling `AppNexus.`) and passes the SQL context where
# `DataFrameReader.json` expects a path. Either supply the JSON path here or
# load the feed with `DataFeeds.get_feed_json(sql_context, <feed>)`.

In [1]:
# Load the `Events.apps` feed with the DataFeeds JSON helper.
# NOTE(review): the variable is called `pageView_feed` but the feed requested
# is `Events.apps` -- confirm this is the intended feed.
# NOTE(review): requires the import cell (DataFeeds, Events) to have been run
# in the current kernel first.
pageView_feed = DataFeeds.get_feed_json(sql_context, Events.apps)

NameError: name 'DataFeeds' is not defined

In [12]:
# Print the column names, types and nullability of `pageView_feed`.
pageView_feed.printSchema()

root
 |-- session_id: string (nullable = true)
 |-- page_id: long (nullable = true)
 |-- url: string (nullable = true)
 |-- viewed_at: timestamp (nullable = true)
 |-- num_page_events: long (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- client_ip: string (nullable = true)
 |-- ve_cookie_id: string (nullable = true)
 |-- journey_code: string (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- client_language: string (nullable = true)
 |-- start_date: timestamp (nullable = true)
 |-- end_date: timestamp (nullable = true)
 |-- platform: string (nullable = true)
 |-- abandon_data_state: string (nullable = true)
 |-- email: string (nullable = true)
 |-- partner_cookies: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- geo_info: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- continent: string (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- country_code: string

In [None]:
# Echo `standard_feed` as the cell's output.
standard_feed