In [1]:
import findspark
findspark.init()
from pyspark.sql.functions import col
from pyspark.sql import functions as F
import pandas as pd
import pyspark
from pyspark.sql import SparkSession

from functools import reduce

# Build (or reuse, via getOrCreate) the SparkSession shared by every cell below.
# NOTE(review): the 'yarn-client' master string is reported as deprecated by
# Spark 2.x in favour of master 'yarn' with client deploy mode -- confirm
# against the cluster's Spark version before changing it.
spark = (
    SparkSession.builder
    .appName('yxing_qa_test')
    .master('yarn-client')
    .config('spark.executor.memory', '28g')
    .config('spark.cores.max', '10')
    .config('spark.driver.memory', '10g')
    # This option used to be passed without a value; Builder.config stores
    # str(value), so the setting ended up as the literal string 'None'.
    .config('spark.shuffle.consolidateFiles', 'true')
    .config('spark.executor.instances', '20')
    .config('spark.yarn.queue', 'root.digital.heavy')
    .getOrCreate())
sc = spark.sparkContext

In [7]:
# Identifiers of the runs being compared.
month_id = '240m'
prod_num = '1_324410_BJB'
test_num = '1_99992_BJB'

# Android: test output is read from the ri1 mount, production output from ia1.
android_test_path = "".join([
    "/mapr/ri1.comscore.com/data/panel/momx/", month_id,
    "/momx-fusion/22000/android/", test_num,
    "_android_web_duration_bounding/final_close/output/part-*",
])
android_prod_path = "".join([
    "/mapr/ia1.comscore.com/data/panel/momx/", month_id,
    "/momx-fusion/22000/android/", prod_num,
    "_android_web_duration_bounding/final_close/output/part-*",
])

# android_mercury_path = "/mapr/ia1.comscore.com/data/panel/momx/7274d/momx-fusion/22000/android/1_0001_BJB_android_web_duration_bounding/final_close/output/mercury/part-*"

# NOTE(review): the iOS and iOS-PB paths hard-code their run ids
# ('1_9999_BJB' / '1_324410_BJB') instead of using test_num / prod_num, and
# test_num above is '1_99992_BJB', not '1_9999_BJB' -- confirm which is meant.
ios_test_path = "".join([
    "/mapr/ri1.comscore.com/data/panel/momx/", month_id,
    "/momx-fusion/22000/ios/1_9999_BJB/final_close/*d/*.txt",
])
# NOTE(review): this path contains a doubled slash after the host name; it is
# kept exactly as it was.
ios_prod_path = "".join([
    "/mapr/ia1.comscore.com//data/panel/momx/", month_id,
    "/momx-fusion/22000/ios/1_324410_BJB/final_close/*d/*.txt",
])
# ios_mercury_path = "/mapr/ia1.comscore.com/data/panel/momx/7274d/momx-fusion/22000/ios/1_0001_BJB/final_close/mercury/*d/*.txt"
ios_pb_test_path = "".join([
    "/mapr/ri1.comscore.com/data/panel/momx/", month_id,
    "/momx-fusion/22000/ios_pb/1_9999_BJB/final_close/*d/*",
])
ios_pb_path = "".join([
    "/mapr/ia1.comscore.com/data/panel/momx/", month_id,
    "/momx-fusion/22000/ios_pb/1_324410_BJB/final_close/*d/*",
])

# NOTE(review): the Embee paths are fixed to month '239m' and do not follow
# month_id -- confirm this is intentional.
android_embee_test_path = "/mapr/ri1.comscore.com/data/panel/momx/239m/momx-fusion/22000/android_embee/1_9999_BJB/final_close/*d/*.txt"
android_embee_path = "/mapr/ia1.comscore.com/data/panel/momx/239m/momx-fusion/22000/android_embee/1_318980_BJB/final_close/*d/*.txt"
# browser_info_path = '/mapr/ia1.comscore.com/output/browser_info/weekly/1039w/browser_info_v2_1039w.txt'

In [3]:
# Column names for the session files, in file order. read_file applies them
# positionally through toDF(*schema), and compared_df joins on all of them.
schema = [
    "session_id",
    "house_id",
    "machine_id",
    "person_id",
    "time_id",
    "pattern_id",
    "pages",
    "dpages",
    "duration",
    "ssl_pages",
    "ssl_dpages",
    "ssl_duration",
    "num_redirects",
    "num_hits",
    "data_source",
    "bt1_id",
    "bt1_page_share",
    "bt1_dur_share",
    "bt2_id",
    "bt2_page_share",
    "bt2_dur_share",
    "ods_id",
    "first_ss2k",
    "start_ref_pattern_id",
]

# Column names, presumably for the weekly browser_info file named by the
# commented-out browser_info_path -- TODO confirm.
# No cell visible in this notebook reads this list. It does contain
# 'v_country', which agg_var lists but `schema` does not.
browser_info_schema = [
    "house_id", "c_panelist_id", "machine_id", "c_machine_id", "aol_computer",
    "primary_browser", "default_browser", "cproxy_build",
    "cproxy_build_source", "os_major_version", "os_minor_version",
    "os_domain_mode", "os_bits", "system_primary_language_code", "dateadded",
    "dateremoved", "lastspeedread", "active_in_last_365_days",
    "connection_speed", "numeric_time_zone", "time_zone_bias",
    "inconsistent_vcountry_timezone", "computer_location",
    "computer_location_source", "computer_type", "first_in_cms_sample",
    "first_in_mmx_sample", "zip", "zip_source", "zip4", "zip5", "dma",
    "dma_name", "census_region", "reportable_region_id", "region",
    "region_source", "county_fips_code", "v_country", "iso_country_code",
    "ip_country_code", "network_id", "campaign_id", "vfromwhere", "partner_id",
    "i_site_id", "i_lang", "age_head", "age_head_source", "age_primary_user",
    "age_primary_user_source", "hhsize", "hhsize_source", "hoh_marital_status",
    "hoh_marital_status_source", "child_present", "child_present_source",
    "child_13to17", "child_13to17_source", "gender_head", "gender_head_source",
    "gender_primary_user", "gender_primary_user_source", "hhincome",
    "hhincome_source", "hoh_most_education", "hoh_most_education_source",
    "hoh_employment", "hoh_employment_source", "employees", "employees_source",
    "ethnicity", "ethnicity_source", "race", "race_source",
    "business_from_home", "hoh_social_class", "hoh_social_class_source",
    "occupation", "occupation_source", "device_id", "carrier_id",
    "lang_preference", "lang_preference_source", "city_name",
    "city_name_source", "pb_collection", "date_secure_cert",
    "date_sec_cert_req", "v_device_status", "v_advertising_id"
]

In [4]:
class Meta:
    """Summary of one loaded data set.

    Attributes set by the constructor:
      data_source        -- DataFrame of the distinct `data_source` values
                            (left unevaluated; callers display it with show()).
      time_period_length -- number of distinct `time_id` values.
      machine_id_count   -- number of distinct `machine_id` values.

    The two counts are computed with count() inside the constructor, so
    building a Meta evaluates `data` twice right away.
    """

    def __init__(self, data):
        def distinct_values(column_name):
            # Single-column projection reduced to its distinct rows.
            return data.select(column_name).distinct()

        self.data_source = distinct_values('data_source')
        self.time_period_length = distinct_values('time_id').count()
        self.machine_id_count = distinct_values('machine_id').count()

In [5]:
# Grouping columns for the commented-out generate_rep helper below; no active
# code in the visible cells reads this list.
# NOTE(review): 'v_country' is not one of the columns in `schema`.
agg_var = ['v_country', 'pattern_id', 'data_source']


def read_file(path, schema):
    """Load tab-delimited text files into a DataFrame with named columns.

    path   -- location (glob patterns are used by the callers) handed to the
              CSV reader.
    schema -- list of column names, applied positionally with toDF.

    Column types are left to the reader's inferSchema option; only the names
    come from `schema`. Uses the notebook-level `spark` session.
    """
    reader = spark.read.format('csv')
    reader = reader.option("delimiter", '\t')
    reader = reader.option("inferSchema", "true")
    loaded = reader.load(path)
    return loaded.toDF(*schema)


# def generate_rep(data, name):
#     return data.groupBy(agg_var).agg(
#         F.sum("duration").alias(name + '_duration'),
#         F.sum("pages").alias(name + '_pages'),
#         F.sum('num_hits').alias(name + '_hits'))


def compared_df(test_path, prod_path, schema, null_safe=False):
    """Load a test and a production data set and line them up row by row.

    Both inputs are read with read_file(path, schema). Every column of each
    side is then prefixed ('prod_' / 'test_') and the two sides are combined
    with a full outer join that requires ALL columns to be equal, so each row
    of the result is either a matched pair or a row present on one side only.

    Parameters:
      test_path -- path/glob of the test output.
      prod_path -- path/glob of the production output.
      schema    -- list of column names shared by both inputs.
      null_safe -- how null values are compared (default False keeps the
                   original behaviour):
                   * False: columns are compared with `==`. In Spark SQL a
                     comparison involving null is not true, so a row that has
                     a null in any column never pairs with its twin and is
                     reported in BOTH prod_only and test_only.
                   * True: columns are compared with Column.eqNullSafe (needs
                     a PySpark version that provides it), so null matches
                     null. Caveat: a row whose columns are all null on both
                     sides then satisfies both the prod_only and test_only
                     filters.

    Returns a dict with:
      'compare_df' -- the full outer join (prod_* and test_* columns).
      'prod_df'    -- production data as loaded.
      'test_df'    -- test data as loaded.
      'prod_only'  -- rows of compare_df whose test_* columns are all null.
      'test_only'  -- rows of compare_df whose prod_* columns are all null.
      'matched'    -- rows present on both sides, with the original column
                      names.
      'meta'       -- {'prod': Meta, 'test': Meta}.

    Note: constructing the Meta objects calls count() and therefore evaluates
    both inputs before this function returns; all DataFrames in the result
    are otherwise lazy.
    """
    test = read_file(test_path, schema)
    prod = read_file(prod_path, schema)

    prod_meta = Meta(prod)
    test_meta = Meta(test)

    # Prefix every column so both sides can coexist in one joined frame.
    prod_tmp = prod.select(*(col(x).alias('prod_' + x) for x in prod.columns))
    test_tmp = test.select(*(col(x).alias('test_' + x) for x in test.columns))

    if null_safe:
        cond = [
            prod_tmp['prod_' + x].eqNullSafe(test_tmp['test_' + x])
            for x in prod.columns
        ]
    else:
        cond = [
            prod_tmp['prod_' + x] == test_tmp['test_' + x]
            for x in prod.columns
        ]

    compare = prod_tmp.join(test_tmp, cond, how='full')

    if null_safe:
        # Joining on column names is not null-safe, so build the matched set
        # from the same condition and restore the unprefixed column names.
        matched = prod_tmp.join(test_tmp, cond, how='inner').select(
            *(col('prod_' + x).alias(x) for x in prod.columns))
    else:
        matched = prod.join(test, schema, how='inner')

    # A full outer join fills the missing side with nulls, so "every column of
    # one side is null" identifies rows that exist only on the other side.
    test_only = compare.where(
        reduce(lambda x, y: x & y,
               (col(x).isNull() for x in prod_tmp.columns)))
    prod_only = compare.where(
        reduce(lambda x, y: x & y,
               (col(x).isNull() for x in test_tmp.columns)))
    return {
        "compare_df": compare,
        'prod_df': prod,
        'test_df': test,
        'prod_only': prod_only,
        'test_only': test_only,
        'matched': matched,
        'meta': {
            'prod': prod_meta,
            'test': test_meta
        }
    }

In [6]:
android_compare = compared_df(android_test_path, android_prod_path, schema)

# Row counts, in order: full outer join, exact matches, production, test.
for key in ('compare_df', 'matched', 'prod_df', 'test_df'):
    print(android_compare[key].count())

for side in ('prod', 'test'):
    side_meta = android_compare['meta'][side]
    print(side)
    # show() prints the table itself and returns None; wrapping it in print
    # only added a stray "None" line to the output.
    side_meta.data_source.show()
    print(side_meta.time_period_length)
    print(side_meta.machine_id_count)


148186961
40973964
78289523
110871402
prod
+-----------+
|data_source|
+-----------+
|         20|
|         21|
|         60|
+-----------+

None
31
118302
test
+-----------+
|data_source|
+-----------+
|         20|
|         21|
|         60|
+-----------+

None
31
120788


In [14]:
ios_compare = compared_df(ios_test_path, ios_prod_path, schema)

# Row counts, in order: full outer join, exact matches, production, test.
for key in ('compare_df', 'matched', 'prod_df', 'test_df'):
    print(ios_compare[key].count())

for side in ('prod', 'test'):
    side_meta = ios_compare['meta'][side]
    print(side)
    # show() prints the table itself and returns None; wrapping it in print
    # only added a stray "None" line to the output.
    side_meta.data_source.show()
    print(side_meta.time_period_length)
    print(side_meta.machine_id_count)

38485720
38485720
38485720
38485720
prod
+-----------+
|data_source|
+-----------+
|         40|
|         41|
+-----------+

None
31
57798
test
+-----------+
|data_source|
+-----------+
|         40|
|         41|
+-----------+

None
31
57798


In [15]:
ios_pb_compare = compared_df(ios_pb_test_path, ios_pb_path, schema)

# Row counts, in order: full outer join, exact matches, production, test.
for key in ('compare_df', 'matched', 'prod_df', 'test_df'):
    print(ios_pb_compare[key].count())

for side in ('prod', 'test'):
    side_meta = ios_pb_compare['meta'][side]
    print(side)
    # show() prints the table itself and returns None; wrapping it in print
    # only added a stray "None" line to the output.
    side_meta.data_source.show()
    print(side_meta.time_period_length)
    print(side_meta.machine_id_count)


2605054
2605054
2605054
2605054
prod
+-----------+
|data_source|
+-----------+
|         51|
|         50|
+-----------+

None
31
54608
test
+-----------+
|data_source|
+-----------+
|         51|
|         50|
+-----------+

None
31
54608


In [None]:
ndr_embee_compare = compared_df(android_embee_test_path, android_embee_path, schema)

# Row counts, in order: full outer join, exact matches, production, test.
for key in ('compare_df', 'matched', 'prod_df', 'test_df'):
    print(ndr_embee_compare[key].count())

for side in ('prod', 'test'):
    side_meta = ndr_embee_compare['meta'][side]
    print(side)
    # show() prints the table itself and returns None; wrapping it in print
    # only adds a stray "None" line to the output.
    side_meta.data_source.show()
    print(side_meta.time_period_length)
    print(side_meta.machine_id_count)

In [87]:
# Stop the SparkSession created in the first cell. DataFrames built earlier
# cannot be evaluated after this point without creating a new session.
spark.stop()