In [0]:

import unittest
import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import count
from pyspark.sql import Row
from aadatapipelinecore.core.fs.device import s3
from aadatapipelinecore.core.fs import Conf




# Numeric KPI id (the raw data's "kpi" value) -> name of the column that is
# checked for non-null values in the unified data.
kpi_mapping = {
    1: "est_average_active_users",
    2: "est_average_session_per_user",
    3: "est_average_session_duration",
    4: "est_install_penetration",
    5: "est_average_active_days",
    6: "est_percentage_active_days",
    7: "est_average_bytes_per_user",
    8: "est_average_time_per_user",
    9: "est_usage_penetration",
    10: "est_open_rate",
    11: "est_total_time",
    12: "est_share_of_category_time",
    14: "est_total_sessions",
    15: "est_share_of_category_session",
    17: "est_average_bytes_per_session",
    18: "est_share_of_category_bytes",
    20: "est_percent_of_wifi_total",
    21: "est_mb_per_second",
    22: "est_panel_size",
    23: "est_installs",
    24: "est_average_active_users_country_share",
    25: "est_installs_country_share",
    26: "est_audience_index",
    27: "est_audience_percentage",
}

# Android store id -> two-letter country code.
# NOTE(review): 'WW' (id 1000) presumably means worldwide - confirm.
ANDROID_COUNTRY_ID_CODES = {
    1: 'AU', 2: 'CA', 3: 'CN', 4: 'DE', 5: 'ES', 6: 'FR', 7: 'GB', 8: 'IT',
    9: 'JP', 10: 'US', 11: 'BE', 12: 'CH', 13: 'CL', 14: 'ZA', 15: 'VN', 16: 'HK',
    17: 'AR', 18: 'BR', 19: 'IN', 20: 'FI', 21: 'ID', 22: 'RU', 23: 'NL', 24: 'MY',
    25: 'TR', 26: 'MX', 27: 'KR', 28: 'PL', 29: 'TH', 30: 'TW', 31: 'PH', 32: 'SG',
    33: 'EG', 34: 'SE', 35: 'AT', 36: 'CZ', 37: 'HU', 38: 'DK', 39: 'IE', 40: 'IL',
    41: 'NZ', 42: 'NO', 43: 'PT', 44: 'RO', 45: 'SK', 46: 'GR', 47: 'BG', 48: 'UA',
    49: 'AE', 50: 'KW', 51: 'SA', 52: 'CO', 65: 'LB', 56: 'PE', 80: 'HR', 54: 'PK',
    62: 'EC', 73: 'QA', 102: 'MO', 103: 'LU', 53: 'KZ', 1000: 'WW',
}


# iOS store id -> two-letter country code.
# NOTE(review): 'WW' (id 0) presumably means worldwide - confirm.
IOS_COUNTRY_ID_CODES = {
    143460: 'AU', 143455: 'CA', 143465: 'CN', 143443: 'DE', 143454: 'ES', 143442: 'FR',
    143444: 'GB', 143450: 'IT', 143462: 'JP', 143441: 'US', 143446: 'BE', 143459: 'CH',
    143483: 'CL', 143472: 'ZA', 143471: 'VN', 143463: 'HK', 143505: 'AR', 143503: 'BR',
    143467: 'IN', 143447: 'FI', 143476: 'ID', 143469: 'RU', 143452: 'NL', 143473: 'MY',
    143480: 'TR', 143468: 'MX', 143466: 'KR', 143478: 'PL', 143475: 'TH', 143470: 'TW',
    143474: 'PH', 143464: 'SG', 143516: 'EG', 143456: 'SE', 143445: 'AT', 143489: 'CZ',
    143482: 'HU', 143458: 'DK', 143449: 'IE', 143491: 'IL', 143461: 'NZ', 143457: 'NO',
    143453: 'PT', 143487: 'RO', 143496: 'SK', 143448: 'GR', 143526: 'BG', 143492: 'UA',
    143481: 'AE', 143493: 'KW', 143479: 'SA', 143501: 'CO', 143451: 'LU', 143497: 'LB',
    143515: 'MO', 143507: 'PE', 143494: 'HR', 143477: 'PK', 143509: 'EC', 143498: 'QA',
    0: 'WW',
}

def get_device_list(device):
    """Return the device-id -> device-code mapping for one platform.

    Args:
        device: platform name. Only the exact string 'ios' selects the iOS
            mapping; every other value falls through to the Android mapping.

    Returns:
        A freshly built dict keyed by device id (as a string).
    """
    wants_ios = device == 'ios'
    return (
        {"2001": "ios-phone", "2002": "ios-tablet"}
        if wants_ios
        else {"1001": "android-phone", "1002": "android-tablet"}
    )


def last_day_of_month(check_month):
    """Return the last day of the month that ``check_month`` falls in.

    Args:
        check_month: a ``datetime.date`` or ``datetime.datetime``.

    Returns:
        An object of the same type moved to the final day of its month
        (any time-of-day component is left untouched).
    """
    # Day 28 exists in every month, and 28 + 4 days always crosses into the
    # following month.
    in_following_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    # Stepping back by the day-of-month lands on the last day of the original month.
    overshoot = datetime.timedelta(days=in_following_month.day)
    return in_following_month - overshoot


# def get_month_list():
#     result = []
#     today = datetime.date(2019, 10, 1) 
#     current = datetime.date(2013, 1, 1)    
#     while current <= today:
#         month_data_raw=datetime.datetime.strftime(current,'%Y-%m')
#         month_data_leg_db=datetime.datetime.strftime(current,'%Y%m')
#         result.append((month_data_raw, month_data_leg_db,current  ))
#         current += relativedelta(months=1)  
#     return result


def get_path_date_list(granularity):
    """Return the (month, date) pairs present in the raw legacy mu_app data.

    The driver loop at the bottom of this cell calls this function, so it has
    to be defined for the cell to run on a fresh kernel.

    Args:
        granularity: value of the ``granularity=`` partition to scan,
            e.g. "daily".

    Returns:
        A list of ``(month, date)`` tuples, e.g. ``('2019-12', '2019-12-31')``,
        one per distinct ``date`` value, in whatever order Spark returns them.
    """
    # NOTE(review): `spark` is not defined in this file; it is assumed to be
    # the session object injected by the notebook runtime.
    df_date = spark.read.parquet(
        "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.legacy-mu_app.v1/fact/granularity={}/".format(granularity)
    ).select('date').dropDuplicates()
    collect_date = df_date.collect()
    # Each collected Row carries a single date string such as u'2019-12-31';
    # its first seven characters are the month ('2019-12').
    date_list = [(x[0][:7], x[0]) for x in collect_date]
    print(date_list)
    return date_list

    
import traceback
def check_mu_app_transform_count(store_id_list, device_id_list, _granularity, date_list):
    """Assert that per-KPI counts agree between the raw and the unified data.

    For every store / device / date combination the raw partition is read and
    the number of distinct ``(kpi, app_id)`` pairs is counted per KPI. Each of
    those counts must equal the number of unified rows for the same device
    code and country code whose mapped KPI column is not null.

    Args:
        store_id_list: dict of raw store id -> country code used in the
            unified data.
        device_id_list: dict of raw device id -> device code used in the
            unified data.
        _granularity: value of the ``granularity=`` partition, e.g. "daily".
        date_list: iterable of ``(month, date)`` string pairs formatted as
            ``'YYYY-MM'`` and ``'YYYY-MM-DD'``.

    Raises:
        AssertionError: on the first KPI whose raw and unified counts differ.
        KeyError: if the raw data contains a KPI id missing from ``kpi_mapping``.
    """
    raw_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.legacy-mu_app.v1/fact/granularity={raw_granularity}/month={raw_month}/device_id={raw_device_id}/store_id={raw_store_id}/"
    unified_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/granularity={unified_granularity}/date={unified_date}/"
    # Dates after this day are skipped.
    last_checked_day = datetime.date(2019, 10, 31)

    t = unittest.TestCase('run')
    for store_id, country_code in store_id_list.items():
        for device, device_code in device_id_list.items():
            for m in date_list:
                if datetime.datetime.strptime(m[1], "%Y-%m-%d").date() > last_checked_day:
                    continue

                raw_path_parse = raw_path.format(
                    raw_device_id=device,
                    raw_store_id=store_id,
                    raw_month=m[0],
                    raw_granularity=_granularity,
                )
                try:
                    raw_count_with_KPI = (
                        spark.read.parquet(raw_path_parse)
                        .filter("date='{}'".format(m[1]))
                        .select("kpi", "app_id")
                        .distinct()
                        .groupBy("kpi")
                        .agg(count("kpi"))
                        .collect()
                    )
                except AnalysisException:
                    # This month/device/store partition cannot be read. Only
                    # this date is skipped: a `break` here would silently drop
                    # every remaining date for the store/device as well.
                    continue

                if not raw_count_with_KPI:
                    # Nothing to compare, so the unified data is not read at all.
                    continue

                # The unified frame depends only on the date, device and
                # country, so it is built once and reused for every KPI row.
                unified_path_parse = unified_path.format(unified_date=m[1], unified_granularity=_granularity)
                unified_rows = spark.read.parquet(unified_path_parse).filter(
                    "device_code='{}' and country_code='{}'".format(device_code, country_code)
                )
                for row in raw_count_with_KPI:
                    kpi_column = kpi_mapping[row["kpi"]]
                    unified_count = (
                        unified_rows
                        .filter("{} is not null".format(kpi_column))
                        .select(kpi_column)
                        .count()
                    )
                    t.assertEqual(row["count(kpi)"], unified_count, " raw: {} ~ unified data: {},device:{},  store_id:{} , month: {}, KPI {}".format(row["count(kpi)"], unified_count, device, store_id, m, row["kpi"]))

# Run the raw-vs-unified count check once per granularity. A failed assertion
# raises, so 'pass' is only printed when every comparison succeeded.
graularity_list = ["daily"]
for graularity in graularity_list:
    print(graularity)
    granularity_date_list = get_path_date_list(graularity)
    # check_mu_app_transform_count(IOS_COUNTRY_ID_CODES, get_device_list('ios'),graularity, granularity_date_list)
    check_mu_app_transform_count(
        ANDROID_COUNTRY_ID_CODES,
        get_device_list('android'),
        graularity,
        granularity_date_list,
    )
print('pass')



In [0]:

import unittest
import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import count
from pyspark.sql import Row
from aadatapipelinecore.core.fs.device import s3
from aadatapipelinecore.core.fs import Conf
from aadatapipelinecore.core.log import logger


# Numeric KPI id (the raw data's "kpi" value) -> name of the column that is
# checked for non-null values in the unified data.
kpi_mapping = {
    1: "est_average_active_users",
    2: "est_average_session_per_user",
    3: "est_average_session_duration",
    4: "est_install_penetration",
    5: "est_average_active_days",
    6: "est_percentage_active_days",
    7: "est_average_bytes_per_user",
    8: "est_average_time_per_user",
    9: "est_usage_penetration",
    10: "est_open_rate",
    11: "est_total_time",
    12: "est_share_of_category_time",
    14: "est_total_sessions",
    15: "est_share_of_category_session",
    17: "est_average_bytes_per_session",
    18: "est_share_of_category_bytes",
    20: "est_percent_of_wifi_total",
    21: "est_mb_per_second",
    22: "est_panel_size",
    23: "est_installs",
    24: "est_average_active_users_country_share",
    25: "est_installs_country_share",
    26: "est_audience_index",
    27: "est_audience_percentage",
}

# Store id -> two-letter country code, with the small Android ids and the
# six-digit iOS ids (plus 0) combined in one lookup table.
country_code_list = {
    # Android store ids
    1: 'AU', 2: 'CA', 3: 'CN', 4: 'DE', 5: 'ES', 6: 'FR', 7: 'GB', 8: 'IT',
    9: 'JP', 10: 'US', 11: 'BE', 12: 'CH', 13: 'CL', 14: 'ZA', 15: 'VN', 16: 'HK',
    17: 'AR', 18: 'BR', 19: 'IN', 20: 'FI', 21: 'ID', 22: 'RU', 23: 'NL', 24: 'MY',
    25: 'TR', 26: 'MX', 27: 'KR', 28: 'PL', 29: 'TH', 30: 'TW', 31: 'PH', 32: 'SG',
    33: 'EG', 34: 'SE', 35: 'AT', 36: 'CZ', 37: 'HU', 38: 'DK', 39: 'IE', 40: 'IL',
    41: 'NZ', 42: 'NO', 43: 'PT', 44: 'RO', 45: 'SK', 46: 'GR', 47: 'BG', 48: 'UA',
    49: 'AE', 50: 'KW', 51: 'SA', 52: 'CO', 65: 'LB', 56: 'PE', 80: 'HR', 54: 'PK',
    62: 'EC', 73: 'QA', 102: 'MO', 103: 'LU', 53: 'KZ', 1000: 'WW',
    # iOS store ids
    143460: 'AU', 143455: 'CA', 143465: 'CN', 143443: 'DE', 143454: 'ES', 143442: 'FR',
    143444: 'GB', 143450: 'IT', 143462: 'JP', 143441: 'US', 143446: 'BE', 143459: 'CH',
    143483: 'CL', 143472: 'ZA', 143471: 'VN', 143463: 'HK', 143505: 'AR', 143503: 'BR',
    143467: 'IN', 143447: 'FI', 143476: 'ID', 143469: 'RU', 143452: 'NL', 143473: 'MY',
    143480: 'TR', 143468: 'MX', 143466: 'KR', 143478: 'PL', 143475: 'TH', 143470: 'TW',
    143474: 'PH', 143464: 'SG', 143516: 'EG', 143456: 'SE', 143445: 'AT', 143489: 'CZ',
    143482: 'HU', 143458: 'DK', 143449: 'IE', 143491: 'IL', 143461: 'NZ', 143457: 'NO',
    143453: 'PT', 143487: 'RO', 143496: 'SK', 143448: 'GR', 143526: 'BG', 143492: 'UA',
    143481: 'AE', 143493: 'KW', 143479: 'SA', 143501: 'CO', 143451: 'LU', 143497: 'LB',
    143515: 'MO', 143507: 'PE', 143494: 'HR', 143477: 'PK', 143509: 'EC', 143498: 'QA',
    0: 'WW',
}


# IOS_COUNTRY_ID_CODES={}

# Raw ``device_id`` partition value -> ``device_code`` used in the unified data.
device_mapping = {
    "2001": "ios-phone",
    "2002": "ios-tablet",
    "1001": "android-phone",
    "1002": "android-tablet",
}

# Bucket URI that gets prepended to the keys returned by the S3 listing.
raw_path_granularity = "s3://b2c-prod-data-pipeline-unified-usage/"

# Handle on the same bucket, used to list partition prefixes
# (presumably one level deep per call, given depth_is_1=True).
s3_bucket_list = s3.S3Bucket(Conf(bucket_name='b2c-prod-data-pipeline-unified-usage'))
usage_path_month_list = s3_bucket_list.all(
    prefix="unified/usage.legacy-mu_app.v1/fact/granularity={raw_granularity}/".format(raw_granularity="daily"),
    depth_is_1=True,
)

# Template for the unified fact path of one granularity/date partition.
unified_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/granularity={unified_granularity}/date={unified_date}/"


def check_mu_app_transform_count_new(max_devices=2, max_stores=2):
    """Compare per-KPI, per-date counts between the raw and the unified data.

    Walks the listed month prefixes, then the device prefixes below each
    month, then the prefixes below each device. For every raw path visited,
    the distinct ``(date, kpi, app_id)`` rows are counted per ``(kpi, date)``
    and compared with the number of unified rows for the same device code and
    country code whose mapped KPI column is not null. Every comparison is
    logged through ``logger.info``.

    Args:
        max_devices: how many device prefixes to check per month. The default
            of 2 keeps the original sampling; pass ``None`` to check all.
        max_stores: how many prefixes to check per device. The default of 2
            keeps the original sampling; pass ``None`` to check all.

    Returns:
        A list of ``(country_code, kpi, date, raw_count, unified_count)``
        tuples, one per comparison whose counts differ (empty when all agree).

    Raises:
        KeyError: if a device id, store id or KPI id is missing from
            ``device_mapping``, ``country_code_list`` or ``kpi_mapping``.
    """
    mismatches = []
    for month in usage_path_month_list:
        device_list = s3_bucket_list.all(prefix=month, depth_is_1=True)
        for device in device_list[:max_devices]:
            raw_parse_path_list = s3_bucket_list.all(prefix=device, depth_is_1=True)
            for raw_path in raw_parse_path_list[:max_stores]:
                print(raw_path)
                # Segments 5 and 6 of the key are read as "<name>=<value>"
                # pairs holding the device id and the store id.
                path_parts = raw_path.split("/")
                device_code = device_mapping[path_parts[5].split("=")[1]]
                country_code = country_code_list[int(path_parts[6].split("=")[1])]
                raw_count_with_KPI = (
                    spark.read.parquet(raw_path_granularity + raw_path)
                    .select("date", "kpi", "app_id")
                    .distinct()
                    .groupBy("kpi", "date")
                    .agg(count("kpi"))
                    .collect()
                )
                # The unified frame depends only on the date (device and
                # country are fixed for this raw path), so it is built once
                # per date instead of once per KPI row.
                unified_by_date = {}
                for singe_record in raw_count_with_KPI:
                    record_date = singe_record["date"]
                    if record_date not in unified_by_date:
                        unified_path_parse = unified_path.format(unified_date=record_date, unified_granularity="daily")
                        unified_by_date[record_date] = spark.read.parquet(unified_path_parse).filter(
                            "device_code='{}' and country_code='{}'".format(device_code, country_code)
                        )
                    kpi_column = kpi_mapping[singe_record["kpi"]]
                    unified_count = (
                        unified_by_date[record_date]
                        .filter("{} is not null".format(kpi_column))
                        .select(kpi_column)
                        .count()
                    )
                    raw_count = singe_record["count(kpi)"]
                    if raw_count == unified_count:
                        logger.info("the count is equal , country is: {} , kpi is: {}, date is : {}, raw_count is {}, unified_count is {}".format(country_code, singe_record["kpi"], record_date, raw_count, unified_count))
                    else:
                        logger.info("the count is not equal!!! country is: {} , kpi is: {}, date is : {}, raw_count is {}, unified_count is {}".format(country_code, singe_record["kpi"], record_date, raw_count, unified_count))
                        mismatches.append((country_code, singe_record["kpi"], record_date, raw_count, unified_count))
    return mismatches

# Run the comparison over the first two devices per month and the first two
# paths per device; each comparison is reported through logger.info.
check_mu_app_transform_count_new()


In [0]:
%%sh

# Show the first three entries of one raw partition. The pipe has to sit on
# the same line as the command: a line that starts with `|` is a shell syntax
# error, so `head` was never applied.
aws s3 ls s3://b2c-prod-data-pipeline-unified-usage/unified/usage.legacy-mu_app.v1/fact/granularity=daily/month=2015-12/device_id=1001/store_id=1/ | head -3
 
                           

In [0]:

# DataFrame.show() prints the rows itself and returns None, so it is called
# directly; wrapping it in `print` only added a stray "None" line.

# Weekly raw rows for KPI 12 on 2014-02-22 (device 1001, store 1).
spark.read.parquet("s3://b2c-prod-data-pipeline-unified-usage/unified/usage.legacy-mu_app.v1/fact/granularity=weekly/month=2014-02/device_id=1001/store_id=1/").filter("kpi=12 and date='2014-02-22'").show()


# Daily raw rows for December 2015 (device 1001, store 1).
spark.read.parquet("s3://b2c-prod-data-pipeline-unified-usage/unified/usage.legacy-mu_app.v1/fact/granularity=daily/month=2015-12/device_id=1001/store_id=1/").show()


In [0]:

import unittest
import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import count
from pyspark.sql import Row
from aadatapipelinecore.core.fs.device import s3
from aadatapipelinecore.core.fs import Conf

# Walk the daily legacy mu_app prefixes (month, then device) and print what
# is listed below each device prefix.
s3_bucket_list = s3.S3Bucket(Conf(bucket_name='b2c-prod-data-pipeline-unified-usage'))
usage_path_month_list = s3_bucket_list.all(
    prefix="unified/usage.legacy-mu_app.v1/fact/granularity={raw_granularity}/".format(raw_granularity="daily"),
    depth_is_1=True,
)

for month in usage_path_month_list:
    device_list = s3_bucket_list.all(prefix=month, depth_is_1=True)
    for device in device_list:
        print(s3_bucket_list.all(prefix=device, depth_is_1=True))