In [0]:

from pyspark.sql import types as T
from pyspark.sql import functions as F
from aadatapipelinecore.core.utils.spark import eject_all_caches
from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
import datetime
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy
class DryRunSqlExecutor(SqlExecutor):
    """SqlExecutor variant whose ``_verify_tasks`` step does nothing."""

    def _verify_tasks(self):
        """No-op; presumably replaces the parent's task verification (confirm in SqlExecutor)."""
        return None
def run(spark, raw_data, sql_text, dry_run=True):
    """Register the source views, parse ``sql_text`` into tasks and execute them.

    Args:
        spark: Spark session passed through to ``_view``, ``SqlParser`` and the executor.
        raw_data: Message dict with the keys ``"namespace"``, ``"source"`` and
            ``"options"``. The dict itself is left untouched; a shallow copy is
            used to build the execution context.
        sql_text: SQL script handed to ``_view`` and ``SqlParser``.
        dry_run: When true (default) the tasks run through ``DryRunSqlExecutor``,
            otherwise through ``SqlExecutor``.

    Raises:
        KeyError: if ``raw_data`` lacks ``"namespace"``, ``"source"`` or ``"options"``.
    """
    # Work on a copy: popping "source"/"options" from the caller's dict made a
    # second run() with the same message fail with KeyError (e.g. on cell re-run).
    context = dict(raw_data)
    urn = Urn(namespace=context["namespace"])
    source_data_list = context.pop("source")
    # The entries of "options" are flattened into the top level of the context.
    context.update(context.pop("options"))
    _view(spark, sql_text, None, source_data_list)
    tasks = SqlParser(spark, sql_text, context).parse()
    sql_executor = DryRunSqlExecutor if dry_run else SqlExecutor
    sql_executor(urn, spark, tasks, type_.EventType.TRANSFORM, context).run()
# SQL script passed to run(): a sequence of named `WITH <name> AS (...)` statements.
# It maps rank_raw feeds to (metric, device_code), counts raw rows per store/category,
# joins the category and country mapping views, and compares those counts with counts
# taken from rank_unified / store_unified. Mismatching rows end up in the
# miss_data_rank and miss_data_store views.
# NOTE(review): the two miss_data_* statements end with ")" while every other
# statement ends with ");" -- confirm SqlParser accepts that.
sql_text = """
-- mapping feed as metrc in raw
WITH feed_metric AS (
select *, 'free_app_download' as metric, "ios-phone" as device_code from rank_raw where  feed='0' and platform='ios'
UNION ALL
select *, 'paid_app_download' as metric, "ios-phone" as device_code from rank_raw where  feed='1' and platform='ios'
UNION ALL
select *, 'revenue' as metric , "ios-phone" as device_code from rank_raw where  feed='2' and platform='ios'
UNION ALL
select *, 'free_app_download' as metric, "ios-tablet" as device_code from rank_raw where  feed='101' and platform='ios'
UNION ALL
select *, 'paid_app_download' as metric, "ios-tablet" as device_code from rank_raw  where  feed='100' and platform='ios'
UNION ALL
select *, 'revenue' as metric, "ios-tablet" as device_code from rank_raw  where  feed='102' and platform='ios'
UNION ALL
select *, 'free_app_download' as metric, "ios-all" as device_code from rank_raw  where  feed='1000' and platform='ios'
UNION ALL
select *, 'paid_app_download' as metric, "ios-all" as device_code from rank_raw  where  feed='1001' and platform='ios'
UNION ALL
select *, 'revenue' as metric, "ios-all" as device_code from rank_raw  where  feed='1002' and platform='ios'
UNION ALL
select *, 'free_app_download' as metric , "android-all" as device_code from rank_raw   where  feed='0' and platform='android'
UNION ALL
select *, 'paid_app_download' as metric, "android-all" as device_code from rank_raw  where  feed='1' and platform='android'
UNION ALL
select *, 'revenue' as metric,  "android-all" as device_code from rank_raw  where  feed='2' and platform='android'
);
-- select tested column from raw data
WITH metric_raw_data AS (
SELECT id, category_id as raw_category_id,rank,store_id as raw_store_id , metric,device_code,date , platform from feed_metric where store_id not in (3,4,5,6, 1002,1003, 1005,1004, 1006,1007)
);
-- group by and count data in raw data
WITH group_by_raw AS (
SELECT count(id) AS total_count , raw_category_id, raw_store_id, metric,device_code,date,platform from metric_raw_data where raw_store_id not in (3,4,5,6, 1002,1003, 1004, 1005, 1006,1007) group by raw_category_id, raw_store_id, metric,device_code,date, platform
);
-- pivot metric column
WITH pivot_metric_rank_raw AS (
SELECT 
free_app_download,revenue, paid_app_download, raw_category_id,raw_store_id,device_code, platform,date
FROM
      group_by_raw
 PIVOT (
    max(total_count) 
	FOR metric IN ('free_app_download','revenue', 'paid_app_download')
  )
);
-- select tested column from raw data
WITH metric_raw_store_data AS (
SELECT distinct id, est, store_id as raw_store_id , metric,device_code, date , platform from feed_metric where store_id not in (3,4,5,6, 1002,1003, 1005,1004, 1006,1007)
);
-- group by and count data in raw data
WITH group_by_store_raw AS (
SELECT count(est) AS total_count ,raw_store_id, metric,device_code,date,platform from metric_raw_store_data where raw_store_id not in (3,4,5,6,1002,1003, 1004, 1005, 1006,1007) group by raw_store_id, metric,device_code,date, platform
);
-- pivot metric column
WITH pivot_metric_store_raw AS (
SELECT 
free_app_download,revenue, paid_app_download,raw_store_id,device_code, platform, date
FROM
      group_by_store_raw
 PIVOT (
    max(total_count) 
	FOR metric IN ('free_app_download','revenue', 'paid_app_download')
  )
);
-- map raw with category
WITH category_mapping_raw AS (
SELECT * from ( select *, 'ios' as mapping_platform from category_mapping_deminsion_service where market_code='apple-store' 
UNION ALL select *, 'android' as mapping_platform from category_mapping_deminsion_service where market_code='google-play'
 ) as mapping right join pivot_metric_rank_raw on mapping.legacy_category_id=pivot_metric_rank_raw.raw_category_id and 
mapping.mapping_platform=pivot_metric_rank_raw.platform
);
-- map raw with rank country_code
WITH country_category_mapping_rank_raw AS (
select date, raw_store_id, country_code,device_code,category_id,free_app_download,paid_app_download,revenue from country_code_mapping right join category_mapping_raw on country_code_mapping.country_code_store_id=category_mapping_raw.raw_store_id and country_code_mapping.market_code=category_mapping_raw.platform
);
-- map raw with store country_code
WITH country_mapping_store_raw AS (
select date, raw_store_id, country_code,device_code,free_app_download,paid_app_download,revenue from country_code_mapping right join pivot_metric_store_raw on country_code_mapping.country_code_store_id=pivot_metric_store_raw.raw_store_id and country_code_mapping.market_code=pivot_metric_store_raw.platform
);
-- group by unified data
WITH unified_group_data AS (
select count(app_id) as unified_count_app_id, count(free_app_download) as unified_count_free_app_download, count(paid_app_download) as unified_count_paid_app_download, count(revenue) as unified_count_revenue,
  country_code as unified_country_code, device_code as unified_device_code, category_id as unified_category_id from ( select distinct  app_id, free_app_download, paid_app_download, revenue, country_code, device_code, category_id from  rank_unified ) as unified
group by  category_id,  country_code,  device_code );
-- group by unified data
WITH unified_group_data_store AS (
select count(app_id) as unified_count_app_id, count(free_app_download) as unified_count_free_app_download, count(paid_app_download) as unified_count_paid_app_download, count(revenue) as unified_count_revenue,
  country_code as unified_country_code, device_code as unified_device_code from ( select distinct  app_id, free_app_download, paid_app_download, revenue, country_code, device_code from  store_unified ) as unified
group by   country_code,  device_code );
-- compare raw vs unified data
WITH compared_data_rank AS (
    SELECT * from country_category_mapping_rank_raw left join unified_group_data on unified_group_data.unified_country_code==country_category_mapping_rank_raw.country_code and unified_group_data.unified_category_id==country_category_mapping_rank_raw.category_id and unified_group_data.unified_device_code==country_category_mapping_rank_raw.device_code
);
WITH miss_data_rank AS (
select * from compared_data_rank where unified_count_paid_app_download!=paid_app_download or unified_count_free_app_download != free_app_download  or unified_count_revenue != revenue or unified_count_app_id is null
)
-- compare raw vs unified data store
WITH compared_store_data AS (
    SELECT * from country_mapping_store_raw left join unified_group_data_store on unified_group_data_store.unified_country_code==country_mapping_store_raw.country_code and unified_group_data_store.unified_device_code==country_mapping_store_raw.device_code
);
WITH miss_data_store AS (
select * from compared_store_data where free_app_download!=unified_count_free_app_download or paid_app_download!=unified_count_paid_app_download or revenue!=unified_count_revenue or unified_count_app_id is null
)
"""
# Dates to validate: every day in [start, end); the `end` date itself is excluded.
start = '2020-03-03'
end = '2020-03-04'
real_date1 = datetime.date(*[int(x) for x in start.split('-')])
real_date2 = datetime.date(*[int(x) for x in end.split('-')])
date_range = real_date2 - real_date1
dates = [real_date1 + datetime.timedelta(days) for days in range(date_range.days)]
# dates=['2020-03-01','2020-01-24',"2019-12-06","2020-01-02","2020-01-05","2020-01-10","2020-01-11","2020-01-13","2020-01-20"]

# iOS store_id -> country_code mapping. The file is read without a header, so the
# columns arrive as _c0, _c1, ... and the first two are renamed.
d1 = (
    spark.read.csv("s3://b2c-prod-dca-store-estimates/store_back/dimension/IOS_COUNTRY_MAPPING", sep="\t")
    .withColumnRenamed("_c0", "store_id")
    .withColumnRenamed("_c1", "country_code")
    .withColumn("market_code", F.lit("ios"))
)
# Prepend a synthetic worldwide row (store_id 0 -> 'WW') for iOS.
d1 = spark.createDataFrame([(0, 'WW', 'Worldwide', 'ios')],
                           schema=["store_id", "country_code", "_c2", "market_code"]).union(d1)
# Android store_id -> country_code mapping, same layout.
d2 = (
    spark.read.csv("s3://b2c-prod-dca-store-estimates/store_back/dimension/ANDROID_COUNTRY_MAPPING", sep="\t")
    .withColumnRenamed("_c0", "store_id")
    .withColumnRenamed("_c1", "country_code")
    .withColumn("market_code", F.lit("android"))
)
country_code_df = d1.union(d2).where("country_code is not null").cache()
country_code_df = country_code_df.withColumnRenamed("store_id", "country_code_store_id")
print('country mapping table')
country_code_df.show(2)
# Views referenced by name inside sql_text.
country_code_df.createOrReplaceTempView("country_code_mapping")
category_mapping_table = spark.read.parquet(
    "s3://b2c-prod-data-pipeline-qa/aa.store/store_cateogry_mapping")
category_mapping_table.createOrReplaceTempView("category_mapping_deminsion_service")
namespace = "aa.store.market-size.v1"
for test_date in dates:
    # One message per date: the raw estimates plus the two unified tables to compare against.
    ingest_msg = {
        "namespace": namespace,
        "job_type": "routine",
        "options": {},
        "source": [
            {
                "name": "rank_raw",
                "data_encoding": "parquet",
                "compression": "gzip",
                "path": [
                    "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date={}/".format(
                        test_date)],
            }, {
                "data_encoding": "parquet",
                "compression": "gzip",
                "name": "rank_unified",
                "path": [
                    "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=daily/date={}".format(
                        test_date)],
            }, {
                "data_encoding": "parquet",
                "compression": "gzip",
                "name": "store_unified",
                "path": [
                    "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}".format(
                        test_date)],
            }
        ]
    }
    run(spark, ingest_msg, sql_text)
    # Keep the DataFrame and show it separately: .show() returns None, so chaining
    # it into the assignment left `result` unusable for the write below.
    result = spark.sql("select * from miss_data_rank")
    result.show()
    # result.write.format("delta").save("s3://b2c-prod-data-pipeline-qa/aa.store/result_rank_data_v2_{}/".format(start),
    #                                   mode="append",
    #                                   partitionBy=["date"])
    result_store = spark.sql("select * from miss_data_store")
    result_store.show()
    # result_store.write.format("delta").save("s3://b2c-prod-data-pipeline-qa/aa.store/result_store_data_v2_{}/".format(start),
    #                                   mode="append",
    #                                   partitionBy=["date"])
    eject_all_caches(spark)

In [0]:
%%sh
# List what is stored under the raw daily APP_ESTIMATES_FINAL prefix.
aws s3 ls s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/

In [0]:

# Peek at the first 3 rows of the raw daily estimates.
(
    spark.read
    .parquet("s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/")
    .show(3)
)

In [0]:
%%sh
# Show the last 4 entries under the unified category-rank daily prefix.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=daily/ | tail -n 4

In [0]:

# Peek at the first 3 rows of the unified category-rank daily data.
(
    spark.read
    .parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=daily/")
    .show(3)
)

In [0]:
%%sh
# Show the first 60 entries under the unified app-est daily prefix.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/ | head -n 60

In [0]:
%%sh
# Recursively list the 2010-07-04 daily partition and keep the last 4 output lines,
# which include the totals printed by --summarize.
# aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/ | head -n 5
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date=2010-07-04/ --summarize --human --recursive | tail -4 
# aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-cum.v1/fact/date=2010-07-31/

In [0]:
%md
## Cumulative Test

In [0]:

# Daily unified estimates for one date, largest free_app_download first.
date = '2010-07-04'
df1 = spark.read.parquet(
    "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}/".format(date)
)
df1 = df1.orderBy('free_app_download', ascending=0)
df1.show(10)

In [0]:

# Daily unified rows of app 377194688 in the US for one date.
date = '2010-07-04'
df1 = spark.read.parquet(
    "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}/".format(date)
)
df1 = df1.where("app_id='377194688' and country_code='US'")
df1.show(10)

In [0]:
%md
2010-07-04: pick 5 iOS apps (phone, tablet):
free/paid download and revenue top 3, plus Facebook,
plus one with download value = 2.
2012-01-01: pick 5 Android apps:
free download top 1,
plus one with download value = 2.
Country codes: WW, US.






Facebook app ids (from Fiona Zhang, 10:32 AM):
- Android: 20600000009072
- iOS: 284882215

In [0]:

date = '2012-07-04'
# Read the daily partition once; every query below derives from the same DataFrame
# instead of re-reading the same path four times.
daily_df = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}/".format(date))
android_df = daily_df.where("device_code='android-all'")
# Top 5 android-all rows for each metric, largest value first.
for metric in ('free_app_download', 'paid_app_download', 'revenue'):
    df2 = android_df.orderBy(metric, ascending=0)
    df2.show(5)
# Rows whose free_app_download is exactly 2.
df2 = daily_df.where("device_code='android-all' and free_app_download=2").orderBy('free_app_download', ascending=1)
df2.show(5)
# (app_id, device_code, country_code) samples used by later cells.
app = [
    ('20600000025034', 'android-all', 'WW'),
    ('20600000357382', 'android-all', 'WW'),
    ('20600000246936', 'android-all', 'WW'),
    ('20600000348221', 'android-all', 'TH')
    ]

In [0]:

# Rows of app 339739007 on 2010-07-04 (largest free_app_download first), then on 2020-01-01.
date = '2010-07-04'
df2 = spark.read.parquet(
    "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}/".format(date)
)
df2 = df2.where("app_id='339739007'").orderBy('free_app_download', ascending=0)
df2.show(10)
(
    spark.read
    .parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}/".format('2020-01-01'))
    .where("app_id='339739007'")
    .show()
)

In [0]:

# Rows of app 284882215 on 2010-07-04, largest free_app_download first, plus the schema.
date = '2010-07-04'
df2 = spark.read.parquet(
    "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}/".format(date)
)
df2 = df2.where("app_id='284882215'").orderBy('free_app_download', ascending=0)
df2.show(10)
df2.printSchema()

In [0]:

# Sample app ids; the trailing comment on each entry says why it was picked.
app_list = [
    '377194688',  # ios-phone
    '364709193',  # ios-all
    '379174209',  # ios-tablet
    '339739007',  # free_app_download=2
    '284882215',  # facebook
]
app_list

# app_dict = {
#     'ios-phone': '377194688',
#     'ios-all': '364709193',
#     'ios-tablet': '379174209',
    
# }

In [0]:

def get_date_list(start_date, end_date, freq="D"):
    """Return the dates between ``start_date`` and ``end_date`` as 'YYYY-MM-DD' strings.

    Args:
        start_date: Start of the range, in any form ``pandas.date_range`` accepts.
        end_date: End of the range, in any form ``pandas.date_range`` accepts.
        freq: pandas frequency alias:
            D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency

    Returns:
        list of str, one entry per timestamp generated by ``pandas.date_range``
        (empty when ``end_date`` is before ``start_date``).
    """
    # Imported locally so the function also works when no earlier cell has
    # imported pandas as `pd`.
    import pandas as pd

    return [x.strftime('%Y-%m-%d') for x in pd.date_range(start=start_date, end=end_date, freq=freq)]

In [0]:
%%sh
# List what is stored under the cumulative app-est fact prefix.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-cum.v1/fact/

In [0]:

# Cumulative rows of app 284882215 for one date, largest free_app_download first.
date = "2010-07-04"
df = spark.read.parquet(
    "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-cum.v1/fact/date={}/".format(date)
)
df = df.where("app_id='284882215'").orderBy("free_app_download", ascending=0)
df.show(10)

In [0]:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Three nullable string key columns followed by three nullable integer metric columns.
schema = StructType(
    [StructField(name, StringType(), True)
     for name in ("app_id", "device_code", "country_code")]
    + [StructField(name, IntegerType(), True)
       for name in ("free_app_download", "paid_app_download", "revenue")]
)
# Register an empty DataFrame with that schema as the temp view `table` and query it.
df = spark.createDataFrame([], schema=schema)
df.createOrReplaceTempView('table')
spark.sql("SELECT * FROM table")


In [0]:
%%sh
# List the QA scratch prefix used by the upload/save steps in this notebook.
aws s3 ls s3://b2c-prod-data-pipeline-qa/zidong/

In [0]:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import pandas as pd

def get_date_list(start_date, end_date, freq="D"):
    """Return the dates between ``start_date`` and ``end_date`` as 'YYYY-MM-DD' strings.

    Args:
        start_date: Start of the range, in any form ``pandas.date_range`` accepts.
        end_date: End of the range, in any form ``pandas.date_range`` accepts.
        freq: pandas frequency alias:
            D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency

    Returns:
        list of str, one entry per timestamp generated by ``pandas.date_range``
        (empty when ``end_date`` is before ``start_date``).
    """
    # The import sits after the docstring: placed before it, the string above is
    # just an expression statement and the function has no __doc__.
    import pandas as pd

    return [x.strftime('%Y-%m-%d') for x in pd.date_range(start=start_date, end=end_date, freq=freq)]
    
# Every calendar day from 2010-07-04 through 2020-04-18, built in two pieces.
dates = get_date_list('2010-07-04', '2010-07-31', freq='D')    # e.g. starts with ['2010-07-04', '2010-07-05', '2010-07-06', ...]
dates = dates + get_date_list('2010-08-01', '2020-04-18', freq='D')
# dates = dates + get_date_list('2010-08-01', '2010-09-30', freq='D')
print dates
# First five ids map to: ios-phone, ios-all, ios-tablet, free_app_download=2, facebook;
# the remaining ids are the android samples.
app_list = ['377194688', '364709193', '379174209',  '339739007',         '284882215', '20600000025034', '20600000357382', '20600000246936', '20600000009072']
device_code = ['ios-phone','ios-tablet','ios-all', 'android-all']    # 'android-all' is included here
countries = ['WW', 'US']
# app = [
#     ('377194688', 'ios-phone', 'WW'),
#     ('364709193', 'ios-all', 'US'),
#     ('379174209', 'ios-tablet', 'US'),
#     ('339739007', 'ios-phone', 'WW'),
#     ('284882215', 'ios-all', 'WW'),    # facebook
#     ('20600000025034', 'android-all', 'WW'),
#     ('20600000357382', 'android-all', 'WW'),
#     ('20600000246936', 'android-all', 'WW'),
#     ('20600000348221', 'android-all', 'TH'),
#     ('20600000009072', 'android-all', 'WW')
#     ]

def main_test(dates, app, device_code, countries, app_ids=None):
    """Sum daily ``free_app_download`` per key and load the cumulative table for comparison.

    Args:
        dates: Non-empty list of date strings. One daily partition is read per
            entry, and the cumulative table is read at ``dates[-1]``.
        app: Unused; kept so existing positional calls keep working.
        device_code: Device codes to keep.
        countries: Country codes to keep.
        app_ids: App ids to keep. Defaults to the notebook-level ``app_list``,
            which is what this function filtered on before the parameter existed.

    Returns:
        (temp_df, cum_temp_df): pandas DataFrames grouped by
        (app_id, device_code, country_code) holding the summed ``free_app_download``
        of the daily partitions and of the cumulative partition respectively.

    Raises:
        ValueError: if ``dates`` is empty.
    """
    if len(dates) == 0:
        raise ValueError("dates must not be empty")
    if app_ids is None:
        app_ids = app_list
    # App ids are quoted like the other two filters, so app_id is compared as a
    # string, matching the app_id='...' filters used elsewhere in this notebook.
    where_clause = "app_id in ('{}') and device_code in ('{}') and country_code in ('{}')".format(
        "','".join(map(str, app_ids)), "','".join(device_code), "','".join(countries))
    print(where_clause)

    group_keys = ['app_id', 'device_code', 'country_code']
    selected = group_keys + ['free_app_download']
    daily_base = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/"

    # One small pandas frame per daily partition.
    concat_list = []
    for date in dates:
        unified_data = (
            spark.read.option("basePath", daily_base)
            .parquet("{}date={}/".format(daily_base, date))
            .where(where_clause)
            .select(*selected)
            .toPandas()
        )
        concat_list.append(unified_data)

    # Plain sum(): GroupBy.sum has no `level` argument, and only
    # free_app_download is selected, so there is nothing else to aggregate.
    temp_df = pd.concat(concat_list).groupby(group_keys, sort=True).sum()

    # Cumulative table at the last requested date.
    cum_df = (
        spark.read.format('delta')
        .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-cum.v1/fact/date={}/".format(dates[-1]))
        .where(where_clause)
        .select(*selected)
        .toPandas()
    )
    cum_temp_df = cum_df.groupby(group_keys, sort=True).sum()
    return temp_df, cum_temp_df

# Run the comparison and print both grouped frames.
# NOTE(review): `app` is not assigned in this cell (the list above is commented
# out), so this call depends on `app` left in the kernel by another cell.
temp_df, cum_temp_df = main_test(dates, app, device_code, countries)
print temp_df
print cum_temp_df

# compare
import numpy as np
def temp_log_to_s3(log, name, file_format):
    """Upload ``log`` to the QA bucket as ``zidong/aa.store.cum/<name>.<file_format>``.

    For ``file_format == 'json'`` the body is ``json.dumps(log)`` and that string
    is returned. For any other format the body is ``str(log)`` and ``log`` itself
    is returned.
    """
    import boto3
    import json
    object_key = 'zidong/aa.store.cum/{}.{}'.format(name, file_format)
    target = boto3.resource('s3').Object('b2c-prod-data-pipeline-qa', object_key)
    if file_format != 'json':
        target.put(Body=str(log))
        return log
    serialized = json.dumps(log)
    target.put(Body=serialized)
    return serialized

# print sorted(list(temp_df['free_app_download'])) == sorted(list(cum_df['free_app_download']))

# joined_df = temp_df.merge(cum_df, on=['app_id', 'device_code', 'country_code'])
# print joined_df.T

def _compare_df(df1, df2, on=None):
    """Print and upload the rows that appear in only one of ``df1`` / ``df2``.

    Runs a left merge and a right merge with ``indicator=True`` and keeps the
    rows whose ``_merge`` flag is not "both". Each merge and its differences are
    printed; the differences of both passes are uploaded via ``temp_log_to_s3``
    under the name ``diff_df``.

    Args:
        df1: Left pandas DataFrame.
        df2: Right pandas DataFrame.
        on: Column or index-level names to merge on (passed to ``DataFrame.merge``).
    """
    import pandas as pd

    diffs = []
    for diff_type in ["left", "right"]:
        merged = df1.merge(df2, indicator=True, how=diff_type, on=on)
        print(merged)
        diff_df = merged.loc[merged["_merge"] != "both"]
        print(diff_df.empty)
        if not diff_df.empty:
            print(diff_type)
            print(diff_df)
        diffs.append(diff_df)
    # Upload both directions. With the upload after the loop on the loop variable,
    # only the "right" pass was saved and rows present only in df1 were lost.
    # The `_merge` column still tells the two directions apart.
    temp_log_to_s3(pd.concat(diffs), 'diff_df', 'csv')

# temp_df['free_app_download'] = [np.nan if i==5 else i for i in temp_df['free_app_download']]
# temp_df['free_app_download'] = [i if str(i).isdigit() else 0 for i in temp_df['free_app_download']]
# print [i.__repr__() for i in temp_df['free_app_download']]

# Boolean-mask assignment: every column of the rows whose free_app_download is
# null is overwritten with 0 (temp_df is modified in place).
temp_df[temp_df['free_app_download'].isnull()] = 0
# temp_df[temp_df['paid_app_download'].isnull()] = 0
# temp_df[temp_df['revenue'].isnull()] = 0

# cum_temp_df['free_app_download'] = [0 if l is None else l for l in cum_temp_df['free_app_download']]
# cum_temp_df['paid_app_download'] = [0 if l is None else l for l in cum_temp_df['paid_app_download']]
# cum_temp_df['revenue'] = [0 if l is None else l for l in cum_temp_df['revenue']]
print temp_df

# Compare on the keys plus the value, so a differing free_app_download shows up
# as a row that exists on one side only.
_compare_df(temp_df, cum_temp_df, on=['app_id', 'device_code', 'country_code', 'free_app_download'])
# diff_df = cum_temp_df.merge(temp_df, indicator=True, how='left', on=['app_id'])
# print diff_df.T

# temp_df.reset_index(inplace=True)
# # temp_df.drop(['index'], axis=1, inplace=True)
# print temp_df
# # # save
# spark.createDataFrame(temp_df).write.parquet("s3://b2c-prod-data-pipeline-qa/zidong/aa.store.cum/store_cum_sample_result/", mode="overwrite")
# spark.read.parquet("s3://b2c-prod-data-pipeline-qa/zidong/aa.store.cum/store_cum_sample_result/").show(100)

# # # compare
# import numpy as np
# def temp_log_to_s3(log, name, file_format):
#     import boto3
#     import json
#     s3 = boto3.resource('s3')
#     s3object = s3.Object('b2c-prod-data-pipeline-qa', 'zidong/aa.store.cum/{}.{}'.format(name, file_format))
#     if file_format == 'json':
#         s3object.put(Body=json.dumps(log))
#         return json.dumps(log)
#     else:
#         s3object.put(Body=str(log))    
#         return log

# # print sorted(list(temp_df['free_app_download'])) == sorted(list(cum_df['free_app_download']))

# # joined_df = temp_df.merge(cum_df, on=['app_id', 'device_code', 'country_code'])
# # print joined_df.T

# def _compare_df(df1, df2, on=None):
#     for diff_type in ["left", "right"]:
#         diff_df = df1.merge(df2, indicator=True, how=diff_type, on=on)  # .loc[lambda x : x['_merge']!='both']
#         diff_df = diff_df.loc[diff_df["_merge"] != "both"]
#         print diff_df.empty
#         if not diff_df.empty:
#             print diff_type
#             print diff_df
#     print "diff_df", diff_df
    

# # temp_df['free_app_download'] = [np.nan if i==5 else i for i in temp_df['free_app_download']]
# # temp_df['free_app_download'] = [i if str(i).isdigit() else 0 for i in temp_df['free_app_download']]
# # print [i.__repr__() for i in temp_df['free_app_download']]

# temp_df[temp_df['free_app_download'].isnull()] = 0
# temp_df[temp_df['paid_app_download'].isnull()] = 0
# temp_df[temp_df['revenue'].isnull()] = 0

# cum_temp_df['free_app_download'] = [0 if l is None else l for l in cum_temp_df['free_app_download']]
# cum_temp_df['paid_app_download'] = [0 if l is None else l for l in cum_temp_df['paid_app_download']]
# cum_temp_df['revenue'] = [0 if l is None else l for l in cum_temp_df['revenue']]
# # cum_temp_df.set_index(['app_id', 'device_code', 'country_code'])
# print temp_df

# _compare_df(temp_df, cum_temp_df, on=['app_id', 'device_code', 'country_code', 'free_app_download', 'paid_app_download', 'revenue'])
# # diff_df = cum_temp_df.merge(temp_df, indicator=True, how='left', on=['app_id'])
# # print diff_df.T

In [0]:

# Turn the grouped index levels back into columns before handing the frame to Spark.
# This assignment was commented out, so the cell only worked when `test_df`
# happened to be left in the kernel.
test_df = temp_df.reset_index()
# save
print(test_df)
spark.createDataFrame(test_df).write.parquet("s3://b2c-prod-data-pipeline-qa/zidong/aa.store.cum/store_cum_sample_result/", mode="overwrite")
spark.read.parquet("s3://b2c-prod-data-pipeline-qa/zidong/aa.store.cum/store_cum_sample_result/").show(100)

In [0]:

# Read the saved sample result back and print its schema.
df = spark.read.parquet(
    "s3://b2c-prod-data-pipeline-qa/zidong/aa.store.cum/store_cum_sample_result/"
)
df.printSchema()

In [0]:

# Pretty-prints a pasted Spark SQL parse error. The where clause in the message
# has a whole Python list interpolated into device_code='...'.
print u"\nmismatched input '-' expecting <EOF>(line 1, pos 41)\n\n== SQL ==\napp_id='379174209' and device_code='['ios-phone', 'ios-tablet', 'ios-all']' and country_code='US'\n-----------------------------------------^^^\n"

In [0]:

# Inspect temp_df as currently held by the kernel (defined in an earlier cell).
print temp_df
# temp_df.reset_index(inplace=True)
# temp_df.drop(['index'], axis=1, inplace=True)
# print temp_df
# spark.createDataFrame(temp_df).write.parquet("s3://b2c-prod-data-pipeline-qa/zidong/aa.store.cum/store_cum_sample_result/", mode="overwrite")
# spark.read.parquet("s3://b2c-prod-data-pipeline-qa/zidong/aa.store.cum/store_cum_sample_result/").show(100)

In [0]:

# NOTE(review): drop() raises KeyError unless temp_df has a column named
# 'index'; nothing visible in this notebook creates one -- confirm or remove.
print temp_df.drop(['index'], axis=1)

In [0]:
%%sh
# List the saved sample result, then stream the uploaded diff_df.csv to stdout.
aws s3 ls s3://b2c-prod-data-pipeline-qa/zidong/aa.store.cum/store_cum_sample_result/
aws s3 cp s3://b2c-prod-data-pipeline-qa/zidong/aa.store.cum/diff_df.csv -

In [0]:

# compare
import numpy as np
def temp_log_to_s3(log, name, file_format):
    """Upload ``log`` to the QA bucket as ``zidong/aa.store.cum/<name>.<file_format>``.

    For ``file_format == 'json'`` the body is ``json.dumps(log)`` and that string
    is returned. For any other format the body is ``str(log)`` and ``log`` itself
    is returned.
    """
    import boto3
    import json
    object_key = 'zidong/aa.store.cum/{}.{}'.format(name, file_format)
    target = boto3.resource('s3').Object('b2c-prod-data-pipeline-qa', object_key)
    if file_format != 'json':
        target.put(Body=str(log))
        return log
    serialized = json.dumps(log)
    target.put(Body=serialized)
    return serialized

# print sorted(list(temp_df['free_app_download'])) == sorted(list(cum_df['free_app_download']))

# joined_df = temp_df.merge(cum_df, on=['app_id', 'device_code', 'country_code'])
# print joined_df.T

def _compare_df(df1, df2, on=None):
    """Print and upload the rows that appear in only one of ``df1`` / ``df2``.

    Runs a left merge and a right merge with ``indicator=True`` and keeps the
    rows whose ``_merge`` flag is not "both". Each merge and its differences are
    printed; the differences of both passes are uploaded via ``temp_log_to_s3``
    under the name ``diff_df``.

    Args:
        df1: Left pandas DataFrame.
        df2: Right pandas DataFrame.
        on: Column or index-level names to merge on (passed to ``DataFrame.merge``).
    """
    import pandas as pd

    diffs = []
    for diff_type in ["left", "right"]:
        merged = df1.merge(df2, indicator=True, how=diff_type, on=on)
        print(merged)
        diff_df = merged.loc[merged["_merge"] != "both"]
        print(diff_df.empty)
        if not diff_df.empty:
            print(diff_type)
            print(diff_df)
        diffs.append(diff_df)
    # Upload both directions. With the upload after the loop on the loop variable,
    # only the "right" pass was saved and rows present only in df1 were lost.
    # The `_merge` column still tells the two directions apart.
    temp_log_to_s3(pd.concat(diffs), 'diff_df', 'csv')

# temp_df['free_app_download'] = [np.nan if i==5 else i for i in temp_df['free_app_download']]
# temp_df['free_app_download'] = [i if str(i).isdigit() else 0 for i in temp_df['free_app_download']]
# print [i.__repr__() for i in temp_df['free_app_download']]

# Boolean-mask assignment: every column of the rows whose free_app_download is
# null is overwritten with 0 (temp_df is modified in place).
temp_df[temp_df['free_app_download'].isnull()] = 0
# temp_df[temp_df['paid_app_download'].isnull()] = 0
# temp_df[temp_df['revenue'].isnull()] = 0

# cum_temp_df['free_app_download'] = [0 if l is None else l for l in cum_temp_df['free_app_download']]
# cum_temp_df['paid_app_download'] = [0 if l is None else l for l in cum_temp_df['paid_app_download']]
# cum_temp_df['revenue'] = [0 if l is None else l for l in cum_temp_df['revenue']]
print temp_df

# Compare on the keys plus the value, so a differing free_app_download shows up
# as a row that exists on one side only.
_compare_df(temp_df, cum_temp_df, on=['app_id', 'device_code', 'country_code', 'free_app_download'])
# diff_df = cum_temp_df.merge(temp_df, indicator=True, how='left', on=['app_id'])
# print diff_df.T

In [0]:
%%sh
# Stream the uploaded diff_df.csv to stdout.
aws s3 cp s3://b2c-prod-data-pipeline-qa/zidong/aa.store.cum/diff_df.csv -

In [0]:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import pandas as pd

def get_date_list(start_date, end_date, freq="D"):
    """Return the dates between ``start_date`` and ``end_date`` as 'YYYY-MM-DD' strings.

    Args:
        start_date: Start of the range, in any form ``pandas.date_range`` accepts.
        end_date: End of the range, in any form ``pandas.date_range`` accepts.
        freq: pandas frequency alias:
            D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency

    Returns:
        list of str, one entry per timestamp generated by ``pandas.date_range``
        (empty when ``end_date`` is before ``start_date``).
    """
    # The import sits after the docstring: placed before it, the string above is
    # just an expression statement and the function has no __doc__.
    import pandas as pd

    return [x.strftime('%Y-%m-%d') for x in pd.date_range(start=start_date, end=end_date, freq=freq)]
    
# Every calendar day from 2010-07-04 through 2010-07-31.
dates = get_date_list('2010-07-04', '2010-07-31', freq='D')    # e.g. starts with ['2010-07-04', '2010-07-05', '2010-07-06', ...]
# dates = dates + get_date_list('2010-08-01', '2020-04-18', freq='D')
print dates
# The five ids map to: ios-phone, ios-all, ios-tablet, free_app_download=2, facebook.
app_list = ['377194688', '364709193', '379174209',  '339739007',         '284882215']
device_code = ['ios-phone','ios-tablet','ios-all']    # exclude 'android-all'
countries = ['WW', 'US']
# (app_id, device_code, country_code) combinations checked one by one below;
# unlike device_code above, this list does contain android-all entries.
app = [
    ('377194688', 'ios-phone', 'WW'),
    ('364709193', 'ios-all', 'US'),
    ('379174209', 'ios-tablet', 'US'),
    ('339739007', 'ios-phone', 'WW'),
    ('284882215', 'ios-all', 'WW'),    # facebook
    ('20600000025034', 'android-all', 'WW'),
    ('20600000357382', 'android-all', 'WW'),
    ('20600000246936', 'android-all', 'WW'),
    ('20600000348221', 'android-all', 'TH'),
    ('20600000009072', 'android-all', 'WW')
    ]



concat_list = []

key_columns = ['app_id', 'device_code', 'country_code']
metric_columns = ['free_app_download', 'paid_app_download', 'revenue']

# The two reads do not depend on the app, so they are done once outside the loop.
# Glob fix: in "date=20[10-11]*" the bracket is a character class that matches a
# single "0" or "1", so it selected every date=200*/date=201* partition
# (2010 through 2019) instead of only 2010 and 2011, which is the span the
# cumulative snapshot of 2011-12-31 is compared against.
unified_all = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/")\
    .parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date=201[0-1]-*/")
cum_all = spark.read.format('delta').load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-cum.v1/fact/date={}/".format('2011-12-31'))

# The loop variable is named app_device_code so it does not overwrite the
# notebook-level `device_code` list.
for app_id, app_device_code, country_code in app:
    where_clause = "app_id='{}' and device_code='{}' and country_code='{}'".format(app_id, app_device_code, country_code)

    # Daily data summed over the selected partitions.
    unified_data = unified_all.where(where_clause).select(*(key_columns + metric_columns))
    uni_df = unified_data.groupby(*key_columns).agg({'free_app_download': 'sum', 'paid_app_download': 'sum', 'revenue': 'sum'})
    for metric in metric_columns:
        uni_df = uni_df.withColumnRenamed('sum({})'.format(metric), metric)

    # cumulative data
    cum_df = cum_all.where(where_clause).select(*(key_columns + metric_columns))

    # Joining on keys AND metrics: a matching app yields one row, a mismatch two.
    join_df = uni_df.join(cum_df, key_columns + metric_columns, 'full_outer')
    join_df.show(10)

# save
# spark.createDataFrame(temp_df).write.parquet("s3://b2c-prod-data-pipeline-qa/zidong/aa.store.cum/store_cum_sample_result/", mode="overwrite")
# spark.read.parquet("s3://b2c-prod-data-pipeline-qa/zidong/aa.store.cum/store_cum_sample_result/").show(100)

In [0]:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType
# Smoke test for the comparison technique: two single-row frames with the same
# content, full-outer-joined on every column, are displayed side by side.
key_fields = ("app_id", "device_code", "country_code")
metric_fields = ("free_app_download", "paid_app_download", "revenue")

# Keys are nullable strings, metrics are nullable integers.
schema = StructType(
    list(StructField(field_name, StringType(), True) for field_name in key_fields)
    + list(StructField(field_name, IntegerType(), True) for field_name in metric_fields)
)

sample_row = {
    'app_id': '111',
    'device_code': '222',
    'country_code': 'WW',
    'free_app_download': 1,
    'paid_app_download': 2,
    'revenue': 3,
}

rdd = sc.parallelize([dict(sample_row)])
df1 = spark.createDataFrame(rdd, schema=schema)
df1.show()

rdd = sc.parallelize([dict(sample_row)])
df2 = spark.createDataFrame(rdd, schema=schema)
df2.show()

join_df = df1.join(df2, list(key_fields + metric_fields), 'full_outer')
join_df.show()

In [0]:

# Sum free_app_download per (app_id, device_code, country_code) and print up to
# 200 groups ordered by the same key.
# NOTE(review): `a` is not defined in this cell; it has to be a Spark DataFrame
# left in the kernel by an earlier run. A later cell rebinds `a` to a plain list
# of date strings, after which this call cannot work. Confirm the intended input.
a.groupby('app_id', 'device_code', 'country_code').agg({'free_app_download': 'sum'}).orderBy('app_id', 'device_code', 'country_code').show(200)

In [0]:

# Inspect one app/device/country slice of the cumulative Delta data for one day.
date = '2010-07-05'
cum_day_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-cum.v1/fact/date={}/".format(date)
df = (
    spark.read.format('delta')
    .load(cum_day_path)
    .where("app_id='284882215' and device_code='ios-all' and country_code='US'")
)
df.show()

In [0]:

# Inspect the same app/device/country slice in the daily data for 2010-07-04
# and 2010-07-05 (the [4-5] glob class selects both partitions).
daily_root = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/"
df = (
    spark.read
    .option("basePath", daily_root)
    .parquet(daily_root + "date=2010-07-0[4-5]/")
    .where("app_id='284882215' and device_code='ios-all' and country_code='US'")
)
df.show(10)

In [0]:

# Prints a pasted SQL parse error so its embedded newlines render readably.
# The message reports that the parser stopped at ':' (line 1, pos 4), i.e. at the
# `date::text` cast in the filter expression shown after "== SQL ==".
# NOTE(review): presumably captured from a Spark `.where(...)` call - confirm.
print u"\nmismatched input ':' expecting <EOF>(line 1, pos 4)\n\n== SQL ==\ndate::text in (2010-07-04) and app_id in (377194688,364709193,379174209,339739007,284882215) and device_code in ('ios-phone','ios-tablet','ios-all') and country_code in ('WW','US')\n----^^^\n"

In [0]:

import datetime
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
# Validation window; `end` is INCLUSIVE.
# start = "2020-01-01"
# end = "2020-04-18"
start = '2010-07-04'
end = '2010-07-04'
real_date1 = datetime.date(*[int(x) for x in start.split('-')])
real_date2 = datetime.date(*[int(x) for x in end.split('-')])
date_range = real_date2 - real_date1
# `+ 1` includes `end` itself. The previous xrange(date_range.days) excluded it,
# so start == end produced an empty list and `dates[-1]` below raised IndexError.
dates = [real_date1 + datetime.timedelta(days) for days in range(date_range.days + 1)]


# test_ios_app_list = [377194688 , 375875657, 366236510, 354990881,365691077 , 343200656 ,20600000009072,20600000000553]
test_ios_app_list = ['377194688', '364709193', '379174209',  '339739007',         '284882215']
device_code=['ios-phone','ios-tablet','ios-all','android-all']
country_list=['WW', 'US']
# test_list = [(app, device, country ) for app in test_ios_app_list  for device in device_code for country in country_list ]


# Filter shared by every read below (app ids are emitted unquoted, the other
# two lists are emitted as quoted string literals).
sql_where = "app_id in ({}) and device_code in ('{}') and country_code in ('{}')".format(",".join(map(str,test_ios_app_list)),   "','".join(device_code), "','".join(country_list) )
print(sql_where)


daily_base = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/"
selected_cols = ["app_id", "device_code", "country_code", "free_app_download", "paid_app_download", "revenue"]

# Daily rows for all of 2010-2019.
_unified_data = spark.read.option("basePath", daily_base).parquet(daily_base + "date=20{10,11,12,13,14,15,16,17,18,19}-*").where(sql_where).select(*selected_cols)

# Daily rows for each date of the validation window, collected to the driver.
result=list()
for day in dates:
    _unified_data_single_date = spark.read.option("basePath", daily_base).parquet(daily_base + "date={%s}" % (day.strftime("%Y-%m-%d"))).where(sql_where).select(*selected_cols).collect()
    # print _unified_data
    result.extend(_unified_data_single_date)

# NOTE(review): the metric columns are declared as IntegerType here; verify this
# matches the parquet types, otherwise createDataFrame will reject the rows.
schema = StructType([StructField("app_id", StringType(), True),
                     StructField("device_code", StringType(), True),
                     StructField("country_code", StringType(), True),
                     StructField("free_app_download", IntegerType(), True),
                     StructField("paid_app_download", IntegerType(), True),
                     StructField("revenue", IntegerType(), True)])

_unified_data.createOrReplaceTempView("unified_data_view")
spark.createDataFrame(result, schema=schema).createOrReplaceTempView("unified_data_singe_date_view")
# NOTE(review): SQL UNION removes duplicate rows. Because the date column is not
# selected, two days with identical metrics for the same key collapse into one
# row before summing - confirm this de-duplication is intended.
spark.sql("select * from unified_data_view union select * from unified_data_singe_date_view").createOrReplaceTempView("union_unified_data")
spark.sql("select app_id, sum(free_app_download) from union_unified_data group by app_id, device_code, country_code").show(2)



# Cumulative snapshot for the last date of the window.
# NOTE(review): other cells read this dataset with format('delta'); it is read
# as plain parquet here - confirm which is intended.
cdf1 = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-cum.v1/fact/date={}/".format(dates[-1])).where(sql_where).cache()
cdf1.createOrReplaceTempView("cumu")

print("compare sum data:  {}".format(dates[-1]))

# df1 = spark.sql("select app_id, sum(free_app_download) as free_app_download, device_code, country_code from unified_data_view  group by app_id , device_code, country_code EXCEPT ALL select app_id , device_code, country_code, free_app_download from cumu  ").show()
# spark.sql("select app_id, free_app_download, device_code, country_code from cumu EXCEPT ALL select app_id , device_code, country_code , sum(free_app_download) as free_app_download  from unified_data_view  group by app_id , device_code, country_code   ").show()

# Symmetric difference between the summed daily data (df1) and the cumulative
# snapshot (df2): both outputs are empty when the two agree.
spark.sql("select app_id, sum(free_app_download) as free_app_download, sum(paid_app_download) as paid_app_download, sum(revenue) as revenue, device_code, country_code from union_unified_data  group by app_id , device_code, country_code order by app_id , device_code, country_code").createOrReplaceTempView("df1")
spark.sql("select app_id, free_app_download, paid_app_download, revenue, device_code, country_code from cumu order by app_id , device_code, country_code").createOrReplaceTempView("df2")
spark.sql("select * from df1 except all select * from df2").show()
spark.sql("select * from df2 except all select * from df1").show()



In [0]:

from collections import deque
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import pandas as pd

def get_date_list(start_date, end_date, freq="D"):
    """Return the dates from start_date to end_date as 'YYYY-MM-DD' strings.

    Both bounds are passed straight to ``pandas.date_range`` and are inclusive;
    only dates that fall on the requested frequency are returned.

    freq:   D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency
    """
    # The docstring must be the first statement of the function: placed after
    # the import (as before) it was a discarded string and __doc__ stayed None.
    import pandas as pd
    return [stamp.strftime('%Y-%m-%d') for stamp in pd.date_range(start=start_date, end=end_date, freq=freq)]
    
# Dates for which a cumulative snapshot is checked: daily for July 2010,
# month-end from 2010-08 to 2020-03, then daily for 2020-04-01..18.
cum_dates = get_date_list('2010-07-04', '2010-07-31', freq='D') + \
            get_date_list('2010-08-31', '2020-03-31', freq='M') + \
            get_date_list('2020-04-01', '2020-04-18', freq='D')     # prod
# cum_dates = get_date_list('2010-07-04', '2010-07-31', freq='D') + \
#             get_date_list('2010-08-31', '2011-09-30', freq='M')       # test
# cum_dates = get_date_list('2010-07-04', '2010-07-06', freq='D')
store_dates = get_date_list('2010-07-04', '2020-04-18', freq='D')   # prod
# store_dates = get_date_list('2010-07-04', '2011-09-30', freq='D')     # test
# store_dates = get_date_list('2010-07-04', '2010-07-06', freq='D')

print(cum_dates)
print(store_dates)


# temp_df accumulates every distinct app_id seen in the daily data so far.
temp_df = pd.DataFrame({'app_id': []})
for cum_date in cum_dates:
    concat_list = [pd.DataFrame({'app_id': []})]
    store_dates_range = range(len(store_dates))
    # print store_dates
    # Consume daily partitions up to and including cum_date; the remaining
    # dates are kept for the next cumulative date.
    for i in store_dates_range:
        unified_data = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}/".format(store_dates[i])).select('app_id').distinct().toPandas()
        concat_list.append(unified_data)

        if store_dates[i] == cum_date:
            store_dates = store_dates[i+1:]
            break
    # print "temp_df", temp_df
    concat_list.append(temp_df)
    temp_df = pd.concat(concat_list).drop_duplicates()

    # fetch data from cumulative
    cum_df = spark.read.format('delta').load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-cum.v1/fact/date={}/".format(cum_date)).select('app_id').distinct().toPandas()

    # compare
    # how='outer' is required for the indicator to be meaningful: with the
    # default inner join every row is tagged 'both', so the check below could
    # never report a mismatch.
    joined_df = temp_df.merge(cum_df, on=['app_id'], how='outer', indicator=True)
    if not joined_df.loc[joined_df['_merge'] != 'both'].empty:
        print(cum_date)


In [0]:

# Every calendar day from 2010-07-04 through 2011-09-30 (inclusive) as
# 'YYYY-MM-DD' strings, generated instead of pasted; the cell then shows the count.
import datetime

_first_day = datetime.date(2010, 7, 4)
_last_day = datetime.date(2011, 9, 30)
a = list(
    (_first_day + datetime.timedelta(days=offset)).isoformat()
    for offset in range((_last_day - _first_day).days + 1)
)
len(a)

In [0]:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import pandas as pd

def get_date_list(start_date, end_date, freq="D"):
    """Return the dates from start_date to end_date as 'YYYY-MM-DD' strings.

    Both bounds are passed straight to ``pandas.date_range`` and are inclusive;
    only dates that fall on the requested frequency are returned.

    freq:   D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency
    """
    # The docstring must be the first statement of the function: placed after
    # the import (as before) it was a discarded string and __doc__ stayed None.
    import pandas as pd
    return [stamp.strftime('%Y-%m-%d') for stamp in pd.date_range(start=start_date, end=end_date, freq=freq)]
    
dates = get_date_list('2010-07-04', '2010-07-05', freq='D')    # >> ['2010-07-04', '2010-07-05']
# dates = dates + get_date_list('2010-08-01', '2020-04-18', freq='D')

# Accumulates every distinct app_id seen in the daily data so far. Started
# empty here so the cell does not depend on a `temp_df` left over from another cell.
temp_df = pd.DataFrame({'app_id': []})

# concat_list = []
for date in dates:
    # toPandas() rather than collect(): collect() returns a list of Row objects,
    # which pd.concat / DataFrame.merge cannot consume.
    unified_data = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}/".format(date)).select('app_id').distinct().toPandas()

    # The result was previously stored in an unused `temp_set`, so the frame
    # compared below never grew.
    temp_df = pd.concat([unified_data, temp_df]).drop_duplicates()

    # fetch data from cumulative
    cum_df = spark.read.format('delta').load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-cum.v1/fact/date={}/".format(date)).select('app_id').distinct().toPandas()

    # compare
    # how='outer' so that ids present on only one side are tagged
    # left_only / right_only; the default inner join tags every row 'both'.
    joined_df = temp_df.merge(cum_df, on=['app_id'], how='outer', indicator=True)
    # True means both sides contain exactly the same app ids.
    print(joined_df.loc[joined_df['_merge'] != 'both'].empty)
    # print temp_df
    # print cum_df

In [0]:

# Collect the distinct app_id values of the 2020-01-01 daily partition to the
# driver (a Python list of Row objects) and print the first ten.
unified_data = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}/".format('2020-01-01')).select('app_id').distinct().collect()
print unified_data[:10]

In [0]:

# unified_data = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}/".format(date)).select('app_id').distinct().toPandas()
# Scratch check of the accumulate step: concatenate the daily ids with an empty
# seed frame, drop duplicates, and print both frames.
# NOTE(review): `unified_data` comes from whichever cell ran last; pd.concat
# needs a pandas DataFrame (the toPandas() variant commented out above), not the
# list returned by collect() - confirm before running.
temp_df = pd.DataFrame({'app_id': []})
df = pd.concat([unified_data, temp_df]).drop_duplicates()
print temp_df
print df

In [0]:


import numpy as np
# Scratch cell: merge temp_df with cum_df on app_id (no `how` given, so pandas
# uses its default inner join) with the `_merge` indicator column added, then
# overwrite `_merge` with NaN and display the resulting null mask.
# Relies on `temp_df` and `cum_df` left in the kernel by earlier cells.
joined_df = temp_df.merge(cum_df, on=['app_id'], indicator=True)
joined_df['_merge'] = np.nan
joined_df['_merge'].isnull()

In [0]:

def temp_log_to_s3(log, name, file_format):
    """Upload `log` to the QA bucket as zidong/aa.store.cum/<name>.<file_format>.

    For file_format == 'json' the value is serialised with json.dumps and the
    JSON text is returned; for any other format str(log) is uploaded and the
    original `log` object is returned unchanged.
    """
    import boto3
    import json

    object_key = 'zidong/aa.store.cum/{}.{}'.format(name, file_format)
    target = boto3.resource('s3').Object('b2c-prod-data-pipeline-qa', object_key)

    if file_format != 'json':
        target.put(Body=str(log))
        return log

    target.put(Body=json.dumps(log))
    return json.dumps(log)
        
def s3_to_temp_log(name, file_format):
    """Download zidong/aa.store.cum/<name>.<file_format> from the QA bucket.

    The raw body is printed, then returned: parsed with json.loads when
    file_format == 'json', otherwise exactly as read.
    """
    import boto3
    import json

    object_key = 'zidong/aa.store.cum/{}.{}'.format(name, file_format)
    stored = boto3.resource('s3').Object('b2c-prod-data-pipeline-qa', object_key)
    payload = stored.get()['Body'].read()
    print(payload)
    return json.loads(payload) if file_format == 'json' else payload

# Round-trip check: build a small de-duplicated frame, upload it to S3 as CSV,
# download it again and parse it back into a DataFrame.
df1 = pd.DataFrame({'angles': [1, 2, 3]})
print(df1)

df2 = pd.DataFrame({'angles': [2, 3, 6]})
print(df2)

df3 = pd.concat([df1, df2]).drop_duplicates()

df3.reset_index(drop=True, inplace=True)
print(df3)

csv = df3.to_csv(index=False)
temp_log_to_s3(csv, 'test_csv', 'csv')
body = s3_to_temp_log('test_csv', 'csv')

# `body` holds the CSV *content*; read_csv would interpret a plain string as a
# file path, so it has to be wrapped in a file-like object first.
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
df = pd.read_csv(StringIO(body))
print(df)

In [0]:

import sys
from StringIO import StringIO

# Parse the CSV text held in `body` (set by an earlier cell) into a DataFrame;
# StringIO wraps the string so read_csv treats it as file content, not a path.
df = pd.read_csv(StringIO(body), sep=",")
print df

In [0]:

from collections import deque
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import pandas as pd

def temp_log_to_s3(log, name, file_format):
    """Upload `log` to the QA bucket as zidong/aa.store.cum/<name>.<file_format>.

    For file_format == 'json' the value is serialised with json.dumps and the
    JSON text is returned; for any other format str(log) is uploaded and the
    original `log` object is returned unchanged.
    """
    import boto3
    import json

    object_key = 'zidong/aa.store.cum/{}.{}'.format(name, file_format)
    target = boto3.resource('s3').Object('b2c-prod-data-pipeline-qa', object_key)

    if file_format != 'json':
        target.put(Body=str(log))
        return log

    target.put(Body=json.dumps(log))
    return json.dumps(log)

def s3_to_temp_log(name, file_format):
    """Download zidong/aa.store.cum/<name>.<file_format> from the QA bucket.

    Returns the body parsed with json.loads when file_format == 'json',
    otherwise the body exactly as read.
    """
    import boto3
    import json

    object_key = 'zidong/aa.store.cum/{}.{}'.format(name, file_format)
    stored = boto3.resource('s3').Object('b2c-prod-data-pipeline-qa', object_key)
    payload = stored.get()['Body'].read()
    return json.loads(payload) if file_format == 'json' else payload

def get_date_list(start_date, end_date, freq="D"):
    """Return the dates from start_date to end_date as 'YYYY-MM-DD' strings.

    Both bounds are passed straight to ``pandas.date_range`` and are inclusive;
    only dates that fall on the requested frequency are returned.

    freq:   D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency
    """
    # The docstring must be the first statement of the function: placed after
    # the import (as before) it was a discarded string and __doc__ stayed None.
    import pandas as pd
    return [stamp.strftime('%Y-%m-%d') for stamp in pd.date_range(start=start_date, end=end_date, freq=freq)]
    
# Dates for which a cumulative snapshot is checked: daily for July 2010,
# month-end from 2010-08 to 2020-03, then daily for 2020-04-01..18.
cum_dates = get_date_list('2010-07-04', '2010-07-31', freq='D') + \
            get_date_list('2010-08-31', '2020-03-31', freq='M') + \
            get_date_list('2020-04-01', '2020-04-18', freq='D')     # prod
# cum_dates = get_date_list('2010-07-04', '2010-07-31', freq='D') + \
#             get_date_list('2010-08-31', '2011-09-30', freq='M')       # test monthly
# cum_dates = get_date_list('2010-07-04', '2010-07-04', freq='D')
store_dates = get_date_list('2010-07-04', '2020-04-18', freq='D')   # prod
# store_dates = get_date_list('2010-07-04', '2011-09-30', freq='D')     # test monthly
# store_dates = get_date_list('2010-07-04', '2010-07-04', freq='D')

print(cum_dates)
print(store_dates)


# temp_df accumulates every distinct app_id seen in the daily data so far.
temp_df = pd.DataFrame({'app_id': []})
for cum_date in cum_dates:
    # print store_dates
    # Consume daily partitions up to and including cum_date; the remaining
    # dates are kept for the next cumulative date.
    for i in range(len(store_dates)):
        unified_data = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}/".format(store_dates[i])).select('app_id').distinct().toPandas()

        temp_df = pd.concat([temp_df, unified_data]).drop_duplicates()
        # temp_df.reset_index(drop=True, inplace=True)

        if store_dates[i] == cum_date:
            store_dates = store_dates[i+1:]
            break

    # fetch data from cumulative
    cum_df = spark.read.format('delta').load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-cum.v1/fact/date={}/".format(cum_date)).select('app_id').distinct().toPandas()

    # compare
    # how='outer' is required for the indicator to be meaningful: with the
    # default inner join every row is tagged 'both', so a mismatch could never
    # be detected.
    joined_df = temp_df.merge(cum_df, on=['app_id'], how='outer', indicator=True)
    if not joined_df.loc[joined_df['_merge'] != 'both'].empty:
        print("failed data is {}".format(cum_date))
        break

# Persist the accumulated ids as CSV in S3.
temp_df.reset_index(drop=True, inplace=True)
csv = temp_df.to_csv(index=False)
temp_log_to_s3(csv, 'cum_test_csv', 'csv')

# read from csv file and convert to pandas dataframe
body = s3_to_temp_log('cum_test_csv', 'csv')
import sys
from StringIO import StringIO
# saved_pandas_csv_df = pd.read_csv(StringIO(body), sep=",")
# print saved_pandas_csv_df

# # save
temp_df['app_id'] = temp_df['app_id'].astype(str)
# Drop the trailing ".0" left by a float -> str conversion. Only strip when the
# suffix is really there: a blind [:-2] cuts two real digits off ids that are
# already clean (e.g. when this runs twice).
temp_df['app_id'] = [app_id_text[:-2] if app_id_text.endswith('.0') else app_id_text for app_id_text in temp_df['app_id']]
spark.createDataFrame(temp_df).write.parquet("s3://b2c-prod-data-pipeline-qa/aa.store.app-est-cum.v1/store_cum_completeness_result/", mode="overwrite")
spark.read.parquet("s3://b2c-prod-data-pipeline-qa/aa.store.app-est-cum.v1/store_cum_completeness_result/").show(100)

print("END")

In [0]:
%%sh
# List the objects under the completeness-result prefix in the QA bucket.
aws s3 ls s3://b2c-prod-data-pipeline-qa/aa.store.app-est-cum.v1/store_cum_completeness_result/

In [0]:

# Print the accumulated pandas frame, overwrite the parquet copy under the
# zidong/ scratch prefix with it, then read that copy back and show 100 rows.
# Relies on `temp_df` left in the kernel by an earlier cell.
print temp_df
spark.createDataFrame(temp_df).write.parquet("s3://b2c-prod-data-pipeline-qa/zidong/aa.store.cum/store_cum_completeness_result/", mode="overwrite")
df = spark.read.parquet("s3://b2c-prod-data-pipeline-qa/zidong/aa.store.cum/store_cum_completeness_result/")
df.show(100)

In [0]:

# Remove the trailing ".0" that a float -> str conversion leaves on app ids.
# The strip is guarded so the cell is idempotent: the previous blind [:-2]
# removed two real digits from every id each time the cell was re-run (or when
# the ids had already been cleaned by another cell).
print(temp_df)
temp_df['app_id'] = [app_id_text[:-2] if app_id_text.endswith('.0') else app_id_text for app_id_text in temp_df['app_id']]
print(temp_df)

In [0]:
%%sh
# List the scratch prefix, then stream cum_test_csv.csv to stdout
# ("-" as the destination makes `aws s3 cp` write to standard output).
aws s3 ls s3://b2c-prod-data-pipeline-qa/zidong/aa.store.cum/

aws s3 cp s3://b2c-prod-data-pipeline-qa/zidong/aa.store.cum/cum_test_csv.csv -

In [0]:
%%sh
# List the prefixes available under unified/ in the store-free bucket.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-free/unified/


In [0]:

# Peek at the store.app.v1 dimension data: 10 rows, long values truncated
# (second argument), printed one field per line (third argument = vertical).
spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-free/unified/store.app.v1/dimension/").show(10, True, True)

In [0]:

import pandas as pd
def get_date_list(start_date, end_date, freq="D"):
    """Return the dates from start_date to end_date as 'YYYY-MM-DD' strings.

    Both bounds are passed straight to ``pandas.date_range`` and are inclusive;
    only dates that fall on the requested frequency are returned.

    freq:   D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency
    """
    # The docstring must be the first statement of the function: placed after
    # the import (as before) it was a discarded string and __doc__ stayed None.
    import pandas as pd
    return [stamp.strftime('%Y-%m-%d') for stamp in pd.date_range(start=start_date, end=end_date, freq=freq)]

# cum_dates = get_date_list('2010-07-04', '2010-07-31', freq='D') + \
#             get_date_list('2010-08-31', '2020-03-31', freq='M') + \
#             get_date_list('2020-04-01', '2020-04-18', freq='D')     # prod
# cum_dates = get_date_list('2010-07-04', '2010-07-31', freq='D') + \
            # get_date_list('2010-08-31', '2011-09-30', freq='M')       # test monthly
cum_dates = get_date_list('2010-07-04', '2010-07-04', freq='D')
# store_dates = get_date_list('2010-07-04', '2020-04-18', freq='D')   # prod
# store_dates = get_date_list('2010-07-04', '2011-09-30', freq='D')     # test monthly
store_dates = get_date_list('2010-07-04', '2010-07-04', freq='D')

base_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/"

from pyspark.sql.types import StructType, StructField, StringType, IntegerType
schema = StructType([StructField("app_id", StringType(), True)])
# prev_df accumulates every distinct app_id seen in the daily data so far.
prev_df = spark.createDataFrame([], schema=schema)

failed_dates = []
for cum_date in cum_dates:
    # Collect the daily partitions up to and including cum_date; the remaining
    # dates are kept for the next cumulative date.
    paths = []
    # date_range = range(len(store_dates))
    print(store_dates)
    for i in range(len(store_dates)):
        print(i)
        path = base_path + "granularity=daily/date=" + store_dates[i] + "/"
        paths.append(path)

        if store_dates[i] == cum_date:
            store_dates = store_dates[i+1 : ]
            break
    print(paths)

    # Reading zero paths is an error, so skip the read when no daily partition
    # is left for this cumulative date.
    if paths:
        next_df = spark.read.option('basePath', base_path).parquet(*paths).select('app_id').distinct()
        prev_df = next_df.union(prev_df).distinct()

    # compare
    cum_df = spark.read.format('delta').load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-cum.v1/fact/").where("date='{}'".format(cum_date)).select('app_id').distinct()
    # One-directional check: app ids seen in the daily data but missing from
    # the cumulative snapshot.
    except_df = prev_df.subtract(cum_df)
    if except_df.first() is not None:
        print("failed date: {}".format(cum_date))
        failed_dates.append(cum_date)

# Report success only when no date failed; previously this line was printed
# unconditionally, even right after a failure message.
if not failed_dates:
    print("SUCCESSED")
prev_df.write.parquet("s3://b2c-prod-data-pipeline-qa/aa.store.app-est-cum.v1/store_cum_completeness_result/", mode="overwrite")



In [0]:

# Overwrite the completeness-result parquet with `prev_df` (built by an earlier
# cell), then read it back and show up to 100 rows to confirm the write.
prev_df.write.parquet("s3://b2c-prod-data-pipeline-qa/aa.store.app-est-cum.v1/store_cum_completeness_result/", mode="overwrite")
spark.read.parquet("s3://b2c-prod-data-pipeline-qa/aa.store.app-est-cum.v1/store_cum_completeness_result/").show(100)

In [0]:

# Check that several daily partitions can be read in one call when basePath
# points at the fact root.
base_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/"
path_list = list(
    base_path + "granularity=daily/date={}/".format(partition_date)
    for partition_date in ("2010-07-04", "2010-07-05")
)
df = spark.read.option('basePath', base_path).parquet(*path_list).select('app_id')

df.show(10)


In [0]:

# Show the first 3 rows of the 2010-07-05 daily partition.
spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date=2010-07-05/").show(3)

In [0]:
%%sh
# List what sits directly under the store.app-est.v1 fact root.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/

In [0]:
%%sh


In [0]:
%%sh
# Describe the `app` table of the aa_android database and fetch one sample row.
# The password is no longer embedded in the notebook: it must be provided
# through the PGPASSWORD environment variable of the kernel (or a ~/.pgpass
# file, in which case drop the check below). The cell aborts with a clear
# message when the variable is missing.
# NOTE(review): the password that used to be hardcoded here has been committed
# in clear text and should be rotated.
: "${PGPASSWORD:?export PGPASSWORD in the notebook environment before running this cell}"
export PGPASSWORD
psql -h 54.210.244.2 -p 5433 -U app_tomcat -d aa_android << EOF
\d app;
SELECT id, name, company FROM app LIMIT 1;
EOF

In [0]:
%%sh
# Describe the `aa_app` table of the aa database and fetch one sample row.
# The password is no longer embedded in the notebook: it must be provided
# through the PGPASSWORD environment variable of the kernel (or a ~/.pgpass
# file, in which case drop the check below). The cell aborts with a clear
# message when the variable is missing.
# NOTE(review): the password that used to be hardcoded here has been committed
# in clear text and should be rotated.
: "${PGPASSWORD:?export PGPASSWORD in the notebook environment before running this cell}"
export PGPASSWORD
psql -h 54.210.244.2 -p 5432 -U app_tomcat -d aa << EOF
\d aa_app;
SELECT id, name, company FROM aa_app LIMIT 1;
EOF


In [0]:

# Read every daily partition whose date starts with 2010-07 (glob on the
# partition directory) and show 10 of the distinct app ids found there.
unified_data = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date=2010-07*/")
uni_df = unified_data.select('app_id').distinct()
uni_df.show(10)