In [0]:

# Register the zipped project dependencies with the Spark context.
# NOTE(review): presumably this is what makes the `applications.*` / `conf.*`
# imports in the next cell resolvable - confirm; if so it must run first.
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import pandas as pd
# Print wide DataFrames on a single line instead of wrapping columns.
pd.set_option('expand_frame_repr', False)
import boto3
s3 = boto3.resource('s3')
# S3 object that write_log() overwrites with the accumulated per-date results.
s3object = s3.Object('b2c-prod-data-pipeline-qa', 'tom/gameiq/2020-03-13/regression1.1.txt')
# Per-date result strings appended by the driver loop below.
log = [] 


In [0]:


import pandas as pd
from pyspark.sql import functions as F
from applications.db_check_v1.common.constants import COUNTRY_CODE_MAPPING_BY_MARKET_CODE as COUNTRY_CODE_MAPPING
from conf.settings import *
from applications.db_check_v1.common.db_check_utils import query_df
import pandas as pd


# Per-store translation of the numeric ids found in the DB's `device_id`
# column into device code strings; compare() applies it before the DB rows
# are merged with the unified-layer rows.
DEVICE_CODE_MAPPING_BY_DEVICE_ID = {
    'google-play':{
        1000: "android-all",
        1001: "android-phone",
        1002: "android-tablet",
    },
    'apple-store': {
        2000: "ios-all",
        2001: "ios-phone",
        2002: "ios-tablet"
    }
}
# Short alias used by compare().
DEVICE_CODE_MAPPING = DEVICE_CODE_MAPPING_BY_DEVICE_ID

# key='value' connection string handed to query_df() for the daily-estimate
# database. Host and port are taken from the first entry of PG_DAILY_EST_HOSTS
# (element [0] is used as the host, element [1] as the port).
_DAILY_EST_DSN_TEMPLATE = "dbname='{db}' user='{user}' password='{password}' host='{host}' port='{port}'"
_daily_est_host_entry = PG_DAILY_EST_HOSTS[0]

daily_est_dsn = _DAILY_EST_DSN_TEMPLATE.format(
    db=PG_DAILY_EST_NAME,
    user=PG_DAILY_EST_ACCESS_ID,
    password=PG_DAILY_EST_SECRET_KEY,
    host=_daily_est_host_entry[0],
    port=_daily_est_host_entry[1]
)

# Genre mapping dimension: each row carries a product_id and an array of
# genre ids. Exploding the array yields one (product_id, genre_id) row per
# genre, which is the shape compare() joins against.
mapping_df_unified = spark.read.parquet(
    "s3://b2c-prod-data-pipeline-unified-dna/unified/dna.genre_id_product_mapping.v1/dimension/"
)
transformed_mapping_df = (
    mapping_df_unified
    .select(["product_id", "genre_id"])
    .withColumn('genre_id', F.explode('genre_id'))
)

def compare(date):
    """Validate one day of genre-level store estimates.

    Two checks are run for ``date``:

    1. Unified-layer check: the app-level estimates (store.app-est.v1) are
       joined to the genre mapping, summed per (device_code, country_code,
       genre_id) and compared with the published genre estimates
       (store.genre-est.v1).
    2. DB check: the published genre estimates are compared with the rows of
       ``aa.genre_store_daily_estimate`` for the same date.

    Both checks are always computed; the unified-layer result takes
    precedence when reporting.

    Args:
        date: Date string substituted into the S3 partition paths and the SQL
            filter (the driver loop passes 'YYYY-MM-DD').

    Returns:
        "<date>: FAIL UNIFIED", "<date>: FAIL DB" or "<date>: PASS".
        A sample of the mismatching rows is printed before a FAIL result is
        returned.
    """
    # collect
    store_est_unified_df = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}/".format(date))
    est_unified_df = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.genre-est.v1/fact/granularity=daily/date={}/".format(date))

    # transform: roll the app-level estimates up to genre level
    transformed_store_est_unified_df = store_est_unified_df.select(
        ["app_id", "device_code", "country_code", "free_app_download", "paid_app_download", "revenue"]
    ).withColumnRenamed("app_id", "product_id")
    giq_df = (
        transformed_store_est_unified_df
        .join(transformed_mapping_df, on="product_id", how="inner")
        .groupBy(["device_code", "country_code", "genre_id"])
        .agg(
            F.sum("free_app_download").alias("free_app_download"),
            F.sum("paid_app_download").alias("paid_app_download"),
            F.sum("revenue").alias("revenue")
        )
    )
    # Missing download counts are treated as 0 so the sum below is defined; a
    # non-positive total is stored as NULL rather than 0.
    giq_df = giq_df.fillna({"free_app_download": 0, "paid_app_download": 0})
    total_download = giq_df.free_app_download + giq_df.paid_app_download
    giq_df = giq_df.withColumn("download", F.when(total_download > 0, total_download).otherwise(F.lit(None)))

    # compare: rows present on exactly one side (symmetric difference)
    compared_columns = ["device_code", "country_code", "genre_id", "download", "revenue"]
    s1 = giq_df.select(compared_columns)
    s2 = est_unified_df.select(compared_columns)
    diff = s1.union(s2).subtract(s1.intersect(s2))

    ##########################
    # DB & UNIFIED LAYER TEST

    sql = """
select device_id, store_id, date, genre_id, modifier_id, download, revenue from plproxy.execute_select_nestloop($proxy$ 
    select device_id, store_id, date, genre_id, modifier_id, download, revenue
    from aa.genre_store_daily_estimate
    where 
        date = '{}'
$proxy$) tbl (device_id SMALLINT, store_id INT, date DATE , genre_id BIGINT, modifier_id BIGINT, download BIGINT, revenue BIGINT);""".format(date)

    db_df = query_df(daily_est_dsn, sql)

    # Translate the DB's numeric ids into the codes used by the unified layer,
    # one store at a time, then align the column names.
    for store in ("google-play", "apple-store"):
        db_df = db_df.replace({"device_id": DEVICE_CODE_MAPPING[store]})
    for store in ("google-play", "apple-store"):
        db_df = db_df.replace({"store_id": COUNTRY_CODE_MAPPING[store]})
    db_df = db_df.rename(columns={"store_id": "country_code", "device_id": "device_code"})

    est_unified_pdf = est_unified_df.toPandas()
    # Every unified row is given modifier_id 100000 before the merge.
    # NOTE(review): presumably the value the DB stores for these rows - confirm.
    est_unified_pdf["modifier_id"] = 100000

    diff_db = _compare_df(est_unified_pdf, db_df)

    # Print the evidence *before* returning; statements placed after the
    # return would never run.
    if diff.count() > 0:
        diff.show(2)
        return "{}: FAIL UNIFIED".format(date)
    if len(diff_db) > 0:
        print(diff_db)
        return "{}: FAIL DB".format(date)
    return "{}: PASS".format(date)
    
    
def get_date_list(start_date, end_date, freq="D"):
    """Return the dates from start_date to end_date as 'YYYY-MM-DD' strings.

    The range is generated by ``pd.date_range`` with the given ``freq``
    (daily by default); each generated timestamp is formatted individually.
    """
    formatted = []
    for stamp in pd.date_range(start=start_date, end=end_date, freq=freq):
        formatted.append(stamp.strftime('%Y-%m-%d'))
    return formatted

def _compare_df(df1, df2):
    for diff_type in ["left", "right"]:
        diff_df = df1.merge(df2, indicator=True, how=diff_type)  # .loc[lambda x : x['_merge']!='both']
        diff_df = diff_df.loc[diff_df["_merge"] != "both"]
        if len(diff_df) != 0:
            print diff_type
            return diff_df
    return []

def write_log(strobj, s3obj):
    """Upload ``str(strobj)`` as the body of ``s3obj``, via its ``put`` method."""
    payload = str(strobj)
    s3obj.put(Body=payload)

# Date range to validate; earlier ranges are kept commented out for reference.
# date_list = get_date_list("2010-07-04/", "2020-02-15/")
# date_list = get_date_list("2010-07-04/", "2011-12-31")
date_list = get_date_list("2020-02-29/", "2020-03-07")


for date in date_list:
    try:
        temp_log = compare(date)
    except Exception as e:
        # One failing date must not stop the run, but the cause has to be
        # visible: print the exception alongside the ERROR entry.
        print("{}: {!r}".format(date, e))
        temp_log = "{}: ERROR".format(date)
    log.append(temp_log)
    print(temp_log)
    # Re-upload the whole log after every date, so the results gathered so
    # far are already in S3 if a later iteration fails or is interrupted.
    write_log(log, s3object)


In [0]:
%%sh

# List what is stored under the `_partition_col=0/` prefix of the
# genre-id/product mapping dimension.
aws s3 ls s3://b2c-prod-data-pipeline-unified-dna/unified/dna.genre_id_product_mapping.v1/dimension/_partition_col=0/

In [0]:


from pyspark.sql import functions as F

# Inspect the (product_id, genre_id) pairs stored in partition 0 of the
# mapping dimension.
# NOTE(review): this rebinds `mapping_df_unified` / `transformed_mapping_df`,
# which compare() reads as globals; re-run the cell that loads the full
# dimension before calling compare() again.
mapping_df_unified = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-dna/unified/dna.genre_id_product_mapping.v1/dimension/_partition_col=0/")
transformed_mapping_df = mapping_df_unified.select(["product_id", "genre_id"]).withColumn('genre_id', F.explode('genre_id'))

# show() prints the rows itself and returns None, so wrapping it in print
# only adds a stray "None" line to the output.
transformed_mapping_df.filter("genre_id=1").show(20000)
#transformed_mapping_df.filter("genre_id=2").show(20000)
#transformed_mapping_df.filter("genre_id=3").show(20000)



In [0]:

%sh

# The database password must come from the environment, never from the
# notebook source: export DAILYEST_PGPASSWORD before starting the kernel.
# The cell aborts with the message below when the variable is unset.
# The password that used to be embedded here should be treated as exposed
# and rotated.
#
# count(*) yields a bigint, so the column definition list declares BIGINT;
# SMALLINT cannot hold a count above 32767.
PGPASSWORD="${DAILYEST_PGPASSWORD:?export DAILYEST_PGPASSWORD before running this cell}" psql -h internal-aa-prod-plproxy-internal-4-329644124.us-east-1.elb.amazonaws.com -U app_bdp_usage_qa -d dailyest -p 7432 << EOF 
select sum(count_a) from plproxy.execute_select_nestloop(\$proxy\$ 
    select count(*) as count_a
    from aa.app_store_daily_estimate
    where
        date = '2019-01-01'
\$proxy\$) tbl (count_a BIGINT);

EOF

In [0]:

%sh

# The database password must come from the environment, never from the
# notebook source: export METADB_PGPASSWORD before starting the kernel.
# The cell aborts with the message below when the variable is unset.
# The password that used to be embedded here should be treated as exposed
# and rotated.
PGPASSWORD="${METADB_PGPASSWORD:?export METADB_PGPASSWORD before running this cell}" psql -h b2b-prod-uds-storage-meta-db-new.crlexxwtzodp.us-east-1.rds.amazonaws.com -U datapipeline_uds_writer -d metadb -p 5432 << EOF 
\d
EOF




In [0]:
%%sh
