# In [0]:

from copy import deepcopy
import time
import pandas as pd
import datetime
import random

# PARAMS
# Verbose logging switch read by log(); leave False for normal runs.
DEBUG = False

# Inclusive date range to check, written as "<begin>,<end>"; whitespace
# around each date is stripped before parsing.
date_time_str = "2016-01-01	,	2016-06-30"

# Split the range once and parse both halves with the same format.
_date_fields = date_time_str.split(",")
begin_date = datetime.datetime.strptime(_date_fields[0].strip(), '%Y-%m-%d')
end_date = datetime.datetime.strptime(_date_fields[1].strip(), '%Y-%m-%d')

# CONSTANTS
# Device id -> device code string. The ids ending in 000 map to the
# per-platform "<platform>-all" codes.
DEVICE_ID_CODE_MAPPING = {
    1000: 'android-all',
    1001: 'android-phone',
    1002: 'android-tablet',
    2000: 'ios-all',
    2001: 'ios-phone',
    2002: 'ios-tablet',
}

# The same mapping with the "-all" entries left out, so only concrete
# phone/tablet ids remain.
DEVICE_ID_CODE_MAPPING_WITHOUT_ALL = {
    device_id: device_code
    for device_id, device_code in DEVICE_ID_CODE_MAPPING.items()
    if not device_code.endswith('-all')
}

# Ad network id -> network name.
AD_PLATFORMS = {
    1: 'ALL',
    101: 'ADMOB',
    109: 'TAPJOY',
    110: 'CHARTBOOST',
    111: 'INMOBI',
    112: 'ADCOLONY',
    114: 'APPLOVIN',
    116: 'VUNGLE',
    121: 'UNITYADS',
    136: 'FACEBOOK',
    142: 'SUPERSONIC',
    171: 'STARTAPP',
    200: 'TWITTER',
    201: 'INSTAGRAM',
    202: 'FBA',
    203: 'YOUTUBE',
    311: 'MOPUB',
    316: 'FYBER',
    324: 'FBCREATIVESONLY',
    325: 'IGCREATIVESONLY',
}

# Two-letter country codes the random sampler draws from.
MKT_COUNTRY_CODE = [
    'CN', 'AU', 'BR', 'CA', 'FI',
    'FR', 'DE', 'HK', 'IN', 'ID',
    'IT', 'JP', 'MX', 'NZ', 'NO',
    'RU', 'KR', 'ES', 'SE', 'TW',
    'TH', 'TR', 'GB', 'US', 'DK',
]

def get_date_list(begin_date, end_date, freq):
    """Return the dates between two bounds as 'YYYY-MM-DD' strings.

    :param begin_date: first date of the range (anything pd.date_range accepts).
    :param end_date: last date of the range.
    :param freq: pandas frequency alias, e.g. "D" for one entry per day.
    :return: list of formatted date strings, in range order.
    """
    timestamps = pd.date_range(start=begin_date, end=end_date, freq=freq)
    return [stamp.strftime('%Y-%m-%d') for stamp in timestamps]

# One 'YYYY-MM-DD' string per day of the configured range ("D" = daily).
DATE_LIST = get_date_list(begin_date, end_date, freq="D")

# UTILS
def log(strlog):
    """Print ``strlog`` to stdout, but only while the module flag DEBUG is on.

    :param strlog: any printable object (callers pass strings, dicts and rows).
    """
    # Only a DEBUG value equal to True enables the output.
    if not (DEBUG == True):
        return
    print(strlog)

def write_to_file(filename, logstr):
    """Echo ``logstr`` to stdout and append it as one line to ``filename``.

    Callers pass plain strings as well as dicts and lists of result rows.
    Concatenating a non-string with "\\n" raises TypeError, so any value
    that is not already a string is rendered with str() before writing.

    :param filename: path of the log file; opened in append mode, so it is
        created on first use and grows across calls.
    :param logstr: message to record; strings are written unchanged.
    """
    print(logstr)
    text = logstr if isinstance(logstr, str) else str(logstr)
    with open(filename, 'a') as file_object:
        file_object.write(text + "\n")

def get_random(data_list):
    """Return one randomly chosen element of ``data_list``.

    random.choice() needs an indexable sequence. Lists and tuples are used
    as they are; any other iterable (dict key views, sets, iterators) is
    copied into a list first so it can be sampled as well.

    :param data_list: non-empty iterable of candidates.
    :return: one element of ``data_list``.
    :raises IndexError: if ``data_list`` is empty (raised by random.choice).
    """
    if not isinstance(data_list, (list, tuple)):
        data_list = list(data_list)
    return random.choice(data_list)


def get_campaign_id_from_unified(campaign_sha):
    """Return the ``campaign_id`` mapped to a raw campaign identifier.

    Reads the campaign-id-mapping parquet dimension, keeps the rows whose
    ``campaign_original_id`` equals ``campaign_sha`` and returns the
    ``campaign_id`` of the first collected row.

    NOTE(review): ``spark`` is not defined in this file; presumably it is
    the SparkSession provided by the notebook runtime.

    :param campaign_sha: raw campaign identifier to look up.
    :raises IndexError: if no mapping row matches.
    """
    log("campaign_sha : {}".format(campaign_sha))
    mapping_path = "s3://b2c-prod-data-pipeline-unified-market/unified/app-tech.market.campaign-id-mapping.v1/dimension/"
    mapping_df = spark.read.parquet(mapping_path)
    condition = "campaign_original_id='{}'".format(campaign_sha)
    matches = mapping_df.filter(condition).collect()
    first_match = matches[0]
    return first_match.campaign_id

def get_creative_id_from_unified(creative_sha):
    """Return the ``creative_id`` mapped to a raw creative hash.

    Reads the creative-id-mapping parquet dimension, keeps the rows whose
    ``creative_md5`` equals ``creative_sha`` and returns the ``creative_id``
    of the first collected row.

    NOTE(review): ``spark`` is not defined in this file; presumably it is
    the SparkSession provided by the notebook runtime.

    :param creative_sha: raw creative hash to look up.
    :raises IndexError: if no mapping row matches.
    """
    log("creative_sha : {}".format(creative_sha))
    mapping_path = "s3://b2c-prod-data-pipeline-unified-market/unified/app-tech.market.creative-id-mapping.v1/dimension/"
    mapping_df = spark.read.parquet(mapping_path)
    condition = "creative_md5='{}'".format(creative_sha)
    matches = mapping_df.filter(condition).collect()
    first_match = matches[0]
    return first_match.creative_id

def get_platform_device_type_from_device_id(device_id):
    """Split a numeric device id into its platform and device-type parts.

    The platform is the id divided by 1000 (truncated to an int) and the
    device type is the id modulo 100, e.g. 2001 -> (2, 1).

    :param device_id: numeric id such as 1001 or 2002.
    :return: tuple ``(platform, device_type)``.
    """
    device_type = device_id % 100
    platform = int(device_id / 1000)
    return (platform, device_type)

def get_random_filter_dict_from_raw(s3_mart, date):
    """Pick one raw row for ``date`` and describe it as a filter dict.

    Up to MAX_STEP attempts are made. Each attempt draws a random
    device / network / country combination and asks the raw parquet
    partition for one matching row. The final attempt drops the random
    filter and accepts any row of the partition, which is reported through
    ``random_flag`` being False.

    Failures never escape: a Spark error (for example a missing partition)
    or an unreadable row is reported on stdout and the next attempt is made.

    NOTE(review): ``spark`` is not defined in this file; presumably it is
    the SparkSession provided by the notebook runtime.

    :param s3_mart: unused by this function; kept for caller compatibility.
    :param date: partition date as a 'YYYY-MM-DD' string.
    :return: ``(filter_dict, random_flag)``. ``filter_dict`` holds
        device_id, network_id, platform, device_type, country, date,
        creative_id_sha, ad_app_id, campaign_id_sha and pub_app_id taken
        from the selected row, or is None when no attempt produced a row.
        ``random_flag`` is True when the row came from a random filter.
    """
    MAX_STEP = 10
    # The partition path depends only on the date, so build it once.
    s3_path = "s3://aardvark-prod-dca-data/fact/MKT_META_DATA_NEW/version=2.0.0/date={date}/".format(date=date)

    for try_steps in range(1, MAX_STEP + 1):
        # The last attempt is the fallback that accepts any row.
        random_flag = try_steps != MAX_STEP

        # list(...) keeps the key collections indexable for random sampling.
        device_id = get_random(list(DEVICE_ID_CODE_MAPPING_WITHOUT_ALL.keys()))
        platform, device_type = get_platform_device_type_from_device_id(device_id)
        filter_dict = {
            "device_id": device_id,
            "network_id": get_random(list(AD_PLATFORMS.keys())),
            "platform": platform,
            "device_type": device_type,
            "country": get_random(MKT_COUNTRY_CODE),
            "date": date
        }
        log(filter_dict)

        if random_flag:
            filter_one_sql = "country='{country}' AND platform='{platform}' AND device_type='{device_type}' AND network_id='{network_id}'".format(
                **filter_dict)
        else:
            filter_one_sql = "True"

        try:
            log("{}  {}".format(s3_path, filter_one_sql))
            rows = spark.read.parquet(s3_path).filter(filter_one_sql).limit(1).collect()
            if not rows:
                # Expected for many random combinations; just try another one.
                log("no raw row matched: {}".format(filter_one_sql))
                continue
            row = rows[0]
            log(row)

            # Replace the random guesses with the values of the real row.
            filter_dict["country"] = row['country']
            filter_dict["platform"] = row['platform']
            filter_dict["device_type"] = row['device_type']
            filter_dict["network_id"] = row['network_id']
            # Rebuild the device id as "<platform>00<device_type>".
            filter_dict["device_id"] = int("{}00{}".format(row['platform'], row['device_type']))

            filter_dict["creative_id_sha"] = row['creative_id']
            filter_dict["ad_app_id"] = row['advertiser_app_id']
            filter_dict["campaign_id_sha"] = row['campaign_id']
            filter_dict["pub_app_id"] = row['publisher_app_id']
            return filter_dict, random_flag
        except Exception as e:
            # Best effort: say what went wrong, then try the next attempt.
            print("raw lookup attempt {}/{} failed for {}: {!r}".format(try_steps, MAX_STEP, date, e))
            continue
    # Every attempt, including the unfiltered fallback, came back empty.
    return None, False


# CODE MAIN
# Running totals over every check performed in this run.
pass_count = 0
fail_count = 0

# NOTE(review): timestamp is not used by the code visible here; the log
# file is named after the first date of the range instead.
timestamp = time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime(time.time()))
log_filename = '/tmp/db_log_{}.log'.format(DATE_LIST[0])

for date in DATE_LIST:
    # At most MAX_STEP sampled raw rows are verified per date.
    MAX_STEP = 10
    for i in range(1, MAX_STEP+1):
        time_start = time.time()
        raw_mart_name = "ad_app_pub_app_estimate"
        filter_dict, random_flag = get_random_filter_dict_from_raw(raw_mart_name, date)
        if filter_dict is None:
            print("NO RAW DATA for {}".format(date))
            break
        log(filter_dict)

        # Translate the raw-side values into the unified table's column
        # names. Every source key is always present in a non-None
        # filter_dict, so the values are assigned directly.
        filter_dict["device_code"] = DEVICE_ID_CODE_MAPPING[filter_dict["device_id"]]
        filter_dict["country_code"] = filter_dict["country"]
        filter_dict["ad_platform_id"] = filter_dict["network_id"]
        filter_dict["creative_id"] = get_creative_id_from_unified(filter_dict["creative_id_sha"])
        # A raw row without a campaign is looked up as campaign_id -1.
        filter_dict["campaign_id"] = (
            get_campaign_id_from_unified(filter_dict["campaign_id_sha"])
            if filter_dict["campaign_id_sha"] else -1
        )

        log(filter_dict)
        s3_path = "s3://b2c-prod-data-pipeline-unified-market/unified/app-tech.market.creative-log.v1/fact/date={date}/device_code={device_code}/".format(
            **filter_dict)
        filter_sql = "ad_platform_id='{ad_platform_id}' AND pub_app_id='{pub_app_id}' AND ad_app_id='{ad_app_id}' " \
                     "AND country_code='{country_code}' AND creative_id={creative_id} AND campaign_id='{campaign_id}' ".format(
            **filter_dict)

        df_unified = spark.read.parquet(s3_path).filter(filter_sql).collect()
        log(df_unified)
        time_end = time.time()

        time_cost = int(time_end-time_start)
        # The check passes only when exactly one unified row matches.
        if len(df_unified) == 1:
            pass_count += 1
            logstr = "PASS P:{} F:{} T:{} D:{} R:{}".format(pass_count, fail_count, time_cost, date, random_flag)
            write_to_file(log_filename, logstr)
        else:
            fail_count += 1
            logstr = "FAIL P:{} F:{} T:{} D:{} R:{}".format(pass_count, fail_count, time_cost, date, random_flag)
            write_to_file(log_filename, logstr)
            # write_to_file appends "\n" to its argument, so the dict and
            # the row list are converted to text before being handed over.
            write_to_file(log_filename, str(filter_dict))
            write_to_file(log_filename, str(df_unified))
        if not random_flag:
            # The sampler fell back to an unfiltered row, so further
            # samples for this date would not be random either.
            print("NOT RANDOM for {}".format(date))
            break

