In [0]:


# Copyright (c) 2019 App Annie Inc. All rights reserved.

"""
Description:
Test cases for ios app store rank
"""
import datetime
import random
import unittest

import zlib

from aaintdatapipeline.application.app_qa.data_validation_v1.pysparktest import PySparkTest
from aaintdatapipeline.application.app_qa.common.utils import get_date_list
from aaintdatapipeline.application.app_qa.data_validation_v1.constants import IOS_PHONE, IOS_TABLET

from aaintdatapipeline.application.app_qa.data_validation_v1.utils import (
    get_random_ios_feed_id, get_random_date, get_unified_data_path,
    get_ios_device_code_from_feed_id, get_ios_unified_category_id, get_ios_feed_name_from_id, check_parquet_exist,
    get_app_store_logic_path, datetime_to_string, get_ios_country_id_from_code, rank_bucket)
from aaintdatapipeline.core.fs.device import unified_bucket

# constant
from aaintdatapipeline.core.fs.router import UnifiedRouter
from aaintdatapipeline.core.log import logger
from aaintdatapipeline.core.urn import Urn

# Device code of the unified data validated in this cell. NOTE(review): the
# constant is called MAC but holds the tv-os device code - confirm the name.
MAC = "tv-os-tv"

# URN passed to get_unified_data_path() to locate the unified rank fact data.
APP_STORE_RANK_URN = Urn(namespace='app-tech.store.app-rank.v2', data_type='fact')

# Raw feed id -> metric column name. Not referenced by the code visible in this cell.
FEED_ID_MAPPING_TO_COLUMN_NAME_TV = dict(enumerate(
    ("free_app_download", "paid_app_download", "revenue")))

# Raw feed id -> column name looked up by get_tv_feed_name_from_id().
FEED_ID_TO_NAME_TV = dict(enumerate(
    ("free_download", "paid_download", "revenue")))

# Per device code: raw category id -> unified category id. The unified ids are
# consecutive, starting at 300000, in the order the raw ids are listed.
CATEGORY_MAPPING = {
    "tv-os-tv": dict(zip(
        (36, 360, 6004, 6009, 6012, 6013, 6014, 6016, 6017),
        range(300000, 300009))),
}


# utils
def get_tv_feed_name_from_id(key):
    """
    Translate a raw feed id into the feed (column) name of the unified data.

    :param key: feed id; anything ``int()`` accepts
    :return: the name registered in FEED_ID_TO_NAME_TV, or - when the id is not
        registered - a descriptive message string (no exception is raised)
    :rtype: str
    """
    feed_id = int(key)
    if feed_id in FEED_ID_TO_NAME_TV:
        return FEED_ID_TO_NAME_TV[feed_id]
    return "Can't find tv feed name mapping from feed_id:{}.".format(key)


def get_tv_unified_category_id(key):
    """
    Translate a raw category id into the unified ``tv-os-tv`` category id.

    :param key: raw category id; anything ``int()`` accepts
    :return: the unified id registered in CATEGORY_MAPPING['tv-os-tv'], or - when
        the id is not registered - a descriptive message string (no exception is
        raised, so callers that embed the result in a filter will fail later)
    """
    # The fallback text used to say "ios", a leftover from the helper this one was
    # copied from; it now names the tv mapping like get_tv_feed_name_from_id does.
    return CATEGORY_MAPPING['tv-os-tv'].get(
        int(key),
        "Can't find tv unified category id mapping from category_id:{}.".format(key))


def get_app_store_rank_tv_raw_df(date, country_code):
    """
    Load the raw hour-23 rank rows of one date and country from the
    ``prod_appannie_appletv`` rank bucket.

    Every object listed under ``country-ranks/<date>/23/<country_code>`` is
    zlib-decompressed and read as comma-separated lines. Keys containing 'md5'
    or 'SUCCESS' are skipped. Lines that do not have exactly five fields are
    reported through ``logger`` and dropped.

    :param date: date string used to build the bucket path
    :param country_code: country path component used to build the bucket path
    :return: one dict per accepted line with the keys ``date``, ``country_code``,
        ``category_id``, ``feed_id`` (all strings, as read) and ``rank``. ``rank``
        is a list holding, for each space-separated token of the fifth field,
        the second '-'-separated piece of that token. An empty fifth field
        yields ``['']``.
    :rtype: list_of_dic

    NOTE(review): the second field is stored under the key ``country_code``
    although the sample below shows a numeric country id - confirm which it is.

    raw_data:
    _________________________________________________________________________________
    |    date    |   country_id   |  category_id  |   feed_id   |   rank (app_id)   |
    |------------|----------------|---------------|-------------|-------------------|
    | 2019-04-27 | 143441(bigint) |   6016 (int)  |   0 (int)   | 376510438(bigint) |
    ---------------------------------------------------------------------------------
    unified_data:
    _____________________________________________________________________________________
    |  country_code  |   category_id   |       app_id       | feed_name (free_download) |
    |----------------|-----------------|--------------------|---------------------------|
    |      'US'      | 100026 (bigint) | 376510438 (bigint) |    25 (int) (app_rank)    |
    -------------------------------------------------------------------------------------
    """
    path = "country-ranks/{_date}/23/{_country_code}".format(_date=date, _country_code=country_code)
    bucket = rank_bucket("prod_appannie_appletv")
    raw_tv_df = []
    for key in bucket.list(path):
        if 'md5' in key or 'SUCCESS' in key:
            continue
        raw_data = zlib.decompress(bucket.get(key))
        for _data in raw_data.splitlines():
            data = _data.split(",")
            if len(data) != 5:
                # NOTE(review): logger.log() gets only a message here; if this is a
                # stdlib logging.Logger it also needs a level - confirm.
                logger.log("Error: The length of raw data is wrong, length is {}.".format(len(data)))
                continue
            rank_field = data[4]
            if rank_field:
                rank = [name.split("-")[1] for name in rank_field.split(" ")]
            else:
                # "".split("-")[1] would raise IndexError. Keep the [''] shape that
                # test_tv_rank_completeness checks for ("rank[0]" falsy => 0 apps).
                rank = ['']
            raw_tv_df.append({
                'date': data[0],
                'country_code': data[1],
                'category_id': data[2],
                'feed_id': data[3],
                'rank': rank
            })
    return raw_tv_df


# cases
class TestTVRank(PySparkTest):
    """
    Compare raw TV store rank rows with the unified ``tv-os-tv`` rank data.

    The Spark session is not created here: call ``setup_spark(spark)`` before
    ``setUp()``, because ``prepare_data()`` reads parquet through ``self.spark``.
    """
    start_date = '2019-11-12'
    end_date = '2019-11-12'  # datetime_to_string(datetime.date.today() + datetime.timedelta(-2))
    # Picked once, when the class body is executed, and shared by all test methods.
    date = get_random_date(start_date, end_date)
    country_code = 'US'
    # Number of random raw rows sampled by each test method.
    test_times = 10000

    def setup_spark(self, spark):
        """Store the Spark session used by prepare_data()."""
        self.spark = spark

    def setUp(self):
        """Run the parent set-up, then load raw and unified data."""
        super(TestTVRank, self).setUp()
        self.prepare_data()

    def prepare_data(self):
        """Load the raw rows and the unified parquet data for ``self.date``."""
        self.raw_data = get_app_store_rank_tv_raw_df(self.date, self.country_code)
        self.unified_data = {
            MAC: self.spark.read.parquet(get_unified_data_path(APP_STORE_RANK_URN, self.date, MAC)),
        }

    def test_tv_rank_completeness(self):
        """
        For random raw rows, the number of ranked apps must equal the number of
        unified rows of the same country/category whose feed column is not null.
        Without raw data, the unified data must be empty for the country.
        """
        if not self.raw_data:
            self.check_no_unified_data()
            return
        for _ in range(self.test_times):
            raw_index = random.randint(0, len(self.raw_data) - 1)
            raw_sample = self.raw_data[raw_index]
            unified_device_code = MAC
            unified_category_id = get_tv_unified_category_id(raw_sample['category_id'])
            unified_column_name = get_tv_feed_name_from_id(raw_sample['feed_id'])
            # A rank list whose first entry is empty counts as "no ranked apps".
            raw_count = len(raw_sample['rank']) if raw_sample['rank'][0] else 0
            unified_count = self.unified_data[unified_device_code].filter(
                "country_code='{}' and category_id={} and {} is not null".format(
                    self.country_code, unified_category_id, unified_column_name)).count()
            msg = "{}; {}; raw index = {}; raw count = {}; unified count = {}.".format(
                raw_sample, self.unified_data, raw_index, raw_count, unified_count)
            print(msg)
            self.assertEqual(raw_count, unified_count, msg)

    def test_tv_rank_accuracy(self):
        """
        For random raw rows, five random positions of the rank list are checked:
        the unified rank stored for that app must equal the 1-based position of
        the app in the raw list. Without raw data, the unified data must be
        empty for the country.
        """
        # The empty-data check used to sit in an ``else`` attached to the ``for``
        # loop, so it ran after every successful loop and never when the raw data
        # was empty. It is now the guard it was meant to be.
        if not self.raw_data:
            self.check_no_unified_data()
            return
        for _ in range(self.test_times):
            raw_index = random.randint(0, len(self.raw_data) - 1)
            print(raw_index)
            raw_sample = self.raw_data[raw_index]
            unified_device_code = MAC
            unified_category_id = get_tv_unified_category_id(raw_sample['category_id'])
            unified_column_name = get_tv_feed_name_from_id(raw_sample['feed_id'])
            _unified_data = self.unified_data[unified_device_code].filter(
                "country_code='{}' and category_id={}".format(self.country_code, unified_category_id)
            )
            for _attempt in range(5):
                # get raw data rank sample
                raw_rank = random.randint(0, len(raw_sample['rank']) - 1)
                print(raw_rank)
                # get unified_app_id
                unified_app_id = long(raw_sample['rank'][raw_rank])
                # get unified data rank
                unified_rank = int(_unified_data.filter("app_id={}".format(
                    unified_app_id)).collect()[0][unified_column_name])
                # positions are 0-based, ranks are 1-based
                expect_raw_rank = raw_rank + 1
                msg = "{}; {}; {}; {}; app_id = {}; raw index = {}; raw rank = {}; unified rank = {}.".format(
                        self.date, unified_device_code, self.country_code, unified_category_id, unified_app_id,
                        raw_index, expect_raw_rank, unified_rank)
                print(msg)
                self.assertEqual(expect_raw_rank, unified_rank, msg)

    def check_no_unified_data(self):
        """Assert that the unified data holds no row for ``self.country_code``."""
        self.assertEqual(
            self.unified_data[MAC].filter("country_code='{}'".format(self.country_code)).count(), 0,
            msg="Raw data is null, while unified data is not null at {} in {} on {}.".format(
                self.date, self.country_code, MAC))

# Drive the accuracy case by hand instead of through a unittest runner: the
# `spark` object has to be injected before setUp() reads parquet with it.
abc = TestTVRank("test_tv_rank_accuracy")
abc.setup_spark(spark)
abc.setUp()
# abc.test_tv_rank_completeness()
abc.test_tv_rank_accuracy()
# Only reached when no assertion above failed.
print('pass')

# def main(spark, params):
#     assert spark, params
#     suite = unittest.TestSuite()
#     suite.addTest(TestTVRank('test_tv_rank_completeness'))
#     log_file = "/tmp/test.log"
#     with open(log_file, "w") as f:
#         runner = unittest.TextTestRunner(f)
#         runner.run(suite)



In [0]:


import psycopg2
import datetime
import pandas as pd

import os

# The database password is no longer stored in the notebook. It is read from the
# PGPASSWORD environment variable (an empty password is used when it is unset).
# Backslashes and single quotes are escaped so the value cannot break out of the
# quoted field of the connection string.
_citus_password = os.environ.get("PGPASSWORD", "").replace("\\", "\\\\").replace("'", "\\'")

# Connection string handed to query().
citus_dsn_ = (
    "dbname='{db}' user='{user}' password='{password}' "
    "host='{host}' port='{port}'".format(
        db="aa_citus_db",
        user="citus_bdp_usage_qa",
        host="10.2.10.132",
        password=_citus_password,
        port=5432
    )
)

def query(dsn, sql):
    """
    Run one SQL statement on a fresh connection and return all of its rows.

    :param dsn: connection string passed to ``psycopg2.connect``
    :param sql: statement to execute; it must produce a result set
    :return: the rows returned by ``cursor.fetchall()``
    """
    conn = psycopg2.connect(dsn)
    try:
        # Autocommit makes an explicit commit() unnecessary.
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute(sql)
            return cur.fetchall()
    finally:
        # psycopg2's ``with connection:`` only ends the transaction; it does not
        # close the connection. This helper is called once per date and metric,
        # so the connection is closed explicitly to avoid leaking one per call.
        conn.close()

def get_date_list(begin_date, end_date, freq):
    """
    Format the timestamps pandas generates between two dates.

    :param begin_date: start passed to ``pd.date_range``
    :param end_date: end passed to ``pd.date_range``
    :param freq: pandas frequency string, e.g. "D"
    :return: the generated timestamps as 'YYYY-MM-DD' strings, in order
    :rtype: list
    """
    date_list = []
    for timestamp in pd.date_range(start=begin_date, end=end_date, freq=freq):
        date_list.append(timestamp.strftime('%Y-%m-%d'))
    return date_list

# One 'YYYY-MM-DD' string per day from 2019-01-01 to 2019-11-17.
begin_date = datetime.datetime(2019, 1, 1)
end_date = datetime.datetime(2019, 11, 17)
DATE_LIST = get_date_list(begin_date, end_date, "D")

# Metric columns to check; the remaining candidates are listed in the trailing comment.
metrics = ["free_download"] #  "new_paid_download", "revenue", "paid_download", "new_free_download"

# Header row of the tab-separated output printed by the loop below.
# print "%table \tdate\tcount"
print("%table {}\t{}".format("date", "count"))

# For every date, count the (device, country, category) groups in which the
# number of non-null values of a metric differs from the largest value of that
# metric. Presumably ranks are expected to run 1..N without gaps, so a mismatch
# marks a suspicious group - confirm with the data owner.
for date_str in DATE_LIST:
    date_count = {}
    for metric in metrics:
        sql = 'SELECT date, device_code, country_code, category_id, count({metric}),max({metric}) FROM store.store_app_rank_fact_v1  WHERE date = \'{date_str}\' AND ( granularity = \'hourly\' ) AND ( hour = 23 ) AND ({metric} is not null)  GROUP BY country_code, date, device_code, country_code, category_id ORDER BY date ASC'.format(
            metric=metric, date_str=date_str)

        test_result = query(citus_dsn_, sql)
        date_count[metric] = 0
        for result in test_result:
            # Result columns: date, device_code, country_code, category_id, count, max.
            # Only the last two are needed. They are read by index so that the
            # notebook-global built-in ``max`` is no longer overwritten.
            value_count, value_max = result[4], result[5]
            if value_count != 0 and value_count != value_max:
                date_count[metric] += 1
    total_count = sum(date_count.values())
    print("\t".join([date_str, str(total_count)]))

print("end")

In [0]:


import psycopg2
import datetime
import pandas as pd

import os

# The database password is no longer stored in the notebook. It is read from the
# PGPASSWORD environment variable (an empty password is used when it is unset).
# Backslashes and single quotes are escaped so the value cannot break out of the
# quoted field of the connection string.
_citus_password = os.environ.get("PGPASSWORD", "").replace("\\", "\\\\").replace("'", "\\'")

# Connection string handed to query().
citus_dsn_ = (
    "dbname='{db}' user='{user}' password='{password}' "
    "host='{host}' port='{port}'".format(
        db="aa_citus_db",
        user="citus_bdp_usage_qa",
        host="10.2.10.132",
        password=_citus_password,
        port=5432
    )
)

def query(dsn, sql):
    """
    Run one SQL statement on a fresh connection and return all of its rows.

    :param dsn: connection string passed to ``psycopg2.connect``
    :param sql: statement to execute; it must produce a result set
    :return: the rows returned by ``cursor.fetchall()``
    """
    conn = psycopg2.connect(dsn)
    try:
        # Autocommit makes an explicit commit() unnecessary.
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute(sql)
            return cur.fetchall()
    finally:
        # psycopg2's ``with connection:`` only ends the transaction; it does not
        # close the connection. This helper is called once per date and metric,
        # so the connection is closed explicitly to avoid leaking one per call.
        conn.close()

def get_date_list(begin_date, end_date, freq):
    """
    Format the timestamps pandas generates between two dates.

    :param begin_date: start passed to ``pd.date_range``
    :param end_date: end passed to ``pd.date_range``
    :param freq: pandas frequency string, e.g. "D"
    :return: the generated timestamps as 'YYYY-MM-DD' strings, in order
    :rtype: list
    """
    timestamps = pd.date_range(start=begin_date, end=end_date, freq=freq)
    formatted = []
    for timestamp in timestamps:
        formatted.append(timestamp.strftime('%Y-%m-%d'))
    return formatted

# Daily range 2014-10-15 .. 2014-12-17. The DATE_LIST built here is replaced by
# an explicit list of dates further down in this cell.
begin_date = datetime.datetime(2014, 10, 15)
end_date = datetime.datetime(2014, 12, 17)
DATE_LIST = get_date_list(begin_date, end_date, "D")

# Metric columns checked for every date.
metrics = ["free_download", "new_paid_download", "revenue", "paid_download", "new_free_download"]
# Device codes reported for every date, in output order.
device_list = ["ios-phone","ios-tablet","android-all","tv-os-tv","mac-os-mac"]

# Header row of the tab-separated output printed by the loop below.
print("%table {}\t{}\t{}".format("date","device","count"))

# Explicit list of dates to check. This assignment replaces the DATE_LIST that
# was computed from begin_date/end_date earlier in this cell.
DATE_LIST = ["2014-02-09", "2014-02-13", "2014-02-14", "2014-03-04", "2014-03-06", "2014-03-09", "2014-03-11", "2014-03-12", "2014-03-18", "2014-03-19", "2014-04-02", "2014-04-10", "2014-05-01", "2014-05-31", "2014-06-01", "2014-06-02", "2014-06-03", "2014-06-04", "2014-06-05", "2014-06-06", "2014-06-07", "2014-06-08", "2014-06-09", "2014-06-10", "2014-06-11", "2014-06-17", "2014-06-18", "2014-06-19", "2014-06-20", "2014-07-31", "2014-08-01", "2014-08-02", "2014-08-03", "2014-08-04", "2014-08-06", "2014-08-07", "2014-08-08", "2014-08-09", "2014-08-10", "2014-08-11", "2014-08-12", "2014-08-13", "2014-08-21", "2014-08-28", "2014-09-09", "2014-09-12", "2014-10-02", "2014-10-05", "2014-10-14", "2014-10-16", "2014-10-17", "2014-10-18", "2014-10-19", "2014-10-20", "2014-10-23", "2014-10-24", "2014-10-25", "2014-10-26", "2014-10-27", "2014-10-28", "2014-10-29", "2014-10-30", "2014-10-31", "2014-11-01", "2014-11-02", "2014-11-03", "2014-11-04", "2014-11-05", "2014-11-06", "2014-11-07", "2014-11-08", "2014-11-09", "2014-11-10", "2014-11-11", "2014-11-12", "2014-11-13", "2014-11-14", "2014-11-15", "2014-11-16", "2014-11-17", "2014-11-18", "2014-11-19", "2014-11-20", "2014-11-21", "2014-11-22", "2014-11-23", "2014-11-24", "2014-11-25", "2014-11-26", "2014-11-27", "2014-11-28", "2014-11-29", "2014-11-30", "2014-12-01", "2014-12-02", "2014-12-03", "2014-12-04", "2014-12-05", "2014-12-06", "2014-12-07", "2014-12-08", "2014-12-09", "2014-12-10", "2014-12-11", "2014-12-12", "2014-12-13", "2014-12-14", "2014-12-15", "2014-12-16", "2014-12-27", "2014-12-28", "2015-01-06", "2015-01-07", "2015-01-14", "2015-01-15", "2015-02-05", "2015-02-24", "2015-02-25", "2015-03-05", "2015-03-06", "2015-03-09", "2015-03-11", "2015-03-16", "2015-03-20", "2015-03-25", "2015-03-26", "2015-04-03", "2015-04-04", "2015-04-05", "2015-04-08", "2015-04-12", "2015-04-13", "2015-04-16", "2015-04-17", "2015-04-18", "2015-04-19", "2015-04-21", "2015-04-22", "2015-05-04", "2015-05-06", "2015-05-15", 
"2015-05-26", "2015-06-02", "2015-06-16", "2015-06-17", "2015-06-24", "2016-04-19", "2016-04-24", "2016-05-29", "2016-10-04", "2016-10-11", "2016-10-19", "2016-11-30", "2019-06-10", "2019-06-26", "2019-07-01", "2019-07-05", "2019-07-22", "2019-07-24", "2019-07-29", "2019-08-23", "2019-08-24", "2019-09-05", "2019-09-06", "2019-09-07", "2019-09-08", "2019-09-09", "2019-09-10", "2019-09-11", "2019-09-12", "2019-09-13", "2019-09-14", "2019-09-15", "2019-09-16", "2019-09-17", "2019-09-18", "2019-09-19", "2019-09-22", "2019-09-23", "2019-09-24", "2019-09-25", "2019-09-26", "2019-09-27", "2019-09-28", "2019-09-29", "2019-09-30", "2019-10-01", "2019-10-02", "2019-10-03", "2019-10-04", "2019-10-05", "2019-10-06", "2019-10-07", "2019-10-09", "2019-10-10", "2019-10-11", "2019-10-12", "2019-10-13", "2019-10-16", "2019-10-19", "2019-10-22", "2019-10-23", "2019-10-24", "2019-10-25", "2019-10-26", "2019-10-27", "2019-10-28", "2019-10-29", "2019-10-30", "2019-10-31", "2019-11-01", "2019-11-02", "2019-11-03"]

# For every date, count per device the (country, category, metric) groups in
# which the number of non-null values of a metric differs from the largest value
# of that metric. Presumably ranks are expected to run 1..N without gaps, so a
# mismatch marks a suspicious group - confirm with the data owner.
for date_str in DATE_LIST:
    # One counter per reported device, all starting at zero.
    date_count = dict.fromkeys(device_list, 0)
    for metric in metrics:
        sql = 'SELECT date, device_code, country_code, category_id, count({metric}),max({metric}) FROM store.store_app_rank_fact_v1  WHERE date = \'{date_str}\' AND ( granularity = \'hourly\' ) AND ( hour = 23 ) AND ({metric} is not null)  GROUP BY country_code, date, device_code, country_code, category_id ORDER BY date ASC'.format(
            metric=metric, date_str=date_str)

        test_result = query(citus_dsn_, sql)
        for result in test_result:
            # Result columns: date, device_code, country_code, category_id, count, max.
            # Fields are read by index so that the notebook-global built-in
            # ``max`` is no longer overwritten.
            row_device, value_count, value_max = result[1], result[4], result[5]
            if value_count != 0 and value_count != value_max:
                # A device code missing from device_list still raises KeyError,
                # as before, so unexpected devices are not silently ignored.
                date_count[row_device] += 1
    for key in device_list:
        print("\t".join([date_str, key, str(date_count[key])]))

print("end")

In [0]:

from aaintdatapipeline.core.conf import Conf
from aaintdatapipeline.core.fs.device import S3Bucket, specified_bucket
from aaintdatapipeline.core.fs.device import unified_bucket
import zlib

# Dump the rank list of every MUSIC_AND_AUDIO row of one raw Android file.
bucket_name = "prod_appannie_android"
# NOTE(review): `s3` is not used again in this cell - confirm it can go.
s3 = S3Bucket(Conf(bucket_name = bucket_name))
conf = Conf(bucket_name=bucket_name, bucket_class=S3Bucket)
path = "country-ranks/{_date}/23/{_country_code}".format(_date='2019-09-16', _country_code="CN")
bucket = specified_bucket(conf)
# `path` evaluates to exactly the key that was spelled out here before.
raw_data = zlib.decompress(bucket.get(path))

for _data in raw_data.splitlines():
    if "MUSIC_AND_AUDIO" not in _data:
        continue
    _data_list = _data.split(",")
    # The fifth comma-separated field holds the space-separated rank entries.
    _rank_list = _data_list[4].split(" ")
    print(_data_list[:3])
    print(len(_rank_list))
    for _rank in _rank_list:
        print(_rank)






In [0]:


date = "2019-09-16"
device_code = "android-all"

# Show app_id/free_download of one country and category from the unified
# parquet data, ordered by free_download.
# - show() prints the table itself and returns None, so wrapping it in `print`
#   only added a stray "None" line.
# - The filter runs before the projection because it uses columns
#   (country_code, category_id) that the projection drops.
(spark.read
    .parquet("s3://b2c-prod-data-pipeline-unified-store-free/unified/app-tech.store.app-rank.v1/fact/date={}/device_code={}/".format(date,device_code))
    .filter("country_code='CN' AND category_id='400047'")
    .select(["app_id","free_download"])
    .orderBy("free_download")
    .show(1000))


In [0]:
%%sh
#2019-09-16,android-all,CN,400047,free_download,638,1307

# The password is no longer stored in the notebook. psql reads it from the
# PGPASSWORD environment variable, which must be exported before this cell is
# run; the line below stops the cell with a message when it is missing.
: "${PGPASSWORD:?PGPASSWORD must be exported before running this cell}"
psql -h 10.2.10.132 -U citus_bdp_usage_qa -d aa_citus_db -p 5432 << EOF
SELECT app_id, free_download FROM store.store_app_rank_fact_v1  WHERE (date='2019-09-16') AND category_id=400047 AND country_code='CN' AND ( granularity = 'hourly' ) AND ( hour = 23 ) ORDER BY free_download ASC;
EOF


In [0]:

from aaintdatapipeline.core.conf import Conf
from aaintdatapipeline.core.fs.device import S3Bucket, specified_bucket
from aaintdatapipeline.core.fs.device import unified_bucket
import zlib

# Print the raw iOS rank row of one date / country / category / feed.
bucket_name = "prod_appannie_ios"
# NOTE(review): `s3` is not used again in this cell - confirm it can go.
s3 = S3Bucket(Conf(bucket_name = bucket_name))
conf = Conf(
        bucket_name=bucket_name,
        bucket_class=S3Bucket
)
# `path` used to be built with the country code "CN" and then ignored, while
# the fetch spelled out a different key ending in 143465. It is now built from
# the value that is actually fetched and is passed to bucket.get().
# NOTE(review): presumably 143465 is the numeric id of the country queried as
# 'CN' elsewhere in this notebook - confirm.
path = "country-ranks/{_date}/23/{_country_code}".format(_date='2014-11-01', _country_code="143465")
bucket = specified_bucket(conf)
raw_data = zlib.decompress(bucket.get(path))

for _data in raw_data.splitlines():
    # Tab-separated fields: date, country id, category id, feed id, rank list.
    _data_list = _data.split(("\t"))
    _date = _data_list[0]
    _category_id = _data_list[2]
    _country_id = _data_list[1]
    _feed_id = _data_list[3]
    _rank_list = _data_list[4].split(" ")
    # print _data_list
    if _date=="2014-11-01" and _category_id=='36' and _feed_id=='0' and _country_id=='143465':
        print(_data)
    # print _data_list[:3]
    # print len(_rank_list)
    # for _rank in _rank_list:
    #     print _rank




In [0]:
%%sh
#2019-09-16,android-all,CN,400047,free_download,638,1307

# The password is no longer stored in the notebook. psql reads it from the
# PGPASSWORD environment variable, which must be exported before this cell is
# run; the line below stops the cell with a message when it is missing.
: "${PGPASSWORD:?PGPASSWORD must be exported before running this cell}"
psql -h 10.2.10.132 -U citus_bdp_usage_qa -d aa_citus_db -p 5432 << EOF
SELECT count(*) FROM store.store_app_rank_fact_v1  WHERE (date='2014-11-01') AND category_id=100000 AND country_code='CN' AND ( granularity = 'hourly' ) AND ( hour = 23 ) AND (device_code='ios-phone') AND (free_download is not null);
SELECT * FROM store.store_app_rank_fact_v1  WHERE (date='2014-11-03') AND (category_id=100000) AND (country_code='CN') AND ( granularity = 'hourly' ) AND ( hour = 23 ) AND (device_code='ios-phone') AND (free_download is not null) ORDER BY free_download ASC;
EOF


In [0]:


date = "2014-11-03"
device_code = "ios-phone"

# Show the non-null free_download values of one country and category from the
# unified parquet data, ordered by free_download.
# - show() prints the table itself and returns None, so wrapping it in `print`
#   only added a stray "None" line.
# - The filter runs before the projection because it uses columns
#   (country_code, category_id) that the projection drops.
(spark.read
    .parquet("s3://b2c-prod-data-pipeline-unified-store-free/unified/app-tech.store.app-rank.v1/fact/date={}/device_code={}/".format(date,device_code))
    .filter("country_code='CN' AND category_id='100000' AND free_download is not null")
    .select(["app_id","free_download"])
    .orderBy("free_download")
    .show(5000))


In [0]:
%%sh

# aws s3 ls s3://prod_appannie_android/country-ranks/

# Root prefix of the unified app-rank fact data; both listings start from it.
FACT_ROOT="s3://b2c-prod-data-pipeline-unified-store-free/unified/app-tech.store.app-rank.v1/fact/"

# Contents of one date/device partition, then the entries directly under the root.
aws s3 ls "${FACT_ROOT}date=2019-09-16/device_code=android-all/"
aws s3 ls "${FACT_ROOT}"


In [0]:
%%sh

# Earlier variants of the same check, kept for reference:
# aws s3 ls s3://prod_appannie_ios/country-ranks/2014-11-12/23/ | grep -- 'US\|143441'
# aws s3 ls s3://prod_appannie_android/country-ranks/2014-02-14/23/ | awk '{print $4}' | grep -v -E '^[a-zA-Z0-9]{2}$'
# aws s3 ls --recursive s3://prod_appannie_android/country-ranks/ | awk '{print $4}' | grep -v -E '\/[a-zA-Z]{2}$'
# Recursively list the iOS country-ranks prefix, keep the fourth whitespace-
# separated column of every line (presumably the object key - confirm) and print
# only the entries that do NOT end in a slash followed by exactly six digits.
aws s3 ls --recursive s3://prod_appannie_ios/country-ranks/ | awk '{print $4}' | grep -v -E '\/[0-9]{6}$'


In [0]:
%%sh

# List the entries under the hour-23 prefix of the Android country-ranks data for 2016-10-30.
aws s3 ls s3://prod_appannie_android/country-ranks/2016-10-30/23/


In [0]:
%%sh
