In [0]:

import gzip
import base64
import json
from io import BytesIO

from aadatapipelinecore.core.fs import Conf
from aadatapipelinecore.core.fs.device import S3Bucket
from aadatapipelinecore.core.fs.device.bucket import specified_bucket
from aadatapipelinecore.core.loader import es
from applications.db_check_v1.common.base_test import PipelineTest
from applications.db_check_v1.common.db_check_utils import _get_date_from_refresh_routing_config
from applications.db_check_v1.common.table_common_info import urn
from pyspark.sql import Row

def get_firehose_data(date, granularity):
    """Collect the ids of buffered records matching a date and granularity.

    Lists every object under ``app-ss-review/<YYYY>/<MM>/<DD>/00/`` in the
    ``b2c-prod-data-pipeline-buffer-rating`` bucket, splits each object on the
    ``S-AIDPSEPA-E`` separator and decodes every piece as
    base64 -> gzip -> JSON.

    Args:
        date: Date string such as ``"2020-05-05"``. Dashes are replaced by
            slashes to build the key prefix, and the value is compared as-is
            with each entry's ``process_date``.
        granularity: Granularity name; it is upper-cased before being
            compared with each entry's ``process_granularity``.

    Returns:
        set: The ``id`` of every entry in a record's ``source`` list whose
        ``process_date`` and ``process_granularity`` match the arguments and
        whose ``platform`` is ``ios`` or ``gp``.
    """
    SAFE_SEPARATOR = "S-AIDPSEPA-E"
    bucket_name = "b2c-prod-data-pipeline-buffer-rating"

    conf = Conf()
    conf.bucket_name = bucket_name
    conf.bucket_class = S3Bucket
    buffer_s3 = specified_bucket(conf)

    # Only the "00" sub-prefix of the given day is read.
    paths = buffer_s3.all("app-ss-review/{}/00/".format(date.replace("-", "/")))
    # Listing results carry the full s3:// URL; get() is called with bare keys.
    url_prefix = "s3://{}/".format(bucket_name)
    paths = [path.replace(url_prefix, "") for path in paths]

    wanted_granularity = granularity.upper()
    review_row = set()
    for path in paths:
        # NOTE(review): assumes get() returns text (str), since it is split
        # on a str separator -- confirm against the bucket implementation.
        content = buffer_s3.get(path)
        # The piece after the last separator is dropped (presumably empty
        # because every record is terminated by the separator -- TODO confirm).
        b64_records = content.split(SAFE_SEPARATOR)[:-1]
        for b64_rec in b64_records:
            with gzip.GzipFile(fileobj=BytesIO(base64.b64decode(b64_rec)), mode='rb') as fh:
                json_record = json.loads(fh.read())
            for ss in json_record["source"]:
                if ss["process_date"] == date and \
                        ss["process_granularity"] == wanted_granularity and \
                        ss["platform"] in ('ios', 'gp'):
                    review_row.add(ss["id"])

    # Previously the collected ids were discarded and the function returned None.
    return review_row

# Run the firehose buffer scan for 2020-05-05 at daily granularity.
get_firehose_data(date="2020-05-05", granularity="daily")

In [0]:
%%sh

# List the objects buffered for 2020-05-05 (sub-prefix 00), sorted numerically.
# NOTE(review): this lists the "app-ss-rating-v4" prefix, while the Python cell
# in this notebook reads "app-ss-review" -- confirm which prefix is intended.
prefix="s3://b2c-prod-data-pipeline-buffer-rating/app-ss-rating-v4/2020/05/05/00/"
aws s3 ls "$prefix" | sort -n

