In [0]:
%md
Review migration:
Original data: ```s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/process_granularity={hourly, daily, weekly, monthly}/```
Delta Lake data: ```s3://b2c-prod-data-pipeline-unified-review/unified/review.v4/fact```
Process dates: from the beginning through 2020-02-24

Test points:
1. Duplicate data may have been generated during the migration, so compare the distinct review counts of the original and the migrated data.
2. Take about 100 samples from different platforms and check them in detail.

In [0]:
%%sh
# aws s3 ls s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/process_granularity=weekly/process_date=2020-01-27/
# aws s3 ls s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/

# aws s3 ls s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/_delta_log/
aws s3 cp s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/_delta_log/00000000000000000000.json -

# aws s3 ls "s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/process_granularity=daily/process_date=2020-01-25/process_hour=23/device_code=ios-all/market_code=apple-store/" --recursive --human-readable --summarize

In [0]:
%%sh
aws s3 ls s3://b2c-prod-data-pipeline-unified-review/unified/review.v4/fact/_delta_log/

# aws s3 ls "s3://b2c-prod-data-pipeline-unified-review/unified/review.v4/fact/process_granularity=daily/process_date=2017-06-30/process_hour=23/device_code=ios-all/market_code=apple-store/" --recursive --human-readable --summarize | tail -5



In [0]:

# Pin both tables to explicit Delta versions so every run compares the same snapshots.
df_ori = (
    spark.read.format('delta')
    .option("versionAsOf", '106')
    .load("s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/")
)
df_ori.show(1, False, True)

df_del = (
    spark.read.format('delta')
    .option("versionAsOf", '2920')
    .load("s3://b2c-prod-data-pipeline-unified-review/unified/review.v4/fact/")
)
df_del.show(1, False, True)

# One weekly partition is used as the comparison sample.
weekly_partition = "process_granularity='weekly' and process_date='2020-01-27'"
df_a = df_ori.filter(weekly_partition)
df_b = df_del.filter(weekly_partition)

print(df_a.count())
print(df_b.count())

# Rows present in the original snapshot but absent from the migrated one.
df_c = df_a.subtract(df_b)
df_c.show(1)

In [0]:

df = spark.read.format("delta").option("timestampAsOf", '2020-02-19').load("s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/")
df.show(3)

In [0]:

# Expose the pinned v1 snapshot to ad-hoc SQL under the view name "table".
df_ori.createOrReplaceTempView("table")
# DESCRIBE HISTORY reads the Delta transaction log, so delta.`...` has to name the
# table's storage location; a temp view name is not a path and has no log to read.
df_ori2 = spark.sql(
    "DESCRIBE HISTORY delta.`s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/`")

In [0]:

# Look up the newest version recorded in the table history, then read the table pinned to it.
# NOTE(review): `/mnt/delta/events` is not used anywhere else in this notebook and looks
# like a placeholder path — confirm, or point both statements at the review table.
latest_version = spark.sql("SELECT max(version) FROM (DESCRIBE HISTORY delta.`/mnt/delta/events`)").collect()
# latest_version is a list of rows; [0][0] is the single max(version) value.
df = spark.read.format("delta").option("versionAsOf", latest_version[0][0]).load("/mnt/delta/events")

# Every query that stems off df will use the same snapshot

In [0]:

print df_ori.filter("process_granularity='weekly' and process_date='2020-01-27'").count()
print df_del.filter("process_granularity='weekly' and process_date='2020-01-27'").count()

In [0]:

# Read the same weekly partition straight from the parquet files of both locations.
df1 = spark.read.parquet(
    "s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/process_granularity={}/process_date={}/".format('weekly', '2020-01-27'))
df2 = spark.read.parquet(
    "s3://b2c-prod-data-pipeline-unified-review/unified/review.v4/fact/process_granularity=weekly/process_date=2020-01-27/")

# Show one sample row from each side, then the raw row counts (original first).
for frame in (df1, df2):
    frame.show(1, False, True)

for frame in (df1, df2):
    print(frame.count())

In [0]:

print df_ori.where("_identifier='220200218032358710'").count()
print df_del.where("_identifier LIKE '220200218%' and process_granularity='weekly' and device_code='android-all' and market_code='google-play'").count()

In [0]:

df_ori.printSchema()

In [0]:

# Distinct row count and a small sample for one daily partition of the original data.
s3_path1 = "s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/process_granularity=daily/process_date=2017-06-10/"
df1 = spark.read.parquet(s3_path1)
distinct_rows_v1 = df1.distinct().count()
print(distinct_rows_v1)
df1.show(5, truncate=False, vertical=True)
df1.unpersist()

In [0]:

# Distinct row count and a small sample for the same daily partition of the migrated data.
s3_path2 = "s3://b2c-prod-data-pipeline-unified-review/unified/review.v4/fact/process_granularity=daily/process_date=2017-06-10/"
df2 = spark.read.parquet(s3_path2)
distinct_rows_v4 = df2.distinct().count()
print(distinct_rows_v4)
df2.show(5, truncate=False, vertical=True)

In [0]:

import datetime, re
from collections import defaultdict

# Full migration window (disabled; enable for a complete run):
# start_date = datetime.date(1998, 3, 31)
# end_date = datetime.date(2020, 2, 24)
# Window actually checked by this cell.
start_date = datetime.date(2019, 1, 1)
end_date = datetime.date(2020, 2, 23)
date_range = end_date - start_date
# Number of days in the window, counting both start_date and end_date.
date_range = date_range.days + 1
# Granularities whose partitions are compared.
granularity_list = ['hourly', 'daily', 'weekly', 'monthly']
# granularity_list = ['hourly']
# NOTE(review): device_code_list is not referenced by the loop in this cell.
device_code_list = ['ios-all', 'android-all', 'windows-phone', 'all']
# (granularity, process_date) pairs whose distinct counts differ.
fails_list = []
# One detail_check result per partition that was compared.
fails_detail_list = []

# test code
# granularity = "hourly"
# process_date = "2019-02-02"
def detail_check(num, df_orig, df_delta, granularity, process_date):
    """
    Compare the first ``num`` review ids of the original and the migrated data.

    :param num: number of leading rows to sample from each DataFrame
    :type num: int
    :param df_orig: DataFrame holding the original data
    :param df_delta: DataFrame holding the migrated data
    :param granularity: not used by the check; kept so existing callers keep working
    :param process_date: not used by the check; kept so existing callers keep working
    :return: review ids found in the original sample but not in the migrated
        sample, in the order they appear in the original sample
    :rtype: list
    """
    orig_review_ids = [row["review_id"] for row in df_orig.head(num)]
    delta_review_ids = [row["review_id"] for row in df_delta.head(num)]
    print("orig:", orig_review_ids)
    print("delta:", delta_review_ids)
    # Lists do not support "-" (it raises TypeError), so the difference is
    # computed explicitly; the set only speeds up the membership test.
    delta_lookup = set(delta_review_ids)
    return [review_id for review_id in orig_review_ids if review_id not in delta_lookup]

# Compare every (granularity, process_date) partition of the original data
# against the migrated copy.
orig_s3_path = "s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/process_granularity={}/process_date={}/"
# The migrated data lives under review.v4; pointing this at review.v1 would
# compare the original data with itself and never report a difference.
delta_s3_path = "s3://b2c-prod-data-pipeline-unified-review/unified/review.v4/fact/process_granularity={}/process_date={}/"

for granularity in granularity_list:
    for day_offset in range(date_range):
        # Derive the date from the offset rather than advancing start_date, so
        # every granularity walks the same window from its first day.
        process_date = (start_date + datetime.timedelta(days=day_offset)).strftime("%Y-%m-%d")
        try:
            df_orig = spark.read.parquet(orig_s3_path.format(granularity, process_date)).distinct().sort("review_id")
            df_delta = spark.read.parquet(delta_s3_path.format(granularity, process_date)).distinct().sort("review_id")

            dis_count_orig = df_orig.count()
            dis_count_delta = df_delta.count()

            if dis_count_orig != dis_count_delta:
                fails_list.append((granularity, process_date))
                print(dis_count_orig - dis_count_delta)
            fails_detail_list.append(detail_check(10, df_orig, df_delta, granularity, process_date))
            print("DONE")
        except Exception as e:
            # A partition that does not exist for this date is expected and
            # skipped quietly; any other failure must stay visible.
            if not re.search(r"Path does not exist:", str(e)):
                print(e)


In [0]:

# Partitions whose distinct counts differ.
print(fails_list)
# The sampled detail results are accumulated in fails_detail_list; no variable
# named fails_dict_list is defined anywhere in the notebook.
print(fails_detail_list)
# Leading rows of the last partition that was read on each side.
print(df_orig.head(10))
print(df_delta.head(10))

In [0]:

# Length in days of the window 1998-03-31 .. 2020-02-24, counting both end points.
start_date = datetime.date(1998, 3, 31)
end_date = datetime.date(2020, 2, 24)
date_range = (end_date - start_date).days + 1
print(date_range)

In [0]:

from aadatapipelinecore.core.loader import es
from applications.db_check_v1.common.table_common_info import urn

class AdvancedReviewDBData(object):
    """
    Reads advanced-review documents from the database (ElasticSearch).
    """
    _common_config = {"database": "adv_review"}
    _index_config = "int-ss-advancedreview_v1*"

    def get(self):
        """
        Run a match-all search against the configured index pattern.

        :return: elasticsearch result
        :rtype: dict
        """
        match_everything = {"match_all": {}}
        es_conn = es.connection(urn, self._common_config)
        # track_total_hits is sent with the query body, as a string value.
        return es_conn.search(
            index=self._index_config,
            body={"track_total_hits": "true", "query": match_everything})
        
es_data = AdvancedReviewDBData().get()

In [0]:

es_data

In [0]:

es_total = es_data["hits"]["total"]["value"]
print es_total

In [0]:
%%sh
# Credentials must come from the environment (or ~/.pgpass), never from the notebook source.
# NOTE(review): the password that used to be embedded in this cell should be rotated.
: "${PGPASSWORD:?export PGPASSWORD before running this cell}"
export PGPASSWORD
psql -h 10.2.10.132 -p 5432 -U citus_bdp_usage_qa -d aa_citus_db << EOF

SET search_path=advancedreview;

BEGIN;
select count(*) from advancedreview_topic_fact_v3;

COMMIT;
EOF

In [0]:
%python
import unittest

def add(a, b):
    """Return the result of ``a + b``."""
    total = a + b
    return total

class TestES(unittest.TestCase):
    """Minimal test case exercising ``add``."""

    def test_add(self):
        """``add(2, 1)`` equals 3."""
        # assertEqual reports both values when it fails; assertTrue on a
        # comparison only reports "False is not true".
        self.assertEqual(add(2, 1), 1 + 2, "Failed")

if __name__=='__main__':
    unittest.main(argv=[''], exit=False, verbosity=2)

In [0]:
%python
# Copyright (c) 2019 App Annie Inc. All rights reserved.

"""
Advanced Review Database Check:
    1. The number of data in "unified layer" is equal to elasticsearch
"""

import unittest
from collections import defaultdict
from applications.db_check_v1.common.base_test import PipelineTest
from applications.db_check_v1.common.table_common_info import urn
from aadatapipelinecore.core.loader import es


class AdvancedReviewUnifiedData(object):
    """
    Loads data from the unified s3 bucket and hands it back as a DataFrame.
    """
    _unified_s3_path = (
        "s3://b2c-prod-data-pipeline-unified-advancedreview/"
        "unified/advancedreview.topic.v1/fact/process_date={}"
    )

    def __init__(self, spark):
        self.spark = spark

    def get(self, date):
        """
        Read the parquet data stored under one process date.

        :param date: process_date in s3 bucket path
        :type date: string
        :return: unified_df
        :rtype: DataFrame
        """
        partition_path = self._unified_s3_path.format(date)
        return self.spark.read.parquet(partition_path)


class AdvancedReviewDBData(object):
    """
    Reads advanced-review documents from the database (ElasticSearch).
    """
    _common_config = {"database": "adv_review"}
    _index_config = "int-ss-advancedreview_v1*"

    def get(self, country_code, device_code, identifier):
        """
        Search for documents matching all three given field values.

        :param country_code: value matched against the ``country_code`` field
        :param device_code: value matched against the ``device_code`` field
        :param identifier: value matched against the ``_identifier`` field
        :return: elasticsearch result
        :rtype: dict
        """
        criteria = (
            ("country_code", country_code),
            ("device_code", device_code),
            ("_identifier", identifier),
        )
        # Every criterion becomes one "match" clause; "must" requires all of them.
        must_clauses = [{"match": {field: value}} for field, value in criteria]
        query_body = {
            "track_total_hits": "true",
            "query": {"bool": {"must": must_clauses}},
        }
        es_conn = es.connection(urn, self._common_config)
        return es_conn.search(index=self._index_config, body=query_body)


class TestTopicJoinReview(PipelineTest):
    """
    Compare ES data amount with unified review data
    """
    trigger_date_config = ("10 10 * * *", 1)

    def setUp(self):
        """Define the countries and devices to compare and reset the failure map."""
        self.country_code_list = ["US", "GB", "CA", "AU", "IN", "JP", "KR"]
        self.device_code_list = ["ios-all", "android-all"]
        self.granularity = 'daily'
        # device_code -> list of country codes whose counts differ
        self.failed_ids = defaultdict(list)

    def _compare_diff(self, device_code, country_code, unified_df, identifier):
        """
        Record the country under its device when the ES total differs from the unified row count.

        :param device_code: device code to filter both sources by
        :param country_code: country code to filter both sources by
        :param unified_df: DataFrame holding the unified data
        :param identifier: ``_identifier`` value used to narrow the ES query
        """
        unified_count = unified_df.where(
            "country_code='{}' AND device_code='{}'".format(country_code, device_code)).count()
        es_data = AdvancedReviewDBData().get(country_code, device_code, identifier)
        es_total = es_data["hits"]["total"]["value"]
        if es_total != unified_count:
            self.failed_ids[device_code].append(country_code)

    def test_advanced_review_topic_join_review(self):
        """Every (device, country) pair must have the same count in ES and in the unified data."""
        # NOTE(review): self.spark and self.check_date_str are not set in this class;
        # presumably PipelineTest provides them — confirm.
        unified_df = AdvancedReviewUnifiedData(self.spark).get(self.check_date_str)
        # first() fetches a single row; collect() would pull the whole DataFrame
        # onto the driver just to read one value.
        first_row = unified_df.first()
        self.assertIsNotNone(
            first_row, "no unified data found for {}".format(self.check_date_str))
        identifier = first_row["_identifier"]
        for device_code in self.device_code_list:
            for country_code in self.country_code_list:
                self._compare_diff(device_code, country_code, unified_df, identifier)

        # failed_ids is an instance attribute; a bare `failed_ids` is undefined here.
        print(self.failed_ids)
        self.assertTrue(len(self.failed_ids) == 0, self.failed_ids)

if __name__=='__main__':
    unittest.main(argv=[''], exit=False, verbosity=2)

In [0]:
%python
from aadatapipelinecore.core.utils.spark import create_spark
spark = create_spark()

In [0]:

spark

In [0]:

from applications.db_check_v1.common.base_test import PipelineTest

a = PipelineTest.setUpClass()


In [0]:

spark.sparkContext

In [0]:

import pyspark

In [0]:

from pyspark import SparkContext, SparkConf

conf_cust = SparkConf().setAppName('test').setMaster('local')
# Pass the configuration built above; no variable named `conf` exists.
# getOrCreate reuses a context that is already running instead of failing,
# since only one SparkContext may be active at a time.
sc = SparkContext.getOrCreate(conf=conf_cust)

In [0]:
%python
from pyspark import SparkContext, SparkConf

conf_cust = SparkConf().setAppName('test').setMaster('local')
# Pass the configuration built above; no variable named `conf` exists.
# getOrCreate reuses a context that is already running instead of failing,
# since only one SparkContext may be active at a time.
sc = SparkContext.getOrCreate(conf=conf_cust)

In [0]:

# Distinct row counts over all weekly partitions, original table first, migrated table second.
s3_path = "s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/"
weekly_only = "process_granularity='weekly'"

df1 = spark.read.format('delta').load(s3_path).where(weekly_only)
print(df1.distinct().count())

df2 = (
    spark.read.format('delta')
    .load("s3://b2c-prod-data-pipeline-unified-review/unified/review.v4/fact/")
    .where(weekly_only)
)
print(df2.distinct().count())


In [0]:

import datetime
import re

# def review_migration(spark):

# granularity_list = ['hourly']
# granularity_list = ['daily']
# granularity_list = ['weekly']
granularity_list = ['monthly']
# granularity_list = ['hourly', 'daily', 'weekly', 'monthly']
# granularity_list = ['weekly']
fails_list = []
fails_detail_list = []
orig_s3_path = "s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/process_granularity={}/process_date={}/"
delta_s3_path = "s3://b2c-prod-data-pipeline-unified-review/unified/review.v4/fact/"
# df_orig = spark.read.parquet(orig_s3_path)
# df_delta = spark.read.format('delta').load(delta_s3_path)

# df_ori = spark.read.format('delta').option("versionAsOf", '106').load("s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/")
# df_ori.show(1, False, True)

# df_del = spark.read.format('delta').option("versionAsOf", '2920').load("s3://b2c-prod-data-pipeline-unified-review/unified/review.v4/fact/")
# df_del.show(1, False, True)

# For each granularity, walk the date window one day at a time and record every
# partition where the original data holds review ids that the Delta table lacks.
for granularity in granularity_list:
    start_date = datetime.date(2006, 3, 31)
    end_date = start_date
    # Inclusive number of days between start_date and end_date.
    date_range = (end_date - start_date).days + 1
    for _ in range(date_range):
        process_date = start_date.strftime("%Y-%m-%d")
        partition_filter = "process_granularity='{}' and process_date='{}'".format(granularity, process_date)
        try:
            df_orig_temp = spark.read.parquet(orig_s3_path.format(granularity, process_date))
            df_delta_temp = spark.read.format('delta').load(delta_s3_path).where(partition_filter)
            # Review ids present in the original partition but not in the Delta partition.
            df_sub = df_orig_temp.select('review_id').subtract(df_delta_temp.select('review_id'))
            has_missing_ids = bool(df_sub.take(1))
            if has_missing_ids:
                orig_count = df_orig_temp.distinct().count()
                delta_count = df_delta_temp.distinct().count()
                print("-----------------------------------------------")
                print("process_date: {} granularity: {}".format(process_date, granularity))
                print("orignal data: {} delta data: {}".format(orig_count, delta_count))
                fails_list.append((granularity, process_date))
            df_orig_temp.unpersist()
            df_delta_temp.unpersist()
        except Exception as e:
            # Dates without a partition are skipped quietly; other errors are printed.
            if re.search(r'Path does not exist:', str(e)) is None:
                print(e)

        start_date += datetime.timedelta(days=1)

print("fails_list {}".format(fails_list))
# def main(spark, params):
# review_migration(spark)

# main(spark, None)


In [0]:
%%sh
# aws s3 ls s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/process_granularity=monthly/process_date=2006-03-31/process_hour=23/device_code=android-all/market_code=amazon-store/
# aws s3 ls s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/process_granularity=weekly/process_date=2017-09-11/process_hour=23/device_code=ios-all/market_code=apple-store/
aws s3 ls s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/process_granularity=daily/process_date=2017-12-21/

In [0]:

aa = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/process_granularity=daily/process_date=2017-12-21/").count()
print aa

In [0]:

bb = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-review/unified/review.v4/fact/process_granularity=hourly/process_date=2018-09-02/").count()
print bb

In [0]:

# Partitions flagged by the earlier checks; read one row from each side and
# record the _identifier found in the original data.
fails_list = [('hourly', '2018-09-02'), ('hourly', '2019-01-18'), ('daily', '2017-12-21'), ('daily', '2019-04-03'), ('daily', '2019-04-19'), ('weekly', '2017-09-11'), ('monthly', '2006-03-31')]
log = []
v1_partition = "s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/process_granularity={}/process_date={}/"
v4_partition = "s3://b2c-prod-data-pipeline-unified-review/unified/review.v4/fact/process_granularity={}/process_date={}/"
for g, p in fails_list:
    aa = spark.read.parquet(v1_partition.format(g, p)).take(1)
    bb = spark.read.parquet(v4_partition.format(g, p)).take(1)
    print('---------------------------------')
    print("{} {}".format(g, p))
    print("{} {}".format(aa[0]['_identifier'], bb[0]['_identifier']))
    log.append((g, p, aa[0]['_identifier']))
print(log)

In [0]:

df_orig_temp.distinct().count()

In [0]:

9763666 - 4405157

In [0]:

df_orig_temp.show(3, True, True)

In [0]:

df_delta_temp.show(3, True, True)

In [0]:

a = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/process_granularity={}/process_date={}/".format('hourly', '2020-02-11'))
a.show(1, False, True)

In [0]:

b = spark.read.format('delta').load(delta_s3_path).where("process_granularity='{}' and process_date='{}'".format('hourly', '2020-02-11'))
b.show(1, False, True)

In [0]:

a.printSchema()

In [0]:

b.printSchema()

In [0]:

a.select('review_id').subtract(b.select('review_id')).show()


In [0]:

b.select('review_id').subtract(a.select('review_id')).show(3)

In [0]:

print u"Except can only be performed on tables with the compatible column types. string <> boolean at the 20th column of the second table;;\n'Except false\n:- Relation[_identifier#1798L,app_id#1799L,content#1800,content_language#1801,country_code#1802,language#1803,product_version#1804,rating#1805,reply#1806,reply_date#1807,reply_id#1808,reply_language#1809,review_id#1810,time#1811,title#1812,title_language#1813,user_device#1814,user_id#1815,user_language#1816,user_name#1817,user_purchased#1818,user_review_url#1819,process_hour#1820,device_code#1821,market_code#1822] parquet\n+- Project [_identifier#1848L, app_id#1849L, content#1850, content_language#1851, country_code#1852, device_code#1853, language#1854, market_code#1855, process_hour#1858, product_version#1859, rating#1860, reply#1861, reply_date#1862, reply_id#1863, reply_language#1864, review_id#1865, time#1866, title#1867, title_language#1868, user_device#1869, user_id#1870, user_language#1871, user_name#1872, user_purchased#1873, user_review_url#1874]\n   +- Filter ((process_granularity#1857 = hourly) && (process_date#1856 = 2020-01-18))\n      +- Relation[_identifier#1848L,app_id#1849L,content#1850,content_language#1851,country_code#1852,device_code#1853,language#1854,market_code#1855,process_date#1856,process_granularity#1857,process_hour#1858,product_version#1859,rating#1860,reply#1861,reply_date#1862,reply_id#1863,reply_language#1864,review_id#1865,time#1866,title#1867,title_language#1868,user_device#1869,user_id#1870,user_language#1871,... 3 more fields] parquet\n"


In [0]:

# Column names of the parquet relation (left side of the failed Except).
a_set = {
    '_identifier', 'app_id', 'content', 'content_language', 'country_code',
    'language', 'product_version', 'rating', 'reply', 'reply_date',
    'reply_id', 'reply_language', 'review_id', 'time', 'title',
    'title_language', 'user_device', 'user_id', 'user_language', 'user_name',
    'user_purchased', 'user_review_url', 'process_hour', 'device_code',
    'market_code',
}

# Column names of the Delta relation (right side of the failed Except).
b_set = {
    '_identifier', 'app_id', 'content', 'content_language', 'country_code',
    'device_code', 'language', 'market_code', 'process_date',
    'process_granularity', 'process_hour', 'product_version', 'rating',
    'reply', 'reply_date', 'reply_id', 'reply_language', 'review_id', 'time',
    'title', 'title_language', 'user_device', 'user_id', 'user_language',
    'user_name', 'user_purchased', 'user_review_url',
}
# Columns that exist only on the parquet side.
a_set - b_set

In [0]:

import datetime
import re

# def review_migration(spark):

# granularity_list = ['hourly']
granularity_list = ['daily']
# granularity_list = ['weekly']
# granularity_list = ['monthly']
# granularity_list = ['hourly', 'daily', 'weekly', 'monthly']
# granularity_list = ['weekly']
fails_list = []
fails_detail_list = []
orig_s3_path = "s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/process_granularity={}/process_date={}/"
delta_s3_path = "s3://b2c-prod-data-pipeline-unified-review/unified/review.v4/fact/"
# df_orig = spark.read.parquet(orig_s3_path)
# df_delta = spark.read.format('delta').load(delta_s3_path)

# df_ori = spark.read.format('delta').option("versionAsOf", '106').load("s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/")
# df_ori.show(1, False, True)

# df_del = spark.read.format('delta').option("versionAsOf", '2920').load("s3://b2c-prod-data-pipeline-unified-review/unified/review.v4/fact/")
# df_del.show(1, False, True)

# For each granularity, walk the date window one day at a time and record every
# partition where the original data holds review ids that the Delta table lacks.
for granularity in granularity_list:
    start_date = datetime.date(2017, 12, 21)
    end_date = start_date
    # Inclusive number of days between start_date and end_date.
    date_range = (end_date - start_date).days + 1
    for _ in range(date_range):
        process_date = start_date.strftime("%Y-%m-%d")
        partition_filter = "process_granularity='{}' and process_date='{}'".format(granularity, process_date)
        try:
            df_orig_temp = spark.read.parquet(orig_s3_path.format(granularity, process_date))
            df_delta_temp = spark.read.format('delta').load(delta_s3_path).where(partition_filter)

            # Review ids present in the original partition but not in the Delta partition.
            df_sub = df_orig_temp.select('review_id').subtract(df_delta_temp.select('review_id'))
            has_missing_ids = bool(df_sub.take(1))
            if has_missing_ids:
                orig_count = df_orig_temp.count()
                delta_count = df_delta_temp.count()
                print("-----------------------------------------------")
                print("process_date: {} granularity: {}".format(process_date, granularity))
                print("orignal data: {} delta data: {}".format(orig_count, delta_count))
                fails_list.append((granularity, process_date))
            df_orig_temp.unpersist()
            df_delta_temp.unpersist()
        except Exception as e:
            # Dates without a partition are skipped quietly; other errors are printed.
            if re.search(r'Path does not exist:', str(e)) is None:
                print(e)

        start_date += datetime.timedelta(days=1)

print("fails_list {}".format(fails_list))
# def main(spark, params):
# review_migration(spark)

# main(spark, None)


In [0]:

import datetime
import re


def detail_check(num, df_orig, df_delta):
    """
    Check whether the first ``num`` review ids of both DataFrames are identical.

    Both id lists are printed when they differ.

    :param num: number of leading rows to take from each DataFrame
    :param df_orig: DataFrame holding the original data
    :param df_delta: DataFrame holding the migrated data
    :return: (ids are equal, original ids, migrated ids)
    :rtype: tuple
    """
    def _leading_ids(frame):
        # review_id of each of the first `num` rows, in row order
        return [record["review_id"] for record in frame.head(num)]

    orig_review_ids = _leading_ids(df_orig)
    delta_review_ids = _leading_ids(df_delta)
    samples_match = orig_review_ids == delta_review_ids
    if not samples_match:
        print("orig:", orig_review_ids)
        print("delta:", delta_review_ids)
    return samples_match, orig_review_ids, delta_review_ids


def review_migration(spark):
    """
    Compare the original review data with the migrated copy, partition by partition.

    For every granularity and every date from 1998-03-31 through 2020-02-24 the
    distinct rows of both partitions are counted, and the first 10 review ids
    (sorted by ``review_id``) are compared with ``detail_check``. Partitions whose
    counts differ and samples that do not match are printed at the end.

    :param spark: session used to read the parquet partitions
    :return: None
    """
    granularity_list = ['hourly', 'daily', 'weekly', 'monthly']
    fails_list = []
    fails_detail_list = []

    orig_s3_path = "s3://b2c-prod-data-pipeline-unified-review/unified/review.v1/fact/process_granularity={}/process_date={}/"
    delta_s3_path = "s3://b2c-prod-data-pipeline-unified-review/unified/review.v4/fact/process_granularity={}/process_date={}/"

    first_date = datetime.date(1998, 3, 31)
    last_date = datetime.date(2020, 2, 24)
    # Inclusive number of days in the window.
    total_days = (last_date - first_date).days + 1

    for granularity in granularity_list:
        for day_offset in range(total_days):
            process_date = (first_date + datetime.timedelta(days=day_offset)).strftime("%Y-%m-%d")
            try:
                df_orig = spark.read.parquet(orig_s3_path.format(granularity, process_date)).distinct().sort(
                    "review_id")
                df_delta = spark.read.parquet(delta_s3_path.format(granularity, process_date)).distinct().sort(
                    "review_id")

                dis_count_orig = df_orig.count()
                dis_count_delta = df_delta.count()

                if dis_count_orig != dis_count_delta:
                    fails_list.append((granularity, process_date))
                    print(dis_count_orig - dis_count_delta)
                fails_detail_list.append(detail_check(10, df_orig, df_delta))
                print("DONE")
            except Exception as e:
                # A date without a partition is expected and skipped quietly.
                # Any other failure is printed so that a broken check cannot
                # pass for a clean run.
                if not re.search(r'Path does not exist:', str(e)):
                    print(e)

    # Report the count mismatches; without this the collected list is lost.
    print("fails_list = {}".format(fails_list))
    for i in fails_detail_list:
        if not i[0]:
            print("fails_detail_list = {}".format(i))


def main(spark, params):
    """
    Entry point: run the review migration check.

    :param spark: session handed on to ``review_migration``
    :param params: accepted but not used
    """
    review_migration(spark)
