In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.functions import col
import os

In [5]:
# Root folder of the warehouse. Set the RETAIL_DW_BASE_PATH environment variable
# to run this notebook on another machine; the original local path stays the default.
base_path = os.environ.get(
    "RETAIL_DW_BASE_PATH",
    r"C:\Users\vasudha.tanniru\Documents\GitHub\data_projects\retail_data_warehouse\warehouse",
)
# Sub-folders of the warehouse root used for reading ("staging") and writing ("core").
staging_path = os.path.join(base_path, "staging")
core_path = os.path.join(base_path, "core")

In [6]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Create Core Orders")
    .getOrCreate()
)

In [7]:
# Load the four staged parquet tables; the target names line up positionally
# with the folder names listed below.
order_master_df, order_items_df, order_payments_df, order_reviews_df = (
    spark.read.parquet(os.path.join(staging_path, table_name))
    for table_name in ("orders", "order_items", "order_payments", "order_reviews")
)

In [8]:
# Preview rows of the staged orders table.
order_master_df.show()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|60550084e6b4c0cb8...|f5458ddc3545711ef...|   delivered|     2018-02-21 18:15:12|2018-02-23 02:10:52|         2018-02-27 18:52:09|          2018-03-13 23:58:43|          2018-03-29 00:00:00|
|6bb1e842418aac0c9...|032f25110f17bb0d6...|   delivered|     2018-02-11 14:13:54|2018-02-11 14:25:28|         2018-02-21 18:49:53|          2018-03-05 22:04:37|          2018-03-23 00:00:00|
|1f6405caa14a2debb...|5ca6f3218e411dcbb...|  

In [9]:
# Preview rows of the staged order items table.
order_items_df.show()

+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date| price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|002f98c0f7efd4263...|            1|d41dc2f2979f52d75...|7299e27ed73d2ad98...|2017-08-10 09:30:15|  8.99|        32.57|
|0113c9989fe6e5007...|            2|e8ca1ff2d0a28edb5...|6b536a23086fba0d5...|2018-06-27 13:31:19|  49.0|        15.44|
|01442602b15d4127a...|            1|d7cdea99e6f50310c...|966cb4760537b1404...|2018-08-01 14:25:12| 504.0|        19.72|
|02d33b6a9d9e61e68...|            1|e30d5b80f5e85b8f9...|1336efc61c316ddf9...|2017-11-29 15:13:31|  39.0|        16.11|
|0355ae43910316435...|            1|d12ad7f92749db412...|8b321bb669392f516...|2018-04-10 03:15:23| 12.95|         7.39|
|03a10721769c7d8f2...|            1|601a

In [10]:
# Preview rows of the staged order payments table.
order_payments_df.show()

+--------------------+------------------+------------+--------------------+-------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|
+--------------------+------------------+------------+--------------------+-------------+
|cf30fe76d1505192a...|                 1| credit_card|                   2|        47.72|
|a04639b53f2cbd1f7...|                 1|      boleto|                   1|        44.83|
|2e2dd1119ebf597a9...|                 1| credit_card|                   2|       105.42|
|440a666da55232dbd...|                 1| credit_card|                   3|        46.43|
|bfc9f97fc7ddacf8a...|                 1| credit_card|                   2|       229.74|
|8ff5ec04a3f395979...|                 1| credit_card|                   5|       121.78|
|d75cb3755738c4ae4...|                 1|      boleto|                   1|        37.69|
|2f9892eb8df4a437d...|                 1| credit_card|                   3|       121.09|
|5662e7f8b

In [11]:
# Preview rows of the staged order reviews table.
order_reviews_df.show()

+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|8a310b56e4d05a778...|c5a47daec61dfde19...|           4|           Recomendo|  O pedido chegou a...| 2018-05-08 00:00:00|    2018-05-10 17:19:17|
|f51a5d398f2374cb2...|dcbec508f4fc19506...|           1|                   6|  Eu não tenho notí...| 2018-05-02 00:00:00|    2018-05-02 11:17:26|
|0e2a9cd112e26220a...|668b09e578d6cdac0...|           5|         Excelente!!|  Produto entregue ...| 2018-07-12 00:00:00|    2018-07-13 11:27:33|
|7da6b99eedc285b7a...|df78dad0e4ef0211f...|           5|               ótimo|  O PRODUTO FOI ENT...| 2018-07-05 00:00:00|   

In [12]:
# Compare each staging table's row count before and after dropping fully identical rows.
for label, frame in (
    ("order_master_df", order_master_df),
    ("order_items_df", order_items_df),
    ("order_payments_df", order_payments_df),
    ("order_reviews_df", order_reviews_df),
):
    rows_before = frame.count()
    rows_after = frame.dropDuplicates().count()
    print(f"Before Dedup of {label}:", rows_before, "After :", rows_after)

Before Dedup of order_master_df: 99441 After : 99441
Before Dedup of order_items_df: 112650 After : 112650
Before Dedup of order_payments_df: 103886 After : 103886
Before Dedup of order_reviews_df: 104067 After : 104067


In [13]:
# Display name -> DataFrame registry passed to data_quality_checks.
df_dict = dict(
    order_master_df=order_master_df,
    order_items_df=order_items_df,
    order_payments_df=order_payments_df,
    order_reviews_df=order_reviews_df,
)

# Column used for the duplicate check of each table: order_id for all of them.
key_cols = dict.fromkeys(df_dict, "order_id")

In [14]:
def data_quality_checks(df_dict,key_col=None):
    """Print a basic data-quality report for every DataFrame in ``df_dict``.

    For each (name, DataFrame) pair the report shows the total row count, a
    duplicate count and the per-column null counts (only columns that contain
    at least one null are listed).

    Args:
        df_dict: Mapping of a display name to a Spark DataFrame.
        key_col: Optional key for the duplicate check. Either a dict mapping a
            DataFrame name to its key column(s), or a single value (column
            name or list of column names) applied to every DataFrame. When it
            is omitted, empty, or a dict with no entry for a given DataFrame,
            the check falls back to counting fully identical rows.

    Returns:
        None. The report is written to stdout.
    """
    for name, df in df_dict.items():
        print(f"\n Checking Dataframe: {name}")
        print("-" * 50)

        total = df.count()
        print(f"Total rows: {total}")

        # Resolve the key for this DataFrame. A dict without an entry for
        # `name` yields None instead of leaking the whole dict into groupBy().
        if isinstance(key_col, dict):
            key = key_col.get(name)
        else:
            key = key_col

        if key:
            # Number of distinct key values that appear on more than one row
            # (not the number of surplus rows).
            dup_count = df.groupBy(key).count().filter("count>1").count()
            print(f"Duplicates by {key}: {dup_count}")
        else:
            # No key available: count rows that are exact copies of another row.
            dup_count = total - df.dropDuplicates().count()
            print(f"Technical duplicates : {dup_count}")

        # One aggregate per column: when() yields a non-null value only for
        # null cells, so count() returns the number of nulls in that column.
        null_counts = (
            df.select([f.count(f.when(f.col(c).isNull(), c)).alias(c) for c in df.columns])
            .toPandas()
            .T
        )
        null_counts.columns = ["null_count"]
        print("Null counts:")
        print(null_counts[null_counts["null_count"] > 0])

        print("-" * 50)
    

In [15]:
# Run the report over every DataFrame in df_dict, using key_cols for the duplicate check.
data_quality_checks(df_dict, key_cols)



 Checking Dataframe: order_master_df
--------------------------------------------------
Total rows: 99441
Duplicates by order_id: 0
Null counts:
                               null_count
order_approved_at                     160
order_delivered_carrier_date         1783
order_delivered_customer_date        2965
--------------------------------------------------

 Checking Dataframe: order_items_df
--------------------------------------------------
Total rows: 112650
Duplicates by order_id: 9803
Null counts:
Empty DataFrame
Columns: [null_count]
Index: []
--------------------------------------------------

 Checking Dataframe: order_payments_df
--------------------------------------------------
Total rows: 103886
Duplicates by order_id: 2961
Null counts:
Empty DataFrame
Columns: [null_count]
Index: []
--------------------------------------------------

 Checking Dataframe: order_reviews_df
--------------------------------------------------
Total rows: 104067
Duplicates by order_id: 944

In [16]:
# Look at the order items rows again (several rows can share one order_id).
order_items_df.show()

+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date| price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|002f98c0f7efd4263...|            1|d41dc2f2979f52d75...|7299e27ed73d2ad98...|2017-08-10 09:30:15|  8.99|        32.57|
|0113c9989fe6e5007...|            2|e8ca1ff2d0a28edb5...|6b536a23086fba0d5...|2018-06-27 13:31:19|  49.0|        15.44|
|01442602b15d4127a...|            1|d7cdea99e6f50310c...|966cb4760537b1404...|2018-08-01 14:25:12| 504.0|        19.72|
|02d33b6a9d9e61e68...|            1|e30d5b80f5e85b8f9...|1336efc61c316ddf9...|2017-11-29 15:13:31|  39.0|        16.11|
|0355ae43910316435...|            1|d12ad7f92749db412...|8b321bb669392f516...|2018-04-10 03:15:23| 12.95|         7.39|
|03a10721769c7d8f2...|            1|601a

In [19]:
# Number of item rows recorded for each order.
(
    order_items_df
    .groupBy("order_id")
    .agg(f.count("order_item_id"))
    .show()
)

+--------------------+--------------------+
|            order_id|count(order_item_id)|
+--------------------+--------------------+
|41d85a7a138b7205e...|                   1|
|1c2e2705fc091788f...|                   1|
|028dc52e12ddda803...|                   1|
|199c3d5d545678663...|                   1|
|26ebbef3221e8b51c...|                   1|
|33b7013ecf08d45cc...|                   1|
|184f17bb701af22b8...|                   1|
|28eaf054725f4dd3c...|                   1|
|3303092810c37e9ab...|                   1|
|014405982914c2cde...|                   2|
|240891986117bf8fd...|                   1|
|278863630df775393...|                   1|
|099d60cb800db65a7...|                   1|
|406de5462666366ba...|                   1|
|2b31af271f3efcfd1...|                   1|
|0e4672661531addf3...|                   1|
|125e0c61fdbfc589f...|                   4|
|1c4a92d82c1b0dec1...|                   3|
|2d08fef3d5af150cd...|                   1|
|116debdd5705aaa51...|          

In [9]:
# List the (column name, type) pairs of the reviews table.
order_reviews_df.dtypes

[('review_id', 'string'),
 ('order_id', 'string'),
 ('review_score', 'int'),
 ('review_comment_title', 'string'),
 ('review_comment_message', 'string'),
 ('review_creation_date', 'timestamp'),
 ('review_answer_timestamp', 'timestamp')]

In [8]:
# Show the distinct review_score values (up to 150 rows).
(
    order_reviews_df
    .select("review_score")
    .distinct()
    .show(150)
)

+------------+
|review_score|
+------------+
|           1|
|           3|
|           5|
|           4|
|           2|
+------------+



In [10]:
# Average review_score per order_id.
(
    order_reviews_df
    .groupBy("order_id")
    .agg(f.avg("review_score"))
    .show()
)

+--------------------+-----------------+
|            order_id|avg(review_score)|
+--------------------+-----------------+
|ff29c7cdeea12d00d...|              4.0|
|9df7c08a42d7b4f7a...|              5.0|
|4e88622fff810dd27...|              3.0|
|eedb09f615e5d17f4...|              5.0|
|873c916c2bff3ea61...|              1.0|
|9f672a6ad991e6e23...|              3.0|
|358062b4847c9e2ce...|              5.0|
|08b450b388221bfe5...|              1.0|
|d17a342bb9f94d40c...|              5.0|
|598296af46cba70d2...|              4.0|
|b0200f320613faea2...|              5.0|
|66b83d9733e909624...|              5.0|
|9e55ecf53f805f324...|              5.0|
|7d76335e7e80375e2...|              5.0|
|444a494f39275142d...|              4.0|
|1b2ea47a491df032b...|              5.0|
|1a7ea14458cd657d2...|              5.0|
|f1462e755e6dad34d...|              5.0|
|a1e28dc56f8cf4e56...|              4.0|
|69340678ddc4f528c...|              5.0|
+--------------------+-----------------+
only showing top

In [11]:
# Look at the raw review rows again before aggregating them per order.
order_reviews_df.show()

+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|f99ff17e430a53efe...|009838529bb913846...|           3| PRODUTO COM DEFEITO|  SOLICITEI DEVOLUÇ...| 2018-06-27 00:00:00|    2018-06-28 11:09:14|
|edb8423465e501f51...|b1829b9fc61d2f28d...|           2|   Produto diferente|  O copo não é exat...| 2018-07-29 00:00:00|    2018-07-29 21:42:34|
|61e4da09f3d99e918...|0ed6d545d159bc1c0...|           5|         Otmo vendor|  Vou comprar mais ...| 2018-06-19 00:00:00|    2018-06-19 23:36:08|
|77701c0425c416ba0...|00eead1d5a799277e...|           5|           muito bom|  mais rapido que j...| 2018-08-31 00:00:00|   

In [12]:
# Collapse reviews to one row per order_id: mean review_score and the
# latest review_creation_date.
order_reviews_agg = (
    order_reviews_df
    .groupBy("order_id")
    .agg(
        f.avg("review_score").alias("avg_review_score"),
        f.max("review_creation_date").alias("latest_review_date"),
    )
)

In [13]:
# Preview the per-order review aggregate.
order_reviews_agg.show()

+--------------------+----------------+-------------------+
|            order_id|avg_review_score| latest_review_date|
+--------------------+----------------+-------------------+
|ff29c7cdeea12d00d...|             4.0|2018-05-09 00:00:00|
|9df7c08a42d7b4f7a...|             5.0|2018-05-15 00:00:00|
|4e88622fff810dd27...|             3.0|2018-07-15 00:00:00|
|eedb09f615e5d17f4...|             5.0|2018-06-15 00:00:00|
|873c916c2bff3ea61...|             1.0|2018-08-15 00:00:00|
|9f672a6ad991e6e23...|             3.0|2018-06-14 00:00:00|
|358062b4847c9e2ce...|             5.0|2018-06-23 00:00:00|
|08b450b388221bfe5...|             1.0|2017-11-08 00:00:00|
|d17a342bb9f94d40c...|             5.0|2018-04-11 00:00:00|
|598296af46cba70d2...|             4.0|2018-04-26 00:00:00|
|b0200f320613faea2...|             5.0|2018-02-21 00:00:00|
|66b83d9733e909624...|             5.0|2017-12-02 00:00:00|
|9e55ecf53f805f324...|             5.0|2018-04-27 00:00:00|
|7d76335e7e80375e2...|             5.0|2

In [15]:
# Number of payment rows (non-null payment_value) for each order.
(
    order_payments_df
    .groupBy("order_id")
    .agg(f.count("payment_value"))
    .show()
)

+--------------------+--------------------+
|            order_id|count(payment_value)|
+--------------------+--------------------+
|629eb58d177eb9d9e...|                   1|
|e2b9380fcb4f1f7e2...|                   1|
|a3797015424a5a231...|                   1|
|e239d280236cdd3c4...|                   1|
|f44cb69655f8e4d13...|                   2|
|9570e7230d3d4ecaa...|                   1|
|7a5472f7c8cecc2e1...|                   2|
|8eb78d502d1023cb6...|                   1|
|1d868462a56ffa7bb...|                   1|
|f8a7ddc8ce9090a26...|                   1|
|7ad26c8f2f119d814...|                   1|
|8a6927284335d25c4...|                   1|
|43ed2e201f59c4d2f...|                   1|
|bf240a39797d69697...|                   1|
|0e4672661531addf3...|                   1|
|631e74e14da293fc4...|                   1|
|faf1a9a55f20bf036...|                   2|
|84fe96b51754d901f...|                   1|
|51a5a1d89c87180ca...|                   1|
|aa1a6dd9036736db7...|          

In [16]:
# Look at the raw payment rows again before aggregating them per order.
order_payments_df.show()

+--------------------+------------------+------------+--------------------+-------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|
+--------------------+------------------+------------+--------------------+-------------+
|cf30fe76d1505192a...|                 1| credit_card|                   2|        47.72|
|a04639b53f2cbd1f7...|                 1|      boleto|                   1|        44.83|
|2e2dd1119ebf597a9...|                 1| credit_card|                   2|       105.42|
|440a666da55232dbd...|                 1| credit_card|                   3|        46.43|
|bfc9f97fc7ddacf8a...|                 1| credit_card|                   2|       229.74|
|8ff5ec04a3f395979...|                 1| credit_card|                   5|       121.78|
|d75cb3755738c4ae4...|                 1|      boleto|                   1|        37.69|
|2f9892eb8df4a437d...|                 1| credit_card|                   3|       121.09|
|5662e7f8b

In [18]:
# Payment rows sorted by order_id, so rows of the same order appear together.
(
    order_payments_df
    .orderBy("order_id")
    .show()
)

+--------------------+------------------+------------+--------------------+-------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|
+--------------------+------------------+------------+--------------------+-------------+
|00010242fe8c5a6d1...|                 1| credit_card|                   2|        72.19|
|00018f77f2f0320c5...|                 1| credit_card|                   3|       259.83|
|000229ec398224ef6...|                 1| credit_card|                   5|       216.87|
|00024acbcdf0a6daa...|                 1| credit_card|                   2|        25.78|
|00042b26cf59d7ce6...|                 1| credit_card|                   3|       218.04|
|00048cc3ae777c65d...|                 1|      boleto|                   1|        34.59|
|00054e8431b9d7675...|                 1| credit_card|                   1|        31.75|
|000576fe39319847c...|                 1| credit_card|                  10|       880.75|
|0005a1a17

In [19]:
# Collapse payments to one row per order_id.
order_payments_agg = (
    order_payments_df
    .groupBy("order_id")
    .agg(
        f.sum("payment_value").alias("total_payment_value"),
        f.countDistinct("payment_type").alias("num_payment_methods"),
        # f.first() depends on row order, which is not guaranteed after the
        # groupBy shuffle, so it could return a different payment_type between
        # runs. Taking the minimum (payment_sequential, payment_type) struct
        # deterministically picks the payment_type of the lowest
        # payment_sequential in the order.
        # NOTE(review): "primary" is interpreted here as the first payment in
        # sequence; confirm this is preferred over e.g. the largest payment_value.
        f.min(f.struct("payment_sequential", "payment_type"))
        .getField("payment_type")
        .alias("primary_payment_type"),
        f.max("payment_installments").alias("max_installments"),
    )
)

In [23]:
# Inner join: orders without any row in order_items_df are dropped, and an
# order with several item rows appears once per item row.
order_df = order_master_df.join(
    order_items_df,
    on="order_id",
    how="inner",
)

In [24]:
# Inner join: rows whose order_id has no payment aggregate are dropped.
# NOTE(review): the payment columns are per order, so they repeat on every
# item row of the same order; summing total_payment_value across rows of
# this table would overcount multi-item orders.
orders_items_payment_df = order_df.join(
    order_payments_agg,
    on="order_id",
    how="inner",
)

In [25]:
# Left join: rows are kept even when the order has no review; the review
# columns are null in that case.
core_orders_df = orders_items_payment_df.join(
    order_reviews_agg,
    on="order_id",
    how="left",
)

In [26]:
# Preview the joined result before writing it out.
core_orders_df.show()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+-------------------+-------------------+--------------------+----------------+----------------+-------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|order_item_id|          product_id|           seller_id|shipping_limit_date|price|freight_value|total_payment_value|num_payment_methods|primary_payment_type|max_installments|avg_review_score| latest_review_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+------

In [27]:

# Write the joined table as parquet under <core_path>/core_orders,
# replacing whatever a previous run left there.
(
    core_orders_df.write
    .mode("overwrite")
    .parquet(os.path.join(core_path, "core_orders"))
)