In [5]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import os

In [6]:
# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Create core customers parquet")
    .getOrCreate()
)

In [7]:
# Root folder of the local warehouse. The default is the original developer
# machine location; set the RETAIL_WAREHOUSE_PATH environment variable to run
# the notebook against a different checkout without editing this cell.
base_path = os.environ.get(
    "RETAIL_WAREHOUSE_PATH",
    r"C:\Users\vasudha.tanniru\Documents\GitHub\data_projects\retail_data_warehouse\warehouse",
)
# Layer folders under the warehouse root: "staging" is read from, "core" is
# both read from (geolocation lookup) and written to by later cells.
staging_path = os.path.join(base_path, "staging")
core_path = os.path.join(base_path, "core")

In [8]:
# Load the staged customers table from the staging layer.
staging_customers_path = os.path.join(staging_path, "customers")
customers_df = spark.read.parquet(staging_customers_path)

In [9]:
# Preview the staged customers (first 20 rows, long values truncated).
customers_df.show(n=20, truncate=True)

+--------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------------------+--------------------+--------------+
|5e7aa73120cbc53fa...|cd8738980c3332339...|                   23810|             itaguai|            RJ|
|1fda81133d4f067f7...|31973584858337356...|                   30170|      belo horizonte|            MG|
|ba4ec83bdd2a861c3...|8479ef9838c9bca6f...|                   15460|                icem|            SP|
|e8a2dab1e442d28c1...|43fcee98a4442ab2f...|                   66093|               belem|            PA|
|8ca53db6e60cffad0...|b3d23d61fb01dde31...|                   31710|      belo horizonte|            MG|
|7117d38240c00f274...|e2f1970303ff34a3a...|                    1229|           sao paulo|            SP|
|a3a21bf032df1c3f4...|a0488912f3470a18b...|            

In [10]:
# Total number of staged customer rows; the bare name below displays it.
customer_row_count = customers_df.count()
customer_row_count


99441

In [11]:
# Count rows after removing fully identical rows; a result equal to the plain
# row count means the staged table contains no exact duplicate rows.
customers_df.distinct().count()


99441

In [14]:
# List customer_unique_id values that occur on more than one row.
(
    customers_df
    .groupBy("customer_unique_id")
    .count()
    .where(f.col("count") > 1)
    .show()
)

+--------------------+-----+
|  customer_unique_id|count|
+--------------------+-----+
|7e7301841ddb4064c...|    2|
|7b0eaf68a16e4808e...|    2|
|c85df1c6ef6f7bb60...|    2|
|216ab90e27f18940c...|    2|
|7e4bebe20140a71b3...|    2|
|7462a753a77ed0933...|    2|
|5a2e847dd085d36e3...|    2|
|4702ba5faa8283e0f...|    2|
|26e025af2347c3968...|    2|
|4e3b6c25502ef69e5...|    2|
|a35e6ad969a429859...|    2|
|453c2895f29b6d9a4...|    2|
|bce006e903be688f1...|    2|
|cbe71e4cb0b82b97b...|    2|
|98a8081d6f922b47f...|    2|
|14a188558af6cd5bc...|    2|
|4011b7579e894fa92...|    2|
|4b384b778ebc0449d...|    2|
|1d2435aa3b858d45c...|    2|
|f31d2c22ddcad145e...|    2|
+--------------------+-----+
only showing top 20 rows


In [15]:
# Load the geolocation lookup from the core layer.
core_geolocation_path = os.path.join(core_path, "core_geolocation")
geolocation_summary_df = spark.read.parquet(core_geolocation_path)

In [16]:
# Preview the geolocation lookup (first 20 rows, long values truncated).
geolocation_summary_df.show(n=20, truncate=True)

+---------------------------+----------------+-----------------+-------------------+-------------------+
|geolocation_zip_code_prefix|geolocation_city|geolocation_state|avg_geolocation_lat|avg_geolocation_lng|
+---------------------------+----------------+-----------------+-------------------+-------------------+
|                       1001|       sao paulo|               sp|-23.550226514316524| -46.63403856911852|
|                       1002|       sao paulo|               sp|-23.547657125819924|-46.634990509252304|
|                       1003|       sao paulo|               sp|-23.548999834572523|  -46.6355816587524|
|                       1004|       sao paulo|               sp| -23.54982914119189|-46.634792029508745|
|                       1005|       sao paulo|               sp|-23.549547473955094| -46.63640612884047|
|                       1006|       sao paulo|               sp|-23.550126527377728|-46.636044497481514|
|                       1007|       sao paulo|         

In [17]:
# Attach geolocation attributes to each customer by zip-code prefix. The left
# join keeps customers whose prefix has no match in the geolocation lookup.
# NOTE(review): if core_geolocation can hold more than one row per zip prefix,
# this join duplicates customer rows -- confirm the lookup's key uniqueness.
zip_prefix_matches = (
    customers_df["customer_zip_code_prefix"]
    == geolocation_summary_df["geolocation_zip_code_prefix"]
)
customers_enriched_df = customers_df.join(
    geolocation_summary_df,
    on=zip_prefix_matches,
    how="left",
)

In [18]:
# Preview the joined customers + geolocation frame.
customers_enriched_df.show(n=20, truncate=True)

+--------------------+--------------------+------------------------+--------------------+--------------+---------------------------+--------------------+-----------------+-------------------+-------------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|geolocation_zip_code_prefix|    geolocation_city|geolocation_state|avg_geolocation_lat|avg_geolocation_lng|
+--------------------+--------------------+------------------------+--------------------+--------------+---------------------------+--------------------+-----------------+-------------------+-------------------+
|5e7aa73120cbc53fa...|cd8738980c3332339...|                   23810|             itaguai|            RJ|                      23810|             itaguai|               rj|-22.871223439317493| -43.76920187277879|
|1fda81133d4f067f7...|31973584858337356...|                   30170|      belo horizonte|            MG|                      30170|      belo horizonte

In [20]:
# Backfill missing customer city/state from the geolocation columns joined in
# by the earlier cell. f.coalesce returns the first non-null argument, so a
# customer's own value always wins and the lookup is used only as a fallback.
# The fallback state is upper-cased because the previews show customer_state
# as upper-case codes ("RJ") while geolocation_state is lower-case ("rj");
# without this, backfilled rows would mix casing within one column.
customers_enriched_df = (
    customers_enriched_df
    .withColumn(
        "customer_city",
        f.coalesce(f.col("customer_city"), f.col("geolocation_city")),
    )
    .withColumn(
        "customer_state",
        f.coalesce(f.col("customer_state"), f.upper(f.col("geolocation_state"))),
    )
)

In [21]:
# Preview the frame after the city/state backfill.
customers_enriched_df.show(n=20, truncate=True)

+--------------------+--------------------+------------------------+--------------------+--------------+---------------------------+--------------------+-----------------+-------------------+-------------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|geolocation_zip_code_prefix|    geolocation_city|geolocation_state|avg_geolocation_lat|avg_geolocation_lng|
+--------------------+--------------------+------------------------+--------------------+--------------+---------------------------+--------------------+-----------------+-------------------+-------------------+
|5e7aa73120cbc53fa...|cd8738980c3332339...|                   23810|             itaguai|            RJ|                      23810|             itaguai|               rj|-22.871223439317493| -43.76920187277879|
|1fda81133d4f067f7...|31973584858337356...|                   30170|      belo horizonte|            MG|                      30170|      belo horizonte

In [23]:
# Copy the averaged coordinates into shorter column names; the long-named
# source columns are still present after this cell.
for short_name, source_name in (
    ("avg_lat", "avg_geolocation_lat"),
    ("avg_lng", "avg_geolocation_lng"),
):
    customers_enriched_df = customers_enriched_df.withColumn(short_name, f.col(source_name))

In [24]:
# Preview the frame with the new avg_lat / avg_lng columns.
customers_enriched_df.show(n=20, truncate=True)

+--------------------+--------------------+------------------------+--------------------+--------------+---------------------------+--------------------+-----------------+-------------------+-------------------+-------------------+-------------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|geolocation_zip_code_prefix|    geolocation_city|geolocation_state|avg_geolocation_lat|avg_geolocation_lng|            avg_lat|            avg_lng|
+--------------------+--------------------+------------------------+--------------------+--------------+---------------------------+--------------------+-----------------+-------------------+-------------------+-------------------+-------------------+
|5e7aa73120cbc53fa...|cd8738980c3332339...|                   23810|             itaguai|            RJ|                      23810|             itaguai|               rj|-22.871223439317493| -43.76920187277879|-22.871223439317493| -43.76920187

In [25]:
# Remove the geolocation helper columns now that their values have been
# folded into the customer_* and avg_lat / avg_lng columns.
geolocation_only_columns = [
    "geolocation_zip_code_prefix",
    "geolocation_city",
    "geolocation_state",
    "avg_geolocation_lat",
    "avg_geolocation_lng",
]
customers_enriched_df = customers_enriched_df.drop(*geolocation_only_columns)


In [27]:
# Print the column names and types of the final frame before it is written out.
customers_enriched_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- avg_lat: double (nullable = true)
 |-- avg_lng: double (nullable = true)



In [28]:

# Persist the enriched customers as the core_customers table. The target is
# derived from core_path (the same root the geolocation lookup was read from)
# instead of repeating the absolute warehouse path, so it follows any change
# to the path configuration cell.
# coalesce(1) reduces the frame to a single partition before writing, and
# mode("overwrite") replaces any existing data at the target path.
core_customers_path = os.path.join(core_path, "core_customers")
customers_enriched_df.coalesce(1).write.mode("overwrite").parquet(core_customers_path)