In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import os

In [2]:
base_path = r"C:\Users\vasudha.tanniru\Documents\GitHub\data_projects\retail_data_warehouse\warehouse"
staging_path = os.path.join(base_path,"staging")
core_path = os.path.join(base_path,"core")

In [4]:
spark = SparkSession.builder.appName("Create Core Sellers").getOrCreate()


In [6]:
sellers_df = spark.read.parquet(os.path.join(staging_path,"sellers"))

In [7]:
sellers_df.show()

+--------------------+----------------------+--------------------+------------+
|           seller_id|seller_zip_code_prefix|         seller_city|seller_state|
+--------------------+----------------------+--------------------+------------+
|f6122bc84774df1b3...|                  6436|             barueri|          SP|
|8132b9bd16876e1b0...|                 88058|       florianopolis|          SC|
|a254c682cc01e119f...|                 16500|          cafelandia|          SP|
|cb9fb4ca75d7ba843...|                  3551|           sao paulo|          SP|
|c5ebe6598748b0aea...|                 95910|             lajeado|          RS|
|bd15ee794d5e640d9...|                 14078|      ribeirao preto|          SP|
|7994b065a7ffb14e7...|                 29142|      cariacica / es|          ES|
|67883baaae6134ee8...|                  1310|           sao paulo|          SP|
|f52c2422904463fdd...|                  9230|santo andre/sao p...|          SP|
|528bcf6680c36dddf...|                  

In [8]:
print("Before dedup:",sellers_df.count(),"After:",sellers_df.dropDuplicates().count())

Before dedup: 3095 After: 3095


In [11]:
sellers_df.select(
    [f.count(f.when(f.col(c).isNull(), c)).alias(c) for c in sellers_df.columns]
).show()


+---------+----------------------+-----------+------------+
|seller_id|seller_zip_code_prefix|seller_city|seller_state|
+---------+----------------------+-----------+------------+
|        0|                     0|          0|           0|
+---------+----------------------+-----------+------------+



In [12]:
sellers_df.groupBy("seller_id").count().filter("count>1").show()

+---------+-----+
|seller_id|count|
+---------+-----+
+---------+-----+



In [13]:
geolocation_summary_df = spark.read.parquet(os.path.join(core_path,"core_geolocation"))

In [14]:
core_sellers_df = (
                    sellers_df.join(
                      geolocation_summary_df, sellers_df.seller_zip_code_prefix == geolocation_summary_df.geolocation_zip_code_prefix,"left"
                    )
)

In [15]:
core_sellers_df.show()

+--------------------+----------------------+--------------------+------------+---------------------------+--------------------+-----------------+-------------------+-------------------+
|           seller_id|seller_zip_code_prefix|         seller_city|seller_state|geolocation_zip_code_prefix|    geolocation_city|geolocation_state|avg_geolocation_lat|avg_geolocation_lng|
+--------------------+----------------------+--------------------+------------+---------------------------+--------------------+-----------------+-------------------+-------------------+
|f6122bc84774df1b3...|                  6436|             barueri|          SP|                       6436|             barueri|               sp|-23.537663121440325|-46.888915425858364|
|8132b9bd16876e1b0...|                 88058|       florianopolis|          SC|                      88058|florian&oacute;polis|               sc|-27.450674474897465| -48.40169285789716|
|8132b9bd16876e1b0...|                 88058|       florianopolis

In [18]:
core_sellers_df = (
    core_sellers_df
    .withColumn(
        "seller_city",
        f.when(f.col("seller_city").isNull(), f.col("geolocation_city"))
         .otherwise(f.col("seller_city"))
    )
    .withColumn(
        "seller_state",
        f.when(f.col("seller_state").isNull(), f.col("geolocation_state"))
         .otherwise(f.col("seller_state"))
    )
)


In [19]:
core_sellers_df.show()

+--------------------+----------------------+--------------------+------------+---------------------------+--------------------+-----------------+-------------------+-------------------+
|           seller_id|seller_zip_code_prefix|         seller_city|seller_state|geolocation_zip_code_prefix|    geolocation_city|geolocation_state|avg_geolocation_lat|avg_geolocation_lng|
+--------------------+----------------------+--------------------+------------+---------------------------+--------------------+-----------------+-------------------+-------------------+
|f6122bc84774df1b3...|                  6436|             barueri|          SP|                       6436|             barueri|               sp|-23.537663121440325|-46.888915425858364|
|8132b9bd16876e1b0...|                 88058|       florianopolis|          SC|                      88058|florian&oacute;polis|               sc|-27.450674474897465| -48.40169285789716|
|8132b9bd16876e1b0...|                 88058|       florianopolis

In [21]:
core_sellers_df = (
    core_sellers_df
    .withColumn("avg_lat", f.col("avg_geolocation_lat"))
    .withColumn("avg_lng", f.col("avg_geolocation_lng"))
)


In [22]:
core_sellers_df = core_sellers_df.drop(
    "geolocation_zip_code_prefix",
    "geolocation_city",
    "geolocation_state",
    "avg_geolocation_lat",
    "avg_geolocation_lng"
)


In [23]:
core_sellers_df.coalesce(1).write.mode("overwrite").parquet(
    r"C:\Users\vasudha.tanniru\Documents\GitHub\data_projects\retail_data_warehouse\warehouse\core\core_sellers"
)
