In [89]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import os
import unicodedata
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [90]:
# Root of the warehouse directory tree. When the RETAIL_WAREHOUSE_PATH
# environment variable is set (and non-empty) it overrides the default
# location, so the notebook can run on other machines; when it is unset the
# original path is used and behavior is unchanged.
base_path = os.environ.get("RETAIL_WAREHOUSE_PATH") or (
    r"C:\Users\vasudha.tanniru\Documents\GitHub\data_projects\retail_data_warehouse\warehouse"
)
# Layer directories derived from the warehouse root.
staging_path = os.path.join(base_path, "staging")
core_path = os.path.join(base_path, "core")

In [91]:
# Obtain (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Create Core Layer")
    .getOrCreate()
)

In [92]:
# List the Parquet part files present in the geolocation staging directory.
# The loop variable is named `entry` so it does not visually collide with the
# `f` alias used for pyspark.sql.functions.
geolocation_staging_dir = os.path.join(staging_path, "geolocation")
file_name = [
    entry
    for entry in os.listdir(geolocation_staging_dir)
    if entry.endswith(".parquet")
]
print(file_name)

['part-00000-fca9c3ab-a3c7-4706-a3df-dbffd37cee50-c000.snappy.parquet']


In [93]:
# Load the staged geolocation data from its Parquet directory.
geolocation_parquet_dir = os.path.join(staging_path, "geolocation")
geolocation_df = spark.read.parquet(geolocation_parquet_dir)

In [94]:
# Preview the first 20 rows of the staged data (values truncated for display).
geolocation_df.show(n=20, truncate=True)

+---------------------------+-------------------+-------------------+----------------+-----------------+
|geolocation_zip_code_prefix|    geolocation_lat|    geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+-------------------+-------------------+----------------+-----------------+
|                       1001|-23.550497706907514| -46.63433817805407|       sao paulo|               SP|
|                       1046| -23.54711381127012| -46.64569331848347|       sao paulo|               SP|
|                       1044|-23.546454307519205|-46.640845320560956|       sao paulo|               SP|
|                       1038|-23.544139809596672| -46.64041077352986|       sao paulo|               SP|
|                       1050|-23.549107326677785|-46.644104604074144|       sao paulo|               SP|
|                       1107|-23.521656419429938| -46.63442349713816|       são paulo|               SP|
|                       1124|-23.527821688037392|-46.63

In [95]:
# Count how many rows share each zip code prefix to check for duplicates.
(
    geolocation_df
    .groupBy("geolocation_zip_code_prefix")
    .agg(f.count("*").alias("check_cnt"))
    .show()
)

+---------------------------+---------+
|geolocation_zip_code_prefix|check_cnt|
+---------------------------+---------+
|                       1238|       72|
|                       3918|       38|
|                       5518|       25|
|                       2122|       21|
|                       4101|       28|
|                       2366|       18|
|                       4935|       14|
|                       3175|       27|
|                       2866|       31|
|                       6654|      128|
|                       9852|       79|
|                       6620|       50|
|                       6397|       81|
|                       7240|      138|
|                       8592|       18|
|                      12940|      187|
|                      15790|       12|
|                      13285|       42|
|                      18051|      154|
|                      18800|      239|
+---------------------------+---------+
only showing top 20 rows


In [96]:
# Inspect the rows for one zip code prefix that appears many times.
is_zip_1238 = geolocation_df["geolocation_zip_code_prefix"] == 1238
geolocation_df.where(is_zip_1238).show()

+---------------------------+-------------------+-------------------+----------------+-----------------+
|geolocation_zip_code_prefix|    geolocation_lat|    geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+-------------------+-------------------+----------------+-----------------+
|                       1238| -23.54323868858565| -46.65731599111321|       sao paulo|               SP|
|                       1238|-23.540473561198002| -46.65054819983838|       sao paulo|               SP|
|                       1238|-23.544552173754898|-46.654369789485784|       sao paulo|               SP|
|                       1238|   -23.539771561198| -46.65032214155458|       sao paulo|               SP|
|                       1238|-23.543396632263747| -46.65715435665641|       são paulo|               SP|
|                       1238| -23.54173669441403| -46.65963712155927|       sao paulo|               SP|
|                       1238|-23.543927088810687| -46.6

In [97]:
# Trim surrounding whitespace and lower-case the city and state columns.
geolocation_df = (
    geolocation_df
    .withColumn("geolocation_city", f.lower(f.trim(f.col("geolocation_city"))))
    .withColumn("geolocation_state", f.lower(f.trim(f.col("geolocation_state"))))
)

In [98]:
# Preview the data after trimming and lower-casing city/state.
geolocation_df.show(n=20, truncate=True)

+---------------------------+-------------------+-------------------+----------------+-----------------+
|geolocation_zip_code_prefix|    geolocation_lat|    geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+-------------------+-------------------+----------------+-----------------+
|                       1001|-23.550497706907514| -46.63433817805407|       sao paulo|               sp|
|                       1046| -23.54711381127012| -46.64569331848347|       sao paulo|               sp|
|                       1044|-23.546454307519205|-46.640845320560956|       sao paulo|               sp|
|                       1038|-23.544139809596672| -46.64041077352986|       sao paulo|               sp|
|                       1050|-23.549107326677785|-46.644104604074144|       sao paulo|               sp|
|                       1107|-23.521656419429938| -46.63442349713816|       são paulo|               sp|
|                       1124|-23.527821688037392|-46.63

In [99]:
def remove_accents(text):
    """Return ``text`` reduced to its ASCII characters after NFKD decomposition.

    The string is decomposed with Unicode NFKD so that accented letters split
    into a base letter plus combining marks; every non-ASCII code point is
    then discarded. Characters with no ASCII decomposition are dropped.

    Args:
        text: The string to clean, or ``None``.

    Returns:
        The ASCII-only string, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return None
    decomposed = unicodedata.normalize('NFKD', text)
    # Keep only code points in the ASCII range (0-127).
    return "".join(char for char in decomposed if ord(char) < 128)

In [100]:
# Wrap remove_accents as a Spark UDF that returns a string column.
remove_accents_udf = udf(remove_accents, returnType=StringType())

In [101]:
# Strip accents from the city and state columns, in that order.
for text_column in ("geolocation_city", "geolocation_state"):
    geolocation_df = geolocation_df.withColumn(
        text_column,
        remove_accents_udf(f.col(text_column)),
    )

In [102]:
# Preview the data after accent removal on city/state.
geolocation_df.show(n=20, truncate=True)

+---------------------------+-------------------+-------------------+----------------+-----------------+
|geolocation_zip_code_prefix|    geolocation_lat|    geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+-------------------+-------------------+----------------+-----------------+
|                       1001|-23.550497706907514| -46.63433817805407|       sao paulo|               sp|
|                       1046| -23.54711381127012| -46.64569331848347|       sao paulo|               sp|
|                       1044|-23.546454307519205|-46.640845320560956|       sao paulo|               sp|
|                       1038|-23.544139809596672| -46.64041077352986|       sao paulo|               sp|
|                       1050|-23.549107326677785|-46.644104604074144|       sao paulo|               sp|
|                       1107|-23.521656419429938| -46.63442349713816|       sao paulo|               sp|
|                       1124|-23.527821688037392|-46.63

In [103]:
# Collapse to one row per (zip prefix, city, state) with the mean latitude and
# longitude of the grouped rows, ordered by zip prefix.
summary_keys = ["geolocation_zip_code_prefix", "geolocation_city", "geolocation_state"]
mean_latitude = f.avg("geolocation_lat").alias("avg_geolocation_lat")
mean_longitude = f.avg("geolocation_lng").alias("avg_geolocation_lng")
geolocation_summary_df = (
    geolocation_df
    .groupBy(summary_keys)
    .agg(mean_latitude, mean_longitude)
    .orderBy("geolocation_zip_code_prefix")
)

In [104]:
# Preview the first 20 rows of the aggregated summary.
geolocation_summary_df.show(n=20, truncate=True)

+---------------------------+----------------+-----------------+-------------------+-------------------+
|geolocation_zip_code_prefix|geolocation_city|geolocation_state|avg_geolocation_lat|avg_geolocation_lng|
+---------------------------+----------------+-----------------+-------------------+-------------------+
|                       1001|       sao paulo|               sp|-23.550226514316524| -46.63403856911852|
|                       1002|       sao paulo|               sp|-23.547657125819924|-46.634990509252304|
|                       1003|       sao paulo|               sp|-23.548999834572523|  -46.6355816587524|
|                       1004|       sao paulo|               sp| -23.54982914119189|-46.634792029508745|
|                       1005|       sao paulo|               sp|-23.549547473955094| -46.63640612884047|
|                       1006|       sao paulo|               sp|-23.550126527377728|-46.636044497481514|
|                       1007|       sao paulo|         

In [106]:
# Show 30 distinct city names from the summary, without truncating values.
distinct_cities_df = geolocation_summary_df.select("geolocation_city").distinct()
distinct_cities_df.show(30, truncate=False)

+-----------------------+
|geolocation_city       |
+-----------------------+
|camacari               |
|arapiraca              |
|gloria                 |
|itaberaba              |
|redencao da serra      |
|igrejinha              |
|bacaxa                 |
|jangada                |
|pote                   |
|divino de sao lourenco |
|vermelho               |
|barracao               |
|jijoca de jericoacoara |
|divino das laranjeiras |
|dilermando de aguiar   |
|cacaratiba             |
|aguas de sao pedro     |
|iepe                   |
|boa vista              |
|fama                   |
|guaranta               |
|cachoeira              |
|sao romao              |
|itumirim               |
|estrela de alagoas     |
|centro novo do maranhao|
|cerro grande           |
|brusque                |
|maria helena           |
|coracao de jesus       |
+-----------------------+
only showing top 30 rows


In [107]:
# Report how many distinct city names remain after cleaning.
distinct_city_count = (
    geolocation_df
    .select("geolocation_city")
    .distinct()
    .count()
)
print("Distinct cities:", distinct_city_count)


Distinct cities: 5967


In [108]:
# Write the summary to the core layer, overwriting any previous output.
# coalesce(1) reduces the data to a single partition before writing.
# The target is built from core_path instead of repeating the absolute
# warehouse location, so it stays in sync with the path configuration.
core_geolocation_path = os.path.join(core_path, "core_geolocation")
geolocation_summary_df.coalesce(1).write.mode("overwrite").parquet(core_geolocation_path)