In [0]:
from configs.adls_config import adls_config
from configs.urls import BRONZE_LAYER_PATH, SILVER_LAYER_PATH
from pyspark.sql.functions import when, col, regexp_replace, trim
from datetime import datetime
from configs.logger import get_logger


In [0]:
# Notebook-wide logger, obtained from the project's get_logger helper under the
# name "city_logger"; the cells below use it to report pipeline progress.
logger = get_logger("city_logger")

In [0]:
# Bronze -> silver load for the city dataset: read the raw CSV, normalise the
# coordinate column names and the `state` column, then write the result as a
# Delta table under a date-stamped folder in the silver layer.

# Storage access has to be configured before any lake path is read.
adls_config(spark, dbutils)
logger.info("ADLS Configured")

# Raw city file; the first row is the header and column types are inferred.
df = spark.read.csv(
    f"{BRONZE_LAYER_PATH}city_data/city.csv",
    header=True,
    inferSchema=True,
)
logger.info(f"Loaded data from {BRONZE_LAYER_PATH}")

# Leading "thành phố" prefix that is stripped from `city` when the city name
# has to stand in for a missing state (`[ốo]` also accepts the unaccented "o").
# Both flags are needed: Spark documents regexp_replace patterns as Java
# regular expressions, where `i` alone folds case for ASCII letters only, so
# fully upper-cased input such as "THÀNH PHỐ ..." would be left untouched
# without `u` (Unicode-aware case folding).
city_prefix_pattern = r"(?iu)^thành\s+ph[ốo]\s*"

# Fallback used when `state` is null: the city name without the prefix.
state_from_city = trim(regexp_replace(col("city"), city_prefix_pattern, ""))
# Otherwise keep the state, dropping the literal word "Province" and any
# whitespace left around it.
state_without_suffix = trim(regexp_replace(col("state"), "Province", ""))

df_cleaned = (
    df
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lon", "longitude")
    .withColumn(
        "state",
        when(col("state").isNull(), state_from_city).otherwise(state_without_suffix),
    )
    # `city` is only needed to derive `state`; it is not carried to silver.
    .select("country", "state", "latitude", "longitude")
)
logger.info("Data cleaned")

# The output folder is stamped with the Python process's local date (naive
# datetime, no timezone); "%Y/%m/%d" yields nested folders, e.g. dt=2024/01/15/.
current_date_path = datetime.now().strftime("%Y/%m/%d")
silver_output_path = f"{SILVER_LAYER_PATH}city_data_cleaned/dt={current_date_path}/"

# NOTE(review): with mode("append"), re-running this cell on the same day adds
# a second copy of every row to the same Delta path - confirm whether
# "overwrite" is what a daily snapshot should use.
df_cleaned.write.format("delta").mode("append").save(silver_output_path)
logger.info(f"Data written to {silver_output_path}")