In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Environment parameter: a notebook text widget named "ENV" whose default is "dev".
dbutils.widgets.text("ENV", "dev")
ENV = dbutils.widgets.get("ENV").strip().lower()

# Fail fast when the normalized value is not one of the two supported environments.
if not (ENV == "dev" or ENV == "prod"):
    raise ValueError("ENV debe ser 'dev' o 'prod'")

In [0]:
# Environment-specific settings: the catalog name and the ADLS storage account.
# Any ENV value other than "dev" selects the production pair.
CATALOG, ADLS_ACCOUNT = (
    ("dev-adventureworks", "adsldevadventureworks")
    if ENV == "dev"
    else ("prod-adventureworks", "adslprodadventureworks")
)

# Echo the resolved configuration so the run output shows which environment was used.
print("ENV:", ENV)
print("CATALOG:", CATALOG)
print("ADLS_ACCOUNT:", ADLS_ACCOUNT)

In [0]:
# Select the environment's catalog and its silver schema as the session defaults.
spark.sql(
    f"USE CATALOG `{CATALOG}`"
)
spark.sql("USE SCHEMA silver_schema")

**Transformar product_silver**

In [0]:
# Read the silver product table from the selected catalog and preview it.
df_product_gold = spark.table(
    f"`{CATALOG}`.silver_schema.product_silver"
)
df_product_gold.display()

In [0]:
# Split product_name on commas: the first piece becomes "brand" and the second
# becomes "aditional"; both are trimmed of surrounding whitespace.
df_product_gold = (
    df_product_gold
    .withColumn("brand", trim(split(col("product_name"), ",")[0]))
    .withColumn("aditional", trim(split(col("product_name"), ",")[1]))
)
df_product_gold.display()

In [0]:
# Discard every row that has a null in any column.
# NOTE(review): this includes the derived "brand"/"aditional" columns, so products whose
# name has no second comma-separated piece are presumably dropped as well — confirm intended.
df_product_gold = df_product_gold.na.drop()
df_product_gold.display()


**Transformar region_silver**

In [0]:
# Read the silver region table from the selected catalog and preview it.
df_region_gold = spark.table(
    f"`{CATALOG}`.silver_schema.region_silver"
)
df_region_gold.display()

In [0]:
# Upper-case the Region, Country and Group values, then rename the columns
# to their lower-case gold names.
df_region_gold = (
    df_region_gold
    .withColumn("Region", upper(col("Region")))
    .withColumn("Country", upper(col("Country")))
    .withColumn("Group", upper(col("Group")))
    .withColumnRenamed("SalesTerritoryKey", "salesterritory_id")
    .withColumnRenamed("Region", "region")
    .withColumnRenamed("Country", "country")
    .withColumnRenamed("Group", "group")
)
df_region_gold.display()

**Transformar Reseller**

In [0]:
# Read the silver reseller table from the selected catalog and preview it.
df_reseller_gold = spark.table(
    f"`{CATALOG}`.silver_schema.reseller_silver"
)
df_reseller_gold.display()

In [0]:
# Build "full_address" by joining city, state/province and country/region with ", ",
# then rename the columns to their lower snake_case gold names.
#
# The last two renames use "State_Province" and "Country_Region", the same spellings
# that full_address reads. withColumnRenamed does nothing when the source column is
# missing, so the earlier "StateProvince"/"CountryRegion" spellings could not have
# matched the columns this cell already depends on, and those columns kept their
# original names.
df_reseller_gold = (
    df_reseller_gold
    .withColumn(
        "full_address",
        concat_ws(", ", col("City"), col("State_Province"), col("Country_Region")),
    )
    .withColumnRenamed("ResellerKey", "reseller_id")
    .withColumnRenamed("Business_Type", "business_type")
    .withColumnRenamed("Reseller", "reseller")
    .withColumnRenamed("City", "city")
    .withColumnRenamed("State_Province", "state_province")
    .withColumnRenamed("Country_Region", "country_region")
)
df_reseller_gold.display()

**Transformar Sales**

In [0]:
# Read the silver sales table from the selected catalog and preview it.
df_sales_gold = spark.table(
    f"`{CATALOG}`.silver_schema.sales_silver"
)
df_sales_gold.display()

In [0]:
# Add a "profit/loss" column (sales minus cost, rounded to 2 decimals) to show whether
# the business is profitable: negative means a loss, positive means a gain.
df_sales_gold = df_sales_gold.withColumn(
    "profit/loss",
    round(col("sales") - col("cost"), 2),
)
df_sales_gold.display()

**Transformar Sales person**

In [0]:
# Read the silver salesperson table from the selected catalog and preview it.
df_salesperson_gold = spark.table(
    f"`{CATALOG}`.silver_schema.salesperson_silver"
)
df_salesperson_gold.display()

In [0]:
# Derive "user_domain" from the email address. Item 0 of the split is the text
# before the "@", i.e. the user part of the address.
df_salesperson_gold = df_salesperson_gold.withColumn(
    "user_domain",
    split(col("email"), "@").getItem(0),
)
df_salesperson_gold.display()

**Sales person region**

In [0]:
# Read the silver salesperson-region table from the selected catalog and preview it.
df_salespersonregion_gold = spark.table(
    f"`{CATALOG}`.silver_schema.salespersonregion_silver"
)
df_salespersonregion_gold.display()

In [0]:
# Rename the two key columns to their gold names.
df_salespersonregion_gold = (
    df_salespersonregion_gold
    .withColumnRenamed("EmployeeKey", "salesperson_id")
    .withColumnRenamed("SalesTerritoryKey", "salesterritory_id")
)
df_salespersonregion_gold.display()

**Transformar Targets**

In [0]:
# Read the silver targets table from the selected catalog and preview it.
df_targets_gold = spark.table(
    f"`{CATALOG}`.silver_schema.targets_silver"
)
df_targets_gold.display()


In [0]:
# Rename the targets columns to their lower snake_case gold names.
# The chain is parenthesised instead of using backslash continuations: the previous
# version ended with a dangling "\" that only parsed because a whitespace-only line
# followed it, so deleting that line would have spliced in the display() call below.
# NOTE(review): EmployeeKey becomes "employee_id" here but "salesperson_id" in the
# salesperson-region table — confirm the differing key names are intended.
df_targets_gold = (
    df_targets_gold
    .withColumnRenamed("EmployeeKey", "employee_id")
    .withColumnRenamed("Target", "target")
    .withColumnRenamed("TargetMonth", "target_month")
)
df_targets_gold.display()


In [0]:
def escribir_to_gold_catalog(df, table_name):
  """Save ``df`` as a Delta table in the gold schema of the active catalog.

  Args:
    df: DataFrame to persist.
    table_name: Unqualified table name; it is placed under
      `` `{CATALOG}`.gold_schema ``.

  The write uses "overwrite" mode with ``overwriteSchema`` enabled, so both the
  existing data and the existing schema of the target table are replaced.
  """
  target_table = f"`{CATALOG}`.gold_schema.{table_name}"
  delta_writer = df.write.mode("overwrite").format("delta")
  delta_writer = delta_writer.option("overwriteSchema", True)
  delta_writer.saveAsTable(target_table)

# Publish every gold DataFrame to the catalog, in the same order as before.
for _gold_df, _gold_table in (
    (df_product_gold, "product_gold"),
    (df_region_gold, "region_gold"),
    (df_reseller_gold, "reseller_gold"),
    (df_sales_gold, "sales_gold"),
    (df_salesperson_gold, "salesperson_gold"),
    (df_salespersonregion_gold, "salespersonregion_gold"),
    (df_targets_gold, "targets_gold"),
):
    escribir_to_gold_catalog(_gold_df, _gold_table)



In [0]:
# Root of the "gold" container in the environment's ADLS storage account.
gold_path = f"abfss://gold@{ADLS_ACCOUNT}.dfs.core.windows.net"

# Write each gold DataFrame as a Delta table into its own folder under gold_path.
# One loop replaces seven copy-pasted write statements. overwriteSchema is enabled to
# match escribir_to_gold_catalog: without it, a schema change in any transformation
# makes this path write fail after the catalog write has already succeeded, leaving
# the two gold copies out of sync.
for _gold_df, _gold_folder in (
    (df_product_gold, "product_gold"),
    (df_region_gold, "region_gold"),
    (df_reseller_gold, "reseller_gold"),
    (df_sales_gold, "sales_gold"),
    (df_salesperson_gold, "salesperson_gold"),
    (df_salespersonregion_gold, "salespersonregion_gold"),
    (df_targets_gold, "targets_gold"),
):
    (
        _gold_df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", True)
        .save(f"{gold_path}/{_gold_folder}")
    )



In [0]:
# Switch the session defaults to the gold schema of the environment's catalog.
spark.sql(
    f"USE CATALOG `{CATALOG}`"
)
spark.sql("USE SCHEMA gold_schema")

In [0]:
%sql
-- Inspect the product_gold table (resolved against the current catalog and schema):
-- its columns plus the extended table metadata.
DESCRIBE EXTENDED product_gold;

