In [0]:

# ====================
# 1. Set up credentials
# ====================
spark.conf.set(
    "fs.azure.account.key.mmixstorage.blob.core.windows.net",
    "UZTHs33FPYTUvC9G51zk+DQQp/FWf31YOteoW+dEnKuprRgxvk53yS+IpEiLn1062IBpOyoKaXp4+AStRcA1Cw=="
)

# ====================
# 2. Get notebook parameters
# ====================
dbutils.widgets.text("input_path", "")
dbutils.widgets.text("output_path", "")
dbutils.widgets.text("group_col", "")
dbutils.widgets.text("date_col", "")
dbutils.widgets.text("target_col", "")
dbutils.widgets.text("transformations", "{}")

import json
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import DateType

def sql_safe(colname):
    return f"`{colname}`"

# Load parameters
input_path = dbutils.widgets.get("input_path")
output_path = dbutils.widgets.get("output_path")
group_col = dbutils.widgets.get("group_col")
date_col = dbutils.widgets.get("date_col")
target_col = dbutils.widgets.get("target_col")
transformations = json.loads(dbutils.widgets.get("transformations"))

print("Transformations:")
print(json.dumps(transformations, indent=2))

# ====================
# 3. Read input CSV
# ====================
df = spark.read.option("header", True).option("inferSchema", True).csv(
    f"wasbs://mmix-blob-storage@mmixstorage.blob.core.windows.net/{input_path}"
)

# Convert date column to DateType
df = df.withColumn(date_col, F.to_timestamp(F.col(date_col), "M/d/yyyy H:mm"))

# ====================
# 4. Apply adstock + saturation
# ====================
for col, trans in transformations.items():
    lags = int(trans.get("lags", 0))
    decay = float(trans.get("decay", 0.0))

    transformed_col = f"{col}_transformed"

    # ----- Apply Adstock -----
    if lags > 0 and decay > 0:
        for i in range(1, lags + 1):
            df = df.withColumn(f"{col}_lag{i}", F.lag(col, i).over(Window.partitionBy(group_col).orderBy(date_col)))

        expr = f"COALESCE({sql_safe(col)}, 0)"
        for i in range(1, lags + 1):
            expr += f" + POWER({decay}, {i}) * COALESCE({sql_safe(col + '_lag' + str(i))}, 0)"

        df = df.withColumn(transformed_col, F.expr(f"({expr})"))
        df = df.drop(*[f"{col}_lag{i}" for i in range(1, lags + 1)])
    else:
        df = df.withColumn(transformed_col, F.col(col))

    # ----- Apply Saturation -----
    saturation = trans.get("saturation")
    if saturation:
        mode = saturation.get("mode", "power")
        k = float(saturation.get("k", 1))

        if mode == "log":
            df = df.withColumn(transformed_col, F.pow(F.log1p(F.col(transformed_col)), k))
        else:
            df = df.withColumn(transformed_col, F.pow(F.col(transformed_col), k))

    # ----- Overwrite original column -----
    # df = df.drop(col).withColumnRenamed(transformed_col, col)

# ====================
# 5. Write transformed CSV to output path
# ====================
df.show()
df.printSchema()

df.write.mode("overwrite").option("header", True).csv(
    f"wasbs://mmix-blob-storage@mmixstorage.blob.core.windows.net/{output_path}"
)


Transformations:
{}


[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-7518137628170193>, line 46[0m
[1;32m     41[0m df [38;5;241m=[39m spark[38;5;241m.[39mread[38;5;241m.[39moption([38;5;124m"[39m[38;5;124mheader[39m[38;5;124m"[39m, [38;5;28;01mTrue[39;00m)[38;5;241m.[39moption([38;5;124m"[39m[38;5;124minferSchema[39m[38;5;124m"[39m, [38;5;28;01mTrue[39;00m)[38;5;241m.[39mcsv(
[1;32m     42[0m     [38;5;124mf[39m[38;5;124m"[39m[38;5;124mwasbs://mmix-blob-storage@mmixstorage.blob.core.windows.net/[39m[38;5;132;01m{[39;00minput_path[38;5;132;01m}[39;00m[38;5;124m"[39m
[1;32m     43[0m )
[1;32m     45[0m [38;5;66;03m# Convert date column to DateType[39;00m
[0;32m---> 46[0m df [38;5;241m=[39m df[38;5;241m.[39mwithColumn(date_col, F[38;5;241m.[39mto_timestamp(F[38;5;241m.[39mcol(date_col), [38;5;124m"[39m[