In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import lit

In [0]:
# List the contents of the e-commerce data volume
# (Python form of the `%fs ls` notebook magic).
display(dbutils.fs.ls("/Volumes/workspace/ecommerce/ecommerce_data"))

In [0]:
# Load the October event data from its Parquet directory and print a
# five-row sample as a quick sanity check.
df_oct = spark.read.format("parquet").load(
    "/Volumes/workspace/ecommerce/ecommerce_data/parquet/oct/"
)
oct_sample = df_oct.limit(5)
oct_sample.show()

#### There is no `event_id` column in the table, so the schema is clean.

In [0]:
from delta.tables import DeltaTable

# Upsert the staged updates into `events_oct`, matching rows on `event_id`.
#
# The merge is guarded so that Restart & Run All does not abort in this cell:
#   * `updates_view` is only registered by a later cell in this notebook, so
#     on a fresh kernel it does not exist yet when this cell runs;
#   * the target table is reported to have no `event_id` column, in which case
#     the merge condition cannot be resolved.
# When either precondition is missing, the cell prints why it skipped the merge
# instead of raising.
merge_key = "event_id"
source_view = "updates_view"

deltaTable = DeltaTable.forName(spark, "workspace.ecommerce.events_oct")

target_has_key = merge_key in deltaTable.toDF().columns
# Catalog.tableExists also reports temporary views (PySpark >= 3.3).
source_exists = spark.catalog.tableExists(source_view)
source_has_key = source_exists and merge_key in spark.table(source_view).columns

if target_has_key and source_has_key:
    (
        deltaTable.alias("t")
        .merge(
            source=spark.table(source_view).alias("s"),
            condition="t.event_id = s.event_id",
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    print(
        f"Skipping MERGE on '{merge_key}': "
        f"target has key column = {target_has_key}, "
        f"view '{source_view}' exists = {source_exists}, "
        f"view has key column = {source_has_key}"
    )

In [0]:
# Read the incremental CSV drop; the first line is the header and column
# types are inferred from the data.
updates = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/incremental/new_data.csv")
)

In [0]:
from pyspark.sql.functions import col, to_timestamp

# Re-parse `event_time` using the day-first pattern "dd-MM-yyyy HH:mm",
# replacing the column in place on a new DataFrame.
parsed_event_time = to_timestamp(col("event_time"), "dd-MM-yyyy HH:mm")
updates_fixed = updates.withColumn("event_time", parsed_event_time)

In [0]:
# Show, untruncated, every row whose `event_time` is NULL after the re-parse.
unparsed_rows = updates_fixed.where(col("event_time").isNull())
unparsed_rows.show(truncate=False)

## Ingest → DataFrame → Temp View → MERGE
### We use createOrReplaceTempView to turn a DataFrame into a SQL-accessible object so it can be used in MERGE and other SQL operations.


In [0]:
from pyspark.sql.functions import lit
from delta.tables import DeltaTable

# Tag every incoming row with its origin, then register the frame as a temp
# view so it can be looked up by name.
updates_fixed = updates_fixed.withColumn("source_system", lit("incremental_csv"))
updates_fixed.createOrReplaceTempView("updates_view")

# A source row and a target row are the same event when session, timestamp
# and product all match.
# NOTE(review): the keys are compared with `=`, so a row with a NULL in any key
# column never matches and would be inserted again on each re-run — confirm the
# key columns are non-null in both source and target.
merge_condition = """
        t.user_session = s.user_session
        AND t.event_time = s.event_time
        AND t.product_id = s.product_id
    """

deltaTable = DeltaTable.forName(spark, "workspace.ecommerce.events_oct")
staged_updates = spark.table("updates_view")

# Upsert: matched rows are overwritten from the source, unmatched source rows
# are inserted.
(
    deltaTable.alias("t")
    .merge(source=staged_updates.alias("s"), condition=merge_condition)
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

In [0]:
# Pull the rows for one session from the events table and display them in
# chronological order.
events_oct = spark.table("workspace.ecommerce.events_oct")
result_df = events_oct.where(col("user_session") == "abc-session-1")

display(result_df.sort("event_time"))


In [0]:
# Preview ten rows of the events table.
events_preview = spark.table("workspace.ecommerce.events_oct").limit(10)
display(events_preview)

In [0]:
# Count the selected session's rows per `source_system` value.
rows_per_source = result_df.groupBy("source_system").count()
rows_per_source.show()