In [0]:
dbutils.fs.cp("file:/Workspace/Shared/sales16.csv", "dbfs:/Filestore/streaming/input/sales16.csv")
dbutils.fs.cp("file:/Workspace/Shared/customer_data.json", "dbfs:/Filestore/streaming/input/customer_data.json")

True

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder.appName("Spark DataFrames").getOrCreate()
sales_schema="OrderID INT, OrderDate DATE, CustomerID INT,Product STRING, Quantity INT, Price DECIMAL(10,2)"
df_sales_stream=spark.readStream.format("csv").option("header","true").schema(sales_schema).load("dbfs:/Filestore/streaming/input")
customer_schema="CustomerID INT, CustomerName STRING, Region STRING, SignUpDate Date"
df_customer_stream=spark.readStream.format("json").schema(customer_schema).load("dbfs:/Filestore/streaming/input")



In [0]:
from pyspark.sql.functions import current_date, datediff, to_timestamp

#Transform the sales data: Add a new column for total amount

df_sales_transformed= df_sales_stream.select(
col("OrderID"),
to_timestamp(col("OrderDate"), "yyyy-MM-dd HH:mm:ss").alias("OrderDate"), #Convert OrderDate to TIMESTAMP
col("Product"),
col("Quantity"),
col("Price"),
(col("Quantity") * col("Price")).alias("TotalAmount"))
print("Applied transformations on sales data...")

#Add watermark to handle late data and perform an aggregation
df_sales_aggregated = df_sales_transformed \
.withWatermark("OrderDate", "1 day") \
.groupBy("Product") \
.agg({"TotalAmount": "sum"})
print("Aggregated sales data by product...")

#Transform the customer data: Add a new column for the number of years since signup
df_customers_transformed = df_customer_stream.withColumn( "YearsSinceSignup",
datediff(current_date(), to_timestamp(col("SignupDate"), "yyyy-MM-dd")).cast("int") / 365)
print("Applied transformations on customer data...")


Applied transformations on sales data...
Aggregated sales data by product...
Applied transformations on customer data...


In [0]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import col, to_timestamp

#Initialize SparkSession
spark=SparkSession.builder \
.appName("StructüredStreamingExample") \
.getOrCreate()

#Load data from CSV
df=spark.read.format("csv").option("header", "true").load("/Filestore/sales16.csv")
print("Data Loaded Successfully")

#Transform the data: Add a new column for total amount
df_transformed=df.withColumn ("TotalAmount", col("Quantity").cast("int") * col("Price").cast("double"))
print("Data Transformed Successfully")

#Write transformed data to a Delta table
df_transformed.write.format("delta").mode("overwrite").save("/delta/sales16")
print("Transformed data written to Delta table successfully")

Data Loaded Successfully
Data Transformed Successfully
Transformed data written to Delta table successfully


In [0]:
import pandas as pd
sales_data = {
"OrderID": [1, 2, 3, 4],
"OrderDate": ["2024-01-01 10:00:00", "2024-01-02 11:00:00", "2024-01-03 12:00:00", "2024-01-04 13:00:00"],
"CustomerID": ["С001", "С002", "0003", "с004"],
"Product": ["ProductA", "ProductB", "ProductC", "ProductD"],
"Quantity": [10, 20, 15, 5],
"Price": [100.0, 200.0, 150.0, 50.0]
}
df_sales = pd.DataFrame (sales_data)
csv_path = "/dbfs/FileStore/sales_data.csv"
df_sales.to_csv(csv_path, index=False)
parquet_path = "/dbfs/FileStore/sales_data.parquet"
df_sales.to_parquet(parquet_path, index=False)
print(f"Sample data saved to {csv_path} and {parquet_path}")


Sample data saved to /dbfs/FileStore/sales_data.csv and /dbfs/FileStore/sales_data.parquet


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp
spark=SparkSession.builder.appName("DLT example").getOrCreate()
sales_delta=spark.read.format("csv").option("header","true").load("/dbfs/FileStore/sales_data.csv")
sales_delta.write.format("delta").mode("overwrite").save("/delta/sales_delta")
delta_table_path="/delta/sales_delta"


[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-2188255052628636>, line 4[0m
[1;32m      2[0m [38;5;28;01mfrom[39;00m [38;5;21;01mpyspark[39;00m[38;5;21;01m.[39;00m[38;5;21;01msql[39;00m[38;5;21;01m.[39;00m[38;5;21;01mfunctions[39;00m [38;5;28;01mimport[39;00m col, to_timestamp
[1;32m      3[0m spark[38;5;241m=[39mSparkSession[38;5;241m.[39mbuilder[38;5;241m.[39mappName([38;5;124m"[39m[38;5;124mDLT example[39m[38;5;124m"[39m)[38;5;241m.[39mgetOrCreate()
[0;32m----> 4[0m sales_delta[38;5;241m=[39mspark[38;5;241m.[39mread[38;5;241m.[39mformat([38;5;124m"[39m[38;5;124mcsv[39m[38;5;124m"[39m)[38;5;241m.[39moption([38;5;124m"[39m[38;5;124mheader[39m[38;5;124m"[39m,[38;5;124m"[39m[38;5;124mtrue[39m[38;5;124m"[39m)[38;5;241m.[39mload([38;5;124m"[39m[38;5;124m/dbfs/FileStore/sales_dat

In [0]:
import dlt

@dlt.table
def sales_data():
    df=spark.read.format("delta").load(delta_table_path)
    return df.select(
    col("OrderID"),
    col("OrderDate"),
    col("CustomerID"),
    col("Product"),
    col("Quantity"),
    col("Price"),
    (col("Quantity").cast("int") * col("Price").cast("double")).alias("TotalAmount")
    )
print("Delta Live Table created.")


Delta Live Table created.
py4j.Py4JException: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):
  File "/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 642, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/databricks/spark/python/dlt/helpers.py", line 31, in call
    res = self.func()
          ^^^^^^^^^^^
  File "/root/.ipykernel/1096/command-2188255052628637-2274096183", line 5, in sales_data
    df=spark.read.format("delta").load(delta_table_path)
                                       ^^^^^^^^^^^^^^^^
NameError: name 'delta_table_path' is not defined

