In [0]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from delta.tables import *
# Get (or create) the Spark session used by every cell below.
spark = SparkSession.builder.appName("sep 17").getOrCreate()

# Stage the workspace files in DBFS so Spark can read them.
dbutils.fs.cp("file:/Workspace/Shared/Employee.csv", "dbfs:/Filestore/Employee.csv")
dbutils.fs.cp("file:/Workspace/Shared/products.json", "dbfs:/Filestore/products.json")

# Employee data: CSV with a header row (no schema is supplied, so no explicit typing).
employee_df = (
    spark.read.format("csv")
    .option("header", "true")
    .load("dbfs:/Filestore/Employee.csv")
)
employee_df.show()

# Product data: multi-line JSON. Read the copy staged in DBFS just above;
# the earlier "/content/products.json" path is never written by this notebook.
products_df = (
    spark.read.format("json")
    .option("multiline", "true")
    .load("dbfs:/Filestore/products.json")
)

# Persist both datasets as Delta tables (overwriting earlier runs) ...
employee_df.write.format("delta").mode("overwrite").save("/delta/employee_delta")
products_df.write.format("delta").mode("overwrite").save("/delta/products_delta")

# ... and read them back so later cells operate on the Delta copies.
employee_delta = spark.read.format("delta").load("/delta/employee_delta")
products_delta = spark.read.format("delta").load("/delta/products_delta")

In [0]:

# Stage the incremental employee file in DBFS. The URI scheme is written in
# lowercase ("file:") to match the other copy commands in this notebook.
dbutils.fs.cp("file:/Workspace/Shared/new_employee.csv", "dbfs:/Filestore/new_employee.csv")

# Load the new records and persist them as their own Delta table.
employee_new_df = (
    spark.read.format("csv")
    .option("header", "true")
    .load("dbfs:/Filestore/new_employee.csv")
)
employee_new_df.write.format("delta").mode("overwrite").save("/delta/employee_new_delta")
employee_new_delta = spark.read.format("delta").load("/delta/employee_new_delta")

# Expose both Delta DataFrames to SQL under the names used by the MERGE cell.
employee_delta.createOrReplaceTempView("employee_delta")
employee_new_delta.createOrReplaceTempView("new_employee_delta")

In [0]:

# Upsert: rows whose employeeID already exists get their Salary updated,
# unmatched rows from the source view are inserted as new employees.
merge_statement = """
          merge into employee_delta as target
          using new_employee_delta as source
          on target.employeeID=source.employeeID
          when matched then update set target.Salary=source.Salary
          when not matched then 
           insert (EmployeeID, EmployeeName, JoiningDate, Salary)
           values (source.EmployeeID, source.EmployeeName, source.JoiningDate, source.Salary)
          """
spark.sql(merge_statement)

# Display the view contents after the merge.
merged_employees = spark.sql("select * from employee_delta")
merged_employees.show()

In [0]:
# Materialise the merged view as a table (skipped when the table already exists).
spark.sql("""
          create table if not exists employee  as select * from employee_delta
          """)

# Optimize the table and Z-order it by Salary.
spark.sql("optimize employee zorder by(Salary)")

# Describe the history of the Delta table.
spark.sql("DESCRIBE HISTORY employee").show()

# Vacuum the table, retaining the previous 7 days (168 hours) of data.
spark.sql("Vacuum employee retain 168 hours")

# Use Delta Lake versioning (time travel) to read the table at a given version.
# The requested version must appear in the history output above.
# .show() is needed here: without it the DataFrame is built but never displayed.
spark.sql("SELECT * FROM employee VERSION AS OF 3").show()


In [0]:
# Drop the transaction file into the directory watched by the streaming reader.
dbutils.fs.cp("file:/Workspace/Shared/transaction.csv","dbfs:/streaming/input/transaction.csv")

# Streaming sources need an explicit schema; it is given here as a DDL string.
transaction_schema="transactionID String, transactionDate DATE, productID STRING,Quantity INT,Price INT"

# Streaming CSV reader over the whole input directory.
transaction_reader = (
    spark.readStream
    .format("csv")
    .option("header","true")
    .schema(transaction_schema)
)
transaction_stream = transaction_reader.load("dbfs:/streaming/input/")


In [0]:
from pyspark.sql.functions import col,current_date, datediff, to_timestamp
# Project the raw stream, converting the date column to a timestamp and
# adding the per-row total (Quantity * Price).
transaction_transformed = transaction_stream.select(
    col("transactionID"),
    to_timestamp(col("TransactionDate"), "yyyy-MM-dd HH:mm:ss").alias("transactionDate"),
    col("productID"),
    col("Quantity"),
    col("Price"),
    (col("Quantity") * col("Price")).alias("Total Price"),
)

# Keep only transactions with more than one unit. A filter predicate needs no
# alias, so the previous `.alias("Quantity")` wrapper was dropped.
transaction_filter = transaction_transformed.filter(col("Quantity") > 1)

# Sum of "Total Price" per product; the dict form of agg() names the result
# column "sum(Total Price)". The watermark is declared on transactionDate.
transactions_aggregated = (
    transaction_transformed
    .withWatermark("transactionDate", "1 day")
    .groupBy("ProductID")
    .agg({"Total Price": "sum"})
)
print("Aggregated sales data by product...")



Aggregated sales data by product...


In [0]:
# Locations for the Parquet sink: where the query keeps its checkpoint
# and where the output files are written.
checkpoint_location = "/delta/checkpoints/transaction_parquet_checkpoint"
output_path = "/delta/output/transaction_parquet_output"

# Start an append-mode streaming query that writes the transformed
# transactions as Parquet. The started query is the cell's displayed result.
(
    transaction_transformed.writeStream
    .format("parquet")
    .option("path", output_path)
    .option("checkpointLocation", checkpoint_location)
    .outputMode("append")
    .start()
)
    


<pyspark.sql.streaming.query.StreamingQuery at 0x7fb978a90690>

In [0]:
# Stage the product catalogue in DBFS for the streaming reader.
dbutils.fs.cp("file:/Workspace/Shared/products.json","dbfs:/Filestore/products.json")

# Explicit schema (DDL string) for the streaming JSON source.
product_schema="productId String,productName String,category String,Price INT"

# Streaming reader over the multi-line JSON product file.
product_reader = (
    spark.readStream
    .format("json")
    .option("multiline","true")
    .schema(product_schema)
)
product_stream = product_reader.load("dbfs:/Filestore/products.json")


In [0]:
# Perform the join on the two streaming DataFrames.
# These locations are not consumed by the console sink below; they are kept
# so the names stay defined for any cell that switches to a file-based sink.
output_path = "/delta/output/combined_stream_output"  # Ensure this is a directory
checkpoint_location = "/delta/checkpoints/combined_stream_checkpoint"  # Ensure this is a directory

# Inner join of transactions with products on the product id; adjust `how` as needed.
combined_stream = transaction_transformed.join(product_stream, on="ProductID", how="inner")

# Write the joined stream to the console. The console sink prints batches
# rather than writing files, so the former `path` option had no effect and
# was removed.
query = (
    combined_stream.writeStream
    .format("console")
    .outputMode("append")
    .start()
)

