## Assignment-1

Task 1 - Raw Data Ingestion

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 5.1 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=96099af5bcedaa2bfeef645d800b86ae8d0ead519d7e502ebbccb2775a2196da
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name
from pyspark.sql.types import StructType, StructField, StringType, FloatType, DateType
import os

# Create (or reuse) the Spark session. Plain PySpark has no "delta" data
# source, so the Delta Lake package and its SQL extension/catalog are
# configured on the builder.
# NOTE(review): delta-spark 3.2.0 (Scala 2.12) is chosen to match the
# pyspark 3.5.x build installed in the first cell — confirm if that changes.
spark = (
    SparkSession.builder.appName("WeatherDataIngestion")
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Explicit schema for the weather CSV; every column is nullable.
schema = StructType([
    StructField("City", StringType(), True),
    StructField("Date", DateType(), True),
    StructField("Temperature", FloatType(), True),
    StructField("Humidity", FloatType(), True)
])

# Define path to the raw data and to the Delta table it is ingested into.
# The Delta path must match the one the cleaning step reads from.
raw_data_path = "/content/sample_data/weather_data.csv"
delta_table_path = "/content/sample_data/delta/weather_raw"

# Only touch the file after confirming it exists: spark.read.csv raises as
# soon as it is called on a missing path.
if os.path.exists(raw_data_path):
    try:
        weather_df = spark.read.csv(raw_data_path, schema=schema, header=True)

        weather_df.write.format("delta").mode("overwrite").save(delta_table_path)
        print("Data loaded and saved as Delta table.")
    except Exception as e:
        # Best-effort cell: report the failure instead of aborting the notebook.
        print(f"Error: {e}")
else:
    print(f"File not found: {raw_data_path}")

Task 2 - Data Cleaning

In [None]:
from pyspark.sql import SparkSession

# Reuse the active Spark session (or start one) for the cleaning step.
spark = SparkSession.builder.appName("WeatherDataCleaning").getOrCreate()

# Source (raw) and destination (cleaned) Delta locations.
raw_delta_table_path = "/content/sample_data/delta/weather_raw"
cleaned_delta_table_path = "/content/sample_data/delta/weather_cleaned"

# Load the raw table and preview it.
delta_reader = spark.read.format("delta")
raw_weather_df = delta_reader.load(raw_delta_table_path)
raw_weather_df.show()

# Drop every row that contains a null in any column, then preview the result.
cleaned_weather_df = raw_weather_df.dropna()
cleaned_weather_df.show()

# Overwrite the cleaned table with the remaining rows.
delta_writer = cleaned_weather_df.write.format("delta").mode("overwrite")
delta_writer.save(cleaned_delta_table_path)
print("Data cleaned and saved to a new Delta table.")


Task 3: Data Transformation

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

# Reuse the active Spark session (or start one) for the transformation step.
spark = SparkSession.builder.appName("WeatherDataTransformation").getOrCreate()

# Delta locations used by the pipeline. The transformed path was previously
# referenced without being defined, which raised NameError on the final write.
raw_delta_table_path = "/content/sample_data/delta/weather_raw"
cleaned_delta_table_path = "/content/sample_data/delta/weather_cleaned"
transformed_delta_table_path = "/content/sample_data/delta/weather_transformed"

# Load the cleaned table and preview it.
cleaned_weather_df = spark.read.format("delta").load(cleaned_delta_table_path)
cleaned_weather_df.show()

# Average temperature and humidity per city.
transformed_weather_df = cleaned_weather_df.groupBy("City").agg(
    avg("Temperature").alias("AvgTemperature"),
    avg("Humidity").alias("AvgHumidity")
)

transformed_weather_df.show()

# Persist the per-city averages, replacing any previous result.
transformed_weather_df.write.format("delta").mode("overwrite").save(transformed_delta_table_path)
print("Data transformed and saved to a new Delta table.")


Task-4 Creating Pipelines



Pipelines are created in Azure Databricks under the Workflows section, which chains the three tasks above into a single job run that produces the required output.

In [None]:
# 2. Add Logging to track progress and errors
import logging
import os

# The original '/path/to/' location was a placeholder that does not exist;
# use the same log directory as the other pipeline cells.
log_file = '/content/sample_data/logs/pipeline_log.log'
# logging does not create missing directories, so make sure it exists first.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
# force=True replaces handlers that are already installed; without it
# basicConfig is silently ignored once the root logger has any handler.
logging.basicConfig(filename=log_file, level=logging.INFO, force=True)

# The three tasks above, in execution order. `notebook` was previously
# undefined, so both the info and the error log line raised NameError.
pipeline_notebooks = ["Raw Data Ingestion", "Data Cleaning", "Data Transformation"]

for notebook in pipeline_notebooks:
    try:
        # NOTE(review): the step itself is expected to be run by the Databricks
        # Workflow; this cell only records the outcome — confirm.
        logging.info(f'Successfully executed {notebook}')

    except Exception as e:
        logging.error(f'Failed to execute {notebook}: {e}')


Additional Task - Error Handling

In [None]:
import logging
import os

# Local-file APIs such as os.path reach DBFS through the /dbfs mount point;
# a "dbfs:/..." URI is not a local path, so os.path.exists never finds it.
weather_file = "/dbfs/FileStore/weather_data.csv"

try:
    if not os.path.exists(weather_file):
        raise FileNotFoundError("Weather data file not found")
except Exception as e:
    # Record the failure both in the Python log and in a Delta error table.
    logging.error(f"Error: {str(e)}")
    error_df = spark.createDataFrame([(str(e),)], ["Error"])
    error_df.write.format("delta").mode("append").save("/delta/error_log")
    # Re-raise so a missing file still stops the run, as the original check did.
    raise



## Assignment -2

Task 1 - Raw Data Ingestion

weather_data.csv

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import os

# The schema types are imported here so this cell does not depend on an
# earlier assignment's cell having been run.
from pyspark.sql.types import StructType, StructField, StringType, FloatType, DateType

spark = SparkSession.builder.appName("Weather Data Ingestion").getOrCreate()

# Explicit schema for the weather CSV; every column is nullable.
schema = StructType([
    StructField("City", StringType(), True),
    StructField("Date", DateType(), True),
    StructField("Temperature", FloatType(), True),
    StructField("Humidity", FloatType(), True)
])

file_path = "/content/sample_data/weather_data.csv"

if os.path.exists(file_path):

    # Apply the declared schema; previously it was defined but never used, so
    # every column was ingested as a string.
    weather_df = spark.read.format("csv").option("header", "true").schema(schema).load(file_path)

    # overwriteSchema lets the typed schema replace a table that an earlier
    # run may have written with different column types.
    weather_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save("/content/sample_data/delta/weather_raw")
    print("Data ingestion completed successfully.")
else:
    print(f"File {file_path} does not exist.")
    # Write the error record as Delta; without an explicit format Spark uses
    # its default source, which does not match the delta/ location.
    spark.createDataFrame([("File not found",)], ["Error"]).write.format("delta").mode("append").save("/content/sample_data/delta/ingestion_logs")


Task 2 Data Cleaning

In [None]:
from pyspark.sql.functions import when, col

# Load the raw weather table.
weather_df = spark.read.format("delta").load("/content/sample_data/delta/weather_raw")

temperature = col("Temperature")
humidity = col("Humidity")

# A reading is rejected when it is missing or outside the accepted bounds:
# temperature in [-50, 50], humidity in [0, 100].
bad_temperature = temperature.isNull() | (temperature < -50) | (temperature > 50)
bad_humidity = humidity.isNull() | (humidity < 0) | (humidity > 100)

# Null out rejected readings, then drop every row that contains a null.
cleaned_df = (
    weather_df
    .withColumn("Temperature", when(bad_temperature, None).otherwise(temperature))
    .withColumn("Humidity", when(bad_humidity, None).otherwise(humidity))
    .dropna()
)

cleaned_df.write.format("delta").mode("overwrite").save("/content/sample_data/delta/weather_cleaned")
print("Data cleaning completed successfully.")


Task 3 Data Transformation

In [None]:
from pyspark.sql.functions import avg

# Load the cleaned weather table.
cleaned_df = spark.read.format("delta").load("/content/sample_data/delta/weather_cleaned")

# (source column, output column) pairs to average per city.
average_columns = [
    ("Temperature", "Average_Temperature"),
    ("Humidity", "Average_Humidity"),
]
city_averages = [avg(source).alias(target) for source, target in average_columns]
transformed_df = cleaned_df.groupBy("City").agg(*city_averages)

# Persist the per-city averages, replacing any previous result.
transformed_writer = transformed_df.write.format("delta").mode("overwrite")
transformed_writer.save("/content/sample_data/delta/weather_transformed")
print("Data transformation completed successfully.")


Task 4 Build and Run a Pipeline

In [None]:
import logging
import os
import subprocess

log_file = '/content/sample_data/logs/pipeline_log.log'
# logging does not create missing directories, so make sure it exists first.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
# force=True replaces handlers that are already installed; without it
# basicConfig is silently ignored once the root logger has any handler.
logging.basicConfig(filename=log_file, level=logging.INFO, force=True)

# NOTE(review): these entries are the Delta table locations written by the
# tasks above, not notebook paths, and `databricks workspace import` uploads
# a file rather than running a notebook — confirm the intended command/paths.
notebooks = [
    "/content/sample_data/delta/weather_raw",
    "/content/sample_data/delta/weather_cleaned",
    "/content/sample_data/delta/weather_transformed"
]

for notebook in notebooks:
    try:
        subprocess.run(["databricks", "workspace", "import", notebook], check=True)
        logging.info(f"Successfully executed {notebook}")
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: the CLI exited non-zero. OSError (including
        # FileNotFoundError): the `databricks` executable could not be started.
        logging.error(f"Error occurred while executing {notebook}: {e}")


## Assignment 3

Task 1 Data Ingestion

In [None]:
from pyspark.sql import SparkSession
import os
import logging
spark = SparkSession.builder.appName("Customer Data Ingestion").getOrCreate()
file_path = "/content/sample_data/customer_transactions.csv"

log_file = '/content/sample_data/logs/ingestion_log.log'
# logging does not create missing directories, so make sure it exists first.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
# force=True replaces handlers that are already installed; without it
# basicConfig is silently ignored once the root logger has any handler.
logging.basicConfig(filename=log_file, level=logging.INFO, force=True)

if os.path.exists(file_path):
    # No schema is supplied, so every CSV column is read as a string.
    customer_df = spark.read.format("csv").option("header", "true").load(file_path)

    customer_df.write.format("delta").mode("overwrite").save("/content/sample_data/delta/customer_raw")
    logging.info("Customer data ingestion completed successfully.")
else:
    logging.error(f"File {file_path} does not exist.")


Task 2  Data Cleaning

In [None]:
from pyspark.sql.functions import col

# Load the raw customer transactions.
customer_df = spark.read.format("delta").load("/content/sample_data/delta/customer_raw")

# Remove exact duplicate rows, then replace missing transaction amounts with 0.
missing_value_defaults = {"TransactionAmount": 0}
cleaned_df = customer_df.dropDuplicates().fillna(missing_value_defaults)

# Persist the cleaned rows, replacing any previous result.
cleaned_writer = cleaned_df.write.format("delta").mode("overwrite")
cleaned_writer.save("/content/sample_data/delta/customer_cleaned")
print("Customer data cleaning completed successfully.")


Task 3 Data Aggregation

In [None]:
from pyspark.sql.functions import sum
# Load the cleaned customer transactions.
cleaned_df = spark.read.format("delta").load("/content/sample_data/delta/customer_cleaned")

# Total transaction amount per product category.
# `sum` here is the Spark SQL function imported above, not the Python builtin.
category_total = sum("TransactionAmount").alias("TotalTransactionAmount")
aggregated_df = cleaned_df.groupBy("ProductCategory").agg(category_total)

# Persist the per-category totals, replacing any previous result.
aggregated_writer = aggregated_df.write.format("delta").mode("overwrite")
aggregated_writer.save("/content/sample_data/delta/customer_aggregated")
print("Customer data aggregation completed successfully.")


Task 4  Pipeline Creation

In [None]:
import logging
import os
import subprocess

log_file = '/content/sample_data/logs/pipeline_log.log'
# logging does not create missing directories, so make sure it exists first.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
# force=True replaces handlers that are already installed; without it
# basicConfig is silently ignored once the root logger has any handler.
logging.basicConfig(filename=log_file, level=logging.INFO, force=True)

# NOTE(review): these entries are the Delta table locations written by the
# tasks above, not notebook paths, and `databricks workspace import` uploads
# a file rather than running a notebook — confirm the intended command/paths.
notebooks = [
    "/content/sample_data/delta/customer_raw",
    "/content/sample_data/delta/customer_cleaned",
    "/content/sample_data/delta/customer_aggregated"
]
for notebook in notebooks:
    try:
        subprocess.run(["databricks", "workspace", "import", notebook], check=True)
        logging.info(f"Successfully executed {notebook}")
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: the CLI exited non-zero. OSError (including
        # FileNotFoundError): the `databricks` executable could not be started.
        logging.error(f"Error occurred while executing {notebook}: {e}")


Task 5

In [None]:

import math

# Import under an alias so this cell neither depends on another cell's
# `from pyspark.sql.functions import sum` nor shadows the Python builtin.
from pyspark.sql.functions import sum as spark_sum

cleaned_df = spark.read.format("delta").load("/content/sample_data/delta/customer_cleaned")
aggregated_df = spark.read.format("delta").load("/content/sample_data/delta/customer_aggregated")

# Grand total computed directly from the cleaned rows.
total_transactions = cleaned_df.agg(spark_sum("TransactionAmount").alias("TotalTransactions")).collect()[0]["TotalTransactions"]

# Grand total computed from the per-category totals.
total_aggregated_transactions = aggregated_df.agg(spark_sum("TotalTransactionAmount").alias("TotalAggregatedTransactions")).collect()[0]["TotalAggregatedTransactions"]

# SUM over an empty (or all-null) column yields None; treat that as zero so
# the comparison below does not fail on a type error.
total_transactions = total_transactions or 0.0
total_aggregated_transactions = total_aggregated_transactions or 0.0

# The two totals add the same values in a different order, so floating-point
# rounding can make them differ in the last digits; compare with a tolerance
# instead of exact equality.
if math.isclose(total_transactions, total_aggregated_transactions, rel_tol=1e-9, abs_tol=1e-6):
    print(f"Data validation passed: {total_transactions} == {total_aggregated_transactions}")
else:
    print(f"Data validation failed: {total_transactions} != {total_aggregated_transactions}")


## Assignment 4

Task 1 Data Ingestion

In [None]:
from pyspark.sql import SparkSession
import os
# `logging` is imported here so this cell does not depend on an earlier
# assignment's cell having been run.
import logging

spark = SparkSession.builder.appName("Product Inventory Ingestion").getOrCreate()

file_path = "/content/sample_data/tables/product_inventory.csv"
log_file = '/content/sample_data/logs/inventory_ingestion.log'
# logging does not create missing directories, so make sure it exists first.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
# force=True replaces handlers that are already installed; without it
# basicConfig is silently ignored once the root logger has any handler.
logging.basicConfig(filename=log_file, level=logging.INFO, force=True)

try:
    if os.path.exists(file_path):
        # No schema is supplied, so every CSV column is read as a string.
        product_df = spark.read.format("csv").option("header", "true").load(file_path)
        product_df.write.format("delta").mode("overwrite").save("/content/sample_data/delta/product_inventory_raw")
        logging.info("Product inventory ingestion completed successfully.")
    else:
        raise FileNotFoundError(f"File {file_path} not found.")

except FileNotFoundError as e:
    logging.error(f"FileNotFoundError: {str(e)}")
except Exception as e:
    # Any Spark/Delta failure during read or write is logged, not re-raised.
    logging.error(f"An error occurred: {str(e)}")


Task 2 Data Cleaning

In [None]:

# Load the raw product inventory.
product_df = spark.read.format("delta").load("/content/sample_data/delta/product_inventory_raw")

# Missing stock and price values default to zero; rows with a negative stock
# quantity are discarded.
inventory_defaults = {"StockQuantity": 0, "Price": 0.0}
cleaned_df = product_df.fillna(inventory_defaults)
cleaned_df = cleaned_df.where(col("StockQuantity") >= 0)

# Persist the cleaned rows, replacing any previous result.
cleaned_writer = cleaned_df.write.format("delta").mode("overwrite")
cleaned_writer.save("/content/sample_data/delta/product_inventory_cleaned")
print("Product inventory cleaning completed successfully.")


Task 3 Inventory Analysis

In [None]:
from pyspark.sql.functions import col, expr
# Load the cleaned product inventory.
cleaned_df = spark.read.format("delta").load("/content/sample_data/delta/product_inventory_cleaned")

stock_quantity = col("StockQuantity")

# Value of the stock on hand: quantity multiplied by unit price.
stock_value_df = cleaned_df.withColumn("TotalStockValue", stock_quantity * col("Price"))
# Products whose stock quantity is below 100.
restock_df = cleaned_df.where(stock_quantity < 100)

# Persist both results, replacing any previous output.
outputs = [
    (stock_value_df, "/content/sample_data/delta/product_inventory_analysis"),
    (restock_df, "/content/sample_data/delta/product_inventory_restock"),
]
for result_df, target_path in outputs:
    result_df.write.format("delta").mode("overwrite").save(target_path)
print("Product inventory analysis completed successfully.")


Task 4 Build an Inventory Pipeline

In [None]:
import logging
import os
import subprocess

log_file = '/content/sample_data/logs/inventory_pipeline_log.log'
# logging does not create missing directories, so make sure it exists first.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
# force=True replaces handlers that are already installed; without it
# basicConfig is silently ignored once the root logger has any handler.
logging.basicConfig(filename=log_file, level=logging.INFO, force=True)

# NOTE(review): these entries are the Delta table locations written by the
# tasks above, not notebook paths, and `databricks workspace import` uploads
# a file rather than running a notebook — confirm the intended command/paths.
notebooks = [
    "/content/sample_data/delta/product_inventory_raw",
    "/content/sample_data/delta/product_inventory_cleaned",
    "/content/sample_data/delta/product_inventory_analysis"
]

for notebook in notebooks:
    try:
        subprocess.run(["databricks", "workspace", "import", notebook], check=True)
        logging.info(f"Successfully executed {notebook}")
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: the CLI exited non-zero. OSError (including
        # FileNotFoundError): the `databricks` executable could not be started.
        logging.error(f"Error occurred while executing {notebook}: {e}")


Task 5

In [None]:
from pyspark.sql import SparkSession
# Reuse the active Spark session (or start one) for the monitoring step.
spark = SparkSession.builder.appName("Inventory Monitoring").getOrCreate()

# Load the cleaned product inventory.
inventory_df = spark.read.format("delta").load("/content/sample_data/delta/product_inventory_cleaned")

# Products whose stock quantity is below 50 need urgent restocking.
urgent_restock_df = inventory_df.where(col("StockQuantity") < 50)

urgent_count = urgent_restock_df.count()
if urgent_count == 0:
    print("No products need urgent restocking at the moment.")
else:
    print("Alert: Some products need urgent restocking!")
    urgent_restock_df.show()


## Assignment 5

Task 1 Data Ingestion

In [None]:
import logging
import os
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("EmployeeAttendance").getOrCreate()

log_file = '/content/sample_data/logs/attendance_log.log'
# logging does not create missing directories, so make sure it exists first.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
# force=True replaces handlers that are already installed; without it
# basicConfig is silently ignored once the root logger has any handler.
logging.basicConfig(filename=log_file, level=logging.INFO, force=True)
csv_file_path = "/content/sample_data/employee_attendance.csv"

try:
    # No schema is supplied, so every CSV column is read as a string.
    attendance_df = spark.read.option("header", "true").csv(csv_file_path)
    attendance_df.write.format("delta").mode("overwrite").save("/content/sample_data/delta/employee_attendance_raw")

    logging.info("Employee attendance data ingested successfully.")

except Exception as e:
    # Report the failure both in the log file and in the cell output.
    logging.error(f"Error ingesting data: {str(e)}")
    print(f"Error: {str(e)}")


Task 2 Data Cleaning

In [None]:
from pyspark.sql.functions import col, unix_timestamp, round
# Load the raw attendance records.
attendance_df = spark.read.format("delta").load("/content/sample_data/delta/employee_attendance_raw")

# Keep only rows where both the check-in and check-out times are present.
has_both_times = col("CheckInTime").isNotNull() & col("CheckOutTime").isNotNull()

# Parse both times with the "HH:mm" pattern into seconds and convert the
# difference to hours, rounded to two decimals.
# `round` here is the Spark SQL function imported above, not the Python builtin.
check_in_seconds = unix_timestamp(col("CheckInTime"), "HH:mm")
check_out_seconds = unix_timestamp(col("CheckOutTime"), "HH:mm")
hours_worked = round((check_out_seconds - check_in_seconds) / 3600, 2)

cleaned_df = attendance_df.filter(has_both_times).withColumn("HoursWorked", hours_worked)

# Persist the cleaned rows, replacing any previous result.
cleaned_writer = cleaned_df.write.format("delta").mode("overwrite")
cleaned_writer.save("/content/sample_data/delta/employee_attendance_cleaned")

print("Employee attendance cleaning completed successfully.")


Task 3  Attendance Summary

In [None]:
from pyspark.sql.functions import sum
# Load the cleaned attendance records.
cleaned_df = spark.read.format("delta").load("/content/sample_data/delta/employee_attendance_cleaned")

# Total hours per employee.
# `sum` here is the Spark SQL function imported above, not the Python builtin.
total_hours = sum("HoursWorked").alias("TotalHoursWorked")
attendance_summary = cleaned_df.groupBy("EmployeeID").agg(total_hours)

# Individual records with more than 8 hours worked.
overtime_rows = cleaned_df.where(col("HoursWorked") > 8)
overtime_df = overtime_rows.select("EmployeeID", "Date", "HoursWorked")

# Persist both results, replacing any previous output.
summary_writer = attendance_summary.write.format("delta").mode("overwrite")
summary_writer.save("/content/sample_data/delta/employee_attendance_summary")
overtime_writer = overtime_df.write.format("delta").mode("overwrite")
overtime_writer.save("/content/sample_data/delta/employee_overtime_summary")

logging.info("Employee attendance summary and overtime analysis completed.")


Task 4 Create an Attendance Pipeline

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unix_timestamp, sum

spark = SparkSession.builder.appName("EmployeeAttendancePipeline").getOrCreate()

def attendance_pipeline(csv_path="/content/sample_data/employee_attendance.csv"):
    """Run the attendance pipeline end to end: ingest, clean, summarise.

    Reads the attendance CSV, stores it as a Delta table, keeps rows that have
    both check-in and check-out times, derives ``HoursWorked`` from them, and
    writes the cleaned rows, the total hours per employee, and the rows with
    more than 8 hours worked to their own Delta tables.

    Args:
        csv_path: Location of the attendance CSV. Defaults to the path the
            pipeline has always used, so existing calls are unaffected.

    Returns:
        None. Progress and failures are reported with ``print``; exceptions
        are not propagated to the caller.
    """
    import os

    try:
        # Spark does not raise FileNotFoundError for a missing input path, so
        # the handler below was unreachable; check for the file explicitly.
        if not os.path.exists(csv_path):
            raise FileNotFoundError(csv_path)

        attendance_df = spark.read.option("header", "true").csv(csv_path)
        attendance_df.write.format("delta").mode("overwrite").save("/content/sample_data/delta/attendance")
        cleaned_df = attendance_df.filter(col("CheckInTime").isNotNull() & col("CheckOutTime").isNotNull())

        # Difference between the parsed "HH:mm" times, converted to hours.
        cleaned_df = cleaned_df.withColumn(
            "HoursWorked",
            (unix_timestamp(col("CheckOutTime"), 'HH:mm') - unix_timestamp(col("CheckInTime"), 'HH:mm')) / 3600
        )
        cleaned_df.write.format("delta").mode("overwrite").save("/content/sample_data/delta/cleaned_attendance")
        # `sum` is the Spark SQL function imported above, not the builtin.
        monthly_summary_df = cleaned_df.groupBy("EmployeeID").agg(sum("HoursWorked").alias("TotalHoursWorked"))
        overtime_df = cleaned_df.filter(col("HoursWorked") > 8)

        monthly_summary_df.write.format("delta").mode("overwrite").save("/content/sample_data/delta/attendance_summary")
        overtime_df.write.format("delta").mode("overwrite").save("/content/sample_data/delta/overtime_summary")

        print("Attendance pipeline completed successfully.")

    except FileNotFoundError:
        print("CSV file is missing.")
    except Exception as e:
        # Any Spark/Delta failure is reported instead of aborting the notebook.
        print(f"Error in pipeline: {e}")

attendance_pipeline()



Task 5

In [None]:

table_path = "/content/sample_data/delta/employee_attendance_cleaned"

# Show the table's commit history (one row per version).
history_df = spark.sql(f"DESCRIBE HISTORY '{table_path}'")
history_df.show()

# Version 1 only exists once the table has been written at least twice;
# requesting it on a table with a single commit fails. Use version 1 when it
# is available and otherwise fall back to the newest existing version.
available_versions = [row["version"] for row in history_df.select("version").collect()]
target_version = 1 if 1 in available_versions else max(available_versions)

attendance_df = spark.read.format("delta").option("versionAsOf", target_version).load(table_path)
