In [1]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=18822fee7b1f7934816731ed09b10dd2654e57e9e57b3c38aa28b212bc4fcb17
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


Task 1: Vehicle Maintenance Data Ingestion

In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name
from pyspark.sql.types import StructType, StructField, StringType, FloatType, DateType
import os

# Initialize Spark session
spark = SparkSession.builder \
    .appName("VehicleMaintenanceDataIngestion") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtensions") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

# Define schema for vehicle maintenance data
schema = StructType([
    StructField("VehicleID", StringType(), True),
    StructField("Date", DateType(), True),
    StructField("ServiceType", StringType(), True),
    StructField("ServiceCost", FloatType(), True),
    StructField("Mileage", FloatType(), True)
])

# Define path to the raw data
raw_data_path = "/content/vehicle_maintenance.csv"
delta_table_path = "/content/vehicle_maintenance_delta"

# Check if the CSV file exists
if os.path.exists(raw_data_path):
    try:
        # Read the CSV file into a DataFrame
        vehicle_df = spark.read.csv(raw_data_path, schema=schema, header=True) \
            .withColumn("file_name", input_file_name())

        # Write to Delta table
        vehicle_df.write.format("delta").mode("overwrite").save(delta_table_path)
        print("Data loaded and saved as Delta table.")
    except Exception as e:
        print(f"Error during data ingestion: {e}")
else:
    print(f"File not found: {raw_data_path}")


Error during data ingestion: An error occurred while calling o223.save.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: delta. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:725)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:863)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:257)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:240)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java

Task 2: Data Cleaning


In [27]:
from pyspark.sql.functions import col
# Clean the vehicle maintenance data
try:
    # Read the Delta table into a DataFrame
    vehicle_df = spark.read.format("delta").load(delta_table_path)

    # Ensure positive values and remove duplicates
    cleaned_df = vehicle_df.filter((col("ServiceCost") > 0) & (col("Mileage") > 0)) \
                            .dropDuplicates(["VehicleID", "Date"])

    # Save cleaned data to a new Delta table
    cleaned_delta_table_path = "/content/vehicle_maintenance_cleaned_delta"
    cleaned_df.write.format("delta").mode("overwrite").save(cleaned_delta_table_path)
    print("Cleaned data saved to new Delta table.")
except Exception as e:
    print(f"Error during data cleaning: {e}")


Error during data cleaning: An error occurred while calling o237.load.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: delta. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:725)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:208)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:186)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.ja

Task 3: Vehicle Maintenance Analysis

In [28]:

# Analyze the vehicle maintenance data
try:
    # Read the cleaned Delta table into a DataFrame
    cleaned_df = spark.read.format("delta").load(cleaned_delta_table_path)

    # Calculate total maintenance cost for each vehicle
    total_cost_df = cleaned_df.groupBy("VehicleID").agg({"ServiceCost": "sum"}) \
                               .withColumnRenamed("sum(ServiceCost)", "TotalServiceCost")

    # Identify vehicles exceeding a mileage threshold
    mileage_threshold = 30000
    exceeding_mileage_df = cleaned_df.filter(col("Mileage") > mileage_threshold)

    # Save analysis results to Delta tables
    total_cost_df.write.format("delta").mode("overwrite").save("/content/total_service_cost_delta")
    exceeding_mileage_df.write.format("delta").mode("overwrite").save("/content/exceeding_mileage_delta")
    print("Analysis results saved to Delta tables.")
except Exception as e:
    print(f"Error during analysis: {e}")


Error during analysis: name 'cleaned_delta_table_path' is not defined


Task 4: Data Governance with Delta Lake

In [30]:
# Data Governance with Delta Lake
try:
    # Perform VACUUM on the original Delta table
    spark.sql(f"VACUUM delta.`{delta_table_path}` RETAIN 0 HOURS")

    # Describe history of the Delta table
    history_df = spark.sql(f"DESCRIBE HISTORY delta.`{delta_table_path}`")
    history_df.show(truncate=False)
except Exception as e:
    print(f"Error during data governance: {e}")



Error during data governance: 
[PARSE_SYNTAX_ERROR] Syntax error at or near 'VACUUM'.(line 1, pos 0)

== SQL ==
VACUUM delta.`/content/vehicle_maintenance_delta` RETAIN 0 HOURS
^^^

