In [1]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 4.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=5061c04ca572c7a5253294cf015250c3982039f33e94d6c9e2d949cd0caa82e0
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2



**Vehicle Maintenance**


Data Ingestion

In [None]:
from pyspark.sql import SparkSession
import os

# A pip-installed PySpark does not ship the Delta Lake data source, so the
# "delta" format and the Delta SQL commands used later in this notebook
# (VACUUM, OPTIMIZE, DESCRIBE HISTORY) must be enabled when the session is first
# created. Delta Lake 3.2.x is the release line built for Spark 3.5.x / Scala 2.12.
# Later cells call getOrCreate() and therefore reuse this configured session.
spark = (
    SparkSession.builder.appName("VehicleMaintenanceDataIngestion")
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)
file_path = "/content/sample_data/vehicle_maintenance.csv"

# Only attempt the load when the CSV is actually present on the local filesystem.
if os.path.exists(file_path):
    try:
        # The first row of the CSV is treated as the header.
        df = spark.read.format("csv").option("header", "true").load(file_path)
        df.show()
        # Persist the raw data as a Delta table, replacing any previous contents.
        df.write.format("delta").mode("overwrite").save("/delta/vehicle_maintenance")
        print("Vehicle maintenance data saved successfully.")

    except Exception as e:
        # Report read/write failures instead of aborting the notebook run.
        print(f"Error: {str(e)}")
else:
    print("CSV file does not exist.")



Data Cleaning

In [None]:
from pyspark.sql import SparkSession

# Reuse (or create) the session and load the raw Delta table.
spark = SparkSession.builder.appName("VehicleMaintenanceDataCleaning").getOrCreate()
df = spark.read.load("/delta/vehicle_maintenance", format="delta")

# Keep rows whose ServiceCost and Mileage are both positive, then keep a single
# row per (VehicleID, Date) pair.
cleaned_df = (
    df.filter(df["ServiceCost"] > 0)
    .filter(df["Mileage"] > 0)
    .dropDuplicates(["VehicleID", "Date"])
)
cleaned_df.show()

# Store the cleaned rows as their own Delta table, replacing earlier output.
cleaned_df.write.save("/delta/cleaned_vehicle_maintenance", format="delta", mode="overwrite")
print("Cleaned data saved successfully.")

Vehicle Maintenance Analysis

In [None]:

from pyspark.sql import SparkSession
# Import the functions module under an alias: importing `sum` by name would
# shadow Python's built-in sum() for every later cell in this kernel.
from pyspark.sql import functions as F
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("VehicleMaintenanceAnalysis").getOrCreate()

# Work from the cleaned table produced by the cleaning step.
df = spark.read.format("delta").load("/delta/cleaned_vehicle_maintenance")

# Total service cost per vehicle.
total_cost_df = df.groupBy("VehicleID").agg(F.sum("ServiceCost").alias("TotalCost"))
total_cost_df.show()

# Records whose mileage exceeds 30,000.
high_mileage_df = df.filter(col("Mileage") > 30000)
high_mileage_df.show()

# Persist both result sets as Delta tables, replacing earlier output.
total_cost_df.write.format("delta").mode("overwrite").save("/delta/vehicle_total_cost")
high_mileage_df.write.format("delta").mode("overwrite").save("/delta/high_mileage_vehicles")
print("Analysis results saved successfully.")


Data Governance with Delta Lake

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("DeltaLakeGovernance").getOrCreate()

# Load the cleaned table (the DataFrame itself is not used further in this cell).
df = spark.read.load("/delta/cleaned_vehicle_maintenance", format="delta")

# Run VACUUM on the table with a 168-hour (7-day) retention window.
vacuum_stmt = "VACUUM '/delta/cleaned_vehicle_maintenance' RETAIN 168 HOURS"
spark.sql(vacuum_stmt)
print("VACUUM operation completed.")

# Show the table's commit history.
history_stmt = "DESCRIBE HISTORY '/delta/cleaned_vehicle_maintenance'"
history_df = spark.sql(history_stmt)
history_df.show()


**Movie rating**

Data Ingestion

In [None]:

from pyspark.sql import SparkSession
import os

spark = SparkSession.builder.appName("MovieRatingsDataIngestion").getOrCreate()
file_path = "/content/sample_data/movie_ratings.csv"

# Guard first: report a missing file, otherwise attempt the load and save.
if not os.path.exists(file_path):
    print(f"CSV file does not exist at {file_path}.")
else:
    try:
        # Read the CSV with its first row as the header.
        df = spark.read.csv(file_path, header=True)
        df.show()
        # Write the raw ratings as a Delta table, replacing any previous contents.
        df.write.save("/delta/movie_ratings", format="delta", mode="overwrite")
        print("Movie ratings data saved successfully to Delta table")

    except Exception as err:
        # Report the failure rather than stopping the notebook.
        print(f"Error: {err}")


Data Cleaning

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("MovieRatingsDataCleaning").getOrCreate()
df = spark.read.load("/delta/movie_ratings", format="delta")

# Keep ratings within the inclusive range 1..5, then keep a single row per
# (UserID, MovieID) pair.
cleaned_df = (
    df.filter(df["Rating"].between(1, 5))
    .dropDuplicates(["UserID", "MovieID"])
)
cleaned_df.show()

# Store the cleaned ratings as a separate Delta table, replacing earlier output.
cleaned_df.write.save("/delta/cleaned_movie_ratings", format="delta", mode="overwrite")
print("Cleaned data saved successfully.")


Movie Rating Analysis

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col

spark = SparkSession.builder.appName("MovieRatingsAnalysis").getOrCreate()
df = spark.read.load("/delta/cleaned_movie_ratings", format="delta")

# Average rating per movie.
avg_rating_df = df.groupBy("MovieID").agg(avg(col("Rating")).alias("AvgRating"))
avg_rating_df.show()

# Take the top and bottom row by average rating (one row each).
max_rating_df = avg_rating_df.sort(col("AvgRating"), ascending=False).limit(1)
min_rating_df = avg_rating_df.sort(col("AvgRating"), ascending=True).limit(1)
max_rating_df.show()
min_rating_df.show()

# Persist the per-movie averages, replacing earlier output.
avg_rating_df.write.save("/delta/movie_avg_ratings", format="delta", mode="overwrite")
print("Average ratings saved successfully.")


Time Travel and Delta Lake History

In [None]:
from pyspark.sql import SparkSession
# `when` is used below but was never imported anywhere in the notebook,
# so the cell raised NameError before reaching the write.
from pyspark.sql.functions import when

spark = SparkSession.builder.appName("MovieRatingsTimeTravel").getOrCreate()
df = spark.read.format("delta").load("/delta/cleaned_movie_ratings")

# Set Rating to 5 for user U001 (other rows keep their rating) and overwrite
# the table, which records a new version in the Delta log.
df = df.withColumn("Rating",
                   when(df.UserID == 'U001', 5).otherwise(df.Rating))
df.write.format("delta").mode("overwrite").save("/delta/cleaned_movie_ratings")

# Time travel: read the table as it was at version 0.
original_df = spark.read.format("delta").option("versionAsOf", 0).load("/delta/cleaned_movie_ratings")
original_df.show()

# Show the table's commit history.
history_df = spark.sql("DESCRIBE HISTORY '/delta/cleaned_movie_ratings'")
history_df.show()


Optimize Delta Table

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("OptimizeDeltaTable").getOrCreate()

ratings_table = "/delta/cleaned_movie_ratings"

# OPTIMIZE with Z-ordering on MovieID.
spark.sql(f"OPTIMIZE '{ratings_table}' ZORDER BY (MovieID)")

# Plain OPTIMIZE pass on the same table.
spark.sql(f"OPTIMIZE '{ratings_table}'")

# VACUUM with a 168-hour (7-day) retention window.
spark.sql(f"VACUUM '{ratings_table}' RETAIN 168 HOURS")


**Student data**

Reading Data from Various Formats

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("DataIngestion").getOrCreate()

# Each source is loaded independently; a failure is reported and the cell moves
# on to the next source.

# CSV source (first row is the header).
csv_file_path = "/content/Sample_data/student_info.csv"
try:
    students_df = spark.read.csv(csv_file_path, header=True)
    students_df.show()
except Exception as err:
    print(f"Error loading CSV file: {err}")


# JSON source.
json_file_path = "/content/Sample_data/city_info.json"
try:
    city_df = spark.read.json(json_file_path)
    city_df.show()
except Exception as err:
    print(f"Error loading JSON file: {err}")


# Parquet source.
parquet_file_path = "/content/Sample_data/hospitals.parquet"
try:
    hospitals_df = spark.read.parquet(parquet_file_path)
    hospitals_df.show()
except Exception as err:
    print(f"Error loading Parquet file: {err}")


# Delta table source.
delta_table_path = "/delta/hospital_records"
try:
    hospital_delta_df = spark.read.load(delta_table_path, format="delta")
    hospital_delta_df.show()
except Exception as err:
    print(f"Error loading Delta table: {err}")


Writing Data to Various Formats

In [None]:

# Write each DataFrame out in its own format; "overwrite" replaces existing output.
students_df.write.save("/dbfs/FileStore/output/student_info.csv", format="csv", mode="overwrite")

city_df.write.save("/dbfs/FileStore/output/city_info.json", format="json", mode="overwrite")

hospitals_df.write.save("/dbfs/FileStore/output/hospital_data.parquet", format="parquet", mode="overwrite")

# The hospital DataFrame is additionally written as a Delta table.
hospitals_df.write.save("/delta/hospital_data", format="delta", mode="overwrite")


In [None]:
csv_file_path = "/content/Sample_data/student_info.csv"
students_df = spark.read.format("csv").option("header", "true").load(csv_file_path)

# The CSV is read without a schema, so Score arrives as a string. Cast it to a
# numeric type before filling and saving: the downstream aggregation
# (groupBy(...).avg("Score")) only accepts numeric columns.
cleaned_students_df = (
    students_df.dropDuplicates()
    .withColumn("Score", students_df["Score"].cast("double"))
    .na.fill({'Score': 0})
)

# overwriteSchema lets the overwrite succeed even if an earlier run stored
# Score with a different type at this path.
(
    cleaned_students_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("/delta/cleaned_students")
)

# Hand off to the follow-up notebook (Databricks-only API; the path is a
# placeholder, and 60 is the timeout argument in seconds).
dbutils.notebook.run("/path/to/Notebook_B", 60)


In [None]:

from pyspark.sql.functions import avg, col

cleaned_students_df = spark.read.format("delta").load("/delta/cleaned_students")

# Two fixes over groupBy("Class").avg("Score"):
#  * Score may be stored as a string (the source CSV is read without a schema)
#    and GroupedData.avg() rejects non-numeric columns, so cast explicitly.
#  * The default result name "avg(Score)" contains parentheses, which Delta
#    does not accept in column names (unless column mapping is enabled), so the
#    aggregate is given an explicit alias.
avg_score_df = cleaned_students_df.groupBy("Class").agg(
    avg(col("Score").cast("double")).alias("AvgScore")
)
avg_score_df.show()

avg_score_df.write.format("delta").mode("overwrite").save("/delta/average_student_scores")


Databricks Ingestion from Various Sources

In [None]:

# Imported here so the cell does not depend on an earlier cell having run.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("DatabricksIngestion").getOrCreate()

# Reading CSV from Azure Data Lake.
# <container> and <storage_account> are placeholders that must be filled in.
csv_file_path_adl = "abfss://<container>@<storage_account>.dfs.core.windows.net/student_info.csv"
adl_students_df = spark.read.format("csv").option("header", "true").load(csv_file_path_adl)
adl_students_df.show()

# Reading JSON from Databricks FileStore.
# Spark APIs address DBFS with the dbfs:/ scheme; the /dbfs/... form is the
# local FUSE mount intended for local file APIs (open(), os, ...), and Spark
# would resolve it to dbfs:/dbfs/... instead of the FileStore.
json_file_path = "dbfs:/FileStore/city_info.json"
filestore_city_df = spark.read.format("json").load(json_file_path)
filestore_city_df.show()

# Reading Parquet from AWS S3 (<bucket> is a placeholder).
parquet_file_path_s3 = "s3a://<bucket>/hospital_data.parquet"
s3_hospital_df = spark.read.format("parquet").load(parquet_file_path_s3)
s3_hospital_df.show()

# Delta table stored in Databricks.
delta_table_path = "/delta/hospital_records"
delta_hospital_df = spark.read.format("delta").load(delta_table_path)
delta_hospital_df.show()

# Performing transformations: keep students whose Score is above 80.
filtered_students_df = adl_students_df.filter(adl_students_df.Score > 80)
filtered_students_df.show()

# Writing the data to CSV, JSON, Parquet, and Delta formats
# (dbfs:/ scheme for the same reason as the FileStore read above).
filtered_students_df.write.format("csv").mode("overwrite").save("dbfs:/FileStore/output/filtered_students.csv")
filestore_city_df.write.format("json").mode("overwrite").save("dbfs:/FileStore/output/filtered_city_data.json")
s3_hospital_df.write.format("parquet").mode("overwrite").save("dbfs:/FileStore/output/filtered_hospitals.parquet")
delta_hospital_df.write.format("delta").mode("overwrite").save("/delta/filtered_hospital_data")


Additional Task

In [None]:

hospital_table = "/delta/filtered_hospital_data"

# Plain OPTIMIZE pass on the table.
spark.sql(f"OPTIMIZE '{hospital_table}'")

# OPTIMIZE again, this time Z-ordering on CityName.
spark.sql(f"OPTIMIZE '{hospital_table}' ZORDER BY (CityName)")

# VACUUM with a 168-hour (7-day) retention window.
spark.sql(f"VACUUM '{hospital_table}' RETAIN 168 HOURS")



