In [1]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 3.8 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812363 sha256=5a8f6cd9914ad1588b49ff40232ad39f2a2ab31c23cfba1d056d234505761370
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


Task 1: Data Ingestion - Reading Data from Various Formats

In [None]:
import json
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Delta Lake release line built against Spark 3.5.x / Scala 2.12.
DELTA_PACKAGE = "io.delta:delta-spark_2.12:3.2.0"

# Create a Spark session.
# Stock PySpark ships without the "delta" data source, so the Delta jars, SQL
# extension and catalog are registered here; without them every
# format("delta") read/write and the OPTIMIZE/VACUUM statements fail.
# These settings only take effect when this call starts the session; an
# already-running session (e.g. on Databricks) is returned as-is.
spark = (
    SparkSession.builder.appName("DataIngestion")
    .config("spark.jars.packages", DELTA_PACKAGE)
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .getOrCreate()
)

# Student records: in-memory rows that Task 2 writes out as CSV.
csv_data = [("S001", "Anil Kumar", 10, 85),
            ("S002", "Neha Sharma", 12, 92),
            ("S003", "Rajesh Gupta", 11, 78)]
csv_columns = ["StudentID", "Name", "Class", "Score"]
csv_df = spark.createDataFrame(csv_data, schema=csv_columns)

# City records: in-memory dicts that are parsed through the JSON reader.
json_data = [
    {"CityID": "C001", "CityName": "Mumbai", "Population": 20411000},
    {"CityID": "C002", "CityName": "Delhi", "Population": 16787941},
    {"CityID": "C003", "CityName": "Bangalore", "Population": 8443675}
]
# DataFrameReader.json takes an RDD of JSON *strings*, so each record is
# serialised with json.dumps instead of handing the reader raw dicts.
json_rdd = spark.sparkContext.parallelize([json.dumps(record) for record in json_data])
json_df = spark.read.json(json_rdd)

# Parquet Data
# On failure the name is bound to None (rather than left undefined) so later
# cells can test for it instead of dying with a NameError.
parquet_path = "/content/sample_data/hospital_data.parquet"
try:
    hospital_parquet_df = spark.read.parquet(parquet_path)
except Exception as e:
    hospital_parquet_df = None
    print(f"Error reading Parquet data: {e}")

# Delta Table (same None-on-failure convention as the Parquet read)
delta_table_path = "/content/sample_data/delta/hospital_records"
try:
    hospital_delta_df = spark.read.format("delta").load(delta_table_path)
except Exception as e:
    hospital_delta_df = None
    print(f"Error reading Delta table: {e}")


Task 2: Writing Data to Various Formats

In [None]:
# Each writer below produces a *directory* of part files at the given path,
# and mode("overwrite") replaces whatever is already there.

# Write CSV
# header=True keeps the column names in the output; every CSV read in this
# notebook uses header=True, so a headerless file would lose its first row
# when read back.
csv_output_path = "/content/sample_data/output/students.csv"
csv_df.write.mode("overwrite").csv(csv_output_path, header=True)

# Write JSON
json_output_path = "/content/sample_data/output/cities.json"
json_df.write.mode("overwrite").json(json_output_path)

# The hospital DataFrame exists only if the Parquet read in the ingestion
# cell succeeded; look it up defensively so a missing input file results in
# a clear message instead of a NameError/AttributeError.
parquet_output_path = "/content/sample_data/output/hospital_data.parquet"
delta_output_path = "/content/sample_data/delta/hospital_data"
hospital_source_df = globals().get("hospital_parquet_df")

if hospital_source_df is None:
    print("Skipping Parquet/Delta output: hospital_parquet_df was not loaded.")
else:
    # Write Parquet
    hospital_source_df.write.mode("overwrite").parquet(parquet_output_path)

    # Write Delta Table
    hospital_source_df.write.format("delta").mode("overwrite").save(delta_output_path)

Task 3: Running One Notebook from Another

Notebook A: Data Ingestion and Cleaning (runs Notebook B when finished)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Child notebook to chain to, and how long to wait for it before giving up.
NOTEBOOK_B_PATH = "/content/sample_data/Various_Formats_Notebook_B"
NOTEBOOK_B_TIMEOUT_SECONDS = 60

# Create a Spark session (returns the already-running session if one exists)
spark = SparkSession.builder.appName("StudentDataIngestion").getOrCreate()

# Ingest data from CSV; the first line of the file supplies the column names
raw_students_df = spark.read.csv("/content/sample_data/students.csv", header=True)

# Clean the data: drop exact duplicate rows, then replace missing Score with 0
cleaned_students_df = raw_students_df.dropDuplicates().na.fill({"Score": 0})

# Save as Delta table (replaces any previous contents at this path)
cleaned_delta_path = "/content/sample_data/delta/cleaned_students"
cleaned_students_df.write.format("delta").mode("overwrite").save(cleaned_delta_path)

# Run Notebook B
# dbutils is injected by the Databricks runtime only; elsewhere the name does
# not exist, so report that clearly instead of failing with a NameError.
try:
    dbutils
except NameError:
    notebook_b_result = None
    print("dbutils is not available (not running on Databricks); skipping Notebook B.")
else:
    # notebook.run returns the child notebook's exit value; keep and show it.
    notebook_b_result = dbutils.notebook.run(NOTEBOOK_B_PATH, NOTEBOOK_B_TIMEOUT_SECONDS)
    print(f"Notebook B exit value: {notebook_b_result}")

Task 4: Databricks Ingestion

In [None]:
# Imported here so the cell does not depend on an earlier cell's import.
from pyspark.sql.functions import col

# Read from various sources
# NOTE: the <...> placeholders must be replaced with real container, account,
# path and bucket names before this cell can run.
csv_azure_path = "abfss://<your-container>@<your-account>.dfs.core.windows.net/<path>/students.csv"
json_dbfs_path = "/FileStore/cities.json"
parquet_s3_path = "s3a://<your-bucket>/hospital_data.parquet"
# DataFrameReader.load() takes a storage path. The delta.`<path>` form is a
# SQL table identifier and is only valid inside spark.sql(...) statements.
delta_db_path = "/content/sample_data/delta/hospital_data"

# Load data
students_df = spark.read.csv(csv_azure_path, header=True)
cities_df = spark.read.json(json_dbfs_path)
hospital_parquet_df = spark.read.parquet(parquet_s3_path)
hospital_delta_df = spark.read.format("delta").load(delta_db_path)

# Perform transformations (e.g., filter and calculate totals)
transformed_students_df = students_df.filter(col("Score") > 80)
total_students = transformed_students_df.count()
print(f"Students scoring above 80: {total_students}")

# Write cleaned data to various formats (each path becomes a directory of
# part files; existing output is replaced)
# header=True keeps the column names so the CSV can be read back the same way
# it was loaded above.
transformed_students_df.write.mode("overwrite").csv("/content/sample_data/output/cleaned_students.csv", header=True)
transformed_students_df.write.mode("overwrite").json("/content/sample_data/output/cleaned_students.json")
transformed_students_df.write.mode("overwrite").parquet("/content/sample_data/output/cleaned_students.parquet")
transformed_students_df.write.format("delta").mode("overwrite").save("/content/sample_data/delta/cleaned_students")

Additional Tasks: Optimization, Z-ordering, and Vacuuming

In [None]:
# Path-based SQL identifier of the Delta table maintained in this cell.
CLEANED_STUDENTS_TABLE = "delta.`/content/sample_data/delta/cleaned_students`"
# Delta safety check that rejects VACUUM retention periods below the default.
RETENTION_CHECK_KEY = "spark.databricks.delta.retentionDurationCheck.enabled"

# Optimize Delta table (compacts small files)
spark.sql(f"OPTIMIZE {CLEANED_STUDENTS_TABLE}")

# Apply Z-ordering on Class column
spark.sql(f"OPTIMIZE {CLEANED_STUDENTS_TABLE} ZORDER BY (Class)")

# Vacuum old versions of the Delta table
# RETAIN 0 HOURS is below Delta's default retention threshold, so the
# statement is refused unless the retention-duration check is disabled.
# Disable it only for this one statement and restore the previous value
# afterwards. WARNING: a zero-hour vacuum removes the files needed for time
# travel and is unsafe while other readers/writers use the table; it is only
# appropriate for a demo table like this one.
previous_retention_check = spark.conf.get(RETENTION_CHECK_KEY, "true")
spark.conf.set(RETENTION_CHECK_KEY, "false")
try:
    spark.sql(f"VACUUM {CLEANED_STUDENTS_TABLE} RETAIN 0 HOURS")
finally:
    spark.conf.set(RETENTION_CHECK_KEY, previous_retention_check)

# New Section