In [1]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=f055cc081151992c624e2a18cd718004251a0aab0ee1d16a414522c86bb4f363
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
from pyspark.sql import SparkSession

# Build the Spark session used by the cells that follow (an already-active
# session is returned as-is by getOrCreate()).
spark = (
    SparkSession.builder
    .appName("Advanced DataFrame Operations - Different Dataset")
    .getOrCreate()
)

Extract

In [15]:
# Location of the employee CSV file that the notebook ingests.
csv_file_path = "/content/sample_data/employee.csv"

# Load the CSV with its first row treated as the header. The inferSchema
# option is not set on this reader.
df_csv = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(csv_file_path)
)
df_csv.show()


+-----+---+------+------+
| name|age|gender|salary|
+-----+---+------+------+
| John| 28|  Male| 60000|
| Jane| 32|Female| 72000|
| Mike| 45|  Male| 84000|
|Emily| 23|Female| 52000|
| Alex| 36|  Male| 67000|
+-----+---+------+------+



In [20]:
# Reuse the `spark` session created near the top of the notebook. Calling
# SparkSession.builder...getOrCreate() a second time returns the session that is
# already running instead of starting a new one, so rebuilding it here with a
# different appName (and re-importing SparkSession) was redundant and misleading.

# Read the employee CSV again, this time asking Spark to infer column types
# from the data (inferSchema) in addition to using the first row as the header.
df_employee = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(csv_file_path)
)
df_employee.show()

+-----+---+------+------+
| name|age|gender|salary|
+-----+---+------+------+
| John| 28|  Male| 60000|
| Jane| 32|Female| 72000|
| Mike| 45|  Male| 84000|
|Emily| 23|Female| 52000|
| Alex| 36|  Male| 67000|
+-----+---+------+------+



Transform

In [22]:
# Transform step: keep only the employees whose age is 30 or more.
from pyspark.sql.functions import col

# where() is an alias of filter(); the predicate is unchanged.
df_filtered = df_employee.where(col("age") >= 30)
df_filtered.show()

+----+---+------+------+
|name|age|gender|salary|
+----+---+------+------+
|Jane| 32|Female| 72000|
|Mike| 45|  Male| 84000|
|Alex| 36|  Male| 67000|
+----+---+------+------+



In [25]:
# Import the functions module under an alias instead of `from ... import round`,
# which would rebind Python's built-in round() in the notebook's global namespace.
from pyspark.sql import functions as F

# Multiplier applied to the base salary (a 10% bonus on top of salary).
BONUS_MULTIPLIER = 1.10

# Add a salary_with_bonus column: salary * BONUS_MULTIPLIER, rounded to 2 decimals.
df_transformed = df_employee.withColumn(
    "salary_with_bonus",
    F.round(F.col("salary") * BONUS_MULTIPLIER, 2),
)

# Show transformed data
df_transformed.show()

+-----+---+------+------+-----------------+
| name|age|gender|salary|salary_with_bonus|
+-----+---+------+------+-----------------+
| John| 28|  Male| 60000|          66000.0|
| Jane| 32|Female| 72000|          79200.0|
| Mike| 45|  Male| 84000|          92400.0|
|Emily| 23|Female| 52000|          57200.0|
| Alex| 36|  Male| 67000|          73700.0|
+-----+---+------+------+-----------------+



In [26]:
# Import the functions module under an alias instead of `from ... import round`,
# which would rebind Python's built-in round() in the notebook's global namespace.
from pyspark.sql import functions as F

# Group employees by gender and compute the mean salary per group,
# rounded to 2 decimals and exposed as the avg_salary column.
df_avg_salary_by_gender = (
    df_employee
    .groupBy("gender")
    .agg(F.round(F.avg("salary"), 2).alias("avg_salary"))
)

# Show the result (average salary by gender)
df_avg_salary_by_gender.show()

+------+----------+
|gender|avg_salary|
+------+----------+
|Female|   62000.0|
|  Male|  70333.33|
+------+----------+



Load

In [27]:
# Destination of the Parquet output for the bonus-augmented employee data.
parquet_output_path = "/content/sample_data/employee_data_with_bonus.parquet"

# Load step: write the transformed DataFrame as Parquet, replacing any output
# already present at that path.
df_transformed.write.parquet(path=parquet_output_path, mode="overwrite")

# Read the freshly written Parquet data back and display it as a check.
df_parquet = spark.read.parquet(parquet_output_path)
df_parquet.show()


+-----+---+------+------+-----------------+
| name|age|gender|salary|salary_with_bonus|
+-----+---+------+------+-----------------+
| John| 28|  Male| 60000|          66000.0|
| Jane| 32|Female| 72000|          79200.0|
| Mike| 45|  Male| 84000|          92400.0|
|Emily| 23|Female| 52000|          57200.0|
| Alex| 36|  Male| 67000|          73700.0|
+-----+---+------+------+-----------------+

