In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=b09bc87fc8e6f519965311001ad7ddd734a8c770ebad1e36bc5f576ff02b2d24
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


**Extract**

In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for this ETL job
spark = (
    SparkSession.builder
    .appName("EmployeeSalaryETL")
    .getOrCreate()
)

# Read the employee CSV: the first row is the header and column types are inferred
file_path = '/content/sample_data/employee.csv'
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(file_path)
)

# Preview the raw rows
df.show()


+-------+------+--------+---------+
|  name | age  | gender | salary  |
+-------+------+--------+---------+
| John  |  28.0| Male   |  60000.0|
| Jane  |  32.0| Female |  72000.0|
| Mike  |  45.0| Male   |  84000.0|
| Emily |  23.0| Female |  52000.0|
| Alex  |  36.0| Male   |  67000.0|
+-------+------+--------+---------+



**Transform**

In [None]:
from pyspark.sql.functions import col

# Keep only employees who are at least 30 years old.
# The column is addressed as ' age  ' -- the name includes its surrounding spaces.
is_thirty_or_older = col(' age  ') >= 30
filtered_df = df.where(is_thirty_or_older)


In [None]:
from pyspark.sql.functions import expr, trim

# Add 'salary_with_bonus': the base salary increased by a 10% bonus.
# ' salary  ' (name includes its surrounding spaces) is a numeric column, so it is
# multiplied directly. Wrapping it in trim() only forced a number -> string -> number
# round trip, because trim() is a string function.
transformed_df = filtered_df.withColumn('salary_with_bonus', col(' salary  ') * 1.10)

In [None]:
# Average base salary and bonus-adjusted salary per gender.
# Backticks quote ` salary  ` inside the SQL expression because the name contains spaces.
salary_averages = [
    expr('avg(` salary  `)').alias('average_salary'),
    expr('avg(salary_with_bonus)').alias('average_salary_with_bonus'),
]
average_salary_by_gender = transformed_df.groupBy(' gender ').agg(*salary_averages)

# Display the per-gender averages
average_salary_by_gender.show()

+--------+--------------+-------------------------+
| gender |average_salary|average_salary_with_bonus|
+--------+--------------+-------------------------+
| Female |       72000.0|                  79200.0|
| Male   |       75500.0|                  83050.0|
+--------+--------------+-------------------------+



**Load**

In [None]:
# Persist the filtered, bonus-augmented rows as Parquet.
# mode('overwrite') replaces any earlier output: the writer's default mode raises
# when the target path already exists, which made this cell fail on every re-run.
parquet_path = 'path/to/transformed_employee_data.parquet'
transformed_df.write.mode('overwrite').parquet(parquet_path)


**Summary report**

In [None]:
# Inspect the schema to confirm the exact (space-padded) column names
transformed_df.printSchema()

# Show the report columns, referenced by their exact names including spaces
report_columns = [' age  ', ' gender ', ' salary  ', 'salary_with_bonus']
transformed_df.select(*report_columns).show()

root
 |--  name : string (nullable = true)
 |--  age  : double (nullable = true)
 |--  gender : string (nullable = true)
 |--  salary  : double (nullable = true)
 |-- salary_with_bonus: double (nullable = true)

+------+--------+---------+-----------------+
| age  | gender | salary  |salary_with_bonus|
+------+--------+---------+-----------------+
|  32.0| Female |  72000.0|          79200.0|
|  45.0| Male   |  84000.0|92400.00000000001|
|  36.0| Male   |  67000.0|          73700.0|
+------+--------+---------+-----------------+



In [None]:
# Persist the per-gender salary averages as Parquet.
# mode('overwrite') replaces any earlier output: the writer's default mode raises
# when the target path already exists, which made this cell fail on every re-run.
aggregated_parquet_path = 'path/to/average_salary_by_gender.parquet'
average_salary_by_gender.write.mode('overwrite').parquet(aggregated_parquet_path)
