In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
import random

# Reuse the active SparkSession if there is one; otherwise start one named "example".
spark = SparkSession.builder \
    .appName("example") \
    .getOrCreate()

# Candidate values for the two columns of the synthetic employee table.
departments = ["Sales", "Marketing", "Engineering", "Finance", "HR"]
genders = ["Male", "Female"]

# Seed the generator so a fresh "Restart & Run All" reproduces the same rows,
# and therefore the same counts in every later cell.
random.seed(42)

# Draw 20 (deptname, gender) pairs; for each row the department is drawn
# first and the gender second.
data = [
    (random.choice(departments), random.choice(genders))
    for _ in range(20)
]

# Build the DataFrame straight from the local list with explicit column names.
# This avoids the intermediate RDD of Row objects and does not need direct
# access to spark.sparkContext.
df = spark.createDataFrame(data, schema=["deptname", "gender"])

# `display` is assumed to be supplied by the notebook runtime (it is not
# imported anywhere in this notebook).
display(df)


deptname,gender
Finance,Female
Sales,Male
HR,Female
Finance,Male
Engineering,Female
Finance,Female
Marketing,Male
Finance,Male
Marketing,Female
Engineering,Female


In [0]:
from pyspark.sql.functions import *

# Per-row markers: non-null only when the row's gender matches, so that
# count() (which skips nulls) tallies each gender separately.
female_marker = when(col('gender') == 'Female', 'f').alias('female')
male_marker = when(col('gender') == 'Male', 'M').alias('male')

df_marked = df.select('deptname', female_marker, male_marker)

# One row per department: female count, male count, and their sum.
df_agg = (
    df_marked
    .groupBy(col('deptname'))
    .agg(
        count(col('female')).alias('count_female'),
        count(col('male')).alias('count_male'),
    )
    .withColumn('count_total', col('count_female') + col('count_male'))
)
display(df_agg)

deptname,count_female,count_male,count_total
Sales,0,2,2
Finance,3,5,8
HR,2,1,3
Engineering,3,0,3
Marketing,2,2,4


In [0]:
# Pack the three per-department counts into one array column.
# Element order is: female count, male count, total count. The per-element
# aliases do not show up in the displayed array values; only the position
# identifies each count downstream.
counts_array_col = array(
    col("count_female").alias("female"),
    col("count_male").alias("male"),
    col("count_total").alias("total"),
).alias("counts_array")

df_array = df_agg.select("deptname", counts_array_col)
display(df_array)

deptname,counts_array
Sales,"List(0, 2, 2)"
Finance,"List(3, 5, 8)"
HR,"List(2, 1, 3)"
Engineering,"List(3, 0, 3)"
Marketing,"List(2, 2, 4)"


In [0]:
# Turn each element of counts_array into its own row, so every department
# contributes one row per array element, with the value in "empcount".
empcount_col = explode("counts_array").alias("empcount")
df_explode = df_array.select("deptname", empcount_col)
display(df_explode)


deptname,empcount
Sales,0
Sales,2
Sales,2
Finance,3
Finance,5
Finance,8
HR,2
HR,1
HR,3
Engineering,3


In [0]:
from pyspark.sql.window import Window

In [0]:
# Number the counts of each department 1..3 in the order they were placed in
# counts_array (1 = female, 2 = male, 3 = total).
#
# The previous version ordered the window by "deptname", which is also the
# partition key: every row inside a partition tied, so row_number() could
# assign 1/2/3 in any order and the next cell's female/male/total labels
# were not guaranteed to land on the right counts.
#
# posexplode keeps each element's index within the array ("pos", 0-based),
# which gives the window a real, deterministic ordering key.
df_positions = df_array.select(
    col("deptname"),
    posexplode(col("counts_array")).alias("pos", "empcount"),
)

dept_window = Window.partitionBy(col("deptname")).orderBy(col("pos"))

# Same output columns as before: deptname, empcount, rownumber.
df_window = df_positions.select(
    col("deptname"),
    col("empcount"),
    row_number().over(dept_window).alias("rownumber"),
)

display(df_window)

deptname,empcount,rownumber
Engineering,3,1
Engineering,0,2
Engineering,3,3
Finance,3,1
Finance,5,2
Finance,8,3
HR,2,1
HR,1,2
HR,3,3
Marketing,2,1


In [0]:
# Label each row by its row number within the department: row 1 gets 'f' in
# "female", row 2 gets 'm' in "male", row 3 gets 'e' in "Total_emp".
# A when() without otherwise() leaves the non-matching rows null.
female_label = when(col('rownumber') == 1, 'f').alias("female")
male_label = when(col('rownumber') == 2, 'm').alias("male")
total_label = when(col('rownumber') == 3, 'e').alias("Total_emp")

df_totalemp = df_window.select(
    'deptname',
    'empcount',
    female_label,
    male_label,
    total_label,
)
display(df_totalemp)

deptname,empcount,female,male,Total_emp
Engineering,3,f,,
Engineering,0,,m,
Engineering,3,,,e
Finance,3,f,,
Finance,5,,m,
Finance,8,,,e
HR,2,f,,
HR,1,,m,
HR,3,,,e
Marketing,2,f,,
