# When and Otherwise Statements in PySpark

In [8]:
# Column names for the sample employee table. Only names are given, so Spark
# infers each column's type from the row values.
schema = ['Name', 'Department', 'Salary']

# One tuple per employee, fields in the same order as `schema`.
data = [
    ('John', 'HR', 4000),
    ('Alice', 'IT', 7000),
    ('Bob', 'Finance', 5000),
    ('Eve', 'IT', 8000),
    ('Charlie', 'HR', 4500),
]

# Build the DataFrame used by the cells below and preview it.
df = spark.createDataFrame(data, schema)
df.show()


StatementMeta(, a2dfe7ed-b79c-4fd7-854a-3135ed7321c2, 10, Finished, Available, Finished)

+-------+----------+------+
|   Name|Department|Salary|
+-------+----------+------+
|   John|        HR|  4000|
|  Alice|        IT|  7000|
|    Bob|   Finance|  5000|
|    Eve|        IT|  8000|
|Charlie|        HR|  4500|
+-------+----------+------+



### 1. Basic Analysis with When-Otherwise

In [9]:
from pyspark.sql.functions import col, when, avg, sum

# Bucket every salary into a band. The branches of a when-chain are tested
# top to bottom and the first match wins; rows matching neither branch fall
# through to "Senior Level".
salary = col("Salary")
salary_band = (
    when(salary <= 4000, "Entry Level")
    .when((salary > 4000) & (salary <= 6000), "Mid Level")
    .otherwise("Senior Level")
)

# Append the band as a new column and preview the result.
df_with_category = df.withColumn("SalaryCategory", salary_band)
df_with_category.show()

StatementMeta(, a2dfe7ed-b79c-4fd7-854a-3135ed7321c2, 11, Finished, Available, Finished)

+-------+----------+------+--------------+
|   Name|Department|Salary|SalaryCategory|
+-------+----------+------+--------------+
|   John|        HR|  4000|   Entry Level|
|  Alice|        IT|  7000|  Senior Level|
|    Bob|   Finance|  5000|     Mid Level|
|    Eve|        IT|  8000|  Senior Level|
|Charlie|        HR|  4500|     Mid Level|
+-------+----------+------+--------------+



### 2. Department-wise Analysis

In [10]:
# Per-department salary summary: mean and total.
# NOTE: `sum` here is pyspark.sql.functions.sum (imported in the previous
# cell), which shadows Python's builtin sum in this notebook.
dept_aggregates = [
    avg("Salary").alias("AvgSalary"),
    sum("Salary").alias("TotalSalary"),
]

dept_stats = df.groupBy("Department").agg(*dept_aggregates)
dept_stats.show()

StatementMeta(, a2dfe7ed-b79c-4fd7-854a-3135ed7321c2, 12, Finished, Available, Finished)

+----------+---------+-----------+
|Department|AvgSalary|TotalSalary|
+----------+---------+-----------+
|        HR|   4250.0|       8500|
|        IT|   7500.0|      15000|
|   Finance|   5000.0|       5000|
+----------+---------+-----------+



### 3. Conditional Aggregations

In [11]:
# Conditional counting: turn each row into a 1/0 flag on either side of the
# 5000 cut-off, then add the flags up per department.
is_junior = when(col("Salary") <= 5000, 1).otherwise(0)
is_senior = when(col("Salary") > 5000, 1).otherwise(0)

salary_distribution = df.groupBy("Department").agg(
    sum(is_junior).alias("Junior_Count"),
    sum(is_senior).alias("Senior_Count"),
)
salary_distribution.show()

StatementMeta(, a2dfe7ed-b79c-4fd7-854a-3135ed7321c2, 13, Finished, Available, Finished)

+----------+------------+------------+
|Department|Junior_Count|Senior_Count|
+----------+------------+------------+
|        HR|           2|           0|
|        IT|           0|           2|
|   Finance|           1|           0|
+----------+------------+------------+



### 4. Complex Transformations

In [12]:
# Derive two department-driven columns from the base employee table.
department = col("Department")
salary = col("Salary")

# Bonus rate depends on the department: 15% for IT, 10% for HR, 12% otherwise.
performance_bonus = (
    when(department == "IT", salary * 0.15)
    .when(department == "HR", salary * 0.10)
    .otherwise(salary * 0.12)
)

# Coarse label per department; anything other than IT/Finance is "Standard".
department_rank = (
    when(department == "IT", "High")
    .when(department == "Finance", "Medium")
    .otherwise("Standard")
)

detailed_analysis = (
    df
    .withColumn("Performance_Bonus", performance_bonus)
    .withColumn("Department_Rank", department_rank)
)
detailed_analysis.show()

StatementMeta(, a2dfe7ed-b79c-4fd7-854a-3135ed7321c2, 14, Finished, Available, Finished)

+-------+----------+------+-----------------+---------------+
|   Name|Department|Salary|Performance_Bonus|Department_Rank|
+-------+----------+------+-----------------+---------------+
|   John|        HR|  4000|            400.0|       Standard|
|  Alice|        IT|  7000|           1050.0|           High|
|    Bob|   Finance|  5000|            600.0|         Medium|
|    Eve|        IT|  8000|           1200.0|           High|
|Charlie|        HR|  4500|            450.0|       Standard|
+-------+----------+------+-----------------+---------------+



### 5. Window Functions with Conditions

In [13]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, dense_rank

# Window over each department, highest salary first.
window_spec = Window.partitionBy("Department").orderBy(col("Salary").desc())

# Translate the in-department rank into a label. rank() gives tied salaries
# the same rank and skips the following rank(s).
dept_rank = col("Dept_Rank")
status = (
    when(dept_rank == 1, "Top Performer")
    .when(dept_rank == 2, "Strong Performer")
    .otherwise("Regular")
)

# "Status" reads "Dept_Rank", so the rank column has to be added first.
ranked_df = (
    df
    .withColumn("Dept_Rank", rank().over(window_spec))
    .withColumn("Status", status)
)
ranked_df.show()

StatementMeta(, a2dfe7ed-b79c-4fd7-854a-3135ed7321c2, 15, Finished, Available, Finished)

+-------+----------+------+---------+----------------+
|   Name|Department|Salary|Dept_Rank|          Status|
+-------+----------+------+---------+----------------+
|    Bob|   Finance|  5000|        1|   Top Performer|
|Charlie|        HR|  4500|        1|   Top Performer|
|   John|        HR|  4000|        2|Strong Performer|
|    Eve|        IT|  8000|        1|   Top Performer|
|  Alice|        IT|  7000|        2|Strong Performer|
+-------+----------+------+---------+----------------+



### 6. Multiple Column Conditions

In [14]:
# Categorise employees on department AND salary together. Each department has
# its own "senior" salary threshold; rows that clear none of them are
# "Junior Staff".
department = col("Department")
salary = col("Salary")

is_senior_it = (department == "IT") & (salary > 6000)
is_senior_hr = (department == "HR") & (salary > 4000)
is_senior_finance = (department == "Finance") & (salary > 5000)

categorized_df = df.withColumn(
    "Employee_Category",
    when(is_senior_it, "Senior IT")
    .when(is_senior_hr, "Senior HR")
    .when(is_senior_finance, "Senior Finance")
    .otherwise("Junior Staff"),
)
categorized_df.show()

StatementMeta(, a2dfe7ed-b79c-4fd7-854a-3135ed7321c2, 16, Finished, Available, Finished)

+-------+----------+------+-----------------+
|   Name|Department|Salary|Employee_Category|
+-------+----------+------+-----------------+
|   John|        HR|  4000|     Junior Staff|
|  Alice|        IT|  7000|        Senior IT|
|    Bob|   Finance|  5000|     Junior Staff|
|    Eve|        IT|  8000|        Senior IT|
|Charlie|        HR|  4500|        Senior HR|
+-------+----------+------+-----------------+

