## Setting Environment Variables

In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

## Create SparkSession

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .appName("Otherwise-when")\
        .getOrCreate()

## when() function

### DataFrame

In [3]:
data = [("James","M",60000),("Michael","M",70000),
        ("Robert",None,400000),("Maria","F",500000),
        ("Jen","",None)]

columns = ["name","gender","salary"]
df = spark.createDataFrame(data = data, schema = columns)
df.show()

+-------+------+------+
|   name|gender|salary|
+-------+------+------+
|  James|     M| 60000|
|Michael|     M| 70000|
| Robert|  null|400000|
|  Maria|     F|500000|
|    Jen|      |  null|
+-------+------+------+



### Using With Column

In [4]:
from pyspark.sql.functions import when
df2 = df.withColumn("new_gender", when(df.gender == 'M', 'Male')
                                   .when(df.gender == 'F', 'Female')
                                   .otherwise('N/A'))
df2.show()

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  null|400000|       N/A|
|  Maria|     F|500000|    Female|
|    Jen|      |  null|       N/A|
+-------+------+------+----------+



### Using select() function

In [5]:
from pyspark.sql.functions import col
df3=df.select(col("*"),when(df.gender == "M","Male")
                  .when(df.gender == "F","Female")
                  .otherwise("N/A").alias("new_gender"))
df3.show()

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  null|400000|       N/A|
|  Maria|     F|500000|    Female|
|    Jen|      |  null|       N/A|
+-------+------+------+----------+



### Using expr (Case, When) in SQL

In [6]:
from pyspark.sql.functions import expr
df4 = df.withColumn("new_Gender", expr(
    "CASE WHEN gender = 'M' Then 'Male'" +
    "WHEN gender = 'F' Then 'Female'" + 
    "ELSE 'Na' END"))
df4.show()

+-------+------+------+----------+
|   name|gender|salary|new_Gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  null|400000|        Na|
|  Maria|     F|500000|    Female|
|    Jen|      |  null|        Na|
+-------+------+------+----------+



### Using Case on WHEN SQL Expression

In [7]:
df.createOrReplaceTempView("sample")
df_sql = spark.sql("""
            SELECT name, 
            CASE
            WHEN gender = 'M' THEN 'MALE'
            WHEN gender = 'F' THEN 'FEMALE'
            ELSE 'NA'
            END as gender,
            salary
            FROM sample
        """)
df_sql.show()

+-------+------+------+
|   name|gender|salary|
+-------+------+------+
|  James|  MALE| 60000|
|Michael|  MALE| 70000|
| Robert|    NA|400000|
|  Maria|FEMALE|500000|
|    Jen|    NA|  null|
+-------+------+------+



## expr() function

### Dataframes

In [8]:
data=[("James","Bond"),("Scott","Varsa")] 
df=spark.createDataFrame(data).toDF("col1","col2") 
df.show()

+-----+-----+
| col1| col2|
+-----+-----+
|James| Bond|
|Scott|Varsa|
+-----+-----+



In [19]:
data1 = [
    ("James", "M"),
    ("Praveena", "F"),
    ("Yaswanth", "")
]
col1 = ["name", "gen"]
df1 = spark.createDataFrame(data1, col1)
df1.show()

+--------+---+
|    name|gen|
+--------+---+
|   James|  M|
|Praveena|  F|
|Yaswanth|   |
+--------+---+



In [34]:
data2=[("2019-01-23",1),("2019-06-24",2),("2019-09-20",3)] 
df2=spark.createDataFrame(data2).toDF("date","increment") 
df2.show()

+----------+---------+
|      date|increment|
+----------+---------+
|2019-01-23|        1|
|2019-06-24|        2|
|2019-09-20|        3|
+----------+---------+



### Concatenate using expr

In [14]:
df.withColumn("Name", expr("col1 ||' '|| col2")).show()

+-----+-----+-----------+
| col1| col2|       Name|
+-----+-----+-----------+
|James| Bond| James Bond|
|Scott|Varsa|Scott Varsa|
+-----+-----+-----------+



### When using expr

In [18]:
df1.withColumn("gender", expr(
    """
       CASE
        WHEN gen = 'M' THEN 'MALE'
        WHEN gen = 'F' THEN 'FEMALE'
        ELSE 'NA'
        END
    """
)).show()

+--------+---+------+
|    name|gen|gender|
+--------+---+------+
|   James|  M|  MALE|
|Praveena|  F|FEMALE|
|Yaswanth|   |    NA|
+--------+---+------+



### Using Existing Column Value for expr

In [32]:
df2.withColumn("updated_timestamp", expr(
    'add_months(date, increment)'
)).show()

+----------+---------+-----------------+
|      date|increment|updated_timestamp|
+----------+---------+-----------------+
|2019-01-23|        1|       2019-02-23|
|2019-06-24|        2|       2019-08-24|
|2019-09-20|        3|       2019-12-20|
+----------+---------+-----------------+



### Using alias with the column expr

In [35]:
df2.select(df2.date, df2.increment, expr(
    'add_months(date, increment) as new_date'
)).show()

+----------+---------+----------+
|      date|increment|  new_date|
+----------+---------+----------+
|2019-01-23|        1|2019-02-23|
|2019-06-24|        2|2019-08-24|
|2019-09-20|        3|2019-12-20|
+----------+---------+----------+



### cast function with expr

In [37]:
df2.select("increment", expr("cast(increment as string) as increment_str"))\
    .printSchema()

root
 |-- increment: long (nullable = true)
 |-- increment_str: string (nullable = true)



### Arithmetic operations

In [38]:
df2.select(df2.date,df2.increment, expr(
    "increment + 4 as new_increment"
)).show()

+----------+---------+-------------+
|      date|increment|new_increment|
+----------+---------+-------------+
|2019-01-23|        1|            5|
|2019-06-24|        2|            6|
|2019-09-20|        3|            7|
+----------+---------+-------------+



### Filter function

In [41]:
data3=[(100,2),(200,3000),(500,500)] 
df3=spark.createDataFrame(data3).toDF("col1","col2")
df3.filter(expr("col1 >col2")).show()

+----+----+
|col1|col2|
+----+----+
| 100|   2|
+----+----+



## lit() function

### Dataframe

In [42]:
data = [("111",50000),("222",60000),("333",40000)]
df = spark.createDataFrame(data, ["EmpId", "Salary"])
df.show()

+-----+------+
|EmpId|Salary|
+-----+------+
|  111| 50000|
|  222| 60000|
|  333| 40000|
+-----+------+



### lit() with select

In [48]:
from pyspark.sql.functions import lit
df.select("EmpId", "Salary", lit(1).alias("lit")).show()

+-----+------+---+
|EmpId|Salary|lit|
+-----+------+---+
|  111| 50000|  1|
|  222| 60000|  1|
|  333| 40000|  1|
+-----+------+---+



### lit with withColumn

In [50]:
df.withColumn("new_col", when(df.Salary > 40000, lit(200)).otherwise(lit(100))).show()

+-----+------+-------+
|EmpId|Salary|new_col|
+-----+------+-------+
|  111| 50000|    200|
|  222| 60000|    200|
|  333| 40000|    100|
+-----+------+-------+



### typedLit() withColumn

In [60]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import typedLit
df.withColumn("new_typed", typedLit("flag", StringType())).show()

ImportError: cannot import name 'typedLit' from 'pyspark.sql.functions' (C:\spark\spark-3.4.2-bin-hadoop3\python\pyspark\sql\functions.py)