In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Employee table layout: every column is nullable, so only the name and the
# Spark type vary from field to field.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("employee_id", IntegerType()),
        ("first_name", StringType()),
        ("last_name", StringType()),
        ("email", StringType()),
        ("phone_number", StringType()),
        ("hire_date", StringType()),  # stored as a plain string, not a date type
        ("job_id", StringType()),
        ("salary", IntegerType()),
        ("manager_id", IntegerType()),
        ("department_id", IntegerType()),
    )
])

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Sample rows; tuple positions follow the column order of `schema`.
data = [
    (1, "John", "Doe", "john.doe@example.com", "123-456-7890", "2020-01-01", "IT_PROG", 5000, 1, 1),
    (2, "Jane", "Smith", "jane.smith@example.com", "987-654-3210", "2020-02-01", "SA_REP", 6000, 1, 2),
    (3, "Mike", "Johnson", "mike.johnson@example.com", "555-555-5555", "2020-03-01", "IT_PROG", 5500, 2, 1),
]

# Build the DataFrame with the explicit schema and print it.
df = spark.createDataFrame(data, schema=schema)
df.show()

+-----------+----------+---------+--------------------+------------+----------+-------+------+----------+-------------+
|employee_id|first_name|last_name|               email|phone_number| hire_date| job_id|salary|manager_id|department_id|
+-----------+----------+---------+--------------------+------------+----------+-------+------+----------+-------------+
|          1|      John|      Doe|john.doe@example.com|123-456-7890|2020-01-01|IT_PROG|  5000|         1|            1|
|          2|      Jane|    Smith|jane.smith@exampl...|987-654-3210|2020-02-01| SA_REP|  6000|         1|            2|
|          3|      Mike|  Johnson|mike.johnson@exam...|555-555-5555|2020-03-01|IT_PROG|  5500|         2|            1|
+-----------+----------+---------+--------------------+------------+----------+-------+------+----------+-------------+



### Using the OR (`|`) condition

In [0]:
from pyspark.sql.functions import col

# Keep rows whose job_id starts with "SA" OR whose salary is above 5400.
# startswith() already returns a boolean Column, so it is used directly as the
# predicate instead of being compared to True.
# The salary comparison stays parenthesised because Python's `|` binds more
# tightly than `>`.
df.filter(col('job_id').startswith('SA') | (col('salary') > 5400)).show()

+-----------+----------+---------+--------------------+------------+----------+-------+------+----------+-------------+
|employee_id|first_name|last_name|               email|phone_number| hire_date| job_id|salary|manager_id|department_id|
+-----------+----------+---------+--------------------+------------+----------+-------+------+----------+-------------+
|          2|      Jane|    Smith|jane.smith@exampl...|987-654-3210|2020-02-01| SA_REP|  6000|         1|            2|
|          3|      Mike|  Johnson|mike.johnson@exam...|555-555-5555|2020-03-01|IT_PROG|  5500|         2|            1|
+-----------+----------+---------+--------------------+------------+----------+-------+------+----------+-------------+



### Using SQL-style syntax

In [0]:
# Filter with a SQL expression string: job_id starting with 'SA', or salary above 5400.
sql_condition = "job_id like 'SA%' or salary > 5400"
df.filter(sql_condition).show()

+-----------+----------+---------+--------------------+------------+----------+-------+------+----------+-------------+
|employee_id|first_name|last_name|               email|phone_number| hire_date| job_id|salary|manager_id|department_id|
+-----------+----------+---------+--------------------+------------+----------+-------+------+----------+-------------+
|          2|      Jane|    Smith|jane.smith@exampl...|987-654-3210|2020-02-01| SA_REP|  6000|         1|            2|
|          3|      Mike|  Johnson|mike.johnson@exam...|555-555-5555|2020-03-01|IT_PROG|  5500|         2|            1|
+-----------+----------+---------+--------------------+------------+----------+-------+------+----------+-------------+

