# PySpark 02 Filter operation

**Summary** 
- Filter operation
- Combining and comparing conditions: `&` (AND), `|` (OR), `==` (equals)
- Negating a condition: `~` (NOT)

### Setup

In [2]:
import os
import pyspark
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse the one already
# running in this kernel (getOrCreate does not start a second session).
spark = (
    SparkSession.builder
    .appName("filterOps")
    .getOrCreate()
)
spark  # last expression of the cell: display the session summary

### Load

In [22]:
# Read the employees dataset into a Spark DataFrame.
#
# The data directory can be overridden with the PYSPARK_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original Windows location is used.
ROOT = os.environ.get("PYSPARK_DATA_DIR", "C:\\Users\\PySpark\\")
FILE = "./employees.csv"

# os.path.join adds a separator only when ROOT does not already end with one,
# so an override without a trailing slash still resolves to the right file.
# header=True takes column names from the first row; inferSchema=True asks
# Spark to infer column types from the data instead of reading all as string.
df = spark.read.csv(os.path.join(ROOT, FILE), header=True, inferSchema=True)
df  # last expression of the cell: display the inferred schema

DataFrame[EMPLOYEE_ID: int, FIRST_NAME: string, LAST_NAME: string, EMAIL: string, PHONE_NUMBER: string, HIRE_DATE: string, JOB_ID: string, SALARY: int, COMMISSION_PCT: string, MANAGER_ID: string, DEPARTMENT_ID: int]

In [23]:
# Preview the data: show() prints the first 20 rows and truncates long cell
# values by default; the defaults are spelled out here for clarity.
df.show(n=20, truncate=True)

+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|    MK_REP|  6000|            - |       201|           20|


## 1. Filter by condition

**Examples**

- Employees filtered by salary, e.g. those earning at least 20,000, or between 10,000 and 20,000

**Syntax**  

- `df.filter('column <= 20000')`
- `df.filter('column condition').select('col1', 'col2', ...)`
- `df.filter(df['col'] <= value)`
- `df.filter( (cond1) & (cond2) & ... ).select('col1', 'col2', ...)`    >> `&`, `|`
- `df.filter( ~(cond1) | ~(cond2) | ... ).select('col1', 'col2', ...)`    >> `~` (NOT), combined with `&`, `|`

In [34]:
# Numerical comparison written as a SQL-style string predicate.
# The filtered frame is named once and reused for both displays.
high_earners = df.filter('SALARY >= 20000')

# All columns of the matching rows.
high_earners.show()

# Same rows, restricted to the columns of interest.
high_earners.select('FIRST_NAME', 'JOB_ID', 'SALARY').show()

+-----------+----------+---------+-----+------------+---------+-------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|EMAIL|PHONE_NUMBER|HIRE_DATE| JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+-----+------------+---------+-------+------+--------------+----------+-------------+
|        100|    Steven|     King|SKING|515.123.4567|17-JUN-03|AD_PRES| 24000|            - |        - |           90|
+-----------+----------+---------+-----+------------+---------+-------+------+--------------+----------+-------------+

+----------+-------+------+
|FIRST_NAME| JOB_ID|SALARY|
+----------+-------+------+
|    Steven|AD_PRES| 24000|
+----------+-------+------+



In [41]:
# Combine two column conditions with `&` (AND): salary within 10,000..20,000,
# both bounds inclusive. Each comparison is parenthesised because `&` binds
# tighter than `>=` / `<=` in Python.
salary = df['SALARY']
at_least_10k = (salary >= 10000)
at_most_20k = (salary <= 20000)

df.filter(at_least_10k & at_most_20k) \
  .select('FIRST_NAME', 'JOB_ID', 'SALARY') \
  .show()

+----------+------+------+
|FIRST_NAME|JOB_ID|SALARY|
+----------+------+------+
|   Michael|MK_MAN| 13000|
|   Hermann|PR_REP| 10000|
|   Shelley|AC_MGR| 12008|
|     Neena| AD_VP| 17000|
|       Lex| AD_VP| 17000|
|     Nancy|FI_MGR| 12008|
|       Den|PU_MAN| 11000|
+----------+------+------+



In [44]:
# Negate each condition with `~` (NOT) and combine with `|` (OR):
# keeps rows whose salary is NOT >= 10,000 or NOT <= 20,000, i.e. the
# complement of the 10,000..20,000 band.
salary = df['SALARY']
not_at_least_10k = ~(salary >= 10000)
not_at_most_20k = ~(salary <= 20000)

df.filter(not_at_least_10k | not_at_most_20k) \
  .select('FIRST_NAME', 'JOB_ID', 'SALARY') \
  .show()

+-----------+----------+------+
| FIRST_NAME|    JOB_ID|SALARY|
+-----------+----------+------+
|     Donald|  SH_CLERK|  2600|
|    Douglas|  SH_CLERK|  2600|
|   Jennifer|   AD_ASST|  4400|
|        Pat|    MK_REP|  6000|
|      Susan|    HR_REP|  6500|
|    William|AC_ACCOUNT|  8300|
|     Steven|   AD_PRES| 24000|
|  Alexander|   IT_PROG|  9000|
|      Bruce|   IT_PROG|  6000|
|      David|   IT_PROG|  4800|
|      Valli|   IT_PROG|  4800|
|      Diana|   IT_PROG|  4200|
|     Daniel|FI_ACCOUNT|  9000|
|       John|FI_ACCOUNT|  8200|
|     Ismael|FI_ACCOUNT|  7700|
|Jose Manuel|FI_ACCOUNT|  7800|
|       Luis|FI_ACCOUNT|  6900|
|  Alexander|  PU_CLERK|  3100|
|     Shelli|  PU_CLERK|  2900|
|      Sigal|  PU_CLERK|  2800|
+-----------+----------+------+
only showing top 20 rows

