# PySpark DataFrames (filter operation)

- Filter Operation
- Combining and comparing conditions: `&`, `|`, `==`
- Negating a condition: `~`

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build a session for the app name 'dataframe', or reuse one if it already exists
spark = (
    SparkSession.builder
    .appName('dataframe')
    .getOrCreate()
)

In [3]:
# Bare expression on the last line of the cell: the notebook displays the session object
spark

In [4]:
# Load test1.csv: the first line supplies the column names and the
# column types are inferred from the data instead of defaulting to string
df = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)
# Print the loaded rows as a text table
df.show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|Vaibhav| 26|         3| 15000|
|  Krish| 31|         4| 20000|
|   Suny| 29|         5| 30000|
| Rohit | 24|         1| 10000|
| Arpit | 23|         1| 10000|
|Twinken| 23|         2| 15000|
|  Harsh| 21|         1| 14000|
+-------+---+----------+------+



# filter operation


In [5]:
### People whose Salary is less than or equal to 20000
(
    df
    .filter('Salary<=20000')   # condition written as a SQL expression string
    .show()
)

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|Vaibhav| 26|         3| 15000|
|  Krish| 31|         4| 20000|
| Rohit | 24|         1| 10000|
| Arpit | 23|         1| 10000|
|Twinken| 23|         2| 15000|
|  Harsh| 21|         1| 14000|
+-------+---+----------+------+



In [6]:
# Filter on Salary first, then keep only the Name and Age columns
(
    df
    .filter('Salary<=20000')
    .select(['Name', 'Age'])
    .show()
)

+-------+---+
|   Name|Age|
+-------+---+
|Vaibhav| 26|
|  Krish| 31|
| Rohit | 24|
| Arpit | 23|
|Twinken| 23|
|  Harsh| 21|
+-------+---+



In [8]:
# Filter with a Column comparison: rows whose Salary is at most 10000
(
    df
    .filter(df['Salary']<=10000)
    .show()
)

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|Rohit | 24|         1| 10000|
|Arpit | 23|         1| 10000|
+------+---+----------+------+



In [9]:
# Rows whose Salary lies between 10000 and 15000, both bounds included.
# Each comparison keeps its own parentheses because Python's & binds
# more tightly than <= and >=.
(
    df
    .filter(
        (df["Salary"]<=15000)
        & (df["Salary"]>=10000)
    )
    .show()
)

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|Vaibhav| 26|         3| 15000|
| Rohit | 24|         1| 10000|
| Arpit | 23|         1| 10000|
|Twinken| 23|         2| 15000|
|  Harsh| 21|         1| 14000|
+-------+---+----------+------+



In [10]:
# ~ negates the condition: keep the rows for which Salary<=10000 does not hold
(
    df
    .filter(~(df["Salary"]<=10000))
    .show()
)

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|Vaibhav| 26|         3| 15000|
|  Krish| 31|         4| 20000|
|   Suny| 29|         5| 30000|
|Twinken| 23|         2| 15000|
|  Harsh| 21|         1| 14000|
+-------+---+----------+------+



# GroupBy And Aggregate Functions

In [11]:
from pyspark.sql import SparkSession

In [12]:
# NOTE(review): `spark` was already created earlier in this notebook with
# appName('dataframe'); getOrCreate() presumably hands back that same session,
# so the 'Agg' name may not take effect -- confirm before relying on it.
spark = (
    SparkSession.builder
    .appName('Agg')
    .getOrCreate()
)

In [13]:
# Load test3.csv with a header row and inferred column types.
# NOTE: this rebinds `df`, which held the test1.csv data in the filter section.
df = spark.read.csv(
    "test3.csv",
    header=True,
    inferSchema=True,
)

In [14]:
# Print the rows of the DataFrame as a text table
df.show()

+---------+------------+------+
|     Name| Departments|Salary|
+---------+------------+------+
|  Vaibhav|Data Science| 10000|
|   Mahesh|   Big  Data|  5000|
|Sudhanshu|Data Science| 20000|
|    Rohit|         IOT| 10000|
|    Krish|      Python| 15000|
|  Vaibhav|         IOT|  5000|
|    Krish|Data Science| 10000|
|    Rohit|   Big  Data| 10000|
+---------+------------+------+



In [15]:
# Print each column's name, type and nullability as a tree
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Departments: string (nullable = true)
 |-- Salary: integer (nullable = true)



In [16]:
## Groupby
## Group the rows by Name and add up the Salary of each person's rows.
## The aggregate is sum(), so the result is a total per name, not a maximum.
(
    df
    .groupBy("Name")
    .sum()      # no columns named: the numeric column (Salary) is summed
    .show()
)

+---------+-----------+
|     Name|sum(Salary)|
+---------+-----------+
|Sudhanshu|      20000|
|  Vaibhav|      15000|
|    Krish|      25000|
|    Rohit|      20000|
|   Mahesh|       5000|
+---------+-----------+



In [17]:
## Group by Departments and total the Salary in each one,
## to see which department pays out the most overall (sum, not max)
(
    df
    .groupBy('Departments')
    .sum()
    .show()
)

+------------+-----------+
| Departments|sum(Salary)|
+------------+-----------+
|         IOT|      15000|
|   Big  Data|      15000|
|      Python|      15000|
|Data Science|      40000|
+------------+-----------+



In [19]:
# Average Salary within each department
(
    df
    .groupBy('Departments')
    .mean()
    .show()
)

+------------+------------------+
| Departments|       avg(Salary)|
+------------+------------------+
|         IOT|            7500.0|
|   Big  Data|            7500.0|
|      Python|           15000.0|
|Data Science|13333.333333333334|
+------------+------------------+



In [20]:
# Number of rows in each department
(
    df
    .groupBy('Departments')
    .count()
    .show()
)

+------------+-----+
| Departments|count|
+------------+-----+
|         IOT|    2|
|   Big  Data|    2|
|      Python|    1|
|Data Science|    3|
+------------+-----+



In [21]:
# Aggregate over the whole DataFrame without grouping:
# the dict maps a column name to the aggregate function to apply to it
(
    df
    .agg({'Salary':'sum'})
    .show()
)

+-----------+
|sum(Salary)|
+-----------+
|      85000|
+-----------+



In [24]:
# Highest Salary among each person's rows
(
    df
    .groupBy('Name')
    .max()
    .show()
)

+---------+-----------+
|     Name|max(Salary)|
+---------+-----------+
|Sudhanshu|      20000|
|  Vaibhav|      10000|
|    Krish|      15000|
|    Rohit|      10000|
|   Mahesh|       5000|
+---------+-----------+



In [23]:
# Lowest Salary among each person's rows
(
    df
    .groupBy('Name')
    .min()
    .show()
)

+---------+-----------+
|     Name|min(Salary)|
+---------+-----------+
|Sudhanshu|      20000|
|  Vaibhav|       5000|
|    Krish|      10000|
|    Rohit|      10000|
|   Mahesh|       5000|
+---------+-----------+

