# Filter Operations
<hr>

In [None]:
# Installation of pyspark package
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 46 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 55.1 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=68d30816feb3fb585b1a464735623bf4f854296c6495af14f426f107fe36e518
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [None]:
# Build the Spark session for this notebook (an already-running session is reused)
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("Filter")
spark = session_builder.getOrCreate()

# Display the session summary as the cell output
spark

In [None]:
# Load the CSV into a Spark DataFrame: the first row supplies the column
# names and the column types are inferred from the data.
spark_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("dataset.csv")
)

In [None]:
# Glance at the first 5 rows
spark_df.show(n=5)

+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+------------------+
| Gender|Age|Study_year|Living|Scholarship|Part_time_job|Transporting|Smoking|Drinks|Games_&_Hobbies|Cosmetics_&_Self-care|Monthly_Subscription|Monthly_expenses_$|
+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+------------------+
|Female | 21|         2|  Home|         No|           No|          No|     No|    No|             No|                  Yes|                  No|               150|
|  Male | 25|         3|Hostel|         No|          Yes|  Motorcycle|     No|    No|            Yes|                  Yes|                 Yes|               220|
|  Male | 23|         2|  Home|        Yes|           No|          No|     No|    No|             No|                   No|                null|               180|
|  Male | 19|   

In [None]:
# Descriptive statistics for every column, printed as a table
(
    spark_df
    .describe()
    .show()
)

+-------+-------+------------------+------------------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+------------------+
|summary| Gender|               Age|        Study_year|Living|Scholarship|Part_time_job|Transporting|Smoking|Drinks|Games_&_Hobbies|Cosmetics_&_Self-care|Monthly_Subscription|Monthly_expenses_$|
+-------+-------+------------------+------------------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+------------------+
|  count|    105|               105|               101|   103|        105|          100|          91|     94|    92|            105|                   92|                  96|                99|
|   mean|   null| 20.17142857142857|2.6534653465346536|  null|       null|         null|        null|   null|  null|           null|                 null|                null|214.94949494949495|
| stddev|   null|1.898669

In [None]:
# Rename Monthly_expenses_$ to Monthly_expenses.
# withColumnRenamed performs the rename in a single step (no copy-then-drop)
# and is a no-op when the source column is absent, so re-running this cell
# does not raise.
spark_df = spark_df.withColumnRenamed("Monthly_expenses_$", "Monthly_expenses")
spark_df.show()

+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+----------------+
| Gender|Age|Study_year|Living|Scholarship|Part_time_job|Transporting|Smoking|Drinks|Games_&_Hobbies|Cosmetics_&_Self-care|Monthly_Subscription|Monthly_expenses|
+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+----------------+
|Female | 21|         2|  Home|         No|           No|          No|     No|    No|             No|                  Yes|                  No|             150|
|  Male | 25|         3|Hostel|         No|          Yes|  Motorcycle|     No|    No|            Yes|                  Yes|                 Yes|             220|
|  Male | 23|         2|  Home|        Yes|           No|          No|     No|    No|             No|                   No|                null|             180|
|  Male | 19|         3|Host

In [None]:
# Students who spend more than 250 per month, with the condition
# written as a SQL expression string
(
    spark_df
    .filter(condition="Monthly_expenses > 250")
    .show()
)

+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+----------------+
| Gender|Age|Study_year|Living|Scholarship|Part_time_job|Transporting|Smoking|Drinks|Games_&_Hobbies|Cosmetics_&_Self-care|Monthly_Subscription|Monthly_expenses|
+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+----------------+
|Female | 19|         2|  Home|         No|           No|  Motorcycle|     No|    No|             No|                  Yes|                  No|             300|
|  Male | 22|         3|  Home|         No|           No|         Car|     No|    No|             No|                  Yes|                 Yes|             350|
|Female | 22|         4|Hostel|        Yes|          Yes|         Car|     No|    No|             No|                  Yes|                 Yes|             300|
|  Male | 19|         2|Host

In [None]:
# The same filter written as a Column expression instead of a SQL string;
# it yields the same result
spark_df.filter(spark_df.Monthly_expenses > 250).show()

+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+----------------+
| Gender|Age|Study_year|Living|Scholarship|Part_time_job|Transporting|Smoking|Drinks|Games_&_Hobbies|Cosmetics_&_Self-care|Monthly_Subscription|Monthly_expenses|
+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+----------------+
|Female | 19|         2|  Home|         No|           No|  Motorcycle|     No|    No|             No|                  Yes|                  No|             300|
|  Male | 22|         3|  Home|         No|           No|         Car|     No|    No|             No|                  Yes|                 Yes|             350|
|Female | 22|         4|Hostel|        Yes|          Yes|         Car|     No|    No|             No|                  Yes|                 Yes|             300|
|  Male | 19|         2|Host

In [None]:
# Filtering with multiple conditions, for example:
# boys who spend more than 250 per month and are under 20 years old.
#
# Each condition is wrapped in parentheses because & binds tighter than the
# comparison operators. The Gender values in this dataset carry a trailing
# space (see the table output), hence the literal "Male ".
spark_df.filter(
    (spark_df['Gender'] == "Male ") &
    (spark_df['Monthly_expenses'] > 250) &
    (spark_df['Age'] < 20)  # strictly under 20, matching the stated criterion
).show()

+------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+----------------+
|Gender|Age|Study_year|Living|Scholarship|Part_time_job|Transporting|Smoking|Drinks|Games_&_Hobbies|Cosmetics_&_Self-care|Monthly_Subscription|Monthly_expenses|
+------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+----------------+
| Male | 19|         2|Hostel|         No|          Yes|         Car|   null|    No|            Yes|                 null|                 Yes|             300|
+------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+----------------+

