## Filtering Data

In [None]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.4.0'
spark_version = 'spark-3.4.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

In [None]:
# Create the SparkSession used by the rest of the notebook
# (getOrCreate returns the already-running session if there is one).
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("sparkFunctions")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/11/07 11:56:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/11/07 11:56:50 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [None]:
from pyspark import SparkFiles

# Download the nutrition CSV and register it with Spark so SparkFiles.get can resolve it.
url = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.2/22-big-data/1/nutrition.csv"
spark.sparkContext.addFile(url)

# ignoreLeadingWhiteSpace=True is needed, otherwise a leading whitespace will appear
# in the column names.
# inferSchema=True makes Spark read the numeric columns as numbers. Without it every
# column is a string, and ordering by e.g. total_calories compares text, so "99.96"
# ranks directly after "990.40" in a descending sort.
df = spark.read.csv(
    SparkFiles.get("nutrition.csv"),
    sep=",",
    header=True,
    ignoreLeadingWhiteSpace=True,
    inferSchema=True,
)

# Show DataFrame
df.show()


[Stage 0:>                                                          (0 + 1) / 1]

                                                                                

+---------------+--------------+----------+---------+----------+-------------+--------------------+
|        dish_id|total_calories|total_mass|total_fat|total_carb|total_protein|         ingredients|
+---------------+--------------+----------+---------+----------+-------------+--------------------+
|dish_1561662216|    300.794281|193.000000|12.387489| 28.218290|    18.633970|soy sauce; garlic...|
|dish_1562688426|    137.569992| 88.000000| 8.256000|  5.190000|    10.297000|roasted potatoes;...|
|dish_1561662054|    419.438782|292.000000|23.838249| 26.351543|    25.910593|pepper; white ric...|
|dish_1562008979|    382.936646|290.000000|22.224644| 10.173570|    35.345387|jalapenos; lemon ...|
|dish_1560455030|     20.590000|103.000000| 0.148000|  4.625000|     0.956000|cherry tomatoes; ...|
|dish_1558372433|     74.360001|143.000000| 0.286000|  0.429000|    20.020000|          deprecated|
|dish_1563379132|    232.050003|119.000000|14.280000| 14.280000|    10.591001|         chilaquiles|


In [None]:
# Order a DataFrame by ascending values


+---------------+--------------+----------+---------+----------+-------------+-------------+
|        dish_id|total_calories|total_mass|total_fat|total_carb|total_protein|  ingredients|
+---------------+--------------+----------+---------+----------+-------------+-------------+
|dish_1556575700|      0.000000| 86.000000| 0.000000|  0.000000|     0.000000|   plate only|
|dish_1557861216|      0.000000|  1.000000| 0.000000|  0.000000|     0.000000|   plate only|
|dish_1558461431|      1.150000|  5.000000| 0.020000|  0.180000|     0.145000|spinach (raw)|
|dish_1558460205|      1.840000|  8.000000| 0.032000|  0.288000|     0.232000|spinach (raw)|
|dish_1551135590|     10.000000| 25.000000| 0.050000|  2.250000|     0.500000| bell peppers|
+---------------+--------------+----------+---------+----------+-------------+-------------+
only showing top 5 rows



In [None]:
# Order a DataFrame by descending values


+---------------+--------------+----------+---------+----------+-------------+--------------------+
|        dish_id|total_calories|total_mass|total_fat|total_carb|total_protein|         ingredients|
+---------------+--------------+----------+---------+----------+-------------+--------------------+
|dish_1566931674|    990.989014|531.000000|76.803001| 23.140001|    58.401997|chicken; mixed gr...|
|dish_1563476408|    990.400024|513.000000|47.925026| 55.908291|    79.199821|salmon; garlic; s...|
|dish_1559678104|     99.962006|135.000000| 7.348568|  7.806039|     2.654628|cucumbers; olive ...|
|dish_1558721434|     99.900002|350.000000| 0.778000| 22.713001|     3.759000|cherry tomatoes; ...|
|dish_1565981802|     99.754425|152.000000| 5.596065|  6.631088|     8.817584|carrot; salt; tof...|
+---------------+--------------+----------+---------+----------+-------------+--------------------+
only showing top 5 rows



In [None]:
# Import average function
from pyspark.sql.functions import avg


+-------------------+
|avg(total_calories)|
+-------------------+
| 223.98083459731635|
+-------------------+



In [None]:
# Using filter


+---------------+--------------+----------+---------+----------+-------------+--------------------+
|        dish_id|total_calories|total_mass|total_fat|total_carb|total_protein|         ingredients|
+---------------+--------------+----------+---------+----------+-------------+--------------------+
|dish_1562688426|    137.569992| 88.000000| 8.256000|  5.190000|    10.297000|roasted potatoes;...|
|dish_1560455030|     20.590000|103.000000| 0.148000|  4.625000|     0.956000|cherry tomatoes; ...|
|dish_1558372433|     74.360001|143.000000| 0.286000|  0.429000|    20.020000|          deprecated|
|dish_1565640549|     45.482903|139.000000| 1.568471|  7.043886|     2.641478|tomatoes; cilantr...|
|dish_1561575474|    120.058434|183.000000| 4.966118| 17.412746|     2.990431|salt; eggplant; r...|
|dish_1550795690|     68.119995|131.000000| 0.262000| 18.340000|     0.393000|               apple|
|dish_1565972591|    195.199997|122.000000|12.200000|  3.660000|    17.080000|chicken apple sau...|


In [None]:
# Filter rows by total_calories, then select only certain columns (total_mass, total_fat, total_carb, total_protein)



+----------+---------+----------+-------------+
|total_mass|total_fat|total_carb|total_protein|
+----------+---------+----------+-------------+
| 88.000000| 8.256000|  5.190000|    10.297000|
|103.000000| 0.148000|  4.625000|     0.956000|
|143.000000| 0.286000|  0.429000|    20.020000|
|139.000000| 1.568471|  7.043886|     2.641478|
|183.000000| 4.966118| 17.412746|     2.990431|
|131.000000| 0.262000| 18.340000|     0.393000|
|122.000000|12.200000|  3.660000|    17.080000|
| 78.000000| 0.234000| 10.920000|     0.546000|
| 19.000000| 0.038000|  1.900000|     0.171000|
| 79.000000| 0.079000| 10.270000|     0.395000|
|  6.000000| 2.520000|  0.084000|     2.220000|
| 95.000000| 2.571043|  3.280260|     2.804970|
| 88.000000| 0.088000| 14.960000|     1.760000|
| 46.000000| 5.357862|  2.088042|     0.792647|
|152.000000| 4.168533| 10.369692|     1.329856|
| 75.000000| 0.075000|  9.750000|     0.375000|
| 52.000000|11.686000| 11.480000|     6.152000|
|134.000000|14.740000|  2.144000|    13.

### Using Python Comparison Operators

In [None]:
# Same results as the "Using filter" cell, only this time using Python comparison operators on the DataFrame column



+---------------+--------------+----------+---------+----------+-------------+--------------------+
|        dish_id|total_calories|total_mass|total_fat|total_carb|total_protein|         ingredients|
+---------------+--------------+----------+---------+----------+-------------+--------------------+
|dish_1562688426|    137.569992| 88.000000| 8.256000|  5.190000|    10.297000|roasted potatoes;...|
|dish_1560455030|     20.590000|103.000000| 0.148000|  4.625000|     0.956000|cherry tomatoes; ...|
|dish_1558372433|     74.360001|143.000000| 0.286000|  0.429000|    20.020000|          deprecated|
|dish_1565640549|     45.482903|139.000000| 1.568471|  7.043886|     2.641478|tomatoes; cilantr...|
|dish_1561575474|    120.058434|183.000000| 4.966118| 17.412746|     2.990431|salt; eggplant; r...|
|dish_1550795690|     68.119995|131.000000| 0.262000| 18.340000|     0.393000|               apple|
|dish_1565972591|    195.199997|122.000000|12.200000|  3.660000|    17.080000|chicken apple sau...|


In [None]:
# Filter on a numeric column (for example total_calories) by combining two conditions: less than 200 and greater than 80. (The nutrition dataset has no price column.)


+---------------+--------------+----------+---------+----------+-------------+--------------------+
|        dish_id|total_calories|total_mass|total_fat|total_carb|total_protein|         ingredients|
+---------------+--------------+----------+---------+----------+-------------+--------------------+
|dish_1561662216|    300.794281|193.000000|12.387489| 28.218290|    18.633970|soy sauce; garlic...|
|dish_1562688426|    137.569992| 88.000000| 8.256000|  5.190000|    10.297000|roasted potatoes;...|
|dish_1561662054|    419.438782|292.000000|23.838249| 26.351543|    25.910593|pepper; white ric...|
|dish_1562008979|    382.936646|290.000000|22.224644| 10.173570|    35.345387|jalapenos; lemon ...|
|dish_1560455030|     20.590000|103.000000| 0.148000|  4.625000|     0.956000|cherry tomatoes; ...|
|dish_1558372433|     74.360001|143.000000| 0.286000|  0.429000|    20.020000|          deprecated|
|dish_1563379132|    232.050003|119.000000|14.280000| 14.280000|    10.591001|         chilaquiles|


In [None]:
# Filter on a specific value in a column (for example, ingredients equal to "bacon").


+---------------+--------------+----------+---------+----------+-------------+-----------+
|        dish_id|total_calories|total_mass|total_fat|total_carb|total_protein|ingredients|
+---------------+--------------+----------+---------+----------+-------------+-----------+
|dish_1563381680|     32.459999|  6.000000| 2.520000|  0.084000|     2.220000|      bacon|
|dish_1559319860|     70.330002| 13.000000| 5.460000|  0.182000|     4.810000|      bacon|
|dish_1562086702|    178.529999| 33.000000|13.860000|  0.462000|    12.210000|      bacon|
|dish_1551391710|    102.789993| 19.000000| 7.980000|  0.266000|     7.030000|      bacon|
|dish_1564073860|    492.309998| 91.000000|38.219997|  1.274000|    33.670002|      bacon|
|dish_1550776767|     81.149994| 15.000000| 6.300000|  0.210000|     5.550000|      bacon|
|dish_1558032156|    140.660004| 26.000000|10.920000|  0.364000|     9.620000|      bacon|
|dish_1551136683|     70.330002| 13.000000| 5.460000|  0.182000|     4.810000|      bacon|