In [None]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.4.0'
spark_version = 'spark-3.4.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

In [None]:
# Import packages
import time
from pyspark.sql import SparkSession

# Build the SparkSession used by every later cell (or reuse one that already exists)
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .config("spark.sql.debug.maxToStringFields", 2000)
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/11/16 14:01:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Read in data from S3 Bucket
from pyspark import SparkFiles

url = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.2/22-big-data/3/DelayedFlights.csv"

# Register the remote file with Spark, then read it via the path SparkFiles reports for it
spark.sparkContext.addFile(url)
df = spark.read.csv(
    SparkFiles.get("DelayedFlights.csv"),
    header=True,
    sep=",",
)
df.show()

                                                                                

+---+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
| id|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|
+---+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|  0|2008|    1|         3|        4|   2003|      1955|   2211|      2225|       

In [None]:
# Create a temporary view


In [None]:
# Run a sql query that groups the data on UniqueCarrier
# Note: the timer tracks the time it takes to both load and run the data.
# We are only interested in the time it takes to run, so run this cell twice.
# perf_counter() is a monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()



print("--- %s seconds ---" % (time.perf_counter() - start_time))



+-------------+-------------------+--------+
|UniqueCarrier|sum(CRSElapsedTime)|count(1)|
+-------------+-------------------+--------+
|           UA|        1.3998834E7|   82022|
|           NW|          6761017.0|   48410|
|           EV|          4284049.0|   42782|
|           DL|          8245701.0|   48888|
|           OO|          6883377.0|   73680|
|           F9|          2338358.0|   16006|
|           YV|          3216400.0|   34890|
|           US|          8759953.0|   53873|
|           MQ|          7710479.0|   82505|
|           OH|          3318613.0|   29152|
|           HA|           345580.0|    2597|
|           XE|          7386620.0|   62539|
|           FL|          4807695.0|   37201|
|           WN|        2.4182455E7|  214624|
|           AA|        1.7721836E7|  103120|
|           B6|          4169064.0|   22868|
|           AQ|            99698.0|     750|
|           AS|          2527656.0|   16553|
|           CO|          8693653.0|   44282|
|         



                                                                                

In [None]:
# Write out the data in parquet format


                                                                                

In [None]:
# Read in our new parquet formatted data


In [None]:
# A parquet formatted DataFrame has all the same methods as a row-based dataframe
# We can convert the dataframe to a view.


In [None]:
# Time the query placed between the two timer lines.
# perf_counter() is a monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()



print("--- %s seconds ---" % (time.perf_counter() - start_time))

+-------------+--------+
|UniqueCarrier|count(1)|
+-------------+--------+
|           UA|   82022|
|           AA|  103120|
|           NW|   48410|
|           EV|   42782|
|           B6|   22868|
|           DL|   48888|
|           OO|   73680|
|           F9|   16006|
|           YV|   34890|
|           US|   53873|
|           OH|   29152|
|           XE|   62539|
|           AS|   16553|
|           CO|   44282|
|           WN|  214624|
|           9E|   31833|
|           MQ|   82505|
|           HA|    2597|
|           FL|   37201|
|           AQ|     750|
+-------------+--------+

--- 0.2640209197998047 seconds ---


In [None]:
# Here is another sample
# perf_counter() is a monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()


print("--- %s seconds ---" % (time.perf_counter() - start_time))

+-------------+-------------------+--------+
|UniqueCarrier|sum(CRSElapsedTime)|count(1)|
+-------------+-------------------+--------+
|           UA|        1.3998834E7|   82022|
|           AA|        1.7721836E7|  103120|
|           NW|          6761017.0|   48410|
|           EV|          4284049.0|   42782|
|           B6|          4169064.0|   22868|
|           DL|          8245701.0|   48888|
|           OO|          6883377.0|   73680|
|           F9|          2338358.0|   16006|
|           YV|          3216400.0|   34890|
|           US|          8759953.0|   53873|
|           OH|          3318613.0|   29152|
|           XE|          7386620.0|   62539|
|           AS|          2527656.0|   16553|
|           CO|          8693653.0|   44282|
|           WN|        2.4182455E7|  214624|
|           9E|          3255692.0|   31833|
|           MQ|          7710479.0|   82505|
|           HA|           345580.0|    2597|
|           FL|          4807695.0|   37201|
|         

In [None]:
# Partition our data by UniqueCarrier


                                                                                

In [None]:
# Read in our new parquet formatted data


In [None]:
# Convert the DataFrame to a view.


In [None]:
# Query the partitioned data on the Partition key.
# perf_counter() is a monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()



print("--- %s seconds ---" % (time.perf_counter() - start_time))

+-------------+--------+
|UniqueCarrier|count(1)|
+-------------+--------+
|           UA|   82022|
|           AA|  103120|
|           OO|   73680|
|           WN|  214624|
|           MQ|   82505|
|           XE|   62539|
|           NW|   48410|
|           DL|   48888|
|           US|   53873|
|           CO|   44282|
|           EV|   42782|
|           YV|   34890|
|           FL|   37201|
|           9E|   31833|
|           B6|   22868|
|           OH|   29152|
|           AS|   16553|
|           F9|   16006|
|           AQ|     750|
|           HA|    2597|
+-------------+--------+

--- 0.3317570686340332 seconds ---


In [None]:
# Grouping by partition key and aggregating data.
# perf_counter() is a monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()



print("--- %s seconds ---" % (time.perf_counter() - start_time))

+-------------+-------------------+
|UniqueCarrier|sum(CRSElapsedTime)|
+-------------+-------------------+
|           UA|        1.3998834E7|
|           AA|        1.7721836E7|
|           OO|          6883377.0|
|           WN|        2.4182455E7|
|           MQ|          7710479.0|
|           XE|          7386620.0|
|           NW|          6761017.0|
|           DL|          8245701.0|
|           US|          8759953.0|
|           CO|          8693653.0|
|           EV|          4284049.0|
|           YV|          3216400.0|
|           FL|          4807695.0|
|           9E|          3255692.0|
|           B6|          4169064.0|
|           OH|          3318613.0|
|           AS|          2527656.0|
|           F9|          2338358.0|
|           AQ|            99698.0|
|           HA|           345580.0|
+-------------+-------------------+

--- 0.46139001846313477 seconds ---


In [None]:
# Another query filtering on the key.
# perf_counter() is a monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()


print("--- %s seconds ---" % (time.perf_counter() - start_time))

+-------------+-------------+
|UniqueCarrier|total_delayed|
+-------------+-------------+
|           US|    2077273.0|
+-------------+-------------+

--- 0.20259594917297363 seconds ---


In [None]:
# Same query as above against the parquet (non-partitioned) data.
# perf_counter() is a monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()



print("--- %s seconds ---" % (time.perf_counter() - start_time))

+-------------+-------------+
|UniqueCarrier|total_delayed|
+-------------+-------------+
|           US|    2077273.0|
+-------------+-------------+

--- 0.3114919662475586 seconds ---


In [None]:
# Here is a query that doesn't use the partition key at all (against the parquet data)
# perf_counter() is a monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()


print("--- %s seconds ---" % (time.perf_counter() - start_time))

+-------------+-------+
|UniqueCarrier|TailNum|
+-------------+-------+
|           WN| N712SW|
+-------------+-------+

--- 0.24602293968200684 seconds ---


In [None]:
# Here is a query that doesn't use the partition key at all (against the partitioned parquet data)
# perf_counter() is a monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()


print("--- %s seconds ---" % (time.perf_counter() - start_time))

+-------------+-------+
|UniqueCarrier|TailNum|
+-------------+-------+
|           WN| N712SW|
+-------------+-------+

--- 0.3094189167022705 seconds ---
