In [1]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.4.0'
spark_version = 'spark-3.4.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Connecting to security.0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Connecting to security.                                                                               Get:2 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
                                                                               Get:3 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease [18.1 kB]
0% [Waiting for headers] [2 InRelease 14.2 kB/114 kB 12%] [Connected to develop                                                                               Hit:4 http://archive.ubuntu.com/ubuntu focal InRelease
Get:5 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Hit:6 http://ppa.launchpad.net/cran/libgit2/ubuntu focal InRelease
Get:7 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu

In [2]:
# Import packages
from pyspark.sql import SparkSession
import time

# Create a SparkSession
spark = SparkSession.builder\
    .appName("SparkSQL")\
    .config("spark.sql.debug.maxToStringFields", 2000)\
    .config("spark.driver.memory", "2g")\
    .getOrCreate()

In [3]:
# Read in data from S3 Bucket
from pyspark import SparkFiles
url = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.2/22-big-data/3/DelayedFlights.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("DelayedFlights.csv"), sep=",", header=True)
df.show()

+---+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
| id|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|
+---+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|  0|2008|    1|         3|        4|   2003|      1955|   2211|      2225|       

In [4]:
# Create a temporary view
df.createOrReplaceTempView('delays')

In [5]:
# Start the runtime
start_time = time.time()

# Using spark.sql write a query that gives you the total distance 
# and the count of every unique Origin, Dest combination.
spark.sql("""select Origin, Dest ,sum(Distance), count(*) from delays group by 1,2""").show()

# Print out the runtime.
print("--- %s seconds ---" % (time.time() - start_time))

+------+----+-------------+--------+
|Origin|Dest|sum(Distance)|count(1)|
+------+----+-------------+--------+
|   LAS| LIT|      72520.0|      56|
|   PHL| MCO|     986706.0|    1146|
|   SMF| BUR|     178284.0|     498|
|   SNA| PHX|     218010.0|     645|
|   MCI| IAH|     156249.0|     243|
|   BFL| SAN|       4515.0|      21|
|   ROC| CLE|      39935.0|     163|
|   SPI| ORD|      34104.0|     196|
|   ATL| GSP|      54621.0|     357|
|   SFO| PMD|      26860.0|      85|
|   LAX| OXR|       6958.0|     142|
|   ORD| PDX|     909497.0|     523|
|   PBI| DCA|      99412.0|     116|
|   FSD| ATL|      56286.0|      59|
|   MLI| MCO|      25900.0|      25|
|   MSP| AVL|      24969.0|      29|
|   BQN| MCO|      48547.0|      43|
|   EWR| STT|      73530.0|      45|
|   CLE| SJU|       1839.0|       1|
|   MCI| MKE|        436.0|       1|
+------+----+-------------+--------+
only showing top 20 rows

--- 6.0708253383636475 seconds ---


In [6]:
# Write out the data in parquet format
df.write.parquet('parquet_delay_basic', mode='overwrite')

In [7]:
# Read in our new parquet formatted data
p_df=spark.read.parquet('parquet_delay_basic')

In [8]:
# Convert the DataFrame to a view.
p_df.createOrReplaceTempView('p_delays')

In [9]:
# Start the runtime
start_time = time.time()

# Run the same query here
spark.sql("""select Origin, Dest ,sum(Distance), count(*) from p_delays group by 1,2""").show()

# Print out the runtime
print("--- %s seconds ---" % (time.time() - start_time))

+------+----+-------------+--------+
|Origin|Dest|sum(Distance)|count(1)|
+------+----+-------------+--------+
|   LAS| LIT|      72520.0|      56|
|   PHL| MCO|     986706.0|    1146|
|   SMF| BUR|     178284.0|     498|
|   SNA| PHX|     218010.0|     645|
|   MCI| IAH|     156249.0|     243|
|   BFL| SAN|       4515.0|      21|
|   ROC| CLE|      39935.0|     163|
|   SPI| ORD|      34104.0|     196|
|   ATL| GSP|      54621.0|     357|
|   SFO| PMD|      26860.0|      85|
|   LAX| OXR|       6958.0|     142|
|   ORD| PDX|     909497.0|     523|
|   PBI| DCA|      99412.0|     116|
|   FSD| ATL|      56286.0|      59|
|   MLI| MCO|      25900.0|      25|
|   MSP| AVL|      24969.0|      29|
|   BQN| MCO|      48547.0|      43|
|   EWR| STT|      73530.0|      45|
|   CLE| SJU|       1839.0|       1|
|   MCI| MKE|        436.0|       1|
+------+----+-------------+--------+
only showing top 20 rows

--- 2.5319392681121826 seconds ---


In [10]:
# Write out your parquet data, partitioning on the Origin column
df.write.partitionBy("Origin").mode("overwrite").parquet("delayed_partitioned")

In [11]:
# Read in our new parquet formatted data
p_df_p=spark.read.parquet('delayed_partitioned')

In [12]:
# Convert the dataframe to a view.
p_df_p.createOrReplaceTempView('p_delays_p')

In [13]:
# Start the runtime
start_time = time.time()

# Run your query against your partitioned data one more time.
spark.sql("""select Origin, Dest ,sum(Distance), count(*) from p_delays_p group by 1,2""").show()

# Print out the runtime
print("--- %s seconds ---" % (time.time() - start_time))

+------+----+-------------+--------+
|Origin|Dest|sum(Distance)|count(1)|
+------+----+-------------+--------+
|   ORD| PDX|     909497.0|     523|
|   ATL| GSP|      54621.0|     357|
|   LAX| OXR|       6958.0|     142|
|   LAS| LIT|      72520.0|      56|
|   EWR| STT|      73530.0|      45|
|   SFO| PMD|      26860.0|      85|
|   MSP| AVL|      24969.0|      29|
|   SFO| TUS|      51068.0|      68|
|   CLT| ATW|        742.0|       1|
|   ORD| FWA|      89961.0|     573|
|   ATL| HDN|      30820.0|      23|
|   DFW| PNS|     228916.0|     379|
|   DFW| SDF|     142202.0|     194|
|   LAX| PIT|      49128.0|      23|
|   DTW| MKE|      62118.0|     261|
|   MCO| PVD|     386280.0|     360|
|   MCO| SFO|     110025.0|      45|
|   MSP| BOI|     113058.0|      99|
|   ORD| BUF|     332992.0|     704|
|   ORD| CAE|     225108.0|     338|
+------+----+-------------+--------+
only showing top 20 rows

--- 5.246726751327515 seconds ---


In [14]:
# Start  the runtime
start_time = time.time()

# Filter the data on something that selects your partition choice.
spark.sql("""Select distinct Origin, TailNum from p_delays where TailNum='N712SW' """).show()

# Print out the runtime.
print("--- %s seconds ---" % (time.time() - start_time))

+------+-------+
|Origin|TailNum|
+------+-------+
|   MSY| N712SW|
|   RNO| N712SW|
|   RDU| N712SW|
|   MDW| N712SW|
|   IAD| N712SW|
|   ABQ| N712SW|
|   STL| N712SW|
|   MHT| N712SW|
|   BUR| N712SW|
|   PIT| N712SW|
|   MCO| N712SW|
|   JAX| N712SW|
|   HRL| N712SW|
|   TPA| N712SW|
|   BNA| N712SW|
|   SNA| N712SW|
|   SMF| N712SW|
|   BUF| N712SW|
|   ELP| N712SW|
|   BDL| N712SW|
+------+-------+
only showing top 20 rows

--- 0.8149929046630859 seconds ---


In [15]:
# Start  the runtime
start_time = time.time()

# Filter the data on something that has nothing to do with your partition choice.
spark.sql("""Select distinct Dest, TailNum from p_delays_p where TailNum='N712SW' """).show()

# Print out the runtime.
print("--- %s seconds ---" % (time.time() - start_time))

+----+-------+
|Dest|TailNum|
+----+-------+
| MSY| N712SW|
| RNO| N712SW|
| RDU| N712SW|
| MDW| N712SW|
| IAD| N712SW|
| ABQ| N712SW|
| STL| N712SW|
| BUR| N712SW|
| PIT| N712SW|
| MCO| N712SW|
| TPA| N712SW|
| BNA| N712SW|
| IND| N712SW|
| SNA| N712SW|
| SMF| N712SW|
| ELP| N712SW|
| BUF| N712SW|
| OMA| N712SW|
| BHM| N712SW|
| SFO| N712SW|
+----+-------+
only showing top 20 rows

--- 3.4078776836395264 seconds ---
