In [1]:
# Activate Spark in our Colab notebook.
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
spark_version = 'spark-3.4.0'
# spark_version = 'spark-3.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease [18.1 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Get:8 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [979 kB]
Get:9 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [876 kB]
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:11 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [862 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [840 kB

In [2]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("DelayedFlightsView").getOrCreate()

In [3]:
from pyspark import SparkFiles
url = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.2/22-big-data/3/DelayedFlights.csv"
spark.sparkContext.addFile(url)
delayed_flights_df = spark.read.csv(SparkFiles.get("DelayedFlights.csv"), sep=",", header=True)
delayed_flights_df.show()

+---+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
| id|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|
+---+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|  0|2008|    1|         3|        4|   2003|      1955|   2211|      2225|       

In [4]:
# Create a temporary view.
delayed_flights_df.createOrReplaceTempView('delays')

In [5]:
# Get the first five airline carriers and their origin that will arrive at George Bush Intercontinental Airport (IAH) in Houston. 
# Hint: Pass in the airport code as a string.
spark.sql("SELECT UniqueCarrier, Origin FROM delays WHERE Dest='IAH'").show()

+-------------+------+
|UniqueCarrier|Origin|
+-------------+------+
|           XE|   ABQ|
|           XE|   MCI|
|           XE|   ABQ|
|           XE|   MCI|
|           XE|   CRW|
|           XE|   MEM|
|           XE|   RDU|
|           XE|   BHM|
|           XE|   MKE|
|           XE|   LFT|
|           XE|   BRO|
|           XE|   ICT|
|           XE|   MCI|
|           XE|   MSP|
|           XE|   BNA|
|           XE|   SDF|
|           XE|   JAX|
|           XE|   ORF|
|           XE|   BNA|
|           XE|   ICT|
+-------------+------+
only showing top 20 rows



In [6]:
# Get all the origin and destination of all the Southwest (WN) flights.
spark.sql("SELECT UniqueCarrier, Origin, Dest FROM delays WHERE UniqueCarrier=='WN'").show()

+-------------+------+----+
|UniqueCarrier|Origin|Dest|
+-------------+------+----+
|           WN|   IAD| TPA|
|           WN|   IAD| TPA|
|           WN|   IND| BWI|
|           WN|   IND| BWI|
|           WN|   IND| JAX|
|           WN|   IND| LAS|
|           WN|   IND| MCO|
|           WN|   IND| MCO|
|           WN|   IND| MDW|
|           WN|   IND| PHX|
|           WN|   IND| PHX|
|           WN|   IND| TPA|
|           WN|   ISP| BWI|
|           WN|   ISP| BWI|
|           WN|   ISP| BWI|
|           WN|   ISP| BWI|
|           WN|   ISP| BWI|
|           WN|   ISP| FLL|
|           WN|   ISP| FLL|
|           WN|   ISP| MCO|
+-------------+------+----+
only showing top 20 rows



In [7]:
# Get the carrier, the origin, the destination, and the elapsed time of the 10 flights in descending order.
spark.sql("SELECT UniqueCarrier, Origin, Dest, CRSElapsedTime FROM delays ORDER BY CRSElapsedTime DESC LIMIT 10").show()

+-------------+------+----+--------------+
|UniqueCarrier|Origin|Dest|CRSElapsedTime|
+-------------+------+----+--------------+
|           AS|   OAK| PDX|            99|
|           XE|   XNA| IAH|            99|
|           AS|   SFO| PDX|            99|
|           XE|   CLE| BNA|            99|
|           AS|   SFO| PDX|            99|
|           XE|   IAH| MGM|            99|
|           AS|   SFO| PDX|            99|
|           XE|   CLE| MHT|            99|
|           AS|   SFO| PDX|            99|
|           XE|   CLE| MHT|            99|
+-------------+------+----+--------------+



In [8]:
# Get the total number of diverted flights from each airline carrier, and group the results by carrier 
# Order by the total number of diverted flights in descending order.
spark.sql("SELECT UniqueCarrier, SUM(Diverted) AS Total_Diverted FROM delays GROUP BY UniqueCarrier ORDER BY Total_Diverted DESC").show()

+-------------+--------------+
|UniqueCarrier|Total_Diverted|
+-------------+--------------+
|           WN|         739.0|
|           AA|         469.0|
|           MQ|         370.0|
|           XE|         293.0|
|           OO|         292.0|
|           UA|         244.0|
|           DL|         196.0|
|           US|         189.0|
|           FL|         179.0|
|           9E|         155.0|
|           NW|         145.0|
|           YV|         130.0|
|           CO|         130.0|
|           AS|         125.0|
|           OH|         104.0|
|           B6|          97.0|
|           F9|          29.0|
|           AQ|           6.0|
|           HA|           4.0|
|           EV|           0.0|
+-------------+--------------+



In [9]:
# Get the average time for delayed departures and arrivals for each carrier.
# Group the results by the carrier.
query = """
SELECT UniqueCarrier, AVG(ArrDelay) AS AvgArrDelay, AVG(DepDelay) AS AvgDepDelay
FROM delays
GROUP BY UniqueCarrier
"""
spark.sql(query).show()

+-------------+------------------+------------------+
|UniqueCarrier|       AvgArrDelay|       AvgDepDelay|
+-------------+------------------+------------------+
|           UA| 49.28722883905207| 50.31698812513716|
|           AA| 47.79455631216452|   46.840312257564|
|           NW| 45.91269035532995| 41.70838669696344|
|           EV| 43.73542611378617| 45.39245477069796|
|           B6| 49.45584295814852| 50.69227741822634|
|           DL| 38.02667789369917|37.734883816069384|
|           OO| 46.39657709707309| 45.18429695982628|
|           F9|26.108155473493145|  25.6334499562664|
|           YV| 58.51809551208285| 58.29157351676698|
|           US| 35.11325534609939| 38.55870287528075|
|           AQ| 21.25537634408602|             26.68|
|           MQ| 47.34964387897973|  44.9969698806133|
|           OH| 48.05260258881851| 46.26107985729967|
|           HA|33.811415349016585| 34.00154023873701|
|           XE| 49.51955145712174| 49.27562001311182|
|           AS|36.1141952763