### Create SparkSession object

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import functions as F
from time import time

spark = SparkSession \
    .builder \
    .appName("Data Source API using PySpark Demo") \
    .getOrCreate()

In [18]:
flights = spark.read.csv("flights.csv", sep="\t", header=True, inferSchema=True)
flights.limit(5).toPandas()
flights.columns

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517,515,2,830,819,11,UA,1545,N14228,EWR,IAH,227,1400,5,15,2013-01-01 05:00:00
1,2013,1,1,533,529,4,850,830,20,UA,1714,N24211,LGA,IAH,227,1416,5,29,2013-01-01 05:00:00
2,2013,1,1,542,540,2,923,850,33,AA,1141,N619AA,JFK,MIA,160,1089,5,40,2013-01-01 05:00:00
3,2013,1,1,544,545,-1,1004,1022,-18,B6,725,N804JB,JFK,BQN,183,1576,5,45,2013-01-01 05:00:00
4,2013,1,1,554,600,-6,812,837,-25,DL,461,N668DN,LGA,ATL,116,762,6,0,2013-01-01 06:00:00


['year',
 'month',
 'day',
 'dep_time',
 'sched_dep_time',
 'dep_delay',
 'arr_time',
 'sched_arr_time',
 'arr_delay',
 'carrier',
 'flight',
 'tailnum',
 'origin',
 'dest',
 'air_time',
 'distance',
 'hour',
 'minute',
 'time_hour']

Register `flights` df as SQL table

In [4]:
flights.createOrReplaceTempView("flights_table")

In [7]:
query = """
    SELECT 
        *
    FROM
        flights_table
"""
spark.sql(query).show(5)

+----+-----+---+--------+--------------+---------+--------+--------------+---------+-------+------+-------+------+----+--------+--------+----+------+-------------------+
|year|month|day|dep_time|sched_dep_time|dep_delay|arr_time|sched_arr_time|arr_delay|carrier|flight|tailnum|origin|dest|air_time|distance|hour|minute|          time_hour|
+----+-----+---+--------+--------------+---------+--------+--------------+---------+-------+------+-------+------+----+--------+--------+----+------+-------------------+
|2013|    1|  1|     517|           515|        2|     830|           819|       11|     UA|  1545| N14228|   EWR| IAH|     227|    1400|   5|    15|2013-01-01 05:00:00|
|2013|    1|  1|     533|           529|        4|     850|           830|       20|     UA|  1714| N24211|   LGA| IAH|     227|    1416|   5|    29|2013-01-01 05:00:00|
|2013|    1|  1|     542|           540|        2|     923|           850|       33|     AA|  1141| N619AA|   JFK| MIA|     160|    1089|   5|    40|2

In [9]:
query = """
    SELECT 
        carrier, 
        COUNT(*) AS count
    FROM 
        flights_table
    GROUP BY carrier
"""
spark.sql(query).show(5)

+-------+-----+
|carrier|count|
+-------+-----+
|     UA|58665|
|     AA|32729|
|     EV|54173|
|     B6|54635|
|     DL|48110|
+-------+-----+
only showing top 5 rows



In [21]:
query = """
    SELECT
        carrier,
        SUM(arr_delay) as Arrival_delay
    FROM 
        flights_table
    GROUP BY carrier
    ORDER BY Arrival_delay DESC
"""
spark.sql(query).show(5)

+-------+-------------+
|carrier|Arrival_delay|
+-------+-------------+
|     EV|     807324.0|
|     B6|     511194.0|
|     MQ|     269767.0|
|     UA|     205589.0|
|     9E|     127624.0|
+-------+-------------+
only showing top 5 rows



In [23]:
query = """
    SELECT
        carrier,
        MAX(arr_delay) as Max_Arrival_delay
    FROM 
        flights_table
    GROUP BY carrier
"""
spark.sql(query).show(5)

+-------+-----------------+
|carrier|Max_Arrival_delay|
+-------+-----------------+
|     UA|               NA|
|     AA|               NA|
|     EV|               NA|
|     B6|               NA|
|     DL|               NA|
+-------+-----------------+
only showing top 5 rows

