In [1]:
%load_ext nb_mypy
%nb_mypy On

Version 1.0.2


In [2]:
from typing import Union, List, Tuple, Optional, TypeVar, Any

In [3]:
from pyspark.sql import SparkSession, DataFrame, Window
import pyspark.sql.functions as F
import pandas as pd


**Code for adding 2 columns (IsArrDelayed, IsDepDelayed) to create new dataset for practice**


ArrDelay arrival delay, in minutes: A flight is counted as "on time" if it operated less than 15 minutes later the 

scheduled time shown in the carriers' Computerized Reservations Systems (CRS).



In [None]:

# airtraffic. \
# select('ArrTime', 'CRSArrTime','ArrDelay', 'DepTime', 'CRSDepTime', 'DepDelay').limit(10).toPandas()

# from pyspark.sql.types import IntegerType

# airtraffic = airtraffic.withColumn('ArrDelay', col('ArrDelay').cast(IntegerType())) \
#                         .withColumn('DepDelay', col('DepDelay').cast(IntegerType()))


# airtraffic = airtraffic.withColumn("IsArrDelayed", when(col("ArrDelay") >= 15, "YES").otherwise("NO")) \
#                         .withColumn("IsDepDelayed", when(col("DepDelay") >= 15, "YES").otherwise("NO"))


# airtraffic.write.format('csv'). \
# option('header',True).mode('overwrite'). \
# option('sep',','). \
# save('/home/nghiaht7/data-engineer/data-engineering-essentials/data/airflights_delay')

In [8]:
import getpass
username = getpass.getuser()

username

spark = SparkSession. \
    builder. \
    config('spark.ui.port', '4000'). \
    config("spark.sql.warehouse.dir", "hdfs://0.0.0.0:9000/user/hive/warehouse/"). \
    config("spark.sql.debug.maxToStringFields", 1000). \
    config("spark.dynamicAllocation.enabled", "false"). \
    enableHiveSupport(). \
    appName(f'{username} | Python - Data Processing - Overview'). \
    master('yarn'). \
    getOrCreate()

In [5]:
# example for getting files from hdfs

# !hdfs dfs -put /home/nghiaht7/data-engineer/data-engineering-essentials/data/flights.csv /user/nghiaht7

# !hdfs dfs -ls -R /user/nghiaht7 | grep csv

# airtraffic_path = "hdfs://0.0.0.0:9000/user/nghiaht7/flights.csv"

In [6]:
airtraffic_path = '/home/nghiaht7/data-engineer/data-engineering-essentials/data/airflights_delay'

In [9]:
airtraffic = spark.read.format("csv"). \
            option("header", "true"). \
            option("inferSchema", "true"). \
            load(airtraffic_path)

                                                                                

In [10]:
airtraffic.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: integer (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- 

In [13]:
airtraffic.count()

                                                                                

99999

In [15]:
airtraffic.limit(5).toPandas()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed,IsDepDelayed
0,2008,1,3,4,2003,1955,2211,2225,WN,335,...,0,,0,,,,,,NO,NO
1,2008,1,3,4,754,735,1002,1000,WN,3231,...,0,,0,,,,,,NO,YES
2,2008,1,3,4,628,620,804,750,WN,448,...,0,,0,,,,,,NO,NO
3,2008,1,3,4,926,930,1054,1100,WN,1746,...,0,,0,,,,,,NO,NO
4,2008,1,3,4,1829,1755,1959,1925,WN,3920,...,0,,0,2.0,0.0,0.0,0.0,32.0,YES,YES


In [12]:
# orderBy is extensive action because need shuffle to get all information about ranking

airtraffic.select('Year', 'Month', 'DayOfMonth').distinct().orderBy(F.col('DayOfMonth').desc()).show(31)



+----+-----+----------+
|Year|Month|DayOfMonth|
+----+-----+----------+
|2008|    1|        31|
|2008|    1|        30|
|2008|    1|        29|
|2008|    1|        28|
|2008|    1|        27|
|2008|    1|        26|
|2008|    1|        25|
|2008|    1|        24|
|2008|    1|        23|
|2008|    1|        22|
|2008|    1|        21|
|2008|    1|        20|
|2008|    1|        19|
|2008|    1|        18|
|2008|    1|        17|
|2008|    1|        16|
|2008|    1|        15|
|2008|    1|        14|
|2008|    1|        13|
|2008|    1|        12|
|2008|    1|        11|
|2008|    1|        10|
|2008|    1|         9|
|2008|    1|         8|
|2008|    1|         7|
|2008|    1|         6|
|2008|    1|         5|
|2008|    1|         4|
|2008|    1|         3|
|2008|    1|         2|
|2008|    1|         1|
+----+-----+----------+



                                                                                

In [16]:
airtraffic.select('Cancelled').distinct().show()

                                                                                

+---------+
|Cancelled|
+---------+
|        1|
|        0|
+---------+



In [17]:
# airtraffic.filter(airtraffic.Cancelled == 1).count()
# airtraffic.filter(airtraffic['Cancelled'] == 1).count()
# airtraffic.filter('Cancelled = 1').count()

#self-try that works
airtraffic.filter(F.col('Cancelled') == 1).count()





1142

In [15]:
airtraffic.select(
    F.concat(
        F.col("Year"),
        F.lit("-"),
        F.lpad(F.col("Month"), 2, "0"),
        F.lit("-"),
        F.lpad(F.col("DayOfMonth"), 2, "0")
    ).alias('FlightDate')
).show()

+----------+
|FlightDate|
+----------+
|2008-01-03|
|2008-01-03|
|2008-01-03|
|2008-01-03|
|2008-01-03|
|2008-01-03|
|2008-01-03|
|2008-01-03|
|2008-01-03|
|2008-01-03|
|2008-01-03|
|2008-01-03|
|2008-01-03|
|2008-01-03|
|2008-01-03|
|2008-01-03|
|2008-01-03|
|2008-01-03|
|2008-01-03|
|2008-01-03|
+----------+
only showing top 20 rows



In [18]:
from pyspark.sql.functions import col, concat, lpad, \
    date_format, to_date


airtraffic. \
    withColumn('FlightDate',
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0")
        )
    ). \
    withColumn("FlightDate-right-format", to_date('FlightDate', 'yyyyMMdd')). \
    withColumn('Flight-DOW', date_format(col("FlightDate-right-format"), 'EEEE')). \
    select("FlightDate-right-format", 'Flight-DOW'). \
    show()

+-----------------------+----------+
|FlightDate-right-format|Flight-DOW|
+-----------------------+----------+
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
|             2008-01-03|  Thursday|
+-----------------------+----------+
only showing top 20 rows



In [19]:
#  .filter("Origin IN ('ORD', 'DFW', 'ATL', 'LAX', 'SFO')")    --> SQL-string style
#  .filter(col().isin())   ---> DataFrame style


airtraffic. \
    withColumn('FlightDate',
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0")
        )
    ). \
    withColumn("FlightDate-right-format", to_date('FlightDate', 'yyyyMMdd')). \
    withColumn('Flight-DOW', date_format(col("FlightDate-right-format"), 'EEEE')). \
    filter(col('Flight-DOW').isin("Sunday", "Staturday")). \
    count()

                                                                                

13196

In [23]:
# Get count of flights which are departed late at origin and reach destination early or on time.

# & | for dataframe and AND/ OR for SQL syntax

# count(F.lit(1)) better count(*) is a myth

from pyspark.sql.functions import col

# |-- IsArrDelayed: string (nullable = true)
# |-- IsDepDelayed: string (nullable = true)

airtraffic.filter((col("IsArrDelayed") == "NO") & (col("IsDepDelayed") == "YES")).count()


                                                                                

4133

In [24]:
# Get number of flights which are departed late on Saturdays as well as on Sundays.

airtraffic. \
    withColumn('FlightDate',
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0")
        )
    ). \
    withColumn("FlightDate-right-format", to_date('FlightDate', 'yyyyMMdd')). \
    withColumn('Flight-DOW', date_format(col("FlightDate-right-format"), 'EEEE')). \
    filter(col('Flight-DOW').isin("Sunday", "Staturday")). \
    filter(col("IsDepDelayed") == "YES"). \
    count()

                                                                                

3634

In [29]:
#UniqueCarrier

airtraffic.filter(col("UniqueCarrier").like("W%")).select("UniqueCarrier").distinct().show(10)



+-------------+
|UniqueCarrier|
+-------------+
|           WN|
+-------------+



                                                                                

In [33]:
# trip that arrive sooner between 5 and 10 minutes

airtraffic.filter(col("ArrDelay").between(5, 10)).count()

8863

In [37]:
# CancellationCode
# filter("bonus IS NULL").
airtraffic.filter(col("CancellationCode").isNotNull()).count()

1142

In [38]:

airtraffic. \
    select('Year', 'Month', 'DayOfMonth'). \
    describe(). \
    show()

                                                                                

+-------+--------------------+-----+------------------+
|summary|                Year|Month|        DayOfMonth|
+-------+--------------------+-----+------------------+
|  count|               99999|99999|             99999|
|   mean|              2008.0|  1.0|17.087740877408773|
| stddev|1.024708919288781...|  0.0| 8.356320851749247|
|    min|                2008|    1|                 1|
|    max|                2008|    1|                31|
+-------+--------------------+-----+------------------+



In [39]:
airtraffic. \
    select('Year', 'Month', 'DayOfMonth'). \
    summary(). \
    show()



+-------+------+-----+------------------+
|summary|  Year|Month|        DayOfMonth|
+-------+------+-----+------------------+
|  count| 99999|99999|             99999|
|   mean|2008.0|  1.0|17.087740877408773|
| stddev|   0.0|  0.0| 8.356320851749246|
|    min|  2008|    1|                 1|
|    25%|  2008|    1|                10|
|    50%|  2008|    1|                17|
|    75%|  2008|    1|                24|
|    max|  2008|    1|                31|
+-------+------+-----+------------------+



                                                                                

In [42]:
# sum ArrDelay

airtraffic.select(F.sum("ArrDelay").alias("Total ArrDelay")).show()

+--------------+
|Total ArrDelay|
+--------------+
|        565524|
+--------------+





In [67]:
## average ArrDelay
## count(*) vs count(F.lit(1))  --> same as count(1) vs count(*) in Hive: count(1) for only 1 col  vs count(*)


airtraffic.select((F.sum("ArrDelay") / F.count(F.lit(1))).alias("Average ArrDelay")).show()
print(565524/99999)

+-----------------+
| Average ArrDelay|
+-----------------+
|5.655296552965529|
+-----------------+

5.655296552965529


# **groupby**

In [53]:
airtraffic. \
    withColumn('FlightDate',
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0")
        )
    ). \
    withColumn("FlightDate-right-format", to_date('FlightDate', 'yyyyMMdd')). \
    withColumn('Flight-DOW', date_format(col("FlightDate-right-format"), 'EEEE')). \
    groupby("Flight-DOW"). \
    count().show()

                                                                                

+----------+-----+
|Flight-DOW|count|
+----------+-----+
| Wednesday|14694|
|   Tuesday|14709|
|    Friday|14517|
|  Thursday|17079|
|  Saturday|11285|
|    Monday|14519|
|    Sunday|13196|
+----------+-----+



In [55]:
airtraffic. \
    withColumn('FlightDate',
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0")
        )
    ). \
    withColumn("FlightDate-right-format", to_date('FlightDate', 'yyyyMMdd')). \
    withColumn('Flight-DOW', date_format(col("FlightDate-right-format"), 'EEEE')). \
    groupby("Flight-DOW"). \
    agg(F.count(F.lit(1)).alias("FlightCount")).show()



+----------+-----------+
|Flight-DOW|FlightCount|
+----------+-----------+
| Wednesday|      14694|
|   Tuesday|      14709|
|    Friday|      14517|
|  Thursday|      17079|
|  Saturday|      11285|
|    Monday|      14519|
|    Sunday|      13196|
+----------+-----------+



                                                                                

In [57]:
airtraffic. \
    withColumn('FlightDate',
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0")
        )
    ). \
    withColumn("FlightDate-right-format", to_date('FlightDate', 'yyyyMMdd')). \
    withColumn('Flight-DOW', date_format(col("FlightDate-right-format"), 'EEEE')). \
    groupby("Flight-DOW").  \
    agg(
    F.count(F.lit(1)).alias("FlightCount"),
    F.sum("ArrDelay").alias("Total ArrDelay"),
    F.round(F.avg("ArrDelay"), 2).alias("Avg ArrDelay")
).show()

                                                                                

+----------+-----------+--------------+------------+
|Flight-DOW|FlightCount|Total ArrDelay|Avg ArrDelay|
+----------+-----------+--------------+------------+
| Wednesday|      14694|        -23881|       -1.65|
|   Tuesday|      14709|          6450|        0.45|
|    Friday|      14517|        110279|        7.68|
|  Thursday|      17079|        224569|       13.39|
|  Saturday|      11285|         23118|        2.07|
|    Monday|      14519|         65560|        4.57|
|    Sunday|      13196|        159429|       12.19|
+----------+-----------+--------------+------------+





#  advanced aggregations using rollup

In [61]:
airtraffic. \
    withColumn('FlightDate',
        concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0")
        )
    ). \
    withColumn("FlightDate-right-format", to_date('FlightDate', 'yyyyMMdd')). \
    withColumn('Flight-DOW', date_format(col("FlightDate-right-format"), 'EEEE')). \
    cube("Flight-DOW", "IsDepDelayed").  \
    agg(
    F.count(F.lit(1)).alias("FlightCount"),
  ).orderBy("Flight-DOW", "IsDepDelayed").show()



+----------+------------+-----------+
|Flight-DOW|IsDepDelayed|FlightCount|
+----------+------------+-----------+
|      null|        null|      99999|
|      null|          NO|      79531|
|      null|         YES|      20468|
|    Friday|        null|      14517|
|    Friday|          NO|      10841|
|    Friday|         YES|       3676|
|    Monday|        null|      14519|
|    Monday|          NO|      11754|
|    Monday|         YES|       2765|
|  Saturday|        null|      11285|
|  Saturday|          NO|       9315|
|  Saturday|         YES|       1970|
|    Sunday|        null|      13196|
|    Sunday|          NO|       9562|
|    Sunday|         YES|       3634|
|  Thursday|        null|      17079|
|  Thursday|          NO|      12079|
|  Thursday|         YES|       5000|
|   Tuesday|        null|      14709|
|   Tuesday|          NO|      12805|
+----------+------------+-----------+
only showing top 20 rows





* Get total number of flights as well as number of flights which are delayed in departure and number of flights delayed in arrival. 
  * Output should contain 3 columns - **FlightCount**, **DepDelayedCount**, **ArrDelayedCount**

In [71]:
# need to count condition --> when with True for DataFrame vs CASE WHEN syntax in SQL style

from pyspark.sql.functions import col, count, when

airtraffic.filter(col("Cancelled") == 0). \
    select(count("*").alias("FlightCount"),
                  count(when(col("IsDepDelayed") == "YES", True)).alias(" DepDelayedCount"),
                  count(when(col("IsArrDelayed") == "YES", True)).alias("ArrDelayedCount")

).show()

+-----------+----------------+---------------+
|FlightCount| DepDelayedCount|ArrDelayedCount|
+-----------+----------------+---------------+
|      98857|           20468|          19629|
+-----------+----------------+---------------+



In [73]:
airtraffic. \
    filter('Cancelled = 0'). \
    agg(count(lit(1)).alias("FlightCount"),
        F.sum(when(col('IsDepDelayed') == 'YES', 1).otherwise(lit(0))).alias("DepDelayedCount"),
        F.sum(when(col('IsArrDelayed') == lit('YES'), 1).otherwise(lit(0))).alias("ArrDelayedCount")
       ). \
    show()

+-----------+---------------+---------------+
|FlightCount|DepDelayedCount|ArrDelayedCount|
+-----------+---------------+---------------+
|      98857|          20468|          19629|
+-----------+---------------+---------------+



* Get number of flights which are delayed in departure and number of flights delayed in arrival for each day along with number of flights departed for each day. 
  * Output should contain 4 columns - **FlightDate**, **FlightCount**, **DepDelayedCount**, **ArrDelayedCount**
  * **FlightDate** should be of **yyyy-MM-dd** format.
  * Data should be **sorted** in ascending order by **flightDate**

In [12]:
# count("*") is ok vs count(lit(1))

from pyspark.sql.functions import col, count, when, concat, to_date, lpad, date_format

count_by_flighdate = airtraffic. \
    withColumn('FlightDate',
        to_date(concat(
            col("Year"),
            lpad(col("Month"), 2, "0"),
            lpad(col("DayOfMonth"), 2, "0")
        ), "yyyyMMdd")
    ). \
groupby("FlightDate"). \
   agg(count("*").alias("FlightCount"),
       count(when(col("IsDepDelayed") == "YES", True)).alias(" DepDelayedCount"),
    count(when(col("IsArrDelayed") == "YES", True)).alias("ArrDelayedCount")
       ).orderBy(col('FlightDate'))


count_by_flighdate.show(31)



+----------+-----------+----------------+---------------+
|FlightDate|FlightCount| DepDelayedCount|ArrDelayedCount|
+----------+-----------+----------------+---------------+
|2008-01-01|        189|              55|             47|
|2008-01-02|        188|              57|             52|
|2008-01-03|       2554|            1293|           1130|
|2008-01-04|       3619|            1224|           1141|
|2008-01-05|       2981|            1161|           1115|
|2008-01-06|       3418|            1343|           1225|
|2008-01-07|       3635|             642|            607|
|2008-01-08|       3635|             531|            504|
|2008-01-09|       3632|             366|            331|
|2008-01-10|       3636|             511|            485|
|2008-01-11|       3639|             482|            413|
|2008-01-12|       2944|             158|            120|
|2008-01-13|       3260|             651|            590|
|2008-01-14|       3628|             199|            215|
|2008-01-15|  

                                                                                

In [13]:
print(count_by_flighdate.agg(F.sum("ArrDelayedCount")))
print(count_by_flighdate.agg(F.sum("FlightCount")))


                                                                                

+--------------------+
|sum(ArrDelayedCount)|
+--------------------+
|               19629|
+--------------------+





+----------------+
|sum(FlightCount)|
+----------------+
|           99999|
+----------------+



                                                                                

* Get all the flights which are departed late but arrived early (**IsArrDelayed is NO**).
  * Output should contain - **FlightCRSDepTime**, **UniqueCarrier**, **FlightNum**, **Origin**, **Dest**, **DepDelay**, **ArrDelay**
  * **FlightCRSDepTime** need to be computed using **Year**, **Month**, **DayOfMonth**, **CRSDepTime**
  * **FlightCRSDepTime** should be displayed using **yyyy-MM-dd HH:mm** format.
  * Output should be sorted by **FlightCRSDepTime** and then by the difference between **DepDelay** and **ArrDelay**
  * Also get the count of such flights

In [34]:
# exploration  ==> DepTime and CRSDepTimeh have form HHmm or Hmm ---> lpad + substring

from pyspark.sql.functions import col, lpad, substring

airtraffic.withColumn("hour-CRS", substring(lpad(col("CRSDepTime"), 4, "0"), 1, 2)). \
        withColumn("minutes-CRS", substring(col("CRSDepTime"), -2, 2)). \
        select("DepTime", "CRSDepTime", "DepDelay", "hour-CRS", "minutes-CRS").show(10)

+-------+----------+--------+--------+-----------+
|DepTime|CRSDepTime|DepDelay|hour-CRS|minutes-CRS|
+-------+----------+--------+--------+-----------+
|   2003|      1955|       8|      19|         55|
|    754|       735|      19|      07|         35|
|    628|       620|       8|      06|         20|
|    926|       930|      -4|      09|         30|
|   1829|      1755|      34|      17|         55|
|   1940|      1915|      25|      19|         15|
|   1937|      1830|      67|      18|         30|
|   1039|      1040|      -1|      10|         40|
|    617|       615|       2|      06|         15|
|   1620|      1620|       0|      16|         20|
+-------+----------+--------+--------+-----------+
only showing top 10 rows



In [64]:
from pyspark.sql.functions import  year, month, weekofyear, dayofmonth, \
    dayofyear, dayofweek, current_date, \
current_timestamp, hour, minute, second

from pyspark.sql.functions import lit, col, floor, to_timestamp


result = airtraffic. \
    withColumn('FlightCRSDepTime',
               concat(
                   col("Year"),
                   lit("-"),
                   lpad(col("Month"), 2, "0"),
                   lit("-"),
                   lpad(col("DayOfMonth"), 2, "0"),
                   lit(" "),
                   substring(lpad(col("CRSDepTime"), 4, "0"), 1, 2),
                   lit(":"),
                   substring(col("CRSDepTime"), -2, 2)
               )
    ).filter((col("IsArrDelayed") == "NO") & (col("IsDepDelayed") == "YES")). \
        select("FlightCRSDepTime", "UniqueCarrier", "FlightNum", "Origin", "Dest", "DepDelay", "ArrDelay"). \
orderBy("FlightCRSDepTime", F.abs(col("DepDelay") - col("ArrDelay")))



result.show()



+----------------+-------------+---------+------+----+--------+--------+
|FlightCRSDepTime|UniqueCarrier|FlightNum|Origin|Dest|DepDelay|ArrDelay|
+----------------+-------------+---------+------+----+--------+--------+
|2008-01-01 07:30|           XE|      301|   SAT| TUS|      23|       7|
|2008-01-01 08:51|           XE|     7672|   SLC| PHX|      26|       1|
|2008-01-01 08:55|           XE|      408|   SAN| TUL|      15|      13|
|2008-01-01 13:30|           XE|      319|   GEG| TUS|      18|      10|
|2008-01-01 14:45|           XE|      528|   MCI| JAX|      23|       9|
|2008-01-01 15:00|           XE|       54|   MCI| TUS|      15|     -11|
|2008-01-01 15:15|           XE|      423|   SAN| GEG|      23|      -7|
|2008-01-01 16:50|           XE|      415|   SAN| OMA|      17|      14|
|2008-01-01 18:10|           XE|      223|   SMF| TUL|      19|       4|
|2008-01-01 18:40|           XE|      619|   MSY| SAT|      16|       2|
|2008-01-01 18:45|           XE|      622|   JAX| M

                                                                                

In [65]:
result.count()

4133