# Import Dependencies

We begin by importing the necessary libraries.

In [44]:
# Data analysis
import numpy as np
import pandas as pd

# Repeat setup

Repeat the setup procedure from the previous notebook.

In [45]:
# Read data
df = pd.read_csv('../data/flights_df.csv')
# Drop irrelevant columns
df.drop(columns=['Unnamed: 0'], inplace=True)
# Drop rows with missing values
df.dropna(inplace=True)
# Print first 5 rows
df.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00


In [46]:
# Create df with flights from LGA
lag_df = df[df['origin'] == 'LGA'].copy()
# Display first 5 rows
lag_df.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00
7,2013,1,1,557.0,600,-3.0,709.0,723,-14.0,EV,5708,N829AS,LGA,IAD,53.0,229,6,0,2013-01-01 06:00:00
9,2013,1,1,558.0,600,-2.0,753.0,745,8.0,AA,301,N3ALAA,LGA,ORD,138.0,733,6,0,2013-01-01 06:00:00
14,2013,1,1,559.0,600,-1.0,941.0,910,31.0,AA,707,N3DUAA,LGA,DFW,257.0,1389,6,0,2013-01-01 06:00:00


In [47]:
# Create df with flights from LGA to ATL
lag_to_atlanta_df = lag_df[lag_df['dest'] == 'ATL'].copy()
# Display first 5 rows
lag_to_atlanta_df.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00
18,2013,1,1,600.0,600,0.0,837.0,825,12.0,MQ,4650,N542MQ,LGA,ATL,134.0,762,6,0,2013-01-01 06:00:00
62,2013,1,1,658.0,700,-2.0,944.0,939,5.0,DL,1547,N6703D,LGA,ATL,126.0,762,7,0,2013-01-01 07:00:00
101,2013,1,1,754.0,759,-5.0,1039.0,1041,-2.0,DL,2047,N935DL,LGA,ATL,126.0,762,7,59,2013-01-01 07:00:00
123,2013,1,1,814.0,810,4.0,1047.0,1030,17.0,FL,346,N977AT,LGA,ATL,132.0,762,8,10,2013-01-01 08:00:00


## Question 2

Add a column `status` to `lag_to_atlanta_df` with 3 possible values: `early`, `on_time`, and `late`. Define a flight to be “on time” if it is not more than 15 minutes late. Use the `tally()` command to produce a contingency table with a row for each airline and a column for each value of `status`. Be sure to show the row and column totals.

In [48]:
# Create mapping function for delay to status
def map_status(delay):
    if delay < 0:
        return 'early'
    elif delay > 15:
        return 'late'
    else:
        return 'on_time'
# Map departure delays to status
lag_to_atlanta_df['status'] = lag_to_atlanta_df['dep_delay'].apply(map_status)
# Create contingency table
pd.crosstab(index=lag_to_atlanta_df['origin'], columns=lag_to_atlanta_df['status'], margins=True)

status,early,late,on_time,All
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LGA,6017,1880,2144,10041
All,6017,1880,2144,10041


As we can see, most flights appear to depart earlier than scheduled from LAG to ATL, with only a smaller (but not insignificant) number of flights departing more than 15 minutes past schedule. 

Out of curiosity, we'll repeat this process for all available airports. 

In [49]:
# Map departure delays to status
df['status'] = df['dep_delay'].apply(map_status)
# Create contingency table
pd.crosstab(index=df['origin'], columns=df['status'], margins=True)

status,early,late,on_time,All
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
EWR,59144,28718,29265,117127
JFK,61025,22514,25540,109079
LGA,62966,19056,19118,101140
All,183135,70288,73923,327346
