# Import Dependencies

We begin by importing the necessary libraries.

In [39]:
# Data analysis
import pandas as pd
import scipy.stats as stats

# Repeat setup

Repeat the setup procedure from the previous notebooks.

In [2]:
# Read data
df = pd.read_csv('../data/flights_df.csv')
# Drop irrelevant columns
df.drop(columns=['Unnamed: 0'], inplace=True)
# Drop rows with missing values
df.dropna(inplace=True)
# Print first 5 rows
df.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00


In [3]:
# Create df with flights from LGA
lag_df = df[df['origin'] == 'LGA'].copy()
# Display first 5 rows
lag_df.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00
7,2013,1,1,557.0,600,-3.0,709.0,723,-14.0,EV,5708,N829AS,LGA,IAD,53.0,229,6,0,2013-01-01 06:00:00
9,2013,1,1,558.0,600,-2.0,753.0,745,8.0,AA,301,N3ALAA,LGA,ORD,138.0,733,6,0,2013-01-01 06:00:00
14,2013,1,1,559.0,600,-1.0,941.0,910,31.0,AA,707,N3DUAA,LGA,DFW,257.0,1389,6,0,2013-01-01 06:00:00


In [4]:
# Create df with flights from LGA to ATL
lag_to_atlanta_df = lag_df[lag_df['dest'] == 'ATL'].copy()
# Display first 5 rows
lag_to_atlanta_df.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00
18,2013,1,1,600.0,600,0.0,837.0,825,12.0,MQ,4650,N542MQ,LGA,ATL,134.0,762,6,0,2013-01-01 06:00:00
62,2013,1,1,658.0,700,-2.0,944.0,939,5.0,DL,1547,N6703D,LGA,ATL,126.0,762,7,0,2013-01-01 07:00:00
101,2013,1,1,754.0,759,-5.0,1039.0,1041,-2.0,DL,2047,N935DL,LGA,ATL,126.0,762,7,59,2013-01-01 07:00:00
123,2013,1,1,814.0,810,4.0,1047.0,1030,17.0,FL,346,N977AT,LGA,ATL,132.0,762,8,10,2013-01-01 08:00:00


## Question 4

Include those airlines with more than 1000 departures. Using a chi-square test, test the hypothesis that whether a flight is late or on time is independent of the airline. Hint: Set up a contingency table where the rows are “late” and “on time” (not late) and the columns are the
airlines. Then the expected value for the cell in row i and column j is the product of the total for row i and the total for column j, divided by the overall total. The number of degrees of freedom for the chi-square statistic is (r – 1)(c – 1), where r is the number of rows and c is the number of columns.

In [18]:
# Count the number of departures for each airline
counts = df['carrier'].value_counts()
# Create list of airlines with at least 1000 departures
airlines_to_keep = counts[counts >= 1000].index.tolist()
# Drop rows with airlines that have less than 1000 departures
filtered_lag_df = lag_df[lag_df['carrier'].isin(airlines_to_keep)].copy()
# Recall the mapping function used in the previous notebook
def map_status(delay):
    if delay > 15:
        return 'late'
    else:
        return 'not_late'
filtered_lag_df['status'] = filtered_lag_df['dep_delay'].apply(map_status)
# Print results
filtered_lag_df.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour,status
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00,not_late
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00,not_late
7,2013,1,1,557.0,600,-3.0,709.0,723,-14.0,EV,5708,N829AS,LGA,IAD,53.0,229,6,0,2013-01-01 06:00:00,not_late
9,2013,1,1,558.0,600,-2.0,753.0,745,8.0,AA,301,N3ALAA,LGA,ORD,138.0,733,6,0,2013-01-01 06:00:00,not_late
14,2013,1,1,559.0,600,-1.0,941.0,910,31.0,AA,707,N3DUAA,LGA,DFW,257.0,1389,6,0,2013-01-01 06:00:00,not_late


In [23]:
# Create contingency table
cont_tab = pd.crosstab(index=filtered_lag_df['status'], columns=filtered_lag_df['carrier'], margins=True)
# Display results
cont_tab

carrier,9E,AA,B6,DL,EV,FL,MQ,UA,US,WN,All
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
late,459,2178,1370,3745,2310,845,3178,1473,1560,1587,18705
not_late,1900,12806,4541,19059,5915,2330,12924,6330,10981,4401,81187
All,2359,14984,5911,22804,8225,3175,16102,7803,12541,5988,99892


In [46]:
# Perform chi-squared test of independence
chi_sq, p, deg_freedom, exp_values = stats.chi2_contingency(cont_tab)
# Print results
chi_sq, p, deg_freedom

(1507.9698922346129, 7.766183700587659e-308, 20)

We find that we have a chi-squared statistic of 1507.97, with 20 degrees of freedom. This corresponds to a p-value of 7.77E-308, i.e. approximately 0. Thus, we conclude that there is sufficient evidence to reject the null in favor of the alternative, i.e. whether a flight is late or on time is dependent on the airline. This result is consistent with what one may expect, since certain airlines have more of a reputation for delaying flights than others.