# Import Dependencies

We begin by importing the necessary libraries.

In [67]:
# Data analysis
import pandas as pd
from math import sqrt
from scipy import stats

# Data Cleaning

Read in the `flights` dataframe from the `nycflights13` package. Note, a separate R script was used to export the data into a CSV file.

In [48]:
# Read data
df = pd.read_csv('../data/flights_df.csv')
# Drop irrelevant columns
df.drop(columns=['Unnamed: 0'], inplace=True)
# Display results
df.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00


Next, perform the following cleaning steps:
1. Only keep flights from LGA or EWR to DFW
2. Remove flights scheduled to depart after 1200
3. Drop rows that aren't managed by one of the 3 major airlines (AA, UA, EV)
4. Also drop rows containing missing values

In [49]:
# Filter for flights from LGA or EWR to DFW
ny_to_dfw = df[(df['origin'].isin(['LGA', 'EWR'])) & (df['dest'] == 'DFW')]
# Filter for flights scheduled to depart before 1200
ny_to_dfw = ny_to_dfw[ny_to_dfw['sched_dep_time'] <= 1200]
# Only keep entries corresponding to AA, UA, or EV
ny_to_dfw = ny_to_dfw[ny_to_dfw['carrier'].isin(['AA', 'UA', 'EV'])]
# Drop rows with NaN values
ny_to_dfw.dropna(inplace=True)
# Reset index
ny_to_dfw.reset_index(drop=True, inplace=True)
# Display results
ny_to_dfw.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,559.0,600,-1.0,941.0,910,31.0,AA,707,N3DUAA,LGA,DFW,257.0,1389,6,0,2013-01-01 06:00:00
1,2013,1,1,635.0,635,0.0,1028.0,940,48.0,AA,711,N3GKAA,LGA,DFW,248.0,1389,6,35,2013-01-01 06:00:00
2,2013,1,1,724.0,730,-6.0,1111.0,1040,31.0,AA,715,N541AA,LGA,DFW,254.0,1389,7,30,2013-01-01 07:00:00
3,2013,1,1,725.0,730,-5.0,1052.0,1040,12.0,AA,2083,N4WRAA,EWR,DFW,238.0,1372,7,30,2013-01-01 07:00:00
4,2013,1,1,823.0,823,0.0,1151.0,1135,16.0,UA,1223,N39728,EWR,DFW,250.0,1372,8,23,2013-01-01 08:00:00


# Data Analysis

Then, create two new columns: `day_of_week` denoting the day of the week a given flight occured; `status` indicating whether a flight was early, on time, or late, where a late flight is defined as departing more than 15 minutes past the scheduled time. We summarize the data by creating a contingency table, with rows representing status and columns representing day of week.

In [78]:
# Convert column to datetime
ny_to_dfw['time_hour'] = pd.to_datetime(ny_to_dfw['time_hour'], format='%Y-%m-%d %H:%M:%S')
# Create new column with day of week (Monday=0, Sunday=6)
ny_to_dfw['day_of_week'] = ny_to_dfw['time_hour'].dt.dayofweek

# Create mapping function for delay to status
def map_status(delay):
    if delay < 0:
        return 'early'
    elif delay > 15:
        return 'late'
    else:
        return 'on_time'
# Map departure delays to status
ny_to_dfw['status'] = ny_to_dfw['dep_delay'].apply(map_status)

In [79]:
# Remove entries occuring on weekends
ny_to_dfw_weekday = ny_to_dfw[ny_to_dfw['day_of_week'] < 5]
# Create contingency table
cont_tab = pd.crosstab(index=ny_to_dfw_weekday['status'], columns=ny_to_dfw_weekday['day_of_week'], margins=True)
# Display results
cont_tab

day_of_week,0,1,2,3,4,All
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
early,365,402,399,366,353,1885
late,35,29,37,48,45,194
on_time,88,62,56,66,89,361
All,488,493,492,480,487,2440


Notice, most flights depart either early or on time, with Thursdays and Fridays containing the most late departures w.r.t. the other days of week. To check whether this behavior is to be expected, that is, whether `status` is dependent on `day_of_week`, we perform a chi-squared test of independence. 

In [59]:
# Chi-squared test of independence
chi_sq, p, deg_freedom, exp_values = stats.chi2_contingency(cont_tab)
# Print results
print(f'Chi-square test statistic: {chi_sq:.4f}\nDegrees of freedom: {deg_freedom}\nP-value: {p:.4f}')

Chi-square test statistic: 23.9985
Degrees of freedom: 15
P-value: 0.0651


Interestingly, with a p-value of 0.0651, assuming a standard significance level of 95%, we conclude that there is insufficient evidence to reject the null in favor of the alternative. In other words, we can't say for certain whether `status` is independent of `day_of_week`.

Moving on, we produce 95% confidence intervals for the mean `air_time` during all days, only Mondays, and only Fridays, using the formula `CI = (mean +/- standard error)`, where `standard error = 2 * standard deviation / sqrt(sample size)`.

In [77]:
# Compute CI for all days
mean, stdev, n = ny_to_dfw['air_time'].mean(), ny_to_dfw['air_time'].std(), len(ny_to_dfw['air_time'])
conf_int = (round(mean - 2 * stdev / sqrt(n), 2), round(mean + 2 * stdev / sqrt(n), 2))

# Compute CI for only Mondays
ny_to_dfw_monday = ny_to_dfw[ny_to_dfw['day_of_week'] == 0]
mean_0, stdev_0, n_0 = ny_to_dfw_monday['air_time'].mean(), ny_to_dfw_monday['air_time'].std(), len(ny_to_dfw_monday['air_time'])
conf_int_0 = (round(mean_0 - 2 * stdev_0 / sqrt(n_0), 2), round(mean_0 + 2 * stdev_0 / sqrt(n_0), 2))
conf_int_0

# Compute CI for only Fridays
ny_to_dfw_friday = ny_to_dfw[ny_to_dfw['day_of_week'] == 4]
mean_4, stdev_4, n_4 = ny_to_dfw_friday['air_time'].mean(), ny_to_dfw_friday['air_time'].std(), len(ny_to_dfw_friday['air_time'])
conf_int_4 = (round(mean_4 - 2 * stdev_4 / sqrt(n_4), 2), round(mean_4 + 2 * stdev_4 / sqrt(n_4), 2))
conf_int_4

# Print results
print(f'Mean air time on all days, 95% CI: {conf_int}')
print(f'Mean air time on Mondays, 95% CI: {conf_int_0}')
print(f'Mean air time on Fridays, 95% CI: {conf_int_4}')

Mean air time on all days, 95% CI: (199.08, 200.34)
Mean air time on Mondays, 95% CI: (198.59, 201.85)
Mean air time on Fridays, 95% CI: (197.38, 200.38)


We find that the confidence intervals in all three cases remain fairly similar, with a significant amount of overlap between them, implying that the mean `air_time` is likely independent of `day_of_week` (note: we could of course verify this hypothesis using a chi-squared test).

Finally, consider the scenario of being a consultant for a company that uses EWR and/or LGA, and whose representatives frequently travel to DFW for business. Typically, they're required to be at the designated meeting location by 2 pm (Central Time; 3 pm Eastern Time). Assuming 1 hour to travel from DFW to the meeting location, and that flights can depart on any day of the week, our recommendation is to book flights scheduled to depart no later than 11:39 am Eastern (3 pm - `travel time: 1 hr` - `mean air time: 199.71 mins` - `mean departure delay: 1.44 mins`).

We can verify the probability of late meeting arrivals by computing a 95% confidence interval for the proportion of flights that obey our recommendation, that don't arrive in DFW 1 hour ahead of the scheduled meeting time.

In [99]:
# Take 1000 samples from the dataframe of flights departing no later than 11:39 am
ny_to_dfw_sample = ny_to_dfw[ny_to_dfw['sched_dep_time'] <= 1139].sample(n=1000, random_state=11).reset_index(drop=True)
# Compute the proportion of flights that arrive late (after 2 pm Eastern) in the sample
prop_late = len(ny_to_dfw_sample[ny_to_dfw_sample['arr_time'] > 1400]) / len(ny_to_dfw_sample)
# Compute a 95% CI for the proportion of late arrivals
conf_int = (round(prop_late - 2 * sqrt(prop_late * (1 - prop_late) / 1000), 2), round(prop_late + 2 * sqrt(prop_late * (1 - prop_late) / 1000), 2))
# Print results
print(f'Proportion of late arrivals, 95% CI: {conf_int}\nPoint estimate: {prop_late}')

Proportion of late arrivals, 95% CI: (0.04, 0.07)
Point estimate: 0.055


In other words, roughly 4-7% of representatives will be late for their meetings 95% of the time; not enough to significantly affect the meeting's proceedings, though if we'd like to lower this proportion, we'd simply pick an earlier latest-departure-time.