In [None]:
# load required libraries
library(dplyr)
library(ggplot2)
library(maps)
library(randomForest)

In [None]:
# Read ./data/airline_trunc.csv into `flights`, keeping text columns as
# character vectors rather than factors
flights <- read.csv(
  file = "./data/airline_trunc.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

# Preview the first rows, then inspect column names and types
head(flights)
str(flights)

In [None]:
# Keep only the rows that have no NA in any column
flights <- na.omit(object = flights)

# Report how many rows and columns remain
dim(flights)

In [None]:
# Estimate the "birthmonth" of an aircraft from its flight records.
#
# y: data frame (or matrix) with 'Year' and 'Month' columns, one row per flight.
# Returns: the earliest year/month found, encoded as a single month count
#   12 * year + month - 1. NA values in either column are ignored.
birthmonth <- function(y) {
  years <- y[, "Year"]
  first_year <- min(years, na.rm = TRUE)

  # Only months that fall inside the earliest year compete for "first month"
  rows_in_first_year <- which(years == first_year)
  first_month <- min(y[rows_in_first_year, "Month"], na.rm = TRUE)

  12 * first_year + first_month - 1
}

# Collect the distinct tail numbers (one per aircraft), ignoring missing ones
aircrafts <- unique(flights[, "TailNum"])
aircrafts <- aircrafts[!is.na(aircrafts)]

# Build acStart with exactly one birthmonth per aircraft, named by tail number.
# The vector is created in one step and named afterwards on purpose: filling a
# pre-sized unnamed vector through character indices appends new named
# elements after the placeholders, which would leave one blank-named zero per
# aircraft in names(acStart).
acStart <- vapply(
  aircrafts,
  function(tail_num) birthmonth(flights[flights$TailNum == tail_num, ]),
  numeric(1),
  USE.NAMES = FALSE
)
names(acStart) <- aircrafts

tail(acStart)

In [None]:
# Turn the named birthmonth vector into a lookup table keyed by tail number
age <- data.frame(
  TailNum = names(acStart),
  acStart = acStart,
  stringsAsFactors = FALSE
)

# Attach each flight's aircraft birthmonth, then derive Age as the number of
# months between that birthmonth and the month of the flight
flights <- flights %>%
  left_join(age, by = "TailNum") %>%
  mutate(Age = (Year * 12) + Month - acStart)

head(flights)

In [None]:
# Fit a simple linear regression with ArrDelay as the response and Age as the
# only predictor, then print the fit summary.
# NOTE(review): the fitted model is stored in a variable called `lm`, the same
# name as the fitting function; consider renaming once all users are known.
lm <- lm(formula = ArrDelay ~ Age, data = flights)
summary(object = lm)

In [None]:
# Convert Month from its number (1-12) to a factor labelled with month
# abbreviations. The levels are pinned to 1:12 so every number maps to its own
# month even when some months are absent from the data; relabelling only the
# levels that happen to be present would shift the names (e.g. 3 -> "Jan").
# The guard keeps this cell safe to re-run once Month is already a factor.
if (!is.factor(flights$Month)) {
  flights$Month <- factor(flights$Month, levels = 1:12, labels = month.abb)
}

# Select a subset of fields needed to graph arrival delays by month
subset_month <- select(flights, Month, ArrDelay)

# Create violin graph showing the distribution of arrival delays per month;
# the fill legend is hidden because it would only repeat the x axis
ggplot(subset_month, aes(Month, ArrDelay, fill = factor(Month))) +
  geom_violin(aes(group = Month)) +
  theme(legend.position = "none") +
  labs(
    y = "Arrival Delay (in minutes)",
    title = "Average Flight Arrival Delay by Month"
  )

In [None]:
# Read the airport-code lookup (columns named OriginState and Origin) and
# attach the departure state to every flight via its Origin code
airport_codes <- read.csv(
  "./data/airport_codes.csv",
  col.names = c("OriginState", "Origin"),
  stringsAsFactors = FALSE
)
flights <- flights %>% left_join(airport_codes, by = "Origin")

# Keep origin state and arrival delay, drop flights without a delay value,
# and group by state; then average the delay within each state
subset_state <- flights %>%
  select(OriginState, ArrDelay) %>%
  filter(!is.na(ArrDelay)) %>%
  group_by(OriginState)
subset_summary <- subset_state %>% summarise(AveDelay = mean(ArrDelay))

# Polygon outlines of the US states, used as the base map
map <- map_data("state")

# Draw the states, coloured by their average arrival delay
ggplot(subset_summary, aes(fill = AveDelay)) +
  geom_map(aes(map_id = OriginState), map = map) +
  expand_limits(x = map$long, y = map$lat) +
  scale_fill_distiller(
    name = "AveDelay(mins)",
    palette = "Spectral",
    direction = -1
  ) +
  theme_void() +
  labs(title = "Average Flight Arrival Delay by State")

In [None]:
# Bin arrival delay into four categories. Intervals are right-closed:
# (-120,-15] Early, (-15,15] On-Time, (15,120] Late, (120,400] Very Late.
# Delays outside (-120, 400] become NA and are dropped below.
flights$ArrCategory <- cut(
  flights$ArrDelay,
  breaks = c(-120, -15, 15, 120, 400),
  labels = c("Early", "On-Time", "Late", "Very Late")
)

# Keep only the modelling columns and remove any remaining NA's
flights_subset <- select(flights, Month, Year, OriginState, Age, ArrCategory)
flights_subset <- na.omit(flights_subset)
flights_subset$OriginState <- as.factor(flights_subset$OriginState)

# Divide data into training (80%) and testing (20%) sets, both stored as row
# names of flights_subset. The row names are sampled directly so the ids stay
# valid for flights_subset[flights.train, ], and the test ids are the
# remaining row names of that same data frame, which makes the two sets
# disjoint and together cover flights_subset exactly.
# NOTE(review): no seed is set here, so the split differs between runs.
flights.train <- sample(
  rownames(flights_subset),
  size = round(0.8 * nrow(flights_subset))
)
flights.test <- setdiff(rownames(flights_subset), flights.train)

length(flights.train)
length(flights.test)

In [None]:
# Fit a 10-tree random forest that classifies ArrCategory from origin state,
# month and year, using only the training rows of flights_subset
forest <- randomForest(
  ArrCategory ~ OriginState + Month + Year,
  data = flights_subset[flights.train, ],
  ntree = 10
)

# Print the fitted model
forest

In [None]:
# Show the importance component stored on the fitted forest
forest[["importance"]]