## Derived Features Function for Flight Dataset - Esther

In [0]:
from pyspark.sql.functions import col

import matplotlib.pyplot as plt
from pyspark.sql import functions as f
from pyspark.sql.functions import isnull, when, count
from pyspark.sql.functions import *
from pyspark.sql import SQLContext
from pyspark.sql.types import *
import pyspark
import pandas as pd
import numpy as np
from pyspark.sql import Window
from pyspark.sql.functions import col, substring

import seaborn as sns

### Function for Derived Features

**LOCAL_DEP_HOUR**

0 - 23

**HOLIDAY** 

**0**: Not a holiday

**1**: Is a holiday

**2**: Near holiday

**Prev_Flight_Delay_15** (Whether the previous flight of the same aircraft was delayed by 15 minutes or more)

**0**: No Delay / Delay less than 15 minutes

**1**: Delay for 15 minutes or more

**2**: Lack of information (the previous flight was scheduled to depart not more than two hours before the scheduled departure time of the current flight)

**Enough_Time_Btwn_Estimate_Arrival_and_Planned_Dep** (Whether there is enough time *(at least 40 minutes)* between the estimated arrival time of the previous flight and the planned departure time of the current flight)

**0**: Not enough time

**1**: Enough time (also used when the aircraft has no previous flight)

**2**: Lack of information (the previous flight was scheduled to depart not more than two hours before the scheduled departure time of the current flight)

**Poor_Schedule** (Whether the planned arrival time of the previous flight is at or after the planned departure time of the current flight)

**0**: Not a poor scheduling

**1**: Poor scheduling

In [0]:
# Create Derived Features for Flight Dataset
# This includes:
# FEATURE 1: LOCAL_DEP_HOUR - hour of the scheduled local departure time
# FEATURE 2: HOLIDAY - 1: holiday, 2: near a holiday, 0: otherwise
# FEATURE 3: Prev_Flight_Delay_15 - delay indicator of the previous flight of the same tail number
# FEATURE 4: Enough_Time_Btwn_Estimate_Arrival_and_Planned_Dep
#            time between the estimated arrival of the previous flight and the
#            planned departure of the current flight
#              >= 40 minutes                  -------> 1
#              <  40 minutes                  -------> 0
#              otherwise (lack of information) ------> 2
#            "lack of information" = the previous flight was scheduled to depart
#            not more than two hours before the current flight
# FEATURE 5: Poor_Schedule
#              planned arrival of the previous flight is at or after the
#              planned departure of the current flight -----> 1
#              otherwise                               -----> 0

# Dates flagged HOLIDAY = 1.
_HOLIDAY_DATES = [
  '2015-01-01', '2015-07-03', '2015-07-04', '2015-11-26', '2015-12-25',
  '2016-01-01', '2016-07-04', '2016-11-24', '2016-12-25', '2016-12-26',
  '2017-01-01', '2017-01-02', '2017-07-04', '2017-11-23', '2017-12-25',
  '2018-01-01', '2018-07-04', '2018-11-22', '2018-12-25',
  '2019-01-01', '2019-07-04', '2019-11-28', '2019-12-25',
]

# Dates flagged HOLIDAY = 2 (days around the dates above).
_NEAR_HOLIDAY_DATES = [
  # 2015
  '2015-01-02', '2015-01-03', '2015-01-04', '2015-01-05', '2015-01-06',
  '2015-07-01', '2015-07-02', '2015-07-05', '2015-07-06',
  '2015-11-21', '2015-11-22', '2015-11-23', '2015-11-24', '2015-11-25',
  '2015-11-27', '2015-11-28', '2015-11-29', '2015-11-30', '2015-12-01',
  '2015-12-20', '2015-12-21', '2015-12-22', '2015-12-23', '2015-12-24',
  '2015-12-26', '2015-12-27', '2015-12-28', '2015-12-29', '2015-12-30', '2015-12-31',
  # 2016
  '2016-01-02', '2016-01-03', '2016-01-04', '2016-01-05', '2016-01-06',
  '2016-07-02', '2016-07-03', '2016-07-05', '2016-07-06',
  '2016-11-19', '2016-11-20', '2016-11-21', '2016-11-22', '2016-11-23',
  '2016-11-25', '2016-11-26', '2016-11-27', '2016-11-28', '2016-11-29',
  '2016-12-20', '2016-12-21', '2016-12-22', '2016-12-23', '2016-12-24',
  '2016-12-27', '2016-12-28', '2016-12-29', '2016-12-30', '2016-12-31',
  # 2017
  '2017-01-03', '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07',
  '2017-07-02', '2017-07-03', '2017-07-05', '2017-07-06',
  '2017-11-18', '2017-11-19', '2017-11-20', '2017-11-21', '2017-11-22',
  '2017-11-24', '2017-11-25', '2017-11-26', '2017-11-27', '2017-11-28',
  '2017-12-20', '2017-12-21', '2017-12-22', '2017-12-23', '2017-12-24',
  '2017-12-26', '2017-12-27', '2017-12-28', '2017-12-29', '2017-12-30', '2017-12-31',
  # 2018
  '2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05', '2018-01-06',
  '2018-07-02', '2018-07-03', '2018-07-05', '2018-07-06',
  '2018-11-17', '2018-11-18', '2018-11-19', '2018-11-20', '2018-11-21',
  '2018-11-23', '2018-11-24', '2018-11-25', '2018-11-26', '2018-11-27',
  '2018-12-20', '2018-12-21', '2018-12-22', '2018-12-23', '2018-12-24',
  '2018-12-26', '2018-12-27', '2018-12-28', '2018-12-29', '2018-12-30', '2018-12-31',
  # 2019
  '2019-01-02', '2019-01-03', '2019-01-04', '2019-01-05', '2019-01-06',
  '2019-07-02', '2019-07-03', '2019-07-05', '2019-07-06',
  '2019-11-23', '2019-11-24', '2019-11-25', '2019-11-26', '2019-11-27',
  '2019-11-29', '2019-11-30', '2019-12-01', '2019-12-02', '2019-12-03',
  '2019-12-20', '2019-12-21', '2019-12-22', '2019-12-23', '2019-12-24',
  '2019-12-26', '2019-12-27', '2019-12-28', '2019-12-29', '2019-12-30', '2019-12-31',
]

# Intermediate columns created while deriving the previous-flight features;
# they are dropped again before the result is returned.
_PREV_FLIGHT_HELPER_COLUMNS = [
  'Time_Btwn_Estimate_Arrival_and_Planned_Dep',
  'Estimate_Pre_Flight_Arrival_Time',
  'Prev_Flight_Planned_Arrive_UTC',
  'Prev_Flight_Delay',
  'Prev_Flight_Planned_Departure_UTC',
  'Time_Btwn_Prev_Departure_and_Current_Departure',
  'Two_Hour_Btwn_Prev_Departure_and_Current_Departure',
  'time_btwn_dep_and_plannedArrival',
]


def _add_local_dep_hour(df):
  """Add LOCAL_DEP_HOUR, the hour parsed from FL_DATE_AIRLNS and CRS_DEP_TIME_AIRLNS."""
  # Left-pad the numeric departure time to four characters ("HHmm"), e.g. 5 -> "0005".
  hhmm = f.substring(f.format_string("0000%d", "CRS_DEP_TIME_AIRLNS"), -4, 4)
  date_and_time = f.concat_ws(" ", f.col("FL_DATE_AIRLNS"), hhmm)
  return df.withColumn("LOCAL_DEP_HOUR", f.hour(f.to_timestamp(date_and_time, "yyyy-MM-dd HHmm")))


def _add_holiday_flag(df):
  """Add HOLIDAY: 1 for listed holidays, 2 for listed near-holiday dates, 0 otherwise."""
  flight_date = f.col("FL_DATE_AIRLNS")
  holiday_flag = f.when(flight_date.isin(_HOLIDAY_DATES), 1) \
                  .when(flight_date.isin(_NEAR_HOLIDAY_DATES), 2) \
                  .otherwise(0)
  return df.withColumn("HOLIDAY", holiday_flag)


def _add_previous_flight_features(df, enough_time_minutes, min_hours_since_prev_departure):
  """Add Prev_Flight_Delay_15, Enough_Time_Btwn_Estimate_Arrival_and_Planned_Dep and Poor_Schedule."""
  # "Previous flight" = the preceding row of the same tail number when ordered by utc_arrive.
  prev_leg = Window.partitionBy('TAIL_NUM_AIRLNS').orderBy('utc_arrive')

  # Two_Hour_Btwn_Prev_Departure_and_Current_Departure:
  #   '1' -> previous flight departed more than the threshold before this one
  #   '0' -> it did not (or the gap is unknown)
  hours_since_prev_departure = \
      (f.unix_timestamp('utc_dep') - f.unix_timestamp('Prev_Flight_Planned_Departure_UTC')) / 60 / 60
  df = df.withColumn('Prev_Flight_Planned_Departure_UTC', f.lag('utc_dep', 1).over(prev_leg)) \
         .withColumn('Time_Btwn_Prev_Departure_and_Current_Departure', hours_since_prev_departure) \
         .withColumn('Two_Hour_Btwn_Prev_Departure_and_Current_Departure',
                     f.when(f.col('Time_Btwn_Prev_Departure_and_Current_Departure') > min_hours_since_prev_departure, '1')
                      .otherwise('0'))

  # Raw values carried over from the previous flight.
  df = df.withColumn('Prev_Flight_Delay_15', f.lag('DEP_DEL15_AIRLNS', 1).over(prev_leg)) \
         .withColumn('Prev_Flight_Delay', f.lag('DEP_DELAY_NEW_AIRLNS', 1).over(prev_leg)) \
         .withColumn('Prev_Flight_Planned_Arrive_UTC', f.lag('utc_arrive', 1).over(prev_leg))

  # Estimated arrival of the previous flight = its planned arrival shifted by its departure delay (minutes).
  df = df.withColumn('Estimate_Pre_Flight_Arrival_Time',
                     f.col('Prev_Flight_Planned_Arrive_UTC') + f.col('Prev_Flight_Delay') * f.expr("Interval 1 Minutes"))

  no_prev_flight = f.col('Prev_Flight_Planned_Arrive_UTC').isNull()
  prev_departed_recently = f.col('Two_Hour_Btwn_Prev_Departure_and_Current_Departure') == '0'

  # Minutes between the estimated arrival of the previous flight and the planned departure of this one.
  #   no previous flight                      -> '1'
  #   previous flight departed too recently   -> '2' (lack of information)
  #   otherwise '1' / '0' depending on whether the gap reaches the threshold
  df = df.withColumn('Time_Btwn_Estimate_Arrival_and_Planned_Dep',
                     (f.unix_timestamp('utc_dep') - f.unix_timestamp('Estimate_Pre_Flight_Arrival_Time')) / 60) \
         .withColumn('Enough_Time_Btwn_Estimate_Arrival_and_Planned_Dep',
                     f.when(no_prev_flight, '1')
                      .when(prev_departed_recently, '2')
                      .when(f.col('Time_Btwn_Estimate_Arrival_and_Planned_Dep') >= enough_time_minutes, '1')
                      .otherwise('0'))

  # Poor_Schedule: planned arrival of the previous flight is at or after the planned departure of this one.
  # The Integer cast next to a string literal is kept on purpose so Spark's type
  # coercion of the resulting column is the same as in earlier versions of this function.
  overlaps_previous_leg = f.when(f.col('time_btwn_dep_and_plannedArrival') <= 0, '1').otherwise('0')
  df = df.withColumn('time_btwn_dep_and_plannedArrival',
                     (f.unix_timestamp('utc_dep') - f.unix_timestamp('Prev_Flight_Planned_Arrive_UTC')) / 60) \
         .withColumn('Poor_Schedule',
                     f.when(no_prev_flight, '0').otherwise(overlaps_previous_leg.cast("Integer")))

  # Prev_Flight_Delay_15:
  #   no previous flight                      -> '0'
  #   previous flight departed too recently   -> '2' (lack of information)
  #   otherwise the previous flight's DEP_DEL15_AIRLNS value
  # (string literals mixed with an Integer branch are kept for the same coercion reason as above)
  df = df.withColumn('Prev_Flight_Delay_15',
                     f.when(no_prev_flight, '0')
                      .when(prev_departed_recently, '2')
                      .otherwise(f.col('Prev_Flight_Delay_15').cast("Integer")))

  # Drop no-longer needed columns
  return df.drop(*_PREV_FLIGHT_HELPER_COLUMNS)


def flight_derived_features_creation(df, enough_time_minutes=40, min_hours_since_prev_departure=2):
  """Return `df` with the derived flight features appended.

  Columns read from `df`: FL_DATE_AIRLNS, CRS_DEP_TIME_AIRLNS, TAIL_NUM_AIRLNS,
  utc_dep, utc_arrive, DEP_DEL15_AIRLNS, DEP_DELAY_NEW_AIRLNS.
  NOTE(review): utc_dep / utc_arrive are presumably timestamp columns - confirm against the caller.

  Columns added (in this order): LOCAL_DEP_HOUR, HOLIDAY, Prev_Flight_Delay_15,
  Enough_Time_Btwn_Estimate_Arrival_and_Planned_Dep, Poor_Schedule.
  All intermediate helper columns are dropped before returning.

  Args:
    df: Spark DataFrame of flights.
    enough_time_minutes: minimum gap (minutes) between the estimated arrival of the
      previous flight and the planned departure of the current flight that counts
      as "enough time". Defaults to 40.
    min_hours_since_prev_departure: the previous flight must have been scheduled to
      depart more than this many hours before the current flight for its delay
      information to be used; otherwise the affected features are set to '2'
      (lack of information). Defaults to 2.

  Returns:
    A new Spark DataFrame; the input is not modified.
  """
  df = _add_local_dep_hour(df)
  df = _add_holiday_flag(df)
  return _add_previous_flight_features(df, enough_time_minutes, min_hours_since_prev_departure)
