In [1]:
#importing libraries
import numpy as np
import pandas as pd

In [3]:
#Load the cleaned flight data (the csv produced by the data cleaning step) and preview the first rows
df = pd.read_csv('flights_cleaned.csv')
df.head()

Unnamed: 0,fl_date,mkt_unique_carrier,origin,dest,crs_dep_time,dep_delay,taxi_out,taxi_in,crs_arr_time,arr_delay,crs_elapsed_time,distance
0,2018-05-31,WN,BUR,SFO,1100,38.0,18.0,7.0,1220,51.0,80,326
1,2019-04-20,AA,ORD,DFW,1311,-8.0,17.0,8.0,1544,-35.0,153,801
2,2019-02-13,WN,MSY,HOU,2130,29.0,5.0,4.0,2245,18.0,75,302
3,2018-11-20,AA,DFW,IND,1834,-1.0,30.0,10.0,2142,13.0,128,761
4,2018-08-28,DL,JFK,IAD,1359,-6.0,16.0,3.0,1537,-34.0,98,228


In [5]:
#Rename the 'mkt_unique_carrier' column to 'carrier_name'
carrier_rename = {'mkt_unique_carrier': 'carrier_name'}
df = df.rename(columns=carrier_rename)
df.head()

Unnamed: 0,fl_date,carrier_name,origin,dest,crs_dep_time,dep_delay,taxi_out,taxi_in,crs_arr_time,arr_delay,crs_elapsed_time,distance
0,2018-05-31,WN,BUR,SFO,1100,38.0,18.0,7.0,1220,51.0,80,326
1,2019-04-20,AA,ORD,DFW,1311,-8.0,17.0,8.0,1544,-35.0,153,801
2,2019-02-13,WN,MSY,HOU,2130,29.0,5.0,4.0,2245,18.0,75,302
3,2018-11-20,AA,DFW,IND,1834,-1.0,30.0,10.0,2142,13.0,128,761
4,2018-08-28,DL,JFK,IAD,1359,-6.0,16.0,3.0,1537,-34.0,98,228


In [6]:
#Change crs departure/arrival time into categorical timeframe
def time_category(sch_time):
    '''
    Categorize hours of the day into one of 4 six-hour timeframes.

    Parameters
    ----------
    sch_time : pandas Series (or other array-like) of numbers
        Hour of the day the flight is happening, one value per flight.

    Returns
    -------
    list of str
        One label per input element, in input order:
        '12am-6am' for 0 <= hour < 6
        '6am-12pm' for 6 <= hour < 12
        '12pm-6pm' for 12 <= hour < 18
        '6pm-12am' for everything else (this is the fall-through label, so
                   NaN and values outside 0-23 also end up here)
    '''

    # Work on the underlying values instead of looking every element up by
    # its index label: label lookups are slow, and they return a Series
    # (which cannot be used in an `if`) when the index has duplicate labels.
    hours = np.asarray(sch_time)

    time_cat = np.select(
        [
            (hours >= 0) & (hours < 6),
            (hours >= 6) & (hours < 12),
            (hours >= 12) & (hours < 18),
        ],
        ['12am-6am', '6am-12pm', '12pm-6pm'],
        default='6pm-12am',
    )

    # Callers get a plain Python list of labels
    return time_cat.tolist()

#Zero-pad the scheduled departure and arrival times to 4-character 'HHMM' strings so they can be parsed
df['crs_dep_time'] = df['crs_dep_time'].astype(str).str.zfill(4)
df['crs_arr_time'] = df['crs_arr_time'].astype(str).str.zfill(4)

#Extract the hour of the day. The strings are 'HHMM' with no seconds, so the format is '%H%M'.
#('%H%M%S' only matched by splitting the two minute digits into a 1-digit minute and a 1-digit
#second, and would silently mis-split a value that is not a valid HHMM time instead of rejecting it.)
df['sch_dep_time'] = pd.to_datetime(df['crs_dep_time'], format='%H%M').dt.hour
df['sch_arr_time'] = pd.to_datetime(df['crs_arr_time'], format='%H%M').dt.hour

#Using the time_category function to categorize the time.
#The returned list is assigned directly (by position) so the labels line up with the rows
#whatever index df has; wrapping it in pd.Series would align on a fresh 0..n-1 index instead.
df['dep_timeframe'] = time_category(df['sch_dep_time'])
df['arr_timeframe'] = time_category(df['sch_arr_time'])

df.head()

Unnamed: 0,fl_date,carrier_name,origin,dest,crs_dep_time,dep_delay,taxi_out,taxi_in,crs_arr_time,arr_delay,crs_elapsed_time,distance,sch_dep_time,sch_arr_time,dep_timeframe,arr_timeframe
0,2018-05-31,WN,BUR,SFO,1100,38.0,18.0,7.0,1220,51.0,80,326,11,12,6am-12pm,12pm-6pm
1,2019-04-20,AA,ORD,DFW,1311,-8.0,17.0,8.0,1544,-35.0,153,801,13,15,12pm-6pm,12pm-6pm
2,2019-02-13,WN,MSY,HOU,2130,29.0,5.0,4.0,2245,18.0,75,302,21,22,6pm-12am,6pm-12am
3,2018-11-20,AA,DFW,IND,1834,-1.0,30.0,10.0,2142,13.0,128,761,18,21,6pm-12am,6pm-12am
4,2018-08-28,DL,JFK,IAD,1359,-6.0,16.0,3.0,1537,-34.0,98,228,13,15,12pm-6pm,12pm-6pm


In [10]:
#Derive the month name (e.g. 'May') of each flight from its 'fl_date'
df['fl_month'] = (
    df['fl_date']
    .pipe(pd.to_datetime, format='%Y-%m-%d')
    .dt.month_name()
)
df.head()

Unnamed: 0,fl_date,carrier_name,origin,dest,crs_dep_time,dep_delay,taxi_out,taxi_in,crs_arr_time,arr_delay,crs_elapsed_time,distance,sch_dep_time,sch_arr_time,dep_timeframe,arr_timeframe,fl_month
0,2018-05-31,WN,BUR,SFO,1100,38.0,18.0,7.0,1220,51.0,80,326,11,12,6am-12pm,12pm-6pm,May
1,2019-04-20,AA,ORD,DFW,1311,-8.0,17.0,8.0,1544,-35.0,153,801,13,15,12pm-6pm,12pm-6pm,April
2,2019-02-13,WN,MSY,HOU,2130,29.0,5.0,4.0,2245,18.0,75,302,21,22,6pm-12am,6pm-12am,February
3,2018-11-20,AA,DFW,IND,1834,-1.0,30.0,10.0,2142,13.0,128,761,18,21,6pm-12am,6pm-12am,November
4,2018-08-28,DL,JFK,IAD,1359,-6.0,16.0,3.0,1537,-34.0,98,228,13,15,12pm-6pm,12pm-6pm,August


In [12]:
#Drop the raw date/time columns that the derived timeframe and month features replace
to_drop = ['fl_date', 'crs_dep_time', 'crs_arr_time', 'sch_dep_time', 'sch_arr_time']

df_sorted = df.drop(columns=to_drop)
df_sorted

Unnamed: 0,carrier_name,origin,dest,dep_delay,taxi_out,taxi_in,arr_delay,crs_elapsed_time,distance,dep_timeframe,arr_timeframe,fl_month
0,WN,BUR,SFO,38.0,18.0,7.0,51.0,80,326,6am-12pm,12pm-6pm,May
1,AA,ORD,DFW,-8.0,17.0,8.0,-35.0,153,801,12pm-6pm,12pm-6pm,April
2,WN,MSY,HOU,29.0,5.0,4.0,18.0,75,302,6pm-12am,6pm-12am,February
3,AA,DFW,IND,-1.0,30.0,10.0,13.0,128,761,6pm-12am,6pm-12am,November
4,DL,JFK,IAD,-6.0,16.0,3.0,-34.0,98,228,12pm-6pm,12pm-6pm,August
...,...,...,...,...,...,...,...,...,...,...,...,...
196188,DL,MLI,ATL,-9.0,13.0,15.0,-5.0,115,633,12pm-6pm,6pm-12am,January
196189,AA,TPA,MIA,27.0,13.0,89.0,112.0,71,204,12pm-6pm,12pm-6pm,July
196190,UA,EWR,DTW,-9.0,36.0,7.0,-23.0,137,488,6am-12pm,6am-12pm,November
196191,AS,SFO,PDX,-3.0,42.0,3.0,20.0,101,550,6am-12pm,6am-12pm,August


In [13]:
#Categorize arrival delay into delay categories
def delay_category(arr_delay):

    '''
    Categorize arrival delays (in minutes) into one of 4 categories.

    0 : early or no delay (delay <= 0)
    1 : slight delay      (more than 0, up to and including 5 mins)
    2 : medium delay      (more than 5, up to and including 30 mins)
    3 : long delay        (over 30 mins)

    Category 3 is the fall-through case, so a missing (NaN) delay is also
    reported as 3.

    Parameters
    ----------
    arr_delay : pandas Series (or other array-like) of numbers
        Arrival delay of each flight.

    Returns
    -------
    list of int
        One category code per input element, in input order.
    '''

    # Work on the underlying values instead of looking every element up by
    # its index label: label lookups are slow, and they return a Series
    # (which cannot be used in an `if`) when the index has duplicate labels.
    delays = np.asarray(arr_delay)

    # np.select uses the first condition that holds, so each later upper
    # bound only applies to values that exceeded the previous one.
    delay_cat = np.select(
        [delays <= 0, delays <= 5, delays <= 30],
        [0, 1, 2],
        default=3,
    )

    # Callers get a plain Python list of ints
    return delay_cat.tolist()

#Add the delay category (0-3) computed from 'arr_delay' as the new 'delay_status' column
df_sorted['delay_status'] = delay_category(df_sorted['arr_delay'])
df_sorted.head()

Unnamed: 0,carrier_name,origin,dest,dep_delay,taxi_out,taxi_in,arr_delay,crs_elapsed_time,distance,dep_timeframe,arr_timeframe,fl_month,delay_status
0,WN,BUR,SFO,38.0,18.0,7.0,51.0,80,326,6am-12pm,12pm-6pm,May,3
1,AA,ORD,DFW,-8.0,17.0,8.0,-35.0,153,801,12pm-6pm,12pm-6pm,April,0
2,WN,MSY,HOU,29.0,5.0,4.0,18.0,75,302,6pm-12am,6pm-12am,February,2
3,AA,DFW,IND,-1.0,30.0,10.0,13.0,128,761,6pm-12am,6pm-12am,November,2
4,DL,JFK,IAD,-6.0,16.0,3.0,-34.0,98,228,12pm-6pm,12pm-6pm,August,0


In [14]:
#Save the engineered features to a csv (without the row index) before moving on to data modeling
#NOTE(review): 'arr_delay', the column 'delay_status' was derived from, is still in the saved
#file - confirm the modeling step does not use it as an input feature
df_sorted.to_csv('flights_features.csv', index=False)