## Clean Delayed Flights Data

In [1]:
# Import all packages
import pandas as pd
import numpy as np
from datetime import datetime, date

In [2]:
# Path of the raw flights extract, then load the whole file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path.
flights_raw = '/Users/thaysmartinez/Documents/Flights Data Exploration/flights_raw.csv'
delayed = pd.read_csv(filepath_or_buffer=flights_raw)

In [3]:
# Keep only flights that were not cancelled (CANCELLED == 0) and whose
# ARR_DELAY is at least 15.
# `.copy()` detaches the result from the raw frame: later cells assign columns
# on `delayed`, and doing that on a filtered slice can trigger
# SettingWithCopyWarning.
delayed = delayed.query('CANCELLED == 0 & ARR_DELAY >= 15').copy()

In [4]:
# Summarise dtypes and non-null counts. Counts are requested explicitly because
# pandas omits them by default on frames this large.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2 and
# removed in pandas 2.0.
delayed.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2741963 entries, 0 to 14635477
Data columns (total 28 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   FL_DATE              2741963 non-null  object 
 1   ORIGIN               2741963 non-null  object 
 2   ORIGIN_CITY_NAME     2741963 non-null  object 
 3   DEST                 2741963 non-null  object 
 4   DEST_CITY_NAME       2741963 non-null  object 
 5   CRS_DEP_TIME         2741963 non-null  int64  
 6   DEP_TIME             2741963 non-null  float64
 7   DEP_DELAY            2741628 non-null  float64
 8   TAXI_OUT             2741963 non-null  float64
 9   WHEELS_OFF           2741963 non-null  float64
 10  WHEELS_ON            2741963 non-null  float64
 11  TAXI_IN              2741963 non-null  float64
 12  CRS_ARR_TIME         2741963 non-null  int64  
 13  ARR_TIME             2741963 non-null  float64
 14  ARR_DELAY            2741963 non-null  float64
 1

In [5]:
# Raise the column display limit to 30, then preview the first five rows
pd.set_option('display.max_columns', 30)
delayed.head(n=5)

Unnamed: 0,FL_DATE,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
0,2019-03-01,CVG,"Cincinnati, OH",MSP,"Minneapolis, MN",1534,1617.0,43.0,28.0,1645.0,1729.0,18.0,1648,1747.0,59.0,0.0,,0.0,134.0,150.0,104.0,596.0,0.0,0.0,59.0,0.0,0.0,
1,2019-03-01,JFK,"New York, NY",BUF,"Buffalo, NY",940,1035.0,55.0,56.0,1131.0,1228.0,8.0,1114,1236.0,82.0,0.0,,0.0,94.0,121.0,57.0,301.0,0.0,55.0,27.0,0.0,0.0,
2,2019-03-01,MSN,"Madison, WI",LGA,"New York, NY",1845,1954.0,69.0,14.0,2008.0,2259.0,13.0,2158,2312.0,74.0,0.0,,0.0,133.0,138.0,111.0,812.0,0.0,0.0,6.0,0.0,68.0,
4,2019-03-01,JFK,"New York, NY",BUF,"Buffalo, NY",1336,1530.0,114.0,30.0,1600.0,1652.0,6.0,1513,1658.0,105.0,0.0,,0.0,97.0,88.0,52.0,301.0,35.0,0.0,0.0,0.0,70.0,
6,2019-03-01,ATL,"Atlanta, GA",TYS,"Knoxville, TN",1340,1425.0,45.0,18.0,1443.0,1517.0,3.0,1438,1520.0,42.0,0.0,,0.0,58.0,55.0,34.0,152.0,42.0,0.0,0.0,0.0,0.0,


In [6]:
# Normalise every column label to lower case
delayed.columns = [str.lower(label) for label in delayed.columns]

In [7]:
# Map the lower-cased source column names to the names used in the rest of
# the notebook
dict_columns = {
    'fl_date': 'date',
    'origin': 'dep_airport',
    'origin_city_name': 'dep_city',
    'dest': 'dest_airport',
    'dest_city_name': 'dest_city',
    'cancelled': 'fl_type',
    'cancellation_code': 'canc_desc',
    'actual_elapsed_time': 'elapsed_time',
    'carrier_delay': 'Carrier',
    'weather_delay': 'Weather',
    'nas_delay': 'NAS',
    'security_delay': 'Security',
    'late_aircraft_delay': 'Late Aircraft',
}

# errors='raise' makes a missing source column fail loudly instead of being
# silently skipped
delayed = delayed.rename(columns=dict_columns, errors='raise')

In [8]:
# Parse the flight date column into pandas datetimes
delayed = delayed.assign(date=pd.to_datetime(delayed['date']))

In [9]:
# Turn the clock-time columns into zero-padded 'HHMM' strings
columns = ['crs_dep_time', 'dep_time', 'wheels_off',
           'wheels_on', 'crs_arr_time', 'arr_time']


def converter(x):
    """Return column `x` as 'HHMM' text if it is a clock-time column.

    Columns whose name is not in `columns` are returned unchanged. Otherwise
    the values are cast to str, a trailing '.0' (left by float-typed columns
    such as 1617.0) is removed, the text is left-padded with zeros to four
    characters, and '2400' is rewritten as '0000' because the '%H' directive
    used to parse these strings in the following cell only accepts hours 00-23.
    """
    if x.name not in columns:
        return x
    return (x.astype(str)
             # regex=True is required: since pandas 2.0 the default is a
             # literal match, which would leave the '.0' suffix in place.
             .str.replace(r'(\.0)$', '', regex=True)
             .str.zfill(4)
             .str.replace('2400', '0000', regex=False))


delayed = delayed.apply(converter)

In [10]:
# Parse each 'HHMM' string into a datetime.time value
def _hhmm_to_time(text):
    """Parse an 'HHMM' string and return its time-of-day component."""
    return datetime.strptime(text, '%H%M').time()


for name in columns:
    delayed[name] = delayed[name].map(_hhmm_to_time)

In [21]:
# Cast the numeric measure columns to integers
columns = ['dep_delay', 'taxi_out', 'taxi_in', 'arr_delay', 'fl_type', 'diverted',
           'crs_elapsed_time', 'elapsed_time', 'air_time', 'distance']


def convert_int(x):
    """Return column `x` as int, with missing values replaced by 0 first.

    Columns whose name is not in `columns` are returned unchanged.
    """
    if x.name in columns:
        # Missing values (dep_delay has some) become 0 so the cast succeeds
        return x.fillna(0).astype(int)
    return x


delayed = delayed.apply(convert_int)

In [22]:
# Preview the first five rows after the type conversions
delayed.head(n=5)

Unnamed: 0,date,dep_airport,dep_city,dest_airport,dest_city,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,fl_type,canc_desc,diverted,crs_elapsed_time,elapsed_time,air_time,distance,Carrier,Weather,NAS,Security,Late Aircraft,unnamed: 27
0,2019-03-01,CVG,"Cincinnati, OH",MSP,"Minneapolis, MN",15:34:00,16:17:00,43,28,16:45:00,17:29:00,18,16:48:00,17:47:00,59,0,,0,134,150,104,596,0.0,0.0,59.0,0.0,0.0,
1,2019-03-01,JFK,"New York, NY",BUF,"Buffalo, NY",09:40:00,10:35:00,55,56,11:31:00,12:28:00,8,11:14:00,12:36:00,82,0,,0,94,121,57,301,0.0,55.0,27.0,0.0,0.0,
2,2019-03-01,MSN,"Madison, WI",LGA,"New York, NY",18:45:00,19:54:00,69,14,20:08:00,22:59:00,13,21:58:00,23:12:00,74,0,,0,133,138,111,812,0.0,0.0,6.0,0.0,68.0,
4,2019-03-01,JFK,"New York, NY",BUF,"Buffalo, NY",13:36:00,15:30:00,114,30,16:00:00,16:52:00,6,15:13:00,16:58:00,105,0,,0,97,88,52,301,35.0,0.0,0.0,0.0,70.0,
6,2019-03-01,ATL,"Atlanta, GA",TYS,"Knoxville, TN",13:40:00,14:25:00,45,18,14:43:00,15:17:00,3,14:38:00,15:20:00,42,0,,0,58,55,34,152,42.0,0.0,0.0,0.0,0.0,


In [23]:
# Re-check dtypes and non-null counts after cleaning.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2 and
# removed in pandas 2.0.
delayed.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2741963 entries, 0 to 14635477
Data columns (total 28 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   date              2741963 non-null  datetime64[ns]
 1   dep_airport       2741963 non-null  object        
 2   dep_city          2741963 non-null  object        
 3   dest_airport      2741963 non-null  object        
 4   dest_city         2741963 non-null  object        
 5   crs_dep_time      2741963 non-null  object        
 6   dep_time          2741963 non-null  object        
 7   dep_delay         2741963 non-null  int64         
 8   taxi_out          2741963 non-null  int64         
 9   wheels_off        2741963 non-null  object        
 10  wheels_on         2741963 non-null  object        
 11  taxi_in           2741963 non-null  int64         
 12  crs_arr_time      2741963 non-null  object        
 13  arr_time          2741963 non-null  objec

## Clean Cancelled Flights

In [30]:
# Load the raw flights file again, this time as the starting point for the
# cancelled-flights table
cancelled = pd.read_csv(filepath_or_buffer=flights_raw)

In [31]:
# Raise the column display limit to 30, then preview the first five rows
pd.set_option('display.max_columns', 30)
cancelled.head(n=5)

Unnamed: 0,FL_DATE,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
0,2019-03-01,CVG,"Cincinnati, OH",MSP,"Minneapolis, MN",1534,1617.0,43.0,28.0,1645.0,1729.0,18.0,1648,1747.0,59.0,0.0,,0.0,134.0,150.0,104.0,596.0,0.0,0.0,59.0,0.0,0.0,
1,2019-03-01,JFK,"New York, NY",BUF,"Buffalo, NY",940,1035.0,55.0,56.0,1131.0,1228.0,8.0,1114,1236.0,82.0,0.0,,0.0,94.0,121.0,57.0,301.0,0.0,55.0,27.0,0.0,0.0,
2,2019-03-01,MSN,"Madison, WI",LGA,"New York, NY",1845,1954.0,69.0,14.0,2008.0,2259.0,13.0,2158,2312.0,74.0,0.0,,0.0,133.0,138.0,111.0,812.0,0.0,0.0,6.0,0.0,68.0,
3,2019-03-01,TYS,"Knoxville, TN",DTW,"Detroit, MI",735,729.0,-6.0,15.0,744.0,855.0,8.0,925,903.0,-22.0,0.0,,0.0,110.0,94.0,71.0,443.0,,,,,,
4,2019-03-01,JFK,"New York, NY",BUF,"Buffalo, NY",1336,1530.0,114.0,30.0,1600.0,1652.0,6.0,1513,1658.0,105.0,0.0,,0.0,97.0,88.0,52.0,301.0,35.0,0.0,0.0,0.0,70.0,


In [32]:
# Normalise every column label to lower case
cancelled.columns = [str.lower(label) for label in cancelled.columns]

In [33]:
# Keep only the cancelled flights (cancelled == 1).
# `.copy()` detaches the result from the raw frame: later cells assign columns
# on `cancelled`, and doing that on a filtered slice can trigger
# SettingWithCopyWarning.
cancelled = cancelled.query('cancelled == 1').copy()

In [34]:
# Map the lower-cased source column names to the names used in the rest of
# the notebook
dict_columns = {
    'fl_date': 'date',
    'origin': 'dep_airport',
    'origin_city_name': 'dep_city',
    'dest': 'dest_airport',
    'dest_city_name': 'dest_city',
    'cancelled': 'fl_type',
    'cancellation_code': 'canc_desc',
    'actual_elapsed_time': 'elapsed_time',
    'carrier_delay': 'Carrier',
    'weather_delay': 'Weather',
    'nas_delay': 'NAS',
    'security_delay': 'Security',
    'late_aircraft_delay': 'Late Aircraft',
}

# errors='raise' makes a missing source column fail loudly instead of being
# silently skipped
cancelled = cancelled.rename(columns=dict_columns, errors='raise')

In [35]:
# Parse the flight date column into pandas datetimes
cancelled = cancelled.assign(date=pd.to_datetime(cancelled['date']))

In [36]:
# Turn the scheduled clock-time columns into zero-padded 'HHMM' strings
columns = ['crs_dep_time', 'crs_arr_time']


def converter(x):
    """Return column `x` as 'HHMM' text if it is a scheduled-time column.

    Columns whose name is not in `columns` are returned unchanged. Otherwise
    the values are cast to str, left-padded with zeros to four characters,
    and '2400' is rewritten as '0000'.
    """
    if x.name not in columns:
        return x
    as_text = x.astype(str)
    padded = as_text.str.zfill(4)
    return padded.str.replace('2400', '0000')


cancelled = cancelled.apply(converter)

In [37]:
# Parse each 'HHMM' string into a datetime.time value
for name in columns:
    cancelled[name] = [
        datetime.strptime(text, '%H%M').time() for text in cancelled[name]
    ]

In [38]:
# Scheduled duration = scheduled arrival time minus scheduled departure time.
# A time-of-day cannot be subtracted directly, so both are anchored on the
# same placeholder date (date.min) first.
# NOTE(review): because both sides share one date, an arrival clock time that
# is earlier than the departure clock time (e.g. landing after midnight)
# produces a negative duration. Presumably these are local clock times, so
# time-zone differences are not accounted for either - confirm whether the
# raw crs_elapsed_time values should be kept instead of being overwritten.
def time_diff(x):
    """Combine the time-of-day `x` with date.min into a full datetime."""
    return datetime.combine(date.min, x)


sched_arrival = cancelled['crs_arr_time'].apply(time_diff)
sched_departure = cancelled['crs_dep_time'].apply(time_diff)
cancelled['crs_elapsed_time'] = sched_arrival - sched_departure

In [39]:
# Express each scheduled-duration timedelta as a number of minutes
cancelled['crs_elapsed_time'] = [
    delta.total_seconds() / 60 for delta in cancelled['crs_elapsed_time']
]

In [40]:
# Cast the listed columns to integers
columns = ['fl_type', 'diverted', 'crs_elapsed_time', 'distance']


def converter(x):
    """Return column `x` cast to int if its name is in `columns`, else unchanged."""
    if x.name in columns:
        return x.astype(int)
    return x


cancelled = cancelled.apply(converter)

In [41]:
# Blank out the columns that are not used for cancelled flights
unused_columns = [
    'dep_time', 'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on',
    'taxi_in', 'arr_time', 'arr_delay', 'elapsed_time', 'air_time',
    'Carrier', 'Weather', 'NAS', 'Security', 'Late Aircraft',
]
for name in unused_columns:
    cancelled[name] = np.nan

In [42]:
# Replace cancellation codes with descriptions
dict_desc = {'A': 'Carrier',
             'B': 'Weather',
             'C': 'NAS',
             'D': 'Security'}

# Map row by row so the result always has one value per row. Building a list
# that skips unknown or missing codes would produce fewer values than rows
# and make the assignment fail. Codes without a description (and missing
# values) are kept as they are.
canc_codes = cancelled['canc_desc']
cancelled['canc_desc'] = canc_codes.map(dict_desc).fillna(canc_codes)

In [43]:
# Stack the delayed and cancelled tables into one frame with a fresh 0..n-1
# index. pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
flights = pd.concat([delayed, cancelled], ignore_index=True)

In [44]:
# Split 'City, ST' values into separate city and state columns.
# Splitting once from the right always yields at most two parts, so a city
# name that itself contains a comma cannot produce extra columns and break
# the two-column assignment. The state part keeps its leading space here.
flights[['dep_city', 'dep_state']] = flights['dep_city'].str.rsplit(',', n=1, expand=True)
flights[['dest_city', 'dest_state']] = flights['dest_city'].str.rsplit(',', n=1, expand=True)

In [45]:
# Remove surrounding whitespace from both state columns
for state_column in ('dep_state', 'dest_state'):
    flights[state_column] = flights[state_column].str.strip()

In [46]:
# Replace the numeric flight-type flag with a text label:
# 0 becomes 'Delayed', any other value becomes 'Cancelled'
flights['fl_type'] = np.where(flights['fl_type'] == 0, 'Delayed', 'Cancelled')

In [47]:
# Replace the numeric diverted flag with a text label:
# 0 becomes 'No', any other value becomes 'Yes'
flights['diverted'] = np.where(flights['diverted'] == 0, 'No', 'Yes')

In [48]:
# Break the flight date down into year, abbreviated month name and ISO weekday
# number. The vectorized .dt accessors replace a Python-level call per row.
flight_dates = flights['date']

flights['year'] = flight_dates.dt.year

flights['month'] = flight_dates.dt.strftime('%b')

# dayofweek counts Monday as 0; adding 1 gives the ISO numbering (Mon=1..Sun=7)
flights['weekday'] = flight_dates.dt.dayofweek + 1

In [49]:
# Translate ISO weekday numbers (1 = Monday ... 7 = Sunday) into short names
dict_weekday = {1: 'Mon',
                2: 'Tue',
                3: 'Wed',
                4: 'Thu',
                5: 'Fri',
                6: 'Sat',
                7: 'Sun'}

# Map row by row so the result always has one value per row. Building a list
# that skips values missing from the dictionary would produce fewer values
# than rows and make the assignment fail; unmapped values become missing
# instead. The names are stored as a categorical column.
flights['weekday'] = flights['weekday'].map(dict_weekday).astype('category')

In [52]:
# Final column layout: only the columns listed here are kept, in this order
ordered_columns = [
    'year', 'month', 'weekday',
    'dep_airport', 'dep_state',
    'dest_airport', 'dest_state',
    'crs_dep_time', 'dep_delay', 'taxi_out', 'taxi_in',
    'crs_arr_time', 'arr_delay',
    'fl_type', 'canc_desc', 'diverted',
    'crs_elapsed_time', 'air_time',
    'Carrier', 'Weather', 'NAS', 'Security', 'Late Aircraft',
]

flights = flights.loc[:, ordered_columns]

In [53]:
# Preview the first five rows of the combined table
flights.head(n=5)

Unnamed: 0,year,month,weekday,dep_airport,dep_state,dest_airport,dest_state,crs_dep_time,dep_delay,taxi_out,taxi_in,crs_arr_time,arr_delay,fl_type,canc_desc,diverted,crs_elapsed_time,air_time,Carrier,Weather,NAS,Security,Late Aircraft
0,2019,Mar,Fri,CVG,OH,MSP,MN,15:34:00,43.0,28.0,18.0,16:48:00,59.0,Delayed,,No,134,104.0,0.0,0.0,59.0,0.0,0.0
1,2019,Mar,Fri,JFK,NY,BUF,NY,09:40:00,55.0,56.0,8.0,11:14:00,82.0,Delayed,,No,94,57.0,0.0,55.0,27.0,0.0,0.0
2,2019,Mar,Fri,MSN,WI,LGA,NY,18:45:00,69.0,14.0,13.0,21:58:00,74.0,Delayed,,No,133,111.0,0.0,0.0,6.0,0.0,68.0
3,2019,Mar,Fri,JFK,NY,BUF,NY,13:36:00,114.0,30.0,6.0,15:13:00,105.0,Delayed,,No,97,52.0,35.0,0.0,0.0,0.0,70.0
4,2019,Mar,Fri,ATL,GA,TYS,TN,13:40:00,45.0,18.0,3.0,14:38:00,42.0,Delayed,,No,58,34.0,42.0,0.0,0.0,0.0,0.0


In [54]:
# Check dtypes and non-null counts of the combined table.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2 and
# removed in pandas 2.0.
flights.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2993472 entries, 0 to 2993471
Data columns (total 23 columns):
 #   Column            Non-Null Count    Dtype   
---  ------            --------------    -----   
 0   year              2993472 non-null  int64   
 1   month             2993472 non-null  object  
 2   weekday           2993472 non-null  category
 3   dep_airport       2993472 non-null  object  
 4   dep_state         2993472 non-null  object  
 5   dest_airport      2993472 non-null  object  
 6   dest_state        2993472 non-null  object  
 7   crs_dep_time      2993472 non-null  object  
 8   dep_delay         2741963 non-null  float64 
 9   taxi_out          2741963 non-null  float64 
 10  taxi_in           2741963 non-null  float64 
 11  crs_arr_time      2993472 non-null  object  
 12  arr_delay         2741963 non-null  float64 
 13  fl_type           2993472 non-null  object  
 14  canc_desc         251509 non-null   object  
 15  diverted          2993472 non-nu

In [55]:
# Write the cleaned table to '<dest_folder>flights<extension>' as UTF-8 CSV,
# without the index column.
# NOTE(review): the destination is an absolute, machine-specific path.
dest_folder = '/Users/thaysmartinez/Documents/Flights Data Exploration/'
extension = '.csv'
output_path = '%sflights%s' % (dest_folder, extension)
flights.to_csv(output_path, index=False, encoding='utf-8')