## Clean Delayed Flights Data

In [1]:
# Import all packages
import pandas as pd
import numpy as np
from datetime import datetime, date

In [2]:
# Read the raw flights CSV from disk into a DataFrame
flights_raw = '/Users/thaysmartinez/Documents/Flights Data Exploration/flights_raw.csv'
df_sample = pd.read_csv(filepath_or_buffer=flights_raw)

In [3]:
# Draw a reproducible random sample of 500,000 rows (fixed seed, without replacement)
df_sample = df_sample.sample(n=500_000, replace=False, random_state=42)

In [4]:
# Work on an independent (deep) copy so the sampled frame itself is never modified
flights = df_sample.copy(deep=True)

In [5]:
# Keep only the flights that were neither cancelled nor diverted, renumbering rows from 0
is_completed = (flights['CANCELLED'] == 0) & (flights['DIVERTED'] == 0)
df1 = flights.loc[is_completed].reset_index(drop=True)

In [6]:
# Summarise dtypes and per-column non-null counts.
# `show_counts` is the replacement for `null_counts`, which pandas deprecated in 1.2 and removed in 2.0.
df1.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491127 entries, 0 to 491126
Data columns (total 28 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   FL_DATE              491127 non-null  object 
 1   ORIGIN               491127 non-null  object 
 2   ORIGIN_CITY_NAME     491127 non-null  object 
 3   DEST                 491127 non-null  object 
 4   DEST_CITY_NAME       491127 non-null  object 
 5   CRS_DEP_TIME         491127 non-null  int64  
 6   DEP_TIME             491127 non-null  float64
 7   DEP_DELAY            491060 non-null  float64
 8   TAXI_OUT             491127 non-null  float64
 9   WHEELS_OFF           491127 non-null  float64
 10  WHEELS_ON            491127 non-null  float64
 11  TAXI_IN              491127 non-null  float64
 12  CRS_ARR_TIME         491127 non-null  int64  
 13  ARR_TIME             491127 non-null  float64
 14  ARR_DELAY            491091 non-null  float64
 15  CANCELLED        

In [7]:
# Raise the column display limit to 30 so wide frames are not truncated, then preview the first rows
pd.options.display.max_columns = 30
df1.head(n=5)

Unnamed: 0,FL_DATE,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
0,2019-01-30,FNT,"Flint, MI",ATL,"Atlanta, GA",600,554.0,-6.0,38.0,632.0,814.0,4.0,820,818.0,-2.0,0.0,,0.0,140.0,144.0,102.0,645.0,,,,,,
1,2019-10-18,CMH,"Columbus, OH",MIA,"Miami, FL",657,713.0,16.0,23.0,736.0,948.0,4.0,941,952.0,11.0,0.0,,0.0,164.0,159.0,132.0,990.0,,,,,,
2,2016-11-18,MIA,"Miami, FL",LAX,"Los Angeles, CA",730,731.0,1.0,17.0,748.0,948.0,10.0,1016,958.0,-18.0,0.0,,0.0,346.0,327.0,300.0,2342.0,,,,,,
3,2019-05-13,LRD,"Laredo, TX",DFW,"Dallas/Fort Worth, TX",1614,1616.0,2.0,15.0,1631.0,1730.0,8.0,1745,1738.0,-7.0,0.0,,0.0,91.0,82.0,59.0,396.0,,,,,,
4,2017-04-19,LAX,"Los Angeles, CA",MSY,"New Orleans, LA",925,940.0,15.0,16.0,956.0,1505.0,2.0,1507,1507.0,0.0,0.0,,0.0,222.0,207.0,189.0,1670.0,,,,,,


In [8]:
# Normalise every column label to lowercase
df1.columns = [label.lower() for label in df1.columns]

In [9]:
# Map the lowercased source labels to the column names used in the rest of the notebook
dict_columns = dict(
    fl_date='date',
    origin='dep_airport',
    origin_city_name='dep_city',
    dest='dest_airport',
    dest_city_name='dest_city',
    cancelled='fl_type',
    cancellation_code='canc_desc',
    actual_elapsed_time='elapsed_time',
    carrier_delay='Carrier',
    weather_delay='Weather',
    nas_delay='NAS',
    security_delay='Security',
    late_aircraft_delay='Late Aircraft',
)

# errors='raise' makes a missing source column fail loudly instead of being silently skipped
df1 = df1.rename(columns=dict_columns, errors='raise')

In [10]:
# Parse the flight date column into datetime values
df1 = df1.assign(date=pd.to_datetime(df1['date']))

In [11]:
# Convert the clock columns from numbers to zero-padded 4-character 'HHMM' strings
columns = ['crs_dep_time', 'dep_time', 'wheels_off', 
           'wheels_on', 'crs_arr_time', 'arr_time']


def converter(x):
    """Return column `x` as zero-padded 'HHMM' strings if it is listed in `columns`.

    Columns not named in `columns` are returned unchanged, so the function can be
    passed to DataFrame.apply over the whole frame.
    """
    if x.name not in columns:
        return x
    return (x.astype(str)
             # Drop the trailing '.0' left by float columns. regex=True is explicit because
             # pandas 2.0 changed the default to a literal match, which would leave '.0' in place.
             .str.replace(r'(\.0)$', '', regex=True)
             .str.zfill(4)
             # '2400' is not accepted by the '%H' directive (00-23); store it as '0000'
             .str.replace('2400', '0000', regex=False))


df1 = df1.apply(converter)

In [12]:
# Parse each 'HHMM' string in the clock columns into a datetime.time object
def _parse_hhmm(text):
    """Return the datetime.time encoded by an 'HHMM' string."""
    return datetime.strptime(text, '%H%M').time()


for column in columns:
    df1[column] = df1[column].map(_parse_hhmm)

In [15]:
# Cast the listed numeric columns to integers; missing values are replaced by 0 first
columns = [
    'dep_delay', 'taxi_out', 'taxi_in', 'arr_delay', 'fl_type', 'diverted',
    'crs_elapsed_time', 'elapsed_time', 'air_time', 'distance',
]


def convert_int(x):
    """Fill NaN with 0 and cast to int when `x` is listed in `columns`; otherwise return `x` as is."""
    if x.name in columns:
        return x.fillna(0).astype(int)
    return x


df1 = df1.apply(convert_int)

In [16]:
# Preview the first five rows after the type conversions
df1.head(n=5)

Unnamed: 0,date,dep_airport,dep_city,dest_airport,dest_city,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,fl_type,canc_desc,diverted,crs_elapsed_time,elapsed_time,air_time,distance,Carrier,Weather,NAS,Security,Late Aircraft,unnamed: 27
0,2019-01-30,FNT,"Flint, MI",ATL,"Atlanta, GA",06:00:00,05:54:00,-6,38,06:32:00,08:14:00,4,08:20:00,08:18:00,-2,0,,0,140,144,102,645,,,,,,
1,2019-10-18,CMH,"Columbus, OH",MIA,"Miami, FL",06:57:00,07:13:00,16,23,07:36:00,09:48:00,4,09:41:00,09:52:00,11,0,,0,164,159,132,990,,,,,,
2,2016-11-18,MIA,"Miami, FL",LAX,"Los Angeles, CA",07:30:00,07:31:00,1,17,07:48:00,09:48:00,10,10:16:00,09:58:00,-18,0,,0,346,327,300,2342,,,,,,
3,2019-05-13,LRD,"Laredo, TX",DFW,"Dallas/Fort Worth, TX",16:14:00,16:16:00,2,15,16:31:00,17:30:00,8,17:45:00,17:38:00,-7,0,,0,91,82,59,396,,,,,,
4,2017-04-19,LAX,"Los Angeles, CA",MSY,"New Orleans, LA",09:25:00,09:40:00,15,16,09:56:00,15:05:00,2,15:07:00,15:07:00,0,0,,0,222,207,189,1670,,,,,,


In [17]:
# Re-check dtypes and per-column non-null counts.
# `show_counts` is the replacement for `null_counts`, which pandas deprecated in 1.2 and removed in 2.0.
df1.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491127 entries, 0 to 491126
Data columns (total 28 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   date              491127 non-null  datetime64[ns]
 1   dep_airport       491127 non-null  object        
 2   dep_city          491127 non-null  object        
 3   dest_airport      491127 non-null  object        
 4   dest_city         491127 non-null  object        
 5   crs_dep_time      491127 non-null  object        
 6   dep_time          491127 non-null  object        
 7   dep_delay         491127 non-null  int64         
 8   taxi_out          491127 non-null  int64         
 9   wheels_off        491127 non-null  object        
 10  wheels_on         491127 non-null  object        
 11  taxi_in           491127 non-null  int64         
 12  crs_arr_time      491127 non-null  object        
 13  arr_time          491127 non-null  object        
 14  arr_

## Clean Cancelled Flights

In [18]:
# Start from a fresh, independent copy of the sampled frame (no file is read here)
df2 = df_sample.copy(deep=True)

In [19]:
# Allow up to 30 columns in the rendered output, then preview the first rows
pd.options.display.max_columns = 30
df2.head(n=5)

Unnamed: 0,FL_DATE,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
6482319,2019-01-30,FNT,"Flint, MI",ATL,"Atlanta, GA",600,554.0,-6.0,38.0,632.0,814.0,4.0,820,818.0,-2.0,0.0,,0.0,140.0,144.0,102.0,645.0,,,,,,
10664859,2019-10-18,CMH,"Columbus, OH",MIA,"Miami, FL",657,713.0,16.0,23.0,736.0,948.0,4.0,941,952.0,11.0,0.0,,0.0,164.0,159.0,132.0,990.0,,,,,,
19665024,2016-11-18,MIA,"Miami, FL",LAX,"Los Angeles, CA",730,731.0,1.0,17.0,748.0,948.0,10.0,1016,958.0,-18.0,0.0,,0.0,346.0,327.0,300.0,2342.0,,,,,,
9917058,2019-05-13,LRD,"Laredo, TX",DFW,"Dallas/Fort Worth, TX",1614,1616.0,2.0,15.0,1631.0,1730.0,8.0,1745,1738.0,-7.0,0.0,,0.0,91.0,82.0,59.0,396.0,,,,,,
5747941,2017-04-19,LAX,"Los Angeles, CA",MSY,"New Orleans, LA",925,940.0,15.0,16.0,956.0,1505.0,2.0,1507,1507.0,0.0,0.0,,0.0,222.0,207.0,189.0,1670.0,,,,,,


In [20]:
# Normalise every column label to lowercase
df2.columns = [label.lower() for label in df2.columns]

In [21]:
# Keep only the rows flagged as cancelled, renumbering rows from 0
is_cancelled = df2['cancelled'] == 1
df2 = df2.loc[is_cancelled].reset_index(drop=True)

In [22]:
# Map the lowercased source labels to the column names used in the rest of the notebook
dict_columns = dict(
    fl_date='date',
    origin='dep_airport',
    origin_city_name='dep_city',
    dest='dest_airport',
    dest_city_name='dest_city',
    cancelled='fl_type',
    cancellation_code='canc_desc',
    actual_elapsed_time='elapsed_time',
    carrier_delay='Carrier',
    weather_delay='Weather',
    nas_delay='NAS',
    security_delay='Security',
    late_aircraft_delay='Late Aircraft',
)

# errors='raise' makes a missing source column fail loudly instead of being silently skipped
df2 = df2.rename(columns=dict_columns, errors='raise')

In [23]:
# Parse the flight date column into datetime values
df2 = df2.assign(date=pd.to_datetime(df2['date']))

In [24]:
# Turn the scheduled clock columns into zero-padded 4-character 'HHMM' strings
columns = ['crs_dep_time', 'crs_arr_time']


def converter(x):
    """Return column `x` as zero-padded 'HHMM' strings if it is listed in `columns`; else unchanged."""
    if x.name not in columns:
        return x
    padded = x.astype(str).str.zfill(4)
    # '2400' is rewritten to '0000' so the later '%H%M' parse accepts it
    return padded.str.replace('2400', '0000')


df2 = df2.apply(converter)

In [25]:
# Parse each 'HHMM' string in the scheduled clock columns into a datetime.time object
def _parse_hhmm(text):
    """Return the datetime.time encoded by an 'HHMM' string."""
    return datetime.strptime(text, '%H%M').time()


for column in columns:
    df2[column] = df2[column].map(_parse_hhmm)

In [None]:
# Scheduled duration = scheduled arrival clock time minus scheduled departure clock time.
# Both times are anchored on the same dummy date (date.min), so an arrival clock time
# earlier than the departure clock time yields a negative duration.
# NOTE(review): this overwrites any existing 'crs_elapsed_time' values — confirm that is
# intended, and that the clock times share a time zone.
def time_diff(x):
    """Attach the time-of-day `x` to date.min, producing a full datetime."""
    return datetime.combine(date.min, x)


scheduled_arrival = df2['crs_arr_time'].apply(time_diff)
scheduled_departure = df2['crs_dep_time'].apply(time_diff)
df2['crs_elapsed_time'] = scheduled_arrival - scheduled_departure

In [None]:
# Express every elapsed duration as a number of minutes (seconds / 60)
ls_minutes = [delta.total_seconds() / 60 for delta in df2['crs_elapsed_time']]

df2['crs_elapsed_time'] = ls_minutes

In [None]:
# Cast the listed columns to integers; every other column passes through untouched
columns = ['crs_elapsed_time', 'distance']


def converter(x):
    """Cast `x` to int when it is listed in `columns`; otherwise return it unchanged."""
    if x.name in columns:
        return x.astype(int)
    return x


df2 = df2.apply(converter)

In [26]:
# Preview the first five rows of the cancelled-flights frame
df2.head(n=5)

Unnamed: 0,date,dep_airport,dep_city,dest_airport,dest_city,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,fl_type,canc_desc,diverted,crs_elapsed_time,elapsed_time,air_time,distance,Carrier,Weather,NAS,Security,Late Aircraft,unnamed: 27
0,2018-12-28,IAH,"Houston, TX",STL,"St. Louis, MO",14:55:00,,,,,,,17:02:00,,,1.0,A,0.0,127.0,,,667.0,,,,,,
1,2016-08-09,ATL,"Atlanta, GA",PHL,"Philadelphia, PA",16:36:00,,,,,,,18:56:00,,,1.0,A,0.0,140.0,,,666.0,,,,,,
2,2018-09-14,CLT,"Charlotte, NC",ILM,"Wilmington, NC",14:27:00,,,,,,,15:26:00,,,1.0,B,0.0,59.0,,,185.0,,,,,,
3,2019-04-25,LAX,"Los Angeles, CA",SFO,"San Francisco, CA",08:15:00,,,,,,,09:40:00,,,1.0,A,0.0,85.0,,,337.0,,,,,,
4,2017-01-09,RNO,"Reno, NV",LAX,"Los Angeles, CA",19:40:00,,,,,,,21:19:00,,,1.0,C,0.0,99.0,,,391.0,,,,,,


In [27]:
# Set every column that is not used in this frame to NaN
unused_columns = [
    'dep_time', 'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on',
    'taxi_in', 'arr_time', 'arr_delay', 'elapsed_time', 'air_time',
    'Carrier', 'Weather', 'NAS', 'Security', 'Late Aircraft',
]
df2[unused_columns] = np.nan

In [28]:
# Replace the one-letter cancellation codes with readable descriptions
dict_desc = {'A': 'Carrier',
             'B': 'Weather',
             'C': 'NAS',
             'D': 'Security'}

# Assign the result back rather than calling replace(inplace=True) on df2['canc_desc']:
# the in-place call acts on an intermediate Series, which leaves df2 unchanged when
# pandas Copy-on-Write is active. Codes missing from dict_desc are kept as they are.
df2['canc_desc'] = df2['canc_desc'].replace(dict_desc)

In [29]:
# Preview the first five rows after the cancellation codes were replaced
df2.head(n=5)

Unnamed: 0,date,dep_airport,dep_city,dest_airport,dest_city,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,fl_type,canc_desc,diverted,crs_elapsed_time,elapsed_time,air_time,distance,Carrier,Weather,NAS,Security,Late Aircraft,unnamed: 27
0,2018-12-28,IAH,"Houston, TX",STL,"St. Louis, MO",14:55:00,,,,,,,17:02:00,,,1.0,Carrier,0.0,127.0,,,667.0,,,,,,
1,2016-08-09,ATL,"Atlanta, GA",PHL,"Philadelphia, PA",16:36:00,,,,,,,18:56:00,,,1.0,Carrier,0.0,140.0,,,666.0,,,,,,
2,2018-09-14,CLT,"Charlotte, NC",ILM,"Wilmington, NC",14:27:00,,,,,,,15:26:00,,,1.0,Weather,0.0,59.0,,,185.0,,,,,,
3,2019-04-25,LAX,"Los Angeles, CA",SFO,"San Francisco, CA",08:15:00,,,,,,,09:40:00,,,1.0,Carrier,0.0,85.0,,,337.0,,,,,,
4,2017-01-09,RNO,"Reno, NV",LAX,"Los Angeles, CA",19:40:00,,,,,,,21:19:00,,,1.0,NAS,0.0,99.0,,,391.0,,,,,,


In [30]:
# Stack df1 and df2 (in that order) into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which pandas deprecated in 1.4 and removed in 2.0.
flights = pd.concat([df1, df2], ignore_index=True)

In [31]:
# Print the column dtypes and memory summary of the combined frame
flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498818 entries, 0 to 498817
Data columns (total 28 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   date              498818 non-null  datetime64[ns]
 1   dep_airport       498818 non-null  object        
 2   dep_city          498818 non-null  object        
 3   dest_airport      498818 non-null  object        
 4   dest_city         498818 non-null  object        
 5   crs_dep_time      498818 non-null  object        
 6   dep_time          491127 non-null  object        
 7   dep_delay         491127 non-null  float64       
 8   taxi_out          491127 non-null  float64       
 9   wheels_off        491127 non-null  object        
 10  wheels_on         491127 non-null  object        
 11  taxi_in           491127 non-null  float64       
 12  crs_arr_time      498818 non-null  object        
 13  arr_time          491127 non-null  object        
 14  arr_

In [32]:
# Split each city column on ',' into two columns: the part before and the part after the comma
dep_parts = flights['dep_city'].str.split(',', expand=True)
flights[['dep_city', 'dep_state']] = dep_parts

dest_parts = flights['dest_city'].str.split(',', expand=True)
flights[['dest_city', 'dest_state']] = dest_parts

In [33]:
# Trim leading and trailing whitespace from both state columns
for state_column in ('dep_state', 'dest_state'):
    flights[state_column] = flights[state_column].str.strip()

In [34]:
# Classify every flight as 'Cancelled', 'On time' or 'Delayed'.
# Conditions are evaluated in order, first match wins:
#   1. fl_type == 1      -> 'Cancelled'
#   2. arr_delay < 15    -> 'On time'
#   3. anything else     -> 'Delayed' (this includes a missing arr_delay, since NaN < 15 is False)
# np.select does this in one vectorised pass instead of a Python loop over every row.
ls_types = np.select(
    [(flights['fl_type'] == 1).to_numpy(), (flights['arr_delay'] < 15).to_numpy()],
    ['Cancelled', 'On time'],
    default='Delayed',
)

flights['fl_type'] = ls_types

In [35]:
# Derive calendar features from the flight date with the vectorised .dt accessor
# (replaces one Python-level lambda call per row)
flights['year'] = flights['date'].dt.year

flights['month'] = flights['date'].dt.month

# ISO weekday numbering: Monday=1 ... Sunday=7 (dt.dayofweek counts from Monday=0)
flights['weekday'] = flights['date'].dt.dayofweek + 1

In [43]:
# Final column order. Any column not listed here (such as 'unnamed: 27') is dropped.
ordered_columns = [
    # calendar
    'date', 'year', 'month', 'weekday',
    # departure / destination
    'dep_airport', 'dep_city', 'dep_state',
    'dest_airport', 'dest_city', 'dest_state',
    # clock times, delays and taxi times
    'crs_dep_time', 'dep_time', 'dep_delay',
    'taxi_out', 'wheels_off', 'wheels_on',
    'taxi_in', 'crs_arr_time', 'arr_time',
    'arr_delay',
    # flight status
    'fl_type', 'canc_desc',
    # durations and distance
    'crs_elapsed_time', 'elapsed_time',
    'air_time', 'distance',
    # delay columns
    'Carrier', 'Weather', 'NAS', 'Security',
    'Late Aircraft',
]

flights = flights.loc[:, ordered_columns]

In [45]:
# Final check of dtypes and per-column non-null counts.
# `show_counts` is the replacement for `null_counts`, which pandas deprecated in 1.2 and removed in 2.0.
flights.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498818 entries, 0 to 498817
Data columns (total 31 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   date              498818 non-null  datetime64[ns]
 1   year              498818 non-null  int64         
 2   month             498818 non-null  int64         
 3   weekday           498818 non-null  int64         
 4   dep_airport       498818 non-null  object        
 5   dep_city          498818 non-null  object        
 6   dep_state         498818 non-null  object        
 7   dest_airport      498818 non-null  object        
 8   dest_city         498818 non-null  object        
 9   dest_state        498818 non-null  object        
 10  crs_dep_time      498818 non-null  object        
 11  dep_time          491127 non-null  object        
 12  dep_delay         491127 non-null  float64       
 13  taxi_out          491127 non-null  float64       
 14  whee

In [46]:
# Export the frame to '<dest_folder>flights<extension>' as UTF-8, without the index column
dest_folder = '/Users/thaysmartinez/Documents/Flights Data Exploration/'
extension = '.csv'
output_path = f'{dest_folder}flights{extension}'
flights.to_csv(output_path, index=False, encoding='utf-8')