## Clean Delayed Flights Data

In [1]:
# Import all packages
import pandas as pd
import numpy as np
from datetime import datetime, date

In [2]:
flights_raw = '/Users/thaysmartinez/Documents/Flights Data Exploration/flights_raw.csv'
df_delayed = pd.read_csv(flights_raw)

In [3]:
# Filter DataFrame with delayed flights
df_delayed = df_delayed.query('CANCELLED == 0 & ARR_DELAY > 0')

In [4]:
df_delayed.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5017439 entries, 0 to 14635477
Data columns (total 28 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   FL_DATE              5017439 non-null  object 
 1   ORIGIN               5017439 non-null  object 
 2   ORIGIN_CITY_NAME     5017439 non-null  object 
 3   DEST                 5017439 non-null  object 
 4   DEST_CITY_NAME       5017439 non-null  object 
 5   CRS_DEP_TIME         5017439 non-null  int64  
 6   DEP_TIME             5017439 non-null  float64
 7   DEP_DELAY            5016242 non-null  float64
 8   TAXI_OUT             5017439 non-null  float64
 9   WHEELS_OFF           5017439 non-null  float64
 10  WHEELS_ON            5017439 non-null  float64
 11  TAXI_IN              5017439 non-null  float64
 12  CRS_ARR_TIME         5017439 non-null  int64  
 13  ARR_TIME             5017439 non-null  float64
 14  ARR_DELAY            5017439 non-null  float64
 1

In [5]:
pd.set_option('display.max_columns', 30)
df_delayed.head()

Unnamed: 0,FL_DATE,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
0,2019-03-01,CVG,"Cincinnati, OH",MSP,"Minneapolis, MN",1534,1617.0,43.0,28.0,1645.0,1729.0,18.0,1648,1747.0,59.0,0.0,,0.0,134.0,150.0,104.0,596.0,0.0,0.0,59.0,0.0,0.0,
1,2019-03-01,JFK,"New York, NY",BUF,"Buffalo, NY",940,1035.0,55.0,56.0,1131.0,1228.0,8.0,1114,1236.0,82.0,0.0,,0.0,94.0,121.0,57.0,301.0,0.0,55.0,27.0,0.0,0.0,
2,2019-03-01,MSN,"Madison, WI",LGA,"New York, NY",1845,1954.0,69.0,14.0,2008.0,2259.0,13.0,2158,2312.0,74.0,0.0,,0.0,133.0,138.0,111.0,812.0,0.0,0.0,6.0,0.0,68.0,
4,2019-03-01,JFK,"New York, NY",BUF,"Buffalo, NY",1336,1530.0,114.0,30.0,1600.0,1652.0,6.0,1513,1658.0,105.0,0.0,,0.0,97.0,88.0,52.0,301.0,35.0,0.0,0.0,0.0,70.0,
6,2019-03-01,ATL,"Atlanta, GA",TYS,"Knoxville, TN",1340,1425.0,45.0,18.0,1443.0,1517.0,3.0,1438,1520.0,42.0,0.0,,0.0,58.0,55.0,34.0,152.0,42.0,0.0,0.0,0.0,0.0,


In [6]:
# Lowercase columns
df_delayed.columns = map(str.lower, df_delayed.columns)

In [7]:
# Rename columns
df_delayed.rename(columns={'origin': 'origin_airport', 
                           'dest': 'dest_airport', 
                           'origin_city_name': 'origin_city', 
                           'dest_city_name': 'dest_city', 
                           'dep_time': 'actual_dep_time', 
                           'arr_time': 'actual_arr_time',
                           'cancellation_code': 'cancellation_desc'},
                  errors='raise', inplace=True)

In [8]:
# Convert flight date to datetime data type
df_delayed['fl_date'] = pd.to_datetime(df_delayed['fl_date'])

In [9]:
# Convert columns from numeric to object types and pad zeroes
columns = ['crs_dep_time', 'actual_dep_time', 'wheels_off', 
           'wheels_on', 'crs_arr_time', 'actual_arr_time']

converter = (lambda x: x.astype(str)
                        .str.replace(r'(\.0)$', '')
                        .str.zfill(4)
                        .str.replace('2400', '0000')
               if x.name in columns else x)

df_delayed = df_delayed.apply(converter)

In [10]:
# Convert columns to time timestamp
for column in columns:
    df_delayed[column] = df_delayed[column].apply(lambda x: datetime.strptime(x, '%H%M').time())

In [11]:
# Apply timedelta
time_diff = lambda x: datetime.combine(date.min, x)
df_delayed['dep_delay'] = (df_delayed['actual_dep_time'].apply(time_diff) 
                           - df_delayed['crs_dep_time'].apply(time_diff))

df_delayed['actual_elapsed_time'] = (df_delayed['actual_arr_time'].apply(time_diff) 
                                     - df_delayed['actual_dep_time'].apply(time_diff))

df_delayed['air_time'] = (df_delayed['crs_arr_time'].apply(time_diff) 
                          - df_delayed['crs_dep_time'].apply(time_diff))

In [12]:
# Convert to minutes
columns = ['dep_delay', 'actual_elapsed_time', 'air_time']

for column in columns:
    ls_min = []
    for item in df_delayed[column]:
        ls_min.append(item.total_seconds()/60)
    
    df_delayed[column] = ls_min

In [13]:
df_delayed.head()

Unnamed: 0,fl_date,origin_airport,origin_city,dest_airport,dest_city,crs_dep_time,actual_dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,actual_arr_time,arr_delay,cancelled,cancellation_desc,diverted,crs_elapsed_time,actual_elapsed_time,air_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,unnamed: 27
0,2019-03-01,CVG,"Cincinnati, OH",MSP,"Minneapolis, MN",15:34:00,16:17:00,43.0,28.0,16:45:00,17:29:00,18.0,16:48:00,17:47:00,59.0,0.0,,0.0,134.0,90.0,74.0,596.0,0.0,0.0,59.0,0.0,0.0,
1,2019-03-01,JFK,"New York, NY",BUF,"Buffalo, NY",09:40:00,10:35:00,55.0,56.0,11:31:00,12:28:00,8.0,11:14:00,12:36:00,82.0,0.0,,0.0,94.0,121.0,94.0,301.0,0.0,55.0,27.0,0.0,0.0,
2,2019-03-01,MSN,"Madison, WI",LGA,"New York, NY",18:45:00,19:54:00,69.0,14.0,20:08:00,22:59:00,13.0,21:58:00,23:12:00,74.0,0.0,,0.0,133.0,198.0,193.0,812.0,0.0,0.0,6.0,0.0,68.0,
4,2019-03-01,JFK,"New York, NY",BUF,"Buffalo, NY",13:36:00,15:30:00,114.0,30.0,16:00:00,16:52:00,6.0,15:13:00,16:58:00,105.0,0.0,,0.0,97.0,88.0,97.0,301.0,35.0,0.0,0.0,0.0,70.0,
6,2019-03-01,ATL,"Atlanta, GA",TYS,"Knoxville, TN",13:40:00,14:25:00,45.0,18.0,14:43:00,15:17:00,3.0,14:38:00,15:20:00,42.0,0.0,,0.0,58.0,55.0,58.0,152.0,42.0,0.0,0.0,0.0,0.0,


In [14]:
# Convert to int
columns = ['dep_delay', 'taxi_out', 'taxi_in', 'arr_delay', 'cancelled',
           'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 
           'air_time', 'distance']

convert_int = lambda x: x.astype(int) if x.name in columns else x

df_delayed = df_delayed.apply(convert_int)

In [15]:
df_delayed.head()

Unnamed: 0,fl_date,origin_airport,origin_city,dest_airport,dest_city,crs_dep_time,actual_dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,actual_arr_time,arr_delay,cancelled,cancellation_desc,diverted,crs_elapsed_time,actual_elapsed_time,air_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,unnamed: 27
0,2019-03-01,CVG,"Cincinnati, OH",MSP,"Minneapolis, MN",15:34:00,16:17:00,43,28,16:45:00,17:29:00,18,16:48:00,17:47:00,59,0,,0,134,90,74,596,0.0,0.0,59.0,0.0,0.0,
1,2019-03-01,JFK,"New York, NY",BUF,"Buffalo, NY",09:40:00,10:35:00,55,56,11:31:00,12:28:00,8,11:14:00,12:36:00,82,0,,0,94,121,94,301,0.0,55.0,27.0,0.0,0.0,
2,2019-03-01,MSN,"Madison, WI",LGA,"New York, NY",18:45:00,19:54:00,69,14,20:08:00,22:59:00,13,21:58:00,23:12:00,74,0,,0,133,198,193,812,0.0,0.0,6.0,0.0,68.0,
4,2019-03-01,JFK,"New York, NY",BUF,"Buffalo, NY",13:36:00,15:30:00,114,30,16:00:00,16:52:00,6,15:13:00,16:58:00,105,0,,0,97,88,97,301,35.0,0.0,0.0,0.0,70.0,
6,2019-03-01,ATL,"Atlanta, GA",TYS,"Knoxville, TN",13:40:00,14:25:00,45,18,14:43:00,15:17:00,3,14:38:00,15:20:00,42,0,,0,58,55,58,152,42.0,0.0,0.0,0.0,0.0,


In [16]:
df_delayed.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5017439 entries, 0 to 14635477
Data columns (total 28 columns):
 #   Column               Non-Null Count    Dtype         
---  ------               --------------    -----         
 0   fl_date              5017439 non-null  datetime64[ns]
 1   origin_airport       5017439 non-null  object        
 2   origin_city          5017439 non-null  object        
 3   dest_airport         5017439 non-null  object        
 4   dest_city            5017439 non-null  object        
 5   crs_dep_time         5017439 non-null  object        
 6   actual_dep_time      5017439 non-null  object        
 7   dep_delay            5017439 non-null  int64         
 8   taxi_out             5017439 non-null  int64         
 9   wheels_off           5017439 non-null  object        
 10  wheels_on            5017439 non-null  object        
 11  taxi_in              5017439 non-null  int64         
 12  crs_arr_time         5017439 non-null  object        
 

## Clean Cancelled Flights

In [17]:
# Read file into DataFrame
df_cancelled = pd.read_csv(flights_raw)

In [18]:
# Display first rows
pd.set_option('display.max_columns', 30)
df_cancelled.head()

Unnamed: 0,FL_DATE,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
0,2019-03-01,CVG,"Cincinnati, OH",MSP,"Minneapolis, MN",1534,1617.0,43.0,28.0,1645.0,1729.0,18.0,1648,1747.0,59.0,0.0,,0.0,134.0,150.0,104.0,596.0,0.0,0.0,59.0,0.0,0.0,
1,2019-03-01,JFK,"New York, NY",BUF,"Buffalo, NY",940,1035.0,55.0,56.0,1131.0,1228.0,8.0,1114,1236.0,82.0,0.0,,0.0,94.0,121.0,57.0,301.0,0.0,55.0,27.0,0.0,0.0,
2,2019-03-01,MSN,"Madison, WI",LGA,"New York, NY",1845,1954.0,69.0,14.0,2008.0,2259.0,13.0,2158,2312.0,74.0,0.0,,0.0,133.0,138.0,111.0,812.0,0.0,0.0,6.0,0.0,68.0,
3,2019-03-01,TYS,"Knoxville, TN",DTW,"Detroit, MI",735,729.0,-6.0,15.0,744.0,855.0,8.0,925,903.0,-22.0,0.0,,0.0,110.0,94.0,71.0,443.0,,,,,,
4,2019-03-01,JFK,"New York, NY",BUF,"Buffalo, NY",1336,1530.0,114.0,30.0,1600.0,1652.0,6.0,1513,1658.0,105.0,0.0,,0.0,97.0,88.0,52.0,301.0,35.0,0.0,0.0,0.0,70.0,


In [19]:
# Lowercase columns
df_cancelled.columns = map(str.lower, df_cancelled.columns)

In [20]:
# Create DataFrame with cancelled flights
df_cancelled = df_cancelled.query('cancelled == 1')

In [21]:
# Convert flight date to datetime data type
df_cancelled['fl_date'] = pd.to_datetime(df_cancelled['fl_date'])

In [22]:
# Rename columns
df_cancelled.rename(columns={'origin': 'origin_airport', 
                             'dest': 'dest_airport', 
                             'origin_city_name': 'origin_city', 
                             'dest_city_name': 'dest_city', 
                             'dep_time': 'actual_dep_time', 
                             'arr_time': 'actual_arr_time',
                             'cancellation_code': 'cancellation_desc'},
                  errors='raise', inplace=True)

In [23]:
# Convert columns from numeric to object types and pad zeroes
columns = ['crs_dep_time', 'crs_arr_time']

converter = (lambda x: x.astype(str)
                        .str.zfill(4)
                        .str.replace('2400', '0000')
               if x.name in columns else x)

df_cancelled = df_cancelled.apply(converter)

In [24]:
# Convert columns to time timestamp
for column in columns:
    df_cancelled[column] = df_cancelled[column].apply(lambda x: datetime.strptime(x, '%H%M').time())

In [25]:
# Apply timedelta
time_diff = lambda x: datetime.combine(date.min, x)
df_cancelled['crs_elapsed_time'] = (df_cancelled['crs_arr_time'].apply(time_diff) 
                                    - df_cancelled['crs_dep_time'].apply(time_diff))

In [26]:
# Convert to minutes
ls_minutes = []
for item in df_cancelled['crs_elapsed_time']:
    ls_minutes.append(item.total_seconds()/60)

df_cancelled['crs_elapsed_time'] = ls_minutes

In [27]:
df_cancelled.head()

Unnamed: 0,fl_date,origin_airport,origin_city,dest_airport,dest_city,crs_dep_time,actual_dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,actual_arr_time,arr_delay,cancelled,cancellation_desc,diverted,crs_elapsed_time,actual_elapsed_time,air_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,unnamed: 27
230,2019-03-01,ORF,"Norfolk, VA",LGA,"New York, NY",18:55:00,,,,,,,20:42:00,,,1.0,B,0.0,107.0,,,296.0,,,,,,
250,2019-03-01,CHO,"Charlottesville, VA",CLT,"Charlotte, NC",19:40:00,,,,,,,21:05:00,,,1.0,B,0.0,85.0,,,245.0,,,,,,
251,2019-03-01,CLT,"Charlotte, NC",CHO,"Charlottesville, VA",17:55:00,,,,,,,19:10:00,,,1.0,B,0.0,75.0,,,245.0,,,,,,
366,2019-03-01,GSP,"Greer, SC",ORD,"Chicago, IL",10:11:00,,,,,,,11:15:00,,,1.0,B,0.0,64.0,,,577.0,,,,,,
418,2019-03-01,DFW,"Dallas/Fort Worth, TX",CRP,"Corpus Christi, TX",20:43:00,2128.0,45.0,20.0,2148.0,,,22:07:00,,,1.0,B,0.0,84.0,,,354.0,,,,,,


In [28]:
# Convert to int
columns = ['cancelled', 'diverted', 'crs_elapsed_time', 'distance']
converter = (lambda x: x.astype(int) if x.name in columns else x)

df_cancelled = df_cancelled.apply(converter)

In [34]:
# Assign nulls to columns not used
df_cancelled[['actual_dep_time', 'dep_delay', 'taxi_out', 
              'wheels_off', 'wheels_on', 'taxi_in', 
              'actual_arr_time', 'arr_delay', 'actual_elapsed_time', 'air_time',
              'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
              'late_aircraft_delay']] = np.nan

In [35]:
df_cancelled.head()

Unnamed: 0,fl_date,origin_airport,origin_city,dest_airport,dest_city,crs_dep_time,actual_dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,actual_arr_time,arr_delay,cancelled,cancellation_desc,diverted,crs_elapsed_time,actual_elapsed_time,air_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,unnamed: 27
230,2019-03-01,ORF,"Norfolk, VA",LGA,"New York, NY",18:55:00,,,,,,,20:42:00,,,1,B,0,107,,,296,,,,,,
250,2019-03-01,CHO,"Charlottesville, VA",CLT,"Charlotte, NC",19:40:00,,,,,,,21:05:00,,,1,B,0,85,,,245,,,,,,
251,2019-03-01,CLT,"Charlotte, NC",CHO,"Charlottesville, VA",17:55:00,,,,,,,19:10:00,,,1,B,0,75,,,245,,,,,,
366,2019-03-01,GSP,"Greer, SC",ORD,"Chicago, IL",10:11:00,,,,,,,11:15:00,,,1,B,0,64,,,577,,,,,,
418,2019-03-01,DFW,"Dallas/Fort Worth, TX",CRP,"Corpus Christi, TX",20:43:00,,,,,,,22:07:00,,,1,B,0,84,,,354,,,,,,


In [36]:
# Replace cancellation codes with descriptions
cancellation_desc = {'A': 'Carrier',
                     'B': 'Weather',
                     'C': 'NAS',
                     'D': 'Security'}

ls_canc = []
for item in df_cancelled['cancellation_desc']:
    if item in cancellation_desc:
        ls_canc.append(cancellation_desc[item])

df_cancelled['cancellation_desc'] = ls_canc

In [37]:
# Display resulting DataFrame
df_cancelled.head()

Unnamed: 0,fl_date,origin_airport,origin_city,dest_airport,dest_city,crs_dep_time,actual_dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,actual_arr_time,arr_delay,cancelled,cancellation_desc,diverted,crs_elapsed_time,actual_elapsed_time,air_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,unnamed: 27
230,2019-03-01,ORF,"Norfolk, VA",LGA,"New York, NY",18:55:00,,,,,,,20:42:00,,,1,Weather,0,107,,,296,,,,,,
250,2019-03-01,CHO,"Charlottesville, VA",CLT,"Charlotte, NC",19:40:00,,,,,,,21:05:00,,,1,Weather,0,85,,,245,,,,,,
251,2019-03-01,CLT,"Charlotte, NC",CHO,"Charlottesville, VA",17:55:00,,,,,,,19:10:00,,,1,Weather,0,75,,,245,,,,,,
366,2019-03-01,GSP,"Greer, SC",ORD,"Chicago, IL",10:11:00,,,,,,,11:15:00,,,1,Weather,0,64,,,577,,,,,,
418,2019-03-01,DFW,"Dallas/Fort Worth, TX",CRP,"Corpus Christi, TX",20:43:00,,,,,,,22:07:00,,,1,Weather,0,84,,,354,,,,,,


In [38]:
# Append dataframe
flights = df_delayed.append(df_cancelled).reset_index(drop=True)

In [40]:
# Split columns
flights[['origin_city', 'origin_state']] = flights.origin_city.str.split(',', expand=True)
flights[['dest_city', 'dest_state']] = flights.dest_city.str.split(',', expand=True)

In [42]:
# Drop unnamed:27
flights.drop(columns=['unnamed: 27'], inplace=True)

In [45]:
# Reorder columns
ordered_columns = ['fl_date', 'origin_airport', 'origin_city', 'origin_state', 
                   'dest_airport', 'dest_city', 'dest_state', 'crs_dep_time', 
                   'actual_dep_time', 'dep_delay', 'taxi_out', 'wheels_off', 
                   'wheels_on', 'taxi_in', 'crs_arr_time', 'actual_arr_time', 
                   'arr_delay', 'cancelled', 'cancellation_desc', 'diverted',
                   'crs_elapsed_time', 'actual_elapsed_time', 'air_time', 
                   'distance', 'carrier_delay', 'weather_delay', 'nas_delay', 
                   'security_delay', 'late_aircraft_delay', ]

flights = flights[ordered_columns]

In [46]:
flights.head()

Unnamed: 0,fl_date,origin_airport,origin_city,origin_state,dest_airport,dest_city,dest_state,crs_dep_time,actual_dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,actual_arr_time,arr_delay,cancelled,cancellation_desc,diverted,crs_elapsed_time,actual_elapsed_time,air_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2019-03-01,CVG,Cincinnati,OH,MSP,Minneapolis,MN,15:34:00,16:17:00,43.0,28.0,16:45:00,17:29:00,18.0,16:48:00,17:47:00,59.0,0,,0,134,90.0,74.0,596,0.0,0.0,59.0,0.0,0.0
1,2019-03-01,JFK,New York,NY,BUF,Buffalo,NY,09:40:00,10:35:00,55.0,56.0,11:31:00,12:28:00,8.0,11:14:00,12:36:00,82.0,0,,0,94,121.0,94.0,301,0.0,55.0,27.0,0.0,0.0
2,2019-03-01,MSN,Madison,WI,LGA,New York,NY,18:45:00,19:54:00,69.0,14.0,20:08:00,22:59:00,13.0,21:58:00,23:12:00,74.0,0,,0,133,198.0,193.0,812,0.0,0.0,6.0,0.0,68.0
3,2019-03-01,JFK,New York,NY,BUF,Buffalo,NY,13:36:00,15:30:00,114.0,30.0,16:00:00,16:52:00,6.0,15:13:00,16:58:00,105.0,0,,0,97,88.0,97.0,301,35.0,0.0,0.0,0.0,70.0
4,2019-03-01,ATL,Atlanta,GA,TYS,Knoxville,TN,13:40:00,14:25:00,45.0,18.0,14:43:00,15:17:00,3.0,14:38:00,15:20:00,42.0,0,,0,58,55.0,58.0,152,42.0,0.0,0.0,0.0,0.0


In [48]:
flights.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5268948 entries, 0 to 5268947
Data columns (total 29 columns):
 #   Column               Non-Null Count    Dtype         
---  ------               --------------    -----         
 0   fl_date              5268948 non-null  datetime64[ns]
 1   origin_airport       5268948 non-null  object        
 2   origin_city          5268948 non-null  object        
 3   origin_state         5268948 non-null  object        
 4   dest_airport         5268948 non-null  object        
 5   dest_city            5268948 non-null  object        
 6   dest_state           5268948 non-null  object        
 7   crs_dep_time         5268948 non-null  object        
 8   actual_dep_time      5017439 non-null  object        
 9   dep_delay            5017439 non-null  float64       
 10  taxi_out             5017439 non-null  float64       
 11  wheels_off           5017439 non-null  object        
 12  wheels_on            5017439 non-null  object        
 1

In [49]:
# Write to csv
dest_folder = '/Users/thaysmartinez/Documents/Flights Data Exploration/'
extension = '.csv'
flights.to_csv('%sflights%s' % (dest_folder, extension), index=False, encoding='utf-8')