# Project Overview

## OSEMN Pipeline

* O - Obtaining our data
* S - Scrubbing/Cleaning our data
* E - Exploring/Visualizing our data - in EMN_modeling
* M - Modeling our data - in EMN_modeling
* N - Interpreting the data - in EMN_modeling

## Notebook Preparation

In [1]:
import pandas as pd
import numpy as np
import time

import warnings
warnings.filterwarnings('ignore')

# Obtaining our Data

In [2]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_01.csv')
df.shape

(583985, 33)

In [3]:
df

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 32
0,1,6,7,9E,N8694A,3280,10397,ATL,"Atlanta, GA",11150,...,47.0,37.0,83.0,1,,,,,,
1,1,7,1,9E,N8970D,3280,10397,ATL,"Atlanta, GA",11150,...,47.0,32.0,83.0,1,,,,,,
2,1,8,2,9E,N820AY,3280,10397,ATL,"Atlanta, GA",11150,...,47.0,39.0,83.0,1,,,,,,
3,1,9,3,9E,N840AY,3280,10397,ATL,"Atlanta, GA",11150,...,47.0,37.0,83.0,1,,,,,,
4,1,10,4,9E,N8969A,3280,10397,ATL,"Atlanta, GA",11150,...,47.0,41.0,83.0,1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
583980,1,30,3,UA,N819UA,2024,14683,SAT,"San Antonio, TX",12266,...,61.0,47.0,191.0,1,,,,,,
583981,1,30,3,UA,N37462,2022,14843,SJU,"San Juan, PR",12264,...,234.0,240.0,1571.0,7,,,,,,
583982,1,30,3,UA,N37462,2021,12264,IAD,"Washington, DC",14843,...,226.0,233.0,1571.0,7,,,,,,
583983,1,30,3,UA,N26967,2020,12266,IAH,"Houston, TX",14771,...,252.0,231.0,1635.0,7,11.0,0.0,0.0,0.0,11.0,


In [4]:
df.memory_usage().sum()

154172168

In [5]:
df.dtypes

MONTH                    int64
DAY_OF_MONTH             int64
DAY_OF_WEEK              int64
OP_UNIQUE_CARRIER       object
TAIL_NUM                object
OP_CARRIER_FL_NUM        int64
ORIGIN_AIRPORT_ID        int64
ORIGIN                  object
ORIGIN_CITY_NAME        object
DEST_AIRPORT_ID          int64
DEST                    object
DEST_CITY_NAME          object
CRS_DEP_TIME             int64
DEP_TIME               float64
DEP_DELAY_NEW          float64
DEP_DEL15              float64
DEP_TIME_BLK            object
CRS_ARR_TIME             int64
ARR_TIME               float64
ARR_DELAY_NEW          float64
ARR_TIME_BLK            object
CANCELLED              float64
CANCELLATION_CODE       object
CRS_ELAPSED_TIME       float64
ACTUAL_ELAPSED_TIME    float64
DISTANCE               float64
DISTANCE_GROUP           int64
CARRIER_DELAY          float64
WEATHER_DELAY          float64
NAS_DELAY              float64
SECURITY_DELAY         float64
LATE_AIRCRAFT_DELAY    float64
Unnamed:

In [6]:
df.describe()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,DEST_AIRPORT_ID,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 32
count,583985.0,583985.0,583985.0,583985.0,583985.0,583985.0,583985.0,567633.0,567630.0,567630.0,...,583851.0,565963.0,583985.0,583985.0,105222.0,105222.0,105222.0,105222.0,105222.0,0.0
mean,1.0,15.960088,3.835626,2537.869334,12659.701982,12659.470015,1326.266642,1331.957814,13.258226,0.174281,...,143.557401,138.610077,797.742767,3.664861,20.10608,4.428551,18.508392,0.079508,24.95059,
std,0.0,8.987942,1.921899,1821.736145,1519.405493,1519.336466,484.081,495.40402,47.50739,0.379351,...,73.216303,73.078565,589.999261,2.32389,65.762202,38.535323,41.726392,3.203342,50.851973,
min,1.0,1.0,1.0,1.0,10135.0,10135.0,1.0,1.0,0.0,0.0,...,20.0,16.0,31.0,1.0,0.0,0.0,0.0,0.0,0.0,
25%,1.0,8.0,2.0,979.0,11292.0,11292.0,917.0,921.0,0.0,0.0,...,90.0,85.0,363.0,2.0,0.0,0.0,0.0,0.0,0.0,
50%,1.0,16.0,4.0,2114.0,12889.0,12889.0,1320.0,1328.0,0.0,0.0,...,125.0,121.0,640.0,3.0,0.0,0.0,5.0,0.0,0.0,
75%,1.0,24.0,5.0,3902.0,13931.0,13931.0,1730.0,1738.0,5.0,0.0,...,175.0,170.0,1037.0,5.0,16.0,0.0,21.0,0.0,29.0,
max,1.0,31.0,7.0,7439.0,16218.0,16218.0,2359.0,2400.0,1651.0,1.0,...,703.0,737.0,4983.0,11.0,1638.0,1416.0,1447.0,816.0,1514.0,


In [7]:
df['ORIGIN_AIRPORT_ID'].nunique()

346

In [8]:
df.isna().sum()

MONTH                       0
DAY_OF_MONTH                0
DAY_OF_WEEK                 0
OP_UNIQUE_CARRIER           0
TAIL_NUM                 2543
OP_CARRIER_FL_NUM           0
ORIGIN_AIRPORT_ID           0
ORIGIN                      0
ORIGIN_CITY_NAME            0
DEST_AIRPORT_ID             0
DEST                        0
DEST_CITY_NAME              0
CRS_DEP_TIME                0
DEP_TIME                16352
DEP_DELAY_NEW           16355
DEP_DEL15               16355
DEP_TIME_BLK                0
CRS_ARR_TIME                0
ARR_TIME                17061
ARR_DELAY_NEW           18022
ARR_TIME_BLK                0
CANCELLED                   0
CANCELLATION_CODE      567259
CRS_ELAPSED_TIME          134
ACTUAL_ELAPSED_TIME     18022
DISTANCE                    0
DISTANCE_GROUP              0
CARRIER_DELAY          478763
WEATHER_DELAY          478763
NAS_DELAY              478763
SECURITY_DELAY         478763
LATE_AIRCRAFT_DELAY    478763
Unnamed: 32            583985
dtype: int

# Scrubbing/Cleaning our Data

## Data for Merging

In [11]:
passengers = pd.read_csv('data/raw_data/T3_AIR_CARRIER_SUMMARY_AIRPORT_ACTIVITY_2019.csv')
passengers

Unnamed: 0,OP_UNIQUE_CARRIER,CARRIER_NAME,ORIGIN_AIRPORT_ID,SERVICE_CLASS,REV_ACRFT_DEP_PERF_510,REV_PAX_ENP_110
0,04Q,Tradewind Aviation,15024,K,10.0,39.0
1,04Q,Tradewind Aviation,14843,K,677.0,3649.0
2,04Q,Tradewind Aviation,10257,V,4.0,6.0
3,04Q,Tradewind Aviation,15323,V,1.0,3.0
4,04Q,Tradewind Aviation,10158,V,1.0,2.0
...,...,...,...,...,...,...
27247,ZW,Air Wisconsin Airlines Corp,11637,K,122.0,4535.0
27248,ZW,Air Wisconsin Airlines Corp,11721,K,143.0,5800.0
27249,ZW,Air Wisconsin Airlines Corp,10469,K,248.0,8901.0
27250,ZW,Air Wisconsin Airlines Corp,12884,K,187.0,7923.0


In [12]:
# Load airplane info so we can get seat count
aircraft = pd.read_csv("data/raw_data/B43_AIRCRAFT_INVENTORY.csv",encoding='latin1')
aircraft.drop_duplicates(subset='TAIL_NUM', inplace=True)
aircraft

Unnamed: 0,MANUFACTURE_YEAR,TAIL_NUM,NUMBER_OF_SEATS
0,1944,N54514,0.0
1,1945,N1651M,0.0
2,1953,N100CE,0.0
3,1953,N141FL,0.0
4,1953,N151FL,0.0
...,...,...,...
7378,2019,N14011,337.0
7379,2019,N16008,337.0
7380,2019,N16009,337.0
7381,2019,N2250U,276.0


In [13]:
# load coordinates of airports
coords = pd.read_csv('data/raw_data/AIRPORT_COORDINATES.csv')
coords.drop_duplicates(subset='ORIGIN_AIRPORT_ID', inplace=True)
coords

Unnamed: 0,ORIGIN_AIRPORT_ID,DISPLAY_AIRPORT_NAME,LATITUDE,LONGITUDE
0,10001,Afognak Lake Airport,58.109444,-152.906667
1,10003,Bear Creek Mining Strip,65.548056,-161.071667
2,10004,Lik Mining Camp,68.083333,-163.166667
3,10005,Little Squaw Airport,67.570000,-148.183889
4,10006,Kizhuyak Bay,57.745278,-152.882778
...,...,...,...,...
18128,16908,Deer Park Airport,47.966944,-117.428611
18129,16909,South Texas International at Edinburg,26.441667,-98.122222
18130,16910,Louisa County Freeman Field,38.009722,-77.970000
18131,16911,Caldwell Industrial,43.641944,-116.635833


In [14]:
# Load proper names of carriers
names = pd.read_csv("data/raw_data/CARRIER_DECODE.csv")
names.drop_duplicates(inplace=True)
names.drop_duplicates(subset=['OP_UNIQUE_CARRIER'], inplace=True)
names

Unnamed: 0,AIRLINE_ID,OP_UNIQUE_CARRIER,CARRIER_NAME
0,21754,2PQ,21 Air LLC
3,20342,Q5,40-Mile Air
4,20342,WRB,40-Mile Air
6,19627,CIQ,A/S Conair
7,19072,AAE,AAA Airlines
...,...,...,...
2702,20379,ZKQ,Zantop International
2706,19771,ZAQ,Zas Airline Of Egypt
2707,21118,37,Zeal 320
2708,22069,ZG,ZIPAIR Tokyo Inc.


In [15]:
employees = pd.read_csv('data/raw_data/P10_EMPLOYEES.csv')
employees = employees[['OP_UNIQUE_CARRIER', 'PASS_GEN_SVC_ADMIN', 'PASSENGER_HANDLING']]
employees = employees.groupby('OP_UNIQUE_CARRIER').sum().reset_index()
employees

Unnamed: 0,OP_UNIQUE_CARRIER,PASS_GEN_SVC_ADMIN,PASSENGER_HANDLING
0,0WQ,19,0
1,1BQ,41,0
2,2HQ,24,0
3,3EQ,32,0
4,5V,0,0
5,5X,0,0
6,5Y,273,0
7,8C,37,0
8,9E,1361,0
9,9S,3,0


### Weather Data

In [16]:
weather_report = pd.read_csv('data/raw_data/airport_weather_2019.csv')
weather_report

Unnamed: 0,STATION,NAME,DATE,AWND,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,...,WT08,WT09,WESD,WT10,PSUN,TSUN,SN32,SX32,TOBS,WT11
0,USW00013874,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,1/1/2019,4.70,,0.14,0.0,0.0,64.0,66.0,...,,,,,,,,,,
1,USW00013874,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,1/2/2019,4.92,,0.57,0.0,0.0,56.0,59.0,...,1.0,,,,,,,,,
2,USW00013874,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,1/3/2019,5.37,,0.15,0.0,0.0,52.0,55.0,...,,,,,,,,,,
3,USW00013874,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,1/4/2019,12.08,,1.44,0.0,0.0,56.0,66.0,...,,,,,,,,,,
4,USW00013874,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,1/5/2019,13.42,,0.00,0.0,0.0,49.0,59.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38670,USW00093805,"TALLAHASSEE REGIONAL AIRPORT, FL US",2019-12-27,6.04,,0.00,,,68.0,80.0,...,,,,,,,,,,
38671,USW00093805,"TALLAHASSEE REGIONAL AIRPORT, FL US",2019-12-28,5.37,,0.06,,,69.0,74.0,...,1.0,,,,,,,,,
38672,USW00093805,"TALLAHASSEE REGIONAL AIRPORT, FL US",2019-12-29,7.61,,0.10,,,70.0,74.0,...,,,,,,,,,,
38673,USW00093805,"TALLAHASSEE REGIONAL AIRPORT, FL US",2019-12-30,5.82,,0.02,,,68.0,72.0,...,,,,,,,,,,


In [17]:
weather_report['NAME'].nunique()

106

In [18]:
cities = pd.read_csv('data/raw_data/airports_list.csv')
cities

Unnamed: 0,ORIGIN_AIRPORT_ID,DISPLAY_AIRPORT_NAME,ORIGIN_CITY_NAME,NAME
0,12992,Adams Field,"Little Rock, AR","NORTH LITTLE ROCK AIRPORT, AR US"
1,10257,Albany International,"Albany, NY","ALBANY INTERNATIONAL AIRPORT, NY US"
2,10140,Albuquerque International Sunport,"Albuquerque, NM","ALBUQUERQUE INTERNATIONAL AIRPORT, NM US"
3,10299,Anchorage International,"Anchorage, AK","ANCHORAGE TED STEVENS INTERNATIONAL AIRPORT, A..."
4,10397,Atlanta Municipal,"Atlanta, GA",ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...
...,...,...,...,...
92,15370,Tulsa International,"Tulsa, OK","OKLAHOMA CITY WILL ROGERS WORLD AIRPORT, OK US"
93,12264,Washington Dulles International,"Washington, DC","WASHINGTON DULLES INTERNATIONAL AIRPORT, VA US"
94,13851,Will Rogers World,"Oklahoma City, OK","OKLAHOMA CITY WILL ROGERS WORLD AIRPORT, OK US"
95,12191,William P Hobby,"Houston, TX","HOUSTON WILLIAM P HOBBY AIRPORT, TX US"


In [19]:
weather_merge = pd.merge(cities, weather_report, how='left', on='NAME')
weather_merge

Unnamed: 0,ORIGIN_AIRPORT_ID,DISPLAY_AIRPORT_NAME,ORIGIN_CITY_NAME,NAME,STATION,DATE,AWND,PGTM,PRCP,SNOW,...,WT08,WT09,WESD,WT10,PSUN,TSUN,SN32,SX32,TOBS,WT11
0,12992,Adams Field,"Little Rock, AR","NORTH LITTLE ROCK AIRPORT, AR US",USW00003952,2019-01-01,4.70,,0.00,0.0,...,,,,,,,,,,
1,12992,Adams Field,"Little Rock, AR","NORTH LITTLE ROCK AIRPORT, AR US",USW00003952,2019-01-02,2.01,,0.39,0.0,...,,,,,,,,,,
2,12992,Adams Field,"Little Rock, AR","NORTH LITTLE ROCK AIRPORT, AR US",USW00003952,2019-01-03,6.26,,0.44,0.0,...,,,,,,,,,,
3,12992,Adams Field,"Little Rock, AR","NORTH LITTLE ROCK AIRPORT, AR US",USW00003952,2019-01-04,2.01,,0.13,0.0,...,,,,,,,,,,
4,12992,Adams Field,"Little Rock, AR","NORTH LITTLE ROCK AIRPORT, AR US",USW00003952,2019-01-05,1.79,,0.00,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35020,10713,Boise Air Terminal,"Boise, ID","BOISE AIR TERMINAL, ID US",USW00024131,2019-12-27,5.82,,0.00,0.0,...,,,,,,,,,,
35021,10713,Boise Air Terminal,"Boise, ID","BOISE AIR TERMINAL, ID US",USW00024131,2019-12-28,2.24,,0.00,0.0,...,,,,,,,,,,
35022,10713,Boise Air Terminal,"Boise, ID","BOISE AIR TERMINAL, ID US",USW00024131,2019-12-29,6.26,,0.04,0.1,...,1.0,,,,,,,,,
35023,10713,Boise Air Terminal,"Boise, ID","BOISE AIR TERMINAL, ID US",USW00024131,2019-12-30,2.46,,0.00,0.0,...,,,,,,,,,,


In [20]:
weather = weather_merge[['DATE', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'AWND', 'ORIGIN_AIRPORT_ID']]

In [21]:
weather.loc[weather['TMAX'].isna()]

Unnamed: 0,DATE,PRCP,SNOW,SNWD,TMAX,AWND,ORIGIN_AIRPORT_ID
4786,2/11/2019,0.22,,0.0,,9.62,11298
19976,2019-10-06,0.0,,0.0,,12.3,15919
24068,12/23/2019,0.0,,,,1.57,11066
24807,,,,,,,14843
30085,6/19/2019,,,0.0,,7.61,14635
31953,8/1/2019,0.66,0.0,0.0,,7.61,15304


In [22]:
weather.drop(weather.loc[weather['ORIGIN_AIRPORT_ID'].isna()].index, axis=0, inplace=True)

In [23]:
weather.loc[weather['TMAX'].isna()]

Unnamed: 0,DATE,PRCP,SNOW,SNWD,TMAX,AWND,ORIGIN_AIRPORT_ID
4786,2/11/2019,0.22,,0.0,,9.62,11298
19976,2019-10-06,0.0,,0.0,,12.3,15919
24068,12/23/2019,0.0,,,,1.57,11066
24807,,,,,,,14843
30085,6/19/2019,,,0.0,,7.61,14635
31953,8/1/2019,0.66,0.0,0.0,,7.61,15304


In [24]:
weather['TMAX'].fillna(round(weather.groupby('ORIGIN_AIRPORT_ID')['TMAX'].transform('mean'), 1), inplace=True)
weather['AWND'].fillna(round(weather.groupby('ORIGIN_AIRPORT_ID')['AWND'].transform('mean'), 1), inplace=True)
weather.fillna(0, inplace=True)

In [25]:
weather.isna().sum()

DATE                 0
PRCP                 0
SNOW                 0
SNWD                 0
TMAX                 0
AWND                 0
ORIGIN_AIRPORT_ID    0
dtype: int64

In [26]:
weather['DATE'] = pd.to_datetime(weather['DATE'])
weather['MONTH'] = pd.DatetimeIndex(weather['DATE']).month
weather['DAY_OF_MONTH'] = pd.DatetimeIndex(weather['DATE']).day
weather

Unnamed: 0,DATE,PRCP,SNOW,SNWD,TMAX,AWND,ORIGIN_AIRPORT_ID,MONTH,DAY_OF_MONTH
0,2019-01-01,0.00,0.0,0.0,45.0,4.70,12992,1,1
1,2019-01-02,0.39,0.0,0.0,39.0,2.01,12992,1,2
2,2019-01-03,0.44,0.0,0.0,41.0,6.26,12992,1,3
3,2019-01-04,0.13,0.0,0.0,47.0,2.01,12992,1,4
4,2019-01-05,0.00,0.0,0.0,62.0,1.79,12992,1,5
...,...,...,...,...,...,...,...,...,...
35020,2019-12-27,0.00,0.0,0.0,35.0,5.82,10713,12,27
35021,2019-12-28,0.00,0.0,0.0,39.0,2.24,10713,12,28
35022,2019-12-29,0.04,0.1,0.0,32.0,6.26,10713,12,29
35023,2019-12-30,0.00,0.0,0.0,34.0,2.46,10713,12,30


## Cleaning Function

In [27]:
def month_cleanup(monthly_data, aircraft, coords, names, weather, passengers, employees):
    
    '''Function which performs features engineering, data merges and cleanup using one month of On-Time data 
    from Bureau of Transportation Services
    Parameters:
    monthly_data: month of on-time data as downloaded from BTS
    aircraft: Aircraft inventory data from BTS
    coords: Airport coordinates data from BTS
    names: Carrier names based on carrier code from BTS
    weather: Daily weather reported at airports from National Center for Environmental Information
    passengers: Yearly passenger information for carriers and airports from BTS
    employees: Employee statistics for carriers from BTS
    '''
    
    # start the timer so we can track how long the cleaning function takes
    start = time.time()
    
    
    # drop rows with no departure time, tail number, or were cancelled
    print("Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.")
    monthly_data.drop(monthly_data.loc[monthly_data['DEP_TIME'].isna()].index, axis=0, inplace=True)
    monthly_data.drop(monthly_data.loc[monthly_data['TAIL_NUM'].isna()].index, axis=0, inplace=True)
    monthly_data.drop(monthly_data.loc[monthly_data['CANCELLED']==1].index, axis=0, inplace=True)
    print(f'Elapsed Time: {time.time() - start}')
   
    
    # Create time blocks for departure for cleaner categories
    print("\nCreating Departure Time Blocks - DEP_BLK")
    monthly_data.loc[(monthly_data['DEP_TIME_BLK']=='2100-2159') | (monthly_data['DEP_TIME_BLK']=='2200-2259') | (monthly_data['DEP_TIME_BLK']=='2300-2359'), 'DEP_BLOCK'] = 'LATE_NIGHT'
    monthly_data.loc[(monthly_data['DEP_TIME_BLK']=='0001-0559'), 'DEP_BLOCK'] = 'EARLY_MORNING'      
    monthly_data.loc[(monthly_data['DEP_TIME_BLK']=='0600-0659') | (monthly_data['DEP_TIME_BLK']=='0700-0759') | (monthly_data['DEP_TIME_BLK']=='0800-0859') | (monthly_data['DEP_TIME_BLK']=='0900-0959'), 'DEP_BLOCK'] = 'MORNING'
    monthly_data.loc[(monthly_data['DEP_TIME_BLK']=='1000-1059') | (monthly_data['DEP_TIME_BLK']=='1100-1159') | (monthly_data['DEP_TIME_BLK']=='1200-1259'), 'DEP_BLOCK'] = 'MIDDAY'
    monthly_data.loc[(monthly_data['DEP_TIME_BLK']=='1300-1359') | (monthly_data['DEP_TIME_BLK']=='1400-1459') | (monthly_data['DEP_TIME_BLK']=='1500-1559') | (monthly_data['DEP_TIME_BLK']=='1600-1659'), 'DEP_BLOCK'] = 'AFTERNOON'
    monthly_data.loc[(monthly_data['DEP_TIME_BLK']=='1700-1759') | (monthly_data['DEP_TIME_BLK']=='1800-1859') | (monthly_data['DEP_TIME_BLK']=='1900-1959') | (monthly_data['DEP_TIME_BLK']=='2000-2059') , 'DEP_BLOCK'] = 'EVENING'
    print(f'Elapsed Time: {time.time() - start}')
   
    
    # List flight segment number for daily flight segments by tracking tail number
    print("\nAdding Flight Number Sequence - SEGMENT_NUMBER")
    monthly_data["SEGMENT_NUMBER"] = monthly_data.groupby(["TAIL_NUM", 'DAY_OF_MONTH'])["DEP_TIME"].rank("dense", ascending=True)
    print(f'Elapsed Time: {time.time() - start}') 
    
   
    # Listing the concurrent flights at the airport in the time block 
    print("\nAdding Concurrent Flights - CONCURRENT_FLIGHTS")
    monthly_data['CONCURRENT_FLIGHTS'] = monthly_data.groupby(['ORIGIN_AIRPORT_ID','DAY_OF_MONTH', 'DEP_BLOCK'])['OP_UNIQUE_CARRIER'].transform("count")
    print(f'Elapsed Time: {time.time() - start}')
 
    
    # Getting seat counts for each aircraft
    print("\nApplying seat counts to flights - NUMBER_OF_SEATS")   
    # Merge aircraft info with main frame on tail number
    monthly_data = pd.merge(monthly_data, aircraft, how="left", on='TAIL_NUM')
    # Fill missing aircraft info with means
    monthly_data['NUMBER_OF_SEATS'].fillna((monthly_data['NUMBER_OF_SEATS'].mean()), inplace=True)
    # simplify data type of number of seats to reduce memory usage
    monthly_data['NUMBER_OF_SEATS'] = monthly_data['NUMBER_OF_SEATS'].astype('int16')
    print(f'Elapsed Time: {time.time() - start}')

    
    # Merge proper carrier name
    print("\nApplying Carrier Names - CARRIER_NAME")  
    monthly_data = pd.merge(monthly_data, names, how='left', on=['OP_UNIQUE_CARRIER'])
    print(f'Elapsed Time: {time.time() - start}')    
    
    
    # Add monthly flight statistics for carrier and airport
    print("\nAdding flight statistics for carrier and airport - AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH")
    monthly_data['AIRPORT_FLIGHTS_MONTH'] = monthly_data.groupby(['ORIGIN_AIRPORT_ID'])['ORIGIN_CITY_NAME'].transform('count')
    monthly_data['AIRLINE_FLIGHTS_MONTH'] = monthly_data.groupby(['OP_UNIQUE_CARRIER'])['ORIGIN_CITY_NAME'].transform('count')
    monthly_data['AIRLINE_AIRPORT_FLIGHTS_MONTH'] = monthly_data.groupby(['OP_UNIQUE_CARRIER', 'ORIGIN_AIRPORT_ID'])['ORIGIN_CITY_NAME'].transform('count')
    print(f'Elapsed Time: {time.time() - start}')
    
    
    #Add monthly passenger statistics for carrier and airport
    print("\nAdding passenger statistics for carrier and airport - AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE")
    monthly_airport_passengers = pd.DataFrame(passengers.groupby(['ORIGIN_AIRPORT_ID'])['REV_PAX_ENP_110'].sum())
    monthly_data = pd.merge(monthly_data, monthly_airport_passengers, how='left', on=['ORIGIN_AIRPORT_ID'])
    monthly_data['AVG_MONTHLY_PASS_AIRPORT'] = (monthly_data['REV_PAX_ENP_110']/12).astype('int64')
    monthly_airline_passengers = pd.DataFrame(passengers.groupby(['OP_UNIQUE_CARRIER'])['REV_PAX_ENP_110'].sum())
    monthly_data = pd.merge(monthly_data, monthly_airline_passengers, how='left', on=['OP_UNIQUE_CARRIER'])
    monthly_data['AVG_MONTHLY_PASS_AIRLINE'] = (monthly_data['REV_PAX_ENP_110_y']/12).astype('int64')
    print(f'Elapsed Time: {time.time() - start}')  
    
    
    # Add employee stats
    print("\nAdding employee statistics for carrier - FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS")
    monthly_data = pd.merge(monthly_data, employees, how='left', on=['OP_UNIQUE_CARRIER'])
    monthly_data['FLT_ATTENDANTS_PER_PASS'] = monthly_data['PASSENGER_HANDLING']/monthly_data['REV_PAX_ENP_110_y']
    monthly_data['GROUND_SERV_PER_PASS'] = monthly_data['PASS_GEN_SVC_ADMIN']/monthly_data['REV_PAX_ENP_110_y']
    print(f'Elapsed Time: {time.time() - start}')   
    
    
    # Calculate age of plane
    print("\nCalculate Fleet Age - PLANE_AGE")
    monthly_data['MANUFACTURE_YEAR'].fillna((monthly_data['MANUFACTURE_YEAR'].mean()), inplace=True)
    monthly_data['PLANE_AGE'] = 2019 - monthly_data['MANUFACTURE_YEAR']
    print(f'Elapsed Time: {time.time() - start}') 

    
    # Merge airport coordinates
    print("\nAdding airport coordinates - LATITUDE, LONGITUDE, DEPARTING_AIRPORT")
    monthly_data = pd.merge(monthly_data, coords, how='left', on=['ORIGIN_AIRPORT_ID'])
    monthly_data['LATITUDE'] = round(monthly_data['LATITUDE'], 3)
    monthly_data['LONGITUDE'] = round(monthly_data['LONGITUDE'], 3)
    print(f'Elapsed Time: {time.time() - start}')

    
    # Get previous airport for tail number
    print("\nAdding airports - PREVIOUS_AIRPORT")
    segment_temp = monthly_data[['DAY_OF_MONTH', 'TAIL_NUM', 'DISPLAY_AIRPORT_NAME', 'SEGMENT_NUMBER']]
    monthly_data = pd.merge_asof(monthly_data.sort_values('SEGMENT_NUMBER'), segment_temp.sort_values('SEGMENT_NUMBER'), on='SEGMENT_NUMBER', by=['DAY_OF_MONTH', 'TAIL_NUM'], allow_exact_matches=False)
    monthly_data['DISPLAY_AIRPORT_NAME_y'].fillna('NONE', inplace=True)
    monthly_data.rename(columns={"DISPLAY_AIRPORT_NAME_y": "PREVIOUS_AIRPORT", "DISPLAY_AIRPORT_NAME_x": "DEPARTING_AIRPORT"}, inplace=True)  
    print(f'Elapsed Time: {time.time() - start}')
    


    
    
    # Drop airports below the 10th percentile
    print("\nDropping bottom 10% of airports")
    monthly_data.drop(monthly_data.loc[monthly_data['AIRPORT_FLIGHTS_MONTH'] < 1100].index, axis=0, inplace=True)
    print(f'Elapsed Time: {time.time() - start}')
    
    
    # Merge weather data
    print("\nAdding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND")
    monthly_data = pd.merge(monthly_data, weather, how='inner', on=['ORIGIN_AIRPORT_ID', 'MONTH', 'DAY_OF_MONTH'])
    
    print(f'Elapsed Time: {time.time() - start}')
    
    
    # drop columns that we won't use
    print("\nClean up unneeded columns")
    monthly_data.drop(columns = ['ORIGIN',  'DEST',  
                   'CRS_DEP_TIME', 'DEP_DELAY_NEW', 'CRS_ARR_TIME', 'ARR_TIME', 
                   'CANCELLED', 'CANCELLATION_CODE', 'CRS_ELAPSED_TIME', 'DISTANCE',
                   'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
                  'ARR_DELAY_NEW', 'Unnamed: 32', 'DEP_TIME_BLK', 'ARR_TIME_BLK', 'ACTUAL_ELAPSED_TIME',
                  'DEST_AIRPORT_ID', 'DEST_CITY_NAME',  'OP_CARRIER_FL_NUM',  'OP_UNIQUE_CARRIER',
                       'AIRLINE_ID', 'DATE', 'DAY_OF_MONTH', 'TAIL_NUM','DEP_TIME',
                    'ORIGIN_AIRPORT_ID', 'ORIGIN_CITY_NAME',  'PASSENGER_HANDLING', 'REV_PAX_ENP_110_x', 'REV_PAX_ENP_110_y', 
                                 'PASS_GEN_SVC_ADMIN', 'MANUFACTURE_YEAR',
                                 ],
                    axis=1, inplace=True) #,    
    print(f'Elapsed Time: {time.time() - start}') 
    
    
    # specify data types of various fields to reduce memory usage
    print("\nCleaning up data types")
    monthly_data['MONTH'] = monthly_data['MONTH'].astype('object')
    monthly_data['DAY_OF_WEEK'] = monthly_data['DAY_OF_WEEK'].astype('object')
    monthly_data['DEP_DEL15'] = monthly_data['DEP_DEL15'].astype('int8')
    monthly_data['DISTANCE_GROUP'] = monthly_data['DISTANCE_GROUP'].astype('int8')
    monthly_data['DEP_BLOCK'] = monthly_data['DEP_BLOCK'].astype('object')
    monthly_data['SEGMENT_NUMBER'] = monthly_data['SEGMENT_NUMBER'].astype('int8')
    monthly_data['AIRPORT_FLIGHTS_MONTH'] = monthly_data['AIRPORT_FLIGHTS_MONTH'].astype('int64')
    monthly_data['AIRLINE_FLIGHTS_MONTH'] = monthly_data['AIRLINE_FLIGHTS_MONTH'].astype('int64')
    monthly_data['AIRLINE_AIRPORT_FLIGHTS_MONTH'] = monthly_data['AIRLINE_AIRPORT_FLIGHTS_MONTH'].astype('int64')
    monthly_data['PLANE_AGE'] = monthly_data['PLANE_AGE'].astype('int32')

    monthly_data.reset_index(inplace=True, drop=True)
    
    print(f'Elapsed Time: {time.time() - start}')
    
    print("\nFINISHED")
    return monthly_data

## Process Training Sets

In [49]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_01.csv')
month01 = month_cleanup(df, aircraft, coords, names, weather, passengers, employees)
month01.to_pickle("data/pkl/small_train_test.pkl")
month01.to_csv('data/train_test_small.csv', index=False)

Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.31528592109680176

Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.9378514289855957

Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.1270225048065186

Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.2201075553894043

Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.0108253955841064

Applying Carrier Names - CARRIER_NAME
Elapsed Time: 2.2110066413879395

Adding flight statistics for carrier and airport - AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH
Elapsed Time: 2.4812519550323486

Adding passenger statistics for carrier and airport - AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE
Elapsed Time: 3.7974462509155273

Adding employee statistics for carrier - FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS
Elapsed Time: 4.025653839111328

Calculate Fleet Age - PLANE_AGE
Elapsed Time: 4.03065824508667

Adding airport coordinate

In [29]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_02.csv')
month02 = month_cleanup(df, aircraft, coords, names, weather, passengers, employees)

Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.3132753372192383

Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.8948032855987549

Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.0629558563232422

Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.151036024093628

Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 1.8737006187438965

Applying Carrier Names - CARRIER_NAME
Elapsed Time: 2.0458567142486572

Adding flight statistics for carrier and airport - AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH
Elapsed Time: 2.2780675888061523

Adding passenger statistics for carrier and airport - AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE
Elapsed Time: 3.492161273956299

Adding employee statistics for carrier - FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS
Elapsed Time: 3.697347402572632

Calculate Fleet Age - PLANE_AGE
Elapsed Time: 3.700350284576416

Adding airport coordinates 

In [30]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_03.csv')
month03 = month_cleanup(df, aircraft, coords, names, weather, passengers, employees)

Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.34231090545654297

Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 1.0309274196624756

Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.240117073059082

Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.345212697982788

Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.2260119915008545

Applying Carrier Names - CARRIER_NAME
Elapsed Time: 2.436211347579956

Adding flight statistics for carrier and airport - AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH
Elapsed Time: 2.7124621868133545

Adding passenger statistics for carrier and airport - AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE
Elapsed Time: 4.160768032073975

Adding employee statistics for carrier - FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS
Elapsed Time: 4.400986194610596

Calculate Fleet Age - PLANE_AGE
Elapsed Time: 4.404989957809448

Adding airport coordinates -

In [31]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_04.csv')
month04 = month_cleanup(df, aircraft, coords, names, weather, passengers, employees)

Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.3242943286895752

Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.9938936233520508

Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.195084810256958

Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.2951757907867432

Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.1329362392425537

Applying Carrier Names - CARRIER_NAME
Elapsed Time: 2.335110664367676

Adding flight statistics for carrier and airport - AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH
Elapsed Time: 2.618367910385132

Adding passenger statistics for carrier and airport - AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE
Elapsed Time: 3.991615056991577

Adding employee statistics for carrier - FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS
Elapsed Time: 4.22183084487915

Calculate Fleet Age - PLANE_AGE
Elapsed Time: 4.225834846496582

Adding airport coordinates - L

In [32]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_05.csv')
month05 = month_cleanup(df, aircraft, coords, names, weather, passengers, employees)

Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.33929991722106934

Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 1.0309357643127441

Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.2411179542541504

Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.3532202243804932

Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.230015516281128

Applying Carrier Names - CARRIER_NAME
Elapsed Time: 2.4462122917175293

Adding flight statistics for carrier and airport - AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH
Elapsed Time: 2.737476348876953

Adding passenger statistics for carrier and airport - AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE
Elapsed Time: 4.188802242279053

Adding employee statistics for carrier - FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS
Elapsed Time: 4.441023111343384

Calculate Fleet Age - PLANE_AGE
Elapsed Time: 4.445026397705078

Adding airport coordinates 

In [33]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_06.csv')
month06 = month_cleanup(df, aircraft, coords, names, weather, passengers, employees)

Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.3393080234527588

Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 1.0339300632476807

Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.2451300621032715

Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.3512182235717773

Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.2320258617401123

Applying Carrier Names - CARRIER_NAME
Elapsed Time: 2.4422154426574707

Adding flight statistics for carrier and airport - AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH
Elapsed Time: 2.726466655731201

Adding passenger statistics for carrier and airport - AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE
Elapsed Time: 4.181787014007568

Adding employee statistics for carrier - FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS
Elapsed Time: 4.426009178161621

Calculate Fleet Age - PLANE_AGE
Elapsed Time: 4.430012464523315

Adding airport coordinates 

In [34]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_07.csv')
month07 = month_cleanup(df, aircraft, coords, names, weather, passengers, employees)

Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.358325719833374

Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 1.0839838981628418

Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.3031830787658691

Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.4132912158966064

Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.3431270122528076

Applying Carrier Names - CARRIER_NAME
Elapsed Time: 2.5603325366973877

Adding flight statistics for carrier and airport - AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH
Elapsed Time: 2.851588010787964

Adding passenger statistics for carrier and airport - AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE
Elapsed Time: 4.348947525024414

Adding employee statistics for carrier - FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS
Elapsed Time: 4.602184772491455

Calculate Fleet Age - PLANE_AGE
Elapsed Time: 4.6061882972717285

Adding airport coordinates 

In [35]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_08.csv')
month08 = month_cleanup(df, aircraft, coords, names, weather, passengers, employees)

Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.35831689834594727

Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 1.0839753150939941

Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.30318284034729

Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.4102802276611328

Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.3331098556518555

Applying Carrier Names - CARRIER_NAME
Elapsed Time: 2.553309440612793

Adding flight statistics for carrier and airport - AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH
Elapsed Time: 2.8545820713043213

Adding passenger statistics for carrier and airport - AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE
Elapsed Time: 4.3569464683532715

Adding employee statistics for carrier - FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS
Elapsed Time: 4.614187240600586

Calculate Fleet Age - PLANE_AGE
Elapsed Time: 4.618183374404907

Adding airport coordinates 

In [36]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_09.csv')
month09 = month_cleanup(df, aircraft, coords, names, weather, passengers, employees)

Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.35933351516723633

Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 1.0819823741912842

Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.30318284034729

Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.4142839908599854

Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.336642265319824

Applying Carrier Names - CARRIER_NAME
Elapsed Time: 2.559844970703125

Adding flight statistics for carrier and airport - AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH
Elapsed Time: 2.8661227226257324

Adding passenger statistics for carrier and airport - AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE
Elapsed Time: 4.3875041007995605

Adding employee statistics for carrier - FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS
Elapsed Time: 4.643736124038696

Calculate Fleet Age - PLANE_AGE
Elapsed Time: 4.648740530014038

Adding airport coordinates -

In [37]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_10.csv')
month10 = month_cleanup(df, aircraft, coords, names, weather, passengers, employees)

Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.3433113098144531

Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 1.0439558029174805

Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.2551472187042236

Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.365238904953003

Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.2600507736206055

Applying Carrier Names - CARRIER_NAME
Elapsed Time: 2.477248430252075

Adding flight statistics for carrier and airport - AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH
Elapsed Time: 2.7775208950042725

Adding passenger statistics for carrier and airport - AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE
Elapsed Time: 4.240857362747192

Adding employee statistics for carrier - FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS
Elapsed Time: 4.490075349807739

Calculate Fleet Age - PLANE_AGE
Elapsed Time: 4.494079113006592

Adding airport coordinates -

In [38]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_11.csv')
month11 = month_cleanup(df, aircraft, coords, names, weather, passengers, employees)

Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.31828904151916504

Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.9818828105926514

Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.184067726135254

Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.2851667404174805

Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.140934944152832

Applying Carrier Names - CARRIER_NAME
Elapsed Time: 2.345120429992676

Adding flight statistics for carrier and airport - AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH
Elapsed Time: 2.6123712062835693

Adding passenger statistics for carrier and airport - AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE
Elapsed Time: 4.0146355628967285

Adding employee statistics for carrier - FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS
Elapsed Time: 4.248848915100098

Calculate Fleet Age - PLANE_AGE
Elapsed Time: 4.252851963043213

Adding airport coordinates 

In [39]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_12.csv')
month12 = month_cleanup(df, aircraft, coords, names, weather, passengers, employees)

Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.33530402183532715

Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 1.0269317626953125

Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.238130807876587

Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.3432273864746094

Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.222025156021118

Applying Carrier Names - CARRIER_NAME
Elapsed Time: 2.426210403442383

Adding flight statistics for carrier and airport - AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH
Elapsed Time: 2.713463068008423

Adding passenger statistics for carrier and airport - AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE
Elapsed Time: 4.171786308288574

Adding employee statistics for carrier - FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS
Elapsed Time: 4.4180097579956055

Calculate Fleet Age - PLANE_AGE
Elapsed Time: 4.422014474868774

Adding airport coordinates -

In [40]:
# COMBINE MASTER FILE

combined = pd.concat([month01, month02, month03, month04, month05, month06, month07, month08, month09, month10, month11, month12])
combined.to_pickle("data/pkl/train_test.pkl")
combined.to_csv('data/train_test.csv', index=False)

## Process Test Set

### Weather Data

In [41]:
weather_report2 = pd.read_csv('data/raw_data/airport_weather_2020.csv')
weather_merge2 = pd.merge(cities, weather_report2, how='left', on='NAME')
weather2 = weather_merge2[['DATE', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'AWND', 'ORIGIN_AIRPORT_ID']]

In [42]:
weather2.drop(weather2.loc[weather2['ORIGIN_AIRPORT_ID'].isna()].index, axis=0, inplace=True)
weather2['TMAX'].fillna(round(weather2.groupby('ORIGIN_AIRPORT_ID')['TMAX'].transform('mean'), 1), inplace=True)
weather2['AWND'].fillna(round(weather2.groupby('ORIGIN_AIRPORT_ID')['AWND'].transform('mean'), 1), inplace=True)
weather2.fillna(0, inplace=True)
weather2['DATE'] = pd.to_datetime(weather2['DATE'])
weather2['MONTH'] = pd.DatetimeIndex(weather2['DATE']).month
weather2['DAY_OF_MONTH'] = pd.DatetimeIndex(weather2['DATE']).day
weather2

Unnamed: 0,DATE,PRCP,SNOW,SNWD,TMAX,AWND,ORIGIN_AIRPORT_ID,MONTH,DAY_OF_MONTH
0,2020-01-01,0.00,0.0,0.0,50.0,4.92,12992,1,1
1,2020-01-02,0.50,0.0,0.0,53.0,6.04,12992,1,2
2,2020-01-03,0.18,0.0,0.0,56.0,4.03,12992,1,3
3,2020-01-04,0.00,0.0,0.0,50.0,4.03,12992,1,4
4,2020-01-05,0.00,0.0,0.0,65.0,2.91,12992,1,5
...,...,...,...,...,...,...,...,...,...
8370,2020-03-27,0.00,0.0,0.0,50.0,6.04,10713,3,27
8371,2020-03-28,0.00,0.0,0.0,57.0,4.70,10713,3,28
8372,2020-03-29,0.10,0.0,0.0,55.0,7.38,10713,3,29
8373,2020-03-30,0.03,0.0,0.0,54.0,11.41,10713,3,30


In [43]:
passengers2 = pd.read_csv('data/raw_data/T3_AIR_CARRIER_SUMMARY_AIRPORT_ACTIVITY_2020.csv')
passengers2

Unnamed: 0,OP_UNIQUE_CARRIER,CARRIER_NAME,ORIGIN_AIRPORT_ID,SERVICE_CLASS,REV_ACRFT_DEP_PERF_510,REV_PAX_ENP_110
0,04Q,Tradewind Aviation,13535,K,20,105.0
1,04Q,Tradewind Aviation,15024,K,9,38.0
2,04Q,Tradewind Aviation,13987,K,1,2.0
3,04Q,Tradewind Aviation,14843,K,626,3553.0
4,04Q,Tradewind Aviation,12197,K,21,109.0
...,...,...,...,...,...,...
6253,ZW,Air Wisconsin Airlines Corp,11721,K,119,4463.0
6254,ZW,Air Wisconsin Airlines Corp,10469,K,160,5095.0
6255,ZW,Air Wisconsin Airlines Corp,12884,K,159,5165.0
6256,ZW,Air Wisconsin Airlines Corp,15380,K,118,4011.0


In [47]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_2020_01.csv')
test01 = month_cleanup(df, aircraft, coords, names, weather2, passengers2, employees)
test01.to_pickle("data/pkl/new_data.pkl")
test01.to_csv("data/new_data_small.csv", index=False)

Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.3262960910797119

Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.9959044456481934

Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.1960856914520264

Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.2981784343719482

Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.1659748554229736

Applying Carrier Names - CARRIER_NAME
Elapsed Time: 2.387174606323242

Adding flight statistics for carrier and airport - AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH
Elapsed Time: 2.6824350357055664

Adding passenger statistics for carrier and airport - AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE
Elapsed Time: 4.114735126495361

Adding employee statistics for carrier - FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS
Elapsed Time: 4.352950811386108

Calculate Fleet Age - PLANE_AGE
Elapsed Time: 4.356954574584961

Adding airport coordinates 

In [45]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_2020_02.csv')
test02 = month_cleanup(df, aircraft, coords, names, weather2, passengers2, employees)

Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.3272981643676758

Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.9738764762878418

Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.1650500297546387

Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.264148473739624

Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.09489369392395

Applying Carrier Names - CARRIER_NAME
Elapsed Time: 2.3020823001861572

Adding flight statistics for carrier and airport - AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH
Elapsed Time: 2.599360227584839

Adding passenger statistics for carrier and airport - AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE
Elapsed Time: 4.01763916015625

Adding employee statistics for carrier - FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS
Elapsed Time: 4.250858306884766

Calculate Fleet Age - PLANE_AGE
Elapsed Time: 4.2558629512786865

Adding airport coordinates - L

In [46]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_2020_03.csv')
test03 = month_cleanup(df, aircraft, coords, names, weather2, passengers2, employees)

Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.3673422336578369

Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.976886510848999

Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.1630558967590332

Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.2551395893096924

Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.0568673610687256

Applying Carrier Names - CARRIER_NAME
Elapsed Time: 2.241034507751465

Adding flight statistics for carrier and airport - AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH
Elapsed Time: 2.517293691635132

Adding passenger statistics for carrier and airport - AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE
Elapsed Time: 3.8655176162719727

Adding employee statistics for carrier - FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS
Elapsed Time: 4.087710618972778

Calculate Fleet Age - PLANE_AGE
Elapsed Time: 4.091713905334473

Adding airport coordinates -

In [48]:
# COMBINE FILE
test = pd.concat([test01, test02, test03])
test.to_pickle("data/pkl/test.pkl")
test.to_csv('data/new_data.csv', index=False)

In [None]:
break

In [47]:
combined

Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DISTANCE_GROUP,DEP_BLOCK,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,CARRIER_NAME,AIRPORT_FLIGHTS_MONTH,...,PLANE_AGE,DEPARTING_AIRPORT,LATITUDE,LONGITUDE,PREVIOUS_AIRPORT,PRCP,SNOW,SNWD,TMAX,AWND
0,1,7,0,2,MORNING,1,109,143,Southwest Airlines Co.,13056,...,8,McCarran International,36.080,-115.152,NONE,0.00,0.0,0.0,65.0,2.91
1,1,7,0,7,MORNING,1,109,191,Delta Air Lines Inc.,13056,...,3,McCarran International,36.080,-115.152,NONE,0.00,0.0,0.0,65.0,2.91
2,1,7,0,7,MORNING,1,109,199,Delta Air Lines Inc.,13056,...,18,McCarran International,36.080,-115.152,NONE,0.00,0.0,0.0,65.0,2.91
3,1,7,0,9,MORNING,1,109,180,Delta Air Lines Inc.,13056,...,2,McCarran International,36.080,-115.152,NONE,0.00,0.0,0.0,65.0,2.91
4,1,7,0,7,EARLY_MORNING,1,10,182,Spirit Air Lines,13056,...,1,McCarran International,36.080,-115.152,NONE,0.00,0.0,0.0,65.0,2.91
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551891,12,7,0,1,LATE_NIGHT,11,6,123,Hawaiian Airlines Inc.,1318,...,18,Lihue Airport,21.979,-159.346,Honolulu International,0.06,0.0,0.0,84.0,15.21
551892,12,7,0,1,EVENING,11,8,123,Hawaiian Airlines Inc.,1318,...,16,Lihue Airport,21.979,-159.346,Honolulu International,0.06,0.0,0.0,84.0,15.21
551893,12,7,0,1,EVENING,11,8,123,Hawaiian Airlines Inc.,1318,...,18,Lihue Airport,21.979,-159.346,Honolulu International,0.06,0.0,0.0,84.0,15.21
551894,12,7,0,1,LATE_NIGHT,12,6,123,Hawaiian Airlines Inc.,1318,...,18,Lihue Airport,21.979,-159.346,Honolulu International,0.06,0.0,0.0,84.0,15.21


In [48]:
test

Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DISTANCE_GROUP,DEP_BLOCK,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,CARRIER_NAME,AIRPORT_FLIGHTS_MONTH,...,PLANE_AGE,DEPARTING_AIRPORT,LATITUDE,LONGITUDE,PREVIOUS_AIRPORT,PRCP,SNOW,SNWD,TMAX,AWND
0,1,1,0,1,MORNING,1,49,66,Mesa Airlines Inc.,5021,...,17,Raleigh-Durham International,35.875,-78.782,NONE,0.46,0.0,0.0,67.0,5.37
1,1,1,0,3,EARLY_MORNING,1,8,128,American Airlines Inc.,5021,...,19,Raleigh-Durham International,35.875,-78.782,NONE,0.46,0.0,0.0,67.0,5.37
2,1,1,0,2,MORNING,1,49,99,American Airlines Inc.,5021,...,11,Raleigh-Durham International,35.875,-78.782,NONE,0.46,0.0,0.0,67.0,5.37
3,1,1,0,1,EARLY_MORNING,1,8,99,American Airlines Inc.,5021,...,11,Raleigh-Durham International,35.875,-78.782,NONE,0.46,0.0,0.0,67.0,5.37
4,1,1,0,3,MORNING,1,49,143,Southwest Airlines Co.,5021,...,8,Raleigh-Durham International,35.875,-78.782,NONE,0.46,0.0,0.0,67.0,5.37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437694,3,1,0,1,AFTERNOON,8,17,123,Hawaiian Airlines Inc.,1258,...,18,Keahole,19.739,-156.046,Honolulu International,0.00,0.0,0.0,83.0,8.72
437695,3,1,0,1,AFTERNOON,9,17,123,Hawaiian Airlines Inc.,1258,...,18,Keahole,19.739,-156.046,Honolulu International,0.00,0.0,0.0,83.0,8.72
437696,3,1,0,1,AFTERNOON,10,17,123,Hawaiian Airlines Inc.,1258,...,15,Keahole,19.739,-156.046,Kahului Airport,0.00,0.0,0.0,83.0,8.72
437697,3,1,0,1,EVENING,12,6,123,Hawaiian Airlines Inc.,1258,...,18,Keahole,19.739,-156.046,Honolulu International,0.00,0.0,0.0,83.0,8.72


In [49]:
month01

Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DISTANCE_GROUP,DEP_BLOCK,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,CARRIER_NAME,AIRPORT_FLIGHTS_MONTH,...,PLANE_AGE,DEPARTING_AIRPORT,LATITUDE,LONGITUDE,PREVIOUS_AIRPORT,PRCP,SNOW,SNWD,TMAX,AWND
0,1,7,0,2,MORNING,1,109,143,Southwest Airlines Co.,13056,...,8,McCarran International,36.080,-115.152,NONE,0.00,0.0,0.0,65.0,2.91
1,1,7,0,7,MORNING,1,109,191,Delta Air Lines Inc.,13056,...,3,McCarran International,36.080,-115.152,NONE,0.00,0.0,0.0,65.0,2.91
2,1,7,0,7,MORNING,1,109,199,Delta Air Lines Inc.,13056,...,18,McCarran International,36.080,-115.152,NONE,0.00,0.0,0.0,65.0,2.91
3,1,7,0,9,MORNING,1,109,180,Delta Air Lines Inc.,13056,...,2,McCarran International,36.080,-115.152,NONE,0.00,0.0,0.0,65.0,2.91
4,1,7,0,7,EARLY_MORNING,1,10,182,Spirit Air Lines,13056,...,1,McCarran International,36.080,-115.152,NONE,0.00,0.0,0.0,65.0,2.91
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502045,1,2,0,1,EVENING,11,6,123,Hawaiian Airlines Inc.,2092,...,18,Kahului Airport,20.901,-156.434,Honolulu International,0.01,0.0,0.0,79.0,11.86
502046,1,2,0,1,EVENING,11,6,123,Hawaiian Airlines Inc.,2092,...,18,Kahului Airport,20.901,-156.434,Honolulu International,0.01,0.0,0.0,79.0,11.86
502047,1,2,0,1,LATE_NIGHT,12,12,123,Hawaiian Airlines Inc.,2092,...,19,Kahului Airport,20.901,-156.434,Honolulu International,0.01,0.0,0.0,79.0,11.86
502048,1,2,0,1,LATE_NIGHT,13,12,123,Hawaiian Airlines Inc.,2092,...,18,Kahului Airport,20.901,-156.434,Honolulu International,0.01,0.0,0.0,79.0,11.86


## DEPRECATED

In [None]:
    ## create a mask of our delay conditions and apply to new delay field
    #print("\nAdd delay type - DELAY")
    #conditions = [
    #    (monthly_data['CARRIER_DELAY'] > 0),
    #    (monthly_data['WEATHER_DELAY'] > 0),
    #    (monthly_data['NAS_DELAY'] > 0),
    #    (monthly_data['LATE_AIRCRAFT_DELAY'] > 0)
    #    ]
    ## create a list of the values we want to assign for each condition
    #values = ['CARRIER', 'WEATHER', 'NAS', 'LATE_AIRCRAFT']
    ## create a new column and use np.select to assign values to it using our lists as arguments
    #monthly_data['DELAY'] = np.select(conditions, values)
    #print(f'Elapsed Time: {time.time() - start}')

In [None]:
# Load daily weather information
weather = pd.read_pickle('data/weather/weather.pkl')
weather['DATE'] = pd.to_datetime(weather['DATE'])
weather['MONTH'] = pd.DatetimeIndex(weather['DATE']).month
weather['DAY_OF_MONTH'] = pd.DatetimeIndex(weather['DATE']).day
weather

In [None]:
# Get Airports List
airportcounts = combined.drop_duplicates(subset=['DISPLAY_AIRPORT_NAME', 'ORIGIN_CITY_NAME'])
airports = airportcounts[['ORIGIN_AIRPORT_ID', 'DISPLAY_AIRPORT_NAME', 'ORIGIN_CITY_NAME']]
airports.to_csv('data/weather/airports_list_raw.csv')

airports