# Project Overview

## OSEMN Pipeline

* O - Obtaining our data
* S - Scrubbing/Cleaning our data
* E - Exploring/Visualizing our data - in EMN_modeling
* M - Modeling our data - in EMN_modeling
* N - Interpreting the data - in EMN_modeling

## Notebook Preparation

In [1]:
import pandas as pd
import numpy as np
import time

import warnings
warnings.filterwarnings('ignore')

# Obtaining our Data

In [2]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_01.csv')
df.shape

(583985, 33)

In [3]:
df

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 32
0,1,6,7,9E,N8694A,3280,10397,ATL,"Atlanta, GA",11150,...,47.0,37.0,83.0,1,,,,,,
1,1,7,1,9E,N8970D,3280,10397,ATL,"Atlanta, GA",11150,...,47.0,32.0,83.0,1,,,,,,
2,1,8,2,9E,N820AY,3280,10397,ATL,"Atlanta, GA",11150,...,47.0,39.0,83.0,1,,,,,,
3,1,9,3,9E,N840AY,3280,10397,ATL,"Atlanta, GA",11150,...,47.0,37.0,83.0,1,,,,,,
4,1,10,4,9E,N8969A,3280,10397,ATL,"Atlanta, GA",11150,...,47.0,41.0,83.0,1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
583980,1,30,3,UA,N819UA,2024,14683,SAT,"San Antonio, TX",12266,...,61.0,47.0,191.0,1,,,,,,
583981,1,30,3,UA,N37462,2022,14843,SJU,"San Juan, PR",12264,...,234.0,240.0,1571.0,7,,,,,,
583982,1,30,3,UA,N37462,2021,12264,IAD,"Washington, DC",14843,...,226.0,233.0,1571.0,7,,,,,,
583983,1,30,3,UA,N26967,2020,12266,IAH,"Houston, TX",14771,...,252.0,231.0,1635.0,7,11.0,0.0,0.0,0.0,11.0,


In [4]:
df.memory_usage().sum()

154172168

In [5]:
df.dtypes

MONTH                    int64
DAY_OF_MONTH             int64
DAY_OF_WEEK              int64
OP_UNIQUE_CARRIER       object
TAIL_NUM                object
OP_CARRIER_FL_NUM        int64
ORIGIN_AIRPORT_ID        int64
ORIGIN                  object
ORIGIN_CITY_NAME        object
DEST_AIRPORT_ID          int64
DEST                    object
DEST_CITY_NAME          object
CRS_DEP_TIME             int64
DEP_TIME               float64
DEP_DELAY_NEW          float64
DEP_DEL15              float64
DEP_TIME_BLK            object
CRS_ARR_TIME             int64
ARR_TIME               float64
ARR_DELAY_NEW          float64
ARR_TIME_BLK            object
CANCELLED              float64
CANCELLATION_CODE       object
CRS_ELAPSED_TIME       float64
ACTUAL_ELAPSED_TIME    float64
DISTANCE               float64
DISTANCE_GROUP           int64
CARRIER_DELAY          float64
WEATHER_DELAY          float64
NAS_DELAY              float64
SECURITY_DELAY         float64
LATE_AIRCRAFT_DELAY    float64
Unnamed:

In [6]:
df.describe()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,DEST_AIRPORT_ID,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 32
count,583985.0,583985.0,583985.0,583985.0,583985.0,583985.0,583985.0,567633.0,567630.0,567630.0,...,583851.0,565963.0,583985.0,583985.0,105222.0,105222.0,105222.0,105222.0,105222.0,0.0
mean,1.0,15.960088,3.835626,2537.869334,12659.701982,12659.470015,1326.266642,1331.957814,13.258226,0.174281,...,143.557401,138.610077,797.742767,3.664861,20.10608,4.428551,18.508392,0.079508,24.95059,
std,0.0,8.987942,1.921899,1821.736145,1519.405493,1519.336466,484.081,495.40402,47.50739,0.379351,...,73.216303,73.078565,589.999261,2.32389,65.762202,38.535323,41.726392,3.203342,50.851973,
min,1.0,1.0,1.0,1.0,10135.0,10135.0,1.0,1.0,0.0,0.0,...,20.0,16.0,31.0,1.0,0.0,0.0,0.0,0.0,0.0,
25%,1.0,8.0,2.0,979.0,11292.0,11292.0,917.0,921.0,0.0,0.0,...,90.0,85.0,363.0,2.0,0.0,0.0,0.0,0.0,0.0,
50%,1.0,16.0,4.0,2114.0,12889.0,12889.0,1320.0,1328.0,0.0,0.0,...,125.0,121.0,640.0,3.0,0.0,0.0,5.0,0.0,0.0,
75%,1.0,24.0,5.0,3902.0,13931.0,13931.0,1730.0,1738.0,5.0,0.0,...,175.0,170.0,1037.0,5.0,16.0,0.0,21.0,0.0,29.0,
max,1.0,31.0,7.0,7439.0,16218.0,16218.0,2359.0,2400.0,1651.0,1.0,...,703.0,737.0,4983.0,11.0,1638.0,1416.0,1447.0,816.0,1514.0,


In [7]:
df.isna().sum()

MONTH                       0
DAY_OF_MONTH                0
DAY_OF_WEEK                 0
OP_UNIQUE_CARRIER           0
TAIL_NUM                 2543
OP_CARRIER_FL_NUM           0
ORIGIN_AIRPORT_ID           0
ORIGIN                      0
ORIGIN_CITY_NAME            0
DEST_AIRPORT_ID             0
DEST                        0
DEST_CITY_NAME              0
CRS_DEP_TIME                0
DEP_TIME                16352
DEP_DELAY_NEW           16355
DEP_DEL15               16355
DEP_TIME_BLK                0
CRS_ARR_TIME                0
ARR_TIME                17061
ARR_DELAY_NEW           18022
ARR_TIME_BLK                0
CANCELLED                   0
CANCELLATION_CODE      567259
CRS_ELAPSED_TIME          134
ACTUAL_ELAPSED_TIME     18022
DISTANCE                    0
DISTANCE_GROUP              0
CARRIER_DELAY          478763
WEATHER_DELAY          478763
NAS_DELAY              478763
SECURITY_DELAY         478763
LATE_AIRCRAFT_DELAY    478763
Unnamed: 32            583985
dtype: int

# Scrubbing/Cleaning our Data

## TO DO

Weighted airport. Weight airport on fail rate ? Look up target encoding on classification here and see what's valid


target encoding - Weight airport on how many times it shows up for "busy-ness" statistic. Might correlate with delays?


target encoding - Weight airline on how many times it shows up for "busy-ness" statistic


Find brute force way to calculate liklihood of delay from previous segment (airline + airport ?)


Get exact passengers through airport on specific day (check BTS)? Rank into busy segments


Find way to incorporate liklihood that INCOMING flight on previous segment was delayed


Avg flight distance from airport


Age of fleet


Fare?


Gates at airport

OG airport out of dataset "small airport"

employees


Add more airports to get 95% of departures



In [8]:
passengers = pd.read_csv('data/passengers_airport_airline.csv')
passengers

Unnamed: 0,PASSENGERS,UNIQUE_CARRIER,AIRLINE_ID,UNIQUE_CARRIER_NAME,ORIGIN_AIRPORT_ID,DEST_AIRPORT_ID,MONTH
0,0,ZW,20046,Air Wisconsin Airlines Corp,11977,13342,1
1,0,5V,20408,Tatonduk Outfitters Limited d/b/a Everts Air A...,10245,10299,1
2,0,5V,20408,Tatonduk Outfitters Limited d/b/a Everts Air A...,10247,11630,1
3,0,5V,20408,Tatonduk Outfitters Limited d/b/a Everts Air A...,10299,10245,1
4,0,5V,20408,Tatonduk Outfitters Limited d/b/a Everts Air A...,10299,10304,1
...,...,...,...,...,...,...,...
251764,96984,AS,19930,Alaska Airlines Inc.,10299,14747,7
251765,97011,AS,19930,Alaska Airlines Inc.,14747,10299,6
251766,97098,AS,19930,Alaska Airlines Inc.,10299,14747,8
251767,97329,DL,19790,Delta Air Lines Inc.,13204,10397,3


In [10]:
# Load airplane info so we can get seat count
aircraft = pd.read_csv("data/aircraft_type_by_tail_number.csv")
aircraft.drop(columns=['MANUFACTURER', 'MODEL', 'Unnamed: 4'], axis=1, inplace=True)
aircraft

Unnamed: 0,TAIL_NUMBER,NUMBER_OF_SEATS
0,B708RE,9.0
1,CFEXB,0.0
2,CFEXF,0.0
3,CFEXH,0.0
4,CFEXI,0.0
...,...,...
7378,ZS-JIV,0.0
7379,ZS-OPS,0.0
7380,ZS-RSC,0.0
7381,ZS-RSF,0.0


In [14]:
# load coordinates of airports
coords = pd.read_csv('data/airport_coordinates.csv')
coords.drop_duplicates(subset='ORIGIN_AIRPORT_ID', inplace=True)
coords

Unnamed: 0,ORIGIN_AIRPORT_ID,DISPLAY_AIRPORT_NAME,LATITUDE,LONGITUDE
0,10001,Afognak Lake Airport,58.109444,-152.906667
1,10003,Bear Creek Mining Strip,65.548056,-161.071667
2,10004,Lik Mining Camp,68.083333,-163.166667
3,10005,Little Squaw Airport,67.570000,-148.183889
4,10006,Kizhuyak Bay,57.745278,-152.882778
...,...,...,...,...
18128,16908,Deer Park Airport,47.966944,-117.428611
18129,16909,South Texas International at Edinburg,26.441667,-98.122222
18130,16910,Louisa County Freeman Field,38.009722,-77.970000
18131,16911,Caldwell Industrial,43.641944,-116.635833


In [12]:
# Load proper names of carriers
names = pd.read_csv("data/carrier_names.csv")
names.drop_duplicates(inplace=True)
names

Unnamed: 0,OP_UNIQUE_CARRIER,UNIQUE_CARRIER_NAME
0,02Q,Titan Airways
1,04Q,Tradewind Aviation
2,05Q,"Comlux Aviation, AG"
3,06Q,Master Top Linhas Aereas Ltd.
4,07Q,Flair Airlines Ltd.
...,...,...
2699,ZW,Air Wisconsin Airlines Corp
2704,ZX,Air Georgian
2705,ZX (1),Airbc Ltd.
2706,ZY,Atlantic Gulf Airlines


In [13]:
# Load daily weather information
weather = pd.read_pickle('data/weather/weather.pkl')
weather['DATE'] = pd.to_datetime(weather['DATE'])
weather['MONTH'] = pd.DatetimeIndex(weather['DATE']).month
weather['DAY_OF_MONTH'] = pd.DatetimeIndex(weather['DATE']).day
weather

Unnamed: 0,DATE,PRCP,SNOW,SNWD,TMAX,TMIN,AWND,ORIGIN_CITY_NAME,MONTH,DAY_OF_MONTH
0,2019-01-01,0.14,0.0,0.0,66.0,57.0,4.70,"Atlanta, GA",1,1
1,2019-01-02,0.57,0.0,0.0,59.0,49.0,4.92,"Atlanta, GA",1,2
2,2019-01-03,0.15,0.0,0.0,55.0,51.0,5.37,"Atlanta, GA",1,3
3,2019-01-04,1.44,0.0,0.0,66.0,45.0,12.08,"Atlanta, GA",1,4
4,2019-01-05,0.00,0.0,0.0,59.0,44.0,13.42,"Atlanta, GA",1,5
...,...,...,...,...,...,...,...,...,...,...
16781,2019-12-27,0.00,0.0,0.0,57.0,32.0,2.01,"Washington, DC",12,27
16782,2019-12-28,0.00,0.0,0.0,65.0,32.0,1.57,"Washington, DC",12,28
16783,2019-12-29,0.45,0.0,0.0,52.0,40.0,3.13,"Washington, DC",12,29
16784,2019-12-30,0.18,0.0,0.0,63.0,45.0,4.92,"Washington, DC",12,30


In [27]:
def month_cleanup(monthly_data, aircraft, coords, names, weather):
    
    start = time.time()
    
    # drop rows with no departure time, tail number, or were cancelled
    print("Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.")
    monthly_data.drop(monthly_data.loc[monthly_data['DEP_TIME'].isna()].index, axis=0, inplace=True)
    monthly_data.drop(monthly_data.loc[monthly_data['TAIL_NUM'].isna()].index, axis=0, inplace=True)
    monthly_data.drop(monthly_data.loc[monthly_data['CANCELLED']==1].index, axis=0, inplace=True)
    print(f'Elapsed Time: {time.time() - start}')
   
    
    # Create time blocks for departure for cleaner categories
    print("\nCreating Departure Time Blocks - DEP_BLK")
    monthly_data.loc[(monthly_data['DEP_TIME_BLK']=='2100-2159') | (monthly_data['DEP_TIME_BLK']=='2200-2259') | (monthly_data['DEP_TIME_BLK']=='2300-2359'), 'DEP_BLOCK'] = 'LATE_NIGHT'
    monthly_data.loc[(monthly_data['DEP_TIME_BLK']=='0001-0559'), 'DEP_BLOCK'] = 'EARLY_MORNING'      
    monthly_data.loc[(monthly_data['DEP_TIME_BLK']=='0600-0659') | (monthly_data['DEP_TIME_BLK']=='0700-0759') | (monthly_data['DEP_TIME_BLK']=='0800-0859') | (monthly_data['DEP_TIME_BLK']=='0900-0959'), 'DEP_BLOCK'] = 'MORNING'
    monthly_data.loc[(monthly_data['DEP_TIME_BLK']=='1000-1059') | (monthly_data['DEP_TIME_BLK']=='1100-1159') | (monthly_data['DEP_TIME_BLK']=='1200-1259'), 'DEP_BLOCK'] = 'MIDDAY'
    monthly_data.loc[(monthly_data['DEP_TIME_BLK']=='1300-1359') | (monthly_data['DEP_TIME_BLK']=='1400-1459') | (monthly_data['DEP_TIME_BLK']=='1500-1559') | (monthly_data['DEP_TIME_BLK']=='1600-1659'), 'DEP_BLOCK'] = 'AFTERNOON'
    monthly_data.loc[(monthly_data['DEP_TIME_BLK']=='1700-1759') | (monthly_data['DEP_TIME_BLK']=='1800-1859') | (monthly_data['DEP_TIME_BLK']=='1900-1959') | (monthly_data['DEP_TIME_BLK']=='2000-2059') , 'DEP_BLOCK'] = 'EVENING'
    print(f'Elapsed Time: {time.time() - start}')
   
    
    # List flight segment number for daily flight segments
    print("\nAdding Flight Number Sequence - SEGMENT_NUMBER")
    monthly_data["SEGMENT_NUMBER"] = monthly_data.groupby(["OP_UNIQUE_CARRIER", 'DAY_OF_MONTH', 'OP_CARRIER_FL_NUM'])["DEP_TIME"].rank("dense", ascending=True)
    print(f'Elapsed Time: {time.time() - start}') 
    
   
    # Listing the concurrent flights at the airport in the time block 
    print("\nAdding Concurrent Flights - CONCURRENT_FLIGHTS")
    monthly_data['CONCURRENT_FLIGHTS'] = monthly_data.groupby(['ORIGIN_AIRPORT_ID','DAY_OF_MONTH', 'DEP_BLOCK'])['OP_UNIQUE_CARRIER'].transform("count")
    print(f'Elapsed Time: {time.time() - start}')
 
    
    # Getting seat counts for each aircraft
    print("\nApplying seat counts to flights - NUMBER_OF_SEATS")   
    # Merge aircraft info with main frame on tail number
    monthly_data = df.merge(aircraft, how="inner", left_on='TAIL_NUM', right_on='TAIL_NUMBER')
    # drop any entries that didn't match to a tail number
    monthly_data.drop(columns=['TAIL_NUM', 'TAIL_NUMBER'], axis=1, inplace=True)
    # simplify data type of number of seats to reduce memory usage
    monthly_data['NUMBER_OF_SEATS'] = monthly_data['NUMBER_OF_SEATS'].astype('int16')
    print(f'Elapsed Time: {time.time() - start}')
    
    
    # Add monthly flight statistics for carrier and airport
    print("Adding flight statistics for carrier and airport")
    monthly_data['AIRPORT_FLIGHTS_MONTH'] = monthly_data.groupby(['DISPLAY_AIRPORT_NAME'])['ORIGIN_CITY_NAME'].transform('count')
    monthly_data['AIRLINE_AIRPORT_FLIGHTS_MONTH'] = monthly_data.groupby(['UNIQUE_CARRIER_NAME', 'DISPLAY_AIRPORT_NAME'])['ORIGIN_CITY_NAME'].transform('count')
    print(f'Elapsed Time: {time.time() - start}')
    
    
    # Drop airports below the 10th percentile
    print("Dropping bottom 10% of airports")
    monthly_data.drop(monthly_data.loc[monthly_data['AIRPORT_FLIGHTS_MONTH'] < 1100].index, axis=0, inplace=True)
    print(f'Elapsed Time: {time.time() - start}')
    
    
    # Merge proper carrier name
    print("\nApplying Carrier Names - UNIQUE_CARRIER_NAME")  
    monthly_data = pd.merge(monthly_data, names, how='left', on=['OP_UNIQUE_CARRIER'])
    print(f'Elapsed Time: {time.time() - start}') 
    
    
    # Merge weather data
    print("\nAdding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND")
    monthly_data = pd.merge(monthly_data, weather, how='left', on=['ORIGIN_CITY_NAME', 'MONTH', 'DAY_OF_MONTH'])
    print(f'Elapsed Time: {time.time() - start}')
    
    
    # Merge airport coordinates
    print("\nAdding airport coordinates - LATITUDE, LONGITUDE")
    monthly_data = pd.merge(monthly_data, coords, how='left', on=['ORIGIN_AIRPORT_ID'])
    monthly_data['LATITUDE'] = round(monthly_data['LATITUDE'], 3)
    monthly_data['LONGITUDE'] = round(monthly_data['LONGITUDE'], 3)
    print(f'Elapsed Time: {time.time() - start}')

    
    # drop columns that we won't use
    print("\nClean up unneeded columns")
    monthly_data.drop(columns = ['ORIGIN',  'DEST',  
                   'CRS_DEP_TIME', 'DEP_DELAY_NEW', 'CRS_ARR_TIME', 'ARR_TIME', 
                   'CANCELLED', 'CANCELLATION_CODE', 'CRS_ELAPSED_TIME', 'DISTANCE',
                   'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
                  'ARR_DELAY_NEW', 'Unnamed: 32', 'DEP_TIME_BLK', 'ARR_TIME_BLK', 'ACTUAL_ELAPSED_TIME',
                  'DEST_AIRPORT_ID', 'DEST_CITY_NAME', 'DAY_OF_MONTH', 'OP_CARRIER_FL_NUM', 'DEP_TIME', 'OP_UNIQUE_CARRIER',
                                'DATE', 'ORIGIN_AIRPORT_ID'],
        axis=1, inplace=True)
    print(f'Elapsed Time: {time.time() - start}') 
    
    
    # specify data types of various fields to reduce memory usage
    print("\nCleaning up data types")
    monthly_data['MONTH'] = monthly_data['MONTH'].astype('object')
    monthly_data['DAY_OF_WEEK'] = monthly_data['DAY_OF_WEEK'].astype('object')
    monthly_data['DEP_DEL15'] = monthly_data['DEP_DEL15'].astype('int8')
    monthly_data['DISTANCE_GROUP'] = monthly_data['DISTANCE_GROUP'].astype('int8')
    monthly_data['DEP_BLOCK'] = monthly_data['DEP_BLOCK'].astype('object')
    monthly_data['SEGMENT_NUMBER'] = monthly_data['SEGMENT_NUMBER'].astype('int8')
    print(f'Elapsed Time: {time.time() - start}')
    
    print("\nFINISHED")
    return monthly_data

In [28]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_01.csv')
final = month_cleanup(df, aircraft, coords, names, weather)
final.to_pickle("data/final_flights_month01.pkl")



Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.28426671028137207


Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.8377692699432373


Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.0399446487426758


Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.1130108833312988


Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.170971632003784


Applying Carrier Names - UNIQUE_CARRIER_NAME
Elapsed Time: 2.8335821628570557


Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
Elapsed Time: 3.0317535400390625


Adding airport coordinates - LATITUDE, LONGITUDE
Elapsed Time: 3.2669756412506104

Clean up unneeded columns
Elapsed Time: 3.396092653274536


Cleaning up data types
Elapsed Time: 3.4271209239959717

FINISHED


In [29]:
final

Unnamed: 0,MONTH,DAY_OF_WEEK,ORIGIN_CITY_NAME,DEP_DEL15,DISTANCE_GROUP,DEP_BLOCK,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,UNIQUE_CARRIER_NAME,PRCP,SNOW,SNWD,TMAX,TMIN,AWND,DISPLAY_AIRPORT_NAME,LATITUDE,LONGITUDE
0,1,7,"Atlanta, GA",0,1,AFTERNOON,1,280,50,Endeavor Air Inc.,0.00,0.0,0.0,69.0,43.0,6.49,Atlanta Municipal,33.641,-84.427
1,1,7,"Columbus, GA",0,1,EVENING,2,1,50,Endeavor Air Inc.,,,,,,,Columbus Metropolitan,32.516,-84.939
2,1,4,"Atlanta, GA",0,2,AFTERNOON,1,251,50,Endeavor Air Inc.,0.15,0.0,0.0,55.0,51.0,5.37,Atlanta Municipal,33.641,-84.427
3,1,4,"Charlottesville, VA",0,2,EVENING,2,3,50,Endeavor Air Inc.,,,,,,,Charlottesville Albemarle,38.141,-78.453
4,1,5,"Atlanta, GA",1,1,LATE_NIGHT,1,135,50,Endeavor Air Inc.,1.44,0.0,0.0,66.0,45.0,12.08,Atlanta Municipal,33.641,-84.427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536768,1,3,"Los Angeles, CA",1,10,LATE_NIGHT,1,88,276,United Air Lines Inc.,0.00,0.0,0.0,61.0,39.0,4.70,Los Angeles International,33.942,-118.408
536769,1,2,"Newark, NJ",0,11,AFTERNOON,1,85,276,United Air Lines Inc.,0.08,0.0,0.0,58.0,40.0,14.99,Newark Liberty International,40.696,-74.172
536770,1,2,"Washington, DC",0,10,MORNING,1,59,219,United Air Lines Inc.,0.00,0.0,0.0,62.0,40.0,12.30,Washington Dulles International,38.942,-77.458
536771,1,4,"Houston, TX",0,7,EVENING,1,122,219,United Air Lines Inc.,0.14,0.0,0.0,59.0,46.0,8.72,Houston Intercontinental,29.983,-95.340


In [31]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_02.csv')
final = month_cleanup(df, aircraft, coords, names, weather)
final.to_pickle("data/final_flights_month02.pkl")



Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.30428481101989746


Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.7987339496612549


Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.0039114952087402


Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.0699717998504639


Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.0258395671844482


Applying Carrier Names - UNIQUE_CARRIER_NAME
Elapsed Time: 2.6334002017974854


Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
Elapsed Time: 2.810561180114746


Adding airport coordinates - LATITUDE, LONGITUDE
Elapsed Time: 3.0237467288970947

Clean up unneeded columns
Elapsed Time: 3.146857976913452


Cleaning up data types
Elapsed Time: 3.175884246826172

FINISHED


In [32]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_03.csv')
final = month_cleanup(df, aircraft, coords, names, weather)
final.to_pickle("data/final_flights_month03.pkl")



Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.3142850399017334


Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.9138293266296387


Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.1290333271026611


Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.2086272239685059


Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.384695053100586


Applying Carrier Names - UNIQUE_CARRIER_NAME
Elapsed Time: 3.1353769302368164


Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
Elapsed Time: 3.349571466445923


Adding airport coordinates - LATITUDE, LONGITUDE
Elapsed Time: 3.6077992916107178

Clean up unneeded columns
Elapsed Time: 3.7649402618408203


Cleaning up data types
Elapsed Time: 3.8009729385375977

FINISHED


In [33]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_04.csv')
final = month_cleanup(df, aircraft, coords, names, weather)
final.to_pickle("data/final_flights_month04.pkl")



Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.30828022956848145


Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.8838026523590088


Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.0950028896331787


Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.1720645427703857


Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.3176088333129883


Applying Carrier Names - UNIQUE_CARRIER_NAME
Elapsed Time: 3.0382633209228516


Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
Elapsed Time: 3.2464609146118164


Adding airport coordinates - LATITUDE, LONGITUDE
Elapsed Time: 3.495687246322632

Clean up unneeded columns
Elapsed Time: 3.63981032371521


Cleaning up data types
Elapsed Time: 3.6768434047698975

FINISHED


In [34]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_05.csv')
final = month_cleanup(df, aircraft, coords, names, weather)
final.to_pickle("data/final_flights_month05.pkl")



Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.31828904151916504


Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.9238393306732178


Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.147050142288208


Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.2261219024658203


Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.446744203567505


Applying Carrier Names - UNIQUE_CARRIER_NAME
Elapsed Time: 3.2114386558532715


Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
Elapsed Time: 3.4306375980377197


Adding airport coordinates - LATITUDE, LONGITUDE
Elapsed Time: 3.700883388519287

Clean up unneeded columns
Elapsed Time: 3.8430120944976807


Cleaning up data types
Elapsed Time: 3.879044771194458

FINISHED


In [35]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_06.csv')
final = month_cleanup(df, aircraft, coords, names, weather)
final.to_pickle("data/final_flights_month06.pkl")



Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.32029080390930176


Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.9228465557098389


Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.1430463790893555


Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.224642276763916


Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.4307377338409424


Applying Carrier Names - UNIQUE_CARRIER_NAME
Elapsed Time: 3.187424898147583


Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
Elapsed Time: 3.398608922958374


Adding airport coordinates - LATITUDE, LONGITUDE
Elapsed Time: 3.6618473529815674

Clean up unneeded columns
Elapsed Time: 3.803976535797119


Cleaning up data types
Elapsed Time: 3.8390095233917236

FINISHED


In [36]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_07.csv')
final = month_cleanup(df, aircraft, coords, names, weather)
final.to_pickle("data/final_flights_month07.pkl")



Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.3343033790588379


Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.95186448097229


Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.1860771179199219


Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.267150640487671


Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.5257976055145264


Applying Carrier Names - UNIQUE_CARRIER_NAME
Elapsed Time: 3.3055055141448975


Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
Elapsed Time: 3.546724557876587


Adding airport coordinates - LATITUDE, LONGITUDE
Elapsed Time: 3.8119657039642334

Clean up unneeded columns
Elapsed Time: 3.959099531173706


Cleaning up data types
Elapsed Time: 3.9951322078704834

FINISHED


In [37]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_08.csv')
final = month_cleanup(df, aircraft, coords, names, weather)
final.to_pickle("data/final_flights_month08.pkl")



Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.33330297470092773


Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.9608814716339111


Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.1900811195373535


Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.27215576171875


Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.5463130474090576


Applying Carrier Names - UNIQUE_CARRIER_NAME
Elapsed Time: 3.351043701171875


Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
Elapsed Time: 3.582253932952881


Adding airport coordinates - LATITUDE, LONGITUDE
Elapsed Time: 3.853508710861206

Clean up unneeded columns
Elapsed Time: 4.008640766143799


Cleaning up data types
Elapsed Time: 4.050678968429565

FINISHED


In [38]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_09.csv')
final = month_cleanup(df, aircraft, coords, names, weather)
final.to_pickle("data/final_flights_month09.pkl")



Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.33030009269714355


Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.9628746509552002


Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.1920826435089111


Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.2741572856903076


Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.523291826248169


Applying Carrier Names - UNIQUE_CARRIER_NAME
Elapsed Time: 3.3100063800811768


Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
Elapsed Time: 3.5362117290496826


Adding airport coordinates - LATITUDE, LONGITUDE
Elapsed Time: 3.817466974258423

Clean up unneeded columns
Elapsed Time: 3.966602325439453


Cleaning up data types
Elapsed Time: 4.003636121749878

FINISHED


In [39]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_10.csv')
final = month_cleanup(df, aircraft, coords, names, weather)
final.to_pickle("data/final_flights_month10.pkl")



Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.3222835063934326


Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.9283459186553955


Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.1515486240386963


Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.231621265411377


Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.473740816116333


Applying Carrier Names - UNIQUE_CARRIER_NAME
Elapsed Time: 3.250454902648926


Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
Elapsed Time: 3.4776532649993896


Adding airport coordinates - LATITUDE, LONGITUDE
Elapsed Time: 3.7448952198028564

Clean up unneeded columns
Elapsed Time: 3.89102840423584


Cleaning up data types
Elapsed Time: 3.9290711879730225

FINISHED


In [40]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_11.csv')
final = month_cleanup(df, aircraft, coords, names, weather)
final.to_pickle("data/final_flights_month11.pkl")



Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.3122832775115967


Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.8858127593994141


Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.0950028896331787


Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.1710717678070068


Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.334623336791992


Applying Carrier Names - UNIQUE_CARRIER_NAME
Elapsed Time: 3.076305627822876


Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
Elapsed Time: 3.2854866981506348


Adding airport coordinates - LATITUDE, LONGITUDE
Elapsed Time: 3.5407187938690186

Clean up unneeded columns
Elapsed Time: 3.6758499145507812


Cleaning up data types
Elapsed Time: 3.709880828857422

FINISHED


In [41]:
df = pd.read_csv('data/raw_data/ONTIME_REPORTING_12.csv')
final = month_cleanup(df, aircraft, coords, names, weather)
final.to_pickle("data/final_flights_month12.pkl")



Dropping NaNs from Dep Time, Tail Num. Dropping Cancellations.
Elapsed Time: 0.30827951431274414


Creating Departure Time Blocks - DEP_BLK
Elapsed Time: 0.9108269214630127


Adding Flight Number Sequence - SEGMENT_NUMBER
Elapsed Time: 1.1340296268463135


Adding Concurrent Flights - CONCURRENT_FLIGHTS
Elapsed Time: 1.2151033878326416


Applying seat counts to flights - NUMBER_OF_SEATS
Elapsed Time: 2.418195962905884


Applying Carrier Names - UNIQUE_CARRIER_NAME
Elapsed Time: 3.168381929397583


Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
Elapsed Time: 3.3915843963623047


Adding airport coordinates - LATITUDE, LONGITUDE
Elapsed Time: 3.6578259468078613

Clean up unneeded columns
Elapsed Time: 3.799954891204834


Cleaning up data types
Elapsed Time: 3.8359880447387695

FINISHED


In [79]:
# LOAD FILE

month01 = pd.read_pickle("data/final_flights_month01.pkl")
month02 = pd.read_pickle("data/final_flights_month02.pkl")
month03 = pd.read_pickle("data/final_flights_month03.pkl")
month04 = pd.read_pickle("data/final_flights_month04.pkl")
month05 = pd.read_pickle("data/final_flights_month05.pkl")
month06 = pd.read_pickle("data/final_flights_month06.pkl")
month07 = pd.read_pickle("data/final_flights_month07.pkl")
month08 = pd.read_pickle("data/final_flights_month08.pkl")
month09 = pd.read_pickle("data/final_flights_month09.pkl")
month10 = pd.read_pickle("data/final_flights_month10.pkl")
month11 = pd.read_pickle("data/final_flights_month11.pkl")
month12 = pd.read_pickle("data/final_flights_month12.pkl")

combined = pd.concat([month01, month02, month03, month04, month05, month06, month07, month08, month09, month10, month11, month12])

In [None]:
final.to_pickle("data/final_flights.pkl")

In [None]:
final

In [122]:
# Get Airports List

airportcounts = pd.DataFrame(combined.groupby(['DISPLAY_AIRPORT_NAME', 'ORIGIN_CITY_NAME']).count().reset_index())
airports = airportcounts[['DISPLAY_AIRPORT_NAME', 'ORIGIN_CITY_NAME']]
airports.to_csv('data/airports_list.csv')

### Get Top 50 List

In [None]:
cities = pd.DataFrame(final['ORIGIN_CITY_NAME'].unique())
cities

In [None]:
cities.to_csv('data/weather/cities.csv')

## DEPRECATED

In [None]:
break

In [None]:
df.drop(df.loc[df['DEP_TIME'].isna()].index, axis=0, inplace=True)
df.drop(df.loc[df['TAIL_NUM'].isna()].index, axis=0, inplace=True)
df.drop(df.loc[df['CANCELLED']==1].index, axis=0, inplace=True)

In [None]:
df.loc[(df['DEP_TIME_BLK']=='2100-2159') | (df['DEP_TIME_BLK']=='2200-2259') | (df['DEP_TIME_BLK']=='2300-2359'), 'DEP_BLOCK'] = 'LATE_NIGHT'

df.loc[(df['DEP_TIME_BLK']=='0001-0559'), 'DEP_BLOCK'] = 'EARLY_MORNING'
       
df.loc[(df['DEP_TIME_BLK']=='0600-0659') | (df['DEP_TIME_BLK']=='0700-0759') | (df['DEP_TIME_BLK']=='0800-0859') | (df['DEP_TIME_BLK']=='0900-0959'), 'DEP_BLOCK'] = 'MORNING'

df.loc[(df['DEP_TIME_BLK']=='1000-1059') | (df['DEP_TIME_BLK']=='1100-1159') | (df['DEP_TIME_BLK']=='1200-1259'), 'DEP_BLOCK'] = 'MIDDAY'

df.loc[(df['DEP_TIME_BLK']=='1300-1359') | (df['DEP_TIME_BLK']=='1400-1459') | (df['DEP_TIME_BLK']=='1500-1559') | (df['DEP_TIME_BLK']=='1600-1659'), 'DEP_BLOCK'] = 'AFTERNOON'

df.loc[(df['DEP_TIME_BLK']=='1700-1759') | (df['DEP_TIME_BLK']=='1800-1859') | (df['DEP_TIME_BLK']=='1900-1959') | (df['DEP_TIME_BLK']=='2000-2059') , 'DEP_BLOCK'] = 'EVENING'


In [None]:
df

In [None]:
df.drop(columns = ['MONTH', 'ORIGIN',  'DEST',  
                   'CRS_DEP_TIME', 'DEP_DELAY_NEW', 'CRS_ARR_TIME', 'ARR_TIME', 
                   'CANCELLED', 'CANCELLATION_CODE', 'CRS_ELAPSED_TIME', 'DISTANCE',
                   'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
                  'ARR_DELAY_NEW', 'Unnamed: 32', 'DEP_TIME_BLK', 'ARR_TIME_BLK', 'ACTUAL_ELAPSED_TIME',
                  'DEST_AIRPORT_ID', 'DEST_CITY_NAME'],
        axis=1, inplace=True)

In [None]:
df.memory_usage().sum()

In [None]:
df.dtypes

In [None]:
df['DAY_OF_MONTH'] = df['DAY_OF_MONTH'].astype('int8')
df['DAY_OF_WEEK'] = df['DAY_OF_WEEK'].astype('object')
df['OP_CARRIER_FL_NUM'] = df['OP_CARRIER_FL_NUM'].astype('object')
#df['ORIGIN_AIRPORT_ID'] = df['ORIGIN_AIRPORT_ID'].astype('object')
#df['DEST_AIRPORT_ID'] = df['DEST_AIRPORT_ID'].astype('object')
df['DEP_DEL15'] = df['DEP_DEL15'].astype('int8')
df['DISTANCE_GROUP'] = df['DISTANCE_GROUP'].astype('int8')

#### Flight Sequence Order

In [None]:
carriers = list(df['OP_UNIQUE_CARRIER'].unique())
days = list(df['DAY_OF_MONTH'].unique())
flight_nums = list(df['OP_CARRIER_FL_NUM'].unique())

# TEMP STUFF
#carriers = ['WN']
#days = [1]

In [None]:
# apply sequence order to same-day flight number sequences

for carrier in carriers:
    print("Working on Carrier: {}".format(carrier))
    for day in days:
        print("Carrier {} Day {}".format(carrier, day))
        flights = df.loc[(df['OP_UNIQUE_CARRIER'] == carrier) & 
                  (df['DAY_OF_MONTH'] == day), 'OP_CARRIER_FL_NUM'].unique()
        print(flights)
        for flight in flights:
            sequence = df.loc[(df['OP_UNIQUE_CARRIER'] == carrier) & 
                  (df['DAY_OF_MONTH'] == day) & (df['OP_CARRIER_FL_NUM'] == flight)]
            df.loc[(df['OP_UNIQUE_CARRIER'] == carrier) & 
                  (df['DAY_OF_MONTH'] == day) & (df['OP_CARRIER_FL_NUM'] == flight), 
                      'sequence_rank'] = sequence['DEP_TIME'].rank()

            
df.loc[(df['OP_UNIQUE_CARRIER'] == carrier) & (df['DAY_OF_MONTH'] == day)]                     

#### Airplane Seat Count

In [None]:
# Drop unneeded columns. All we need is the Tail Number and the number of seats
aircraft.drop(columns=['MANUFACTURER', 'MODEL', 'Unnamed: 4'], axis=1, inplace=True)

# Merge aircraft info with main frame on tail number
final = df.merge(aircraft, how="inner", left_on='TAIL_NUM', right_on='TAIL_NUMBER')

# drop any entries that didn't match to a tail number
final.drop(columns=['TAIL_NUM', 'TAIL_NUMBER'], axis=1, inplace=True)

# simplify data type of number of seats to reduce memory usage
final['NUMBER_OF_SEATS'] = final['NUMBER_OF_SEATS'].astype('int16')

final

#### Concurrent Flights

In [None]:
def find_flights(line):
    origin = line['ORIGIN_AIRPORT_ID']
    day = line['DAY_OF_MONTH']
    block = line['DEP_BLOCK']
    flights = airport_business.loc[(airport_business['ORIGIN_AIRPORT_ID'] == origin) & 
                                    (airport_business['DAY_OF_MONTH'] == day) & 
                                   (airport_business['DEP_BLOCK'] == block), ['FLIGHTS_IN_BLOCK']]
    number = flights.iloc[0]
    return number

In [None]:
final['CONCURRENT_FLIGHTS'] = 0
final['CONCURRENT_FLIGHTS'] = final.apply(lambda x: find_flights(x), axis=1)

#### Airport Coordinates

In [None]:
# make a copy of the original df to do the coordinates EDA

df_with_coords = final

df_with_coords['ORIGIN_LAT'] = 0
df_with_coords['ORIGIN_LONG'] = 0

In [None]:
for item in coords_dict:
    airport = item['AIRPORT_ID']
    lat = item['LATITUDE']
    long = item['LONGITUDE']
    df_with_coords.loc[df_with_coords['ORIGIN_AIRPORT_ID']==airport, 'ORIGIN_LAT'] = lat
    df_with_coords.loc[df_with_coords['ORIGIN_AIRPORT_ID']==airport, 'ORIGIN_LONG'] = long

#### Flight Sequencing

In [None]:
    # List flight sequence for daily flight segments
    print("\n\nStarting Flight Sequencing...")
    carriers = list(monthly_data['OP_UNIQUE_CARRIER'].unique())
    days = list(monthly_data['DAY_OF_MONTH'].unique())
    flight_nums = list(monthly_data['OP_CARRIER_FL_NUM'].unique())
    
    for carrier in carriers:
        print("\nWorking on Carrier: {}\n".format(carrier))
        flights = monthly_data.loc[(monthly_data['OP_UNIQUE_CARRIER'] == carrier), 'OP_CARRIER_FL_NUM'].unique()
        print(flights)
        for day in days:
            print(f'\nElapsed Time: {time.time() - start}')
            print("Carrier {} Day {}".format(carrier, day)) 
            for flight in flights:
                try:
                    sequence = monthly_data.loc[(monthly_data['OP_UNIQUE_CARRIER'] == carrier) & 
                          (monthly_data['DAY_OF_MONTH'] == day) & (monthly_data['OP_CARRIER_FL_NUM'] == flight)]
                    monthly_data.loc[(monthly_data['OP_UNIQUE_CARRIER'] == carrier) & 
                          (monthly_data['DAY_OF_MONTH'] == day) & (monthly_data['OP_CARRIER_FL_NUM'] == flight), 
                              'sequence_rank'] = sequence['DEP_TIME'].rank()
                except:
                    continue