# Data Processing

The data were fetched with `anyflights`: https://github.com/simonpcouch/anyflights.

Airline codes: https://www.bts.gov/topics/airlines-and-airports/airline-codes.

In [1]:
import glob
import pandas as pd
from tqdm.notebook import tqdm

## Combine the 12 monthly files

In [2]:
# Collect the monthly files. glob returns matches in arbitrary order, so sort
# them: the YYYYMM file names then sort chronologically and names[0] always
# refers to the same file.
names = sorted(glob.glob('C:/CS_DATA/2020_flights/*'))
print(names)

['C:/CS_DATA/2020_flights\\202001.csv', 'C:/CS_DATA/2020_flights\\202002.csv', 'C:/CS_DATA/2020_flights\\202003.csv', 'C:/CS_DATA/2020_flights\\202004.csv', 'C:/CS_DATA/2020_flights\\202005.csv', 'C:/CS_DATA/2020_flights\\202006.csv', 'C:/CS_DATA/2020_flights\\202007.csv', 'C:/CS_DATA/2020_flights\\202008.csv', 'C:/CS_DATA/2020_flights\\202009.csv', 'C:/CS_DATA/2020_flights\\202010.csv', 'C:/CS_DATA/2020_flights\\202011.csv', 'C:/CS_DATA/2020_flights\\202012.csv']


In [3]:
# Columns to load from each monthly file; all other columns are skipped at read time.
select_cols = [
    # flight date and reporting airline code
    'FlightDate', 'Reporting_Airline',
    # origin fields
    'Origin', 'OriginCityName', 'OriginStateName',
    # destination fields
    'Dest', 'DestCityName', 'DestStateName',
    # departure fields
    'CRSDepTime', 'DepTime', 'DepDelay',
    # arrival fields
    'CRSArrTime', 'ArrTime', 'ArrDelay',
]

# Load the first file in `names` as a sample to inspect before processing all of them.
df1 = pd.read_csv(names[0], usecols=select_cols)
df1.head()

Unnamed: 0,FlightDate,Reporting_Airline,Origin,OriginCityName,OriginStateName,Dest,DestCityName,DestStateName,CRSDepTime,DepTime,DepDelay,CRSArrTime,ArrTime,ArrDelay
0,2020-01-17,B6,PBI,"West Palm Beach/Palm Beach, FL",Florida,BDL,"Hartford, CT",Connecticut,1940,1926.0,-14.0,2229,2218.0,-11.0
1,2020-01-18,B6,PBI,"West Palm Beach/Palm Beach, FL",Florida,BDL,"Hartford, CT",Connecticut,1940,1926.0,-14.0,2229,2220.0,-9.0
2,2020-01-19,B6,PBI,"West Palm Beach/Palm Beach, FL",Florida,BDL,"Hartford, CT",Connecticut,1940,1947.0,7.0,2229,2223.0,-6.0
3,2020-01-20,B6,PBI,"West Palm Beach/Palm Beach, FL",Florida,BDL,"Hartford, CT",Connecticut,1940,1933.0,-7.0,2229,2233.0,4.0
4,2020-01-21,B6,PBI,"West Palm Beach/Palm Beach, FL",Florida,BDL,"Hartford, CT",Connecticut,1940,1952.0,12.0,2229,2235.0,6.0


In [4]:
# Show the column labels of the sample frame.
df1.columns

Index(['FlightDate', 'Reporting_Airline', 'Origin', 'OriginCityName',
       'OriginStateName', 'Dest', 'DestCityName', 'DestStateName',
       'CRSDepTime', 'DepTime', 'DepDelay', 'CRSArrTime', 'ArrTime',
       'ArrDelay'],
      dtype='object')

In [5]:
# Count missing values per column in the sample frame.
df1.isna().sum()

FlightDate              0
Reporting_Airline       0
Origin                  0
OriginCityName          0
OriginStateName         0
Dest                    0
DestCityName            0
DestStateName           0
CRSDepTime              0
DepTime              6664
DepDelay             6699
CRSArrTime              0
ArrTime              7075
ArrDelay             8078
dtype: int64

In [6]:
# (rows, columns) of the sample frame before rows with missing values are dropped.
df1.shape

(607346, 14)

In [7]:
# Discard every row that has a missing value in any of the selected columns.
df1 = df1.dropna()

In [8]:
# Re-count missing values per column to confirm none remain.
df1.isna().sum()

FlightDate           0
Reporting_Airline    0
Origin               0
OriginCityName       0
OriginStateName      0
Dest                 0
DestCityName         0
DestStateName        0
CRSDepTime           0
DepTime              0
DepDelay             0
CRSArrTime           0
ArrTime              0
ArrDelay             0
dtype: int64

In [9]:
# (rows, columns) of the sample frame after rows with missing values were dropped.
df1.shape

(599268, 14)

In [10]:
# Build the table of flights departing from IAH or HOU across all monthly files.
# Each month is cleaned and filtered on its own and the pieces are combined once
# at the end: DataFrame.append was removed in pandas 2.0, and growing a frame
# inside a loop re-copies all accumulated rows on every iteration.
monthly_frames = []
for name in tqdm(names):
    print(name)
    # Read only the selected columns and drop rows with any missing value.
    month_df = pd.read_csv(name, usecols=select_cols).dropna()
    # Keep only flights whose origin is IAH or HOU.
    monthly_frames.append(month_df[month_df['Origin'].isin(['IAH', 'HOU'])])

if monthly_frames:
    # Row labels from each file are kept as-is (no ignore_index);
    # indexing with select_cols pins the column order.
    Hou_flights = pd.concat(monthly_frames)[select_cols]
else:
    # No input files: fall back to an empty frame with the expected columns.
    Hou_flights = pd.DataFrame(columns=select_cols)
Hou_flights.head()

  0%|          | 0/12 [00:00<?, ?it/s]

C:/CS_DATA/2020_flights\202001.csv
C:/CS_DATA/2020_flights\202002.csv
C:/CS_DATA/2020_flights\202003.csv
C:/CS_DATA/2020_flights\202004.csv
C:/CS_DATA/2020_flights\202005.csv
C:/CS_DATA/2020_flights\202006.csv
C:/CS_DATA/2020_flights\202007.csv
C:/CS_DATA/2020_flights\202008.csv
C:/CS_DATA/2020_flights\202009.csv
C:/CS_DATA/2020_flights\202010.csv
C:/CS_DATA/2020_flights\202011.csv
C:/CS_DATA/2020_flights\202012.csv


Unnamed: 0,FlightDate,Reporting_Airline,Origin,OriginCityName,OriginStateName,Dest,DestCityName,DestStateName,CRSDepTime,DepTime,DepDelay,CRSArrTime,ArrTime,ArrDelay
6256,2020-01-01,B6,IAH,"Houston, TX",Texas,JFK,"New York, NY",New York,1134,1127.0,-7.0,1603,1531.0,-32.0
6257,2020-01-02,B6,IAH,"Houston, TX",Texas,JFK,"New York, NY",New York,1134,1122.0,-12.0,1603,1514.0,-49.0
6258,2020-01-03,B6,IAH,"Houston, TX",Texas,JFK,"New York, NY",New York,1134,1155.0,21.0,1603,1618.0,15.0
6259,2020-01-04,B6,IAH,"Houston, TX",Texas,JFK,"New York, NY",New York,1130,1127.0,-3.0,1559,1544.0,-15.0
6260,2020-01-05,B6,IAH,"Houston, TX",Texas,JFK,"New York, NY",New York,1134,1209.0,35.0,1603,1643.0,40.0


In [11]:
# Sanity check: distinct origin codes present in the combined table.
Hou_flights['Origin'].unique()

array(['IAH', 'HOU'], dtype=object)

In [12]:
# Load the lookup table of carrier codes and airline names (columns: carrier, name).
carrier_names = pd.read_excel('airlines_code.xlsx')
carrier_names.head()

Unnamed: 0,carrier,name
0,02Q,Titan Airways
1,04Q,Tradewind Aviation
2,05Q,"Comlux Aviation, AG"
3,06Q,Master Top Linhas Aereas Ltd.
4,07Q,Flair Airlines Ltd.


In [13]:
# Attach the airline name to each flight. The left join keeps every flight row
# even when its Reporting_Airline code has no entry in carrier_names; the joined
# `carrier` column mirrors Reporting_Airline for matched rows.
# NOTE: this cell rebinds Hou_flights, so running it twice merges again and
# yields suffixed duplicate columns.
Hou_flights = pd.merge(
    Hou_flights,
    carrier_names,
    how='left',
    left_on='Reporting_Airline',
    right_on='carrier',
)
Hou_flights.head()

Unnamed: 0,FlightDate,Reporting_Airline,Origin,OriginCityName,OriginStateName,Dest,DestCityName,DestStateName,CRSDepTime,DepTime,DepDelay,CRSArrTime,ArrTime,ArrDelay,carrier,name
0,2020-01-01,B6,IAH,"Houston, TX",Texas,JFK,"New York, NY",New York,1134,1127.0,-7.0,1603,1531.0,-32.0,B6,JetBlue Airways
1,2020-01-02,B6,IAH,"Houston, TX",Texas,JFK,"New York, NY",New York,1134,1122.0,-12.0,1603,1514.0,-49.0,B6,JetBlue Airways
2,2020-01-03,B6,IAH,"Houston, TX",Texas,JFK,"New York, NY",New York,1134,1155.0,21.0,1603,1618.0,15.0,B6,JetBlue Airways
3,2020-01-04,B6,IAH,"Houston, TX",Texas,JFK,"New York, NY",New York,1130,1127.0,-3.0,1559,1544.0,-15.0,B6,JetBlue Airways
4,2020-01-05,B6,IAH,"Houston, TX",Texas,JFK,"New York, NY",New York,1134,1209.0,35.0,1603,1643.0,40.0,B6,JetBlue Airways


In [14]:
# Write the merged table to disk; index=False leaves the row index out of the file.
Hou_flights.to_csv('Hou_flights.csv', index=False)