# Initial EDA

This notebook will serve to perform some initial EDA to get a better sensing about the data and its distribution. Will be using Fireducks (only for Linux :( ) because it's supposedly faster than pandas.

## Loading Dependencies

In [5]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Collecting kaggle (from opendatasets)
  Downloading kaggle-1.6.17.tar.gz (82 kB)
     ---------------------------------------- 0.0/82.7 kB ? eta -:--:--
     ---------------------------------------  81.9/82.7 kB ? eta -:--:--
     ---------------------------------------  81.9/82.7 kB ? eta -:--:--
     ---------------------------------------  81.9/82.7 kB ? eta -:--:--
     ---------------------------------------  81.9/82.7 kB ? eta -:--:--
     ---------------------------------------  81.9/82.7 kB ? eta -:--:--
     ---------------------------------------  81.9/82.7 kB ? eta -:--:--
     ---------------------------------------  81.9/82.7 kB ? eta -:--:--
     -------------------------------------- 82.7/82.7 kB 231.5 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting python-slugify (from kaggle->opendatasets)
  Downl


[notice] A new release of pip is available: 23.3.2 -> 24.2
[notice] To update, run: C:\Users\kengb\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [1]:
# import fireducks.pandas as pd
import pandas as pd
import opendatasets as od
import time, IPython
import numpy as np

# download the dataset
# od.download("https://www.kaggle.com/datasets/bulter22/airline-data/data")

In [2]:
carriers_path = "./airline-data/carriers.csv"
carrier_df = pd.read_csv(carriers_path)
carrier_df

Unnamed: 0,Code,Description
0,02Q,Titan Airways
1,04Q,Tradewind Aviation
2,05Q,"Comlux Aviation, AG"
3,06Q,Master Top Linhas Aereas Ltd.
4,07Q,Flair Airlines Ltd.
...,...,...
1486,ZW,Air Wisconsin Airlines Corp
1487,ZX,Air Georgian
1488,ZX (1),Airbc Ltd.
1489,ZY,Atlantic Gulf Airlines


In [5]:
carrier_df[carrier_df.Code=='NW']

Unnamed: 0,Code,Description
920,NW,Northwest Airlines Inc.


In [7]:
# need to process the shuffle file
pathfile='./airline-data/airline.csv.shuffle'       
reader = pd.read_csv(pathfile, chunksize=10**6, iterator=True, encoding='iso8859-1')       

def print_mem_usage(df):
    mem = df.memory_usage().sum() / 1024 ** 2
    print('The DataFrame takes up {:.3} MB'.format(mem))

train = pd.DataFrame()
start=time.time()  
for i, chunk in enumerate(reader): 
    train = pd.concat([train, chunk.sample(frac=.08, replace=False, random_state=213) ], axis=0)  
    if i % 20 == 0:
        print('Processing Chunk No. {}'.format(i))  
print('the program costs %.2f seconds'%(time.time() - start))
    
print_mem_usage(train)
print('train has {} rows and {} columns'.format(train.shape[0], train.shape[1]))

train.to_csv('airline.csv', index=False)

Processing Chunk No. 0
Processing Chunk No. 20
Processing Chunk No. 40
Processing Chunk No. 60
Processing Chunk No. 80
Processing Chunk No. 100
Processing Chunk No. 120
the program costs 1335.48 seconds
The DataFrame takes up 2.26e+03 MB
train has 9882798 rows and 29 columns


## Exploring Data

In [10]:
airline_path = "./airline.csv"
airline_df = pd.read_csv(airline_path, encoding='utf-8')
airline_df

Unnamed: 0,ActualElapsedTime,AirTime,ArrDelay,ArrTime,CRSArrTime,CRSDepTime,CRSElapsedTime,CancellationCode,Cancelled,CarrierDelay,...,Month,NASDelay,Origin,SecurityDelay,TailNum,TaxiIn,TaxiOut,UniqueCarrier,WeatherDelay,Year
0,154.0,122.0,90.0,1850.0,1720,1455,145.0,,0,0.0,...,6,23.0,ORD,0.0,N293AA,8.0,24.0,AA,0.0,2006
1,159.0,125.0,59.0,1703.0,1604,1510,114.0,,0,,...,9,,DTW,,N8921E,19.0,15.0,NW,,1997
2,,71.0,,,1140,1037,63.0,,1,,...,2,,CVG,,N331DL,4.0,15.0,DL,,1995
3,70.0,,80.0,20.0,2300,2100,60.0,,0,,...,2,,MDW,,,,,ML (1),,1991
4,150.0,135.0,5.0,2030.0,2025,1740,165.0,,0,,...,2,,PHL,,N512AU,3.0,12.0,US,,1997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9882793,115.0,93.0,5.0,906.0,0,0,119.0,,0,,...,12,,DFW,,N579AA,9.0,13.0,AA,,1997
9882794,137.0,,-10.0,708.0,718,550,148.0,,0,,...,8,,MCO,,,,,DL,,1992
9882795,76.0,61.0,-4.0,736.0,740,620,80.0,,0,,...,5,,ONT,,N639@@,3.0,12.0,WN,,2001
9882796,157.0,124.0,18.0,3.0,2345,2015,150.0,,0,0.0,...,12,7.0,SEA,0.0,N916FR,6.0,27.0,F9,0.0,2005


In [16]:
years = list(airline_df.Year.unique())
years.sort()
years

[1987,
 1988,
 1989,
 1990,
 1991,
 1992,
 1993,
 1994,
 1995,
 1996,
 1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008]

In [None]:
airline_df

In [11]:

airline_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9882798 entries, 0 to 9882797
Data columns (total 29 columns):
 #   Column             Dtype  
---  ------             -----  
 0   ActualElapsedTime  float64
 1   AirTime            float64
 2   ArrDelay           float64
 3   ArrTime            float64
 4   CRSArrTime         int64  
 5   CRSDepTime         int64  
 6   CRSElapsedTime     float64
 7   CancellationCode   object 
 8   Cancelled          int64  
 9   CarrierDelay       float64
 10  DayOfWeek          int64  
 11  DayofMonth         int64  
 12  DepDelay           float64
 13  DepTime            float64
 14  Dest               object 
 15  Distance           float64
 16  Diverted           int64  
 17  FlightNum          int64  
 18  LateAircraftDelay  float64
 19  Month              int64  
 20  NASDelay           float64
 21  Origin             object 
 22  SecurityDelay      float64
 23  TailNum            object 
 24  TaxiIn             float64
 25  TaxiOut           

In [10]:
airline_df.columns

Index(['ActualElapsedTime', 'AirTime', 'ArrDelay', 'ArrTime', 'CRSArrTime',
       'CRSDepTime', 'CRSElapsedTime', 'CancellationCode', 'Cancelled',
       'CarrierDelay', 'DayOfWeek', 'DayofMonth', 'DepDelay', 'DepTime',
       'Dest', 'Distance', 'Diverted', 'FlightNum', 'LateAircraftDelay',
       'Month', 'NASDelay', 'Origin', 'SecurityDelay', 'TailNum', 'TaxiIn',
       'TaxiOut', 'UniqueCarrier', 'WeatherDelay', 'Year'],
      dtype='object')

## Checking Cancelled FLights

In [19]:
cancelled_airlines = airline_df[~airline_df["CancellationCode"].isnull()]
cancelled_airlines.shape

(58620, 29)

In [20]:
cancelled_airlines

Unnamed: 0,ActualElapsedTime,AirTime,ArrDelay,ArrTime,CRSArrTime,CRSDepTime,CRSElapsedTime,CancellationCode,Cancelled,CarrierDelay,...,Month,NASDelay,Origin,SecurityDelay,TailNum,TaxiIn,TaxiOut,UniqueCarrier,WeatherDelay,Year
73,,,,,1748,1544,124.0,C,1,0.0,...,5,0.0,IAH,0.0,0,0.0,0.0,XE,0.0,2006
242,,,,,1021,925,56.0,A,1,0.0,...,2,0.0,RDU,0.0,0,0.0,0.0,US,0.0,2005
295,,,,,827,610,257.0,A,1,,...,5,,ORD,,,,,UA,,2008
336,,,,,1711,1335,156.0,C,1,0.0,...,8,0.0,IAH,0.0,0,0.0,0.0,CO,0.0,2004
727,,,,,1137,1055,42.0,A,1,0.0,...,1,0.0,COS,0.0,000000,0.0,0.0,UA,0.0,2004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9882126,,,,,1120,840,220.0,B,1,0.0,...,12,0.0,PVD,0.0,0,0.0,0.0,WN,0.0,2005
9882462,,,,,830,630,120.0,B,1,0.0,...,11,0.0,GSP,0.0,N940CA,0.0,0.0,OH,0.0,2006
9882674,,,,,2335,2230,65.0,C,1,0.0,...,10,0.0,JFK,0.0,0,0.0,0.0,MQ,0.0,2007
9882704,,,,,908,745,83.0,A,1,0.0,...,10,0.0,PIT,0.0,0,0.0,0.0,US,0.0,2004


In [22]:
cancelled_airlines.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58620 entries, 73 to 9882742
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ActualElapsedTime  0 non-null      float64
 1   AirTime            0 non-null      float64
 2   ArrDelay           0 non-null      float64
 3   ArrTime            0 non-null      float64
 4   CRSArrTime         58620 non-null  int64  
 5   CRSDepTime         58620 non-null  int64  
 6   CRSElapsedTime     58576 non-null  float64
 7   CancellationCode   58620 non-null  object 
 8   Cancelled          58620 non-null  int64  
 9   CarrierDelay       47661 non-null  float64
 10  DayOfWeek          58620 non-null  int64  
 11  DayofMonth         58620 non-null  int64  
 12  DepDelay           105 non-null    float64
 13  DepTime            105 non-null    float64
 14  Dest               58620 non-null  object 
 15  Distance           58620 non-null  float64
 16  Diverted           58620

In [25]:
len(airline_df[airline_df["Cancelled"]==1])

184011

In [27]:
# saving df first cause too many records and the main dataset is too big to work with properly
airline_df[airline_df["Cancelled"]==1].to_csv("cancelled_flights.csv", index_label=False)

In [35]:
top_cancellations = cancelled_airlines.UniqueCarrier.value_counts().index.to_list()
carrier_df[carrier_df['Code'].isin(top_cancellations)]

Unnamed: 0,Code,Description
84,9E,Pinnacle Airlines Inc.
100,AA,American Airlines Inc.
206,AQ,Aloha Airlines Inc.
221,AS,Alaska Airlines Inc.
263,B6,JetBlue Airways
377,CO,Continental Air Lines Inc.
434,DH,Independence Air
441,DL,Delta Air Lines Inc.
505,EV,Atlantic Southeast Airlines
517,F9,Frontier Airlines Inc.


In [37]:
cancelled_airlines = pd.merge(cancelled_airlines, carrier_df, how='left', left_on='UniqueCarrier', right_on='Code')
cancelled_airlines.drop(columns=['Code'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000022600842070>

In [46]:
cancelled_airlines.columns

Index(['ActualElapsedTime', 'AirTime', 'ArrDelay', 'ArrTime', 'CRSArrTime',
       'CRSDepTime', 'CRSElapsedTime', 'CancellationCode', 'Cancelled',
       'CarrierDelay', 'DayOfWeek', 'DayofMonth', 'DepDelay', 'DepTime',
       'Dest', 'Distance', 'Diverted', 'FlightNum', 'LateAircraftDelay',
       'Month', 'NASDelay', 'Origin', 'SecurityDelay', 'TailNum', 'TaxiIn',
       'TaxiOut', 'UniqueCarrier', 'WeatherDelay', 'Year', 'Code',
       'Description'],
      dtype='object')

In [55]:
cancelled_airlines.groupby(by="UniqueCarrier").sum()[["Cancelled", "Description"]].sort_values(by="Cancelled", ascending=False)

Unnamed: 0_level_0,Cancelled,Description
UniqueCarrier,Unnamed: 1_level_1,Unnamed: 2_level_1
MQ,8308,American Eagle Airlines Inc.American Eagle Air...
AA,5844,American Airlines Inc.American Airlines Inc.Am...
OO,4881,Skywest Airlines Inc.Skywest Airlines Inc.Skyw...
WN,4462,Southwest Airlines Co.Southwest Airlines Co.So...
DL,4280,Delta Air Lines Inc.Delta Air Lines Inc.Delta ...
XE,4019,Expressjet Airlines Inc.Expressjet Airlines In...
OH,3844,Comair Inc.Comair Inc.Comair Inc.Comair Inc.Co...
UA,3835,United Air Lines Inc.United Air Lines Inc.Unit...
EV,3551,Atlantic Southeast AirlinesAtlantic Southeast ...
US,3123,US Airways Inc. (Merged with America West 9/05...


### Notes
- Seems like only cancelled flights will have a Cancellation Code -> may want to focus on these flights only if looking at cancelled flights
- Not all cancelled flights have codes through 184011 (without code) vs 58620 (with code)
- Top flights with cancellations are:
  - American Eagle Airlines Inc.
  - American Airlines Inc.
  - Skywest Airlines Inc.
  - Southwest Airlines Co.
  - Delta Air Lines Inc.
  - Expressjet Airlines Inc.
  - Comair Inc.Comair Inc.
  - United Air Lines Inc.
  - Atlantic Southeast Airlines
  - US Airways Inc.

## Checking Diverted Flights

In [4]:
diverted_flights = airline_df[airline_df["Diverted"]==1]
diverted_flights.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22641 entries, 252 to 9881568
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ActualElapsedTime  0 non-null      float64
 1   AirTime            0 non-null      float64
 2   ArrDelay           0 non-null      float64
 3   ArrTime            253 non-null    float64
 4   CRSArrTime         22641 non-null  int64  
 5   CRSDepTime         22641 non-null  int64  
 6   CRSElapsedTime     22396 non-null  float64
 7   CancellationCode   17 non-null     object 
 8   Cancelled          22641 non-null  int64  
 9   CarrierDelay       5397 non-null   float64
 10  DayOfWeek          22641 non-null  int64  
 11  DayofMonth         22641 non-null  int64  
 12  DepDelay           22641 non-null  float64
 13  DepTime            22641 non-null  float64
 14  Dest               22641 non-null  object 
 15  Distance           22578 non-null  float64
 16  Diverted           2264

In [5]:
diverted_flights

Unnamed: 0,ActualElapsedTime,AirTime,ArrDelay,ArrTime,CRSArrTime,CRSDepTime,CRSElapsedTime,CancellationCode,Cancelled,CarrierDelay,...,Month,NASDelay,Origin,SecurityDelay,TailNum,TaxiIn,TaxiOut,UniqueCarrier,WeatherDelay,Year
252,,,,,2239,2015,144.0,,0,0.0,...,10,0.0,EWR,0.0,N21154,0.0,19.0,XE,0.0,2006
322,,,,,625,550,35.0,,0,,...,5,,MLU,,,,,DL,,1993
512,,,,,830,2305,385.0,,0,,...,2,,HNL,,,,,UA,,1990
558,,,,,2058,1815,103.0,,0,0.0,...,3,0.0,ORD,0.0,N933UA,0.0,90.0,UA,0.0,2006
1189,,,,,1615,1425,170.0,,0,0.0,...,6,0.0,PIT,0.0,N512AE,0.0,10.0,MQ,0.0,2007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9880136,,,,,835,746,49.0,,0,,...,11,,SUX,,,,,TW,,1989
9880579,,,,,40,15,25.0,,0,,...,12,,FLL,,,,,EA,,1990
9880782,,,,,646,635,71.0,,0,,...,3,,CVG,,N955UA,0.0,10.0,UA,,2000
9881443,,,,,2052,1300,292.0,,0,,...,4,,LAX,,N449UA,0.0,41.0,UA,,2000


In [29]:
diverted_flights.to_csv("diverted_flights.csv", index_label=False)

In [56]:
diverted_airlines = pd.merge(diverted_flights, carrier_df, how='left', left_on='UniqueCarrier', right_on='Code')
diverted_airlines.drop(columns=['Code'])
diverted_airlines.groupby(by="UniqueCarrier").sum()[["Diverted", "Description"]].sort_values(by="Diverted", ascending=False)

Unnamed: 0_level_0,Diverted,Description
UniqueCarrier,Unnamed: 1_level_1,Unnamed: 2_level_1
AA,3404,American Airlines Inc.American Airlines Inc.Am...
US,2537,US Airways Inc. (Merged with America West 9/05...
UA,2508,United Air Lines Inc.United Air Lines Inc.Unit...
DL,2200,Delta Air Lines Inc.Delta Air Lines Inc.Delta ...
WN,2074,Southwest Airlines Co.Southwest Airlines Co.So...
NW,2065,Northwest Airlines Inc.Northwest Airlines Inc....
CO,1842,Continental Air Lines Inc.Continental Air Line...
TW,855,Trans World Airways LLCTrans World Airways LLC...
AS,810,Alaska Airlines Inc.Alaska Airlines Inc.Alaska...
XE,636,Expressjet Airlines Inc.Expressjet Airlines In...


### Notes
- Top Diverted Airlines:
  - American Airlines Inc.
  - Airways Inc.
  - United Air Lines Inc.
  - Delta Air Lines Inc.
  - Southwest Airlines Co.
  - Northwest Airlines Inc.
  - Continental Air Lines Inc.
  - Trans World Airways LLC
  - Alaska Airlines Inc.
  - Expressjet Airlines Inc.

## Checking Delayed Flights

In [30]:
delayed_flight = airline_df[(airline_df['CarrierDelay'] > 0) | 
                 (airline_df['WeatherDelay'] > 0) | 
                 (airline_df['NASDelay'] > 0) | 
                 (airline_df['SecurityDelay'] > 0) | 
                 (airline_df['LateAircraftDelay'] > 0)]
delayed_flight.info()

<class 'pandas.core.frame.DataFrame'>
Index: 679845 entries, 0 to 9882796
Data columns (total 29 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ActualElapsedTime  679845 non-null  float64
 1   AirTime            679845 non-null  float64
 2   ArrDelay           679845 non-null  float64
 3   ArrTime            679845 non-null  float64
 4   CRSArrTime         679845 non-null  int64  
 5   CRSDepTime         679845 non-null  int64  
 6   CRSElapsedTime     679845 non-null  float64
 7   CancellationCode   0 non-null       object 
 8   Cancelled          679845 non-null  int64  
 9   CarrierDelay       679845 non-null  float64
 10  DayOfWeek          679845 non-null  int64  
 11  DayofMonth         679845 non-null  int64  
 12  DepDelay           679845 non-null  float64
 13  DepTime            679845 non-null  float64
 14  Dest               679845 non-null  object 
 15  Distance           679845 non-null  float64
 16  Divert

In [31]:
delayed_flight.to_csv("delayed_flights.csv", index_label=False)

In [8]:
# delayed_flight = pd.read_csv("delayed_flights.csv")
delayed_flight[delayed_flight.UniqueCarrier == "AA"].Year.value_counts()

Year
2007    14114
2008    13127
2004    11859
2005    11683
2006    11530
2003     6601
Name: count, dtype: int64

In [30]:
aa_delays = pd.DataFrame(delayed_flight[delayed_flight.UniqueCarrier == "AA"].Year.value_counts()).sort_values(by="Year")
aa_delays = aa_delays.rename(columns={"count":"AA_flight_delays"})
aa_delays

Unnamed: 0_level_0,AA_flight_delays
Year,Unnamed: 1_level_1
2003,6601
2004,11859
2005,11683
2006,11530
2007,14114
2008,13127


In [59]:
delayed_airlines = pd.merge(delayed_flight, carrier_df, how='left', left_on='UniqueCarrier', right_on='Code')
delayed_airlines.drop(columns=['Code'])
# count as long as there is a delay
delayed_airlines["total_delay"] = np.where(
    (delayed_airlines['CarrierDelay'] > 0) | 
    (delayed_airlines['WeatherDelay'] > 0) | 
    (delayed_airlines['NASDelay'] > 0) | 
    (delayed_airlines['SecurityDelay'] > 0) | 
    (delayed_airlines['LateAircraftDelay'] > 0), 
    1, 
    0
)
delayed_airlines.groupby(by="UniqueCarrier").sum()[["total_delay", "Description"]].sort_values(by="total_delay", ascending=False)

Unnamed: 0_level_0,total_delay,Description
UniqueCarrier,Unnamed: 1_level_1,Unnamed: 2_level_1
WN,87706,Southwest Airlines Co.Southwest Airlines Co.So...
AA,68914,American Airlines Inc.American Airlines Inc.Am...
DL,53721,Delta Air Lines Inc.Delta Air Lines Inc.Delta ...
MQ,52838,American Eagle Airlines Inc.American Eagle Air...
UA,50659,United Air Lines Inc.United Air Lines Inc.Unit...
US,44132,US Airways Inc. (Merged with America West 9/05...
NW,43936,Northwest Airlines Inc.Northwest Airlines Inc....
OO,42260,Skywest Airlines Inc.Skywest Airlines Inc.Skyw...
XE,40497,Expressjet Airlines Inc.Expressjet Airlines In...
EV,32850,Atlantic Southeast AirlinesAtlantic Southeast ...


### Notes:
- Top delayed flights:
  - Southwest Airlines Co.
  - American Airlines Inc.
  - Delta Air Lines Inc.
  - American Eagle Airlines Inc.
  - United Air Lines Inc.
  - US Airways Inc.
  - Northwest Airlines Inc.
  - Skywest Airlines Inc.
  - Expressjet Airlines Inc.
  - Atlantic Southeast Airlines