# Temporary EDA (testing)

In [1]:
import os
import pandas as pd

In [2]:
# Work from the parent of the kernel's starting directory so that relative
# paths used further down (e.g. "data/train_val.csv") resolve.
# The starting directory is recorded the first time this cell runs; re-running
# the cell therefore lands in the same place instead of climbing one more
# directory level on every execution, which a bare relative chdir would do.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))
os.getcwd()

'/home/troonies/repos/itba-aws-mle'

## Data

Detailed information on **flight arrivals and delays at U.S. airports**, broken down by carrier.

It includes metrics such as the number of arriving flights, delays over 15 minutes, cancellation and diversion counts, and the breakdown of delays attributed to carriers, weather, NAS (National Airspace System), security, and late aircraft arrivals.

In [3]:
# Human-readable description of every column in the dataset, keyed by column name.
cols_descriptions = {
    # Record identifiers: period, carrier and airport
    "year": "Year of the data",
    "month": "Month of the data",
    "carrier": "Carrier code",
    "carrier_name": "Carrier name",
    "airport": "Airport code",
    "airport_name": "Airport name",
    # Flight counts, including the per-cause delay counts (*_ct)
    "arr_flights": "Number of arriving flights",
    "arr_del15": "Number of flights delayed by 15 minutes or more",
    "carrier_ct": "Carrier count (delay due to the carrier)",
    "weather_ct": "Weather count (delay due to weather)",
    "nas_ct": "NAS (National Airspace System) count (delay due to the NAS)",
    "security_ct": "Security count (delay due to security)",
    "late_aircraft_ct": "Late aircraft count (delay due to late aircraft arrival)",
    "arr_cancelled": "Number of flights canceled",
    "arr_diverted": "Number of flights diverted",
    # Delay totals, overall and per cause (*_delay)
    "arr_delay": "Total arrival delay",
    "carrier_delay": "Delay attributed to the carrier",
    "weather_delay": "Delay attributed to weather",
    "nas_delay": "Delay attributed to the NAS",
    "security_delay": "Delay attributed to security",
    "late_aircraft_delay": "Delay attributed to late aircraft arrival",
}

In [28]:
# Column names grouped by the kind of value they are meant to hold:
# "int" (whole numbers), "float" (fractional counts) and "cat" (categorical).
# NOTE(review): in train_val.csv most of the "int" columns contain missing
# values and are loaded as floats, so a plain integer cast would fail on them;
# a nullable integer dtype may be needed -- confirm where this mapping is used.
cols_types = dict(
    int=[
        "year",
        "month",
        "arr_flights",
        "arr_del15",
        "arr_cancelled",
        "arr_diverted",
        "arr_delay",
        "carrier_delay",
        "weather_delay",
        "nas_delay",
        "security_delay",
        "late_aircraft_delay",
    ],
    float=[
        "carrier_ct",
        "weather_ct",
        "nas_ct",
        "security_ct",
        "late_aircraft_ct",
    ],
    cat=[
        "carrier",
        "carrier_name",
        "airport",
        "airport_name",
    ],
)

Only the training and validation splits are used for this EDA.

In [4]:
# Load the combined training + validation split; the path is relative to the
# current working directory.
train_val_csv = "data/train_val.csv"
df = pd.read_csv(train_val_csv)
df

Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,...,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2013,10,B6,JetBlue Airways,SLC,"Salt Lake City, UT: Salt Lake City International",93.0,12.0,4.04,0.00,...,0.73,5.23,0.0,0.0,649.0,208.0,0.0,54.0,37.0,350.0
1,2015,4,UA,United Air Lines Inc.,SNA,"Santa Ana, CA: John Wayne Airport-Orange County",449.0,81.0,34.33,2.23,...,0.00,28.87,2.0,5.0,3698.0,1263.0,75.0,580.0,0.0,1780.0
2,2015,10,DL,Delta Air Lines Inc.,HSV,"Huntsville, AL: Huntsville International-Carl ...",172.0,6.0,1.78,0.00,...,0.00,3.00,0.0,0.0,214.0,67.0,0.0,22.0,0.0,125.0
3,2016,3,EV,ExpressJet Airlines Inc.,CRW,"Charleston/Dunbar, WV: Yeager",174.0,29.0,14.62,0.00,...,0.00,6.13,2.0,3.0,1581.0,929.0,0.0,336.0,0.0,316.0
4,2019,10,F9,Frontier Airlines Inc.,GSP,"Greer, SC: Greenville-Spartanburg International",29.0,2.0,1.12,0.00,...,0.00,0.88,1.0,0.0,105.0,55.0,0.0,0.0,0.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154494,2016,10,OO,SkyWest Airlines Inc.,EWR,"Newark, NJ: Newark Liberty International",36.0,5.0,0.00,0.82,...,0.00,0.00,0.0,0.0,146.0,0.0,42.0,104.0,0.0,0.0
154495,2015,8,DL,Delta Air Lines Inc.,GSO,"Greensboro/High Point, NC: Piedmont Triad Inte...",180.0,28.0,6.93,2.98,...,0.00,12.86,0.0,0.0,2183.0,374.0,802.0,210.0,0.0,797.0
154496,2015,10,OO,SkyWest Airlines Inc.,FLG,"Flagstaff, AZ: Flagstaff Pulliam",158.0,37.0,9.32,0.00,...,0.00,11.65,1.0,2.0,1423.0,544.0,0.0,397.0,0.0,482.0
154497,2023,1,AA,American Airlines Inc.,BNA,"Nashville, TN: Nashville International",622.0,142.0,45.98,2.90,...,1.19,50.32,9.0,1.0,8722.0,2549.0,456.0,1788.0,38.0,3891.0


In [5]:
# Number of missing values in each column.
df.isna().sum()

year                     0
month                    0
carrier                  0
carrier_name             0
airport                  0
airport_name             0
arr_flights            214
arr_del15              394
carrier_ct             214
weather_ct             214
nas_ct                 214
security_ct            214
late_aircraft_ct       214
arr_cancelled          214
arr_diverted           214
arr_delay              214
carrier_delay          214
weather_delay          214
nas_delay              214
security_delay         214
late_aircraft_delay    214
dtype: int64

In [6]:
# Summary statistics from DataFrame.describe(), rounded to two decimals for display.
summary_stats = df.describe()
summary_stats.round(2)

Unnamed: 0,year,month,arr_flights,arr_del15,carrier_ct,weather_ct,nas_ct,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
count,154499.0,154499.0,154285.0,154105.0,154285.0,154285.0,154285.0,154285.0,154285.0,154285.0,154285.0,154285.0,154285.0,154285.0,154285.0,154285.0,154285.0
mean,2018.55,6.49,361.27,66.2,20.74,2.25,19.3,0.16,23.68,7.53,0.86,4224.4,1432.92,222.06,916.19,7.34,1645.88
std,2.89,3.44,990.61,179.02,50.2,7.29,61.38,0.72,72.26,44.14,3.73,12599.4,4216.43,817.52,3407.85,41.77,5217.59
min,2013.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2016.0,3.0,50.0,6.0,2.16,0.0,1.0,0.0,1.23,0.0,0.0,334.0,110.0,0.0,34.0,0.0,65.0
50%,2019.0,7.0,100.0,17.0,6.4,0.39,3.91,0.0,5.0,1.0,0.0,1016.0,374.0,18.0,146.0,0.0,320.0
75%,2021.0,9.0,249.0,47.0,17.25,1.86,11.7,0.0,15.25,4.0,1.0,2877.0,1109.0,146.0,477.0,0.0,1068.0
max,2023.0,12.0,21977.0,4176.0,1293.91,266.42,1884.42,58.69,2069.07,4951.0,197.0,438783.0,196944.0,31960.0,112018.0,3760.0,227959.0


In [7]:
# Rows in which the number of arriving flights is missing.
df.loc[df["arr_flights"].isna()]

Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,...,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
1265,2020,10,OO,SkyWest Airlines Inc.,DCA,"Washington, DC: Ronald Reagan Washington National",,,,,...,,,,,,,,,,
1934,2021,10,OO,SkyWest Airlines Inc.,TYR,"Tyler, TX: Tyler Pounds Regional",,,,,...,,,,,,,,,,
2487,2014,11,DL,Delta Air Lines Inc.,PSC,"Pasco/Kennewick/Richland, WA: Tri Cities",,,,,...,,,,,,,,,,
2497,2020,10,YV,Mesa Airlines Inc.,RSW,"Fort Myers, FL: Southwest Florida International",,,,,...,,,,,,,,,,
4071,2021,8,YX,Republic Airline,CID,"Cedar Rapids/Iowa City, IA: The Eastern Iowa",,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149454,2022,1,MQ,Envoy Air,COU,"Columbia, MO: Columbia Regional",,,,,...,,,,,,,,,,
151501,2016,5,EV,ExpressJet Airlines Inc.,SLC,"Salt Lake City, UT: Salt Lake City International",,,,,...,,,,,,,,,,
151859,2014,4,DL,Delta Air Lines Inc.,JAC,"Jackson, WY: Jackson Hole",,,,,...,,,,,,,,,,
152205,2014,1,OO,SkyWest Airlines Inc.,BRO,"Brownsville, TX: Brownsville South Padre Islan...",,,,,...,,,,,,,,,,


In [10]:
# Missing values per column once the rows without `arr_flights` are excluded,
# to see which gaps remain in otherwise populated rows.
df.dropna(subset=["arr_flights"]).isna().sum()

year                     0
month                    0
carrier                  0
carrier_name             0
airport                  0
airport_name             0
arr_flights              0
arr_del15              180
carrier_ct               0
weather_ct               0
nas_ct                   0
security_ct              0
late_aircraft_ct         0
arr_cancelled            0
arr_diverted             0
arr_delay                0
carrier_delay            0
weather_delay            0
nas_delay                0
security_delay           0
late_aircraft_delay      0
dtype: int64