In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

In [None]:
# !pip3 install --upgrade plotly

# Post-merge EDA

In [None]:
# Load the CSV into a DataFrame and preview its first rows.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv("../scripts/eda_2018.csv")
df.head()

In [None]:
# Report the frame's dimensions, then count the missing values in each column.
print(df.shape)
df.isna().sum()

In [None]:
# Discard every row that contains at least one missing value and show the
# resulting size. The original row labels are kept (no index reset here).
df = df.dropna()
print(df.shape)
df.head()

## Creating Outcome Column

In [None]:
# Single-column frame holding the departure delay, re-labelled 0..n-1 so it no
# longer carries the row labels left over from the cleaned source frame.
departure_delay_df = df[['Departure Delay']].reset_index(drop=True)
departure_delay_df.head()

In [None]:
# Single-column frame holding the flight delay, re-labelled 0..n-1 so it no
# longer carries the row labels left over from the cleaned source frame.
flight_delay_df = df[['Flight Delay']].reset_index(drop=True)
flight_delay_df.head()

In [None]:
# Summary statistics of the departure delay, one per output block:
# minimum, maximum, mean, standard deviation.
print(
    departure_delay_df.min(),
    departure_delay_df.max(),
    departure_delay_df.mean(),
    departure_delay_df.std(),
    sep="\n",
)

In [None]:
# Summary statistics of the flight delay, one per output block:
# minimum, maximum, mean, standard deviation.
print(
    flight_delay_df.min(),
    flight_delay_df.max(),
    flight_delay_df.mean(),
    flight_delay_df.std(),
    sep="\n",
)

In [None]:
# Histogram of the raw departure delays; update_layout() hands back the same
# figure, so the layout tweak is chained onto its construction.
fig = px.histogram(departure_delay_df, x='Departure Delay').update_layout(autosize=True)
fig.show()

In [None]:
# Histogram of the raw flight delays; update_layout() hands back the same
# figure, so the layout tweak is chained onto its construction.
fig = px.histogram(flight_delay_df, x='Flight Delay').update_layout(autosize=True)
fig.show()

## Feature Selection & Engineering

In [None]:
# Names of the per-airline columns selected from the source frame.
AIRLINE_COLS = [
    'Alaska Airlines',
    'Allegiant Air',
    'American Airlines',
    'Delta Airlines',
    'Endeavor Air',
    'Envoy Air',
    'ExpressJet',
    'Frontier Airlines',
    'Hawaiian Airlines',
    'JetBlue Airways',
    'Mesa Airline',
    'PSA Airlines',
    'Republic Airways',
    'SkyWest Airlines',
    'Southwest Airlines',
    'Spirit Airlines',
    'United Airlines',
    'Virgin America',
]

# Names of the origin-side columns used as model features.
FEATURES_COLS = [
    'Origin Total Operations',
    'Origin Precipitation',
    'Origin Rain',
    'Origin Snowfall',
    'Origin Windspeed',
    'Origin Windgusts',
    'Origin Evapotranspiration',
    'Origin Shortwave Radiation',
]

# Both selections get a fresh 0..n-1 index so they line up row-for-row with
# each other regardless of the labels left on the source frame.
airline_type_df = df[AIRLINE_COLS].reset_index(drop=True)
features_df = df[FEATURES_COLS].reset_index(drop=True)

airline_type_df.head()

In [None]:
# Column-wise totals of the airline columns, in AIRLINE_COLS order.
# Assumes each airline column is a 0/1 indicator so the sum is a flight
# count -- TODO confirm against the merge script.
num_airline_flights = list(airline_type_df.sum())

# Bar chart with one bar per airline. (A bare `num_airline_flights` expression
# used to sit here; it was never displayed because only a cell's last
# expression is rendered, so it has been removed.)
fig = go.Figure()
fig.add_trace(go.Bar(x=AIRLINE_COLS, y=num_airline_flights, name='Airlines'))

fig.update_layout(
    title='No. of Flights by Airline in 2018',
    xaxis_title='Category',
    yaxis_title='Count',
)
fig.show()

In [None]:
# Standardise the feature columns (fit and transform in a single step).
scaler = StandardScaler()
scaled_features_np = scaler.fit_transform(features_df)

# Wrap the scaled array back into a labelled frame and attach the two delay
# columns as-is; the delays themselves are not passed through the scaler.
scaled_features_df = pd.DataFrame(scaled_features_np, columns=FEATURES_COLS).assign(
    **{
        'Departure Delay': departure_delay_df['Departure Delay'],
        'Flight Delay': flight_delay_df['Flight Delay'],
    }
)

print(scaled_features_df.shape)
scaled_features_df.head()

In [None]:
threshold = 3
col = 'Departure Delay'

# Absolute distance of every value from the column mean, measured in
# standard deviations of that column.
z_scores = (
    scaled_features_df[col]
    .sub(scaled_features_df[col].mean())
    .div(scaled_features_df[col].std())
    .abs()
)
# Keep only the rows strictly inside the threshold.
scaled_df = scaled_features_df[z_scores < threshold]
print(scaled_df.shape)
scaled_df.head()

In [None]:
threshold = 3
col = 'Flight Delay'

# Absolute distance of every value from the column mean, measured in
# standard deviations of that column.
# NOTE: scaled_df is both the input and the output of this cell, and the mean
# and standard deviation are recomputed from its current rows, so re-running
# the cell can trim additional rows.
z_scores = (
    scaled_df[col]
    .sub(scaled_df[col].mean())
    .div(scaled_df[col].std())
    .abs()
)
# Keep only the rows strictly inside the threshold.
scaled_df = scaled_df[z_scores < threshold]
print(scaled_df.shape)
scaled_df.head()

In [None]:
# Range, mean and spread of the departure delay after outlier removal.
print(
    scaled_df["Departure Delay"].max(),
    scaled_df["Departure Delay"].min(),
    scaled_df["Departure Delay"].mean(),
    scaled_df["Departure Delay"].std(),
    sep="\n",
)

# Same four figures for the flight delay.
print(
    scaled_df["Flight Delay"].max(),
    scaled_df["Flight Delay"].min(),
    scaled_df["Flight Delay"].mean(),
    scaled_df["Flight Delay"].std(),
    sep="\n",
)

In [None]:
def classify_dep_delay(row):
    """Map a single departure-delay value to an ordinal class from 1 to 10.

    Each class has an inclusive upper bound; the value is placed in the first
    class whose bound it does not exceed.

    Args:
        row: One delay value (the function is applied element-wise to the
            "Departure Delay" column). The unit is not stated in this
            notebook -- presumably minutes; TODO confirm.

    Returns:
        int: 1 for values <= -120, rising one class per bound
        (-60, -30, -15, 0, 15, 30, 60, 120), and 10 for values above 120.
    """
    upper_bounds = (-120, -60, -30, -15, 0, 15, 30, 60, 120)
    for label, bound in enumerate(upper_bounds, start=1):
        if row <= bound:
            return label
    # No bound matched: the value is above 120. A NaN also lands here, because
    # every `<=` comparison against NaN is False.
    return len(upper_bounds) + 1
    
def classify_fl_delay(row):
    """Map a single flight-delay value to an ordinal class from 1 to 5.

    Each class has an inclusive upper bound; the value is placed in the first
    class whose bound it does not exceed.

    Args:
        row: One delay value (the function is applied element-wise to the
            "Flight Delay" column). The unit is not stated in this
            notebook -- presumably minutes; TODO confirm.

    Returns:
        int: 1 for values <= -30, 2 for <= -15, 3 for <= 0, 4 for <= 15,
        and 5 for anything above 15.
    """
    upper_bounds = (-30, -15, 0, 15)
    for label, bound in enumerate(upper_bounds, start=1):
        if row <= bound:
            return label
    # No bound matched: the value is above 15. A NaN also lands here, because
    # every `<=` comparison against NaN is False.
    return len(upper_bounds) + 1
    
# Attach the two class-label columns by building a new frame with .assign()
# rather than assigning items in place: scaled_df comes from boolean .loc
# filtering, and writing new columns into such a selection can raise pandas'
# SettingWithCopyWarning. Column order and values are the same as with item
# assignment, and re-running the cell simply overwrites the two columns.
scaled_df = scaled_df.assign(
    **{
        "Classified Departure Delay": scaled_df["Departure Delay"].apply(classify_dep_delay),
        "Classified Flight Delay": scaled_df["Flight Delay"].apply(classify_fl_delay),
    }
)

scaled_df.head()

In [None]:
# Write the filtered, labelled frame to the current working directory without
# the row index. (The file name has no placeholders, so it is a plain string
# rather than an f-string.)
scaled_df.to_csv("model_2018.csv", index=False)