In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

In [None]:
# !pip3 install --upgrade plotly

# Post-merge EDA

In [None]:
# Read the 2018 EDA CSV from the sibling scripts directory and preview the first rows.
eda_csv_path = "../scripts/eda_2018.csv"
df = pd.read_csv(eda_csv_path)
df.head()

In [None]:
# Report the frame's (rows, columns), then count the missing values in each column.
print(df.shape)
df.isna().sum()

In [None]:
# Remove every row that contains a missing value and renumber the remaining
# rows 0..n-1, discarding the old index.
df = df.dropna().reset_index(drop=True)
print(df.shape)
df.head()

## Creating Outcome Column

In [None]:
# Lift the departure-delay column into its own one-column frame with a fresh
# 0..n-1 index (the old index is discarded rather than kept as a column).
departure_delay_df = df['Departure Delay'].to_frame().reset_index(drop=True)
departure_delay_df.head()

In [None]:
# Lift the flight-delay column into its own one-column frame with a fresh
# 0..n-1 index (the old index is discarded rather than kept as a column).
flight_delay_df = df['Flight Delay'].to_frame().reset_index(drop=True)
flight_delay_df.head()

In [None]:
# Print the minimum, maximum, mean and standard deviation of the departure
# delay frame, one statistic after another.
for stat_name in ("min", "max", "mean", "std"):
    print(getattr(departure_delay_df, stat_name)())

In [None]:
# Print the minimum, maximum, mean and standard deviation of the flight delay
# frame, one statistic after another.
for stat_name in ("min", "max", "mean", "std"):
    print(getattr(flight_delay_df, stat_name)())

In [None]:
# Distribution of the raw departure delays; the figure sizes itself to the output area.
fig = px.histogram(departure_delay_df, x='Departure Delay').update_layout(autosize=True)
fig.show()

In [None]:
# Distribution of the raw flight delays; the figure sizes itself to the output area.
fig = px.histogram(flight_delay_df, x='Flight Delay').update_layout(autosize=True)
fig.show()

In [None]:
def classify_dep_delay(row):
    """Bucket a departure delay value into an ordinal class from 1 to 5.

    Upper bounds are inclusive: ``<= -60`` -> 1, ``<= -15`` -> 2,
    ``<= 15`` -> 3, ``<= 60`` -> 4, anything larger -> 5.

    Args:
        row: A single delay value (a scalar, despite the parameter name).

    Returns:
        The integer class 1-5.
    """
    upper_bounds = (-60, -15, 15, 60)
    for delay_class, upper_bound in enumerate(upper_bounds, start=1):
        if row <= upper_bound:
            return delay_class
    # Not <= any bound (larger than 60, or a value such as NaN for which every
    # comparison is False): fall through to the last class.
    return len(upper_bounds) + 1
    
def classify_fl_delay(row):
    """Bucket a flight delay value into an ordinal class from 1 to 5.

    Upper bounds are inclusive: ``<= -60`` -> 1, ``<= -15`` -> 2,
    ``<= 15`` -> 3, ``<= 60`` -> 4, anything larger -> 5.

    Args:
        row: A single delay value (a scalar, despite the parameter name).

    Returns:
        The integer class 1-5.
    """
    # Guard clauses, checked from the most negative bound upwards.
    if row <= -60:
        return 1
    if row <= -15:
        return 2
    if row <= 15:
        return 3
    if row <= 60:
        return 4
    return 5
    
# Store the class of every raw delay in a new column next to the delay itself.
classified_departure_delays = departure_delay_df["Departure Delay"].apply(classify_dep_delay)
departure_delay_df["Classified Departure Delay"] = classified_departure_delays

classified_flight_delays = flight_delay_df["Flight Delay"].apply(classify_fl_delay)
flight_delay_df["Classified Flight Delay"] = classified_flight_delays

## Feature Selection & Engineering

In [None]:
# Target columns: the two raw delays and their classified versions.
OUTCOMES = [
    'Departure Delay',
    'Flight Delay',
    'Classified Departure Delay',
    'Classified Flight Delay',
]

# Weather measurements at the origin airport.
ORIGIN_WEATHER_FEATURES = [
    'Origin Precipitation',
    'Origin Rain',
    'Origin Snowfall',
    'Origin Windspeed',
    'Origin Windgusts',
    'Origin Evapotranspiration',
    'Origin Shortwave Radiation',
]

# The same measurements, in the same order, at the destination airport.
DEST_WEATHER_FEATURES = [
    'Destination Precipitation',
    'Destination Rain',
    'Destination Snowfall',
    'Destination Windspeed',
    'Destination Windgusts',
    'Destination Evapotranspiration',
    'Destination Shortwave Radiation',
]

# Aircraft, route and airport-traffic columns.
NON_WEATHER_FEATURES = [
    'Seats',
    'Airplane Age',
    'Distance',
    'Origin Total Operations',
    'Destination Total Operations',
]

# Raw feature columns. Each list above is unpacked into a brand-new list, so
# the composite lists never share identity with their parts.
FEATURES_COLS = [
    'Seats', 'Airplane Age', 'Month',
    'Distance', 'Departure Time',
    'Origin Total Operations',
    *ORIGIN_WEATHER_FEATURES,
    'Destination Total Operations',
    *DEST_WEATHER_FEATURES,
]

# Columns handed to the scaler: the two monthly-median columns followed by
# every non-weather and weather feature.
SCALED_FEATURES_COLS = [
    'Monthly Median Flight Delay', 'Monthly Median Departure Delay',
    *NON_WEATHER_FEATURES,
    *ORIGIN_WEATHER_FEATURES,
    *DEST_WEATHER_FEATURES,
]

In [None]:
# Write the raw and classified delay columns back onto df, departure first and
# flight second, so the two new classified columns are appended in that order.
delay_sources = (
    (departure_delay_df, ('Departure Delay', 'Classified Departure Delay')),
    (flight_delay_df, ('Flight Delay', 'Classified Flight Delay')),
)
for delay_frame, delay_cols in delay_sources:
    for delay_col in delay_cols:
        df[delay_col] = delay_frame[delay_col]

# Carrier codes on their own, and the raw feature columns selected from df.
airline_type_df = df['Carrier Code']
features_df = df[FEATURES_COLS]

In [None]:
# Display airline_type_df (assigned from df['Carrier Code'] in the previous cell).
airline_type_df

In [None]:
# Count the flights operated by each carrier.
# The carrier is stored in the single 'Carrier Code' column, so value_counts()
# gives one count per airline. The earlier version summed that column and
# plotted against AIRLINE_COLS, a name that is not defined anywhere in the
# visible notebook (presumably a leftover from a one-hot layout; confirm).
# Reading from df rather than airline_type_df keeps this cell independent of
# whether a later cell has already rebuilt airline_type_df.
flights_per_airline = df['Carrier Code'].value_counts()
num_airline_flights = flights_per_airline.tolist()

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=flights_per_airline.index.tolist(),
        y=num_airline_flights,
        name='Airlines',
    )
)

fig.update_layout(
    title='No. of Flights by Airline in 2018',
    xaxis_title='Category',
    yaxis_title='Count',
)
fig.show()

In [None]:
# airline_type_df['Carrier'] = airline_type_df.idxmax(axis=1)
# airline_type_df.drop(AIRLINE_COLS, axis=1, inplace=True)
# airline_type_df.head()

In [None]:
# Carrier code, month and the outcome columns side by side, used for the
# per-carrier / per-month aggregates below.
# Selecting directly from df (instead of concatenating onto the existing
# airline_type_df) makes the cell idempotent: re-running it no longer appends
# duplicate 'Month' / outcome columns. The copy keeps this frame independent of df.
airline_type_df = df[['Carrier Code', 'Month', *OUTCOMES]].copy()
airline_type_df.head()

In [None]:
# Median delay for every (carrier, month) pair. Each result is a flat frame
# with the two key columns plus one renamed median column.
carrier_month_groups = airline_type_df.groupby(['Carrier Code', 'Month'])

monthly_median_carrier_flight_delay = (
    carrier_month_groups['Flight Delay']
    .median()
    .rename('Monthly Median Flight Delay')
    .reset_index()
)
monthly_median_carrier_dep_delay = (
    carrier_month_groups['Departure Delay']
    .median()
    .rename('Monthly Median Departure Delay')
    .reset_index()
)
monthly_median_carrier_dep_delay.head()

In [None]:
def _most_frequent(values):
    """Return the value that heads ``values.value_counts()`` (the most frequent one)."""
    return values.value_counts().index[0]


# Most frequent delay class for every (carrier, month) pair. Each result is a
# flat frame with the two key columns plus one renamed mode column.
carrier_month_groups = airline_type_df.groupby(['Carrier Code', 'Month'])

monthly_mode_carrier_flight_delay_class = (
    carrier_month_groups['Classified Flight Delay']
    .agg(_most_frequent)
    .rename('Monthly Mode Flight Delay Class')
    .reset_index()
)
monthly_mode_carrier_dep_delay_class = (
    carrier_month_groups['Classified Departure Delay']
    .agg(_most_frequent)
    .rename('Monthly Mode Departure Delay Class')
    .reset_index()
)
monthly_mode_carrier_dep_delay_class.head()

In [None]:
# airline_type_df never gets a 'Carrier' column (the cell that derived it is
# commented out), so reading airline_type_df['Carrier'] raised KeyError.
# The carrier now lives in 'Carrier Code'; expose it under the 'Carrier' name.
# NOTE(review): assumes 'Carrier' is meant to mirror 'Carrier Code'; confirm.
df['Carrier'] = df['Carrier Code']

In [None]:
# Left-join each per-(carrier, month) aggregate onto df, in the same order as
# before: median flight delay, median departure delay, mode flight-delay class,
# mode departure-delay class.
carrier_month_keys = ['Carrier Code', 'Month']
carrier_month_stats = (
    monthly_median_carrier_flight_delay,
    monthly_median_carrier_dep_delay,
    monthly_mode_carrier_flight_delay_class,
    monthly_mode_carrier_dep_delay_class,
)
for stats_df in carrier_month_stats:
    df = df.merge(
        stats_df.set_index(carrier_month_keys),
        on=carrier_month_keys,
        how='left',
    )

In [None]:
# Preview df after the per-carrier monthly aggregates were merged in.
df.head()

## Sampling

In [None]:
# Optional down-sampling switch. Leave SAMPLE_ROWS as None to use every row
# (the current behaviour); set it to a row count such as 3_000_000 to work on a
# reproducible random subset instead.
SAMPLE_ROWS = None

if SAMPLE_ROWS is None:
    # No copy is made here: sampled_df and df are the same object.
    sampled_df = df
else:
    # Fixed seed for repeatable samples. The index is renumbered 0..n-1 so the
    # sample lines up with frames that are later rebuilt from NumPy arrays.
    sampled_df = df.sample(n=SAMPLE_ROWS, random_state=42).reset_index(drop=True)

print(sampled_df.shape)
sampled_df.head()

In [None]:
# Distribution of one feature column; the title is derived from the column name.
col = "Airplane Age"

fig = px.histogram(features_df, x=col).update_layout(title=f"{col} Histogram")
fig.show()

In [None]:
# Pairwise correlations of the raw feature columns, drawn as a heatmap whose
# colour scale is pinned to the full [-1, 1] range.
corr = features_df.corr()

heatmap_options = dict(
    x=corr.columns,
    y=corr.columns,
    color_continuous_scale='RdBu',
    zmin=-1,
    zmax=1,
)
fig = px.imshow(corr, **heatmap_options)
fig.update_layout(title='Correlation Heatmap')
fig.show()

In [None]:
# Distribution of one feature column; the title is derived from the column name.
col = "Departure Time"

fig = px.histogram(features_df, x=col).update_layout(title=f"{col} Histogram")
fig.show()

In [None]:
# Cluster the flights into 5 groups using the non-weather features.
# random_state pins the centroid initialisation so the cluster labels (and the
# scatter plots coloured by them) are the same on every run.
# NOTE(review): X comes from sampled_df, i.e. the features are not standardised
# here, so large-magnitude columns will dominate the distances; confirm this
# is intended.
X = sampled_df[NON_WEATHER_FEATURES]
kmeans = KMeans(n_clusters=5, random_state=42).fit(X)
labels = kmeans.labels_

In [None]:
# Flight delay against departure time, coloured by k-means cluster.
# .assign() returns a new frame, so the cluster column is added without
# writing into a slice of sampled_df (which triggered SettingWithCopyWarning).
clustering_result_df = sampled_df[["Departure Time", "Flight Delay"]].assign(Cluster=labels)

fig = px.scatter(clustering_result_df, x='Flight Delay', y="Departure Time", color='Cluster')
fig.update_layout(title='Scatter Plot with Clustering')
fig.show()

In [None]:
# Flight delay against month, coloured by k-means cluster.
# .assign() returns a new frame, so the cluster column is added without
# writing into a slice of sampled_df (which triggered SettingWithCopyWarning).
clustering_result_df = sampled_df[["Month", "Flight Delay"]].assign(Cluster=labels)

fig = px.scatter(clustering_result_df, x='Flight Delay', y="Month", color='Cluster')
fig.update_layout(title='Scatter Plot with Clustering')
fig.show()

In [None]:
# Column label for each departure-time class returned by classify_dep_time.
CLASSIFIED_DEP_TIME = {
    1: 'Morning Dep Time',
    2: 'Regular Dep Time',
    3: 'Night Dep Time',
}


def classify_dep_time(row):
    """Bucket a departure time value into class 1, 2 or 3.

    Values ``<= 600`` map to 1, values ``<= 2000`` map to 2, and everything
    else maps to 3. The class numbers are the keys of ``CLASSIFIED_DEP_TIME``.
    NOTE(review): the thresholds look like HHMM clock times; confirm the
    encoding of 'Departure Time'.

    Args:
        row: A single departure time value (a scalar, despite the name).

    Returns:
        The integer class 1-3.
    """
    if row <= 600:
        return 1
    return 2 if row <= 2000 else 3
    
# One-hot encode the departure-time class into the three columns named by
# CLASSIFIED_DEP_TIME.
col = "Classified Departure Time"
dep_time_columns = list(CLASSIFIED_DEP_TIME.values())

# Build the labels as a standalone Series. The earlier version wrote a
# temporary column and then called replace(inplace=True) on a column
# selection, a chained in-place update that pandas warns about and that has
# no effect under Copy-on-Write.
dep_time_labels = (
    sampled_df["Departure Time"]
    .apply(classify_dep_time)
    .map(CLASSIFIED_DEP_TIME)
    .rename(col)
)

# reindex guarantees all three dummy columns exist, in dictionary order, even
# when a class has no rows; a later cell selects all of them by name.
encoded_dep_time_df = pd.get_dummies(dep_time_labels).reindex(
    columns=dep_time_columns, fill_value=0
)

# Dropping any dummy columns left by a previous run keeps the cell idempotent.
# sampled_df is rebound to a new frame, so the frame it was taken from is
# never mutated.
sampled_df = pd.concat(
    [sampled_df.drop(columns=dep_time_columns, errors='ignore'), encoded_dep_time_df],
    axis=1,
)
sampled_df.head()

In [None]:
# MONTHS = {
#     1: 'January',
#     2: 'February',
#     3: 'March',
#     4: 'April',
#     5: 'May',
#     6: 'June',
#     7: 'July',
#     8: 'August',
#     9: 'September',
#     10: 'October',
#     11: 'November', 
#     12: 'December'
# }

# col = "Month"

# sampled_df[col].replace(MONTHS, inplace=True)
# encoded_dep_time_df = pd.get_dummies(sampled_df[col])
# sampled_df.drop(col, axis=1, inplace=True)
# sampled_df = pd.concat([sampled_df, encoded_dep_time_df], axis=1)
# sampled_df.head()

In [None]:
# Standardise the columns listed in SCALED_FEATURES_COLS.
scaler = StandardScaler()
scaled_sampled_np = scaler.fit_transform(sampled_df[SCALED_FEATURES_COLS])

# fit_transform returns a bare array, so the frame is rebuilt with sampled_df's
# own index. Without it the scaled rows get a fresh 0..n-1 index and the later
# column-wise concat with sampled_df would misalign whenever sampled_df is
# indexed differently.
scaled_sampled_df = pd.DataFrame(
    scaled_sampled_np,
    columns=SCALED_FEATURES_COLS,
    index=sampled_df.index,
)

print(scaled_sampled_df.shape)
scaled_sampled_df.head()

In [None]:
# Compress the scaled origin-weather columns into two principal components.
origin_weather_df = scaled_sampled_df[ORIGIN_WEATHER_FEATURES]

# random_state makes the result repeatable when scikit-learn selects its
# randomized SVD solver; it is ignored by the deterministic solvers.
pca = PCA(n_components=2, random_state=42)
transformed_data = pca.fit_transform(origin_weather_df)
print(f"Variance explained by each component: {pca.explained_variance_ratio_}")

# Keep the input's index so the components stay row-aligned with the other
# frames when they are concatenated column-wise later.
pca_origin_weather_df = pd.DataFrame(
    transformed_data,
    columns=["PCA Origin Weather 1", "PCA Origin Weather 2"],
    index=origin_weather_df.index,
)
pca_origin_weather_df.head()

In [None]:
# Compress the scaled destination-weather columns into two principal components.
dest_weather_df = scaled_sampled_df[DEST_WEATHER_FEATURES]

# random_state makes the result repeatable when scikit-learn selects its
# randomized SVD solver; it is ignored by the deterministic solvers.
pca = PCA(n_components=2, random_state=42)
transformed_data = pca.fit_transform(dest_weather_df)
print(f"Variance explained by each component: {pca.explained_variance_ratio_}")

# Keep the input's index so the components stay row-aligned with the other
# frames when they are concatenated column-wise later.
pca_dest_weather_df = pd.DataFrame(
    transformed_data,
    columns=["PCA Dest Weather 1", "PCA Dest Weather 2"],
    index=dest_weather_df.index,
)
pca_dest_weather_df.head()

In [None]:
# Assemble the modelling frame column-wise from its pieces. The pieces are
# aligned on their row index, so they must all share the same index.
monthly_median_cols = ['Monthly Median Flight Delay', 'Monthly Median Departure Delay']
monthly_mode_cols = ['Monthly Mode Flight Delay Class', 'Monthly Mode Departure Delay Class']
dep_time_cols = list(CLASSIFIED_DEP_TIME.values())

final_parts = [
    scaled_sampled_df[NON_WEATHER_FEATURES],   # scaled non-weather features
    pca_origin_weather_df,                     # two origin-weather components
    pca_dest_weather_df,                       # two destination-weather components
    sampled_df[dep_time_cols],                 # one-hot departure-time class
    scaled_sampled_df[monthly_median_cols],    # scaled monthly medians
    sampled_df[monthly_mode_cols],             # unscaled monthly mode classes
    sampled_df[OUTCOMES],                      # unscaled outcome columns
]
final_sampled_df = pd.concat(final_parts, axis=1)

print(final_sampled_df.shape)
final_sampled_df.head()

In [None]:
# Keep only rows whose departure delay lies strictly within `threshold`
# standard deviations of the column mean.
threshold = 3
col = 'Departure Delay'

delay_values = final_sampled_df[col]
z_scores = ((delay_values - delay_values.mean()) / delay_values.std()).abs()
final_sampled_df = final_sampled_df[z_scores < threshold]
print(final_sampled_df.shape)
final_sampled_df.head()

In [None]:
# Keep only rows whose flight delay lies strictly within `threshold` standard
# deviations of the column mean (computed on the rows that are still present).
threshold = 3
col = 'Flight Delay'

delay_values = final_sampled_df[col]
z_scores = ((delay_values - delay_values.mean()) / delay_values.std()).abs()
final_sampled_df = final_sampled_df[z_scores < threshold]
print(final_sampled_df.shape)
final_sampled_df.head()

In [None]:
# Max, min, mean and standard deviation of each raw delay after outlier
# removal, with a blank line separating the two columns.
for position, delay_col in enumerate(("Departure Delay", "Flight Delay")):
    if position:
        print()
    for stat_name in ("max", "min", "mean", "std"):
        print(getattr(final_sampled_df[delay_col], stat_name)())

In [None]:
# Report the final frame's dimensions, then count missing values per column.
print(final_sampled_df.shape)
final_sampled_df.isna().sum()

In [None]:
# Pairwise correlations of the final modelling columns, drawn as a heatmap
# whose colour scale is pinned to the full [-1, 1] range.
corr = final_sampled_df.corr()

heatmap_options = dict(
    x=corr.columns,
    y=corr.columns,
    color_continuous_scale='RdBu',
    zmin=-1,
    zmax=1,
)
fig = px.imshow(corr, **heatmap_options)
fig.update_layout(title='Correlation Heatmap')
fig.show()

In [None]:
# Columns written to the model input file, grouped by role. The groups are
# private to this cell; only FINAL_COLS is read elsewhere. 'Seats' is not in
# the list.
_aircraft_and_airport_cols = [
    'Airplane Age', 'Distance',
    'Origin Total Operations', 'Destination Total Operations',
]
_weather_component_cols = [
    'PCA Origin Weather 1', 'PCA Origin Weather 2',
    'PCA Dest Weather 1', 'PCA Dest Weather 2',
]
_dep_time_cols = ['Morning Dep Time', 'Regular Dep Time', 'Night Dep Time']
_monthly_carrier_cols = [
    'Monthly Median Flight Delay', 'Monthly Median Departure Delay',
    'Monthly Mode Flight Delay Class', 'Monthly Mode Departure Delay Class',
]
_outcome_cols = [
    'Departure Delay', 'Flight Delay',
    'Classified Departure Delay', 'Classified Flight Delay',
]

FINAL_COLS = [
    *_aircraft_and_airport_cols,
    *_weather_component_cols,
    *_dep_time_cols,
    *_monthly_carrier_cols,
    *_outcome_cols,
]

In [None]:
# Keep only the columns listed in FINAL_COLS, in that order, and display the result.
final = final_sampled_df.loc[:, FINAL_COLS]
final

In [None]:
# Write the modelling table next to the notebook, without the row index.
model_csv_path = "model_2018.csv"
final.to_csv(model_csv_path, index=False)