In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.feature_selection import VarianceThreshold

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

In [None]:
# !pip3 install --upgrade plotly

In [None]:
# df_length = len(df)

# df1 = df.iloc[:df_length//2]
# df2 = df.iloc[df_length//2:]

# df1.to_csv('eda_2018_part1.csv', index=False)
# df2.to_csv('eda_2018_part2.csv', index=False)

# Post-merge EDA

In [None]:
# Load eda_2018.csv into `df` and take a first look at its size and first rows.
df = pd.read_csv("eda_2018.csv")
print(df.shape)
df.head()

In [None]:
# List the available column names.
df.columns

In [None]:
# Count missing values per column before any imputation is applied.
df.isnull().sum()

In [None]:
# Replace missing values with '' in the non-numeric columns only.
# A blanket df.fillna('') would also write '' into numeric columns that contain
# NaN, turning them into mixed object columns and breaking the arithmetic,
# z-score filtering, scaling and correlation steps that follow.
# The '' sentinel is what is_holiday() later tests for in the 'Holiday' column.
non_numeric_cols = df.select_dtypes(exclude='number').columns
df[non_numeric_cols] = df[non_numeric_cols].fillna('')
print(df.shape)
df.head()

## Creating Outcome Column

In [None]:
# Redefine the outcome in place: remove the 'NAS Delay' and 'Security Delay'
# components from the recorded 'Departure Delay'.
# NOTE: this overwrites 'Departure Delay', so re-running the cell subtracts the
# two components a second time; reload `df` before re-running.
df['Departure Delay'] = df['Departure Delay'] - df['NAS Delay'] - df['Security Delay']

In [None]:
# Column inspected by the summary-statistics and outlier-removal cells below.
col = 'Departure Delay'

In [None]:
# Summary statistics of the outcome column before outlier removal.
for stat_name in ('max', 'mean', 'median', 'min'):
    print(getattr(df[col], stat_name)())

In [None]:
# Keep only rows whose outcome value lies strictly within `threshold` standard
# deviations of the mean. Re-running this cell filters the already-filtered frame.
threshold = 3

col_mean = df[col].mean()
col_std = df[col].std()
z_scores = ((df[col] - col_mean) / col_std).abs()

df = df[z_scores < threshold]
df.shape

In [None]:
# Same summary statistics, now computed on the outlier-filtered frame.
for stat_name in ('max', 'mean', 'median', 'min'):
    print(getattr(df[col], stat_name)())

## Feature Selection & Engineering

In [None]:
# Column groups are declared once and composed into the three public lists so
# that the weather / non-weather / combined selections cannot drift apart.

# Flight-level descriptors.
_FLIGHT_COLS = [
    'Date', 'Carrier Code', 'Departure Time', 'Distance',
    'Month', 'Airplane Age',
    'Origin Total Operations',
]

# Weather observations at the origin airport.
_ORIGIN_WEATHER_COLS = [
    'Origin Weather Code',
    'Origin Temperature Max', 'Origin Temperature Min', 'Origin Temperature Mean',
    'Origin Apparent Temperature Max', 'Origin Apparent Temperature Min',
    'Origin Apparent Temperature Mean',
    'Origin Sunrise', 'Origin Sunset',
    'Origin Shortwave Radiation',
    'Origin Precipitation', 'Origin Rain', 'Origin Snowfall',
    'Origin Precipitation Hours',
    'Origin Windspeed', 'Origin Windgusts', 'Origin Wind Direction',
    'Origin Evapotranspiration',
]

# Holiday descriptors and the outcome column.
_HOLIDAY_COLS = ['Holiday', 'Day of Holiday']
_OUTCOME_COLS = ['Departure Delay']

# Every column used in the analysis, outcome last.
ALL_COLS = _FLIGHT_COLS + _ORIGIN_WEATHER_COLS + _HOLIDAY_COLS + _OUTCOME_COLS

# Weather columns, keyed by 'Date'.
WEATHER_FEATURES = ['Date'] + _ORIGIN_WEATHER_COLS

# Everything that is not a weather observation (includes the outcome).
NON_WEATHER_FEATURES = _FLIGHT_COLS + _HOLIDAY_COLS + _OUTCOME_COLS

In [None]:
# Split `df` into the working frames used by the sections below.
# Each selection is copied explicitly: later cells add columns to weather_df
# and non_weather_df, and writing into an un-copied selection of `df` triggers
# pandas' SettingWithCopyWarning (the write may or may not reach the object
# you intended).
airline_type_df = df['Carrier Code'].copy()
features_df = df[ALL_COLS].copy()
weather_df = df[WEATHER_FEATURES].copy()
non_weather_df = df[NON_WEATHER_FEATURES].copy()

## Monthly Median Departure Delay for each Carrier

In [None]:
# Carrier, month and delay for every flight, taken straight from `df`.
# Building the frame from `df` (instead of concatenating onto the previous
# value of airline_type_df) keeps the cell idempotent: re-running it no longer
# appends duplicate 'Month' / 'Departure Delay' columns.
airline_type_df = df[['Carrier Code', 'Month', 'Departure Delay']].copy()
airline_type_df.head()

In [None]:
monthly_median_carrier_dep_delay = airline_type_df.groupby(['Carrier Code', 'Month'])['Departure Delay'].median().reset_index()
monthly_median_carrier_dep_delay.rename(columns={'Departure Delay': 'Monthly Median Departure Delay'}, inplace=True)
monthly_median_carrier_dep_delay.head()


In [None]:
monthly_median_carrier_dep_delay.isnull().sum()

## Weather EDA

In [None]:
# Frequency of each distinct 'Origin Weather Code' value.
weather_df['Origin Weather Code'].value_counts()

In [None]:
def interpret_wmo(row):
    """Map a weather code to a coarse intensity level.

    Returns 1 for the codes 53, 63 and 73, 2 for the codes 55, 65 and 75,
    and 0 for every other value (including missing values).
    """
    level_one_codes = (53.0, 63.0, 73.0)
    level_two_codes = (55.0, 65.0, 75.0)

    if row in level_one_codes:
        return 1
    return 2 if row in level_two_codes else 0

# Derive the 0/1/2 intensity level of every row from its weather code.
weather_codes = weather_df['Origin Weather Code']
weather_df['Weather Code Intensity'] = weather_codes.map(interpret_wmo)
weather_df.head()

In [None]:
# Requested number of quantile bins for 'Origin Precipitation Hours'.
num_bins = 5

# With labels=False, pd.qcut returns the integer bin code of every row (not the
# bin edges, despite the variable name). duplicates='drop' merges quantiles that
# share an edge, so fewer than num_bins distinct codes can come out.
bin_edges = pd.qcut(weather_df['Origin Precipitation Hours'], q=num_bins, labels=False, duplicates='drop')

weather_df['Origin Precipitation Hours (Equal)'] = bin_edges
# Show how many rows landed in each bin.
weather_df['Origin Precipitation Hours (Equal)'].value_counts()

In [None]:
# Preview the weather frame with the columns added so far.
weather_df.head()

In [None]:
weather_df['Origin Daylight'] = pd.to_datetime(weather_df['Origin Sunset']) - pd.to_datetime(weather_df['Origin Sunrise'])
weather_df['Origin Daylight'] = weather_df['Origin Daylight'].apply(lambda x:x.total_seconds())

num_bins = 4

bin_edges = pd.qcut(weather_df['Origin Daylight'], q=num_bins, labels=False, duplicates='drop')

weather_df['Origin Daylight (Equal)'] = bin_edges
weather_df['Origin Daylight (Equal)'].value_counts()

In [None]:
weather_df.columns

In [None]:
# 'Date' plus the weather measurements that are averaged per week further down.
SELECTED_WEATHER_COLS = [
    'Date', 'Origin Temperature Mean',
    'Origin Shortwave Radiation', 'Origin Precipitation', 'Origin Rain', 'Origin Snowfall',
    'Origin Windspeed', 'Origin Windgusts', 'Origin Wind Direction', 'Origin Evapotranspiration',
]

# Selection of weather_df restricted to those columns.
selected_weather_df = weather_df[SELECTED_WEATHER_COLS]


In [None]:
# Build a date-indexed copy of the selected weather columns.
# The frame is rebuilt from weather_df on every run (rather than mutated in
# place), which avoids the SettingWithCopyWarning from assigning into a
# selection and keeps the cell re-runnable: the in-place version raised
# KeyError on a second run because 'Date' had already been moved to the index.
selected_weather_df = (
    weather_df[SELECTED_WEATHER_COLS]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

# Mean of every measurement per calendar week ('W' resampling frequency);
# each resulting row is labelled with its week-ending date.
weekly_means = selected_weather_df.resample('W').mean()

In [None]:
# Old column name -> name making explicit that the value is a weekly mean.
RENAMED_WEEKLY_WEATHER_COLS = {
    'Origin Temperature Mean': 'Origin Weekly Mean Temperature',
    'Origin Shortwave Radiation': 'Origin Weekly Mean Shortwave Radiation',
    'Origin Precipitation': 'Origin Weekly Mean Precipitation',
    'Origin Rain': 'Origin Weekly Mean Rain',
    'Origin Snowfall': 'Origin Weekly Mean Snowfall',
    'Origin Windspeed': 'Origin Weekly Mean Windspeed',
    'Origin Windgusts': 'Origin Weekly Mean Windgusts',
    'Origin Wind Direction': 'Origin Weekly Mean Wind Direction',
    'Origin Evapotranspiration': 'Origin Weekly Mean Evapotranspiration',
}

# Rename the columns and turn the 'Date' index back into a regular column.
weekly_df = (
    weekly_means
    .rename(columns=RENAMED_WEEKLY_WEATHER_COLS)
    .reset_index()
)
weekly_df.head()

In [None]:
# Expand every weekly row into the seven calendar days it covers (the
# week-ending date and the six days before it), so the weekly means can be
# joined back onto per-day data.
# This is vectorised: the previous version grew daily_df with pd.concat inside
# a nested loop, which copies the whole frame on every iteration (quadratic).
DAYS_PER_WEEK = 7

# Repeat each weekly row once per day of its week; row order is preserved.
repeated_positions = np.repeat(np.arange(len(weekly_df)), DAYS_PER_WEEK)
daily_df = weekly_df.iloc[repeated_positions].reset_index(drop=True)

# Offsets -6, -5, ..., 0 days relative to the week-ending date, tiled per week.
day_offsets = pd.to_timedelta(
    np.tile(np.arange(1 - DAYS_PER_WEEK, 1), len(weekly_df)), unit='D'
)

# Swap the week-ending date for the individual day. pop + re-insert keeps
# 'Date' as the last column, as in the loop-based version.
week_end_dates = daily_df.pop('Date')
daily_df['Date'] = week_end_dates + day_offsets

print(daily_df.shape)
daily_df.head()

In [None]:
# Parse 'Date' so it has the same datetime dtype as daily_df['Date'].
weather_df['Date'] = pd.to_datetime(weather_df['Date'])

# Attach the weekly means to every weather row by calendar day.
# validate='many_to_one' makes pandas raise if daily_df ever contains a
# duplicated date: a duplicate would silently multiply rows here and break the
# row-position concat with the non-weather frame later on.
final_weather_df = weather_df.merge(
    daily_df,
    on='Date',
    how='left',
    validate='many_to_one',
)
print(final_weather_df.shape)
final_weather_df.head()

In [None]:
# Row/column count of the merged weather frame.
final_weather_df.shape

In [None]:
# Columns available after the weekly means were merged in.
final_weather_df.columns

In [None]:
# Missing values per column; nulls in the weekly-mean columns would indicate
# dates that found no match in daily_df during the left merge.
final_weather_df.isnull().sum()

## Binning Continuous-Valued Features

In [None]:
# Class id returned by classify_dep_time -> label used as the dummy column name.
CLASSIFIED_DEP_TIME = {
    1: 'Morning Dep Time',
    2: 'Regular Dep Time',
    3: 'Night Dep Time',
}


def classify_dep_time(row):
    """Bucket a departure time into one of three classes.

    Returns 1 for values up to and including 600, 2 for values up to and
    including 2000, and 3 for anything larger.
    Presumably `row` is a clock time encoded as an HHMM number; confirm
    against the dataset.
    """
    # Inclusive upper bounds of classes 1 and 2; everything above is class 3.
    for class_id, upper_bound in enumerate((600, 2000), start=1):
        if row <= upper_bound:
            return class_id
    return 3
    
col = "Classified Departure Time"

# Classify every departure time and translate the class id to its label.
# The labels are computed as a standalone Series instead of being written into
# non_weather_df and patched with `.replace(..., inplace=True)`: an in-place
# replace on a column selection is deprecated chained assignment and does not
# update the frame under pandas copy-on-write, which would leave the dummy
# columns named 1/2/3 instead of the expected labels.
dep_time_labels = (
    non_weather_df["Departure Time"]
    .apply(classify_dep_time)
    .map(CLASSIFIED_DEP_TIME)
    .rename(col)
)

# One indicator column per departure-time label.
encoded_dep_time_df = pd.get_dummies(dep_time_labels)

# Drop indicator columns left over from a previous run before appending, so
# re-running the cell does not produce duplicate columns.
non_weather_df = pd.concat(
    [
        non_weather_df.drop(columns=encoded_dep_time_df.columns, errors='ignore'),
        encoded_dep_time_df,
    ],
    axis=1,
)
non_weather_df.head()

In [None]:
def is_holiday(row):
    """Return 0 when `row` equals the empty string, otherwise 1."""
    return int(row != '')

def is_labour_day(row):
    """Return 1 when `row` is exactly 'Labor Day', otherwise 0."""
    return int(row == 'Labor Day')

def is_juneteenth(row):
    """Return 1 when `row` is exactly 'Juneteenth', otherwise 0."""
    return int(row == 'Juneteenth')

def is_xmas(row):
    """Return 1 when `row` is 'Christmas Day' or 'Christmas Eve', otherwise 0."""
    return int(row in ('Christmas Day', 'Christmas Eve'))

# New indicator column -> predicate applied to the 'Holiday' column.
holiday_flag_builders = {
    'Is Holiday': is_holiday,
    'Is Labour Day': is_labour_day,
    'Is Juneteenth': is_juneteenth,
    'Is Xmas': is_xmas,
}

for flag_col, flag_fn in holiday_flag_builders.items():
    non_weather_df[flag_col] = non_weather_df['Holiday'].apply(flag_fn)

non_weather_df.head()

In [None]:
# Missing values per column after the departure-time and holiday flags were added.
non_weather_df.isnull().sum()

In [None]:
# Row/column count of the non-weather frame.
non_weather_df.shape

## Merging EDA-ed columns

In [None]:
# Engineered weather columns carried into the modelling frame.
SELECTED_WEATHER = [
    'Date',
    'Origin Precipitation', 'Origin Precipitation Hours (Equal)',
    'Origin Daylight (Equal)', 'Origin Weekly Mean Temperature',
    'Origin Weekly Mean Windspeed', 'Origin Weekly Mean Precipitation',
    'Weather Code Intensity'
]

# Engineered non-weather columns carried into the modelling frame.
SELECTED_NON_WEATHER = [
    'Carrier Code', 'Distance', 'Month',
    'Airplane Age', 'Origin Total Operations',
    'Departure Delay', 'Morning Dep Time', 'Night Dep Time',
    'Regular Dep Time', 'Is Holiday', 'Is Labour Day', 'Is Juneteenth',
    'Is Xmas'
]

# reset_index returns a fresh frame with a 0..n-1 index; doing it out of place
# avoids mutating a selection of another frame (SettingWithCopyWarning).
df1 = final_weather_df[SELECTED_WEATHER].reset_index(drop=True)
df2 = non_weather_df[SELECTED_NON_WEATHER].reset_index(drop=True)

# The two halves are glued together purely by row position, so they must
# describe the same rows; a length mismatch would silently pad with NaN.
assert len(df1) == len(df2), (
    f"weather ({len(df1)}) and non-weather ({len(df2)}) frames differ in length"
)

final_df = pd.concat([df1, df2], axis=1)

# Attach each row's carrier/month median delay. validate='many_to_one' raises
# if the lookup table ever holds a duplicated (carrier, month) pair, which
# would otherwise duplicate flight rows.
final_df = final_df.merge(
    monthly_median_carrier_dep_delay,
    on=['Carrier Code', 'Month'],
    how='left',
    validate='many_to_one',
)
final_df.isnull().sum()


In [None]:
# Row/column count of the merged frame.
final_df.shape

In [None]:
# Columns kept from the merged frame, in display order. 'Carrier Code' is not
# listed, so it is dropped here.
FINAL_SELECTED = [
    'Date',
    'Origin Precipitation', 'Origin Precipitation Hours (Equal)', 'Weather Code Intensity',
    'Origin Daylight (Equal)', 'Origin Weekly Mean Temperature',
    'Origin Weekly Mean Windspeed', 'Origin Weekly Mean Precipitation', 'Distance', 'Month',
    'Airplane Age', 'Origin Total Operations', 'Departure Delay',
    'Morning Dep Time', 'Night Dep Time', 'Regular Dep Time', 'Is Holiday',
    'Is Labour Day', 'Is Juneteenth', 'Is Xmas',
    'Monthly Median Departure Delay'
]

# Restrict final_df to those columns (this rebinds final_df to the selection).
final_df = final_df[FINAL_SELECTED]
print(final_df.shape)
final_df.head()

## Standardising Features

In [None]:
# Continuous columns that are standardised to zero mean / unit variance.
SCALED_FEATURES_COLS = [
    'Origin Precipitation', 'Origin Weekly Mean Temperature',
    'Origin Weekly Mean Windspeed', 'Origin Weekly Mean Precipitation', 'Distance',
    'Airplane Age', 'Origin Total Operations',
    'Monthly Median Departure Delay'
]

scaler = StandardScaler()
scaled_final_np = scaler.fit_transform(final_df[SCALED_FEATURES_COLS])

# fit_transform returns a bare array, so the row labels are re-attached
# explicitly. Reusing final_df's index guarantees the scaled values line up
# with their source rows when the two frames are concatenated column-wise,
# even if final_df's index is not 0..n-1.
scaled_final_df = pd.DataFrame(
    scaled_final_np,
    columns=SCALED_FEATURES_COLS,
    index=final_df.index,
)

print(scaled_final_df.shape)
scaled_final_df.head()

In [None]:
final_df.drop(columns=SCALED_FEATURES_COLS, inplace=True)
print(final_df.shape)
print(scaled_final_df[SCALED_FEATURES_COLS].shape)
final_df = pd.concat([final_df, scaled_final_df[SCALED_FEATURES_COLS]], axis=1)
print(final_df.shape)
final_df.head()

In [None]:
final_df.isnull().sum()

## PCA Analysis

In [None]:
# origin_weather_df = scaled_sampled_df[ORIGIN_WEATHER_FEATURES]

# pca = PCA(n_components=2)
# transformed_data = pca.fit_transform(origin_weather_df)
# print(f"Variance explained by each component: {pca.explained_variance_ratio_}")

# pca_origin_weather_df = pd.DataFrame(transformed_data, columns=["PCA Origin Weather 1", "PCA Origin Weather 2"])
# pca_origin_weather_df.head()

In [None]:
# dest_weather_df = scaled_sampled_df[DEST_WEATHER_FEATURES]

# pca = PCA(n_components=2)
# transformed_data = pca.fit_transform(dest_weather_df)
# print(f"Variance explained by each component: {pca.explained_variance_ratio_}")

# pca_dest_weather_df = pd.DataFrame(transformed_data, columns=["PCA Dest Weather 1", "PCA Dest Weather 2"])
# pca_dest_weather_df.head()

In [None]:
# final_sampled_df = pd.concat([
#     scaled_sampled_df[NON_WEATHER_FEATURES], 
#     pca_origin_weather_df, pca_dest_weather_df, 
#     sampled_df[list(CLASSIFIED_DEP_TIME.values())],
#     scaled_sampled_df[[
#        'Monthly Median Flight Delay', 'Monthly Median Departure Delay', 
#     ]],
#     sampled_df[OUTCOMES]
# ], axis=1)

# print(final_sampled_df.shape)
# final_sampled_df.head()

## Binning Departure Delay

In [None]:
def classify_dep_delay3(row):
    """Bucket a departure delay into three classes.

    Returns 0 for values <= -15, 1 for values in (-15, 15], and 2 otherwise.
    Presumably the delay is expressed in minutes; confirm against the dataset.
    """
    # Inclusive upper bounds of classes 0 and 1; everything above is class 2.
    for class_id, upper_bound in enumerate((-15, 15)):
        if row <= upper_bound:
            return class_id
    return 2

def classify_dep_delay5(row):
    """Bucket a departure delay into five classes.

    Returns 0 for values < -60, 1 for [-60, -15], 2 for (-15, 15],
    3 for (15, 60], and 4 otherwise.
    Presumably the delay is expressed in minutes; confirm against the dataset.
    """
    # The lowest class uses a strict bound, unlike the inclusive ones below.
    if row < -60:
        return 0
    # Inclusive upper bounds of classes 1..3; everything above is class 4.
    for class_id, upper_bound in enumerate((-15, 15, 60), start=1):
        if row <= upper_bound:
            return class_id
    return 4
    
# Only the five-class fixed-threshold target is materialised; the three-class
# variant (classify_dep_delay3) is currently not applied.
departure_delays = final_df["Departure Delay"]
final_df["Classified Departure Delay (5)"] = departure_delays.map(classify_dep_delay5)

In [None]:
# num_bins = 3

# bin_edges = pd.qcut(final_df["Departure Delay"], q=num_bins, labels=False, duplicates='drop')
# bin_edges = pd.qcut(final_df["Departure Delay"], q=num_bins, labels=False, duplicates='drop')

# final_df['Classified Departure Delay (3 Equal)'] = bin_edges
# final_df['Classified Departure Delay (3 Equal)'].value_counts()

In [None]:
# final_df["Classified Departure Delay (3)"].value_counts()

In [None]:
# print(final_df[final_df['Classified Departure Delay (3 Equal)'] == 0]['Departure Delay'].min())
# print(final_df[final_df['Classified Departure Delay (3 Equal)'] == 0]['Departure Delay'].max())
# print()
# print(final_df[final_df['Classified Departure Delay (3 Equal)'] == 1]['Departure Delay'].min())
# print(final_df[final_df['Classified Departure Delay (3 Equal)'] == 1]['Departure Delay'].max())
# print()
# print(final_df[final_df['Classified Departure Delay (3 Equal)'] == 2]['Departure Delay'].min())
# print(final_df[final_df['Classified Departure Delay (3 Equal)'] == 2]['Departure Delay'].max())

In [None]:
# Quantile-bin the departure delay into (up to) num_bins equally populated
# classes. With labels=False, pd.qcut returns each row's integer bin code;
# duplicates='drop' merges quantiles that share an edge, so fewer bins can
# result. (The identical qcut call used to be executed twice in a row; the
# redundant second computation is removed.)
num_bins = 5

bin_edges = pd.qcut(final_df["Departure Delay"], q=num_bins, labels=False, duplicates='drop')

final_df['Classified Departure Delay (5 Equal)'] = bin_edges
final_df['Classified Departure Delay (5 Equal)'].value_counts()

In [None]:
print(final_df[final_df['Classified Departure Delay (5 Equal)'] == 0]['Departure Delay'].min())
print(final_df[final_df['Classified Departure Delay (5 Equal)'] == 0]['Departure Delay'].max())
print()
print(final_df[final_df['Classified Departure Delay (5 Equal)'] == 1]['Departure Delay'].min())
print(final_df[final_df['Classified Departure Delay (5 Equal)'] == 1]['Departure Delay'].max())
print()
print(final_df[final_df['Classified Departure Delay (5 Equal)'] == 2]['Departure Delay'].min())
print(final_df[final_df['Classified Departure Delay (5 Equal)'] == 2]['Departure Delay'].max())
print()
print(final_df[final_df['Classified Departure Delay (5 Equal)'] == 3]['Departure Delay'].min())
print(final_df[final_df['Classified Departure Delay (5 Equal)'] == 3]['Departure Delay'].max())
print()
print(final_df[final_df['Classified Departure Delay (5 Equal)'] == 4]['Departure Delay'].min())
print(final_df[final_df['Classified Departure Delay (5 Equal)'] == 4]['Departure Delay'].max())

In [None]:
final_df["Classified Departure Delay (5)"].value_counts()

In [None]:
# Correlation matrix of the engineered features.
# final_df still carries the datetime 'Date' column; DataFrame.corr() no longer
# drops non-numeric columns silently on pandas >= 2.0 and raises instead, so
# the numeric and boolean (dummy) columns are selected explicitly.
corr = final_df.select_dtypes(include=['number', 'bool']).corr()

fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    color_continuous_scale='tealrose',
    template="ggplot2"
)

# Request one tick per feature so every row/column of the heatmap is labelled.
fig.update_layout(
    title='Final EDA-ed Features Correlation Heatmap',
    yaxis_nticks=len(corr.columns),
    xaxis_nticks=len(corr.columns),
)

fig.show()

In [None]:
# Columns available for the final selection.
final_df.columns

In [None]:
# Columns exported for modelling: the engineered features plus the two
# classified departure-delay targets (fixed thresholds and quantile bins).
final = final_df[[
    'Weather Code Intensity',
    'Origin Precipitation Hours (Equal)', 'Origin Daylight (Equal)',
    'Morning Dep Time', 'Night Dep Time',
    'Regular Dep Time', 'Is Holiday', 'Is Labour Day', 'Is Juneteenth',
    'Is Xmas', 'Origin Precipitation',
    'Distance', 'Airplane Age', 'Origin Total Operations',
    'Monthly Median Departure Delay', 
    'Classified Departure Delay (5)', 'Classified Departure Delay (5 Equal)'
]]

In [None]:
# Size and first rows of the export frame.
print(final.shape)
final.head()

In [None]:
# Missing values per exported column.
final.isnull().sum()

## Exporting Post-EDA Dataset

In [None]:
# Write the export frame to model_2018.csv without the row index.
final.to_csv("model_2018.csv", index=False)

# Visualisations

## Initial Columns

In [None]:
# Correlation matrix of the initially selected columns.
# features_df contains text columns (e.g. 'Date', 'Carrier Code', 'Holiday');
# DataFrame.corr() raises on those on pandas >= 2.0 instead of dropping them,
# so only numeric and boolean columns are passed in.
corr = features_df.select_dtypes(include=['number', 'bool']).corr()

fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    color_continuous_scale='tealrose',
    template="ggplot2"
)

# Request one tick per feature so every row/column of the heatmap is labelled.
fig.update_layout(
    title='Initial Correlation Heatmap',
    yaxis_nticks=len(corr.columns),
    xaxis_nticks=len(corr.columns),
)

fig.show()

In [None]:
# NOTE: `col` is a notebook-wide variable that earlier cells also assign;
# this rebinds it to the column plotted here.
col = "Departure Time"

# Histogram of the 'Departure Time' values in `df`.
fig = px.histogram(
    df, 
    x=col,
    labels={'count': 'Frequency'},
    color_discrete_sequence=['teal'],
    title='Distribution of Departure Time values',
    template="ggplot2",
)

fig.update_layout(
    autosize=True
)
fig.show()

## Distribution of Departure Delay

In [None]:
# Histogram of the 'Departure Delay' values in `df` (after the outlier filter
# applied earlier in the notebook).
fig = px.histogram(
    df, 
    x='Departure Delay',             
    color_discrete_sequence=['teal'],
    title='Distribution of Departure Delay values',
    template="ggplot2",
)
fig.update_layout(
    autosize=True,
    yaxis_title='Count',
)
fig.show()

In [None]:
# Plot a fixed-seed random sample rather than every flight, to keep the
# scatter plot responsive.
rows = 350000

# df.sample raises if asked for more rows than exist, so cap the request.
# reset_index(drop=True) replaces the reset_index() + drop('index') pair.
sampled_df = (
    df.sample(n=min(rows, len(df)), random_state=42)
    .reset_index(drop=True)
)
print(sampled_df.shape)
sampled_df.head()

# The argument list was missing the comma after color_continuous_scale, which
# made the whole cell a SyntaxError.
fig = px.scatter(
    sampled_df,
    x='Date',
    y='Departure Delay',
    opacity=0.01,  # very low opacity so dense regions show up as darker bands
    template='ggplot2',
    color_continuous_scale='tealrose',
    title='Distribution of Departure Delay values across the year'
)

fig.update_layout(
    autosize=True
)
fig.show()

## Airlines 

In [None]:
# Number of flights per carrier, sorted by carrier code.
# The previous version summed airline_type_df (whose columns are carrier code,
# month and delay, so the sums are not flight counts) and plotted against
# AIRLINE_COLS, a name that is never defined in this notebook (NameError).
airline_flight_counts = df['Carrier Code'].value_counts().sort_index()
num_airline_flights = airline_flight_counts.tolist()
num_airline_flights

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=airline_flight_counts.index.tolist(),
        y=num_airline_flights,
        name='Airlines',
    )
)

fig.update_layout(
    title='Number of Flights by Airline in 2018',
    xaxis_title='Category',
    yaxis_title='Count',
    template="ggplot2"
)

fig.show()

## Airplane

In [None]:
# NOTE: `col` is a notebook-wide variable; this rebinds it to the plotted column.
col = "Airplane Age"

# Histogram of the 'Airplane Age' values. The title previously read
# 'Distribution of Departure Delay values' (copied from another cell) and
# mislabelled the figure.
fig = px.histogram(
    features_df,
    x=col,
    color_discrete_sequence=['teal'],
    title='Distribution of Airplane Age values',
    template="ggplot2",
)
fig.update_layout(
    autosize=True,
    yaxis_title='Count',
)
fig.show()

## Binning Departure Delays

In [None]:
# Quantile-bin the departure delay of the full frame `df` for the plots below.
num_bins = 5
# With labels=False, pd.qcut returns each row's integer bin code (not the bin
# edges, despite the variable name); duplicates='drop' can yield fewer bins.
bin_edges = pd.qcut(df["Departure Delay"], q=num_bins, labels=False, duplicates='drop')

df['Classified Departure Delay (Equal)'] = bin_edges
# Rows per bin.
df['Classified Departure Delay (Equal)'].value_counts()

In [None]:
# Print the min and max departure delay of each quantile bin (codes 0..4),
# with a blank line between consecutive bins.
bin_col = 'Classified Departure Delay (Equal)'
for bin_id in range(5):
    if bin_id > 0:
        print()
    delays_in_bin = df.loc[df[bin_col] == bin_id, 'Departure Delay']
    print(delays_in_bin.min())
    print(delays_in_bin.max())

In [None]:
# Departure-delay histogram coloured by quantile-bin code, to show where the
# equal-count bin boundaries fall.
fig = px.histogram(
    df, 
    x='Departure Delay',
    color='Classified Departure Delay (Equal)',
    labels={'count': 'Frequency'},
    title='Distribution of Classified Departure Delay values using Equal Count Binning',
    template="ggplot2",
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.update_layout(
    autosize=True,
    yaxis_title='Count',
    showlegend=False
)
fig.show()

In [None]:
def classify_dep_delay(row):
    """Bucket a departure delay into five fixed-threshold classes.

    Returns 0 for values < -60, 1 for [-60, -15], 2 for (-15, 15],
    3 for (15, 60], and 4 otherwise (same thresholds as classify_dep_delay5).
    Presumably the delay is expressed in minutes; confirm against the dataset.
    """
    if row < -60:
        return 0
    if row <= -15:
        return 1
    if row <= 15:
        return 2
    return 3 if row <= 60 else 4
    
# Fixed-threshold class of every flight's departure delay.
departure_delays = df["Departure Delay"]
df["Classified Departure Delay"] = departure_delays.map(classify_dep_delay)

In [None]:
# Departure-delay histogram coloured by fixed-threshold class, for comparison
# with the equal-count binning plotted above.
fig = px.histogram(
    df, 
    x='Departure Delay',
    color='Classified Departure Delay',
    labels={'count': 'Frequency'},
    title='Distribution of Classified Departure Delay values using Predefined Bins',
    template="ggplot2",
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.update_layout(
    autosize=True,
    yaxis_title='Count',
    showlegend=False
)
fig.show()