In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.feature_selection import VarianceThreshold

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

In [None]:
# !pip3 install --upgrade plotly

In [None]:
# df_length = len(df)

# df1 = df.iloc[:df_length//2]
# df2 = df.iloc[df_length//2:]

# df1.to_csv('eda_2018_part1.csv', index=False)
# df2.to_csv('eda_2018_part2.csv', index=False)

# Post-merge EDA

In [None]:
# Load the post-merge EDA dataset and preview its first rows.
eda_csv_path = "../scripts/eda_2018.csv"
df = pd.read_csv(eda_csv_path)
df.head()

In [None]:
# List the column names available in the loaded dataset.
df.columns

In [None]:
# Print (rows, columns), then show the number of missing values per column.
frame_shape = df.shape
print(frame_shape)
df.isna().sum()

In [None]:
# Discard every row that contains a missing value and renumber the index from 0.
df = df.dropna().reset_index(drop=True)
print(df.shape)
df.head()

In [None]:
# Z-score outlier filter: keep only rows whose value in `col` is strictly less
# than `threshold` standard deviations away from the column mean.
threshold = 3
col = 'Departure Delay'

col_mean, col_std = df[col].mean(), df[col].std()
z_scores = ((df[col] - col_mean) / col_std).abs()
df = df[z_scores < threshold]
df.shape

In [None]:
# Summary of the filtered column, printed in the order max, mean, median, min.
filtered_values = df[col]
for summarise in (
    filtered_values.max,
    filtered_values.mean,
    filtered_values.median,
    filtered_values.min,
):
    print(summarise())

## Creating Outcome Column

In [None]:
# Outcome column: departure delay with the 'NAS Delay' and 'Security Delay'
# components subtracted out.
# NOTE(review): this overwrites 'Departure Delay' in place, so executing the
# cell a second time subtracts both components again.
df['Departure Delay'] = (
    df['Departure Delay']
    .sub(df['NAS Delay'])
    .sub(df['Security Delay'])
)

In [None]:
# Single-column frame holding the adjusted departure delay, re-indexed from 0.
departure_delay_df = df['Departure Delay'].reset_index(drop=True).to_frame()
departure_delay_df.head()

In [None]:
# Print min, max, mean and standard deviation of the outcome frame, in that order.
for stat_name in ("min", "max", "mean", "std"):
    print(getattr(departure_delay_df, stat_name)())

## Feature Selection & Engineering

In [None]:
# Columns copied into `features_df` (flight/aircraft attributes, origin-airport
# weather readings, and the departure-delay outcome).
FEATURES_COLS = [
    # Flight / aircraft / airport
    'Distance',
    'Seats',
    'Airplane Age',
    'Origin Total Operations',
    # Origin weather
    'Origin Weather Code',
    'Origin Temperature Max',
    'Origin Temperature Min',
    'Origin Temperature Mean',
    'Origin Apparent Temperature Max',
    'Origin Apparent Temperature Min',
    'Origin Apparent Temperature Mean',
    'Origin Sunrise',
    'Origin Sunset',
    'Origin Shortwave Radiation',
    'Origin Precipitation',
    'Origin Rain',
    'Origin Snowfall',
    'Origin Precipitation Hours',
    'Origin Windspeed',
    'Origin Windgusts',
    'Origin Wind Direction',
    'Origin Evapotranspiration',
    # Outcome
    'Departure Delay',
]

# Columns that are standardised with StandardScaler later in the notebook.
# NOTE(review): 'Monthly Median Flight Delay' is not created by any visible
# cell — presumably it is already present in the input CSV; confirm.
SCALED_FEATURES_COLS = [
    'Monthly Median Flight Delay',
    'Monthly Median Departure Delay',
    'Seats',
    'Airplane Age',
    'Distance',
    'Origin Total Operations',
    'Destination Total Operations',
    'Origin Precipitation',
    'Origin Rain',
    'Origin Snowfall',
    'Origin Windspeed',
    'Origin Windgusts',
    'Origin Evapotranspiration',
    'Origin Shortwave Radiation',
]

# Scaled, non-weather columns carried over into `final_sampled_df`.
# 'Destination Total Operations' is included because FINAL_COLS selects it from
# `final_sampled_df`; without it here that selection raises a KeyError.
# NOTE(review): assumes OUTCOMES does not already contribute this column — confirm.
NON_WEATHER_FEATURES = [
    'Seats', 'Airplane Age', 'Distance',
    'Origin Total Operations', 'Destination Total Operations',
]

# Scaled origin-airport weather columns that are fed into the origin-weather PCA.
ORIGIN_WEATHER_FEATURES = [
    'Origin Precipitation',
    'Origin Rain',
    'Origin Snowfall',
    'Origin Windspeed',
    'Origin Windgusts',
    'Origin Evapotranspiration',
    'Origin Shortwave Radiation',
]

In [None]:
# Carrier codes as a Series, and the feature subset used for the initial plots.
airline_type_df = df.loc[:, 'Carrier Code']
features_df = df.loc[:, FEATURES_COLS]

## Monthly Median Departure Delay for each Carrier

In [None]:
# Carrier / month / delay frame used for the per-carrier monthly medians.
# Selected straight from `df` instead of concatenating onto the previous value
# of `airline_type_df`, so re-running this cell no longer appends duplicate
# 'Month' and 'Departure Delay' columns.
airline_type_df = df[['Carrier Code', 'Month', 'Departure Delay']].copy()
airline_type_df.head()

In [None]:
# Median departure delay for every (carrier, month) pair, returned as a flat
# frame with the value column named 'Monthly Median Departure Delay'.
monthly_median_carrier_dep_delay = (
    airline_type_df
    .groupby(['Carrier Code', 'Month'])['Departure Delay']
    .median()
    .rename('Monthly Median Departure Delay')
    .reset_index()
)
monthly_median_carrier_dep_delay.head()


In [None]:
# Attach each flight's carrier/month median delay via a left join on the two keys.
# NOTE(review): re-running this cell merges the column in a second time.
merge_keys = ['Carrier Code', 'Month']
carrier_month_medians = monthly_median_carrier_dep_delay.set_index(merge_keys)
df = df.merge(carrier_month_medians, on=merge_keys, how='left')
df.head()

## Weather EDA

## Binning Continuous-Valued Features

In [None]:
# Scatter of departure time against flight delay, coloured by cluster label.
# `.assign` builds a new frame, so the 'Cluster' column is never written onto a
# slice of `sampled_df` (the original assignment triggered SettingWithCopyWarning).
# NOTE(review): `labels` is not defined in any visible cell and `sampled_df` is
# only created further down the notebook — this cell fails on a fresh
# Restart & Run All until both are defined above it.
clustering_result_df = sampled_df[["Departure Time", "Flight Delay"]].assign(Cluster=labels)

fig = px.scatter(clustering_result_df, x='Flight Delay', y="Departure Time", color='Cluster')
fig.update_layout(title='Scatter Plot with Clustering')
fig.show()

In [None]:
# Human-readable label for each integer code produced by classify_dep_time;
# the labels also become the one-hot column names.
CLASSIFIED_DEP_TIME = dict(enumerate(
    ('Morning Dep Time', 'Regular Dep Time', 'Night Dep Time'),
    start=1,
))

def classify_dep_time(row):
    """Bucket a single departure-time value into one of three integer codes.

    Args:
        row: One scalar departure-time value (the function is applied
            element-wise to the 'Departure Time' column).
            NOTE(review): presumably HHMM-encoded (e.g. 600 == 06:00) — confirm.

    Returns:
        1 if ``row <= 600``, 2 if ``600 < row <= 2000``, otherwise 3.
    """
    # Walk the upper bounds in ascending order; the first one that holds wins.
    for code, upper_bound in ((1, 600), (2, 2000)):
        if row <= upper_bound:
            return code
    return 3
    
# One-hot encode the departure-time bucket of every sampled flight.
col = "Classified Departure Time"
dummy_cols = list(CLASSIFIED_DEP_TIME.values())

# Map code -> label on the Series itself. The previous
# `sampled_df[col].replace(..., inplace=True)` mutated a temporary column
# selection, which is not guaranteed to write back to `sampled_df` (it is a
# no-op under pandas Copy-on-Write), leaving the dummies named 1/2/3.
dep_time_labels = (
    sampled_df["Departure Time"]
    .apply(classify_dep_time)
    .map(CLASSIFIED_DEP_TIME)
    .rename(col)
)
encoded_dep_time_df = pd.get_dummies(dep_time_labels)

# Drop dummy columns left over from an earlier run so re-executing the cell
# does not create duplicate column names.
sampled_df = pd.concat(
    [sampled_df.drop(columns=dummy_cols, errors='ignore'), encoded_dep_time_df],
    axis=1,
)
sampled_df.head()

## Standardising Features

In [None]:
# Standardise the selected columns and wrap the resulting array back into a
# DataFrame that carries the same column names.
scaler = StandardScaler()
columns_to_scale = sampled_df[SCALED_FEATURES_COLS]
scaled_sampled_np = scaler.fit_transform(columns_to_scale)
scaled_sampled_df = pd.DataFrame(data=scaled_sampled_np, columns=SCALED_FEATURES_COLS)

print(scaled_sampled_df.shape)
scaled_sampled_df.head()

## PCA Analysis

In [None]:
# Project the scaled origin-weather columns onto two principal components.
origin_weather_df = scaled_sampled_df[ORIGIN_WEATHER_FEATURES]

pca = PCA(n_components=2)
transformed_data = pca.fit_transform(origin_weather_df)
print(f"Variance explained by each component: {pca.explained_variance_ratio_}")

origin_component_names = [f"PCA Origin Weather {i}" for i in (1, 2)]
pca_origin_weather_df = pd.DataFrame(transformed_data, columns=origin_component_names)
pca_origin_weather_df.head()

In [None]:
# Project the scaled destination-weather columns onto two principal components.
# NOTE(review): DEST_WEATHER_FEATURES is not defined in any visible cell, and
# SCALED_FEATURES_COLS lists no destination weather columns — confirm where
# these come from before relying on this cell.
dest_weather_df = scaled_sampled_df[DEST_WEATHER_FEATURES]

pca = PCA(n_components=2)
transformed_data = pca.fit_transform(dest_weather_df)
print(f"Variance explained by each component: {pca.explained_variance_ratio_}")

dest_component_names = [f"PCA Dest Weather {i}" for i in (1, 2)]
pca_dest_weather_df = pd.DataFrame(transformed_data, columns=dest_component_names)
pca_dest_weather_df.head()

In [None]:
# Assemble the modelling frame column-wise from its component frames.
# NOTE(review): OUTCOMES is not defined in any visible cell — confirm its source.
final_parts = [
    scaled_sampled_df[NON_WEATHER_FEATURES],
    pca_origin_weather_df,
    pca_dest_weather_df,
    sampled_df[list(CLASSIFIED_DEP_TIME.values())],
    scaled_sampled_df[['Monthly Median Flight Delay', 'Monthly Median Departure Delay']],
    sampled_df[OUTCOMES],
]
final_sampled_df = pd.concat(final_parts, axis=1)

print(final_sampled_df.shape)
final_sampled_df.head()

In [None]:
# Second z-score outlier pass, this time on the assembled modelling frame:
# keep rows strictly within `threshold` standard deviations of the mean of `col`.
threshold = 3
col = 'Departure Delay'

outcome_values = final_sampled_df[col]
z_scores = ((outcome_values - outcome_values.mean()) / outcome_values.std()).abs()
final_sampled_df = final_sampled_df[z_scores < threshold]
print(final_sampled_df.shape)
final_sampled_df.head()

In [None]:
# Max, min, mean and standard deviation of the filtered outcome, in that order.
final_delay = final_sampled_df["Departure Delay"]
for stat_name in ("max", "min", "mean", "std"):
    print(getattr(final_delay, stat_name)())

In [None]:
# Print (rows, columns) of the modelling frame, then count missing values per column.
final_shape = final_sampled_df.shape
print(final_shape)
final_sampled_df.isna().sum()

## Binning Departure Delay (moved to modelling)

In [None]:
def classify_dep_delay(row):
    """Bucket a single departure-delay value into one of three integer classes.

    Args:
        row: One scalar delay value (the function is applied element-wise to
            the 'Departure Delay' column).

    Returns:
        0 if ``row < -15``, 1 if ``-15 <= row <= 15``, otherwise 2.
    """
    if row < -15:
        return 0
    return 1 if row <= 15 else 2
    
# Label every flight with its fixed-threshold delay class.
departure_delays = final_sampled_df["Departure Delay"]
final_sampled_df["Classified Departure Delay"] = departure_delays.map(classify_dep_delay)


In [None]:
# Quantile (equal-frequency) binning of the outcome. With labels=False, qcut
# returns the integer bin index of every row (not the edges, despite the name).
# NOTE(review): num_bins is not defined in any visible cell — confirm its value.
bin_edges = pd.qcut(
    final_sampled_df["Departure Delay"],
    q=num_bins,
    labels=False,
    duplicates='drop',
)

equal_bin_col = 'Classified Departure Delay (Equal)'
final_sampled_df[equal_bin_col] = bin_edges
final_sampled_df[equal_bin_col].value_counts()

In [None]:
# Number of rows in each fixed-threshold delay class.
final_sampled_df["Classified Departure Delay"].value_counts()

In [None]:
# For each equal-frequency bin 0..3, print the smallest and largest departure
# delay it contains, with a blank line between bins.
# (A fifth bin, index 4, was previously reported here and is left disabled.)
for bin_id in range(4):
    if bin_id > 0:
        print()
    in_bin = final_sampled_df['Classified Departure Delay (Equal)'] == bin_id
    delays_in_bin = final_sampled_df[in_bin]['Departure Delay']
    print(delays_in_bin.min())
    print(delays_in_bin.max())

In [None]:
# Columns kept in the exported modelling dataset, in output order.
FINAL_COLS = [
    'Airplane Age',
    'Distance',
    'Origin Total Operations',
    'Destination Total Operations',
    'PCA Origin Weather 1',
    'PCA Origin Weather 2',
    'PCA Dest Weather 1',
    'PCA Dest Weather 2',
    'Morning Dep Time',
    'Regular Dep Time',
    'Night Dep Time',
    'Monthly Median Departure Delay',
    'Departure Delay',
]

In [None]:
# Restrict the modelling frame to the export columns and display it.
final = final_sampled_df.loc[:, FINAL_COLS]
final

## Exporting Post-EDA Dataset

In [None]:
# Write the post-EDA dataset next to the notebook, without the index column.
final.to_csv(path_or_buf="model_2018.csv", index=False)

# Visualisations

## Initial Columns

In [None]:
# Pairwise correlation of the initial feature columns.
# numeric_only=True skips any non-numeric column instead of raising: since
# pandas 2.0, DataFrame.corr no longer drops such columns silently.
# NOTE(review): 'Origin Sunrise' / 'Origin Sunset' look like timestamp columns —
# confirm their dtype; if they are numeric this argument changes nothing.
corr = features_df.corr(numeric_only=True)

fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    color_continuous_scale='tealrose',
    template="ggplot2"
)

# One tick per column so that every feature name is shown on both axes.
n_features = len(corr.columns)
fig.update_layout(
    title='Initial Correlation Heatmap',
    yaxis_nticks=n_features,
    xaxis_nticks=n_features,
)

fig.show()

In [None]:
# Histogram of the raw departure-time values.
col = "Departure Time"

fig = px.histogram(
    data_frame=df,
    x=col,
    title='Distribution of Departure Time values',
    labels={'count': 'Frequency'},
    color_discrete_sequence=['teal'],
    template="ggplot2",
)
fig.update_layout(autosize=True)
fig.show()

## Distribution of Departure Delay

In [None]:
# Histogram of the (adjusted) departure-delay outcome.
fig = px.histogram(
    data_frame=df,
    x='Departure Delay',
    title='Distribution of Departure Delay values',
    labels={'count': 'Frequency'},
    color_discrete_sequence=['teal'],
    template="ggplot2",
)
fig.update_layout(autosize=True)
fig.show()

In [None]:
# Draw a reproducible random sample of flights and re-index it from 0.
# The sample size is capped at len(df) so the cell still runs when fewer than
# `rows` flights remain after cleaning.
# NOTE(review): earlier cells (clustering, binning, scaling) already read
# `sampled_df`; this definition has to run before them on a fresh kernel, and
# re-running it discards the one-hot columns added to `sampled_df` there.
rows = 350000
sampled_df = (
    df.sample(n=min(rows, len(df)), random_state=42)
    .reset_index(drop=True)
)
print(sampled_df.shape)

# Very low opacity so that dense regions of the year stand out.
fig = px.scatter(
    sampled_df,
    x='Date',
    y='Departure Delay',
    opacity=0.01,
    template='ggplot2',
    color_continuous_scale='tealrose',  # the missing comma here was a SyntaxError
    title='Distribution of Departure Delay values across the year'
)
fig.update_layout(
    autosize=True
)
fig.show()

## Airlines 

In [None]:
# Count flights per carrier directly from the 'Carrier Code' column.
# The previous version summed `airline_type_df` (carrier code, month and delay
# columns), which does not yield per-airline flight counts, and plotted against
# AIRLINE_COLS, which no visible cell defines.
# NOTE(review): counts reflect `df` after the cleaning/outlier filtering above.
flights_per_airline = df['Carrier Code'].value_counts()
num_airline_flights = flights_per_airline.tolist()

fig = go.Figure()
fig.add_trace(go.Bar(
    x=flights_per_airline.index.tolist(),
    y=num_airline_flights,
    name='Airlines',
))

fig.update_layout(
    title='Number of Flights by Airline in 2018',
    xaxis_title='Category',
    yaxis_title='Count',
    template="ggplot2"
)
fig.show()

## Airplane

In [None]:
# Histogram of airplane age.
col = "Airplane Age"

# The title is set once, here. The earlier copy-pasted
# 'Distribution of Departure Delay values' title described the wrong column and
# was immediately overridden by a follow-up update_layout call.
fig = px.histogram(
    features_df,
    x=col,
    labels={'count': 'Frequency'},
    color_discrete_sequence=['teal'],
    title=f"{col} Histogram",
    template="ggplot2",
)
fig.show()