In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

In [None]:
# Load the source CSV. The path is relative, so it resolves against the
# kernel's current working directory.
DATA_PATH = "../scripts/eda_2018.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Sanity check of the load: print (rows, columns) explicitly, because only
# the last expression of a cell is rendered, then show the first five rows.
print(df.shape)
df.head(n=5)

In [None]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

In [None]:
# Weather fields that exist once per airport. Each one appears in the data
# prefixed with "Origin " and again prefixed with "Destination ".
DAILY_WEATHER_FIELDS = [
    'Weather Code',
    'Temperature Max',
    'Temperature Min',
    'Temperature Mean',
    'Apparent Temperature Max',
    'Apparent Temperature Min',
    'Apparent Temperature Mean',
    'Sunrise',
    'Sunset',
    'Shortwave Radiation',
    'Precipitation',
    'Rain',
    'Snowfall',
    'Precipitation Hours',
    'Windspeed',
    'Windgusts',
    'Wind Direction',
    'Evapotranspiration',
]

# Column order: flight descriptors, all origin weather fields, all
# destination weather fields, then the departure delay.
FEATURES = (
    ['Departure Time', 'Origin', 'Destination', 'Month']
    + [
        f'{airport} {field}'
        for airport in ('Origin', 'Destination')
        for field in DAILY_WEATHER_FIELDS
    ]
    + ['Departure Delay']
)

# Restrict the raw frame to the columns listed above.
weather_df = df[FEATURES]

In [None]:
# Work on a random subset of rows to keep the cells below responsive.
n = 300000

# Fixed seed so that Restart & Run All always draws the same rows; without it
# every figure and statistic below changes from run to run.
SAMPLE_SEED = 42

# Sampling without replacement raises ValueError when more rows are requested
# than exist, so cap the request at the size of the frame.
sampled_weather_df = weather_df.sample(
    n=min(n, len(weather_df)),
    random_state=SAMPLE_SEED,
)
sampled_weather_df.shape

In [None]:
# Pairwise correlation across the sampled feature frame.
#
# Only numeric/boolean columns are kept before calling .corr(): since
# pandas 2.0 DataFrame.corr() no longer silently drops non-numeric columns
# and raises instead. NOTE(review): 'Origin', 'Destination' and the
# sunrise/sunset columns look non-numeric -- confirm against df.dtypes.
numeric_weather_df = sampled_weather_df.select_dtypes(include=['number', 'bool'])
corr = numeric_weather_df.corr()

# Render the square correlation matrix with the column names on both axes.
fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    color_continuous_scale='tealrose',
    template="plotly_dark"
)

fig.update_layout(title='Correlation Heatmap')

fig.show()

In [None]:
# Columns summarised per month by their median.
MEDIAN_WEATHER_COLS = [
    'Origin Temperature Max',
    'Origin Temperature Min',
    'Origin Temperature Mean',
    'Origin Apparent Temperature Max',
    'Origin Apparent Temperature Min',
    'Origin Apparent Temperature Mean',
    'Origin Shortwave Radiation',
    'Origin Precipitation',
    'Origin Rain',
    'Origin Snowfall',
    'Origin Windspeed',
    'Origin Windgusts',
    'Origin Wind Direction',
    'Origin Evapotranspiration',
    'Departure Delay',
]

# Columns summarised per month by their most frequent value.
MODE_WEATHER_COLS = ['Origin Weather Code', 'Origin Precipitation Hours']


def _most_frequent(values):
    """Return the most common value of a Series.

    value_counts() orders by descending frequency and skips NaN, so the first
    index entry is the mode. Raises IndexError when the Series has no
    non-null values.
    """
    return values.value_counts().index[0]


# One groupby object feeds both aggregations; reset_index() turns 'Month'
# back into an ordinary column in each result.
_rows_by_month = sampled_weather_df.groupby('Month')
monthly_median_df = _rows_by_month[MEDIAN_WEATHER_COLS].median().reset_index()
monthly_mode_df = _rows_by_month[MODE_WEATHER_COLS].agg(_most_frequent).reset_index()

In [None]:
# Add one "range" column per temperature family: the max column minus the
# min column of monthly_median_df, row by row.
for _range_col, _max_col, _min_col in [
    (
        'Origin Monthly Median Temperature Range',
        'Origin Temperature Max',
        'Origin Temperature Min',
    ),
    (
        'Origin Monthly Median Apparent Temperature Range',
        'Origin Apparent Temperature Max',
        'Origin Apparent Temperature Min',
    ),
]:
    monthly_median_df[_range_col] = monthly_median_df[_max_col].sub(monthly_median_df[_min_col])

In [None]:
# Mapping from the raw per-month aggregate column names to descriptive
# "Monthly Median" names.
RENAMED_MONTHLY_WEATHER_COLS = {
    'Origin Shortwave Radiation':'Origin Monthly Median Shortwave Radiation', 
    'Origin Precipitation': 'Origin Monthly Median Precipitation', 
    'Origin Rain': 'Origin Monthly Median Rain',
    'Origin Snowfall': 'Origin Monthly Median Snowfall', 
    'Origin Windspeed': 'Origin Monthly Median Windspeed', 
    'Origin Windgusts': 'Origin Monthly Median Windgusts', 
    'Origin Wind Direction': 'Origin Monthly Median Wind Direction',
    'Origin Evapotranspiration': 'Origin Monthly Median Evapotranspiration', 
    'Departure Delay': 'Monthly Median Departure Delay'
}

# Columns carried into the monthly feature table. The raw temperature
# max/min/mean columns are left out; only the derived ranges are kept.
SELECTED_MONTHLY_WEATHER_COLS = [
    'Month',
    'Origin Monthly Median Temperature Range',
    'Origin Monthly Median Apparent Temperature Range',
    'Origin Monthly Median Shortwave Radiation',
    'Origin Monthly Median Precipitation',
    'Origin Monthly Median Rain',
    'Origin Monthly Median Snowfall',
    'Origin Monthly Median Windspeed',
    'Origin Monthly Median Windgusts',
    'Origin Monthly Median Wind Direction',
    'Origin Monthly Median Evapotranspiration',
    'Monthly Median Departure Delay'
]

# Rebind instead of renaming in place; the name still ends up pointing at the
# renamed frame, exactly as before.
monthly_median_df = monthly_median_df.rename(columns=RENAMED_MONTHLY_WEATHER_COLS)

# Take an explicit copy: adding columns to a bare column-subset of another
# frame triggers pandas' SettingWithCopyWarning.
monthly_df = monthly_median_df[SELECTED_MONTHLY_WEATHER_COLS].copy()

# These assignments align on the row index.
# NOTE(review): assumes monthly_mode_df has the same index/row order as
# monthly_median_df (one row per Month) -- confirm where both are built.
monthly_df['Origin Monthly Mode Weather Code'] = monthly_mode_df['Origin Weather Code']
monthly_df['Origin Monthly Mode Precipitation Hours'] = monthly_mode_df['Origin Precipitation Hours']

In [None]:
# Attach the per-month aggregates to every sampled flight.
# The right-hand side is keyed by 'Month' (as its index) and joined against
# the 'Month' column of the flights; how='left' keeps every sampled row.
_monthly_by_month = monthly_df.set_index('Month')
new_weather_df = (
    sampled_weather_df[['Departure Delay', 'Month']]
    .merge(_monthly_by_month, on='Month', how='left')
)
new_weather_df.head()

In [None]:
# Quantile-bin the origin precipitation hours.
_precip_source_col = 'Origin Precipitation Hours'
_precip_binned_col = f'{_precip_source_col} (Equal)'

num_bins = 5

# NOTE: despite its name, bin_edges holds the integer bin code of every row
# (labels=False), not the bin boundaries. duplicates='drop' merges repeated
# quantile boundaries, so fewer than num_bins distinct codes may come back.
bin_edges = pd.qcut(
    sampled_weather_df[_precip_source_col],
    q=num_bins,
    labels=False,
    duplicates='drop',
)

sampled_weather_df[_precip_binned_col] = bin_edges

# Rows per bin.
sampled_weather_df[_precip_binned_col].value_counts()

In [None]:
# Departure delay against origin precipitation hours, coloured by the
# quantile bin assigned to each row. update_layout returns the figure it was
# called on, so the title is applied in the same expression.
fig = px.scatter(
    sampled_weather_df,
    x='Origin Precipitation Hours',
    y='Departure Delay',
    color='Origin Precipitation Hours (Equal)',
    color_continuous_scale='tealrose',
    template="plotly_dark",
).update_layout(title='Scatter Plot after binning')
fig.show()

In [None]:
# Daylight duration at the origin, in seconds: sunset minus sunrise.
# The timedelta is converted with the vectorized .dt accessor instead of a
# per-row Python lambda, which gives the same float seconds (NaN where either
# timestamp is missing) without a Python-level loop over the sample.
origin_sunset = pd.to_datetime(sampled_weather_df['Origin Sunset'])
origin_sunrise = pd.to_datetime(sampled_weather_df['Origin Sunrise'])
sampled_weather_df['Origin Daylight'] = (origin_sunset - origin_sunrise).dt.total_seconds()

num_bins = 4

# NOTE: despite its name, bin_edges holds the integer bin code of every row
# (labels=False), not the bin boundaries. duplicates='drop' merges repeated
# quantile boundaries, so fewer than num_bins distinct codes may come back.
bin_edges = pd.qcut(sampled_weather_df['Origin Daylight'], q=num_bins, labels=False, duplicates='drop')

sampled_weather_df['Origin Daylight (Equal)'] = bin_edges

# Rows per bin.
sampled_weather_df['Origin Daylight (Equal)'].value_counts()

In [None]:
# Departure delay against origin daylight duration, coloured by the quantile
# bin assigned to each row. update_layout returns the figure it was called
# on, so the title is applied in the same expression.
fig = px.scatter(
    sampled_weather_df,
    x='Origin Daylight',
    y='Departure Delay',
    color='Origin Daylight (Equal)',
    color_continuous_scale='tealrose',
    template="plotly_dark",
).update_layout(title='Scatter Plot after binning')
fig.show()

In [None]:
# Departure delay against the raw origin weather code. The code is plotted
# as-is (no binning step feeds this figure), so the title names the two
# plotted variables instead of repeating the "after binning" title used by
# the binned scatter plots.
fig = px.scatter(
    sampled_weather_df,
    x='Origin Weather Code', 
    y='Departure Delay',
    color='Origin Weather Code', 
    color_continuous_scale='tealrose', 
    template="plotly_dark"
)
fig.update_layout(title='Departure Delay vs Origin Weather Code')
fig.show()

In [None]:
# Origin weather measurements (plus the delay itself) to correlate.
SELECTED_WEATHER_COLS = [
    'Origin Shortwave Radiation',
    'Origin Precipitation',
    'Origin Rain',
    'Origin Snowfall',
    'Origin Windspeed',
    'Origin Windgusts',
    'Origin Wind Direction',
    'Origin Evapotranspiration',
    'Departure Delay',
]

selected_df = sampled_weather_df[SELECTED_WEATHER_COLS]
corr = selected_df.corr()

# Square heatmap with the column names on both axes; update_layout returns
# the same figure, so the title is set in the same expression.
fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    color_continuous_scale='tealrose',
    template="plotly_dark",
).update_layout(title='Correlation Heatmap')

fig.show()

In [None]:
# Correlation between the per-flight departure delay and the monthly
# aggregate columns held in new_weather_df.
corr = new_weather_df.corr()

# Square heatmap with the column names on both axes; update_layout returns
# the same figure, so the title is set in the same expression.
fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    color_continuous_scale='tealrose',
    template="plotly_dark",
).update_layout(title='Correlation Heatmap')

fig.show()

In [None]:
# Final candidate features: two raw origin columns, the two quantile-binned
# columns, and the departure delay.
FINAL_COLS = [
    'Origin Precipitation',
    'Origin Weather Code',
    'Origin Daylight (Equal)',
    'Origin Precipitation Hours (Equal)',
    'Departure Delay',
]
final_df = sampled_weather_df[FINAL_COLS]

corr = final_df.corr()

# Square heatmap with the column names on both axes; update_layout returns
# the same figure, so the title is set in the same expression.
fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    color_continuous_scale='tealrose',
    template="plotly_dark",
).update_layout(title='Correlation Heatmap')

fig.show()