In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, date, timedelta
import time
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

In [None]:
# Load the dataset from the CSV in the working directory (no date parsing is
# requested, so date-like columns arrive as plain text).
df = pd.read_csv("eda_2018.csv")

In [None]:
# Frame dimensions first, then the number of missing values in each column.
print(df.shape)
df.isna().sum()

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Strip the security-delay component out of the departure delay.
# Note: this overwrites the column, so re-running the cell subtracts it again.
df['Departure Delay'] = df['Departure Delay'].sub(df['Security Delay'])

In [None]:
# Keep only rows whose departure delay is less than `threshold` standard
# deviations away from the column mean.
threshold = 3
col = 'Departure Delay'

z_scores = df[col].sub(df[col].mean()).div(df[col].std()).abs()
df = df.loc[z_scores < threshold]
df.shape

In [None]:
# Max, mean, median and min of the filtered column, one value per line.
print(
    df[col].max(),
    df[col].mean(),
    df[col].median(),
    df[col].min(),
    sep="\n",
)

In [None]:
# Columns carried through the rest of the weather analysis.
# 'Origin' is required: the smoothing section selects it from
# `sampled_weather_df`, groups by it and merges on it.
FEATURES = [
    'Date', 'Month', 'Origin',
    'Origin Weather Code', 'Origin Temperature Max',
    'Origin Temperature Min', 'Origin Temperature Mean',
    'Origin Apparent Temperature Max', 'Origin Apparent Temperature Min',
    'Origin Apparent Temperature Mean', 'Origin Sunrise', 'Origin Sunset',
    'Origin Shortwave Radiation', 'Origin Precipitation', 'Origin Rain',
    'Origin Snowfall', 'Origin Precipitation Hours', 'Origin Windspeed',
    'Origin Windgusts', 'Origin Wind Direction',
    'Origin Evapotranspiration',
    'Departure Delay',
]

# Optional down-sampling for faster iteration (disabled; the full frame is used):
# n = 300000
# weather_df = df[FEATURES]
# sampled_weather_df = weather_df.sample(n)
# sampled_weather_df.shape

# Explicit copy: later cells add derived columns to this frame, which on a
# plain slice of `df` would raise SettingWithCopyWarning.
sampled_weather_df = df[FEATURES].copy()

## Initial Correlation Matrix

In [None]:
# Correlation heatmap of the raw weather features.
# The frame also carries columns that were read from the CSV without date
# parsing ('Date', 'Origin Sunrise', 'Origin Sunset'); restrict to numeric
# columns explicitly instead of relying on `corr()` silently dropping them
# (pandas >= 2.0 raises on non-numeric input).
corr = sampled_weather_df.select_dtypes(include=['number', 'bool']).corr()

fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    color_continuous_scale='tealrose',
    title='Weather features Correlation Heatmap',
    template="ggplot2",
)

# One tick per feature so every row/column of the matrix is labelled.
fig.update_layout(
    autosize=True,
    yaxis_nticks=len(corr.columns),
    xaxis_nticks=len(corr.columns),
)

fig.show()

## Forward Selection

In [None]:
# Response vector and candidate-predictor matrix used by the forward-selection
# helpers below (they read `X` and `y` as globals).
y = sampled_weather_df["Departure Delay"]
X = sampled_weather_df.loc[:, [
    'Origin Weather Code',
    'Origin Temperature Max',
    'Origin Temperature Min',
    'Origin Temperature Mean',
    'Origin Apparent Temperature Max',
    'Origin Apparent Temperature Min',
    'Origin Apparent Temperature Mean',
    'Origin Shortwave Radiation',
    'Origin Precipitation',
    'Origin Rain',
    'Origin Snowfall',
    'Origin Precipitation Hours',
    'Origin Windspeed',
    'Origin Windgusts',
    'Origin Wind Direction',
    'Origin Evapotranspiration',
]]

In [None]:
def processSubset(feature_set):
    """Fit an OLS regression of the global ``y`` on the chosen columns of ``X``.

    Parameters
    ----------
    feature_set : iterable of str
        Column names of the global ``X`` to use as regressors. The columns are
        passed to ``sm.OLS`` as-is; no constant column is added here.

    Returns
    -------
    dict
        ``{"model": fitted results, "RSS": residual sum of squares}`` where the
        RSS is evaluated on the same rows the model was fitted on.
    """
    design = X[list(feature_set)]
    fitted = sm.OLS(y, design).fit()
    residuals = fitted.predict(design) - y
    return {"model": fitted, "RSS": (residuals ** 2).sum()}

def forward(predictors):
    """Run one step of forward selection.

    Every column of the global ``X`` that is not yet in ``predictors`` is tried
    as an additional regressor (via ``processSubset``); the candidate model
    with the lowest residual sum of squares wins.

    Parameters
    ----------
    predictors : sequence of str
        Column names already selected in earlier steps (may be empty).

    Returns
    -------
    pandas.Series
        Row with entries ``"model"`` (fitted results) and ``"RSS"`` for the
        best candidate.

    Raises
    ------
    ValueError
        If every column of ``X`` is already in ``predictors``.
    """
    # Pull out predictors we still need to process
    remaining_predictors = [p for p in X.columns if p not in predictors]
    if not remaining_predictors:
        raise ValueError("forward(): no remaining predictors to add")

    tic = time.time()

    # Fit one candidate model per remaining predictor.
    results = [
        processSubset(list(predictors) + [p]) for p in remaining_predictors
    ]

    # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)

    # Choose the model with the LOWEST RSS. idxmin() returns an index label,
    # which is what label-based .loc expects (argmin() returns a position).
    best_model = models.loc[models['RSS'].idxmin()]

    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(predictors)+1, "predictors in", (toc-tic), "seconds.")

    # Return the best model, along with some other useful information about the model
    return best_model

In [None]:
# Greedy forward selection for model sizes 1 through 5. After each step the
# predictor list is replaced by the regressor names of the winning model.
models_fwd = pd.DataFrame(columns=["RSS", "model"])

tic = time.time()
predictors = []

for i in range(1, 6):
    models_fwd.loc[i] = forward(predictors)
    predictors = models_fwd.loc[i, "model"].model.exog_names

toc = time.time()
print("Total elapsed time:", (toc-tic), "seconds.")

In [None]:
# Regression summaries for the one- and two-predictor models.
for i in (1, 2):
    print(models_fwd.loc[i, "model"].summary())

In [None]:
# Regression summaries for the three- and four-predictor models.
for i in (3, 4):
    print(models_fwd.loc[i, "model"].summary())

In [None]:
# Regression summary for the five-predictor model (last forward-selection step).
print(models_fwd.loc[5, "model"].summary())

* Adjusted R-squared changes very little beyond 3 predictors.
* Hence, for departure delay, Origin Windgusts, Origin Precipitation Hours and Origin Shortwave Radiation should be retained.
* The remaining features are to be dropped.

## Smoothing

In [None]:
# Per-origin daily weather table: the two key columns ('Date', 'Origin')
# followed by the continuous weather measurements to be smoothed.
df_smooth_origin = sampled_weather_df.loc[:, [
    'Date',
    'Origin',
    'Origin Temperature Max',
    'Origin Temperature Min',
    'Origin Temperature Mean',
    'Origin Apparent Temperature Max',
    'Origin Apparent Temperature Min',
    'Origin Apparent Temperature Mean',
    'Origin Shortwave Radiation',
    'Origin Precipitation',
    'Origin Rain',
    'Origin Snowfall',
    'Origin Precipitation Hours',
    'Origin Windspeed',
    'Origin Windgusts',
    'Origin Wind Direction',
    'Origin Evapotranspiration',
]]

In [None]:
# Collapse identical rows (same values in every selected column) into one.
df_smooth_origin = df_smooth_origin.drop_duplicates()

In [None]:
# Smooth every weather column with a 2-observation rolling mean per origin.
# A rolling window only makes sense over chronologically ordered rows, so
# order by the parsed date first (stable sort keeps same-day rows in their
# existing relative order).
chronological = np.argsort(
    pd.to_datetime(df_smooth_origin['Date']).to_numpy(), kind='stable'
)
df_smooth_origin = df_smooth_origin.iloc[chronological].copy()

# Columns 0 and 1 are the keys ('Date', 'Origin'); everything after is smoothed.
weather_columns = list(df_smooth_origin.columns[2:])
df_smooth_origin[weather_columns] = (
    df_smooth_origin
    .groupby('Origin')[weather_columns]
    .transform(lambda x: x.rolling(window=2).mean())
)
# Note: the first row of each origin becomes NaN (window not yet full), and
# re-running this cell smooths the already smoothed values again.

In [None]:
# Attach the smoothed per-origin weather to every flight and correlate each
# smoothed feature with the departure delay.
df_merged = sampled_weather_df[["Date", "Origin", "Departure Delay"]]
df_merged = pd.merge(
    df_merged,
    df_smooth_origin,
    how="left",
    on=["Date", "Origin"]
)
# Rows without a smoothed value (e.g. the first observation per origin) are dropped.
df_merged = df_merged.dropna()

# Drop only the two key columns. The previous positional slice `iloc[:, 3:]`
# also removed 'Departure Delay' (column position 2), so the lookup below
# raised KeyError.
df_merged.drop(columns=["Date", "Origin"]).corr()['Departure Delay']

## Monthly Macro Trends

In [None]:
# Columns summarised by their monthly median.
MEDIAN_WEATHER_COLS = [
    'Origin Temperature Max',
    'Origin Temperature Min',
    'Origin Temperature Mean',
    'Origin Apparent Temperature Max',
    'Origin Apparent Temperature Min',
    'Origin Apparent Temperature Mean',
    'Origin Shortwave Radiation',
    'Origin Precipitation',
    'Origin Rain',
    'Origin Snowfall',
    'Origin Windspeed',
    'Origin Windgusts',
    'Origin Wind Direction',
    'Origin Evapotranspiration',
    'Departure Delay',
]

# Columns summarised by their most frequent monthly value.
MODE_WEATHER_COLS = ['Origin Weather Code', 'Origin Precipitation Hours']

monthly_median_df = (
    sampled_weather_df
    .groupby('Month')[MEDIAN_WEATHER_COLS]
    .median()
    .reset_index()
)
# value_counts() sorts by frequency, so its first index entry is the mode.
monthly_mode_df = (
    sampled_weather_df
    .groupby('Month')[MODE_WEATHER_COLS]
    .agg(lambda x: x.value_counts().index[0])
    .reset_index()
)


In [None]:
# Monthly spread between the median daily maximum and median daily minimum,
# for both measured and apparent temperature.
monthly_median_df['Origin Monthly Median Temperature Range'] = (
    monthly_median_df['Origin Temperature Max']
    .sub(monthly_median_df['Origin Temperature Min'])
)
monthly_median_df['Origin Monthly Median Apparent Temperature Range'] = (
    monthly_median_df['Origin Apparent Temperature Max']
    .sub(monthly_median_df['Origin Apparent Temperature Min'])
)


In [None]:
# Mapping from raw column names to their "monthly median" names.
RENAMED_MONTHLY_WEATHER_COLS = {
    'Origin Shortwave Radiation':'Origin Monthly Median Shortwave Radiation', 
    'Origin Precipitation': 'Origin Monthly Median Precipitation', 
    'Origin Rain': 'Origin Monthly Median Rain',
    'Origin Snowfall': 'Origin Monthly Median Snowfall', 
    'Origin Windspeed': 'Origin Monthly Median Windspeed', 
    'Origin Windgusts': 'Origin Monthly Median Windgusts', 
    'Origin Wind Direction': 'Origin Monthly Median Wind Direction',
    'Origin Evapotranspiration': 'Origin Monthly Median Evapotranspiration', 
    'Departure Delay': 'Monthly Median Departure Delay'
}

# Columns kept in the monthly table (the raw temperature max/min/mean columns
# are represented by the derived range columns instead).
SELECTED_MONTHLY_WEATHER_COLS = [
    'Month',
    'Origin Monthly Median Temperature Range',
    'Origin Monthly Median Apparent Temperature Range',
    'Origin Monthly Median Shortwave Radiation',
    'Origin Monthly Median Precipitation',
    'Origin Monthly Median Rain',
    'Origin Monthly Median Snowfall',
    'Origin Monthly Median Windspeed',
    'Origin Monthly Median Windgusts',
    'Origin Monthly Median Wind Direction',
    'Origin Monthly Median Evapotranspiration',
    'Monthly Median Departure Delay'
]

# Build the monthly table as a NEW frame. Previously the column selection was
# immediately overwritten by `monthly_df = monthly_median_df`, which discarded
# the selection and made `monthly_df` an alias, so the assignments below mutated
# `monthly_median_df` too. This now mirrors the weekly section, which applies
# its column selection.
monthly_df = (
    monthly_median_df
    .rename(columns=RENAMED_MONTHLY_WEATHER_COLS)
    .loc[:, SELECTED_MONTHLY_WEATHER_COLS]
    .copy()
)
# Both frames come from groupby('Month').reset_index(), so rows align by position/index.
monthly_df['Origin Monthly Mode Weather Code'] = monthly_mode_df['Origin Weather Code']
monthly_df['Origin Monthly Mode Precipitation Hours'] = monthly_mode_df['Origin Precipitation Hours']

In [None]:
# Broadcast the monthly aggregates back onto every flight via its 'Month'.
tmp = sampled_weather_df.loc[:, ['Departure Delay', 'Month']]
new_monthly_df = tmp.merge(
    monthly_df.set_index('Month'),
    how='left',
    on='Month',
)
print(new_monthly_df.shape)
new_monthly_df.head()

In [None]:
# Annotated correlation heatmap over every column of the monthly-enriched frame,
# with one axis tick per column.
corr = new_monthly_df.corr()

fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    title='Monthly Macro Weather Features Correlation Matrix Heatmap',
    text_auto=True,
    template="ggplot2",
    color_continuous_scale='tealrose',
).update_layout(
    autosize=True,
    xaxis_nticks=corr.shape[1],
    yaxis_nticks=corr.shape[1],
)
fig.show()

In [None]:
# Same heatmap, narrowed to a handful of monthly features plus both delay columns.
SELECTED = [
    'Origin Monthly Median Precipitation',
    'Origin Monthly Median Windspeed',
    'Origin Monthly Median Windgusts',
    'Departure Delay',
    'Monthly Median Departure Delay',
]
selected = new_monthly_df[SELECTED]

corr = selected.corr()

fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    title='Monthly Macro Weather Features Correlation Matrix Heatmap',
    text_auto=True,
    template="ggplot2",
    color_continuous_scale='tealrose',
).update_layout(
    autosize=True,
    xaxis_nticks=corr.shape[1],
    yaxis_nticks=corr.shape[1],
)
fig.show()

In [None]:
# Monthly features to standardise (zero mean, unit variance per column).
SELECTED_MONTHLY_WEATHER_COLS = [
    'Origin Monthly Median Temperature Range',
    'Origin Monthly Median Apparent Temperature Range',
    'Origin Monthly Median Shortwave Radiation',
    'Origin Monthly Median Precipitation',
    'Origin Monthly Median Rain',
    'Origin Monthly Median Snowfall',
    'Origin Monthly Median Windspeed',
    'Origin Monthly Median Windgusts',
    'Origin Monthly Median Wind Direction',
    'Origin Monthly Median Evapotranspiration',
    'Monthly Median Departure Delay'
]

monthly_features = new_monthly_df[SELECTED_MONTHLY_WEATHER_COLS]
scaler = StandardScaler()
scaled_sampled_np = scaler.fit_transform(monthly_features)
# Wrap the SCALED array (previously the unscaled `monthly_features` frame was
# wrapped, so nothing downstream was actually standardised). Reusing the
# original index keeps the 'Departure Delay' assignment below row-aligned.
scaled_sampled_df = pd.DataFrame(
    scaled_sampled_np,
    columns=SELECTED_MONTHLY_WEATHER_COLS,
    index=monthly_features.index,
)
# The target is attached unscaled.
scaled_sampled_df['Departure Delay'] = new_monthly_df['Departure Delay']

print(scaled_sampled_df.shape)
scaled_sampled_df.head()

In [None]:
# Annotated correlation heatmap of the standardised monthly features and the delay.
corr = scaled_sampled_df.corr()

fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    title='Monthly Macro Standardised Weather Features Correlation Matrix Heatmap',
    text_auto=True,
    template="ggplot2",
    color_continuous_scale='tealrose',
).update_layout(
    autosize=True,
    xaxis_nticks=corr.shape[1],
    yaxis_nticks=corr.shape[1],
)
fig.show()

## Binning Continuous Values using Equal Frequency Binning

In [None]:
# Equal-frequency (quantile) binning of precipitation hours.
# With labels=False, qcut returns integer bin codes; duplicates='drop' merges
# quantile edges that coincide, so fewer than `num_bins` bins may result.
num_bins = 5

bin_edges = pd.qcut(
    sampled_weather_df['Origin Precipitation Hours'],
    q=num_bins,
    duplicates='drop',
    labels=False,
)

sampled_weather_df['Origin Precipitation Hours (Equal)'] = bin_edges
sampled_weather_df['Origin Precipitation Hours (Equal)'].value_counts()

In [None]:
# Delay against precipitation hours, coloured by quantile bin. The very low
# opacity lets dense regions show through the overplotting.
fig = px.scatter(
    sampled_weather_df,
    x='Origin Precipitation Hours',
    y='Departure Delay',
    color='Origin Precipitation Hours (Equal)',
    opacity=0.01,
    title='Scatter Plot after binning',
    template="ggplot2",
    color_continuous_scale='tealrose',
).update_layout(autosize=True)
fig.show()

In [None]:
# Daylight duration in seconds = sunset - sunrise.
# Computed with the vectorised `.dt.total_seconds()` accessor instead of a
# row-wise `apply`, and without first storing a timedelta column that is then
# overwritten.
daylight = (
    pd.to_datetime(sampled_weather_df['Origin Sunset'])
    - pd.to_datetime(sampled_weather_df['Origin Sunrise'])
)
sampled_weather_df['Origin Daylight'] = daylight.dt.total_seconds()

# Equal-frequency binning of daylight duration. labels=False yields integer
# bin codes; duplicates='drop' merges coinciding quantile edges.
num_bins = 4

bin_edges = pd.qcut(sampled_weather_df['Origin Daylight'], q=num_bins, labels=False, duplicates='drop')

sampled_weather_df['Origin Daylight (Equal)'] = bin_edges
sampled_weather_df['Origin Daylight (Equal)'].value_counts()


In [None]:
# Delay against daylight duration, coloured by daylight quantile bin.
fig = px.scatter(
    sampled_weather_df,
    x='Origin Daylight',
    y='Departure Delay',
    color='Origin Daylight (Equal)',
    template="ggplot2",
    color_continuous_scale='tealrose',
).update_layout(title='Scatter Plot after binning')
fig.show()

In [None]:
# Delay against the weather code, which is used directly as both x and colour.
fig = px.scatter(
    sampled_weather_df,
    x='Origin Weather Code',
    y='Departure Delay',
    color='Origin Weather Code',
    title='Scatter Plot after binning',
    template="ggplot2",
    color_continuous_scale='tealrose',
).update_layout(autosize=True)
fig.show()

In [None]:
# Correlation between the binned/categorical weather features and the delay.
BINNED_COLS = [
    'Origin Precipitation Hours (Equal)',
    'Origin Daylight (Equal)',
    'Origin Weather Code',
    'Departure Delay',
]

new_binned_df = sampled_weather_df[BINNED_COLS]
corr = new_binned_df.corr()

fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    title='Binned Weather Features Correlation Matrix Heatmap',
    text_auto=True,
    template="ggplot2",
    color_continuous_scale='tealrose',
).update_layout(
    autosize=True,
    xaxis_nticks=corr.shape[1],
    yaxis_nticks=corr.shape[1],
)
fig.show()

## Weekly Macro Trends

In [None]:
# Columns aggregated by weekly median; 'Date' is kept because it becomes the
# resampling index.
MEDIAN_WEATHER_COLS = [
    'Date',
    'Origin Temperature Max', 'Origin Temperature Min', 'Origin Temperature Mean',
    'Origin Apparent Temperature Max', 'Origin Apparent Temperature Min', 'Origin Apparent Temperature Mean', 
    'Origin Shortwave Radiation', 'Origin Precipitation', 'Origin Rain', 'Origin Snowfall',
    'Origin Windspeed', 'Origin Windgusts', 'Origin Wind Direction', 'Origin Evapotranspiration',
    'Departure Delay'
]

# Columns aggregated by weekly most-frequent value.
MODE_WEATHER_COLS = [
    'Date',
    'Origin Weather Code', 'Origin Precipitation Hours'
]

# Explicit copies: the next cell rewrites the 'Date' column of both frames,
# which on plain slices raises SettingWithCopyWarning.
median_weather_df = sampled_weather_df[MEDIAN_WEATHER_COLS].copy()
mode_weather_df = sampled_weather_df[MODE_WEATHER_COLS].copy()
# Show a preview rather than the whole frame.
mode_weather_df.head()

In [None]:
# Index both frames by parsed date so they can be resampled by calendar week.
# They are rebuilt from `sampled_weather_df` (instead of mutated in place) so
# the cell can be re-run without failing on the already-consumed 'Date' column.
median_weather_df = (
    sampled_weather_df[MEDIAN_WEATHER_COLS]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)
mode_weather_df = (
    sampled_weather_df[MODE_WEATHER_COLS]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

weekly_medians = median_weather_df.resample('W').median()
# Most frequent value per week (previously computed with .median(), which
# contradicted the "mode" naming and the monthly section). value_counts()
# sorts by frequency, so its first index entry is the mode; weeks with no
# non-null observations yield NaN.
weekly_modes = mode_weather_df.resample('W').agg(
    lambda x: x.value_counts().index[0] if x.notna().any() else np.nan
)

print(weekly_medians.shape)
weekly_medians.columns

In [None]:
# Size and first rows of the weekly table built from the MODE_WEATHER_COLS columns.
print(weekly_modes.shape)
weekly_modes.head()

In [None]:
# Mapping from raw column names to their "weekly median" names.
RENAMED_WEEKLY_WEATHER_COLS = {
    'Origin Shortwave Radiation': 'Origin Weekly Median Shortwave Radiation',
    'Origin Precipitation': 'Origin Weekly Median Precipitation',
    'Origin Rain': 'Origin Weekly Median Rain',
    'Origin Snowfall': 'Origin Weekly Median Snowfall',
    'Origin Windspeed': 'Origin Weekly Median Windspeed',
    'Origin Windgusts': 'Origin Weekly Median Windgusts',
    'Origin Wind Direction': 'Origin Weekly Median Wind Direction',
    'Origin Evapotranspiration': 'Origin Weekly Median Evapotranspiration',
    'Departure Delay': 'Weekly Median Departure Delay',
}

# Final column set (and order) of the weekly table.
SELECTED_COLS = [
    'Date',
    'Origin Weekly Median Shortwave Radiation',
    'Origin Weekly Median Precipitation',
    'Origin Weekly Median Rain',
    'Origin Weekly Median Snowfall',
    'Origin Weekly Median Windspeed',
    'Origin Weekly Median Windgusts',
    'Origin Weekly Median Wind Direction',
    'Origin Weekly Median Evapotranspiration',
    'Weekly Median Departure Delay',
    'Origin Weekly Mode Weather Code',
    'Origin Weekly Mode Precipitation Hours',
    'Origin Weekly Median Temperature Range',
    'Origin Weekly Median Apparent Temperature Range',
]

# Rename the medians, bolt on the two `weekly_modes` columns and the two derived
# temperature ranges (all aligned on the weekly date index), then turn the
# index back into a 'Date' column and keep only the selected columns.
weekly_df = weekly_medians.rename(columns=RENAMED_WEEKLY_WEATHER_COLS).assign(**{
    'Origin Weekly Mode Weather Code': weekly_modes['Origin Weather Code'],
    'Origin Weekly Mode Precipitation Hours': weekly_modes['Origin Precipitation Hours'],
    'Origin Weekly Median Temperature Range': (
        weekly_medians['Origin Temperature Max'] - weekly_medians['Origin Temperature Min']
    ),
    'Origin Weekly Median Apparent Temperature Range': (
        weekly_medians['Origin Apparent Temperature Max']
        - weekly_medians['Origin Apparent Temperature Min']
    ),
})
weekly_df = weekly_df.reset_index()[SELECTED_COLS]
weekly_df.head()

In [None]:
# Expand each weekly row into one row per calendar day of that week, i.e. from
# six days before the weekly 'Date' label through the label itself, so the
# weekly aggregates can be joined to flights by exact date.
#
# Vectorised: every row is repeated DAYS_PER_WEEK times and shifted by the
# offsets -6..0. This replaces growing the frame with pd.concat inside a
# nested loop (quadratic copying) and the manual `count` row counter.
DAYS_PER_WEEK = 7

day_offsets = pd.to_timedelta(
    np.tile(np.arange(1 - DAYS_PER_WEEK, 1), len(weekly_df)), unit='D'
)
daily_df = (
    weekly_df
    .iloc[np.repeat(np.arange(len(weekly_df)), DAYS_PER_WEEK)]
    .reset_index(drop=True)
)
# Positional addition: row k receives offset k (both have len(weekly_df) * 7 entries).
daily_df['Date'] = daily_df['Date'] + day_offsets
daily_df.head()

In [None]:
# Attach the weekly aggregates to every flight by calendar date.
# `assign` builds a new frame with the parsed dates; writing `tmp['Date'] = ...`
# on a slice of `sampled_weather_df` raised SettingWithCopyWarning.
tmp = sampled_weather_df[['Date', 'Departure Delay']].assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

merged_weekly_df = tmp.merge(daily_df, on='Date', how='left')
merged_weekly_df.head()

In [None]:
# Correlation heatmap over the weekly-enriched frame.
# `merged_weekly_df` carries a datetime 'Date' column; restrict to numeric
# columns explicitly rather than relying on `corr()` dropping it implicitly
# (recent pandas versions no longer do that by default).
corr = merged_weekly_df.select_dtypes(include=['number', 'bool']).corr()

fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    color_continuous_scale='tealrose',
    template="ggplot2",
    title="Weekly Correlation Heatmap"
)

# One tick per feature so every row/column of the matrix is labelled.
fig.update_layout(
    autosize=True,
    yaxis_nticks=len(corr.columns),
    xaxis_nticks=len(corr.columns),
)

fig.show()

In [None]:
# Annotated heatmap narrowed to a few weekly features plus both delay columns.
SELECTED = [
    'Origin Weekly Median Precipitation',
    'Origin Weekly Median Windspeed',
    'Origin Weekly Median Windgusts',
    'Departure Delay',
    'Weekly Median Departure Delay',
]
selected = merged_weekly_df[SELECTED]

corr = selected.corr()

fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    title='Weekly Macro Weather Features Correlation Matrix Heatmap',
    text_auto=True,
    template="ggplot2",
    color_continuous_scale='tealrose',
).update_layout(
    autosize=True,
    xaxis_nticks=corr.shape[1],
    yaxis_nticks=corr.shape[1],
)
fig.show()


In [None]:
# Heatmap (without cell annotations) for the shortlisted weekly features and
# the per-flight delay.
SELECTED_WEEKLY_COLS = [
    'Origin Weekly Median Temperature Range',
    'Origin Weekly Median Precipitation',
    'Origin Weekly Median Windspeed',
    'Origin Weekly Mode Weather Code',
    'Origin Weekly Mode Precipitation Hours',
    'Weekly Median Departure Delay',
    'Departure Delay',
]

selected_weekly_df = merged_weekly_df[SELECTED_WEEKLY_COLS]

corr = selected_weekly_df.corr()

fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    title="Weekly Correlation Heatmap",
    template="ggplot2",
    color_continuous_scale='tealrose',
).update_layout(
    autosize=True,
    xaxis_nticks=corr.shape[1],
    yaxis_nticks=corr.shape[1],
)
fig.show()

In [None]:
# Weekly features to standardise (zero mean, unit variance per column).
SELECTED_WEEKLY_WEATHER_COLS = [
    'Origin Weekly Median Temperature Range',
    'Origin Weekly Median Precipitation',
    'Origin Weekly Median Windspeed',
    'Origin Weekly Mode Weather Code',
    'Origin Weekly Mode Precipitation Hours',
    'Weekly Median Departure Delay',
]

weekly_features = merged_weekly_df[SELECTED_WEEKLY_WEATHER_COLS]
scaler = StandardScaler()
# Fit on the WEEKLY features (this previously scaled `monthly_features`, a
# leftover from the monthly section) ...
scaled_sampled_np = scaler.fit_transform(weekly_features)
# ... and wrap the SCALED array rather than the unscaled frame. Reusing the
# original index keeps the 'Departure Delay' assignment below row-aligned.
scaled_sampled_df = pd.DataFrame(
    scaled_sampled_np,
    columns=SELECTED_WEEKLY_WEATHER_COLS,
    index=weekly_features.index,
)
# The target is attached unscaled.
scaled_sampled_df['Departure Delay'] = merged_weekly_df['Departure Delay']

print(scaled_sampled_df.shape)
scaled_sampled_df.head()

In [None]:
# Correlation heatmap of the weekly frame built in the standardisation step
# (`scaled_sampled_df`). This cell previously re-plotted `selected_weekly_df`,
# duplicating an earlier figure exactly; it now mirrors the monthly section,
# which plots its standardised frame.
corr = scaled_sampled_df.corr()

fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    color_continuous_scale='tealrose',
    template="ggplot2",
    title="Weekly Macro Standardised Weather Features Correlation Matrix Heatmap"
)

# One tick per feature so every row/column of the matrix is labelled.
fig.update_layout(
    autosize=True,
    yaxis_nticks=len(corr.columns),
    xaxis_nticks=len(corr.columns),
)

fig.show()

In [None]:
# Frequency of each distinct weekly-median delay value across the flight rows.
selected_weekly_df['Weekly Median Departure Delay'].value_counts()