In [58]:
## In this notebook we will create a lot more numerical continuous features that relate to arr_delay
## We will analyze this to see if we achieve a better result

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import scipy.stats as stats
import numpy as np

import plotly.express as px
import plotly.graph_objs as go
from sklearn.preprocessing import MinMaxScaler

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold


In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

In [3]:
# Imported this dataframe that was created in our EDA notebook
flights_df = pd.read_csv("/Users/ckz/Desktop/DATA_SCIENCE_BOOTCAMP/Midterm-project/flights_df.csv")

# print the first few rows of the DataFrame
flights_df.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,first_dep_time,total_add_gtime,longest_add_gtime,no_name,month,dep_hour,arr_hour,taxi_time,origin_state,flight_distance_category
0,2018-01-01,B6,B6,B6,880,B6,N794JB,880,12954,LGB,...,,,,,1,7,8,83.0,CA,SHORT
1,2018-01-01,B6,B6,B6,885,B6,N337JB,885,12478,JFK,...,,,,,1,7,9,120.0,NY,SHORT
2,2018-01-01,B6,B6,B6,886,B6,N337JB,886,14492,RDU,...,,,,,1,10,11,102.0,NC,SHORT
3,2018-01-01,B6,B6,B6,889,B6,N318JB,889,11278,DCA,...,,,,,1,18,19,162.0,DC,SHORT
4,2018-01-01,B6,B6,B6,891,B6,N348JB,891,10721,BOS,...,,,,,1,6,9,317.0,MA,MEDIUM


In [4]:
flights_df.loc[flights_df['arr_delay'] < 0, 'arr_delay'] = 0

In [5]:
flights_df['avg_monthly_arr_delay'] = flights_df.groupby('month')['arr_delay'].transform('mean')

In [6]:
# This feature creates the avg taxi times for the arrival hour 
flights_df['avg_taxi_times_arr'] = flights_df.groupby('arr_hour')['taxi_time'].transform('mean')

In [7]:
# Group the flights data by origin airport and count the number of flights for each airport
airport_counts = flights_df.groupby('origin')['flights'].transform('count')

# Determine the thresholds for each category based on the quartiles
thresholds = [0, airport_counts.quantile(0.25), airport_counts.quantile(0.5), airport_counts.quantile(0.75), np.inf]

# Define the categories
categories = ['Not Busy', 'Moderate', 'Busy', 'Very Busy']

# Assign the traffic category based on the number of flights from each airport
flights_df['traffic'] = pd.cut(airport_counts, bins=thresholds, labels=categories, right=False)

In [8]:
from pandas.tseries.holiday import USFederalHolidayCalendar as holiday_calendar
from datetime import datetime

In [9]:
flights_df['fl_date'] = flights_df['fl_date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))

In [10]:
# holiday_cal
cal = holiday_calendar()
dr = flights_df['fl_date'] 
holidays = cal.holidays(start=dr.min(), end=dr.max())
flights_df['holiday'] = flights_df['fl_date'].dt.date.astype('datetime64').isin(holidays).astype(int)
flights_df.tail(5)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,month,dep_hour,arr_hour,taxi_time,origin_state,flight_distance_category,avg_monthly_arr_delay,avg_taxi_times_arr,traffic,holiday
1559040,2019-07-31,DL,DL,DL,2899,DL,N932AT,2899,10397,ATL,...,7,15,17,114.0,GA,SHORT,17.060485,177.955554,Very Busy,0
1559041,2019-07-31,DL,DL,DL,2899,DL,N932AT,2899,11057,CLT,...,7,17,18,83.0,NC,SHORT,17.060485,170.038468,Very Busy,0
1559042,2019-07-31,DL,DL,DL,2900,DL,N301DQ,2900,10397,ATL,...,7,12,14,175.0,GA,MEDIUM,17.060485,178.305499,Very Busy,0
1559043,2019-07-31,DL,DL,DL,2900,DL,N301DQ,2900,11624,EYW,...,7,15,17,126.0,FL,MEDIUM,17.060485,177.955554,Not Busy,0
1559044,2019-07-31,DL,DL,DL,2901,DL,N397DA,2901,11292,DEN,...,7,7,9,159.0,CO,SHORT,17.060485,146.300733,Very Busy,0


In [11]:
flights_df.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
       'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled',
       'cancellation_code', 'diverted', 'dup', 'crs_elapsed_time',
       'actual_elapsed_time', 'air_time', 'flights', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'first_dep_time', 'total_add_gtime',
       'longest_add_gtime', 'no_name', 'month', 'dep_hour', 'arr_hour',
       'taxi_time', 'origin_state', 'flight_distance_category',
       'avg_monthly_arr_delay', 'avg_taxi_times_arr', 'traffic', 'holiday'],
      dtype='object')

In [12]:
flights_df['fl_date'] = pd.to_datetime(flights_df['fl_date'])

flights_df['day_of_week'] = flights_df['fl_date'].dt.dayofweek

day_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
flights_df['day_of_week'] = flights_df['day_of_week'].map(day_mapping)

In [13]:
flights_df['avg_hourly_arr_delay'] = flights_df.groupby('arr_hour')['arr_delay'].transform('mean')

In [14]:
flights_df['avg_mktcarrier_arr_delay'] = flights_df.groupby('mkt_unique_carrier')['arr_delay'].transform('mean')

In [15]:
flights_df['avg_airport_arr_delay'] = flights_df.groupby('origin_airport_id')['arr_delay'].transform('mean')

In [16]:
flights_df['avg_plane_arr_delay'] = flights_df.groupby('tail_num')['arr_delay'].transform('mean')

In [17]:
flights_df['avg_destcity_arr_delay'] = flights_df.groupby('dest_city_name')['arr_delay'].transform('mean')

In [18]:
flights_df['avg_state_arr_delay'] = flights_df.groupby('origin_state')['arr_delay'].transform('mean')

In [19]:
flights_df['avg_dow_arr_delay'] = flights_df.groupby('day_of_week')['arr_delay'].transform('mean')

In [20]:
flights_df['avg_dow_arr_delay'] = flights_df.groupby('day_of_week')['arr_delay'].transform('mean')

In [21]:
# # Shuffle the data in the DataFrame and create a sample with half the size
sample_size = len(flights_df) // 10

flights_df_sample = flights_df.sample(n=sample_size, random_state=42)

## Reset the index of the shuffled sample

flights_df_sample.reset_index(drop=True, inplace=True)

In [22]:
flights_df_sample.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
       'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled',
       'cancellation_code', 'diverted', 'dup', 'crs_elapsed_time',
       'actual_elapsed_time', 'air_time', 'flights', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'first_dep_time', 'total_add_gtime',
       'longest_add_gtime', 'no_name', 'month', 'dep_hour', 'arr_hour',
       'taxi_time', 'origin_state', 'flight_distance_category',
       'avg_monthly_arr_delay', 'avg_taxi_times_arr', 'traffic', 'holiday',
       'day_of_week', 'avg_hourly_arr_delay', 'avg_mktcarrier_arr_delay',
       'avg_ai

In [23]:
flights_df_sample = flights_df_sample[['fl_date','mkt_unique_carrier','arr_delay',
        'cancelled', 'diverted',
        'air_time','distance',
        'month', 'dep_hour', 'arr_hour',
       'origin_state', 'flight_distance_category',
       'avg_monthly_arr_delay', 'avg_taxi_times_arr', 'traffic', 'holiday','avg_hourly_arr_delay', 'avg_mktcarrier_arr_delay',
       'avg_airport_arr_delay', 'avg_plane_arr_delay',
       'avg_destcity_arr_delay', 'avg_state_arr_delay','day_of_week','avg_dow_arr_delay']]

In [24]:
flights_df_sample.loc[flights_df_sample['arr_delay'] < 0, 'arr_delay'] = 0

In [25]:
numerical_columns = ['air_time','distance','avg_taxi_times_arr','avg_hourly_arr_delay', 'avg_mktcarrier_arr_delay', 'avg_airport_arr_delay', 'avg_plane_arr_delay', 'avg_destcity_arr_delay', 'avg_state_arr_delay','avg_dow_arr_delay']
                    
categorical_columns = ['mkt_unique_carrier','diverted','month','dep_hour','arr_hour','origin_state','flight_distance_category','traffic','holiday','day_of_week']




In [26]:
df_encoded = pd.get_dummies(data=flights_df_sample, columns= categorical_columns , drop_first=False)

In [27]:
df_encoded = df_encoded.drop(['fl_date'], axis=1)

In [28]:
df_encoded = df_encoded.drop(['arr_delay'], axis=1)

In [29]:
# Linear regression model

# Separate the input features (X) and the target variable (y)
X = df_encoded

y = flights_df_sample['arr_delay']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Train the model using the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Evaluate the model
score = model.score(X_test, y_test)

# Print the coefficient and intercept of the linear regression model
print('Coefficients:', model.coef_)
print('Intercept:', model.intercept_)
print('R^2 Score:', score)

Coefficients: [ 1.17105266e-15  3.10613990e-01 -3.69623129e-02  1.00295533e+00
  7.35975866e-04  3.93306621e-01 -2.09806846e-01  7.21453427e-01
  9.92221480e-01  8.79078746e-01 -4.79473428e-01  6.06687550e-01
 -7.36643029e-01 -1.72666411e+00 -1.22807709e+00  1.79111280e+00
  1.28205987e+00 -6.78391263e-01 -1.22913731e+00  5.56804612e-01
 -2.37714846e+00  3.48556319e+00  8.60520803e-01 -2.59792188e-14
 -4.34080637e-01 -1.04116834e+00  9.43397999e-02  6.54508857e-01
 -2.51789717e-01  6.60813958e-01  6.89860413e-02  3.78815414e-01
  2.17542446e-01 -3.82801419e-02 -1.39536348e-01 -1.70151334e-01
  4.85019002e-01 -4.42799797e+00  5.78928604e+00 -5.79731663e+00
 -3.89181109e+00 -4.40233985e+00 -3.38815098e+00 -2.60857267e+00
  9.35416593e-02  1.11401152e-01  8.12660507e-01  4.19587677e-01
  4.17543872e-01  8.46684513e-01  2.46778500e-01 -2.76893753e-01
  6.40359775e-01  1.92710915e+00  1.26243189e+00  1.31293168e+00
  2.28320558e+00 -1.98195876e-01  4.67405360e-01  7.87533247e+00
  3.1390923

In [30]:
# Separate numeric columns and encoded columns

encoded_columns = df_encoded.drop(columns=numerical_columns)

In [31]:
# Scale the numeric columns
scaler = StandardScaler()
scaled_numeric_data = scaler.fit_transform(df_encoded[numerical_columns])

# Create a DataFrame for the scaled numeric data
df_scaled_numeric = pd.DataFrame(scaled_numeric_data, columns=numerical_columns)

# Rejoin the scaled numeric data with the encoded columns
df_scaled_encoded = pd.concat([df_scaled_numeric, encoded_columns], axis=1)

In [32]:
# Linear regression model 

# Separate the input features (X) and the target variable (y)
X = df_scaled_encoded

y = flights_df_sample['arr_delay']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Train the model using the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Evaluate the model
score = model.score(X_test, y_test)

# Print the coefficient and intercept of the linear regression model
print('Coefficients:', model.coef_)
print('Intercept:', model.intercept_)
print('R^2 Score:', score)

Coefficients: [ 2.15682776e+01 -2.14797221e+01  3.42159222e+12 -1.40156357e+12
  1.69336855e+12  2.32580922e+00  5.66619776e+00  3.10153368e+00
  2.91663640e+12  1.05727443e+13 -1.97316567e+12 -1.58101221e+13
  1.30753364e+13  1.57291519e+13  9.65697885e+12  1.40582147e+13
  9.81101858e+12  1.17534931e+13  1.75358828e+13  1.32466610e+13
  1.11603437e+13  1.33698482e+13  1.52042947e+13 -5.90315639e+12
  2.39649775e+13  6.51835489e+13 -1.16451534e+12  2.02223908e+13
  4.76161215e+13  9.37223002e+13  8.44851132e+13  8.72931308e+13
 -9.54606930e+12 -2.54362425e+12 -2.54427610e+12  3.65262661e+13
  2.15154199e+12  2.15154199e+12  2.15154199e+12  2.15154199e+12
  2.15154199e+12  2.15154199e+12  2.15154199e+12  2.15154199e+12
  2.15154199e+12  2.15154199e+12  2.15154199e+12  2.15154199e+12
  2.15154199e+12  2.15154199e+12  2.15154199e+12  2.15154199e+12
  2.15154199e+12  2.15154199e+12  2.15154199e+12  2.15154199e+12
  2.15154199e+12  2.15154199e+12  2.15154199e+12  2.15154199e+12
  1.0051567

In [33]:
# We can see that with all negative values turned to zero in the flights_df our model
# for regression is not performing as well. This may be due to imbalanced data with many zero values

In [34]:
y = flights_df_sample['arr_delay'].apply(lambda x: 1 if x > 0 else 0)


In [35]:
X = encoded_columns

In [36]:
df_scaled_encoded.shape

(155904, 153)

In [37]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

X = df_scaled_encoded
y = flights_df_sample['arr_delay']

# Step 1: Use a binary regressor to predict whether arr_delay is 0 or non-zero
binary_regressor = RandomForestRegressor()
binary_regressor.fit(X, (y > 0).astype(int))
binary_predictions = binary_regressor.predict(X)

# Step 2: Filter instances predicted as non-zero
non_zero_indices = np.where(binary_predictions > 0)[0]
X_regression = X.iloc[non_zero_indices]
y_regression = y.iloc[non_zero_indices]

# Step 3: Train a regression model
regression_model = RandomForestRegressor()
regression_model.fit(X_regression, y_regression)
regression_predictions = regression_model.predict(X_regression)

# Step 4: Combine binary and regression predictions
combined_predictions = np.zeros_like(binary_predictions)
combined_predictions[non_zero_indices] = regression_predictions

# Step 5: Evaluate the performance
binary_accuracy = accuracy_score((y > 0).astype(int), (binary_predictions > 0).astype(int))
regression_mse = mean_squared_error(y_regression, regression_predictions)
combined_mse = mean_squared_error(y.iloc[non_zero_indices], combined_predictions[non_zero_indices])

print('Binary Regression Accuracy:', binary_accuracy)
print('Regression Mean Squared Error:', regression_mse)
print('Combined Mean Squared Error:', combined_mse)

Binary Regression Accuracy: 0.3512674466338259
Regression Mean Squared Error: 319.12093503974
Combined Mean Squared Error: 319.12093503974


In [38]:
# Create a new column in the DataFrame for predicted values
flights_df_sample['predicted_delay'] = 0  # Initialize with 0 values

# Assign the predicted values from the combined model to the new column
flights_df_sample.loc[binary_predictions == 1, 'predicted_delay'] = combined_predictions[binary_predictions == 1]

In [39]:
non_zero_predictions = flights_df_sample[flights_df_sample['predicted_delay'] != 0][['predicted_delay', 'arr_delay']]
print(non_zero_predictions.head(20))

        predicted_delay  arr_delay
40476             82.49       98.0
62633             66.44       81.0
127565            89.10      126.0
153672            26.51       26.0


In [40]:
num_zero_delays = len(flights_df_sample[flights_df_sample['predicted_delay'] == 0])
print("Number of values where predicted_delays = 0:", num_zero_delays)

Number of values where predicted_delays = 0: 155900


In [41]:
num_zero_delays = len(flights_df_sample[flights_df_sample['arr_delay'] == 0])
print("Number of values where predicted_delays = 0:", num_zero_delays)

Number of values where predicted_delays = 0: 101332


In [None]:
# The model has .3 R^2 because it is mostly predicting zeros and has only made a prediciton above zero 4 times