In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR
from sklearn import tree
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
import graphviz

In [14]:
flights_df = pd.read_csv('complete_flight_info_and weather_data.csv')

# convert flight date to date object
flights_df = flights_df.drop(['date', 'date.1','CRS_DEP_TIME','ORIGIN','DEST'], axis=1)

flights_df = flights_df.dropna()
 
flights_df['FL_DATE'] = pd.to_datetime(flights_df['FL_DATE'])
del flights_df['DOT_CODE']
# Remove columns starting with 'origin' and 'dest'
columns_to_remove = [col for col in flights_df.columns if col.startswith('ORIGIN') or col.startswith('DEST')]
flights_df = flights_df.drop(columns=columns_to_remove)

# converts string TRUE/FALSE to boolean
flights_df.replace({'TRUE': True, 'FALSE': False}, inplace=True)

# convert FL_Date to year, month, day
flights_df['FL_YEAR'] = pd.to_datetime(flights_df['FL_DATE']).dt.year
flights_df['FL_MONTH'] = pd.to_datetime(flights_df['FL_DATE']).dt.month
flights_df['FL_DAY'] = pd.to_datetime(flights_df['FL_DATE']).dt.day

# drop original date time
flights_df.drop(columns=['FL_DATE'], inplace=True)

X = flights_df.loc[:, flights_df.columns != 'ARR_DELAY']
y = flights_df['ARR_DELAY']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 156, shuffle=True)

In [15]:
model = tree.DecisionTreeRegressor(max_depth=2, random_state=156) 
model = model.fit(X_train, y_train)

model_text = tree.export_text(model, feature_names=list(X_train.columns))
print(model_text)

|--- DEP_DELAY <= 417.00
|   |--- DEP_DELAY <= 127.50
|   |   |--- value: [42.01]
|   |--- DEP_DELAY >  127.50
|   |   |--- value: [191.13]
|--- DEP_DELAY >  417.00
|   |--- DEP_DELAY <= 841.50
|   |   |--- value: [577.56]
|   |--- DEP_DELAY >  841.50
|   |   |--- value: [1090.99]



In [16]:
fi = model.feature_importances_

names = X_train.columns
importance_dict = dict(zip(names, fi))

print("Feature Importance:")
for feature, importance in importance_dict.items():
    print(f"{feature}: {importance}")

Feature Importance:
FL_NUMBER: 0.0
DEP_TIME: 0.0
DEP_DELAY: 1.0
TAXI_OUT: 0.0
WHEELS_OFF: 0.0
CRS_ARR_TIME: 0.0
CRS_ELAPSED_TIME: 0.0
DISTANCE: 0.0
dep_temperature_2m: 0.0
dep_apparent_temperature: 0.0
dep_rain: 0.0
dep_wind_speed_10m: 0.0
dep_wind_speed_100m: 0.0
dep_cloud_cover: 0.0
dep_cloud_cover_low: 0.0
dep_cloud_cover_mid: 0.0
dep_cloud_cover_high: 0.0
dep_wind_direction_10m: 0.0
dep_wind_direction_100m: 0.0
dep_wind_gusts_10m: 0.0
dep_snow_depth: 0.0
dest_temperature_2m: 0.0
dest_apparent_temperature: 0.0
dest_rain: 0.0
dest_wind_speed_10m: 0.0
dest_wind_speed_100m: 0.0
dest_cloud_cover: 0.0
dest_cloud_cover_low: 0.0
dest_cloud_cover_mid: 0.0
dest_cloud_cover_high: 0.0
dest_wind_direction_10m: 0.0
dest_wind_direction_100m: 0.0
dest_wind_gusts_10m: 0.0
dest_snow_depth: 0.0
FL_YEAR: 0.0
FL_MONTH: 0.0
FL_DAY: 0.0


In [17]:
preds = model.predict(X_test)
print(mean_squared_error(y_test, preds), r2_score(y_test, preds))

1577.5270404657927 0.8681693985770631


In [18]:
mse = {'k':[], 'train_mse':[], 'test_mse':[], 'train_r2':[], 'test_r2':[], 'test_MAPE': []}
for k in range(1,30):
    print("Fit with max_depth:", k, end='\r', flush=True)
    
    model = tree.DecisionTreeRegressor(max_depth=k)
    model = model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    mse['k'].append(k)
    mse['train_mse'].append(mean_squared_error(y_train, preds_train))
    mse['test_mse'].append(mean_squared_error(y_test, y_test_pred))
    mse['train_r2'].append(r2_score(y_train, preds_train))
    mse['test_r2'].append(r2_score(y_test, y_test_pred))
    # Calculate MAPE
    abs_errors = np.abs(y_test - y_test_pred)
    percentage_errors = (abs_errors / y_test) * 100
    mse['test_MAPE'].append(np.mean(percentage_errors))

    
idx = mse['test_mse'].index(min(mse['test_mse']))
print('Depth of the model yielding minimum test MSE is:', mse['k'][idx])
print('Optimized model has MSE:', min(mse['test_mse']), 'Optimized model has R2:', mse['test_r2'][idx], 'Optimized model has MAPE:', mse['test_MAPE'][idx])

Depth of the model yielding minimum test MSE is: 10
Optimized model has MSE: 88.23135609204158 Optimized model has R2: 0.9926266920061537 Optimized model has MAPE: 16.060176012874166
