In [106]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR
from sklearn import tree
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

In [107]:
flights_df = pd.read_csv('complete_flight_info_and weather_data.csv')

# convert flight date to date object
flights_df['FL_DATE'] = pd.to_datetime(flights_df['FL_DATE'])
del flights_df['DOT_CODE']
# Remove columns starting with 'origin' and 'dest'
columns_to_remove = [col for col in flights_df.columns if col.startswith('ORIGIN') or col.startswith('DEST')]
flights_df = flights_df.drop(columns=columns_to_remove)

# converts string TRUE/FALSE to boolean
flights_df.replace({'TRUE': True, 'FALSE': False}, inplace=True)

# convert FL_Date to year, month, day
flights_df['FL_YEAR'] = pd.to_datetime(flights_df['FL_DATE']).dt.year
flights_df['FL_MONTH'] = pd.to_datetime(flights_df['FL_DATE']).dt.month
flights_df['FL_DAY'] = pd.to_datetime(flights_df['FL_DATE']).dt.day

# drop original date time
flights_df.drop(columns=['FL_DATE'], inplace=True)

X = flights_df.loc[:, flights_df.columns != 'ARR_DELAY']
y = flights_df['ARR_DELAY']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 156, shuffle=True)

In [108]:
def forward_subset_selection(X, y, max_features=None, cv=5):
    selected_features = []
    best_score = float('-inf')
    
    while len(selected_features) < max_features:
        best_feature = None
        for feature in X.columns:
            if feature not in selected_features:
                features_to_try = selected_features + [feature]
                X_subset = X[features_to_try]
                model = LinearRegression()
                scores = cross_val_score(model, X_subset, y, cv=cv, scoring='r2')
                mean_score = scores.mean()
                if mean_score > best_score:
                    best_score = mean_score
                    best_feature = feature
        if best_feature is None:
            break
        selected_features.append(best_feature)
    
    return selected_features

# Example usage:
# X_train: training features, y_train: training labels
# max_features: maximum number of features to select
# You may need to preprocess your data (e.g., handle missing values, scale features) before using forward_subset_selection.
selected_features = forward_subset_selection(X_train, y_train, max_features=5)
print("Selected Features:", selected_features)

Selected Features: ['ORIGIN_LCH', 'DEST_DFW', 'FL_NUMBER', 'DEST_ORD', 'DEST_ATL']


In [109]:
model = tree.DecisionTreeRegressor(max_depth=2, random_state=156) 
model = model.fit(X_train, y_train)

model_text = tree.export_text(model, feature_names=list(X_train.columns))
print(model_text)

|--- ORIGIN_LCH <= 0.50
|   |--- dep_wind_direction_100m <= 0.59
|   |   |--- value: [1092.00]
|   |--- dep_wind_direction_100m >  0.59
|   |   |--- value: [74.53]
|--- ORIGIN_LCH >  0.50
|   |--- dep_wind_direction_10m <= 153.28
|   |   |--- value: [39.50]
|   |--- dep_wind_direction_10m >  153.28
|   |   |--- value: [1025.00]



In [110]:
fi = model.feature_importances_

names = X_train.columns
importance_dict = dict(zip(names, fi))

print("Feature Importance:")
for feature, importance in importance_dict.items():
    print(f"{feature}: {importance}")

Feature Importance:
FL_NUMBER: 0.0
CRS_ARR_TIME: 0.0
CRS_ELAPSED_TIME: 0.0
DISTANCE: 0.0
ORIGIN_ABE: 0.0
ORIGIN_ABI: 0.0
ORIGIN_ABQ: 0.0
ORIGIN_ABR: 0.0
ORIGIN_ABY: 0.0
ORIGIN_ACK: 0.0
ORIGIN_ACT: 0.0
ORIGIN_ACV: 0.0
ORIGIN_ACY: 0.0
ORIGIN_ADK: 0.0
ORIGIN_ADQ: 0.0
ORIGIN_AEX: 0.0
ORIGIN_AGS: 0.0
ORIGIN_AKN: 0.0
ORIGIN_ALB: 0.0
ORIGIN_ALO: 0.0
ORIGIN_ALS: 0.0
ORIGIN_ALW: 0.0
ORIGIN_AMA: 0.0
ORIGIN_ANC: 0.0
ORIGIN_APN: 0.0
ORIGIN_ART: 0.0
ORIGIN_ASE: 0.0
ORIGIN_ATL: 0.0
ORIGIN_ATW: 0.0
ORIGIN_ATY: 0.0
ORIGIN_AUS: 0.0
ORIGIN_AVL: 0.0
ORIGIN_AVP: 0.0
ORIGIN_AZA: 0.0
ORIGIN_AZO: 0.0
ORIGIN_BDL: 0.0
ORIGIN_BFF: 0.0
ORIGIN_BFL: 0.0
ORIGIN_BGM: 0.0
ORIGIN_BGR: 0.0
ORIGIN_BHM: 0.0
ORIGIN_BIH: 0.0
ORIGIN_BIL: 0.0
ORIGIN_BIS: 0.0
ORIGIN_BJI: 0.0
ORIGIN_BLI: 0.0
ORIGIN_BLV: 0.0
ORIGIN_BMI: 0.0
ORIGIN_BNA: 0.0
ORIGIN_BOI: 0.0
ORIGIN_BOS: 0.0
ORIGIN_BPT: 0.0
ORIGIN_BQK: 0.0
ORIGIN_BQN: 0.0
ORIGIN_BRD: 0.0
ORIGIN_BRO: 0.0
ORIGIN_BTM: 0.0
ORIGIN_BTR: 0.0
ORIGIN_BTV: 0.0
ORIGIN_BUF: 0.0
ORIGIN_BUR: 0.0

In [111]:
preds = model.predict(X_test)
print(mean_squared_error(y_test, preds), r2_score(y_test, preds))

12314.430805669072 -0.029090961770868562


In [112]:
mse = {'k':[], 'train_mse':[], 'test_mse':[]}
for k in range(1,30):
    print("Fit with max_depth:", k, end='\r', flush=True)
    
    model = tree.DecisionTreeRegressor(max_depth=k)
    model = model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)

    mse['k'].append(k)
    mse['train_mse'].append(mean_squared_error(y_train, preds_train))
    mse['test_mse'].append(mean_squared_error(y_test, preds_test))
    
idx = mse['test_mse'].index(min(mse['test_mse']))
print('Depth of the model yielding minimum test MSE is:', mse['k'][idx])
print('Optimized model has MSE:', min(mse['test_mse']))

Depth of the model yielding minimum test MSE is: 4
Optimized model has MSE: 11949.99279450181


In [113]:
model_rf = RandomForestRegressor (n_estimators=100, random_state=156, max_features=30)

model_rf.fit(X_train, y_train)
preds_test = model_rf.predict(X_test)

print(mean_squared_error(y_test, preds_test), r2_score(y_test, preds_test))

11998.649445986395 -0.002701780795036912


In [114]:
from sklearn.model_selection import GridSearchCV

clf = RandomForestRegressor(random_state=156)

params = {
    'max_depth': np.arange(5,10,1),
    'n_estimators': np.arange(5,10,1)
}

grid_search = GridSearchCV(estimator=clf, param_grid=params, cv=5, n_jobs=-1, verbose=1, scoring='neg_mean_absolute_error', return_train_score=True)
grid_search.fit(X_train, y_train)



Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [None]:
# Get the best estimator and show parameters
grid_search.best_params_

{'max_depth': 9, 'n_estimators': 9}

In [None]:
# Evaluate the tuned model

clf_best = grid_search.best_estimator_
y_test_pred = clf_best.predict(X_test)

print('Hyperparameter tuning of random forest yields MSE and R-squared:', mean_squared_error(y_test, y_test_pred), r2_score(y_test, y_test_pred))

Hyperparameter tuning of random forest yields MSE and R-squared: 11954.490354643305 0.000988501159388977
