In [45]:
import warnings
warnings.simplefilter("ignore")
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_score, train_test_split, cross_validate
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import BaggingRegressor

In [46]:
# Read the combined flight / weather table from disk
df = pd.read_csv('complete_flight_info_and weather_data.csv')

# Discard the columns that are not used, then every row holding a missing value
df = df.drop(columns=['date', 'date.1', 'CRS_DEP_TIME', 'ORIGIN', 'DEST']).dropna()

# Parse the flight date into datetime values
df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])

# Drop DOT_CODE, then any column whose name starts with 'ORIGIN' or 'DEST'
# (the prefix match is case-sensitive)
df = df.drop(columns=['DOT_CODE'])
columns_to_remove = [col for col in df.columns if col.startswith(('ORIGIN', 'DEST'))]
df = df.drop(columns=columns_to_remove)

# Turn the string markers 'TRUE' / 'FALSE' into real booleans
df = df.replace({'TRUE': True, 'FALSE': False})

# Split the flight date into year / month / day columns and drop the original
flight_dates = pd.to_datetime(df['FL_DATE'])
df = df.assign(
    FL_YEAR=flight_dates.dt.year,
    FL_MONTH=flight_dates.dt.month,
    FL_DAY=flight_dates.dt.day,
).drop(columns=['FL_DATE'])


In [47]:
# ARR_DELAY is the regression target; every other column is a predictor
X = df.drop(columns=['ARR_DELAY'])
y = df['ARR_DELAY']

# Hold out 20% of the rows for testing, with a fixed random_state for the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=156, test_size=0.2
)



In [48]:
# Forward feature selection driven by 5-fold cross-validated MSE.
# State: candidates still available, features chosen so far, the lowest
# CV MSE seen so far, and the number of candidate models evaluated.
remaining_features = list(X_train.columns)
selected_features = []
best_score = np.inf
count = 0

# Each pass tries adding every remaining feature to the current selection
while remaining_features:
    best_feature = None

    for feature in remaining_features:
        trial_features = [*selected_features, feature]

        # Scorer returns negated MSE, so flip the sign to get a plain MSE
        cv_results = cross_validate(
            LinearRegression(),
            X_train[trial_features],
            y_train,
            scoring='neg_mean_squared_error',
            cv=5,
        )
        score = -cv_results['test_score'].mean()
        count += 1

        # Only a strict improvement over the best score so far counts
        if score < best_score:
            best_score, best_feature = score, feature

    # No candidate improved the score in this pass: selection is finished
    if best_feature is None:
        break

    selected_features.append(best_feature)
    remaining_features.remove(best_feature)

print(f"Best CV Score: {best_score}")
print("Selected features in Best Selection:", selected_features)
print(f"No of trials required: {count}")


Best CV Score: 111.87173857694908
Selected features in Best Selection: ['DEP_DELAY', 'TAXI_OUT', 'CRS_ELAPSED_TIME', 'DISTANCE', 'FL_NUMBER', 'FL_YEAR', 'dest_cloud_cover', 'DEP_TIME', 'dep_temperature_2m', 'dest_snow_depth', 'dest_wind_direction_100m', 'CRS_ARR_TIME', 'dest_rain', 'dest_wind_gusts_10m', 'dest_wind_speed_100m', 'dest_wind_speed_10m', 'WHEELS_OFF', 'dep_snow_depth', 'dest_cloud_cover_mid', 'dep_cloud_cover_mid', 'dep_wind_direction_10m', 'dest_wind_direction_10m']
No of trials required: 598


In [49]:
# Keep only the selected feature columns of the training matrix
X_train_subset = X_train[selected_features]

# Fit a linear regression on that subset (fit returns the estimator itself)
model = LinearRegression().fit(X_train_subset, y_train)

# In-sample predictions
y_train_pred = model.predict(X_train_subset)

# Training-set metrics: MSE, R-squared, and MAPE expressed as a percentage
mse_train = mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)
train_residuals = y_train - y_train_pred
mape_train = 100 * np.mean(np.abs(train_residuals / y_train))

print("Training MSE:", mse_train)
print("Training R-squared:", r2_train)
print("Training MAPE:", mape_train)

Training MSE: 111.58105277530602
Training R-squared: 0.9926319132689244
Training MAPE: 20.70613450985154


In [50]:
# Subset X_test to include only the selected features
X_test_subset = X_test[selected_features]

# Predict on the testing data
y_test_pred = model.predict(X_test_subset)

# Evaluate the model on testing data
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# Calculate MAPE for the testing set. It is stored under its own name so the
# training value held in `mape_train` is not overwritten.
# NOTE: any row where y_test == 0 makes this ratio infinite/undefined.
mape_test = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100

# Report the held-out metrics
print("Testing MSE:", mse_test)
print("Testing R-squared:", r2_test)
print("Testing MAPE:", mape_test)



In [51]:
# Bagging ensemble of 100 estimators with max_features=30, trained on all
# predictor columns (not only the forward-selected subset)
model_bag = BaggingRegressor(n_estimators=100, max_features=30, random_state=156)
model_bag.fit(X_train, y_train)

# Score the ensemble on the held-out rows: prints test MSE, then test R-squared
preds_test = model_bag.predict(X_test)
bag_test_mse = mean_squared_error(y_test, preds_test)
bag_test_r2 = r2_score(y_test, preds_test)
print(bag_test_mse, bag_test_r2)

261.19033600340134 0.9781728755210304


In [None]:
# Sweep n_estimators for a bagging ensemble. For every setting record the
# train MSE, test MSE, test R-squared and test MAPE in parallel lists, so the
# same index refers to the same model in each list.
mse = {'train_mse':[], 'test_mse':[], 'n':[], 'r_squared':[], 'mape':[]}

for n in range(5, 10):
    # '\r' keeps the progress message on a single line
    print("Fit with n_estimators:", n, end='\r', flush=True)

    model_bag = BaggingRegressor(n_estimators=n, random_state=156)
    model_bag = model_bag.fit(X_train, y_train)
    preds_train = model_bag.predict(X_train)
    preds_test = model_bag.predict(X_test)

    mse['n'].append(n)
    mse['train_mse'].append(mean_squared_error(y_train, preds_train))
    mse['test_mse'].append(mean_squared_error(y_test, preds_test))
    mse['r_squared'].append(r2_score(y_test, preds_test))

    # MAPE must use this model's own test predictions (preds_test), and the
    # denominator is the magnitude of the target so negative targets do not
    # produce negative percentage errors.
    # NOTE: any row where y_test == 0 makes this ratio infinite/undefined.
    abs_errors = np.abs(y_test - preds_test)
    percentage_errors = (abs_errors / np.abs(y_test)) * 100
    mape = np.mean(percentage_errors)
    mse['mape'].append(mape)


# Pick the setting with the lowest test MSE and report every metric for that
# same model (not the minimum of each list independently).
idx = mse['test_mse'].index(min(mse['test_mse']))
print('Bagging model yielding optimized test MSE has n_estimators:', mse['n'][idx])
print('Optimized model has MSE:', mse['test_mse'][idx])
print('r_squared', mse['r_squared'][idx])
print('mape', mse['mape'][idx])

Fit with n_estimators: 9