In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR
from sklearn import tree
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
import graphviz

In [2]:
# Load the combined flight / weather table.
flights_df = pd.read_csv('complete_flight_info_and weather_data.csv')

# Discard columns that are not used downstream, then keep only rows with no
# missing value in any of the remaining columns (DOT_CODE and any other
# ORIGIN*/DEST* columns still take part in this null check).
unused_columns = ['date', 'date.1', 'CRS_DEP_TIME', 'ORIGIN', 'DEST']
flights_df = flights_df.drop(columns=unused_columns).dropna()

# Parse the flight date once; its year / month / day become numeric features.
flight_dates = pd.to_datetime(flights_df['FL_DATE'])

# Columns whose name begins with upper-case 'ORIGIN' or 'DEST'. The match is
# case-sensitive, so lower-case names such as 'dest_cloud_cover' are kept.
airport_columns = [
    name for name in flights_df.columns
    if name != 'DOT_CODE' and name.startswith(('ORIGIN', 'DEST'))
]

flights_df = (
    flights_df
    .drop(columns=['DOT_CODE'])
    .drop(columns=airport_columns)
    .drop(columns=['FL_DATE'])
    # map the string literals 'TRUE' / 'FALSE' to real booleans
    .replace({'TRUE': True, 'FALSE': False})
    # calendar parts replace the raw date column
    .assign(
        FL_YEAR=flight_dates.dt.year,
        FL_MONTH=flight_dates.dt.month,
        FL_DAY=flight_dates.dt.day,
    )
)

# Target is the arrival delay; every other remaining column is a feature.
y = flights_df['ARR_DELAY']
X = flights_df.drop(columns=['ARR_DELAY'])

# 80 / 20 shuffled split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=156,
    shuffle=True,
)

In [3]:
def forward_subset_selection(X, y, max_features=None, cv=5):
    """Greedy forward feature selection scored by cross-validated R^2.

    Starting from an empty set, each round fits a ``LinearRegression`` on the
    already-selected columns plus one candidate column and keeps the candidate
    with the highest mean ``cross_val_score`` (``scoring='r2'``).

    The best score is carried over between rounds, so a column is only added
    when it improves on the best mean R^2 seen so far; if no remaining column
    does, selection stops early.

    Parameters
    ----------
    X : DataFrame
        Candidate features; columns are selected by name.
    y : array-like
        Regression target passed to ``cross_val_score``.
    max_features : int or None, default None
        Upper bound on the number of selected columns. ``None`` means no cap
        other than the number of columns in ``X``.
    cv : int or cross-validation splitter, default 5
        Forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    list
        Selected column names, in the order they were added.
    """
    n_columns = X.shape[1]
    # The previous default (None) crashed on `len(...) < None`; treat None as
    # "no cap" and never ask for more columns than X actually has.
    limit = n_columns if max_features is None else min(max_features, n_columns)

    selected_features = []
    best_score = float('-inf')

    while len(selected_features) < limit:
        best_feature = None
        for feature in X.columns:
            if feature in selected_features:
                continue
            features_to_try = selected_features + [feature]
            scores = cross_val_score(
                LinearRegression(), X[features_to_try], y, cv=cv, scoring='r2'
            )
            mean_score = scores.mean()
            # Strict '>' keeps the first column encountered on ties.
            if mean_score > best_score:
                best_score = mean_score
                best_feature = feature
        if best_feature is None:
            # No remaining column beats the best score so far.
            break
        selected_features.append(best_feature)

    return selected_features

# Run forward selection on the training split only (features X_train,
# target y_train), keeping at most 10 features.
selected_features = forward_subset_selection(
    X=X_train,
    y=y_train,
    max_features=10,
)
print("Selected Features:", selected_features)

Selected Features: ['DEP_DELAY', 'TAXI_OUT', 'CRS_ELAPSED_TIME', 'DISTANCE', 'FL_NUMBER', 'FL_YEAR', 'dest_cloud_cover', 'DEP_TIME', 'dep_temperature_2m', 'dest_snow_depth']


In [4]:
# Keep only the columns chosen by forward selection.
X_train_subset = X_train[selected_features]

# Fit ordinary least squares on the reduced training set
# (`fit` returns the estimator itself).
model = LinearRegression().fit(X_train_subset, y_train)

# In-sample predictions and residuals.
y_train_pred = model.predict(X_train_subset)
train_residuals = y_train - y_train_pred

# Training metrics: MSE, R^2 and mean absolute percentage error (in percent).
mse_train = mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)
mape_train = np.mean(np.abs(train_residuals / y_train)) * 100

print("Training MSE:", mse_train)
print("Training R-squared:", r2_train)
print("Training MAPE:", mape_train)

Training MSE: 112.50737580068535
Training R-squared: 0.9925707449233834
Training MAPE: 20.757756951829247


In [5]:
# Subset X_test to include only the selected features
X_test_subset = X_test[selected_features]

# Predict on the testing data with the `model` fitted in the previous cell
y_test_pred = model.predict(X_test_subset)

# Evaluate the model on testing data
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print("Testing MSE:", mse_test)
print("Testing R-squared:", r2_test)
# Calculate MAPE for the test set. It is stored under its own name so the
# training MAPE held in `mape_train` is not overwritten.
# NOTE(review): this divides by y_test, so it is undefined for rows where the
# target is exactly 0 - confirm the data contains none.
mape_test = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100

print("Test MAPE:", mape_test)

Testing MSE: 103.93231410905388
Testing R-squared: 0.9913145961211361
Test MAPE: 21.08691149393886
