In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR

In [10]:
flights_df = pd.read_csv('complete_flight_info_and weather_data.csv')

# convert flight date to date object
flights_df = flights_df.drop(['date', 'date.1','CRS_DEP_TIME','ORIGIN','DEST'], axis=1)

flights_df = flights_df.dropna()
 
flights_df['FL_DATE'] = pd.to_datetime(flights_df['FL_DATE'])
del flights_df['DOT_CODE']
# Remove columns starting with 'origin' and 'dest'
columns_to_remove = [col for col in flights_df.columns if col.startswith('ORIGIN') or col.startswith('DEST')]
flights_df = flights_df.drop(columns=columns_to_remove)

# converts string TRUE/FALSE to boolean
flights_df.replace({'TRUE': True, 'FALSE': False}, inplace=True)

# convert FL_Date to year, month, day
flights_df['FL_YEAR'] = pd.to_datetime(flights_df['FL_DATE']).dt.year
flights_df['FL_MONTH'] = pd.to_datetime(flights_df['FL_DATE']).dt.month
flights_df['FL_DAY'] = pd.to_datetime(flights_df['FL_DATE']).dt.day

# drop original date time
flights_df.drop(columns=['FL_DATE'], inplace=True)

In [12]:
X = flights_df[ ['DEP_DELAY', 'TAXI_OUT', 'CRS_ELAPSED_TIME', 'DISTANCE', 'FL_NUMBER', 'FL_YEAR', 'dest_cloud_cover', 'DEP_TIME', 'dep_temperature_2m', 'dest_snow_depth']]
y = flights_df['ARR_DELAY']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 156, shuffle=True)

svr_rbf = SVR(kernel='rbf')

svr_rbf.fit(X_train, y_train)

svr_predict = svr_rbf.predict(X_test)

print('MSE: ' + str(mean_squared_error(svr_predict, y_test)))
print('MAE: ' + str(mean_absolute_error(svr_predict, y_test)))
print('R2: ' + str(r2_score(svr_predict, y_test)))

MSE: 10185.877427845573
MAE: 41.55979860667354
R2: -59.46343810187049


In [11]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100