In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR

In [2]:
 
flights_df = pd.read_csv('complete_flight_info_and weather_data.csv.csv')

# convert flight date to date object
flights_df = flights_df.drop(['date', 'date.1','CRS_DEP_TIME','ORIGIN','DEST'], axis=1)

flights_df = flights_df.dropna()
 
flights_df['FL_DATE'] = pd.to_datetime(flights_df['FL_DATE'])
del flights_df['DOT_CODE']
# Remove columns starting with 'origin' and 'dest'
columns_to_remove = [col for col in flights_df.columns if col.startswith('ORIGIN') or col.startswith('DEST')]
flights_df = flights_df.drop(columns=columns_to_remove)

# converts string TRUE/FALSE to boolean
flights_df.replace({'TRUE': True, 'FALSE': False}, inplace=True)

# convert FL_Date to year, month, day
flights_df['FL_YEAR'] = pd.to_datetime(flights_df['FL_DATE']).dt.year
flights_df['FL_MONTH'] = pd.to_datetime(flights_df['FL_DATE']).dt.month
flights_df['FL_DAY'] = pd.to_datetime(flights_df['FL_DATE']).dt.day

# drop original date time
flights_df.drop(columns=['FL_DATE'], inplace=True)


X = flights_df.loc[:, flights_df.columns != 'ARR_DELAY']
y = flights_df['ARR_DELAY']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 156, shuffle=True)

In [3]:
def quadratic_regression(X_train, X_test, y_train, y_test):
    poly = PolynomialFeatures(degree=2)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    
    lin_model = LinearRegression()
    lin_model.fit(X_train_poly, y_train)
    y_pred = lin_model.predict(X_test_poly)
    

    print("Quadratic:")
    print('MSE: ' + str(mean_squared_error(y_pred, y_test)))
    print('MAE: ' + str(mean_absolute_error(y_pred, y_test)))
    print('R2: ' + str(r2_score(y_pred, y_test)))
    print('MAPE: ' + str(mean_absolute_percentage_error(y_test, y_pred)))

In [4]:
quadratic_regression(X_train, X_test, y_train, y_test)

Quadratic:
MSE: 96.77530650694379
MAE: 6.788150852849231
R2: 0.9918621633287651
MAPE: 0.20097722185558997
