In [120]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV


In [100]:
def load_data(filepath):
    data = pd.read_csv(filepath)
    return data

In [101]:
df = load_data('data/processed-data.csv')
print(df.shape)

(245955, 34)


In [102]:
df

Unnamed: 0,year,quarter,season,citymarketid_1,city1,airportid_1,airport_iata_1,airport_name_1,airport_name_concat_1,state_1,...,carrier_lg,carrier_lg_name,carrier_lg_name_concat,large_ms,fare_lg,carrier_low,carrier_low_name,carrier_low_name_concat,lf_ms,fare_low
0,2021,3,Summer,30135,"Allentown/Bethlehem/Easton, PA",10135,ABE,Lehigh Valley International Airport,ABE - Lehigh Valley International Airport,Pennsylvania,...,G4,Allegiant Air,G4 - Allegiant Air,1.0000,81.43,G4,Allegiant Air,G4 - Allegiant Air,1.0000,81.43
1,2021,3,Summer,30135,"Allentown/Bethlehem/Easton, PA",10135,ABE,Lehigh Valley International Airport,ABE - Lehigh Valley International Airport,Pennsylvania,...,DL,Delta Air Lines,DL - Delta Air Lines,0.4659,219.98,UA,United Airlines,UA - United Airlines,0.1193,154.11
2,2021,3,Summer,30140,"Albuquerque, NM",10140,ABQ,Albuquerque International Sunport,ABQ - Albuquerque International Sunport,New Mexico,...,WN,Southwest Airlines,WN - Southwest Airlines,0.9968,184.44,WN,Southwest Airlines,WN - Southwest Airlines,0.9968,184.44
3,2021,3,Summer,30140,"Albuquerque, NM",10140,ABQ,Albuquerque International Sunport,ABQ - Albuquerque International Sunport,New Mexico,...,AA,American Airlines,AA - American Airlines,0.9774,183.09,AA,American Airlines,AA - American Airlines,0.9774,183.09
4,2021,3,Summer,30140,"Albuquerque, NM",10140,ABQ,Albuquerque International Sunport,ABQ - Albuquerque International Sunport,New Mexico,...,WN,Southwest Airlines,WN - Southwest Airlines,0.6061,184.49,AA,American Airlines,AA - American Airlines,0.3939,165.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245950,2024,1,Winter,35412,"Knoxville, TN",15412,TYS,McGhee Tyson Airport,TYS - McGhee Tyson Airport,Tennessee,...,DL,Delta Air Lines,DL - Delta Air Lines,0.7503,287.44,AA,American Airlines,AA - American Airlines,0.2359,248.46
245951,2024,1,Winter,35412,"Knoxville, TN",15412,TYS,McGhee Tyson Airport,TYS - McGhee Tyson Airport,Tennessee,...,G4,Allegiant Air,G4 - Allegiant Air,0.8255,114.45,G4,Allegiant Air,G4 - Allegiant Air,0.8255,114.45
245952,2024,1,Winter,35412,"Knoxville, TN",15412,TYS,McGhee Tyson Airport,TYS - McGhee Tyson Airport,Tennessee,...,AA,American Airlines,AA - American Airlines,0.8057,321.92,AA,American Airlines,AA - American Airlines,0.8057,321.92
245953,2024,1,Winter,35412,"Knoxville, TN",15412,TYS,McGhee Tyson Airport,TYS - McGhee Tyson Airport,Tennessee,...,G4,Allegiant Air,G4 - Allegiant Air,1.0000,95.65,G4,Allegiant Air,G4 - Allegiant Air,1.0000,95.65


# Data Preproecessing

In [103]:
df.columns

Index(['year', 'quarter', 'season', 'citymarketid_1', 'city1', 'airportid_1',
       'airport_iata_1', 'airport_name_1', 'airport_name_concat_1', 'state_1',
       'latitude_1', 'longitude_1', 'citymarketid_2', 'city2', 'airportid_2',
       'airport_iata_2', 'airport_name_2', 'airport_name_concat_2', 'state_2',
       'latitude_2', 'longitude_2', 'nsmiles', 'passengers', 'fare',
       'carrier_lg', 'carrier_lg_name', 'carrier_lg_name_concat', 'large_ms',
       'fare_lg', 'carrier_low', 'carrier_low_name', 'carrier_low_name_concat',
       'lf_ms', 'fare_low'],
      dtype='object')

In [104]:
# Select column/feature names that we will use to train with
selected_feature_names = ['city1', 'city2', 'season', 'year', 'fare']

In [105]:
df = df.loc[:,selected_feature_names]
print(df.shape)


(245955, 5)


In [106]:
df

Unnamed: 0,city1,city2,season,year,fare
0,"Allentown/Bethlehem/Easton, PA","Tampa, FL (Metropolitan Area)",Summer,2021,81.43
1,"Allentown/Bethlehem/Easton, PA","Tampa, FL (Metropolitan Area)",Summer,2021,208.93
2,"Albuquerque, NM","Dallas/Fort Worth, TX",Summer,2021,184.56
3,"Albuquerque, NM","Dallas/Fort Worth, TX",Summer,2021,182.64
4,"Albuquerque, NM","Phoenix, AZ",Summer,2021,177.11
...,...,...,...,...,...
245950,"Knoxville, TN","New York City, NY (Metropolitan Area)",Winter,2024,278.70
245951,"Knoxville, TN","Miami, FL (Metropolitan Area)",Winter,2024,148.69
245952,"Knoxville, TN","Miami, FL (Metropolitan Area)",Winter,2024,330.19
245953,"Knoxville, TN","Tampa, FL (Metropolitan Area)",Winter,2024,95.65


In [107]:
# One-hot encode city1 
unique_values = df['city1'].unique()
print('Number of unique city1 values: ', unique_values.shape[0])
df = pd.get_dummies(df, columns=['city1'], prefix='city1')
print(df.shape)

Number of unique city1 values:  141
(245955, 145)


In [108]:
# One-hot encode city2
unique_values = df['city2'].unique()
print('Number of unique city2 values: ', unique_values.shape[0])
df = pd.get_dummies(df, columns=['city2'], prefix='city2')
print(df.shape)

Number of unique city2 values:  128
(245955, 272)


In [109]:
# One-hot encode season
unique_values = df['season'].unique()
print('Number of unique season values: ', unique_values.shape[0])
df = pd.get_dummies(df, columns=['season'], prefix='season')
print(df.shape)

Number of unique season values:  4
(245955, 275)


In [126]:
df

Unnamed: 0,year,fare,"city1_Albany, NY","city1_Albuquerque, NM","city1_Allentown/Bethlehem/Easton, PA","city1_Amarillo, TX","city1_Appleton, WI","city1_Asheville, NC","city1_Ashland, WV","city1_Aspen, CO",...,"city2_Vero Beach, FL","city2_Washington, DC (Metropolitan Area)","city2_West Palm Beach/Palm Beach, FL","city2_Wichita, KS","city2_Wilmington, NC","city2_Worcester, MA",season_Fall,season_Spring,season_Summer,season_Winter
0,2021,81.43,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,2021,208.93,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,2021,184.56,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,2021,182.64,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,2021,177.11,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245950,2024,278.70,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
245951,2024,148.69,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
245952,2024,330.19,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
245953,2024,95.65,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [116]:
# MAPE
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Data Train Test Split

In [110]:
X = df.drop('fare', axis=1)
y = df['fare']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [112]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(196764, 274)
(196764,)
(49191, 274)
(49191,)


# Decision Tree Regressor

### Train & Evaluate Example

In [117]:
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(X_train, y_train)


In [118]:
y_pred = regressor.predict(X_test)

# Using Mean Squared Error (MSE) to measure the performance
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 4060.4034240060773


In [121]:
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Mean Absolute Percentage Error (MAPE): {mape}%")

Mean Absolute Percentage Error (MAPE): 19.86351344359953%


### Hyperparameter Tuning Grid Search

In [173]:
param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 10, 20, 50, 75, 100, 150],
    'min_samples_leaf': [1, 2, 5, 10]
}
    
regressor = DecisionTreeRegressor(random_state=42)
grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
best_regressor = grid_search.best_estimator_

Fitting 5 folds for each of 168 candidates, totalling 840 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 75}


In [176]:
# Train Error
print("Train:")
y_pred = best_regressor.predict(X_train)

mse = mean_squared_error(y_train, y_pred)
print(f"Mean Squared Error: {mse}")
mape = mean_absolute_percentage_error(y_train, y_pred)
print(f"Mean Absolute Percentage Error (MAPE): {mape}%")


# Test Error
print("Test:")
y_pred = best_regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Mean Absolute Percentage Error (MAPE): {mape}%")

Train:
Mean Squared Error: 2894.112371385647
Mean Absolute Percentage Error (MAPE): 16.099708133467836%
Test:
Mean Squared Error: 3167.2193171722415
Mean Absolute Percentage Error (MAPE): 17.299399287152674%


# Random Forest Regressor

In [184]:
from sklearn.ensemble import RandomForestRegressor 


### Train & Evaluate Example

In [185]:
# Initialize the RandomForest model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Mean Absolute Percentage Error (MAPE): {mape}%")

Mean Squared Error: 3695.5216114120026
Mean Absolute Percentage Error (MAPE): 18.637089288790698%


### Hyperparameter Grid Search

In [186]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features':['sqrt', 'log2'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10,  50, 75, 100],
    'min_samples_leaf': [1, 2, 5, 10]
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", -grid_search.best_score_)

best_regressor = grid_search.best_estimator_

Fitting 3 folds for each of 576 candidates, totalling 1728 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  15.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=  15.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   8.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=  25.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=75, n_estimators=200; total time=  15.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=100, n_estimators=100; total time=   8.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=100, n_estimators=300; total time=  24.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_esti



[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  15.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   7.6s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=  24.6s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=50, n_estimators=200; total time=  16.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=75, n_estimators=100; total time=   8.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=75, n_estimators=300; total time=  24.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   9.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=  17.0s
[CV] END max_depth=10, max_features=s

In [187]:
# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", -grid_search.best_score_)

Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 50, 'n_estimators': 300}
Best score: 3336.757589953915


In [188]:
# Train Error
print("Train:")
y_pred = best_regressor.predict(X_train)

mse = mean_squared_error(y_train, y_pred)
print(f"Mean Squared Error: {mse}")
mape = mean_absolute_percentage_error(y_train, y_pred)
print(f"Mean Absolute Percentage Error (MAPE): {mape}%")


# Test Error
print("Test:")
y_pred = best_regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Mean Absolute Percentage Error (MAPE): {mape}%")

Train:
Mean Squared Error: 2995.8945509158657
Mean Absolute Percentage Error (MAPE): 17.124272453382506%
Test:
Mean Squared Error: 3160.9635354692277
Mean Absolute Percentage Error (MAPE): 17.962955200884743%
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 3.6min
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time= 1.6min
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=50, n_estimators=100; total time=  36.1s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=50, n_estimators=200; total time= 1.2min
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=75, n_estimators=200; total time= 1.1min
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=100, n_estimators=100; total time=  33.2s
[CV] END max_depth=None, max_features=log2, min_sampl