In [1]:
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# load in airports dataset
airports_df = pd.read_csv('airports.csv', names=['Code', 'Latitude', 'Longitude'], header=None)

# remove entries with invalid codes
airports_df = airports_df[airports_df['Code'] != '\\N']
airports_df.set_index('Code', inplace=True)
airports_df

Unnamed: 0_level_0,Latitude,Longitude
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
GKA,-6.081690,145.391998
MAG,-5.207080,145.789001
HGU,-5.826790,144.296005
LAE,-6.569803,146.725977
POM,-9.443380,147.220001
...,...,...
UGU,-3.739560,137.031998
ETM,29.723694,35.011416
MNH,23.640556,57.487500
CGY,8.612203,124.456496


In [3]:
# load in dataset of AirCanada routes originating from YYZ
yyz_routes_df = pd.read_csv('ac_routes.csv')
yyz_routes_df

Unnamed: 0,Departure city,Arrival city,Aircraft type,Distance (km),Frequency (days/wk)
0,YYZ,AUH,789,11132,3
1,YYZ,AMS,333,5987,7
2,YYZ,ANU,320,3382,2
3,YYZ,AUA,320,3577,2
4,YYZ,ATH,789,8140,3
...,...,...,...,...,...
124,YYZ,DCA,E75,580,7
125,YYZ,PBI,319,1893,7
126,YYZ,YQG,DH3,296,6
127,YYZ,YWG,320,1517,7


In [4]:
# check for missing data
print('Number of Missing Entries by Column')
yyz_routes_df.isnull().sum()

Number of Missing Entries by Column


Departure city         0
Arrival city           0
Aircraft type          0
Distance (km)          0
Frequency (days/wk)    0
dtype: int64

In [5]:
# check data types
yyz_routes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Departure city       129 non-null    object
 1   Arrival city         129 non-null    object
 2   Aircraft type        129 non-null    object
 3   Distance (km)        129 non-null    int64 
 4   Frequency (days/wk)  129 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 5.2+ KB


In [6]:
# numerical statistics
yyz_routes_df.describe()

Unnamed: 0,Distance (km),Frequency (days/wk)
count,129.0,129.0
mean,3500.674419,5.658915
std,2992.901146,2.001937
min,145.0,1.0
25%,1218.0,4.0
50%,2708.0,7.0
75%,4069.0,7.0
max,12601.0,7.0


In [7]:
# add latitude and longitude features
arrival_lat_long = yyz_routes_df['Arrival city'].apply(lambda x: airports_df.loc[x].to_numpy())
arrival_lat_long = pd.DataFrame(arrival_lat_long.to_list(), index=arrival_lat_long.index)
yyz_routes_df[['Arrival latitude', 'Arrival longitude']] = arrival_lat_long

# drop departure city feature since it is constant
yyz_routes_df.drop(columns='Departure city', inplace=True)

# drop arrival city feature since it is replace by lat/long
yyz_routes_df.drop(columns='Arrival city', inplace=True)
yyz_routes_df

Unnamed: 0,Aircraft type,Distance (km),Frequency (days/wk),Arrival latitude,Arrival longitude
0,789,11132,3,24.433001,54.651100
1,333,5987,7,52.308601,4.763890
2,320,3382,2,17.136700,-61.792702
3,320,3577,2,12.501400,-70.015198
4,789,8140,3,37.936401,23.944500
...,...,...,...,...,...
124,E75,580,7,38.852100,-77.037697
125,319,1893,7,26.683201,-80.095596
126,DH3,296,6,42.275600,-82.955597
127,320,1517,7,49.910000,-97.239899


In [8]:
# number of different aircraft types
yyz_routes_df.value_counts('Aircraft type')

Aircraft type
319    29
789    18
320    16
321    15
E75    12
333    10
CRJ     9
DH3     6
CR9     5
77W     4
788     2
763     1
77L     1
DH4     1
dtype: int64

In [9]:
# replace 'Aircraft type' feature with aircraft categories identified using KMeans model
with open('../cluster/identifiers_to_cat.p', 'rb') as f:
    identifiers_to_cat = pickle.load(f)

yyz_routes_df['Aircraft Category'] = yyz_routes_df['Aircraft type'].apply(lambda x: identifiers_to_cat[x])
yyz_routes_df['Aircraft Category'] = yyz_routes_df['Aircraft Category'].astype('category')
yyz_routes_df

Unnamed: 0,Aircraft type,Distance (km),Frequency (days/wk),Arrival latitude,Arrival longitude,Aircraft Category
0,789,11132,3,24.433001,54.651100,1
1,333,5987,7,52.308601,4.763890,1
2,320,3382,2,17.136700,-61.792702,0
3,320,3577,2,12.501400,-70.015198,0
4,789,8140,3,37.936401,23.944500,1
...,...,...,...,...,...,...
124,E75,580,7,38.852100,-77.037697,0
125,319,1893,7,26.683201,-80.095596,0
126,DH3,296,6,42.275600,-82.955597,2
127,320,1517,7,49.910000,-97.239899,0


## Preprocessing

In [10]:
# separate feature and label columns
X = yyz_routes_df[['Distance (km)', 'Frequency (days/wk)', 'Arrival latitude', 'Arrival longitude']]
y = yyz_routes_df['Aircraft Category']

In [11]:
X.shape

(129, 4)

In [12]:
y.shape

(129,)

In [13]:
# split into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
X_train.shape

(103, 4)

In [14]:
# scale training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

## Train a Model

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

forest_clf = RandomForestClassifier()
cross_val_score(forest_clf, X_train_scaled, y_train, cv=3)

array([0.88571429, 0.94117647, 0.88235294])

In [16]:
# this gives a good baseline but we can try to tune a few parameters
# using GridSearchCV
import numpy as np
from sklearn.model_selection import GridSearchCV

param_grid = [{
    'n_estimators': np.arange(10, 200, step=10),
    'bootstrap': [False, True],
    'max_features': ['auto', 'sqrt'],
    'max_depth': np.arange(10),
}]

grid_search = GridSearchCV(forest_clf, param_grid, cv=3)
grid_search.fit(X_train_scaled, y_train);

In [17]:
# the best parameters
grid_search.best_params_

{'bootstrap': True, 'max_depth': 5, 'max_features': 'auto', 'n_estimators': 10}

In [18]:
# accuracy of best model
grid_search.best_score_

0.9126050420168067

In [19]:
from sklearn.pipeline import Pipeline

# construct pipeline using best parameters
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(**grid_search.best_params_))
])

pipeline.fit(X_train, y_train);

## Evaluate on Test Set

In [20]:
from sklearn.metrics import accuracy_score

# calculate accuracy of model on test set
y_test_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.9615384615384616

In [21]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score

# compute precision and recall scores
precision = precision_score(y_test, y_test_pred, average='micro')
print(f'Precision: {precision}')

recall = recall_score(y_test, y_test_pred, average='micro')
print(f'Recall: {recall}')

# generate confusion matrix
display(confusion_matrix(y_test, y_test_pred))

Precision: 0.9615384615384616
Recall: 0.9615384615384616


array([[17,  0,  0],
       [ 0,  7,  0],
       [ 1,  0,  1]])

In [22]:
# save model
with open('aircraft_type_predictor.p', 'wb') as f:
    pickle.dump(pipeline, f)

## Predict a New Route
We will use our clustering and classification models to predict the ideal aicraft to serve a new route between Toronto (YYZ) and Seattle (SEA) that is served daily.

In [23]:
from predict_aircraft_type import predict_aircraft_type

# load our aicraft model to class id map
with open('../cluster/identifiers_to_cat.p', 'rb') as f:
    aircraft_id_to_class_map = pickle.load(f)

predict_aircraft_type(
    model_path='aircraft_type_predictor.p',
    dest_airport_code='SEA',
    aircraft_identifier_map=aircraft_id_to_class_map,
    n_days_with_flights=7
)

['77W', '77L', '333', '789', '788', '763']

Based on our model, we should be using a long range widebody aircraft to serve this new route