In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import tree
from sklearn import preprocessing
from sklearn.preprocessing import label_binarize
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
import datetime

In [55]:

# getting data from full dataset
dat = pd.read_csv("../data/Cleaned_Data.csv")
fuldat = pd.read_excel("../raw_data/airport_choice_survey_EN_ver2.0_Capstone.xlsx",sheet_name='Data', header =0)

In [28]:
#drop unused columns
dat = dat.drop(columns =["ID","FrequentDestination1",
                             "FrequentDestination2",
                             "FrequentDestination3",
                             "FrequentDestination4",
                             "FrequentDestination5",
                             "FrequentDestination6",
                             "FrequentDestination7",
                        "TotalDepartureHr","MileageAirline"], axis =1)

In [29]:
# create data for airport model
Xap = dat.loc[:, ~dat.columns.isin(['Airport'])]
yap = dat['Airport'].astype('category').cat.codes
yap.name= 'Airport' # this one is for name the tree graph
# prepare data for airline model
Xal = dat.drop(columns =["Airline"], axis =1)
yal = dat['Airline'].astype('category').cat.codes
yal.name = "Airline" # this one is for name the tree graph

In [39]:
# decision tree function, return trained model
def decision_tree(X_train, y_train, maxdepth = 10, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 ):
    now =str(datetime.datetime.now())
    clf =tree.DecisionTreeClassifier(class_weight=None, 
                                     criterion ='gini', 
                                     max_depth =maxdepth, 
                                     max_features = max_feature, 
                                     max_leaf_nodes= maxleaf,
                                    min_samples_leaf = minsamleaf, min_samples_split =minsamsplit,
                                    min_weight_fraction_leaf =0.0, presort =False, random_state =100, splitter = 'best')
    clf = clf.fit(X_train, y_train)
    tree.export_graphviz(clf,out_file=str(y_train.name)+now +".dot",feature_names=X_train.columns)
    return(clf)

# testing decision tree model, return the accuracy
# multi = false => Y only have 0 or 1
def test_decisiont_tree(X_test, y_test, model, multi = False):
    y_pred =model.predict(X_test)
    if multi==True:        
        print(metrics.confusion_matrix(y_test,y_pred))
        print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
        print(classification_report(y_test, y_pred, target_names=['Korean Air(KE)','Asiana Airlines','Korean LCC','Foreign Airlines']))
    else:
        print(metrics.confusion_matrix(y_test,y_pred))
        print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
        print("Recall:", metrics.recall_score(y_test, y_pred))
        print("Precision:",metrics.precision_score(y_test, y_pred))
        print("F measure:",metrics.f1_score(y_test, y_pred))

# k-fold validation
def kfold_test(model, X, y, k):
    kf = KFold(n_splits=k)
    kf.get_n_splits(X)
    acc= 0
    for train_index, test_index in kf.split(X):
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]
        X_train, y_train= X.iloc[train_index], y.iloc[train_index]
        model = model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc =acc + metrics.accuracy_score(y_test, y_pred)
        print(metrics.confusion_matrix(y_test,y_pred))
        print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print("Average accuracy: ", acc/k) # return the average accuracy

### old function
def decision_tree_bin(Xset, Yset, maxdepth = 10, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 ):
    X_train, X_test, y_train, y_test = train_test_split(Xset, Yset, test_size=0.30,random_state=109)
    clf =tree.DecisionTreeClassifier(class_weight=None, 
                                     criterion ='gini', 
                                     max_depth =maxdepth, 
                                     max_features = max_feature, 
                                     max_leaf_nodes= maxleaf,
                                    min_samples_leaf = minsamleaf, min_samples_split =minsamsplit,
                                    min_weight_fraction_leaf =0.0, presort =False, random_state =100, splitter = 'best')
    clf = clf.fit(X_train, y_train)
    y_pred =clf.predict(X_test)
    print(metrics.confusion_matrix(y_test,y_pred))
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print("Recall:", metrics.recall_score(y_test, y_pred))
    print("Precision:",metrics.precision_score(y_test, y_pred))
    print("F measure:",metrics.f1_score(y_test, y_pred))
    tree.export_graphviz(clf,out_file="airport.dot",feature_names=Xset.columns)

#old function, dont run
def decision_tree_multi(Xset, Yset, maxdepth = 10, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 ):
    X_train, X_test, y_train, y_test = train_test_split(Xset, Yset, test_size=0.30,random_state=109)
    clf =tree.DecisionTreeClassifier(class_weight=None, 
                                     criterion ='gini', 
                                     max_depth =maxdepth, 
                                     max_features = max_feature, 
                                     max_leaf_nodes= maxleaf,
                                    min_samples_leaf = minsamleaf, min_samples_split =minsamsplit,
                                    min_weight_fraction_leaf =0.0, presort =False, random_state =100, splitter = 'best'
                                    )
    clf = clf.fit(X_train, y_train)
    y_pred =clf.predict(X_test)
    print(metrics.confusion_matrix(y_test,y_pred))
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred, target_names=['Korean Air(KE)','Asiana Airlines','Korean LCC','Foreign Airlines']))
    dot_data= tree.export_graphviz(clf,out_file="airline.dot",feature_names=Xset.columns)

## Model training
### Airport choice model

#### A. Default prameters

In [71]:
# train airport model with default parameter
X_train, X_test, y_train, y_test = train_test_split(Xap, yap, test_size=0.30,random_state=6)
airport_model_default = decision_tree(X_train, y_train)#, maxdepth = 10, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 )
#testing model training accuracy
test_decisiont_tree(X_train,y_train,airport_model_default, False)#, maxdepth = 5,max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2)

#testing with 30% testing
test_decisiont_tree(X_test,y_test,airport_model_default, False)

[[160   0]
 [  0 164]]
Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F measure: 1.0
[[55 14]
 [13 57]]
Accuracy: 0.8057553956834532
Recall: 0.8142857142857143
Precision: 0.8028169014084507
F measure: 0.8085106382978723


#### B. With different parameter

In [86]:
airport_model_maxdepth10 = decision_tree(X_train, y_train, maxdepth = 8, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 )
#testing the result
print("maxdepth = 5, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 ")
test_decisiont_tree(X_train,y_train, airport_model_maxdepth10, False)#, maxdepth = 5,max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2)

#testing with 30% testing
test_decisiont_tree(X_test,y_test,airport_model_maxdepth10, False)

maxdepth = 5, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 
[[157   3]
 [  2 162]]
Accuracy: 0.9845679012345679
Recall: 0.9878048780487805
Precision: 0.9818181818181818
F measure: 0.9848024316109422
[[54 15]
 [11 59]]
Accuracy: 0.8129496402877698
Recall: 0.8428571428571429
Precision: 0.7972972972972973
F measure: 0.8194444444444444


### Airline choice model
#### A. with default parameters

In [87]:
# trainning airline model with default parameter
X_train, X_test, y_train, y_test = train_test_split(Xal,yal, test_size=0.30,random_state=6)
airline_model_default = decision_tree(X_train, y_train)#, maxdepth = 10, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 )
# testing airline model with train set
test_decisiont_tree(X_train,y_train,airline_model_default, True)#, maxdepth = 5,max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2)

# testing airline model with test set
test_decisiont_tree(X_test,y_test,airline_model_default, True)

[[113   0   0   0]
 [  0  68   0   0]
 [  0   0  51   0]
 [  3   0   0  89]]
Accuracy: 0.9907407407407407
                  precision    recall  f1-score   support

  Korean Air(KE)       0.97      1.00      0.99       113
 Asiana Airlines       1.00      1.00      1.00        68
      Korean LCC       1.00      1.00      1.00        51
Foreign Airlines       1.00      0.97      0.98        92

     avg / total       0.99      0.99      0.99       324

[[20  2  2 12]
 [ 9 13  2 10]
 [ 6  4 12  5]
 [14  9  4 15]]
Accuracy: 0.4316546762589928
                  precision    recall  f1-score   support

  Korean Air(KE)       0.41      0.56      0.47        36
 Asiana Airlines       0.46      0.38      0.42        34
      Korean LCC       0.60      0.44      0.51        27
Foreign Airlines       0.36      0.36      0.36        42

     avg / total       0.44      0.43      0.43       139



#### B. With different parameters

In [105]:
## trainning airline model with default parameter
X_train, X_test, y_train, y_test = train_test_split(Xal,yal, test_size=0.30,random_state=6)
airline_model_10_1_2 = decision_tree(X_train, y_train, maxdepth =11, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit =2 )
# testing airline model
print("maxdepth = 11, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 ")
test_decisiont_tree(X_train,y_train,airline_model_10_1_2, True)#, maxdepth = 5,max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2)
# testing airline model with test set
test_decisiont_tree(X_test,y_test,airline_model_10_1_2, True)

maxdepth = 11, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 
[[113   0   0   0]
 [  0  68   0   0]
 [  0   0  51   0]
 [  0   0   0  92]]
Accuracy: 1.0
                  precision    recall  f1-score   support

  Korean Air(KE)       1.00      1.00      1.00       113
 Asiana Airlines       1.00      1.00      1.00        68
      Korean LCC       1.00      1.00      1.00        51
Foreign Airlines       1.00      1.00      1.00        92

     avg / total       1.00      1.00      1.00       324

[[19  2  2 13]
 [ 9 10  2 13]
 [ 5  5 13  4]
 [10  8  4 20]]
Accuracy: 0.4460431654676259
                  precision    recall  f1-score   support

  Korean Air(KE)       0.44      0.53      0.48        36
 Asiana Airlines       0.40      0.29      0.34        34
      Korean LCC       0.62      0.48      0.54        27
Foreign Airlines       0.40      0.48      0.43        42

     avg / total       0.45      0.45      0.44       139



## Model testing (validation)
Method: K-fold

### Airport

In [106]:
#K =5
kfold_test(airport_model_default, Xap, yap, 5)
#K =10
kfold_test(airport_model_default, Xap, yap, 4)

[[64 29]
 [ 0  0]]
Accuracy: 0.6881720430107527
[[78 15]
 [ 0  0]]
Accuracy: 0.8387096774193549
[[28 15]
 [12 38]]
Accuracy: 0.7096774193548387
[[ 0  0]
 [20 72]]
Accuracy: 0.782608695652174
[[ 0  0]
 [22 70]]
Accuracy: 0.7608695652173914
Average accuracy:  0.7560074801309022
[[91 25]
 [ 0  0]]
Accuracy: 0.7844827586206896
[[87 26]
 [ 0  3]]
Accuracy: 0.7758620689655172
[[ 0  0]
 [43 73]]
Accuracy: 0.6293103448275862
[[ 0  0]
 [20 95]]
Accuracy: 0.8260869565217391
Average accuracy:  0.7539355322338831


In [107]:
#K =5
kfold_test(airport_model_maxdepth10, Xap, yap, 5)
#K =10
kfold_test(airport_model_maxdepth10, Xap, yap, 4)

[[65 28]
 [ 0  0]]
Accuracy: 0.6989247311827957
[[78 15]
 [ 0  0]]
Accuracy: 0.8387096774193549
[[29 14]
 [13 37]]
Accuracy: 0.7096774193548387
[[ 0  0]
 [21 71]]
Accuracy: 0.7717391304347826
[[ 0  0]
 [21 71]]
Accuracy: 0.7717391304347826
Average accuracy:  0.7581580177653109
[[88 28]
 [ 0  0]]
Accuracy: 0.7586206896551724
[[88 25]
 [ 0  3]]
Accuracy: 0.7844827586206896
[[ 0  0]
 [39 77]]
Accuracy: 0.6637931034482759
[[ 0  0]
 [18 97]]
Accuracy: 0.8434782608695652
Average accuracy:  0.7625937031484258


In [35]:
#K-fold testing Airline model
# k =5
kfold_test(airline_model_10_1_2, Xal, yal, 5)
#k=4
kfold_test(airline_model_10_1_2, Xal, yal, 4)

[[18  3  4  8]
 [ 5  8  1  2]
 [ 3  2 16  8]
 [ 7  1  2  5]]
Accuracy: 0.5053763440860215
[[20  4  2  4]
 [ 4  8  4  8]
 [11  0  9  3]
 [ 3  1  7  5]]
Accuracy: 0.45161290322580644
[[ 9  6  4  6]
 [11 13  0 14]
 [ 1  1  2  1]
 [ 6  5  4 10]]
Accuracy: 0.3655913978494624
[[12  4  2 12]
 [ 4  3  0  6]
 [ 4  0  3  2]
 [14  9  3 14]]
Accuracy: 0.34782608695652173
[[17  6  0  8]
 [ 3  6  0  2]
 [ 4  0  7  1]
 [10  5  2 21]]
Accuracy: 0.5543478260869565
Average accuracy:  0.4449509116409537
[[23  4  4  9]
 [ 6  8  1  5]
 [ 5  5 22  4]
 [ 7  5  2  6]]
Accuracy: 0.5086206896551724
[[18  6  5  7]
 [ 5 12  9 10]
 [ 1  0 17  2]
 [ 7  2  7  8]]
Accuracy: 0.47413793103448276
[[10  7  4 12]
 [ 8  8  1 14]
 [ 1  1  4  2]
 [16  9  2 17]]
Accuracy: 0.33620689655172414
[[16 10  0 14]
 [ 2 10  0  3]
 [ 2  0  9  3]
 [ 5 12  2 27]]
Accuracy: 0.5391304347826087
Average accuracy:  0.464523988005997


In [108]:
#K-fold testing Airline model, default parameter
# k =5
kfold_test(airline_model_default, Xal, yal, 5)
#k=4
kfold_test(airline_model_default, Xal, yal, 4)

[[18  3  4  8]
 [ 5  8  1  2]
 [ 3  2 16  8]
 [ 7  1  2  5]]
Accuracy: 0.5053763440860215
[[20  4  2  4]
 [ 4  8  4  8]
 [11  0  9  3]
 [ 3  1  7  5]]
Accuracy: 0.45161290322580644
[[ 9  6  4  6]
 [11 13  0 14]
 [ 1  1  2  1]
 [ 6  5  4 10]]
Accuracy: 0.3655913978494624
[[12  4  2 12]
 [ 4  3  0  6]
 [ 4  0  3  2]
 [14  9  3 14]]
Accuracy: 0.34782608695652173
[[17  6  0  8]
 [ 3  6  0  2]
 [ 4  0  7  1]
 [10  5  2 21]]
Accuracy: 0.5543478260869565
Average accuracy:  0.4449509116409537
[[23  4  4  9]
 [ 6  8  1  5]
 [ 5  5 22  4]
 [ 7  5  2  6]]
Accuracy: 0.5086206896551724
[[18  6  5  7]
 [ 5 12  9 10]
 [ 1  0 17  2]
 [ 7  2  7  8]]
Accuracy: 0.47413793103448276
[[10  7  4 12]
 [ 8  8  1 14]
 [ 1  1  4  2]
 [16  9  2 17]]
Accuracy: 0.33620689655172414
[[16 10  0 14]
 [ 2 10  0  3]
 [ 2  0  9  3]
 [ 5 12  2 27]]
Accuracy: 0.5391304347826087
Average accuracy:  0.464523988005997


In [582]:
#kfold test


In [586]:
k=5
kf = KFold(n_splits=5)
kf.get_n_splits(Xap)
acc= 0
for train_index, test_index in kf.split(Xap):
    #print("Train Index: ", train_index, "\n")
    #print("Test Index: ", test_index)
    X_train, X_test, y_train, y_test = Xap.iloc[train_index], Xap.iloc[test_index], yap.iloc[train_index], yap.iloc[test_index]
    clf =tree.DecisionTreeClassifier(class_weight=None, 
                                     criterion ='gini', 
                                     max_depth =11, 
                                     max_features = None, 
                                     max_leaf_nodes= None,
                                    min_samples_leaf = 1, min_samples_split =2,
                                    min_weight_fraction_leaf =0.0, presort =False, random_state =100, splitter = 'best'
                                    )
    clf = clf.fit(X_train, y_train)
    y_pred =clf.predict(X_test)
    acc =acc+metrics.accuracy_score(y_test, y_pred)
    #print("K=",k)
    #print(metrics.confusion_matrix(y_test,y_pred))
    #print("Accuracy:",round(metrics.accuracy_score(y_test, y_pred),3))
    #print("Recall:", round(metrics.recall_score(y_test, y_pred),3))
    #print("Precision:",round(metrics.precision_score(y_test, y_pred),3))
    #print("F measure:",round(metrics.f1_score(y_test, y_pred),3))
    #k=k+1
    #dot_data= tree.export_graphviz(clf,out_file="airport.dot",feature_names=X.columns)
print("Average accuracy: ", acc/5)

Average accuracy:  0.7688405797101449


In [321]:
kfold_test(airline_model, Xal, yal, 5)

Average accuracy:  0.859654043945769


In [201]:
kf = KFold(n_splits=5)
kf.get_n_splits(Xal)
acc=0
for train_index, test_index in kf.split(Xal):
    #print("Train Index: ", train_index, "\n")
    #print("Test Index: ", test_index)
    X_train, X_test, y_train, y_test = Xal.iloc[train_index], Xal.iloc[test_index], yal.iloc[train_index], yal.iloc[test_index]
    clf =tree.DecisionTreeClassifier(class_weight=None, 
                                     criterion ='gini', 
                                     max_depth =15, 
                                     #max_features = None, 
                                     #max_leaf_nodes= None,
                                    #min_samples_leaf = 1, min_samples_split =2,
                                    #min_weight_fraction_leaf =0.0, presort =False, random_state =100, splitter = 'best'
                                    )
    clf = clf.fit(X_train, y_train)
    y_pred =clf.predict(X_test)
    acc= acc+ metrics.accuracy_score(y_test, y_pred)
    #print("K=",k)
    #print(metrics.confusion_matrix(y_test,y_pred))
    #print("Accuracy:",round(metrics.accuracy_score(y_test, y_pred),3))
    #print(classification_report(y_test, y_pred, target_names=['Korean Air(KE)','Asiana Airlines','Korean LCC','Foreign Airlines']))
    #k=k+1

print("Average accuracy: ", acc/5)

Average accuracy:  0.4806451612903226


In [137]:
max_depths = np.linspace(1, 32,32, endpoint=True)
acc_list = []
for max_depth in max_depths:
    dt = tree.DecisionTreeClassifier(max_depth=max_depth)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    acc= metrics.accuracy_score(y_test, y_pred)
    acc_list.append(acc)
   # Add auc score to previous test results
maxdeoth_resunt = pd.DataFrame({"Max_depth":max_depths, "Accuracy":acc_list})

In [138]:
maxdeoth_resunt

Unnamed: 0,Max_depth,Accuracy
0,1.0,0.565217
1,2.0,0.369565
2,3.0,0.521739
3,4.0,0.456522
4,5.0,0.521739
5,6.0,0.413043
6,7.0,0.434783
7,8.0,0.391304
8,9.0,0.456522
9,10.0,0.521739


In [139]:
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
minsamsplit_acc = []
for min_samples_split in min_samples_splits:
    dt = tree.DecisionTreeClassifier(min_samples_split=min_samples_split)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    acc= metrics.accuracy_score(y_test, y_pred)
    minsamsplit_acc.append(acc)
minsamsplit_result = pd.DataFrame({"Min sample split":min_samples_splits, "Accuracy":minsamsplit_acc})
minsamsplit_result

Unnamed: 0,Min sample split,Accuracy
0,0.1,0.586957
1,0.2,0.5
2,0.3,0.608696
3,0.4,0.608696
4,0.5,0.608696
5,0.6,0.543478
6,0.7,0.369565
7,0.8,0.565217
8,0.9,0.565217
9,1.0,0.565217


In [143]:
#min_samples_leaf

min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True)
minsamleaf_acc = []
for min_samples_leaf in min_samples_leafs:
    dt = tree.DecisionTreeClassifier(min_samples_leaf=min_samples_leaf)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    acc= metrics.accuracy_score(y_test, y_pred)
    minsamleaf_acc.append(acc)
minsamleaf_result = pd.DataFrame({"Min sample leaf":min_samples_leafs, "Accuracy":minsamleaf_acc})
minsamleaf_result

Unnamed: 0,Min sample leaf,Accuracy
0,0.1,0.456522
1,0.2,0.695652
2,0.3,0.521739
3,0.4,0.521739
4,0.5,0.369565


In [62]:
#max_features

max_features = list(range(1,Xal.shape[1]))
max_features_acc = []
for max_feature in max_features:
    dt = tree.DecisionTreeClassifier(max_features=max_feature)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    acc= metrics.accuracy_score(y_test, y_pred)
    max_features_acc.append(acc)
max_features_result = pd.DataFrame({"Min sample split":max_features, "Accuracy":max_features_acc})
max_features_result

Unnamed: 0,Min sample split,Accuracy
0,1,0.438849
1,2,0.446043
2,3,0.388489
3,4,0.438849
4,5,0.47482
5,6,0.438849
6,7,0.503597
7,8,0.57554
8,9,0.510791
9,10,0.453237
