In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import tree
from sklearn import preprocessing
from sklearn.preprocessing import label_binarize
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold

In [214]:
# getting data from full dataset
dat = pd.read_csv("../data/Cleaned_Data.csv")
dat.head(5)

Unnamed: 0,ID,Airport,Airline,Age,Gender,Nationality,TripPurpose,TripDuration,FlyingCompanion,ProvinceResidence,...,FrequentDestination3,FrequentDestination4,FrequentDestination5,FrequentDestination6,FrequentDestination7,MileageAirline1,MileageAirline2,MileageAirline3,MileageAirline4,MileageAirline5
0,1,1,1,49,1,1,2,7,0,3,...,0,0,0,0,0,1,0,0,0,0
1,2,1,1,49,2,1,1,4,4,3,...,0,0,0,0,0,0,0,0,0,1
2,3,1,1,25,1,1,1,10,2,3,...,0,0,0,0,0,0,0,0,0,1
3,4,1,1,29,1,1,2,7,2,3,...,0,0,1,0,0,0,1,0,0,0
4,5,1,1,34,2,1,2,4,0,3,...,0,0,0,0,0,0,0,0,0,1


In [None]:
#drop unused columns
dat = dat.drop(columns =["ID","FrequentDestination1",
                             "FrequentDestination2",
                             "FrequentDestination3",
                             "FrequentDestination4",
                             "FrequentDestination5",
                             "FrequentDestination6",
                             "FrequentDestination7"], axis =1)

In [534]:
# create data for airport model
Xap = dat.loc[:, ~dat.columns.isin(['Airport'])]
yap = dat['Airport'].astype('category').cat.codes
yap.name= 'Airport' # this one is for name the tree graph
# prepare data for airline model
Xal = dat.drop(columns =["Airline"], axis =1)
yal = dat['Airline'].astype('category').cat.codes
yal.name = "Airline" # this one is for name the tree graph

In [588]:
# decision tree function, return trained model
def decision_tree(X_train, y_train, maxdepth = 10, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 ):
    clf =tree.DecisionTreeClassifier(class_weight=None, 
                                     criterion ='gini', 
                                     max_depth =maxdepth, 
                                     max_features = max_feature, 
                                     max_leaf_nodes= maxleaf,
                                    min_samples_leaf = minsamleaf, min_samples_split =minsamsplit,
                                    min_weight_fraction_leaf =0.0, presort =False, random_state =100, splitter = 'best')
    clf = clf.fit(X_train, y_train)
    tree.export_graphviz(clf,out_file=str(y_train.name)+".dot",feature_names=X_train.columns)
    return(clf)

# testing decision tree model, return the accuracy
# multi = false => Y only have 0 or 1
def test_decisiont_tree(X_test, y_test, model, multi = False):
    y_pred =model.predict(X_test)
    if multi==True:        
        print(metrics.confusion_matrix(y_test,y_pred))
        print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
        print(classification_report(y_test, y_pred, target_names=['Korean Air(KE)','Asiana Airlines','Korean LCC','Foreign Airlines']))
    else:
        print(metrics.confusion_matrix(y_test,y_pred))
        print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
        print("Recall:", metrics.recall_score(y_test, y_pred))
        print("Precision:",metrics.precision_score(y_test, y_pred))
        print("F measure:",metrics.f1_score(y_test, y_pred))

# k-fold validation
def kfold_test(model, X, y, k):
    kf = KFold(n_splits=k)
    kf.get_n_splits(X)
    acc= 0
    for train_index, test_index in kf.split(X):
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]
        y_pred = model.predict(X_test)
        acc =acc + metrics.accuracy_score(y_test, y_pred)
        print(metrics.confusion_matrix(y_test,y_pred))
        print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print("Average accuracy: ", acc/k) # return the average accuracy

### old function
def decision_tree_bin(Xset, Yset, maxdepth = 10, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 ):
    X_train, X_test, y_train, y_test = train_test_split(Xset, Yset, test_size=0.30,random_state=109)
    clf =tree.DecisionTreeClassifier(class_weight=None, 
                                     criterion ='gini', 
                                     max_depth =maxdepth, 
                                     max_features = max_feature, 
                                     max_leaf_nodes= maxleaf,
                                    min_samples_leaf = minsamleaf, min_samples_split =minsamsplit,
                                    min_weight_fraction_leaf =0.0, presort =False, random_state =100, splitter = 'best')
    clf = clf.fit(X_train, y_train)
    y_pred =clf.predict(X_test)
    print(metrics.confusion_matrix(y_test,y_pred))
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print("Recall:", metrics.recall_score(y_test, y_pred))
    print("Precision:",metrics.precision_score(y_test, y_pred))
    print("F measure:",metrics.f1_score(y_test, y_pred))
    tree.export_graphviz(clf,out_file="airport.dot",feature_names=Xset.columns)

#old function, dont run
def decision_tree_multi(Xset, Yset, maxdepth = 10, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 ):
    X_train, X_test, y_train, y_test = train_test_split(Xset, Yset, test_size=0.30,random_state=109)
    clf =tree.DecisionTreeClassifier(class_weight=None, 
                                     criterion ='gini', 
                                     max_depth =maxdepth, 
                                     max_features = max_feature, 
                                     max_leaf_nodes= maxleaf,
                                    min_samples_leaf = minsamleaf, min_samples_split =minsamsplit,
                                    min_weight_fraction_leaf =0.0, presort =False, random_state =100, splitter = 'best'
                                    )
    clf = clf.fit(X_train, y_train)
    y_pred =clf.predict(X_test)
    print(metrics.confusion_matrix(y_test,y_pred))
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred, target_names=['Korean Air(KE)','Asiana Airlines','Korean LCC','Foreign Airlines']))
    dot_data= tree.export_graphviz(clf,out_file="airline.dot",feature_names=Xset.columns)

## Model training
### Airport choice model

#### A. Default prameters

In [579]:
# train airport model with default parameter
X_train, X_test, y_train, y_test = train_test_split(Xap, yap, test_size=0.30,random_state=6)
airport_model_default = decision_tree(X_train, y_train)#, maxdepth = 10, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 )
#testing the result
test_decisiont_tree(X_train,y_train,airport_model_default, False)#, maxdepth = 5,max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2)

[[159   1]
 [  0 164]]
Accuracy: 0.9969135802469136
Recall: 1.0
Precision: 0.9939393939393939
F measure: 0.9969604863221885


#### B. With different parameter

In [552]:
airport_model_maxdepth10 = decision_tree(X_train, y_train, maxdepth = 10, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 )
#testing the result
print("maxdepth = 10, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 ")
test_decisiont_tree(X_train,y_train, airport_model_maxdepth10, False)#, maxdepth = 5,max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2)

maxdepth = 10, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 
[[159   1]
 [  0 164]]
Accuracy: 0.9969135802469136
Recall: 1.0
Precision: 0.9939393939393939
F measure: 0.9969604863221885


### Airline choice model
#### A. with default parameters

In [555]:
# trainning airline model with default parameter
X_train, X_test, y_train, y_test = train_test_split(Xal,yal, test_size=0.30,random_state=6)
airline_model_default = decision_tree(X_train, y_train)#, maxdepth = 10, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 )
# testing airline model
test_decisiont_tree(X_train,y_train,airline_model_default, True)#, maxdepth = 5,max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2)

[[112   0   0   1]
 [  0  68   0   0]
 [  0   0  51   0]
 [  0   0   0  92]]
Accuracy: 0.9969135802469136
                  precision    recall  f1-score   support

  Korean Air(KE)       1.00      0.99      1.00       113
 Asiana Airlines       1.00      1.00      1.00        68
      Korean LCC       1.00      1.00      1.00        51
Foreign Airlines       0.99      1.00      0.99        92

     avg / total       1.00      1.00      1.00       324



#### B. With different parameters

In [562]:
# trainning airline model with default parameter
X_train, X_test, y_train, y_test = train_test_split(Xal,yal, test_size=0.30,random_state=6)
airline_model_10_1_2 = decision_tree(X_train, y_train, maxdepth =10, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit =2 )
# testing airline model
print("maxdepth = 10, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 ")
test_decisiont_tree(X_train,y_train,airline_model_10_1_2, True)#, maxdepth = 5,max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2)

maxdepth = 10, max_feature= None, maxleaf =None, minsamleaf =1, minsamsplit = 2 
[[112   0   0   1]
 [  0  68   0   0]
 [  0   0  51   0]
 [  0   0   0  92]]
Accuracy: 0.9969135802469136
                  precision    recall  f1-score   support

  Korean Air(KE)       1.00      0.99      1.00       113
 Asiana Airlines       1.00      1.00      1.00        68
      Korean LCC       1.00      1.00      1.00        51
Foreign Airlines       0.99      1.00      0.99        92

     avg / total       1.00      1.00      1.00       324



## Model testing (validation)
Method: K-fold

### Airport

In [592]:
#K =4
kfold_test(airport_model_maxdepth10, Xap, yap, 4)
#K =5
kfold_test(airport_model_maxdepth10, Xap, yap, 5)

[[111   5]
 [  0   0]]
Accuracy: 0.9568965517241379
[[107   6]
 [  0   3]]
Accuracy: 0.9482758620689655
[[  0   0]
 [  4 112]]
Accuracy: 0.9655172413793104
[[  0   0]
 [  4 111]]
Accuracy: 0.9652173913043478
Average accuracy:  0.9589767616191904
[[89  4]
 [ 0  0]]
Accuracy: 0.956989247311828
[[87  6]
 [ 0  0]]
Accuracy: 0.9354838709677419
[[42  1]
 [ 3 47]]
Accuracy: 0.956989247311828
[[ 0  0]
 [ 2 90]]
Accuracy: 0.9782608695652174
[[ 0  0]
 [ 3 89]]
Accuracy: 0.967391304347826
Average accuracy:  0.9590229079008882


In [587]:
#K-fold testing
kfold_test(airline_model, Xal, yal, 5)

Average accuracy:  0.810004675081814


In [232]:
#airline = pd.read_csv("../data/Airline_Dataset.csv")
#airline

[[25  4  0  7]
 [ 7 14  3 10]
 [ 2  4 13  8]
 [12  3  4 23]]
Accuracy: 0.539568345323741
                  precision    recall  f1-score   support

  Korean Air(KE)       0.54      0.69      0.61        36
 Asiana Airlines       0.56      0.41      0.47        34
      Korean LCC       0.65      0.48      0.55        27
Foreign Airlines       0.48      0.55      0.51        42

     avg / total       0.55      0.54      0.54       139



In [382]:
kfold_test(airline_model, Xal, yal, 5)

Average accuracy:  0.859654043945769


In [582]:
#kfold test


In [586]:
k=5
kf = KFold(n_splits=5)
kf.get_n_splits(Xap)
acc= 0
for train_index, test_index in kf.split(Xap):
    #print("Train Index: ", train_index, "\n")
    #print("Test Index: ", test_index)
    X_train, X_test, y_train, y_test = Xap.iloc[train_index], Xap.iloc[test_index], yap.iloc[train_index], yap.iloc[test_index]
    clf =tree.DecisionTreeClassifier(class_weight=None, 
                                     criterion ='gini', 
                                     max_depth =11, 
                                     max_features = None, 
                                     max_leaf_nodes= None,
                                    min_samples_leaf = 1, min_samples_split =2,
                                    min_weight_fraction_leaf =0.0, presort =False, random_state =100, splitter = 'best'
                                    )
    clf = clf.fit(X_train, y_train)
    y_pred =clf.predict(X_test)
    acc =acc+metrics.accuracy_score(y_test, y_pred)
    #print("K=",k)
    #print(metrics.confusion_matrix(y_test,y_pred))
    #print("Accuracy:",round(metrics.accuracy_score(y_test, y_pred),3))
    #print("Recall:", round(metrics.recall_score(y_test, y_pred),3))
    #print("Precision:",round(metrics.precision_score(y_test, y_pred),3))
    #print("F measure:",round(metrics.f1_score(y_test, y_pred),3))
    #k=k+1
    #dot_data= tree.export_graphviz(clf,out_file="airport.dot",feature_names=X.columns)
print("Average accuracy: ", acc/5)

Average accuracy:  0.7688405797101449


In [321]:
kfold_test(airline_model, Xal, yal, 5)

Average accuracy:  0.859654043945769


In [201]:
kf = KFold(n_splits=5)
kf.get_n_splits(Xal)
acc=0
for train_index, test_index in kf.split(Xal):
    #print("Train Index: ", train_index, "\n")
    #print("Test Index: ", test_index)
    X_train, X_test, y_train, y_test = Xal.iloc[train_index], Xal.iloc[test_index], yal.iloc[train_index], yal.iloc[test_index]
    clf =tree.DecisionTreeClassifier(class_weight=None, 
                                     criterion ='gini', 
                                     max_depth =15, 
                                     #max_features = None, 
                                     #max_leaf_nodes= None,
                                    #min_samples_leaf = 1, min_samples_split =2,
                                    #min_weight_fraction_leaf =0.0, presort =False, random_state =100, splitter = 'best'
                                    )
    clf = clf.fit(X_train, y_train)
    y_pred =clf.predict(X_test)
    acc= acc+ metrics.accuracy_score(y_test, y_pred)
    #print("K=",k)
    #print(metrics.confusion_matrix(y_test,y_pred))
    #print("Accuracy:",round(metrics.accuracy_score(y_test, y_pred),3))
    #print(classification_report(y_test, y_pred, target_names=['Korean Air(KE)','Asiana Airlines','Korean LCC','Foreign Airlines']))
    #k=k+1

print("Average accuracy: ", acc/5)

Average accuracy:  0.4806451612903226


In [137]:
max_depths = np.linspace(1, 32,32, endpoint=True)
acc_list = []
for max_depth in max_depths:
    dt = tree.DecisionTreeClassifier(max_depth=max_depth)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    acc= metrics.accuracy_score(y_test, y_pred)
    acc_list.append(acc)
   # Add auc score to previous test results
maxdeoth_resunt = pd.DataFrame({"Max_depth":max_depths, "Accuracy":acc_list})

In [138]:
maxdeoth_resunt

Unnamed: 0,Max_depth,Accuracy
0,1.0,0.565217
1,2.0,0.369565
2,3.0,0.521739
3,4.0,0.456522
4,5.0,0.521739
5,6.0,0.413043
6,7.0,0.434783
7,8.0,0.391304
8,9.0,0.456522
9,10.0,0.521739


In [139]:
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
minsamsplit_acc = []
for min_samples_split in min_samples_splits:
    dt = tree.DecisionTreeClassifier(min_samples_split=min_samples_split)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    acc= metrics.accuracy_score(y_test, y_pred)
    minsamsplit_acc.append(acc)
minsamsplit_result = pd.DataFrame({"Min sample split":min_samples_splits, "Accuracy":minsamsplit_acc})
minsamsplit_result

Unnamed: 0,Min sample split,Accuracy
0,0.1,0.586957
1,0.2,0.5
2,0.3,0.608696
3,0.4,0.608696
4,0.5,0.608696
5,0.6,0.543478
6,0.7,0.369565
7,0.8,0.565217
8,0.9,0.565217
9,1.0,0.565217


In [143]:
#min_samples_leaf

min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True)
minsamleaf_acc = []
for min_samples_leaf in min_samples_leafs:
    dt = tree.DecisionTreeClassifier(min_samples_leaf=min_samples_leaf)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    acc= metrics.accuracy_score(y_test, y_pred)
    minsamleaf_acc.append(acc)
minsamleaf_result = pd.DataFrame({"Min sample leaf":min_samples_leafs, "Accuracy":minsamleaf_acc})
minsamleaf_result

Unnamed: 0,Min sample leaf,Accuracy
0,0.1,0.456522
1,0.2,0.695652
2,0.3,0.521739
3,0.4,0.521739
4,0.5,0.369565


In [62]:
#max_features

max_features = list(range(1,Xal.shape[1]))
max_features_acc = []
for max_feature in max_features:
    dt = tree.DecisionTreeClassifier(max_features=max_feature)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    acc= metrics.accuracy_score(y_test, y_pred)
    max_features_acc.append(acc)
max_features_result = pd.DataFrame({"Min sample split":max_features, "Accuracy":max_features_acc})
max_features_result

Unnamed: 0,Min sample split,Accuracy
0,1,0.438849
1,2,0.446043
2,3,0.388489
3,4,0.438849
4,5,0.47482
5,6,0.438849
6,7,0.503597
7,8,0.57554
8,9,0.510791
9,10,0.453237
